%matplotlib inline
import pandas as pd
import numpy as np
import re, string
from tensorflow.keras.layers import Dropout, Dense
from tensorflow.keras.models import Sequential
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
# Load the group descriptions from the spreadsheet
pt1 = pd.read_excel('../groups.xlsx', sheet_name='Sheet1')
# pt1.drop_duplicates(subset="Name", keep=False, inplace=True)
pt1["What we're about"].head()
def clean_text(article):
    # Lower-case, strip punctuation (including curly quotes and dashes),
    # then collapse any remaining non-word characters into single spaces.
    # re.escape guards against regex metacharacters inside the class.
    clean1 = re.sub(r'[' + re.escape(string.punctuation + '’—”') + ']', "", article.lower())
    return re.sub(r'\W+', ' ', clean1)

from string import digits
remove_digits = str.maketrans('', '', digits)
pt1['tokenized'] = pt1["What we're about"].map(lambda x: clean_text(str(x).translate(remove_digits)))
pt1['tokenized'].head()
Lemmatisation (or lemmatization) in linguistics is the process of grouping together the inflected forms of a word so they can be analysed as a single item, identified by the word's lemma, or dictionary form.
In computational linguistics, lemmatisation is the algorithmic process of determining the lemma of a word based on its intended meaning. Unlike stemming, lemmatisation depends on correctly identifying the intended part of speech and meaning of a word in a sentence, as well as within the larger context surrounding that sentence, such as neighboring sentences or even an entire document. As a result, developing efficient lemmatisation algorithms is an open area of research.
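To make the distinction concrete, here is a minimal sketch contrasting a Porter stemmer with the WordNet lemmatizer used below; it assumes the NLTK 'wordnet' data has already been fetched via nltk.download('wordnet'), and the example words are purely illustrative:
# Illustration only: stemming chops suffixes, lemmatisation maps each word
# to a dictionary form (assumes nltk.download('wordnet') has been run)
from nltk.stem import PorterStemmer, WordNetLemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
print(stemmer.stem('studies'), lemmatizer.lemmatize('studies', pos='v'))  # studi study
print(stemmer.stem('better'), lemmatizer.lemmatize('better', pos='a'))    # better good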
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
lemma_word = []
wordnet_lemmatizer = WordNetLemmatizer()
for i, row in pt1.iterrows():
    word = ""
    for w in row['tokenized'].split():
        # Lemmatise each token first as a noun, then a verb, then an
        # adjective, so inflections of all three classes are collapsed
        word1 = wordnet_lemmatizer.lemmatize(w, pos="n")
        word2 = wordnet_lemmatizer.lemmatize(word1, pos="v")
        word3 = wordnet_lemmatizer.lemmatize(word2, pos="a")
        word = word + " " + word3
    lemma_word.append(word)
pt1['lemma_word'] = lemma_word
pt1['lemma_word'].head()
from nltk.corpus import stopwords
# Filter English stopwords as whole tokens (a substring replace() would
# also delete stopword sequences inside longer words)
stop_words = set(stopwords.words('english'))
X = []
for i, row in pt1.iterrows():
    X.append(" ".join(w for w in str(row['lemma_word']).split() if w not in stop_words))
pt1['corpus'] = X
# Count the number of distinct words in each cleaned description
pt1['num_wds'] = pt1['corpus'].str.split().apply(lambda x: len(set(x)))
ax = pt1['num_wds'].plot(kind='hist', bins=50, fontsize=14, figsize=(12, 10))
ax.set_title('Distribution of Description Lengths\n', fontsize=20)
ax.set_ylabel('Frequency', fontsize=18)
ax.set_xlabel('Number of Words', fontsize=18);
# Keep only descriptions with at least 10 distinct words
pt1 = pt1[pt1['num_wds'] >= 10]
from collections import Counter
wd_counts = Counter()
for i, row in pt1.iterrows():
    wd_counts.update(row['corpus'].split())
wd_counts.most_common(30)
len(set(wd_counts))
In information retrieval, tf–idf or TFIDF, short for term frequency–inverse document frequency, is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus. It is often used as a weighting factor in searches of information retrieval, text mining, and user modeling. The tf–idf value increases proportionally to the number of times a word appears in the document and is offset by the number of documents in the corpus that contain the word, which helps to adjust for the fact that some words appear more frequently in general. tf–idf is one of the most popular term-weighting schemes today; 83% of text-based recommender systems in digital libraries use tf–idf.
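As a sanity check before vectorising the real corpus, here is a small sketch on a toy two-document corpus. Note that scikit-learn's TfidfVectorizer uses a smoothed idf, idf(t) = ln((1 + n) / (1 + df(t))) + 1, and L2-normalises each row by default, so the values differ slightly from the textbook formula; get_feature_names_out assumes scikit-learn 1.0 or later.
# Toy tf-idf example: terms shared by both documents ('the', 'sat') receive
# lower weights than a term unique to one document ('cat')
toy = ["the cat sat", "the dog sat on the mat"]
tv = TfidfVectorizer()
m = tv.fit_transform(toy)
print({t: round(float(v), 2) for t, v in zip(tv.get_feature_names_out(), m.toarray()[0])})
# Approximately: {'cat': 0.7, 'dog': 0.0, 'mat': 0.0, 'on': 0.0, 'sat': 0.5, 'the': 0.5}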
pt1 = pt1.reset_index()
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(pt1['corpus'])
# get_feature_names() was removed in recent scikit-learn releases;
# get_feature_names_out() is the current equivalent
feature_names = vectorizer.get_feature_names_out()
dense = vectors.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)
df.head()
all_tfidf = pd.DataFrame(df.mean())
all_tfidf.columns = ["Score"]
all_tfidf = all_tfidf.sort_values(by=['Score'], ascending=False)
import matplotlib.pyplot as plt
# Plot the 20 terms with the highest mean tf-idf score
fig = plt.figure(figsize=(15, 5), dpi=100, facecolor='w', edgecolor='k')
names = all_tfidf.index.values[0:20]
plt.bar(range(20), list(all_tfidf.iloc[0:20]['Score']), tick_label=names)
plt.show()
wordDict1 = dict.fromkeys(wd_counts, 0)
print(len(set(wordDict1)))
# Build a binary bag-of-words: for each description, mark which vocabulary
# words appear at least once
data = []
for i, row in pt1.iterrows():
    wordDict = wordDict1.copy()
    rowSet = set(row['corpus'].split())
    for word in rowSet:
        if word in wordDict:
            wordDict[word] += 1
    data.append(wordDict)
df1 = pd.DataFrame(data)
df1.head()
from wordcloud import WordCloud
# Build a {term: score} dict from the highest-scoring terms
word_freq = all_tfidf['Score'].iloc[0:201].to_dict()
fig = plt.figure(figsize=(14, 12), dpi=80, facecolor='w', edgecolor='k')
wc = WordCloud(background_color="white", width=1000, height=1000, max_words=300,
               relative_scaling=0.25, normalize_plurals=True).generate_from_frequencies(word_freq)
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
df['Category'] = pt1['Category']
df1['Category'] = pt1['Category']
# 60/40 train/test split; the shared random_state keeps the tf-idf and
# bag-of-words splits aligned on the same rows
df_train = df.sample(frac=0.60, random_state=1207)
df_test = df.drop(df_train.index)
df_train1 = df1.sample(frac=0.60, random_state=1207)
df_test1 = df1.drop(df_train1.index)
# Recent pandas versions no longer accept an encoding argument for
# to_excel; the files are written as UTF-8 regardless
df_train1.to_excel("Training.xlsx")
df_test1.to_excel("Testing.xlsx")
df_train.to_excel("TFIDFTraining.xlsx")
df_test.to_excel("TFIDFTesting.xlsx")