In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import re, string
from tensorflow.keras.layers import  Dropout, Dense
from tensorflow.keras.models import Sequential
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

Read the dataset

In [2]:
pt1= pd.read_excel('../groups.xlsx',sheet_name='Sheet1')
# pt1.drop_duplicates(subset ="Name", 
#                      keep = False, inplace = True) 
In [3]:
pt1["What we're about"].head()
Out[3]:
0    Comedy show wrapped in a 70’s style burrito. D...
1    Interested in Experiencing local art in Deep E...
2    Meet with other local people who enjoy the per...
3    This is a group for anyone interested in drawi...
4    This group is brought to you by VERB KULTURE E...
Name: What we're about, dtype: object

Split unique words from the description of each group. (field: What we're about)

Remove Punctuation

In [4]:
def clean_text(article):
    clean1 = re.sub(r'['+string.punctuation + '’—”'+']', "", article.lower())
    return re.sub(r'\W+', ' ', clean1)

Remove Digits

In [5]:
from string import digits
remove_digits = str.maketrans('', '', digits)
pt1['tokenized'] = pt1["What we're about"].map(lambda x: clean_text(str(x).translate(remove_digits)))
In [6]:
pt1['tokenized'].head()
Out[6]:
0    comedy show wrapped in a s style burrito dj sp...
1    interested in experiencing local art in deep e...
2    meet with other local people who enjoy the per...
3    this is a group for anyone interested in drawi...
4    this group is brought to you by verb kulture e...
Name: tokenized, dtype: object

Lemmatization

Lemmatisation (or lemmatization) in linguistics is the process of grouping together the inflected forms of a word so they can be analysed as a single item, identified by the word's lemma, or dictionary form.

In computational linguistics, lemmatisation is the algorithmic process of determining the lemma of a word based on its intended meaning. Unlike stemming, lemmatisation depends on correctly identifying the intended part of speech and meaning of a word in a sentence, as well as within the larger context surrounding that sentence, such as neighboring sentences or even an entire document. As a result, developing efficient lemmatisation algorithms is an open area of research.

In [7]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize 
lemma_word = []
wordnet_lemmatizer = WordNetLemmatizer()
for i, row in pt1.iterrows():
    word = ""
    for w in row['tokenized'].split():
        word1 = wordnet_lemmatizer.lemmatize(w, pos = "n")
        word2 = wordnet_lemmatizer.lemmatize(word1, pos = "v")
        word3 = wordnet_lemmatizer.lemmatize(word2, pos = ("a"))
        word  = word + " " + word3
    lemma_word.append(word)
In [8]:
pt1['lemma_word'] = lemma_word
pt1['lemma_word'].head()
Out[8]:
0     comedy show wrap in a s style burrito dj spin...
1     interest in experience local art in deep ellu...
2     meet with other local people who enjoy the pe...
3     this be a group for anyone interest in draw o...
4     this group be bring to you by verb kulture en...
Name: lemma_word, dtype: object

Remove Stopwords

In [9]:
from nltk.corpus import stopwords
X = []
for i, row in pt1.iterrows():
    for sw in stopwords.words('english'):
        words = str(row['lemma_word']).replace(sw,"")
    X.append(words)
pt1['corpus'] = X

Overview the distribution of this dataset

In [10]:
pt1['num_wds'] = pt1['corpus'].str.split().apply(lambda x: len(set(x)))
ax=pt1['num_wds'].plot(kind='hist', bins=50, fontsize=14, figsize=(12,10))
ax.set_title('Distribution of Dataset\n', fontsize=20)
ax.set_ylabel('Frequency', fontsize=18)
ax.set_xlabel('Number of Words', fontsize=18);

Only use records which contains the number of words larger than 10

In [11]:
pt1 = pt1[pt1['num_wds']>=10]

Convert text to Text Array (Bag of words)

In [12]:
from collections import Counter
wd_counts = Counter()
for i, row in pt1.iterrows():
    wd_counts.update(row['corpus'].split())

To see the top 30 frequency words.

In [13]:
wd_counts.most_common(30)
Out[13]:
[('and', 6373),
 ('to', 5886),
 ('be', 5616),
 ('the', 5393),
 ('a', 4383),
 ('of', 3409),
 ('in', 2614),
 ('you', 2390),
 ('for', 2358),
 ('we', 2170),
 ('group', 1725),
 ('our', 1439),
 ('this', 1391),
 ('with', 1345),
 ('or', 1304),
 ('your', 1059),
 ('that', 1056),
 ('have', 1053),
 ('on', 991),
 ('will', 934),
 ('at', 836),
 ('meet', 825),
 ('event', 825),
 ('all', 822),
 ('who', 747),
 ('it', 732),
 ('if', 726),
 ('dallas', 646),
 ('u', 599),
 ('join', 591)]

To see how many unique words we have

In [14]:
len(set(wd_counts))
Out[14]:
10332

TF-IDF

In information retrieval, tf–idf or TFIDF, short for term frequency–inverse document frequency, is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus.It is often used as a weighting factor in searches of information retrieval, text mining, and user modeling. The tf–idf value increases proportionally to the number of times a word appears in the document and is offset by the number of documents in the corpus that contain the word, which helps to adjust for the fact that some words appear more frequently in general. tf–idf is one of the most popular term-weighting schemes today; 83% of text-based recommender systems in digital libraries use tf–idf.

In [15]:
pt1 = pt1.reset_index()
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(pt1['corpus'])
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)
In [16]:
df.head()
Out[16]:
aarp ab abandon abandonded abatement abc aberg abide abilene ability ... 𝔸𝕡𝕣𝕚𝕝 𝔼𝕤𝕥 𝘾𝙤𝙢𝙢𝙪𝙣𝙞𝙩𝙮 𝙂𝙑𝙊 𝙂𝙤𝙤𝙙 𝙂𝙤𝙤𝙙𝙑𝙞𝙗𝙚𝙨𝙊𝙣𝙡𝙮 𝙊𝙣𝙡𝙮 𝙑𝙞𝙗𝙚𝙨 𝟚𝟘 𝟚𝟘𝟙𝟡
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 10305 columns

In [17]:
all_tfidf = pd.DataFrame(df.mean())
all_tfidf.columns = ["Score"]
all_tfidf = all_tfidf.sort_values(by=['Score'],ascending=False)
In [21]:
import matplotlib.pyplot as plt
fig=plt.figure(figsize=(15, 5), dpi= 100, facecolor='w', edgecolor='k')
names = all_tfidf.index.values[0:20]
plt.bar(range(20),list(all_tfidf.iloc[0:20]['Score']),tick_label=names)
plt.show()

Normal frequency count table

In [26]:
wordDict1 = dict.fromkeys(wd_counts,0)
print(len(set(wordDict1)))
data=[]
for i,row in pt1.iterrows():
    wordDict = wordDict1.copy()
    rowSet = set(row['corpus'].split())
    for word in rowSet:
        try:
            wordDict[word] +=1
        except:
            continue
    data.append(wordDict)  
df1 = pd.DataFrame(data)
df1.head()
10332
Out[26]:
comedy show wrap in a s style burrito dj spinnin ... subconscious itsonia strategize httpwwwwritethedocsorg wtd portland berlin documentarians httpwwwwritethedocsorgslack httpwwwwritethedocsorgcodeofconduct
0 1 1 1 1 1 1 1 1 1 1 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 1 0 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
2 0 1 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 0 0 0 1 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 10332 columns

Using WordCloud to see the distribution of words

In [22]:
from wordcloud import WordCloud
word_freq1 = all_tfidf.iloc[0:201].T.to_dict('index')
word_freq = word_freq1['Score']
In [23]:
fig=plt.figure(figsize=(14, 12), dpi= 80, facecolor='w', edgecolor='k')
wc = WordCloud(background_color="white",width=1000,height=1000, max_words=300,relative_scaling=0.25,normalize_plurals=True).generate_from_frequencies(word_freq)
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
Out[23]:
(-0.5, 999.5, 999.5, -0.5)

Randomly select 60% data as the training sample, the rest as the testing sample

In [27]:
df['Category'] = pt1['Category']
df1['Category'] = pt1['Category']
df_train=df.sample(frac=0.60,random_state=1207)
df_test=df.drop(df_train.index)
df_train1=df1.sample(frac=0.60,random_state=1207)
df_test1=df1.drop(df_train.index)

Save the training and testing dataset

In [28]:
df_train1.to_excel("Training.xlsx", encoding='utf-8')
df_test1.to_excel("Testing.xlsx", encoding='utf-8')
df_train.to_excel("TFIDFTraining.xlsx", encoding='utf-8')
df_test.to_excel("TFIDFTesting.xlsx", encoding='utf-8')
In [ ]: