This project focuses on analyzing the sentiments of Twitter data
related to specific keywords. Utilizing the syuzhet
package, it conducts sentiment scoring based on the comprehensive NRC
Word-Emotion Association Lexicon. The analysis encompasses eight
distinct emotional dimensions: anger, anticipation, disgust, fear, joy,
sadness, surprise, and trust.
Additionally, the project features a visualization component that tracks the evolution of the overall sentiment score over time, offering a dynamic view of public sentiment trends in relation to the chosen keywords.
library(data.table);library(textstem);library(lubridate)
library(tm);library(syuzhet);library(ggplot2);library(hrbrthemes);library(wordcloud);library(wordcloud2)
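Among the columns of the input file:
flag: the query (for example, lyx). If there is no
query, then this value is NO_QUERY.
user: the user who posted the tweet (for example, robotickilldozr).
text: the text of the tweet (for example, Lyx
is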
cool)
# Load the raw tweet export. The file is read without a header row, so the
# six column names are supplied explicitly.
tweet_columns <- c("target", "ids", "date", "flag", "user", "text")
data <- read.csv(
  "./inputs.csv",
  header = FALSE,
  col.names = tweet_columns,
  encoding = "UTF-8"
)
# Per-column overview of the loaded data
summary(data)
target ids date flag user text
Min. :0 Min. :1.468e+09 Length:1600000 Length:1600000 Length:1600000 Length:1600000
1st Qu.:0 1st Qu.:1.957e+09 Class :character Class :character Class :character Class :character
Median :2 Median :2.002e+09 Mode :character Mode :character Mode :character Mode :character
Mean :2 Mean :1.999e+09
3rd Qu.:4 3rd Qu.:2.177e+09
Max. :4 Max. :2.329e+09
#' Keep only the tweets whose text matches a keyword
#'
#' @param tweets_df Table of tweets with a `text` column. It is converted to
#'   a data.table with `setDT()` before filtering.
#' @param keyword Pattern handed to `grepl()` and matched against `text`.
#' @param ignore.case Passed through to `grepl()`; `TRUE` by default.
#' @return The rows of `tweets_df` whose `text` matches `keyword`.
search_tweets <- function(tweets_df, keyword, ignore.case = TRUE) {
  # Any failure is re-raised with a uniform message prefix
  rethrow <- function(e) {
    stop("An error occurred: ", conditionMessage(e))
  }

  tryCatch(
    {
      setDT(tweets_df)
      # grepl() runs once over the whole column; its warnings are muted
      suppressWarnings(
        matching_rows <- tweets_df[
          grepl(keyword, text, ignore.case = ignore.case)
        ]
      )
      matching_rows
    },
    error = rethrow
  )
}
# Pull out the tweets that mention "apple" and summarise that subset
test_result <- search_tweets(tweets_df = data, keyword = "apple")
summary(test_result)
target ids date flag user text
Min. :0.000 Min. :1.468e+09 Length:3732 Length:3732 Length:3732 Length:3732
1st Qu.:0.000 1st Qu.:1.969e+09 Class :character Class :character Class :character Class :character
Median :0.000 Median :2.051e+09 Mode :character Mode :character Mode :character Mode :character
Mean :1.883 Mean :2.030e+09
3rd Qu.:4.000 3rd Qu.:2.204e+09
Max. :4.000 Max. :2.329e+09
Lemmatization
is a linguistic process that reduces a
word to its base or dictionary form, known as a lemma.
Lemmatization
considers the context and transforms
words to their meaningful base forms. For example, “running,” “ran,” and
“runs” would all be lemmatized to “run.”
Lemmatization is important in NLP because it groups
together different inflected forms of a word, allowing them to be
analyzed as a single item.

#' Clean and lemmatize a character vector of tweet text
#'
#' Steps, in order: lowercase, remove English stopwords, remove punctuation,
#' remove numbers, collapse repeated whitespace, lemmatize.
#'
#' @param text_column Character vector of raw text.
#' @return Character vector of the same length holding the cleaned text.
preprocess_text <- function(text_column) {
  text_column <- tolower(text_column)
  # Stopwords are removed BEFORE punctuation: the English stopword list
  # contains contractions such as "don't" and "i'm", which no longer match
  # once their apostrophes have been stripped ("dont", "im").
  text_column <- removeWords(text_column, stopwords("en"))
  text_column <- removePunctuation(text_column)
  text_column <- removeNumbers(text_column)
  text_column <- stripWhitespace(text_column)
  lemmatize_strings(text_column)
}
Sentiment analysis determines whether a piece of text is positively
or negatively
expressed. This project uses the syuzhet
package, which
relies on the NRC Word-Emotion Association Lexicon. This lexicon
associates words with emotions
(like joy, sadness,
anger, etc.) and sentiment
valence
(positive or negative).

#' Score text with the NRC lexicon and add a net sentiment column
#'
#' @param preprocessed_text Character vector passed to `get_nrc_sentiment()`.
#' @return The result of `get_nrc_sentiment()` with one extra column,
#'   `overall_sentiment`, equal to `positive - negative`.
analyze_sentiment <- function(preprocessed_text) {
  scores <- get_nrc_sentiment(preprocessed_text)
  net_valence <- scores[["positive"]] - scores[["negative"]]
  scores[["overall_sentiment"]] <- net_valence
  scores
}
# Store a cleaned copy of every tweet in a new `text_clean` column
test_result[, `:=`(text_clean = preprocess_text(text))]
# Score the cleaned text
sentiment_results <- analyze_sentiment(test_result$text_clean)
## Word Cloud
# Build a corpus from the cleaned tweets and tabulate its terms.
# Note: `dtm` holds a term-document matrix (terms are the rows).
docs <- Corpus(VectorSource(test_result$text_clean))
dtm <- TermDocumentMatrix(docs)
matrix <- as.matrix(dtm)
# Total count of each term across all tweets, most frequent first
words <- sort(rowSums(matrix), decreasing = TRUE)
df_words <- data.frame(word = names(words), freq = words)
set.seed(1234) # for reproducibility
# Alternative rendering with wordcloud(), kept for reference:
# wordcloud(words = df_words$word, freq = df_words$freq, min.freq = 1,
# max.words=200, random.order=FALSE, rot.per=0.15,
# colors=brewer.pal(8, "Set2"))
wordcloud2(data = df_words, size = 4, color = "random-dark")
# List the score columns available in the sentiment results
names(sentiment_results)
[1] "anger" "anticipation" "disgust" "fear" "joy" "sadness"
[7] "surprise" "trust" "negative" "positive" "overall_sentiment"
# Average score per tweet for each of the eight emotion columns
emotion_cols <- c(
  "anger", "anticipation", "disgust", "fear",
  "joy", "sadness", "surprise", "trust"
)
emotion_averages <- as.data.frame(
  as.list(colMeans(sentiment_results[emotion_cols]))
)
# Capitalised names are used as the axis labels of the chart
names(emotion_averages) <- paste0(
  toupper(substring(emotion_cols, 1, 1)),
  substring(emotion_cols, 2)
)

# radarchart() expects the axis MAXIMUM in row 1 and the MINIMUM in row 2,
# followed by the data rows. Passing them the other way round inverts the
# scale of the plot.
axis_min <- 0
axis_max <- 1
emotion_df <- rbind(
  rep(axis_max, ncol(emotion_averages)),
  rep(axis_min, ncol(emotion_averages)),
  emotion_averages
)

# Create the radar chart
fmsb::radarchart(
  emotion_df,
  axistype = 1,
  # custom polygon
  pcol = rgb(0.2, 0.5, 0.5, 0.9), pfcol = rgb(0.2, 0.5, 0.5, 0.5), plwd = 4,
  # custom the grid: five labels, one per grid line (default of 4 segments)
  cglcol = "grey", cglty = 1, axislabcol = "grey",
  caxislabels = seq(axis_min, axis_max, length.out = 5), cglwd = 0.8,
  # custom labels
  vlcex = 1.2
)
NA
NA
# Mirrored density chart: positive scores above the axis, negative below.
# No plot-level mapping is set because each density layer supplies its own x.
p <- ggplot(sentiment_results) +
  # Top
  geom_density(aes(x = positive, y = after_stat(density)), fill = "#69b3a2") +
  # annotate() draws a single label; geom_label() with constant aesthetics
  # would draw one copy per row of the data
  annotate("label", x = 4.5, y = 0.25, label = "Positive", color = "#69b3a2") +
  # Bottom
  geom_density(aes(x = negative, y = -after_stat(density)), fill = "#404080") +
  annotate("label", x = 4.5, y = -0.25, label = "Negative", color = "#404080") +
  theme_ipsum() +
  xlab("Tweets Sentimental Score")
p
# Histogram of the overall sentiment score, one bar per unit of score
p <- ggplot(data = sentiment_results, mapping = aes(x = overall_sentiment)) +
  geom_histogram(
    binwidth = 1,
    fill = "#69b3a2",
    color = "#e9ecef",
    alpha = 0.9
  ) +
  theme_ipsum() +
  labs(x = "Overall Sentimental Score")
p
# Parse the tweet timestamps into calendar dates.
# The time-zone abbreviation is removed before parsing, so rows stamped with
# any zone (not only the literal "PDT") are parsed instead of silently
# becoming NA.
date_without_zone <- sub(" [[:upper:]]{2,5} ", " ", test_result$date)
test_result$date_parsed <- as.Date(
  strptime(date_without_zone, format = "%a %b %d %H:%M:%S %Y")
)
summary(test_result$date_parsed)
Min. 1st Qu. Median Mean 3rd Qu. Max.
"2009-04-06" "2009-05-29" "2009-06-05" "2009-06-02" "2009-06-17" "2009-06-25"
# Attach each tweet's overall score, then average the scores by day
test_result$overall_sentiment <- sentiment_results[["overall_sentiment"]]
daily_avg_sentiment <- aggregate(
  overall_sentiment ~ date_parsed,
  data = test_result,
  FUN = mean
)
summary(daily_avg_sentiment)
date_parsed overall_sentiment
Min. :2009-04-06 Min. :-0.6667
1st Qu.:2009-05-09 1st Qu.: 0.2020
Median :2009-05-27 Median : 0.3158
Mean :2009-05-23 Mean : 0.3783
3rd Qu.:2009-06-10 3rd Qu.: 0.5294
Max. :2009-06-25 Max. : 2.0000
# Fit a linear trend of the daily average sentiment against the date
model <- lm(overall_sentiment ~ date_parsed, data = daily_avg_sentiment)
# Extract slope and R^2 value
slope <- coef(model)[2]
r_squared <- summary(model)$r.squared

# Scatter plot of the daily averages with the fitted line
p <- ggplot(daily_avg_sentiment, aes(x = date_parsed, y = overall_sentiment)) +
  geom_point() +
  # The formula is spelled out so ggplot2 does not print its default-formula
  # message
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  labs(
    title = "Daily Average Overall Sentiment Score",
    x = "Date",
    y = "Average Sentiment Score"
  ) +
  theme_minimal()

# Place the annotation at the 75th percentile of the dates. `origin` is given
# explicitly because as.Date() on a number requires it on R versions < 4.3.
label_date <- as.Date(
  quantile(as.numeric(daily_avg_sentiment$date_parsed), 0.75, names = FALSE),
  origin = "1970-01-01"
)

# Add annotations for slope and R^2
p + annotate(
  "text",
  x = label_date, y = 1.2,
  label = paste("Slope:", round(slope, 4), "\nR^2:", round(r_squared, 4)),
  hjust = 0, vjust = 0
)
NA
NA