You are on page 1 of 14

8_Textual Analysis_2020

Lennard Schmidt
5/10/2020

# Attach the packages this analysis depends on.
# library() stops with an error if a package is not installed, whereas
# require() only warns and returns FALSE, deferring the failure to a later,
# unrelated line.
library("dplyr")
library("tidytext")
library("textdata")
library("widyr")

Load Dataset
# Read ./happydb.csv, treating ";" as the field separator.
happy_df <- utils::read.csv(file = "./happydb.csv", sep = ";")

Prepare Dataframe and Check for NA


# Drop the columns at positions 1, 4, 6 and 8 of the raw file; the five
# remaining columns are renamed below.
happy_df <- happy_df[, -c(1, 4, 6, 8)]

# Make sure the text column is character (in case read.csv() imported it as a
# factor) so it can be tokenized later.
happy_df$cleaned_hm <- as.character(happy_df$cleaned_hm)

colnames(happy_df) <- c(
  "wid", "reflection_period", "cleaned_hm", "num_sentence", "hm_category"
)

# Number of missing values per column. vapply() is used instead of sapply()
# so the result is always a named integer vector, whatever the input shape.
vapply(happy_df, function(x) sum(is.na(x)), integer(1))

## wid reflection_period cleaned_hm num_sentence


## 0 0 0 0
## hm_category
## 0

Investigate Variables
# Print a per-column summary of the prepared data frame.
skimr::skim(data = happy_df)

Data summary
Name happy_df
Number of rows 100535
Number of columns 5
_______________________
Column type frequency:
character 1
factor 2
numeric 2
________________________
Group variables None
Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace

cleaned_hm 0 1 6 6532 0 96481 365

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts

reflection_period 0 1 FALSE 2 3m: 50704, 24h: 49831

hm_category 0 1 FALSE 7 aff: 34168, ach: 33993, enj: 11144, bon: 10727

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist

wid 0 1 2746.62 3535.01 1 410 1125 3507 13839 ▇▂▁▁▁


num_sentence 0 1 1.34 1.30 1 1 1 1 69 ▇▁▁▁▁

Tokenize by Word
# Split cleaned_hm into word tokens: one output row per token, stored in a new
# `word` column.
happy_df.tidy <- tidytext::unnest_tokens(
  tbl = happy_df,
  output = word,
  input = cleaned_hm
)

# List the tokens from most to least frequent.
dplyr::count(x = happy_df.tidy, word, sort = TRUE)
## # A tibble: 27,549 x 2
## word n
## <chr> <int>
## 1 i 104285
## 2 my 74039
## 3 a 71630
## 4 to 57289
## 5 and 56695
## 6 the 52502
## 7 was 33927
## 8 for 27126
## 9 in 26429
## 10 me 25540
## # ... with 27,539 more rows

Remove Stopwords
# Remove every token that appears in tidytext's stop-word list.
# The join key is given explicitly so the match cannot silently change if
# either table gains another column with a shared name; this also suppresses
# the "Joining, by = ..." message.
happy_df.clean <- dplyr::anti_join(
  happy_df.tidy,
  tidytext::get_stopwords(),
  by = "word"
)

# Token frequencies after stop-word removal, most frequent first.
dplyr::count(happy_df.clean, word, sort = TRUE)

## # A tibble: 27,381 x 2
## word n
## <chr> <int>
## 1 happy 18732
## 2 got 13378
## 3 made 11435
## 4 went 9616
## 5 time 9328
## 6 new 8870
## 7 day 8048
## 8 work 7864
## 9 last 6391
## 10 good 5851
## # ... with 27,371 more rows
Compute and Visualize Word Counts
# Frequency table of the stop-word-free tokens, most frequent first.
happy_df.count <- dplyr::count(x = happy_df.clean, word, sort = TRUE)

# Leave "happy" and "happiest" out of the chart.
happy_df.count <- happy_df.count[
  with(happy_df.count, which(!(word == "happy" | word == "happiest"))),
]

# Turn `word` into a factor ordered by count so the bars are sorted, then keep
# the 20 most frequent words.
happy_df.count$word <- with(happy_df.count, reorder(word, n))
happy_df.count <- utils::head(happy_df.count, n = 20)

# Horizontal bar chart of the top-20 word counts.
ggplot2::ggplot(
  data = happy_df.count,
  mapping = ggplot2::aes(x = word, y = n)
) +
  ggplot2::geom_col() +
  ggplot2::coord_flip() +
  ggpubr::theme_pubclean()
Compute TF-IDF
# Count how often each word occurs for each wid.
happy_df.count <- dplyr::count(x = happy_df.clean, wid, word, sort = TRUE)

# Keep (wid, word) pairs seen more than 10 times whose word is longer than
# three characters and is not "happy", "happiest" or "happiness".
happy_df.count <- happy_df.count[
  with(
    happy_df.count,
    which(!(
      n <= 10 |
        word == "happy" |
        word == "happiest" |
        word == "happiness" |
        nchar(word) <= 3
    ))
  ),
]
utils::head(happy_df.count)

## # A tibble: 6 x 3
## wid word n
## <int> <chr> <int>
## 1 280 life 123
## 2 120 event 92
## 3 280 good 89
## 4 954 time 84
## 5 40 went 80
## 6 5 made 79

# Add tf, idf and tf_idf columns, treating each wid as a document and each
# word as a term. The result is only printed; happy_df.count is not modified.
# NOTE(review): the counts were filtered (n > 10, word length > 3) before this
# call, so tf and idf are relative to the filtered table — confirm intended.
tidytext::bind_tf_idf(
  tbl = happy_df.count,
  term = word,
  document = wid,
  n = n
)

## # A tibble: 2,256 x 6
## wid word n tf idf tf_idf
## <int> <chr> <int> <dbl> <dbl> <dbl>
## 1 280 life 123 0.370 2.51 0.930
## 2 120 event 92 0.262 3.69 0.968
## 3 280 good 89 0.268 2.64 0.708
## 4 954 time 84 0.0617 1.80 0.111
## 5 40 went 80 0.0458 1.70 0.0779
## 6 5 made 79 0.745 1.84 1.37
## 7 455 time 76 0.0285 1.80 0.0511
## 8 40 thank 75 0.0429 6.40 0.275
## 9 954 roti 73 0.0536 6.40 0.343
## 10 55 really 72 0.270 2.74 0.738
## # ... with 2,246 more rows

Visualize as Word Cloud


# Draw a word cloud of at most 100 words, sized by count and placed in
# decreasing order of frequency.
# NOTE(review): happy_df.count has one row per (wid, word), so the same word
# can be passed several times — confirm whether counts should be summed per
# word first.
wordcloud::wordcloud(
  words = happy_df.count$word,
  freq = happy_df.count$n,
  min.freq = 1,
  max.words = 100,
  random.order = FALSE
)

Join Sentiment Dictionary and Visualize Sentiment Counts
# Attach the NRC sentiment label(s) to every token that is in the NRC
# lexicon; a token with several labels yields one row per label.
happy_df.sen <- dplyr::inner_join(
  x = happy_df.clean,
  y = tidytext::get_sentiments(lexicon = "nrc"),
  by = "word"
)

# Keep only tokens that also have an AFINN score, added as column `value`.
happy_df.sen <- dplyr::inner_join(
  x = happy_df.sen,
  y = tidytext::get_sentiments(lexicon = "afinn"),
  by = "word"
)
utils::head(happy_df.sen, n = 10)
## wid reflection_period num_sentence hm_category word sentiment
## 1 2053 24h 1 affection successful anticipation
## 2 2053 24h 1 affection successful joy
## 3 2053 24h 1 affection successful positive
## 4 2053 24h 1 affection successful trust
## 5 2053 24h 1 affection sympathy positive
## 6 2053 24h 1 affection sympathy sadness
## 7 2 24h 1 affection happy anticipation
## 8 2 24h 1 affection happy joy
## 9 2 24h 1 affection happy positive
## 10 2 24h 1 affection happy trust
## value
## 1 3
## 2 3
## 3 3
## 4 3
## 5 2
## 6 2
## 7 3
## 8 3
## 9 3
## 10 3

# Count each (sentiment, word) pair, most frequent first.
# count() is namespace-qualified like every other dplyr call in this file, so
# the line does not depend on dplyr having been attached.
happy_df.sen_count <- dplyr::count(happy_df.sen, sentiment, word, sort = TRUE)

# Order the word factor by count so bars are sorted within the plot.
happy_df.sen_count$word <- reorder(happy_df.sen_count$word, happy_df.sen_count$n)

# Take the first five rows of every sentiment group (the table is already
# sorted by descending count) and stack the groups back into one table.
happy_df.sen_count <- by(happy_df.sen_count, happy_df.sen_count["sentiment"], head, n = 5)
happy_df.sen_count <- Reduce(rbind, happy_df.sen_count)

# One horizontal bar-chart facet per sentiment, each with its own axes.
ggplot2::ggplot(happy_df.sen_count, ggplot2::aes(x = word, y = n, fill = sentiment)) +
  ggplot2::geom_col(show.legend = FALSE) +
  ggplot2::facet_wrap(~sentiment, scales = "free") +
  ggplot2::labs(y = "Contribution to sentiment", x = NULL) +
  ggplot2::coord_flip() +
  ggpubr::theme_pubclean()
Compute and Visualize Count by Reflection Time
# Number of sentiment-labelled tokens per reflection period and sentiment.
happy_df.sen_count <- dplyr::count(
  x = happy_df.sen,
  reflection_period,
  sentiment,
  sort = TRUE
)

# Order the sentiment factor by count so the bars are sorted.
happy_df.sen_count$sentiment <- with(happy_df.sen_count, reorder(sentiment, n))

# Side-by-side horizontal bars: one bar per reflection period per sentiment.
ggplot2::ggplot(
  data = happy_df.sen_count,
  mapping = ggplot2::aes(x = sentiment, y = n, fill = reflection_period)
) +
  ggplot2::geom_col(position = "dodge") +
  ggplot2::labs(y = "Contribution to sentiment", x = NULL) +
  ggplot2::coord_flip() +
  ggpubr::theme_pubclean()
Compute and Visualize Mean Valence by Reflection Period
# Count-weighted mean AFINN valence per reflection period.
# Each (reflection_period, word, value) combination occurs n times, so
# value * n is the total valence contributed by that combination.
happy_df.sen_count <- dplyr::count(happy_df.sen, reflection_period, word, value, sort = TRUE)
happy_df.sen_count$value <- happy_df.sen_count$value * happy_df.sen_count$n

# Sum the counts and the total valence per period. Summing (not sd) is what
# makes value / n below the mean valence per token; a ratio of two standard
# deviations is not a mean.
happy_df.sen_agg <- aggregate(
  cbind(n, value) ~ reflection_period,
  data = happy_df.sen_count,
  FUN = sum
)
happy_df.sen_agg$mean_value <- happy_df.sen_agg$value / happy_df.sen_agg$n
head(happy_df.sen_agg)
# NOTE(review): happy_df.sen has one row per NRC label of a word, so words
# with several NRC labels are weighted more heavily — confirm intended.
# NOTE(review): re-render the document so the printed table reflects the sums.

## reflection_period n value mean_value


## 1 24h 1538.305 4560.348 2.964529
## 2 3m 1638.364 4873.171 2.974413
# One bar per reflection period showing its mean_value.
ggplot2::ggplot(
  data = happy_df.sen_agg,
  mapping = ggplot2::aes(
    x = reflection_period,
    y = mean_value,
    fill = reflection_period
  )
) +
  ggplot2::geom_col() +
  ggpubr::theme_pubclean()

Tokenize and Visualize by Sentence


# Re-tokenize the text, this time into sentences (one row per sentence).
happy_df.tidy <- tidytext::unnest_tokens(
  tbl = happy_df,
  output = sentence,
  input = cleaned_hm,
  token = "sentences"
)

# Count identical sentences, order the factor by count, keep the 10 most
# frequent.
happy_df.count <- dplyr::count(x = happy_df.tidy, sentence, sort = TRUE)
happy_df.count$sentence <- with(happy_df.count, reorder(sentence, n))
happy_df.count <- utils::head(happy_df.count, n = 10)

# Horizontal bar chart; long sentence labels are wrapped at 60 characters.
ggplot2::ggplot(
  data = happy_df.count,
  mapping = ggplot2::aes(x = sentence, y = n)
) +
  ggplot2::geom_col() +
  ggplot2::coord_flip() +
  ggplot2::scale_x_discrete(
    labels = function(lbl) stringr::str_wrap(lbl, width = 60)
  ) +
  ggpubr::theme_pubclean()

Tokenize and Visualize by N-Gram


# Re-tokenize the text into two-word n-grams (one row per bigram).
happy_df.tidy <- tidytext::unnest_tokens(
  tbl = happy_df,
  output = bigram,
  input = cleaned_hm,
  token = "ngrams",
  n = 2
)

# Count the bigrams, order the factor by count, keep the 20 most frequent.
happy_df.count <- dplyr::count(x = happy_df.tidy, bigram, sort = TRUE)
happy_df.count$bigram <- with(happy_df.count, reorder(bigram, n))
happy_df.count <- utils::head(happy_df.count, n = 20)

# Horizontal bar chart of the top-20 bigram counts.
ggplot2::ggplot(
  data = happy_df.count,
  mapping = ggplot2::aes(x = bigram, y = n)
) +
  ggplot2::geom_col() +
  ggplot2::coord_flip() +
  ggpubr::theme_pubclean()

Compute Word Pairs and Correlations


# Count how many wid values each pair of words has in common, most frequent
# pairs first. Every pair appears in both orders (item1/item2 swapped).
word_pair <- widyr::pairwise_count(
  tbl = happy_df.clean,
  item = word,
  feature = wid,
  sort = TRUE
)
utils::head(word_pair, n = 10)

## # A tibble: 10 x 3
## item1 item2 n
## <chr> <chr> <dbl>
## 1 made happy 2987
## 2 happy made 2987
## 3 made got 2393
## 4 got made 2393
## 5 new got 2311
## 6 got new 2311
## 7 time got 2300
## 8 got time 2300
## 9 got happy 2287
## 10 happy got 2287

# Fix the RNG seed so the random 1000-row sample below, and everything
# derived from it, is the same on every run.
set.seed(2020)

# Pairwise correlation of words across wid, computed on a random sample of
# 1000 token rows, sorted by decreasing correlation.
# NOTE(review): presumably sampled to keep the computation small — with so few
# rows most words occur once, which inflates correlations; confirm sample size.
word_cor <- widyr::pairwise_cor(
  happy_df.clean[sample(nrow(happy_df.clean), 1000), ],
  word,
  wid,
  sort = TRUE
)

# Show the strongest correlations that are not exactly 1.
head(word_cor[which(word_cor$correlation != 1), ], 10)

## # A tibble: 10 x 3
## item1 item2 correlation
## <chr> <chr> <dbl>
## 1 client learned 1.
## 2 attended wheeler 1.
## 3 hardly human 1.
## 4 meet walt 1.
## 5 word area 1.
## 6 normally engineer 1.
## 7 creek content 1.
## 8 force die 1.
## 9 spring neighborhood 1.
## 10 know performed 1.

Visualize Correlations
# Keep the first 100 word pairs whose correlation is not exactly 1.
word_cor <- utils::head(
  word_cor[which(word_cor$correlation != 1), ],
  n = 100
)

# Build a graph with one vertex per word and one edge per retained pair.
g <- igraph::graph_from_data_frame(d = word_cor)

# Draw the network with the "fr" layout; edge opacity encodes correlation.
ggraph::ggraph(graph = g, layout = "fr") +
  ggraph::geom_edge_link(
    mapping = ggplot2::aes(edge_alpha = correlation),
    show.legend = FALSE
  ) +
  ggraph::geom_node_point(color = "lightblue", size = 4) +
  ggraph::geom_node_text(
    mapping = ggplot2::aes(label = name),
    repel = TRUE
  ) +
  ggplot2::theme_void()

You might also like