Professional Documents
Culture Documents
Lennard Schmidt
5/10/2020
# Attach the packages used by this analysis. library() stops with an error
# when a package is not installed, whereas require() only warns and returns
# FALSE, letting the script run on until the first call into that package.
library("dplyr")
library("tidytext")
library("textdata")
library("widyr")
Load Dataset
# Read the semicolon-separated file ./happydb.csv into a data frame.
happy_df <- utils::read.csv(file = "./happydb.csv", sep = ";")
Investigate Variables
# Print skimr's summary of happy_df (dimensions and per-type column overview).
skimr::skim(data = happy_df)
Data summary
Name happy_df
Number of rows 100535
Number of columns 5
_______________________
Column type frequency:
character 1
factor 2
numeric 2
________________________
Group variables None
Variable type: character
hm_category 0 1 FALSE 7 aff: 34168, ach: 33993, enj: 11144, bon: 10727
Tokenize by Word
# Split the text in `cleaned_hm` into one token per row, stored in a new
# `word` column (unnest_tokens() tokenizes by word by default).
happy_df.tidy <- tidytext::unnest_tokens(
  tbl = happy_df,
  output = word,
  input = cleaned_hm
)

# Show how often each token occurs, most frequent first.
dplyr::count(x = happy_df.tidy, word, sort = TRUE)
## # A tibble: 27,549 x 2
## word n
## <chr> <int>
## 1 i 104285
## 2 my 74039
## 3 a 71630
## 4 to 57289
## 5 and 56695
## 6 the 52502
## 7 was 33927
## 8 for 27126
## 9 in 26429
## 10 me 25540
## # ... with 27,539 more rows
Remove Stopwords
# Drop every token that appears in tidytext's default stop-word list.
# No `by` is supplied, so the join uses the columns both tables share
# and reports that choice in a message.
happy_df.clean <- dplyr::anti_join(
  x = happy_df.tidy,
  y = tidytext::get_stopwords()
)
## Joining, by = "word"
## # A tibble: 27,381 x 2
## word n
## <chr> <int>
## 1 happy 18732
## 2 got 13378
## 3 made 11435
## 4 went 9616
## 5 time 9328
## 6 new 8870
## 7 day 8048
## 8 work 7864
## 9 last 6391
## 10 good 5851
## # ... with 27,371 more rows
Compute and Visualize Word Counts
# Tally the remaining tokens, most frequent first.
happy_df.count <- dplyr::count(x = happy_df.clean, word, sort = TRUE)

# Leave out "happy" and "happiest"; rows with a missing word are dropped
# as well.
happy_df.count <- happy_df.count[
  !is.na(happy_df.count$word) &
    !(happy_df.count$word %in% c("happy", "happiest")),
]

# Turn `word` into a factor whose levels are ordered by `n`, then keep the
# first 20 rows (the 20 most frequent words, given the sort above).
happy_df.count$word <- stats::reorder(happy_df.count$word, happy_df.count$n)
happy_df.count <- utils::head(happy_df.count, n = 20)
## # A tibble: 6 x 3
## wid word n
## <int> <chr> <int>
## 1 280 life 123
## 2 120 event 92
## 3 280 good 89
## 4 954 time 84
## 5 40 went 80
## 6 5 made 79
## # A tibble: 2,256 x 6
## wid word n tf idf tf_idf
## <int> <chr> <int> <dbl> <dbl> <dbl>
## 1 280 life 123 0.370 2.51 0.930
## 2 120 event 92 0.262 3.69 0.968
## 3 280 good 89 0.268 2.64 0.708
## 4 954 time 84 0.0617 1.80 0.111
## 5 40 went 80 0.0458 1.70 0.0779
## 6 5 made 79 0.745 1.84 1.37
## 7 455 time 76 0.0285 1.80 0.0511
## 8 40 thank 75 0.0429 6.40 0.275
## 9 954 roti 73 0.0536 6.40 0.343
## 10 55 really 72 0.270 2.74 0.738
## # ... with 2,246 more rows
## # A tibble: 10 x 3
## item1 item2 n
## <chr> <chr> <dbl>
## 1 made happy 2987
## 2 happy made 2987
## 3 made got 2393
## 4 got made 2393
## 5 new got 2311
## 6 got new 2311
## 7 time got 2300
## 8 got time 2300
## 9 got happy 2287
## 10 happy got 2287
## # A tibble: 10 x 3
## item1 item2 correlation
## <chr> <chr> <dbl>
## 1 client learned 1.
## 2 attended wheeler 1.
## 3 hardly human 1.
## 4 meet walt 1.
## 5 word area 1.
## 6 normally engineer 1.
## 7 creek content 1.
## 8 force die 1.
## 9 spring neighborhood 1.
## 10 know performed 1.
Visualize Correlations
# Discard word pairs whose correlation is numerically 1 and keep the first
# 100 remaining rows. The comparison uses a tolerance instead of `!= 1`
# because a computed correlation can differ from 1 by floating-point rounding
# error and would then slip past an exact test. Rows with a missing
# correlation are dropped as well.
word_cor <- head(
  word_cor[
    !is.na(word_cor$correlation) &
      abs(word_cor$correlation - 1) > sqrt(.Machine$double.eps),
  ],
  100
)

# Build an igraph graph from the remaining pairs.
# NOTE(review): graph_from_data_frame() takes the first two columns as edge
# endpoints; presumably these are item1/item2 — confirm against where
# word_cor is created.
g <- igraph::graph_from_data_frame(word_cor)