You are on page 1 of 4

MODULE 11 – TEXT MINING

1) Extract reviews of any product of your choice from Amazon


2) Perform sentiment analysis
# Clear the workspace so the script starts from an empty global environment.
rm(list = ls())

# Close the current plot device, if there is one. A bare dev.off() raises
# an error when no graphics device is open (e.g. in a fresh session), which
# would abort the script before any work is done.
if (!is.null(dev.list())) {
  dev.off()
}

# Clear terminal - Ctrl+L

# Install pacman only when it is missing, then attach it. library() is used
# instead of require() so that a failed load stops the script immediately
# rather than returning FALSE and failing later at p_load().
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
library(pacman)

# p_load() attaches each package, installing it first if necessary.
p_load(rvest, XML, magrittr)

# MODULE 11 - TEXT MINING


# Extract reviews of any product of your choice from Amazon
# Perform sentiment analysis

######### Amazon Reviews ###########

# Base URL of the review listing for product B07HDHLPBM. The literal must
# stay on a single line: a line break inside the quotes becomes part of the
# string and corrupts the URL.
aurl <- "https://www.amazon.com/product-reviews/B07HDHLPBM?reviewerType=all_reviews"

# Fetch review pages 1..20 and keep the text of every ".review-text" node.
# Each page is requested by appending an explicit page parameter to the
# base URL.
# NOTE(review): "pageNumber" is assumed to be the paging parameter of the
# review listing - confirm against the live site.
page_reviews <- lapply(seq_len(20), function(page) {
  page_url <- paste0(aurl, "&pageNumber=", page)
  read_html(page_url) %>%
    html_nodes(".review-text") %>%
    html_text()
})

# Flatten the per-page character vectors into one vector of reviews.
reviews <- unlist(page_reviews)

# Persist the raw reviews in the working directory and show where that is.
write.table(reviews, "aw.txt")
getwd()

##################################

# Work on a copy of the scraped reviews and take a quick look at it.
txt <- reviews

str(txt)
length(txt)

# Corpus: one document per review.
p_load(tm)
x <- Corpus(VectorSource(txt))

inspect(x[1])
# Document 160 is only inspected when the corpus actually has that many
# documents; indexing past the end would stop the script.
if (length(x) >= 160) {
  inspect(x[160])
}

# Normalise the encoding of every document. The custom function is wrapped
# in content_transformer() so that tm_map() applies it to the document
# content and returns a proper corpus.
x <- tm_map(
  x,
  content_transformer(function(doc) iconv(enc2utf8(doc), sub = "byte"))
)

# Data cleansing: each step below rewrites x1 and then prints a sample
# document so the effect of the step can be checked by eye.

# Lower-case all text. tolower() is a plain character function, so it is
# wrapped in content_transformer() for use with tm_map().
x1 <- tm_map(x, content_transformer(tolower))
inspect(x1[1])

# Drop punctuation.
x1 <- tm_map(x1, removePunctuation)
inspect(x1[1])

# Drop digits.
inspect(x1[5])
x1 <- tm_map(x1, removeNumbers)
inspect(x1[1])

# Drop common English stop words.
x1 <- tm_map(x1, removeWords, stopwords("english"))
inspect(x1[1])
inspect(x1[3])

# Stripping white spaces left behind by the removals above.
x1 <- tm_map(x1, stripWhitespace)
inspect(x1[1])

# Term document matrix
# Converting unstructured data to structured format using a TDM
# (rows = terms, columns = documents).
tdm <- TermDocumentMatrix(x1)
tdm

# Document-term matrix built directly from the cleaned corpus. The former
# `dtm <- t(tdm)` was overwritten on the very next line, so it is dropped.
dtm <- DocumentTermMatrix(x1)

# Dense matrix form for the row sums and slicing below.
tdm <- as.matrix(tdm)
dim(tdm)

# Show the top-left corner (at most 20 x 20). The indices are clamped to the
# matrix dimensions so a small corpus does not cause a subscript error.
tdm[
  seq_len(min(20, nrow(tdm))),
  seq_len(min(20, ncol(tdm))),
  drop = FALSE
]

inspect(x[3])

# Bar plot of the most frequent terms.
# Total number of occurrences of each term across all documents.
w <- rowSums(tdm)
print(w)

# Keep only the terms that occur at least 100 times.
w_sub <- w[w >= 100]
print(w_sub)

barplot(height = w_sub, col = rainbow(30), las = 1)


# Second clean-up pass: remove a hand-picked list of additional words from
# the corpus, then rebuild the term-document matrix.
x1 <- tm_map(
  x1, removeWords,
  c("apple", "can", "get", "got", "one", "verizon", "also")
)
x1 <- tm_map(x1, stripWhitespace)

tdm <- TermDocumentMatrix(x1)
tdm

tdm <- as.matrix(tdm)

# Show terms 100-109 for the first 20 documents. Indices are clamped to the
# matrix dimensions so a small corpus does not cause a subscript error.
tdm[
  intersect(100:109, seq_len(nrow(tdm))),
  seq_len(min(20, ncol(tdm))),
  drop = FALSE
]

# Bar plot after removal of the words listed above.
w <- rowSums(tdm)
w

# Keep only the terms that occur at least 65 times.
w_sub <- subset(w, w >= 65)
w_sub
sort(w_sub)

barplot(w_sub, las = 2, col = rainbow(30))

# Repeat the word-removal clean-up with a longer list of words.
# Every word must be a single unbroken literal: a line break inside the
# quotes (previously in "watch") becomes part of the string, so the intended
# word is never matched and never removed.
x1 <- tm_map(
  x1, removeWords,
  c(
    "phone", "apple", "can", "get", "got", "one", "verizon", "also",
    "just", "like", "store", "watch", "makes", "without"
  )
)
x1 <- tm_map(x1, stripWhitespace)

tdm <- TermDocumentMatrix(x1)
tdm

tdm <- as.matrix(tdm)

# Show terms 100-109 for the first 20 documents. Indices are clamped to the
# matrix dimensions so a small corpus does not cause a subscript error.
tdm[
  intersect(100:109, seq_len(nrow(tdm))),
  seq_len(min(20, ncol(tdm))),
  drop = FALSE
]

#Repeat run: Bar plot

##### Word cloud #####

p_load(wordcloud)

# Cloud of the thresholded term counts. w_sub is not created in this
# section; it must already exist in the workspace.
wordcloud(words = names(w_sub), freq = w_sub)

# Counts for every term in the current matrix, most frequent first.
w_sub1 <- sort(rowSums(tdm), decreasing = TRUE)
head(w_sub1)

wordcloud(words = names(w_sub1), freq = w_sub1) # all words are considered

# Better visualization: most frequent words placed first, coloured, with a
# share of the words rotated.
wordcloud(
  words = names(w_sub1), freq = w_sub1,
  random.order = FALSE, colors = rainbow(30),
  scale = c(2, 0.5), rot.per = 0.4
)

# Open a fresh graphics device for the next cloud. dev.new() opens the
# platform's default device, whereas windows() exists only on Windows.
dev.new()

wordcloud(
  words = names(w_sub1), freq = w_sub1,
  random.order = FALSE, colors = rainbow(30),
  scale = c(3, 0.5), rot.per = 0.3
)

# Open the help page for wordcloud().
?wordcloud

# For this review of the Apple Watch, the key things discussed are
# data/cellular, hours, screen, and heart rate monitor.

You might also like