Professional Documents
Culture Documents
RDataMining Slides Text Mining
RDataMining Slides Text Mining
Yanchang Zhao
http://www.RDataMining.com
28 May 2015
Presented at AusDM 2014 (QUT, Brisbane) in Nov 2014 and at UJAT (Mexico) in Sept 2014
1 / 34
Outline
Introduction
Extracting Tweets
Text Cleaning
Frequent Words and Associations
Word Cloud
Clustering
Topic Modelling
Online Resources
2 / 34
Text Mining
text categorization
text clustering
entity extraction
sentiment analysis
document summarization
...
3 / 34
Chapter 10: Text Mining, R and Data Mining: Examples and Case Studies.
http://www.rdatamining.com/docs/RDataMining.pdf
4 / 34
Outline
Introduction
Extracting Tweets
Text Cleaning
Frequent Words and Associations
Word Cloud
Clustering
Topic Modelling
Online Resources
5 / 34
Retrieve Tweets
Retrieve recent tweets by @RDataMining
## Option 1: retrieve tweets from Twitter
# load the twitteR package for access to the Twitter API
library(twitteR)
# retrieve up to 3200 recent tweets posted by user @RDataMining
tweets <- userTimeline("RDataMining", n = 3200)
6 / 34
14
Outline
Introduction
Extracting Tweets
Text Cleaning
Frequent Words and Associations
Word Cloud
Clustering
Topic Modelling
Online Resources
8 / 34
# load the tm package for text mining
library(tm)
# build a corpus, and specify the source to be character vectors
myCorpus <- Corpus(VectorSource(tweets.df$text))
# convert to lower case; tm v0.6 requires wrapping a plain function
# with content_transformer() so the document class is preserved
# tm v0.6
myCorpus <- tm_map(myCorpus, content_transformer(tolower))
# tm v0.5-10
# myCorpus <- tm_map(myCorpus, tolower)
# remove URLs: delete "http" followed by any run of non-space characters
removeURL <- function(x) gsub("http[^[:space:]]*", "", x)
# tm v0.6
myCorpus <- tm_map(myCorpus, content_transformer(removeURL))
# tm v0.5-10
# myCorpus <- tm_map(myCorpus, removeURL)
9 / 34
# remove punctuation
myCorpus <- tm_map(myCorpus, removePunctuation)
# remove numbers
myCorpus <- tm_map(myCorpus, removeNumbers)
10 / 34
11 / 34
# tm v0.5-10
# myCorpus <- tm_map(myCorpus, stemCompletion)
# tm v0.6
# Complete stemmed words back to full words, looking them up in a
# dictionary built from the original (pre-stemming) corpus copy.
# Workaround for tm v0.6, where stemCompletion no longer applies
# directly to a corpus (see the Stack Overflow link below).
stemCompletion2 <- function(x, dictionary) {
  words <- unlist(strsplit(as.character(x), " "))
  # Unexpectedly, stemCompletion completes an empty string to
  # a word in dictionary; drop empty strings to avoid that issue.
  words <- words[words != ""]
  completed <- stemCompletion(words, dictionary = dictionary)
  PlainTextDocument(stripWhitespace(paste(completed, collapse = " ")))
}
myCorpus <- lapply(myCorpus, stemCompletion2, dictionary = myCorpusCopy)
myCorpus <- Corpus(VectorSource(myCorpus))
##
##
##
##
##
3
3
http://stackoverflow.com/questions/25206049/stemcompletion-is-not-working
12 / 34
13 / 34
<<TermDocumentMatrix
Non-/sparse entries:
Sparsity
:
Maximal term length:
Weighting
:
14 / 34
Outline
Introduction
Extracting Tweets
Text Cleaning
Frequent Words and Associations
Word Cloud
Clustering
Topic Modelling
Online Resources
15 / 34
<<TermDocumentMatrix
Non-/sparse entries:
Sparsity
:
Maximal term length:
Weighting
:
Docs
Terms
101 102 103 104 105 106 107 108 109 110
r
0
1
1
0
0
0
0
0
1
1
ramachandran
0
0
0
0
0
0
0
0
0
0
random
0
0
0
0
0
0
0
0
0
0
ranked
0
0
0
0
0
0
0
0
0
0
rann
0
0
0
0
0
0
0
0
0
0
rapidmining
0
0
0
0
0
0
0
0
0
0
16 / 34
[1]
[4]
[7]
[10]
[13]
[16]
[19]
[22]
"analysis"
"book"
"data"
"introduction"
"package"
"research"
"social"
"use"
"application"
"code"
"example"
"mining"
"position"
"see"
"tutorial"
"big"
"computational"
"group"
"network"
"r"
"slides"
"university"
17 / 34
Terms
library(ggplot2)
# horizontal bar chart of term frequencies: one bar per term,
# flipped so term labels are readable on the y-axis
ggplot(df, aes(x = term, y = freq)) + geom_bar(stat = "identity") +
xlab("Terms") + ylab("Count") + coord_flip()
use
university
tutorial
social
slides
see
research
r
position
package
network
mining
introduction
group
example
data
computational
code
book
big
application
analysis
0
50
100
150
Count
18 / 34
data
mahout
recommendation
sets
supports
frequent
itemset
mining
0.48
0.30
0.30
0.30
0.30
0.27
0.26
19 / 34
# load packages for rendering a term network graph
library(graph)
library(Rgraphviz)
# plot a network of the frequent terms; an edge connects two terms
# whose correlation in the term-document matrix exceeds 0.1
# (TRUE instead of T: T is an ordinary variable and can be reassigned)
plot(tdm, term = freq.terms, corThreshold = 0.1, weighting = TRUE)
use
group
university
tutorial
social
network
analysis
package
computational
mining
research
position
data
big
see
example
code
book
application
introduction
slides
20 / 34
Outline
Introduction
Extracting Tweets
Text Cleaning
Frequent Words and Associations
Word Cloud
Clustering
Topic Modelling
Online Resources
21 / 34
# convert the term-document matrix to an ordinary matrix
m <- as.matrix(tdm)
# calculate the frequency of words and sort it by frequency
# (TRUE instead of T: T is an ordinary variable and can be reassigned)
word.freq <- sort(rowSums(m), decreasing = TRUE)
# colors: take the "BuGn" sequential palette and drop the 4 lightest
# shades, keeping only the darker, more legible colours
pal <- brewer.pal(9, "BuGn")
pal <- pal[-(1:4)]
22 / 34
search
dynamic
area
social
network
position
simple
chapter
open
video
vacancies
titled
conference
postdocresearch
research
example
analysis big
use
can
notes
slides
csiro
application
studies
web
group
package
tutorial
data
mining
computational university
top
postdoctoral
short
china
media
distributed
snowfall opportunity
contain
performance
itemset
charts
knowledge frequent
canberra canada visit
sydney
initial
elsevier cloud
visualisations thanks
industrial
experience
technique
outlier
hadoop answers
added quick
start text
work plot ibm new
forecasting
document
due pdf analytics
week present
tried format
modeling
introduction fellow
youtubegoogle
interacting
time nd job
mid software
amp
machine
book
access
view rdatamining
reference
tj
project statistical
graphmay
southern scientist
talk
sentiment case now
fast create
see
published
postdoc edited
large
watson
list
technological
online
engineer
cfp recent
th
free
high
guidance
us
tool
user
april
topic
linkedin
australia
find
handling
igraph
parallel get
build series
jan graphical
rstudio
detection
code
business
follow
associate
website process
rule
ausdm card
detailed map
melbourne
dmapps
learn
call
program california senior
event
language database science lecture coursealso
spatial easier
prediction centerlab cluster
analyst
poll
dataset
draft
check function join twitterpage submission
singapore state kdnuggets
provided advanced
cran
comment classification dec
informatics management
wwwrdataminingcom
tweet
track microsoft
san
23 / 34
Outline
Introduction
Extracting Tweets
Text Cleaning
Frequent Words and Associations
Word Cloud
Clustering
Topic Modelling
Online Resources
24 / 34
25 / 34
big
use
slides
data
20
mining
package
example
tutorial
computational
book
application
social
network
analysis
university
position
research
10
30
Height
40
50
60
# plot the dendrogram of the hierarchical clustering result
plot(fit)
# draw rectangles around the branches, cutting the tree into 6 clusters
rect.hclust(fit, k = 6)
# cut tree into 6 clusters
Cluster Dendrogram
26 / 34
1
2
3
4
5
6
1
2
3
4
5
6
1
2
3
analysis application
big book computational
0.136
0.076 0.136 0.015
0.061
0.026
0.154 0.154 0.256
0.026
0.857
0.000 0.000 0.000
0.000
0.078
0.026 0.013 0.052
0.052
0.083
0.024 0.000 0.048
0.107
0.091
0.061 0.152 0.000
0.000
mining network package position
r research
0.409
0.000
0.015
0.076 0.197
0.030
1.128
0.000
0.205
0.000 0.974
0.026
0.095
0.952
0.095
0.190 0.286
0.048
0.104
0.013
0.117
0.039 0.000
0.013
0.107
0.036
0.190
0.000 1.190
0.000
0.091
0.000
0.000
0.667 0.000
0.970
tutorial university
use
0.076
0.030 0.015
0.000
0.000 0.231
0.190
0.048 0.095
data example
1.015
0.030
1.487
0.231
0.048
0.095
0.000
0.065
0.024
0.274
0.515
0.000
slides social
0.091 0.000
0.051 0.000
0.095 0.810
0.104 0.013
0.167 0.000
0.000 0.121
27 / 34
# print the 5 most heavily weighted terms of each k-means cluster
# (seq_len(k) instead of 1:k: safe when k == 0;
#  TRUE instead of T: T is an ordinary variable and can be reassigned)
for (i in seq_len(k)) {
cat(paste0("cluster ", i, ": "))
# sort this cluster's centre so the top-weighted terms come first
s <- sort(kmeansResult$centers[i, ], decreasing = TRUE)
cat(names(s)[1:5], "\n")
# print the tweets of every cluster
# print(tweets[which(kmeansResult$cluster==i)])
}
##
##
##
##
##
##
cluster
cluster
cluster
cluster
cluster
cluster
1:
2:
3:
4:
5:
6:
28 / 34
Outline
Introduction
Extracting Tweets
Text Cleaning
Frequent Words and Associations
Word Cloud
Clustering
Topic Modelling
Online Resources
29 / 34
Topic Modelling
# convert the term-document matrix into a document-term matrix,
# the orientation that LDA expects
dtm <- as.DocumentTermMatrix(tdm)
library(topicmodels)
lda <- LDA(dtm, k = 8) # find 8 topics
(term <- terms(lda, 6)) # first 6 terms of every topic
##
##
##
##
##
##
##
##
##
##
##
##
##
##
[1,]
[2,]
[3,]
[4,]
[5,]
[6,]
[1,]
[2,]
[3,]
[4,]
[5,]
[6,]
Topic 1
Topic 2
Topic 3
Topic 4
"r"
"data"
"mining" "r"
"example" "introduction" "r"
"package"
"code"
"big"
"data"
"use"
"mining" "analysis"
"text"
"group"
"data"
"slides"
"package" "example"
"rule"
"mining"
"time"
"cluster"
Topic 6
Topic 7
Topic 8
"data"
"research"
"analysis"
"r"
"position"
"r"
"job"
"data"
"network"
"mining"
"university" "computational"
"lecture"
"analytics" "tutorial"
"university" "scientist" "slides"
Topic 5 ...
"data"
...
"mining" ...
"applicat...
"r"
...
"use"
...
"due"
...
Topic Modelling
# first topic identified for every document (tweet)
topic <- topics(lda, 1)
# pair each tweet's creation date with its most likely topic
topics <- data.frame(date=as.IDate(tweets.df$created), topic)
# stacked density plot of topics over time, filled by the
# top terms labelling each topic
qplot(date, ..count.., data=topics, geom="density",
fill=term[topic], position="stack")
0.4
term[topic]
analysis, r, network, computational, tutorial, slides
data, introduction, big, analysis, slides, mining
count
0.2
0.0
201107
201201
201207
201301
201307
31 / 34
Outline
Introduction
Extracting Tweets
Text Cleaning
Frequent Words and Associations
Word Cloud
Clustering
Topic Modelling
Online Resources
32 / 34
Online Resources
33 / 34
The End
Thanks!
Email: yanchang(at)rdatamining.com
34 / 34