# Copy Entire Document Content in R Studio
#
# Updated R Script compiled by Mr. Anup Sharma (strictly to be used as class notes)
#### COPY ENTIRE DOCUMENT CONTENT IN R STUDIO #####
#===================================================
#----------------basics-------------------------
#===================================================
#### NUMERIC VARIABLES ####
# R accepts three assignment spellings; each is shown once for reference.
x = 2    # equals-sign form
x <- 2   # left-arrow form
3 -> y   # right-arrow form
# Chained assignment: z and y both end up holding 4 (y is overwritten).
z <- y <- 4
# Drop z from the workspace again.
rm(z)
# Inspect the class of x.
class(x)
# Coerce x from numeric to integer.
x <- as.integer(x)
# Inspect the class of x again after the conversion.
class(x)
# An integer still counts as numeric, so this is TRUE.
is.numeric(x)
# x is now stored as an integer.
is.integer(x)
#===================================================
#### Character Variables ####
# Store the text "A Grade" in the variable a.
a <- "A Grade"
# Number of characters in a (the space is counted as well).
nchar(a)
# NOTE: a page header ("Updated R Script Compiled by Mr. Anup Sharma (Strictly
# to be used as class notes)") was pasted here as bare text, which stopped the
# script from parsing. It is preserved only inside this comment.
# nchar() also accepts a number; 3000 has four characters.
nchar(3000)
#===================================================
#### Date Variable ####
# Store a fixed date-time in b; the text is written as "YYYY-MM-DD HH:MM".
b <- as.POSIXct("2019-11-15 10:30")
# Show the class of the resulting date-time object.
class(b)
#===================================================
#### Logical Variable ####
# Store one logical constant in each of the variables c and d.
c <- TRUE
d <- FALSE
# Show the class of both variables.
class(c)
class(d)
# Comparisons between x and y (set in the numeric section) also give logicals.
# Is x strictly greater than y?
x > y
# Is y less than or equal to x?
y <= x
# Are x and y equal?
x == y
# Are x and y different?
x != y
#### VECTORS ####
# Letter grades of the 10 students.
grades <- c("a", "b", "a", "c", "d", "a", "b", "a", "c", "d")
# Marks of the same 10 students.
marks <- c(90, 85, 98, 72, 54, 93, 86, 90, 70, 45)
# Roll numbers 1 to 10.
rno <- seq_len(10)
# Award 2 bonus marks to everybody (arithmetic is applied element by element).
marks <- marks + 2
marks
# Rescale the marks to a 10-point scale and keep the result in slab.
slab <- marks / 10
slab
# Grade of the 3rd student.
grades[3]
# Slab values of the 3rd and 5th students.
slab[c(3, 5)]
# Marks of the last three roll numbers (8, 9 and 10).
marks[8:10]
#===================================================
#### Factors ####
# Turn the grades into a factor, keep it in gf, and print it with its levels.
gf <- as.factor(grades)
gf
# Show the numeric code stored behind each factor value.
as.numeric(gf)
#===================================================
#### NA ####
# Append roll number 11, whose marks and grade are missing (NA).
rno <- c(rno, 11)
marks <- c(marks, NA)
grades <- c(grades, NA)
# Recompute slab so that it covers the new student too.
slab <- marks / 10
slab
# Mean class marks: with an NA in the data the plain mean is itself NA ...
mean(marks)
# ... so ask mean() to strip the NA values before calculating.
mean(marks, na.rm = TRUE)
#===================================================
#### Pipes ####
# Attach magrittr, which provides the %>% pipe operator.
library(magrittr)
# Mean of marks through a pipe: the result is NA because marks contains an NA.
marks %>% mean()
# This is NOT the mean without NA: it averages the TRUE/FALSE flags from
# is.na(), i.e. it gives the share of values that are NA.
marks %>%
  is.na() %>%
  mean()
# Correct version: pass na.rm = TRUE on to mean().
marks %>% mean(na.rm = TRUE)
#===================================================
#### DATA.FRAMES ####
# Combine roll numbers, marks, slab and grades into one data frame.
myclass <- data.frame(
  Roll_Number = rno,
  Marks = marks,
  CGPA = slab,
  Grades = grades
)
myclass
# Dimensions: rows first, then columns.
dim(myclass)
# Row count and column count on their own.
nrow(myclass)
ncol(myclass)
# Row labels and column labels.
rownames(myclass)
colnames(myclass)
# First and last rows of the data frame.
head(myclass)
tail(myclass)
# The grades column, selected by position and then by name.
myclass[, 4]
myclass[, "Grades"]
# Grade of the 5th roll number (row 5, column 4).
myclass[5, 4]
# Complete record of the 5th roll number.
myclass[5, ]
# Roll numbers together with their grades (columns 1 and 4).
myclass[, c(1, 4)]
#==========-{ GGPLOT }=======
library(ggplot2)
# Load the diamonds data set and work on a copy named mydata.
data(diamonds)
mydata <- diamonds
head(mydata)
# Base-graphics histogram of carat.
hist(mydata$carat, main = "Carat Histogram", xlab = "Carat")
# Base-graphics scatterplot of price against carat.
plot(price ~ carat, data = mydata)
# Base-graphics boxplot of carat.
boxplot(mydata$carat)
##### Using ggplot2 #####
# Histogram of carat.
ggplot(data = mydata) +
  geom_histogram(aes(x = carat))
# Density curve of carat, filled in sky blue.
ggplot(data = mydata) +
  geom_density(aes(x = carat), fill = "skyblue")
# Scatterplot of price against carat.
ggplot(mydata, aes(x = carat, y = price)) +
  geom_point()
# Keep the plot base in g so that layers can be added to it afterwards.
g <- ggplot(mydata, aes(x = carat, y = price))
# Points coloured by the diamond's color grade.
g + geom_point(aes(color = color))
# Faceted plots: one panel per color, then a cut-by-clarity grid.
g +
  geom_point(aes(color = color)) +
  facet_wrap(~color)
g +
  geom_point(aes(color = color)) +
  facet_grid(cut ~ clarity)
# Histogram of carat, one panel per color.
ggplot(mydata, aes(x = carat)) +
  geom_histogram() +
  facet_wrap(~color)
# Boxplots: a single box for all carats, then one box per cut.
ggplot(mydata, aes(y = carat, x = 1)) +
  geom_boxplot()
ggplot(mydata, aes(y = carat, x = cut)) +
  geom_boxplot()
# Line plot of pop over date from the economics data set.
ggplot(economics, aes(x = date, y = pop)) +
  geom_line(color = "red")
#### AREA PLOT ######
library(ggplot2)
# Build the data: x runs from 1 to 50, y is a running total of 50 random
# normal draws (so the picture differs on every run).
xValue <- 1:50
yValue <- cumsum(rnorm(50))
data <- data.frame(xValue, yValue)
# Draw the area under the y values.
ggplot(data) +
  geom_area(aes(x = xValue, y = yValue))
#### BAR PLOT######
# Data: one length value per dose.
df <- data.frame(
  dose = c("D0.5", "D1", "D2"),
  len = c(4.2, 10, 29.5)
)
# Bar plot with the value printed in white inside the top of each bar.
ggplot(data = df, aes(x = dose, y = len)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  geom_text(aes(label = len), vjust = 1.6, color = "white", size = 3.5) +
  theme_minimal()
# Data: the same doses, now for two supplement groups (VC and OJ).
df2 <- data.frame(
  supp = rep(c("VC", "OJ"), each = 3),
  dose = rep(c("D0.5", "D1", "D2"), 2),
  len = c(6.8, 15, 33, 4.2, 10, 29.5)
)
# Stacked bar plot: the groups are placed on top of each other.
ggplot(data = df2, aes(x = dose, y = len, fill = supp)) +
  geom_bar(stat = "identity")
# Dodged bar plot: position_dodge() puts the groups side by side.
ggplot(data = df2, aes(x = dose, y = len, fill = supp)) +
  geom_bar(stat = "identity", position = position_dodge())
########### dot plot #####################
# Work on a copy of the ToothGrowth data set.
dd <- ToothGrowth
head(dd)
# Class of the dose column and of the whole object.
class(dd$dose)
class(dd)
# Treat dose as a category so that every dose gets its own position on x.
dd$dose <- as.factor(dd$dose)
# Dots are binned along y and stacked symmetrically around each dose.
ggplot(data = dd) +
  geom_dotplot(
    aes(x = dose, y = len, color = supp),
    binaxis = "y",
    stackdir = "center"
  )
### PIE CHART ####

# Data: one length value per dose.
df <- data.frame(
  dose = c("D0.5", "D1", "D2"),
  len = c(4.2, 10, 29.5)
)
# Start from a single stacked bar (empty x label, filled by dose) ...
bp <- ggplot(data = df, aes(x = "", y = len, fill = dose)) +
  geom_bar(stat = "identity")
# ... and bend its y axis into a circle to obtain the pie.
pie <- bp + coord_polar("y", start = 0)
pie
############## correlogram ###########
##### install.packages("GGally")
# Install corrplot only when it is missing, so that re-running the script does
# not reinstall the package every time.
if (!requireNamespace("corrplot", quietly = TRUE)) {
  install.packages("corrplot")
}
library(corrplot)
# Correlation matrix of columns 2, 3 and 5 of mtcars, rounded to 2 decimals.
h <- cor(mtcars[, c(2, 3, 5)])
corr <- head(round(h, 2))
# The same matrix drawn with four different corrplot methods.
corrplot(corr, method = "circle")
corrplot(corr, method = "pie")
corrplot(corr, method = "color")
corrplot(corr, method = "number")
#####################################################################
#=========-{ DPLYR }========
library(dplyr)
## Use of pipes: take the first 4 rows, then report their dimensions
diamonds %>% head(4) %>% dim
class(diamonds)
head(diamonds)
diamonds
## Print the carat and price columns, without and with a pipe
select(diamonds, carat, price)
diamonds %>% select(carat, price)
## The same two columns picked by position (columns 1 and 7)
diamonds %>% select(1, 7)
## Print every column except carat and price
diamonds %>% select(c(-carat, -price))
## Filter where cut is Ideal
diamonds %>% filter(cut == "Ideal")
diamonds[diamonds$cut == "Ideal", ] ### Base R equivalent
## Filter where cut is Ideal or Good.
## %in% tests membership. Writing cut == c("Ideal", "Good") would recycle the
## two values down the column, compare each row with only one of them, and
## silently miss matching rows.
diamonds %>% filter(cut %in% c("Ideal", "Good"))
## Filter where price >= 1000
diamonds %>% filter(price >= 1000)
## Several conditions joined by AND: carat > 2 and price < 14000.
## A comma and & give the same result inside filter().
diamonds %>% filter(carat > 2, price < 14000)
diamonds %>% filter(carat > 2 & price < 14000)
## OR condition: carat < 1 or carat > 5
diamonds %>% filter(carat < 1 | carat > 5)
## Comparison with a literal value
diamonds %>% filter(cut == "Ideal")
## The same comparison with the value held in a variable
ia <- "Ideal"
diamonds %>% filter(cut == ia)
## Display selected rows: 1 to 5, 8 and 15 to 20
diamonds %>% slice(c(1:5, 8, 15:20))
## Display all rows except the 1st
diamonds %>% slice(-1)

## Add one more column: ratio = price / carat
diamonds %>% mutate(ratio = price / carat)
## A column created in mutate() can be used later in the same call
diamonds %>%
  select(carat, price) %>%
  mutate(ratio = price / carat, double = ratio * 2)
### The assignment pipe %<>% comes from the magrittr package
library(magrittr)
# Work on a copy. %<>% pipes the left-hand object through the chain and writes
# the result back into it, so it must be applied to diamonds2. Applying it to
# diamonds would overwrite that data set and remove the cut and color columns
# that the group_by() examples below rely on.
diamonds2 <- diamonds
diamonds2 %<>%
  select(carat, price) %>%
  mutate(ratio = price / carat, double = ratio * 2)
diamonds2
###
## Summary: mean price, without and with a pipe
summarize(diamonds, mean(price))
diamonds %>% summarize(mean(price))
## Group by: average price and total carat per cut, then per cut and color
diamonds %>%
  group_by(cut) %>%
  summarize(AvgPr = mean(price), SumCarat = sum(carat))
diamonds %>%
  group_by(cut, color) %>%
  summarize(AvgPr = mean(price), SumCarat = sum(carat))
################## time series & forecasting #######
# Twelve rainfall readings.
rainfall <- c(12, 43, 32, 23, 23, 31, 56, 20, 30, 12, 45, 78)
# Turn them into a time series that starts in period 6 of 2017; frequency is
# the number of data points per year.
rats <- ts(rainfall, start = c(2017, 6), frequency = 12)
rats
# A second set of twelve readings.
rainfall1 <- c(12, 21, 33, 24, 23, 15, 22, 12, 12, 12, 13, 70)
# Put both vectors side by side in a 12-row matrix (one column per vector).
comr <- matrix(c(rainfall, rainfall1), nrow = 12)
# Convert the matrix into a two-column time series object.
rats1 <- ts(comr, start = c(2017, 6), frequency = 12)
library(forecast)
# Keep the series to be analysed in dd (AirPassengers is another one to try).
dd <- rats
# First period, last period, frequency and class of the series.
start(dd)
end(dd)
frequency(dd)
class(dd)
# Let auto.arima() choose and fit an ARIMA model.
d1 <- auto.arima(dd)
# Predict the next 3 periods.
d2 <- predict(d1, n.ahead = 3)
# Plot the actual series (blue, solid) and the prediction (red, dotted) together.
ts.plot(dd, d2$pred, col = c("blue", "red"), lty = c(1, 3))
########## Machine Learning: text mining setup ##########
# Install only the packages that are missing, instead of reinstalling all of
# them on every run of the script.
text_pkgs <- c("tm", "SnowballC", "topicmodels", "wordcloud", "sentimentr", "syuzhet")
missing_pkgs <- text_pkgs[
  !vapply(text_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
library(tm)
library(SnowballC)
library(topicmodels)
library(wordcloud)
# NOTE(review): plyr is attached after dplyr (loaded in the DPLYR section), so
# plyr's summarize()/mutate() mask the dplyr versions from here on.
library(plyr)
library(dplyr)
library(stringr)
library(ggplot2)
library(httr)
library(reshape2)
library(sentimentr)
library(scales)
library(RCurl)
library(syuzhet)
getwd()
# list.files() takes a regular expression, not a shell glob: "\\.txt$" matches
# names that end in ".txt".
filenames <- list.files(getwd(), pattern = "\\.txt$")
# Read every file and join its lines into one string, so that each file becomes
# exactly one document (one element of a character vector).
files <- vapply(
  filenames,
  function(path) paste(readLines(path, warn = FALSE), collapse = " "),
  character(1),
  USE.NAMES = FALSE
)
# Create the corpus from the character vector: one document per file.
articles.corpus <- Corpus(VectorSource(files))
class(articles.corpus)
## Text Preprocessing
# Make each letter lowercase. tolower() is a plain base R function, so it is
# wrapped in content_transformer() to make tm_map() hand back proper text
# documents rather than bare character strings.
articles.corpus <- tm_map(articles.corpus, content_transformer(tolower))
# Remove punctuation.
articles.corpus <- tm_map(articles.corpus, removePunctuation)
# Remove numbers.
articles.corpus <- tm_map(articles.corpus, removeNumbers)
# Print the generic stopword list, then remove those words from the corpus.
stopwords()
articles.corpus <- tm_map(articles.corpus, removeWords, stopwords())
# Remove a few custom stopwords as well.
articles.corpus <- tm_map(
  articles.corpus,
  removeWords,
  c("and", "the", "have", "was", "with")
)

# Visualization - Wordcloud
library(wordcloud)
# random.order = FALSE plots the words in decreasing order of frequency.
wordcloud(articles.corpus, random.order = FALSE)
# Create TDM
# A term-document matrix represents the words in the text as a table (matrix)
# of counts. Each row is a term (a word from the text) and each column is a
# document; a cell holds how often that term occurs in that document.
tdm <- TermDocumentMatrix(articles.corpus)
class(tdm)
# Convert to an ordinary matrix so that it can be printed and summed directly.
tdm <- as.matrix(tdm)
tdm
# Total count of every term across all documents (one sum per row).
termfreq <- rowSums(tdm)
termfreq
# Subsetting TDM: keep only the terms that occur at least 4 times
termfreqsubset <- subset(termfreq, termfreq >= 4)
class(termfreqsubset)
# Build a data frame with one row per term and its frequency
library(ggplot2)
tdmdf <- data.frame(
  term = names(termfreqsubset),
  freq = termfreqsubset
)
View(tdmdf)
# Horizontal bar chart of the term counts, with small axis text
tdmplot <- ggplot(tdmdf, aes(x = term, y = freq)) +
  geom_bar(stat = "identity") +
  xlab("Terms") +
  ylab("Count") +
  coord_flip() +
  theme(axis.text = element_text(size = 6))
tdmplot
# Wordcloud built from term frequencies
wc <- as.matrix(tdm) # make sure we are working with a plain matrix
# Total count per term, most frequent first
wordfreq <- sort(rowSums(wc), decreasing = TRUE)
# Colors: the "BuGn" palette with 9 shades, minus its first four entries
pal <- brewer.pal(9, "BuGn")[-(1:4)]
# Print the names of all built-in colors
colors()
# Draw words that occur at least 3 times, most frequent words first
nwc <- wordcloud(
  words = names(wordfreq),
  freq = wordfreq,
  min.freq = 3,
  random.order = FALSE,
  colors = pal
)
################################################
# Sentiment Analysis
library(sentimentr)
library(syuzhet) # get_nrc_sentiment() is provided by syuzhet
class(articles.corpus)
# Pull the text of every document out of the corpus, one string per document.
# Calling as.character() on the whole corpus object would convert the corpus
# container itself rather than return the document texts.
a <- vapply(
  seq_along(articles.corpus),
  function(i) paste(as.character(articles.corpus[[i]]), collapse = " "),
  character(1)
)
# Check the class only after a has been filled with the document texts;
# before that, a still holds the value from the character-variable section.
class(a)
# One row of NRC emotion/sentiment counts per document.
mysentiment <- get_nrc_sentiment(a)
# Add the counts up over all documents: one total per sentiment.
SentimentScores <- data.frame(colSums(mysentiment[, ]))
SentimentScores
# Giving a name to the scores column
names(SentimentScores) <- "Score"
SentimentScores
# Copy the row names into a proper column called "sentiment"
SentimentScores <- cbind(
  "sentiment" = rownames(SentimentScores),
  SentimentScores
)
SentimentScores
# Removing the row names, which are now redundant
rownames(SentimentScores) <- NULL
SentimentScores
# Plotting the sentiment scores: one bar per sentiment, no legend
ggplot(SentimentScores, aes(x = sentiment, y = Score)) +
  geom_bar(aes(fill = sentiment), stat = "identity") +
  theme(legend.position = "none") +
  xlab("Sentiment") + ylab("Score") +
  ggtitle("Total Sentiment Score")
# Topic Modeling
# Latent Dirichlet allocation (LDA) models are a widely used topic modeling technique.
# Create DTM (documents in rows, terms in columns).
# wordLengths = c(3, Inf) keeps terms with at least 3 characters; this is the
# control option tm reads for word length (minWordLength is not one of them).
articleDtm <- DocumentTermMatrix(
  articles.corpus,
  control = list(wordLengths = c(3, Inf))
)
# Number of topics to fit and a fixed seed for the Gibbs sampler.
k <- 3
SEED <- 1234
article.lda <- LDA(
  articleDtm,
  k,
  method = "Gibbs",
  control = list(seed = SEED)
)
# Most likely topic of every document.
lda.topics <- as.matrix(topics(article.lda))
lda.topics
# Top term of every topic.
lda.terms <- terms(article.lda)
lda.terms
################## Machine learning ################
library(caret)
library(kernlab)
library(e1071)
### Naive Bayes Model ###
# install.packages("klaR")
library(klaR)
# Work on a copy of iris.
data1 <- iris
colnames(data1)
# Row indices for an 80% training sample, drawn within each Species.
partition <- createDataPartition(data1$Species, p = 0.80, list = FALSE)
partition
# Training rows are the sampled ones; test rows are all the others.
datatr <- data1[partition, ]
datate <- data1[-partition, ]
# Fit naive Bayes ("nb") with Species explained by all other columns.
modelfit <- train(Species ~ ., data = datatr, method = "nb")
# Predict the test rows and compare the predictions with the true species.
prediction <- predict(modelfit, newdata = datate)
confusionMatrix(prediction, datate$Species)
### Logistic Regression Model ###
# install.packages("mlbench")
# Load BreastCancer from mlbench and keep only the rows without missing values.
data(BreastCancer, package = "mlbench")
bc <- BreastCancer[complete.cases(BreastCancer), ]
# Row indices for an 80% training sample, drawn within each Class.
partition <- createDataPartition(bc$Class, p = 0.80, list = FALSE)
partition
datatr <- bc[partition, ]
datate <- bc[-partition, ]
# Binomial GLM (logistic regression) of Class on Cell.shape.
modelfit <- train(
  Class ~ Cell.shape,
  data = datatr,
  method = "glm",
  family = "binomial"
)
# Predict the test rows and compare the predictions with the true classes.
prediction <- predict(modelfit, newdata = datate)
confusionMatrix(prediction, datate$Class)
## Example 2 for logistic regression: the spam data set
data(spam)
z1 <- spam
head(z1)
# Row indices for a 75% training sample, drawn within each type.
z2 <- createDataPartition(z1$type, p = 0.75, list = FALSE)
ztrain <- z1[z2, ]
ztest <- z1[-z2, ]
# Binomial GLM (logistic regression) of type on all other columns.
zfit <- train(type ~ ., data = ztrain, method = "glm", family = "binomial")
# Predict the test rows and compare the predictions with the true types.
pred <- predict(zfit, newdata = ztest)
confusionMatrix(pred, ztest$type)
# Clustering
iris
# Drop column 5 (Species) so that only the numeric measurements remain.
iris1 <- iris[, -5]
##################################################
# Sepal length against sepal width, both taken from the same data frame.
plot(iris1$Sepal.Length, iris1$Sepal.Width)
# k-means with 3 clusters. kmeans() expects the observations themselves (one
# row per flower), not a distance matrix: passing dist(iris1) would cluster
# the rows of the distance table instead of the flowers' measurements.
clu <- kmeans(iris1, centers = 3)
clu$cluster
library(dplyr)
# Attach each flower's cluster number as a new column.
iris2 <- iris1 %>%
  mutate(clusters = clu$cluster)
head(iris2)
#########################################
# hierarchical clustering
# Take the four numeric measurements straight from iris (column 5 is Species).
# The earlier form iris1[, -5] asked to drop a fifth column from a data frame
# that has only four, and only worked if the k-means section had run first.
iris2 <- iris[, -5]
# Ward's method on the distances between flowers.
h1 <- hclust(dist(iris2), method = "ward.D2")
# Dendrogram with small labels so that all 150 leaves stay readable.
plot(h1, cex = 0.2)

Copy Entire Document Content in R Studio

Uploaded by

Document Information

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

Copy Entire Document Content in R Studio

Uploaded by

Copyright:

Available Formats

Updated R Script Compiled by Mr.

Anup Sharma (Strictly to be used as class notes)

#### COPY ENTIRE DOCUMENT CONTENT IN R STUDIO #####

#### NUMERIC VARIABLES ####

# Assign value to a variable

# Assign value to multiple variables

# Check type of variable x

# Change type of variable x from numeric to Integer

# Check x type again

# Check if x is numeric variable now or not

is.numeric(x) # True because integer is subset of Numeric

# Check if x is Integer variable now or not

#### Character Variables ####

# Assign "A Grade" text to a variable a

# Check length of a number 3000

#### Date Variable ####

# Assign todays date to variable b

b = as.POSIXct("2019-11-15 10:30") # as YYYY-MM-DD

# Check class of variable b

#### Logical Variable ####

# Assign a logical value to variable c and d

# check type of variable c and d

#### VECTORS ####

# Make a vector "grades" holding grades for 10 students

# Make a vector "marks" holding marks of 10 students

# Make a vector "rno" holding rollno of 10 students

# Add bonus marks = 2 and update marks variable

# Convert marks to 10 scale and assign to slab variable

# What's grade of 3rd student?

# What are slab for 3rd and 5th student?

# What are marks for last three roll nos?

#### Factors ####

# Check factors for grades

# Check numeric value of factor variables

# Add roll no 11 with marks NA and grades NA

# Mean class marks?

mean(marks) ## Wrong answer as NA considered

mean(marks,na.rm=TRUE) ## True to strip NA before calculation

#### Pipes ####

# Add package magrittr from library

# Find mean marks

marks%>%mean ## Error as NA included

# Find mean after scraping NA

marks%>%is.na%>%mean # wrong as it's average no of NA

#### DATA.FRAMES ####

# Make a dataframe with variables rno,marks,grades and slab

dim(myclass) ## Row and Columns

# List of Row names

# List of Column names

# grade of 5th roll number

# performance of 5th roll number

# Roll number wise grades

#==========-{ GGPLOT }=======

##### Using ggplot2#####################################

# Histogram for discrete measurement

# Density for continous measurement

# save previous thing to g variable to add layers

#Make faceted plots

#Facet with histogram

#### AREA PLOT ######

#### BAR PLOT######

df=data.frame(dose=c("D0.5", "D1", "D2"),len=c(4.2, 10, 29.5))

ggplot(data=df, aes(x=dose, y=len)) +