# You are on page 1 of 17

# Updated R Script Compiled by Mr. Anup Sharma (Strictly to be used as class notes)

#### COPY ENTIRE DOCUMENT CONTENT IN R STUDIO #####

#===================================================

#----------------basics-------------------------

#===================================================

#### NUMERIC VARIABLES ####

# R offers several assignment spellings; all three statements below bind a
# value to a name.
x <- 2 # originally written with `=`
x <- 2 # leftward arrow
y <- 3 # originally written with the rightward arrow (3 -> y)

# Chained assignment: y is overwritten with 4 and z receives the same value.
z <- y <- 4

# Drop z from the workspace again.
rm(z)

# x was created from the literal 2, so its class is "numeric".
class(x)

# Coerce x to integer storage.
x <- as.integer(x)

# The class is now "integer".
class(x)

# Still TRUE: is.numeric() accepts integers as well as doubles.
is.numeric(x)

# TRUE after the coercion above.
is.integer(x)

#===================================================

#### Character Variables ####

# Store the text "A Grade" in a.
a <- "A Grade"

# Number of characters in a (the space counts too).
nchar(a)

# Page header carried over from the source document; kept as a comment so the
# script parses:
# Updated R Script Compiled by Mr. Anup Sharma (Strictly to be used as class notes)

# nchar() also accepts a number: 3000 is coerced to "3000", giving 4.
nchar(3000)

#===================================================

#### Date Variable ####

# Store a fixed date-time in b. The string is parsed as "YYYY-MM-DD HH:MM";
# no tz argument is given, so the session's time zone applies.
b <- as.POSIXct("2019-11-15 10:30")

# Inspect the class of b (a POSIXct date-time).
class(b)

#===================================================

#### Logical Variable ####

# Two logical constants. The names are kept from the original notes; note that
# `c` shares its name with base::c(), which R still finds when `c(...)` is
# called as a function.
c <- TRUE
d <- FALSE

# Both report "logical".
class(c)
class(d)

# Comparisons between x and y (created in the numeric section) each yield a
# logical value.

# Is x greater than y?
x > y

# Is y less than or equal to x?
y <= x

# Are x and y equal?
x == y

# Are x and y different?
x != y

#### VECTORS ####

# Grades for 10 students.
grades <- c("a", "b", "a", "c", "d", "a", "b", "a", "c", "d")

# Page header carried over from the source document; kept as a comment so the
# script parses:
# Updated R Script Compiled by Mr. Anup Sharma (Strictly to be used as class notes)

# Marks of the same 10 students.
marks <- c(90, 85, 98, 72, 54, 93, 86, 90, 70, 45)

# Roll numbers 1 to 10.
rno <- c(1:10)

# Add 2 bonus marks to every student (arithmetic is vectorised).
marks <- marks + 2
marks

# Convert marks to a 10-point scale.
slab <- marks / 10
slab

# Grade of the 3rd student.
grades[3]

# Slab of the 3rd and 5th students.
slab[c(3, 5)]

# Marks of the last three roll numbers.
marks[8:10]

#===================================================

#### Factors ####

# View grades as a factor (shows the distinct levels).
as.factor(grades)

# Integer codes behind the factor levels.
gf <- as.factor(grades)
as.numeric(gf)

#===================================================

#### NA ####

# Add roll number 11 whose marks and grade are missing.
rno <- c(rno, 11)
marks <- c(marks, NA)
grades <- c(grades, NA)

# Recompute slab; the NA propagates to the new element.
slab <- marks / 10
slab

# Mean class marks.
mean(marks) # returns NA because the missing value is included
mean(marks, na.rm = TRUE) # na.rm = TRUE strips NA before the calculation

#===================================================

#### Pipes ####

# magrittr provides the %>% pipe, which feeds the left-hand value into the
# function on the right.
library(magrittr)

# Same as mean(marks): the result is NA because marks contains an NA.
marks %>% mean()

# Not the mean of the marks: is.na() yields TRUE/FALSE, so this is the
# proportion of missing values.
marks %>%
  is.na() %>%
  mean()

# Extra arguments go inside the call; this is mean(marks, na.rm = TRUE).
marks %>% mean(na.rm = TRUE)

#===================================================

#### DATA.FRAMES ####

# Build a data frame from the rno, marks, slab and grades vectors.
myclass <- data.frame(
  Roll_Number = rno,
  Marks = marks,
  CGPA = slab,
  Grades = grades
)
myclass

# Dimensions as c(rows, columns).
dim(myclass)

# Number of rows.
nrow(myclass)

# Number of columns.
ncol(myclass)

# Row names.
rownames(myclass)

# Column names.
colnames(myclass)

# Page header carried over from the source document; kept as a comment so the
# script parses:
# Updated R Script Compiled by Mr. Anup Sharma (Strictly to be used as class notes)

# First rows of the data frame.
head(myclass)

# Last rows of the data frame.
tail(myclass)

# The Grades column, by position and by name.
myclass[, 4]
myclass[, "Grades"]

# Grade of the 5th roll number (row 5, column 4).
myclass[5, 4]

# Complete record of the 5th roll number.
myclass[5, ]

# Roll number alongside grade (columns 1 and 4).
myclass[, c(1, 4)]

#==========-{ GGPLOT }=======

library(ggplot2)

# Load the diamonds dataset and work on a copy called mydata.
data(diamonds)
mydata <- diamonds
head(mydata)

# Base-graphics histogram of carat.
hist(mydata$carat, main = "Carat Histogram", xlab = "Carat")

# Base-graphics scatterplot of price against carat.
plot(price ~ carat, data = mydata)

# Base-graphics boxplot of carat.
boxplot(mydata$carat)

##### Using ggplot2 #####################################

# Histogram of carat.
ggplot(data = mydata) + geom_histogram(aes(x = carat))

# Page header carried over from the source document; kept as a comment so the
# script parses:
# Updated R Script Compiled by Mr. Anup Sharma (Strictly to be used as class notes)

# Density curve for a continuous measurement.
ggplot(data = mydata) + geom_density(aes(x = carat), fill = "skyblue")

# Scatterplot of price against carat.
ggplot(mydata, aes(x = carat, y = price)) + geom_point()

# Keep the base plot in g so that layers can be added to it.
g <- ggplot(mydata, aes(x = carat, y = price))

# Colour the points by the diamond's color column.
g + geom_point(aes(color = color))

# Faceted plots: one panel per color, then a cut-by-clarity grid.
g + geom_point(aes(color = color)) + facet_wrap(~color)
g + geom_point(aes(color = color)) + facet_grid(cut ~ clarity)

# Faceted histogram.
ggplot(mydata, aes(x = carat)) + geom_histogram() + facet_wrap(~color)

# Boxplots: a single box (constant x), then one box per cut.
ggplot(mydata, aes(y = carat, x = 1)) + geom_boxplot()
ggplot(mydata, aes(y = carat, x = cut)) + geom_boxplot()

# Line plot of pop over date from the economics dataset.
ggplot(economics, aes(x = date, y = pop)) + geom_line(color = "red")

#### AREA PLOT ######

library(ggplot2)

# Simulated series: 50 x positions and a random walk (running sum of 50
# standard-normal draws) for y.
xValue <- 1:50
yValue <- cumsum(rnorm(50))
data <- data.frame(xValue = xValue, yValue = yValue)

# Filled area under the random walk.
ggplot(data) +
  geom_area(aes(x = xValue, y = yValue))
# Page header carried over from the source document; kept as a comment so the
# script parses:
# Updated R Script Compiled by Mr. Anup Sharma (Strictly to be used as class notes)

#### BAR PLOT######

# Data: one length value per dose.
df <- data.frame(dose = c("D0.5", "D1", "D2"), len = c(4.2, 10, 29.5))

# Bar plot; stat = "identity" draws len as the bar height, and each bar is
# labelled with its value.
ggplot(data = df, aes(x = dose, y = len)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  geom_text(aes(label = len), vjust = 1.6, color = "white", size = 3.5) +
  theme_minimal()

# Data with a second grouping variable (supp).
df2 <- data.frame(
  supp = rep(c("VC", "OJ"), each = 3),
  dose = rep(c("D0.5", "D1", "D2"), 2),
  len = c(6.8, 15, 33, 4.2, 10, 29.5)
)

# Stacked bar plot with multiple groups.
ggplot(data = df2, aes(x = dose, y = len, fill = supp)) +
  geom_bar(stat = "identity")

# Same data with position = position_dodge().
ggplot(data = df2, aes(x = dose, y = len, fill = supp)) +
  geom_bar(stat = "identity", position = position_dodge())

###########dot plot#####################

dd <- ToothGrowth
head(dd)
class(dd$dose)
class(dd)

# Convert dose to a factor before using it on the x axis.
dd$dose <- as.factor(dd$dose)

ggplot(data = dd) +
  geom_dotplot(
    aes(x = dose, y = len, color = supp),
    binaxis = "y",
    stackdir = "center"
  )

### PIE CHART ####

# Page header carried over from the source document; kept as a comment so the
# script parses:
# Updated R Script Compiled by Mr. Anup Sharma (Strictly to be used as class notes)

# Data
df <- data.frame(dose = c("D0.5", "D1", "D2"), len = c(4.2, 10, 29.5))

# A single stacked bar (x = "") filled by dose ...
bp <- ggplot(data = df, aes(x = "", y = len, fill = dose)) +
  geom_bar(stat = "identity")

# ... turned into a pie by switching the y axis to polar coordinates.
pie <- bp + coord_polar("y", start = 0)
pie

##############correlogram###########

#####install.packages("GGally")

# Install corrplot only when it is missing, so re-running the script does not
# reinstall the package every time.
if (!requireNamespace("corrplot", quietly = TRUE)) {
  install.packages("corrplot")
}

# Correlation matrix of columns 2, 3 and 5 of mtcars, rounded to 2 decimals.
h <- cor(mtcars[, c(2, 3, 5)])
corr <- head(round(h, 2))

library(corrplot)

# The same matrix drawn with four different glyph styles.
corrplot(corr, method = "circle")
corrplot(corr, method = "pie")
corrplot(corr, method = "color")
corrplot(corr, method = "number")

#####################################################################

#=========-{ DPLYR }========

library(dplyr)

## Use of pipes
diamonds %>% head(4) %>% dim
class(diamonds)

# Page header carried over from the source document; kept as a comment so the
# script parses:
# Updated R Script Compiled by Mr. Anup Sharma (Strictly to be used as class notes)

head(diamonds)
diamonds

## Print carat and price columns without and with pipes
select(diamonds, carat, price)
diamonds %>% select(carat, price)
diamonds %>% select(1, 7)

## Print all except the carat and price columns
diamonds %>% select(c(-carat, -price))

## Filter where cut is Ideal
diamonds %>% filter(cut == "Ideal")
diamonds[diamonds$cut == "Ideal", ] ### Base R equivalent

## Filter where cut is Ideal or Good
# %in% tests set membership. `cut == c("Ideal", "Good")` would recycle the
# two-element vector down the column and compare row 1 with "Ideal", row 2
# with "Good", and so on, silently dropping most matching rows.
diamonds %>% filter(cut %in% c("Ideal", "Good"))

## Filter where price >= 1000
diamonds %>% filter(price >= 1000)

## Multiple "and" conditions, carat > 2 and price < 14000, using , or &
diamonds %>% filter(carat > 2, price < 14000)
diamonds %>% filter(carat > 2 & price < 14000)

## "or" condition: carat < 1 or carat > 5
diamonds %>% filter(carat < 1 | carat > 5)

## Comparison with a literal value
diamonds %>% filter(cut == "Ideal")

## Comparison with a value held in a variable
ia <- "Ideal"
diamonds %>% filter(cut == ia)

## Display selected rows: 1 to 5, 8 and 15 to 20
diamonds %>% slice(c(1:5, 8, 15:20))

## Display all rows except the 1st
diamonds %>% slice(-1)


# Page header carried over from the source document; kept as a comment so the
# script parses:
# Updated R Script Compiled by Mr. Anup Sharma (Strictly to be used as class notes)

## Add one more column: ratio = price / carat
diamonds %>% mutate(ratio = price / carat)

## A column created in mutate() can be used later in the same call
diamonds %>%
  select(carat, price) %>%
  mutate(ratio = price / carat, double = ratio * 2)

### The compound-assignment pipe %<>% comes from magrittr
# %<>% pipes the left-hand object through the chain and assigns the result
# back to it. It is applied to the copy diamonds2, which is the object printed
# afterwards. Applying it to diamonds itself would replace the dataset with a
# four-column version and break the group_by(cut) calls below.
diamonds2 <- diamonds
diamonds2 %<>%
  select(carat, price) %>%
  mutate(ratio = price / carat, double = ratio * 2)
diamonds2

## Summary
summarize(diamonds, mean(price))
diamonds %>% summarize(mean(price))

## Group by
diamonds %>%
  group_by(cut) %>%
  summarize(AvgPr = mean(price), SumCarat = sum(carat))

diamonds %>%
  group_by(cut, color) %>%
  summarize(AvgPr = mean(price), SumCarat = sum(carat))

##################time series & forecasting #######

# Twelve rainfall readings.
rainfall <- c(12, 43, 32, 23, 23, 31, 56, 20, 30, 12, 45, 78)

# Convert to a time-series object. frequency = 12 means 12 data points per
# year; start = c(2017, 6) places the first value in period 6 of 2017.
rats <- ts(rainfall, start = c(2017, 6), frequency = 12)
rats

# A second vector of twelve readings.
rainfall1 <- c(12, 21, 33, 24, 23, 15, 22, 12, 12, 12, 13, 70)

# Page header carried over from the source document; kept as a comment so the
# script parses:
# Updated R Script Compiled by Mr. Anup Sharma (Strictly to be used as class notes)

# Combine the two vectors into a 12-row, 2-column matrix.
comr <- matrix(c(rainfall, rainfall1), nrow = 12)

# Convert the matrix to a (multivariate) ts object.
rats1 <- ts(comr, start = c(2017, 6), frequency = 12)

library(forecast)

# Series to analyse (also try AirPassengers).
dd <- rats
start(dd)
end(dd)
frequency(dd)
class(dd)

# Fit an ARIMA model with auto.arima().
d1 <- auto.arima(dd)

# Predict the next 3 periods.
d2 <- predict(d1, n.ahead = 3)

# Plot actual values (blue, solid) and predictions (red, dotted) together.
ts.plot(dd, d2$pred, col = c("blue", "red"), lty = c(1, 3))

########## Text Mining (labelled "Machine Learning" in the original notes)

# Install the text-mining packages only when they are missing, so re-running
# the script does not reinstall them every time.
text_pkgs <- c("tm", "SnowballC", "topicmodels", "wordcloud", "sentimentr", "syuzhet")
missing_pkgs <- text_pkgs[
  !vapply(text_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}

library(tm)
library(SnowballC)
library(topicmodels)
library(wordcloud)
# NOTE(review): plyr is attached after dplyr (loaded in the dplyr section), so
# plyr's mutate()/summarize() mask dplyr's from here on; nothing in this
# section appears to call plyr, so confirm it is needed.
library(plyr)
library(dplyr)
library(stringr)

# Page header carried over from the source document; kept as a comment so the
# script parses:
# Updated R Script Compiled by Mr. Anup Sharma (Strictly to be used as class notes)

library(ggplot2)
library(httr)
library(reshape2)
library(sentimentr)
library(scales)
library(RCurl)
library(syuzhet)

getwd()

# Read every .txt file in the working directory. list.files() takes a regular
# expression rather than a shell glob, so the extension is matched with
# "\\.txt$".
filenames <- list.files(getwd(), pattern = "\\.txt$")
files <- lapply(filenames, readLines)

# Create a corpus from the list of file contents.
articles.corpus <- Corpus(VectorSource(files))
class(articles.corpus)

## Text preprocessing

# Make each letter lowercase. tolower() is a plain base function, so it is
# wrapped in content_transformer() to keep the result a valid corpus.
articles.corpus <- tm_map(articles.corpus, content_transformer(tolower))

# Remove punctuation.
articles.corpus <- tm_map(articles.corpus, removePunctuation)

# Remove numbers.
articles.corpus <- tm_map(articles.corpus, removeNumbers)

# Remove generic and custom stopwords (stopwords() shows the default list).
stopwords()
articles.corpus <- tm_map(articles.corpus, removeWords, stopwords())
articles.corpus <- tm_map(
  articles.corpus, removeWords,
  c("and", "the", "have", "was", "with")
)


# Page header carried over from the source document; kept as a comment so the
# script parses:
# Updated R Script Compiled by Mr. Anup Sharma (Strictly to be used as class notes)

# Visualization - Wordcloud
library(wordcloud)
wordcloud(articles.corpus, random.order = FALSE)

# Create TDM
# A term-document matrix represents the words in the text as a table (or
# matrix) of numbers. The rows of the matrix represent the terms (words) used
# in the analysis, and the columns represent the documents being analysed;
# this is why rowSums() below yields one frequency per term.
tdm <- TermDocumentMatrix(articles.corpus)
class(tdm)

# Convert to an ordinary matrix.
tdm <- as.matrix(tdm)
tdm

# Total frequency of each term across all documents.
termfreq <- rowSums(tdm)
termfreq

# Subsetting: keep the terms that occur at least 4 times.
termfreqsubset <- subset(termfreq, termfreq >= 4)
class(termfreqsubset)

# Creating a data frame of the frequent terms.
library(ggplot2)
tdmdf <- data.frame(term = names(termfreqsubset), freq = termfreqsubset)
View(tdmdf)

# Horizontal bar chart of term counts. (A second page header sat in the middle
# of this expression in the source document and is omitted here; its text is
# identical to the one at the top of this section.)
tdmplot <- ggplot(tdmdf, aes(x = term, y = freq)) +
  geom_bar(stat = "identity") +
  xlab("Terms") +
  ylab("Count") +
  coord_flip() +
  theme(axis.text = element_text(size = 6))
tdmplot

# Wordcloud from the term frequencies.
wc <- as.matrix(tdm) # tdm is already a matrix; kept so `wc` stays defined
wordfreq <- sort(rowSums(wc), decreasing = TRUE)

# Colors: the 9-colour "BuGn" palette with its first four colours dropped.
pal <- brewer.pal(9, "BuGn")[-(1:4)]
colors()

nwc <- wordcloud(
  words = names(wordfreq), freq = wordfreq, min.freq = 3,
  random.order = FALSE, colors = pal
)

################################################

# Sentiment Analysis

library(sentimentr)

class(articles.corpus)

# Convert the corpus to character and then inspect its class. In the original
# notes class(a) came first and therefore reported on the old value of `a`.
# NOTE(review): as.character() on a whole corpus may not return one string per
# document; confirm the result holds the document text before relying on it.
a <- as.character(articles.corpus)
class(a)

# Score the text with get_nrc_sentiment().
mysentiment <- get_nrc_sentiment(a)

# Column totals of the sentiment scores.
SentimentScores <- data.frame(colSums(mysentiment[, ]))
SentimentScores

# Giving a name to the scores column.
names(SentimentScores) <- "Score"
SentimentScores

# Copy the row names into a "sentiment" column.
SentimentScores <- cbind(
  "sentiment" = rownames(SentimentScores),
  SentimentScores
)
SentimentScores

# Page header carried over from the source document; kept as a comment so the
# script parses:
# Updated R Script Compiled by Mr. Anup Sharma (Strictly to be used as class notes)

# Removing the row names now that they live in a column.
rownames(SentimentScores) <- NULL
SentimentScores

# Plotting the sentiment scores.
ggplot(SentimentScores, aes(x = sentiment, y = Score)) +
  geom_bar(aes(fill = sentiment), stat = "identity") +
  theme(legend.position = "none") +
  xlab("Sentiment") +
  ylab("Score") +
  ggtitle("Total Sentiment Score")

# Topic Modeling

# Latent Dirichlet allocation (LDA) models are a widely used topic modeling
# technique.

# Create DTM. The minimum word length of 3 is given through the `wordLengths`
# control option (c(min, max)); the older `minWordLength` name used in the
# original notes is not a current tm control option.
articleDtm <- DocumentTermMatrix(
  articles.corpus,
  control = list(wordLengths = c(3, Inf))
)

# Number of topics and the seed passed to the sampler.
k <- 3
SEED <- 1234

article.lda <- LDA(
  articleDtm, k,
  method = "Gibbs",
  control = list(seed = SEED)
)

# Topic assigned to each document.
lda.topics <- as.matrix(topics(article.lda))
lda.topics

# Terms returned for each topic.
lda.terms <- terms(article.lda)
lda.terms

################## Machine learning ################3

library(caret)
library(kernlab)
library(e1071)

# Page header carried over from the source document; kept as a comment so the
# script parses:
# Updated R Script Compiled by Mr. Anup Sharma (Strictly to be used as class notes)

### Naive Bayes Model ###

#install.packages("klaR")
library(klaR)

data1 <- iris
colnames(data1)

# Row indices for the training set: p = .80 of the rows, sampled within each
# Species level; list = FALSE returns a matrix of indices instead of a list.
partition <- createDataPartition(data1$Species, p = .80, list = FALSE)
partition

# Training rows, and the remaining rows for testing.
datatr <- data1[partition, ]
datate <- data1[-partition, ]

# Fit a naive Bayes classifier on all predictors.
modelfit <- train(Species ~ ., data = datatr, method = "nb")

# Predict the held-out rows and compare with the true species.
prediction <- predict(modelfit, newdata = datate)
confusionMatrix(prediction, datate$Species)

### Logistic Regression Model ###

#install.packages("mlbench")

# Load BreastCancer from mlbench and keep only the rows without missing values.
data(BreastCancer, package = "mlbench")
bc <- BreastCancer[complete.cases(BreastCancer), ]

# Indices of the training rows (p = .80, sampled on Class).
partition <- createDataPartition(bc$Class, p = .80, list = F)
partition

# Split into training and test sets.
datatr <- bc[partition, ]
datate <- bc[-partition, ]

# Binomial GLM (logistic regression) of Class on Cell.shape.
modelfit <- train(
  Class ~ Cell.shape,
  data = datatr,
  method = "glm",
  family = "binomial"
)

# Evaluate on the held-out rows.
prediction <- predict(modelfit, newdata = datate)
confusionMatrix(prediction, datate$Class)

## ex.2 for logistic reg

# Load the spam dataset.
data(spam)

# Page header carried over from the source document; kept as a comment so the
# script parses:
# Updated R Script Compiled by Mr. Anup Sharma (Strictly to be used as class notes)

z1 <- spam
head(z1)

# Training-row indices (p = 0.75, sampled on the type column).
z2 <- createDataPartition(z1$type, p = 0.75, list = FALSE)
ztrain <- z1[z2, ]
ztest <- z1[-z2, ]

# Logistic regression of type on all other columns.
zfit <- train(type ~ ., data = ztrain, method = "glm", family = "binomial")

# Evaluate on the held-out rows.
pred <- predict(zfit, newdata = ztest)
confusionMatrix(pred, ztest$type)

# Clustering

iris

# Drop column 5 (Species) so that only the four measurements remain.
iris1 <- iris[, -5]

##################################################

plot(iris1$Sepal.Length, iris1$Sepal.Width)

# k-means with 3 clusters on the measurements themselves. kmeans() expects the
# data matrix; passing dist(iris1) would make it cluster the rows of the
# 150 x 150 distance matrix instead of the observations.
clu <- kmeans(iris1, centers = 3)

# Cluster assigned to each observation.
clu$cluster

library(dplyr)

# Append the k-means assignment to the measurements as a `clusters` column.
iris2 <- mutate(iris1, clusters = clu$cluster)

head(iris2)

#########################################

# Hierarchical clustering

# Start again from the four measurement columns. iris1 has no 5th column, so
# the original iris1[, -5] removed nothing; the plain copy states the intent.
iris2 <- iris1

# Agglomerative clustering on the distance matrix from dist(), using the
# "ward.D2" method.
h1 <- hclust(dist(iris2), method = "ward.D2")

# Dendrogram with small labels.
plot(h1, cex = 0.2)

# You might also like