You are on page 1 of 18

 Window 1 : editor ( for all the commands and editing)

 Window 2 : Console ( output)


 Window 3: Environment (For all the variables and history)
 Window 4: Workplace (For package, files, plots)

1. Commands

 R is case sensitive (distinguishes capital and small letters)


 Ctrl + L = to clear window 2
 Enter alone does not run a command; run it with Ctrl+Enter
 All the functions of R start with small letters only
 Vector : numeric = age, Character= Name
 Integer : Salary ( always use L to define an integer)
 Factor Vector: gender
 To check the vector type of gender = class(GendF)

2. How to make a file in R

 Use the command name of the file you want= data.frame(the variables you want)
 In order to remove any variable = rm(name of the variable)
 View(Data)= to open the data table
 edit(Datafile), i.e. the name of the file = to edit the data table
 To check the top 6 values= head(name of the file)
 To check the last 6 values = tail(name of the file)
 To check the dimensions of the file = dim(Datafile)
 To check the structure = str(Datafile)
 To check the summary = summary(Datafile)

3. How to make the file functional ( as the files are already inbuilt in r)

 New Script ctrl+shift+N


 Datasets::mtcars
 Make your own file: Angel=mtcars (which will be a dummy copy of the original file)

Various functions on that file

 In order to get the details: ?filename


 No. of rows= nrow(filename)
 No. of columns= ncol(file name )
 Ques: Extract rows from 20-30 with all the vectors

 Make a subsetA= Angel[20:30,]
 (use square brackets and a comma to keep all the vectors)

Ques: Extract rows from 20-30 by taking only 3 vectors ( any)


 Make any other file say B
 B=Angel[20:30,1:3]
File will be created
Write the name of the file say B and then press ctrl+ enter

Extract mtcars with 6 cylinders only


 C=Angel[Angel$cyl==6,]
 C (ctrl+ enter) ( To view the file)

Extract mtcars with 6 cylinder and 4 gear only


D=Angel[(Angel$cyl==6 & Angel$gear==4),]

Extract mtcars with 6 cylinders with any 3 columns only


E=Angel[Angel$cyl==6,1:3]

 The dplyr package is used for filtering, sorting, searching etc.

Big data==(pipe)== small filtered data==(pipe)==desired data

#Extract mtcars with 3 col only named (mpg,cyl and disp)


Pipe operator= ctrl+shift+m
A=Angel %>%
select(mpg,cyl,disp)

#Extract mtcars with mpg>20 and 3 col only named (mpg,cyl,disp)

B=Angel %>%
select(mpg,cyl,disp) %>%
filter(mpg>20)

#Extract mtcars with (mpg>20,disp>150, and cyl>4)


# 3 col only named (mpg, cyl and disp)
C=Angel %>%
select(mpg,cyl,disp) %>%
filter(mpg>20,disp>150,cyl>4)
C
#extract mtcars with (wt>2.5)
#and 3 col only named (disp,gear and wt)
#arranges in descending order of disp
library(dplyr) (always call library() again in a new session, because once the window
is closed the package has to be loaded again — attaching a package is temporary)
D=Angel %>%
select(disp,gear,wt) %>%
filter(wt>2.5) %>%
arrange(desc(disp))
D

#create and add one new column named (wtkg)


#hint: 1 lbs = 0.454 kg
?mtcars
E=Angel %>%
mutate(wtkg=wt*1000*.454)
E

#extract rows 20-30 with all the vectors with dplyr


F=Angel %>%
slice(20:30)
F
#extract the rows 20-30 with 3 vectors

G=Angel %>%
slice(20:30) %>%
select(1:3)
G
#extract mtcars with 6 cylinders only
H= Angel %>%
filter(cyl==6)
H
#Extract mtcars with 6 cylinders and 4 gears only
I= Angel %>%
filter(cyl==6,gear==4)
I
#extract mtcars with (mpg>20,disp>150 and cyl>4)
#without 3 col named (gear,carb,and wt)

J= Angel %>%
filter(mpg>20,disp>150,cyl>4) %>%
select(-gear,-carb,-wt)
J
filter(): selects within the rows
select(): selects the columns
#extract mean of wt of cars cyl wise group
K=Angel %>%
group_by(cyl) %>%
summarise(mean(wt))
K

Missing values

#To check the missing values (here air is presumably a copy of the airquality dataset)

is.na(air)

#number of missing values in airquality


sum(is.na(air))

The median will not be calculated if there is any missing value (use na.rm = TRUE to ignore NAs)


median(air$Ozone)

#histogram
hist(air$Ozone)

hist(air$Ozone,xlab='Ozone',ylab='Mean Ozone in ppm',main='HISTOGRAM')

#piechart
pie(Angel$cyl)
pie(Angel$cyl,xlab='Angel Cylinder',main = 'PIE CHART')

#Scatter Plot

scatter.smooth(air$Ozone)

#Boxplot
boxplot(air$Ozone)
UNIT 2

Ggplot2
library(ggplot2)

#Distribution -----1. Histogram 2.Density 3.Boxplot

#make histogram of mtcars based on the number of cylinder

# make histogram for cyl in mtcars

Angel= mtcars

ggplot(Angel,aes(x=cyl))+

geom_histogram()

(console message) `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.


geom_histogram(bins=5) ( according to the width of the bar)

#for changing the colours

ggplot(Angel,aes(x=cyl))+

geom_histogram(bins=5,color='red',fill='yellow')

#Theme (Background)

ggplot(Angel,aes(x=cyl))+

geom_histogram(bins=5,color='red',fill='yellow')+

theme_classic()

#Density plot
ggplot(Angel,aes(x=cyl))+

geom_density(color='red',fill='yellow')+

theme_classic()
#for changing the density of cylinders on the basis of no of cylinders
ggplot(Angel,aes(x=cyl))+
geom_density(color='red',fill='yellow')+
theme_classic()+
facet_wrap(~cyl)

#Boxplot: we need one discrete and one continuous variable


#Boxplot of mpg
boxplot(Angel$mpg)
#fill the color in box plot
ggplot(Angel,aes(y=mpg))+
geom_boxplot(fill='red')
#Box plot of mpg based on the group of numbers of cylinders

ggplot(Angel,aes(y=mpg,x=as.factor(cyl)))+
geom_boxplot()

#fill the color


ggplot(Angel,aes(y=mpg,fill=as.factor(cyl)))+
geom_boxplot()

#change the theme


ggplot(Angel,aes(y=mpg,fill=as.factor(cyl)))+
geom_boxplot()+
theme_classic()
#If we want it to compare it with gear instead of cyl
ggplot(Angel,aes(y=mpg,fill=as.factor(gear)))+
geom_boxplot()+
theme_classic()

#compare Both
install.packages("gridExtra")
library(gridExtra)
grid.arrange(A,B,ncol=2)

#compare 3-4 graphs together (make different files name them as A, B,C, D)
install.packages("gridExtra")
library(gridExtra)
grid.arrange(A,B,C,ncol=2)
# for bar graphs
library(ggplot2)
ggplot(Angel,aes(x=cyl))+
geom_bar()
# for changing the colour and width of the bars of the graph
ggplot(Angel,aes(x=cyl))+
geom_bar(width=.5,color='red',fill='yellow')

#dot plot chart


ggplot(Angel,aes(x=cyl))+
geom_dotplot(width=.5,color='red',fill='yellow')
##pie chart of number of species in iris
datasets::iris
irs=iris
library(ggplot2)
ggplot(irs,aes(x="", fill=as.factor(Species)))+
geom_bar(width = 1)+
coord_polar("y",start=0)

#piechart of no of cylinders in Angel


datasets:: mtcars
Angel=mtcars
library(ggplot2)
ggplot(Angel,aes(x="", fill=as.factor(cyl)))+
geom_bar(width = 1)+
coord_polar("y",start=360)

#Pie chart of mpg based on no. of group of cylinders


datasets:: mtcars
Angel=mtcars
library(ggplot2)
ggplot(Angel,aes(x="", y=mpg,fill=factor(cyl)))+
geom_bar(stat="identity")+ (stat="identity" keeps the data values as given)
coord_polar("y",start=0)

#change the colour


ggplot(iris,aes(x=Sepal.Length,y=Sepal.Width))+
geom_point(aes(color=as.factor(Species)))
##correllogram

install.packages("ggcorrplot")
library(ggcorrplot)

X=c(2.5,3.2,4.5)
y=c(1.2,1.6,1.9)
cor(X,y)
cor(mtcars$mpg,mtcars$cyl)
round(cor(mtcars$mpg,mtcars$cyl),1)

# for correlation of each vector in mtcars and graph

cor(mtcars)
A=round(cor(mtcars),1)
ggcorrplot(A)

# to change the graph squares to circles


ggcorrplot(A, method ="circle")
ggcorrplot(A,method= "circle", type="upper")

#to change the colour


ggcorrplot(A,hc.order = TRUE,lab = TRUE, lab_size = 3, type =
"lower",colors=c("red","brown","yellow"),title="Correlogram of 'MTCARS'")

One lecture is missing


Forecast is important
(just for information)
{To make the non-stationary data into stationary data (if the heights of the peaks are not
equal over time, the series is non-stationary)

 Create a variable A = log(AirPassengers)


plot(A)
Now the variance of the data has been removed

 Trend line must be horizontal to the x axis


So we will take the difference (diff) of log(AirPassengers)
Now the trend/time effect has been removed, the data is stationary and is ready for
forecasting.}

#forecast
#step1 Forecasting the model
#step 2 Prediction
#Step 3 Antilog
#step4 Model Validation-Plot the data
AirPassengers
#forecast data for next 4 years

Model works on loop hence trace would stop when the 1 model has been formed

#ARIMA MODEL(will use these 4 steps)


Modelfit=auto.arima(log(AirPassengers),approximation = F, trace = F)
Prediction=predict(Modelfit,n.ahead = 48 )
Prediction
Prediction1= round(2.718^Prediction$pred,0)
ts.plot(AirPassengers,Prediction1,lty=c(1,3),col=c("blue","red"))

import the file from computer


tractorsales=read.csv(file.choose(),header = T)
class(tractorsales)
head(tractorsales)
#to convert data files into time series
tractor.ts= ts(tractorsales$Number.of.Tractor.Sold,start=c(2003,1),frequency = 12)
tractor.ts
class(tractor.ts)

Modelfit=auto.arima(log(tractor.ts),approximation = F, trace =F)


Prediction= predict(Modelfit,n.ahead = 48)
Prediction
Prediction1=round(2.718^Prediction$pred,0)
ts.plot(tractor.ts,Prediction1,lty=c(1,3),col=c("blue","red"))
#Machine Learning

#ETP

#Experience Task Performance Measure

#Decisions are based on two models : Supervised and Unsupervised Model

#supervised Model

#Data must have Label Column or Dependent Variable and observations;

#F1 F2 F3 label; Like iris data

#if label is categorical data- we will use classification technique;

#if label is number/continuous number - we will use Regression

# example= gmail account(primary, Promotions,social)

#Unsupervised Model

#Data which does not have the label column;

#F1,F2,F3,Fe,F4 no label; or if we remove species from iris

#Machine Creates the label column- Clustering

#hence computer will change the unsupervised data into supervised data

# 5 STEPS IN MACHINE LEARNING

#1. Data

#2. Data Pre Processing (NA must be handled)

#data must be properly scaled

#3. Data Partitioning: Training (70-80%), Testing(20-30%)

#4. Model Building

#5. Model Validation

#(caret,Kernlab,e1071 packages)

#(m.l (mainly classification technique),(same as caret) but in data files,same as caret)


#step1 : Data

data(spam)

head(spam)

tail(spam)

Machine Learning

Type must be Factor

Step1.: Data Retrieving


data(spam)
head(spam)
tail(spam)

#since label column is categorical- Classification technique


str(spam)

Step 2: data pre processing

#label col is factor no need of pre processing

sum(is.na(spam))

# Step3: Data partitioning


set.seed(12345) (to freeze the output, i.e. to stop the randomness, so that if
we run this again the data will remain the same)

intrain= createDataPartition(y=spam$type, p=0.75, list= F )

training= spam[intrain,]
testing= spam[-intrain,]

#step 4 : Model Building


3 methods for classification
KNN = K-nearest neighbours
GLM = Generalised linear model
NB = Naive Bayes
#step4: model Building (dot shows n no. of variables)
modelfit= train(type~., data = training, method="knn")

#step 5: Model Validation(we check the testing data)

prediction= predict(modelfit, newdata= testing)


#create a model
confusionMatrix(prediction,testing$type)

Result
Confusion Matrix and Statistics

Reference
Prediction nonspam spam
nonspam 583 114
spam 114 339 (spam misclassified as nonspam, and vice versa,
must be minimum — this leads to a good model)

Accuracy : 0.8017 (high accuracy)


95% CI : (0.7775, 0.8244)
No Information Rate : 0.6061
P-Value [Acc > NIR] : <2e-16

Kappa : 0.5848

Mcnemar's Test P-Value : 1

Sensitivity : 0.8364 (this should be high)


Specificity : 0.7483 (this should be high)
Pos Pred Value : 0.8364
Neg Pred Value : 0.7483
Prevalence : 0.6061
Detection Rate : 0.5070
Detection Prevalence : 0.6061
Balanced Accuracy : 0.7924

'Positive' Class : nonspam


Download the data from site kaggle.com

GLM use caret

Step 1

Data retrieving

Diabetes=read.csv(file.choose(),header=T)

head(Diabetes)

step2

data preprocessing

sum(is.na(Diabetes))

class(Diabetes$Outcome)

Diabetes$Outcome=as.factor(Diabetes$Outcome)

class(Diabetes$Outcome)

(It is coming as an integer; it must be a factor)

str(Diabetes)

#step 3

Data partitioning

library(caret)

set.seed(1234)

intrain= createDataPartition(y=Diabetes$Outcome,p=0.75, list=F)

Training= Diabetes[intrain,]

Testing= Diabetes[-intrain,]

#step4

Data modeling

modelfit = train(Outcome~., data=Training, method="glm")

#step 5 : data validation

Prediction= predict(modelfit,newdata=Testing)

confusionMatrix(Prediction,Testing$Outcome)
Knn

Same first 3 steps

#step4

Data modeling

modelfit2 = train(Outcome~., data=Training, method="knn")

#step 5 : data validation

prediction2= predict(modelfit2,newdata=Testing)

confusionMatrix(prediction2,Testing$Outcome)

NB

Same 3 steps

library(klaR)

#Step4

Data modeling

Modelfit3 = train(Outcome~., data=Training, method="nb")

#step 5 : data validation

Prediction3= predict(Modelfit3,newdata=Testing)

confusionMatrix(Prediction3,Testing$Outcome)
#for Unsupervised model use packages [factoextra, NbClust, useful, descr]

We are going to use clustering technique

iris

irs=iris[,-5]

irs

#now it is unsupervised learning example- no label

 #computing the number of clusters : Plot, Dendrogram method


 #create the clusters
 #validate

To plot the clusters : Elbow method

fviz_nbclust(irs, kmeans, method="wss") +

labs(subtitle = "Elbow Method")

(from the graph we identify the number of clusters: take the point where the curve bends
(the elbow); the flat horizontal part is not taken)

(4 clusters)

#silhouette Method

fviz_nbclust(irs,kmeans,method="silhouette")+

labs(subtitle = "Silhouette Method")

(2 clusters)

#Gap statistics Method

fviz_nbclust(irs,kmeans,method="gap_stat",nstart=25,nboot=50)+
labs(subtitle = "Gap Statistics Method")

(6 clusters)
#Hartigan Plot

h1=PlotHartigan(FitKMeans(irs))

plot(h1)

(5 clusters) (do not include red and green together)

You might also like