You are on page 1 of 18

 Window 1 : editor ( for all the commands and editing)

 Window 2 : Console ( output)


 Window 3: Environment (For all the variables and history)
 Window 4: Workplace (For package, files, plots)

1. Commands

 R is case sensitive (distinguishes capital and small letters)


 Ctrl + L = to clear window 2
 Enter alone does not run a command; run it with Ctrl+Enter
 All the functions of R start with small letters only
 Vector : numeric = age, Character= Name
 Integer : Salary ( always use L to define an integer)
 Factor Vector: gender
 To check the vector type of gender = class(GendF)

2. How to make a file in R

 Use the command name of the file you want= data.frame(the variables you want)
 In order to remove any variable = rm(name of the variable)
 View(Data)= to open the data table
 edit(Datafile), i.e. the name of the file = to edit the data table
 To check the top 6 values= head(name of the file)
 To check the last 6 values = tail(name of the file)
 To check the dimensions of the file = dim(Datafile)
 To check the structure = str(Datafile)
 To check the summary = summary(Datafile)

3. How to make the file functional ( as the files are already inbuilt in r)

 New Script ctrl+shift+N


 Datasets::mtcars
 Make your own file: Angel=mtcars (which will be a dummy copy of the original file)

Various functions on that file

 In order to get the details: ?filename


 No. of rows= nrow(filename)
 No. of columns= ncol(file name )
 Ques: Extract rows from 20-30 with all the vectors

 Make a subsetA= Angel[20:30,]
 (use square brackets and a comma to keep all the vectors)

Ques: Extract rows from 20-30 by taking only 3 vectors ( any)


 Make any other file say B
 B=Angel[20:30,1:3]
File will be created
Write the name of the file say B and then press ctrl+ enter

Extract mtcars with 6 cylinders only


 C=Angel[Angel$cyl==6,]
 C (ctrl+ enter) ( To view the file)

Extract mtcars with 6 cylinder and 4 gear only


D=Angel[(Angel$cyl==6 & Angel$gear==4),]

Extract mtcars with 6 cylinders with any 3 columns only


E=Angel[Angel$cyl==6,1:3]

 The dplyr package is used for filtering, sorting, searching etc.

Big data==(pipe)== small filtered data==(pipe)==desired data

#Extract mtcars with 3 col only named (mpg,cyl and disp)


Pipe operator= ctrl+shift+m
A=Angel %>%
select(mpg,cyl,disp)

#Extract mtcars with mpg>20 and 3 col only named (mpg,cyl,disp)

B=Angel %>%
select(mpg,cyl,disp) %>%
filter(mpg>20)

#Extract mtcars with (mpg>20,disp>150, and cyl>4)


# 3 col only named (mpg, cyl and disp)
C=Angel %>%
select(mpg,cyl,disp) %>%
filter(mpg>20,disp>150,cyl>4)
C
#extract mtcars with (wt>2.5)
#and 3 col only named (disp,gear and wt)
#arranges in descending order of disp
library(dplyr) (always call library() again in a new session, because once the window
is closed the package has to be loaded again — attaching a package is temporary)
D=Angel %>%
select(disp,gear,wt) %>%
filter(wt>2.5) %>%
arrange(desc(disp))
D

#create and add one new column named (wtkg)


#hint: 1 lbs = 0.454 kg
?mtcars
E=Angel %>%
mutate(wtkg=wt*1000*.454)
E

#extract rows 20-30 with all the vectors with dplyr


F=Angel %>%
slice(20:30)
F
#extract the rows 20-30 with 3 vectors

G=Angel %>%
slice(20:30) %>%
select(1:3)
G
#extract mtcars with 6 cylinders only
H= Angel %>%
filter(cyl==6)
H
#Extract mtcars with 6 cylinders and 4 gears only
I= Angel %>%
filter(cyl==6,gear==4)
I
#extract mtcars with (mpg>20,disp>150 and cyl>4)
#without 3 col named (gear,carb,and wt)

J= Angel %>%
filter(mpg>20,disp>150,cyl>4) %>%
select(-gear,-carb,-wt)
J
filter(): selects within the rows
select(): selects the columns
#extract mean of wt of cars cyl wise group
K=Angel %>%
group_by(cyl) %>%
summarise(mean(wt))
K

Missing values

#To check the missing values (here air is presumably a copy of the airquality dataset)

is.na(air)

#number of missing values in airquality


sum(is.na(air))

The median will not be calculated if there is any missing value (use na.rm = TRUE to ignore NAs)


median(air$Ozone)

#histogram
hist(air$Ozone)

hist(air$Ozone,xlab='Ozone',ylab='Mean Ozone in ppm',main='HISTOGRAM')

#piechart
pie(Angel$cyl)
pie(Angel$cyl,xlab='Angel Cylinder',main = 'PIE CHART')

#Scatter Plot

scatter.smooth(air$Ozone)

#Boxplot
boxplot(air$Ozone)
UNIT 2

Ggplot2
library(ggplot2)

#Distribution -----1. Histogram 2.Density 3.Boxplot

#make histogram of mtcars based on the number of cylinder

# make histogram for cyl in mtcars

Angel= mtcars

ggplot(Angel,aes(x=cyl))+

geom_histogram()

(console message) `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.


geom_histogram(bins=5) ( according to the width of the bar)

#for changing the colours

ggplot(Angel,aes(x=cyl))+

geom_histogram(bins=5,color='red',fill='yellow')

#Theme (Background)

ggplot(Angel,aes(x=cyl))+

geom_histogram(bins=5,color='red',fill='yellow')+

theme_classic()

#Density plot
ggplot(Angel,aes(x=cyl))+

geom_density(color='red',fill='yellow')+

theme_classic()
#for changing the density of cylinders on the basis of no of cylinders
ggplot(Angel,aes(x=cyl))+
geom_density(color='red',fill='yellow')+
theme_classic()+
facet_wrap(~cyl)

#Boxplot: we need one discrete and one continuous variable


#Boxplot of mpg
boxplot(Angel$mpg)
#fill the color in box plot
ggplot(Angel,aes(y=mpg))+
geom_boxplot(fill='red')
#Box plot of mpg based on the group of numbers of cylinders

ggplot(Angel,aes(y=mpg,x=as.factor(cyl)))+
geom_boxplot()

#fill the color


ggplot(Angel,aes(y=mpg,fill=as.factor(cyl)))+
geom_boxplot()

#change the theme


ggplot(Angel,aes(y=mpg,fill=as.factor(cyl)))+
geom_boxplot()+
theme_classic()
#If we want it to compare it with gear instead of cyl
ggplot(Angel,aes(y=mpg,fill=as.factor(gear)))+
geom_boxplot()+
theme_classic()

#compare Both
install.packages("gridExtra")
library(gridExtra)
grid.arrange(A,B,ncol=2)

#compare 3-4 graphs together (make different files name them as A, B,C, D)
install.packages("gridExtra")
library(gridExtra)
grid.arrange(A,B,C,ncol=2)
# for bar graphs
library(ggplot2)
ggplot(Angel,aes(x=cyl))+
geom_bar()
# for changing the colour and width of the bars of the graph
ggplot(Angel,aes(x=cyl))+
geom_bar(width=.5,color='red',fill='yellow')

#dot plot chart


ggplot(Angel,aes(x=cyl))+
geom_dotplot(width=.5,color='red',fill='yellow')
##pie chart of number of species in iris
datasets::iris
irs=iris
library(ggplot2)
ggplot(irs,aes(x="", fill=as.factor(Species)))+
geom_bar(width = 1)+
coord_polar("y",start=0)

#piechart of no of cylinders in Angel


datasets:: mtcars
Angel=mtcars
library(ggplot2)
ggplot(Angel,aes(x="", fill=as.factor(cyl)))+
geom_bar(width = 1)+
coord_polar("y",start=360)

#Pie chart of mpg based on no. of group of cylinders


datasets:: mtcars
Angel=mtcars
library(ggplot2)
ggplot(Angel,aes(x="", y=mpg,fill=factor(cyl)))+
geom_bar(stat="identity")+ (stat="identity" keeps the data values as given)
coord_polar("y",start=0)

#change the colour


ggplot(iris,aes(x=Sepal.Length,y=Sepal.Width))+
geom_point(aes(color=as.factor(Species)))
##correllogram

install.packages("ggcorrplot")
library(ggcorrplot)

X=c(2.5,3.2,4.5)
y=c(1.2,1.6,1.9)
cor(X,y)
cor(mtcars$mpg,mtcars$cyl)
round(cor(mtcars$mpg,mtcars$cyl),1)

# for correlation of each vector in mtcars and graph

cor(mtcars)
A=round(cor(mtcars),1)
ggcorrplot(A)

# to change the graph squares to circles


ggcorrplot(A, method ="circle")
ggcorrplot(A,method= "circle", type="upper")

#to change the colour


ggcorrplot(A,hc.order = TRUE,lab = TRUE, lab_size = 3, type =
"lower",colors=c("red","brown","yellow"),title="Correlogram of 'MTCARS'")

One lecture is missing


Forecast is important
(just for information)
{To make the non-stationary data into stationary data (if the heights of the peaks are not
equal over time, the series is non-stationary)

 Create a variable A = log(AirPassengers)


plot(A)
Now the variance of the data has been removed

 Trend line must be horizontal to the x axis


So we will take the difference (diff) of log(AirPassengers)
Now the trend/time effect has been removed, the data is stationary and is ready for
forecasting.}

#forecast
#step1 Forecasting the model
#step 2 Prediction
#Step 3 Antilog
#step4 Model Validation-Plot the data
AirPassengers
#forecast data for next 4 years

Model works on loop hence trace would stop when the 1 model has been formed

#ARIMA MODEL(will use these 4 steps)


Modelfit=auto.arima(log(AirPassengers),approximation = F, trace = F)
Prediction=predict(Modelfit,n.ahead = 48 )
Prediction
Prediction1= round(2.718^Prediction$pred,0)
ts.plot(AirPassengers,Prediction1,lty=c(1,3),col=c("blue","red"))

import the file from computer


tractorsales=read.csv(file.choose(),header = T)
class(tractorsales)
head(tractorsales)
#to convert data files into time series
tractor.ts= ts(tractorsales$Number.of.Tractor.Sold,start=c(2003,1),frequency = 12)
tractor.ts
class(tractor.ts)

Modelfit=auto.arima(log(tractor.ts),approximation = F, trace =F)


Prediction= predict(Modelfit,n.ahead = 48)
Prediction
Prediction1=round(2.718^Prediction$pred,0)
ts.plot(tractor.ts,Prediction1,lty=c(1,3),col=c("blue","red"))
#Machine Learning

#ETP

#Experience Task Performance Measure

#Decisions are based on two models : Supervised and Unsupervised Model

#supervised Model

#Data must have Label Column or Dependent Variable and observations;

#F1 F2 F3 label; Like iris data

#if label is categorical data- we will use classification technique;

#if label is number/continuous number - we will use Regression

# example= gmail account(primary, Promotions,social)

#Unsupervised Model

#Data which does not have the label column;

#F1,F2,F3,Fe,F4 no label; or if we remove species from iris

#Machine Creates the label column- Clustering

#hence computer will change the unsupervised data into supervised data

# 5 STEPS IN MACHINE LEARNING

#1. Data

#2. Data Pre Processing (NA must be handled)

#data must be properly scaled

#3. Data Partitioning: Training (70-80%), Testing(20-30%)

#4. Model Building

#5. Model Validation

#(caret,Kernlab,e1071 packages)

#(m.l (mainly classification technique),(same as caret) but in data files,same as caret)


#step1 : Data

data(spam)

head(spam)

tail(spam)

Machine Learning

Type must be Factor

Step1.: Data Retrieving


data(spam)
head(spam)
tail(spam)

#since label column is categorical- Classification technique


str(spam)

Step 2: data pre processing

#label col is factor no need of pre processing

sum(is.na(spam))

# Step3: Data partitioning


set.seed(12345) (to freeze the output, i.e. to stop the randomness, so that if
we run this again the data will remain the same)

intrain= createDataPartition(y=spam$type, p=0.75, list= F )

training= spam[intrain,]
testing= spam[-intrain,]

#step 4 : Model Building


3 methods for classification
KNN = K-nearest neighbours
GLM = Generalised linear model
NB = Naive Bayes
#step4: model Building (dot shows n no. of variables)
modelfit= train(type~., data = training, method="knn")

#step 5: Model Validation(we check the testing data)

prediction= predict(modelfit, newdata= testing)


#create a model
confusionMatrix(prediction,testing$type)

Result
Confusion Matrix and Statistics

Reference
Prediction nonspam spam
nonspam 583 114
spam 114 339 (spam misclassified as nonspam, and vice versa,
must be minimum — this leads to a good model)

Accuracy : 0.8017 (high accuracy)


95% CI : (0.7775, 0.8244)
No Information Rate : 0.6061
P-Value [Acc > NIR] : <2e-16

Kappa : 0.5848

Mcnemar's Test P-Value : 1

Sensitivity : 0.8364 (this should be high)


Specificity : 0.7483 (this should be high)
Pos Pred Value : 0.8364
Neg Pred Value : 0.7483
Prevalence : 0.6061
Detection Rate : 0.5070
Detection Prevalence : 0.6061
Balanced Accuracy : 0.7924

'Positive' Class : nonspam


Download the data from site kaggle.com

GLM use caret

Step 1

Data retrieving

Diabetes=read.csv(file.choose(),header=T)

head(Diabetes)

step2

data preprocessing

sum(is.na(Diabetes))

class(Diabetes$Outcome)

Diabetes$Outcome=as.factor(Diabetes$Outcome)

class(Diabetes$Outcome)

(It is coming as an integer; it must be a factor)

str(Diabetes)

#step 3

Data partitioning

library(caret)

set.seed(1234)

intrain= createDataPartition(y=Diabetes$Outcome,p=0.75, list=F)

Training= Diabetes[intrain,]

Testing= Diabetes[-intrain,]

#step4

Data modeling

modelfit = train(Outcome~., data=Training, method="glm")

#step 5 : data validation

Prediction= predict(modelfit,newdata=Testing)

confusionMatrix(Prediction,Testing$Outcome)
Knn

Same first 3 steps

#step4

Data modeling

modelfit2 = train(Outcome~., data=Training, method="knn")

#step 5 : data validation

prediction2= predict(modelfit2,newdata=Testing)

confusionMatrix(prediction2,Testing$Outcome)

NB

Same 3 steps

library(klaR)

#Step4

Data modeling

Modelfit3 = train(Outcome~., data=Training, method="nb")

#step 5 : data validation

Prediction3= predict(Modelfit3,newdata=Testing)

confusionMatrix(Prediction3,Testing$Outcome)
#for Unsupervised model use packages [factoextra, NbClust, useful, descr]

We are going to use clustering technique

iris

irs=iris[,-5]

irs

#now it is unsupervised learning example- no label

 #computing the number of clusters : Plot, Dendrogram method


 #create the clusters
 #validate

To plot the clusters : Elbow method

fviz_nbclust(irs, kmeans, method="wss") +

labs(subtitle = "Elbow Method")

(from the graph we identify the number of clusters: take the point where the curve bends
(the elbow); the flat horizontal part is not taken)

(4 clusters)

#silhouette Method

fviz_nbclust(irs,kmeans,method="silhouette")+

labs(subtitle = "Silhouette Method")

(2 clusters)

#Gap statistics Method

fviz_nbclust(irs,kmeans,method="gap_stat",nstart=25,nboot=50)+
labs(subtitle = "Gap Statistics Method")

(6 clusters)
#Hartigan Plot

h1=PlotHartigan(FitKMeans(irs))

plot(h1)

(5 clusters) (do not include red and green together)

You might also like