Professional Documents
Culture Documents
1. Commands
To create a data table, use the command: name of the file you want = data.frame(the variables you want)
In order to remove any variable = rm(name of the variable)
View(Data)= to open the data table
edit(Datafile), i.e. the name of the file = to edit the data table (R is case-sensitive: the function is lower-case edit)
To check the top 6 values= head(name of the file)
To check the last 6 values = tail(name of the file)
To check the dimensions of the file = dim(Datafile)
To check the structure = str(Datafile)
To check the summary = summary(Datafile)
3. How to make the file functional ( as the files are already inbuilt in r)
# The verbs below and the %>% pipe are provided by dplyr; no library(dplyr)
# call appears in these notes, so attach it before running this part.
# Angel is assigned from mtcars further down in these notes.

# B: columns mpg, cyl and disp of Angel, keeping only rows with mpg above 20.
B <- Angel %>%
  select(mpg, cyl, disp) %>%
  filter(mpg > 20)

# G: rows 20 to 30 of Angel, first three columns only; printed afterwards.
G <- Angel %>%
  slice(20:30) %>%
  select(1:3)
G
# H: only the rows of Angel whose cyl equals 6; printed afterwards.
H <- Angel %>%
  filter(cyl == 6)
H

# I: only the rows of Angel whose cyl equals 6 and whose gear equals 4;
# printed afterwards.
I <- Angel %>%
  filter(cyl == 6 & gear == 4)
I
# J: rows of Angel with mpg > 20, disp > 150 and cyl > 4,
# with the three columns gear, carb and wt removed; printed afterwards.
J <- Angel %>%
  filter(mpg > 20 & disp > 150 & cyl > 4) %>%
  select(-c(gear, carb, wt))
J
Filter: keeps only the rows that satisfy the given conditions
Select: keeps (or drops) columns by name or position
# K: mean of wt computed separately for each cyl group of Angel.
# The summary is left unnamed, so the result column is labelled `mean(wt)`.
K <- summarise(group_by(Angel, cyl), mean(wt))
K
Missing values
# Flag every cell of air that is missing (NA).
# NOTE(review): `air` is not defined anywhere in these notes - presumably
# air <- airquality; confirm before running.
is.na(air)

# R is case-sensitive: the base graphics functions are hist(), pie(),
# scatter.smooth() and boxplot(). The capitalised spellings used before
# (Hist, Pie, Scatter.smooth, Boxplot) are not base R functions.

# Histogram of the Ozone column.
hist(air$Ozone)

# Pie chart of the cyl column, first plain, then with an axis label and title.
# NOTE(review): pie() is given the raw cyl values, so it draws one slice per
# row; pie(table(Angel$cyl)) would show the share of each cylinder count.
pie(Angel$cyl)
pie(Angel$cyl, xlab = "Angel Cylinder", main = "PIE CHART")

# Scatter plot of Ozone with a smoothed curve drawn through it.
scatter.smooth(air$Ozone)

# Box plot of Ozone.
boxplot(air$Ozone)
UNIT 2
ggplot2
library(ggplot2)
# Angel is assigned the built-in mtcars data frame.
Angel <- mtcars

# Histogram of cyl using the default binning.
ggplot(data = Angel, mapping = aes(x = cyl)) +
  geom_histogram()

# Same histogram with 5 bins, red outlines and yellow fill.
ggplot(data = Angel, mapping = aes(x = cyl)) +
  geom_histogram(bins = 5, color = "red", fill = "yellow")

# Theme (background): the same histogram drawn with the classic theme.
ggplot(data = Angel, mapping = aes(x = cyl)) +
  geom_histogram(bins = 5, color = "red", fill = "yellow") +
  theme_classic()
# Density plot of cyl.
# `bins` is a histogram argument; geom_density() does not have it (ggplot2
# warns about the unknown parameter and ignores it), so it is dropped.
ggplot(Angel, aes(x = cyl)) +
  geom_density(color = "red", fill = "yellow") +
  theme_classic()

# One density panel per number of cylinders.
# facet_wrap() is documented to take a formula, column names or vars(),
# not a raw data vector such as Angel$cyl.
ggplot(Angel, aes(x = cyl)) +
  geom_density(color = "red", fill = "yellow") +
  theme_classic() +
  facet_wrap(~cyl)

# Box plot of mpg for each cylinder count.
# Aesthetic names are case-sensitive: `X`/`Y` are not recognised, so the
# mapping must use lower-case x and y.
ggplot(Angel, aes(x = as.factor(cyl), y = mpg)) +
  geom_boxplot()
# Compare plots side by side with gridExtra::grid.arrange().
# Installing a package does not attach it, so library(gridExtra) is required
# before grid.arrange() can be called. The install runs only when the package
# is missing instead of on every execution.
if (!requireNamespace("gridExtra", quietly = TRUE)) {
  install.packages("gridExtra")
}
library(gridExtra)

# A, B (and C below) must be plot objects saved beforehand,
# e.g. A <- ggplot(...) + geom_histogram().
# NOTE(review): elsewhere in these notes B holds a data frame and A a
# correlation matrix - reassign them to plots before calling grid.arrange().
grid.arrange(A, B, ncol = 2)

# Compare 3-4 graphs together (save each plot under its own name: A, B, C, D).
# The function name was misspelled as `gird.arrange`.
grid.arrange(A, B, C, ncol = 2)
# Bar chart: one bar per cyl value, bar height = number of rows.
library(ggplot2)
# aes() names are case-sensitive; `X = cyl` left geom_bar() without an x
# aesthetic, so the mapping uses lower-case x.
ggplot(Angel, aes(x = cyl)) +
  geom_bar()

# Same chart with narrower bars, red outlines and yellow fill.
ggplot(Angel, aes(x = cyl)) +
  geom_bar(width = 0.5, color = "red", fill = "yellow")
install.packages("ggcorrplot")
library(ggcorrplot)
# Correlation between two short numeric vectors.
X <- c(2.5, 3.2, 4.5)
y <- c(1.2, 1.6, 1.9)
cor(X, y)

# Correlation between mpg and cyl in mtcars, raw and rounded to one decimal.
# Column names are case-sensitive: mtcars$Cyl is NULL, which makes cor() stop
# with an error; the column is called `cyl`.
cor(mtcars$mpg, mtcars$cyl)
round(cor(mtcars$mpg, mtcars$cyl), 1)

# Correlation matrix of every pair of mtcars columns; A keeps a copy rounded
# to one decimal place.
cor(mtcars)
A <- round(cor(mtcars), 1)
# Visualise the rounded correlation matrix A with ggcorrplot().
ggcorrplot(corr = A)
#forecast
#step1 Forecasting the model
#step 2 Prediction
#Step 3 Antilog
#step4 Model Validation-Plot the data
# Evaluating the bare name at top level prints the AirPassengers object.
AirPassengers
#forecast data for next 4 years
The model works in a loop; hence the trace stops once one model has been formed
#ETP
#supervised Model
#Unsupervised Model
#hence computer will change the unsupervised data into supervised data
#1. Data
#(caret,Kernlab,e1071 packages)
# Load the spam data set and look at its first and last six rows.
# NOTE(review): spam presumably comes from one of the packages listed in the
# note above (caret, kernlab, e1071), which must be attached first - confirm.
data("spam")
head(spam, n = 6L)
tail(spam, n = 6L)
Machine Learning
# Total number of missing values in spam.
sum(is.na(spam))

# Split spam into the rows indexed by intrain (training) and all remaining
# rows (testing).
# NOTE(review): intrain is not created anywhere in these notes - presumably
# built with caret::createDataPartition(); confirm before running.
training <- spam[intrain, ]
testing <- spam[-intrain, ]
Result
Confusion Matrix and Statistics
Reference
Prediction nonspam spam
nonspam 583 114
spam 114 339 (the off-diagonal counts - spam predicted as nonspam and vice versa -
are wrong predictions; they must be kept to a minimum for a good model)
Kappa : 0.5848
Step 1
Data retrieving
# Let the user pick a CSV file interactively and read it, treating its first
# line as the column names; then show the first rows.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
Diabetes <- read.csv(file.choose(), header = TRUE)
head(Diabetes)
step2
data preprocessing
# Total number of missing values in Diabetes.
sum(is.na(Diabetes))

# Turn the Outcome column into a factor, showing its class before and after.
class(Diabetes$Outcome)
Diabetes$Outcome <- as.factor(Diabetes$Outcome)
class(Diabetes$Outcome)

# Compact overview of every column of Diabetes.
str(Diabetes)
#step 3
Data partitioning
library(caret)
# Fix the random seed so that any random draw made after this point is
# reproducible.
set.seed(1234)

# NOTE(review): no line in these notes recomputes intrain for Diabetes -
# presumably a caret::createDataPartition() call on Diabetes$Outcome belongs
# here; without it a stale or missing index is used. Confirm.

# Split Diabetes into the rows indexed by intrain and all remaining rows.
# The names are lower-case because the modelling steps refer to `testing`;
# R is case-sensitive, so `Testing` would leave those steps reading the
# earlier spam `testing` object instead of this one.
training <- Diabetes[intrain, ]
testing <- Diabetes[-intrain, ]
#step4
Data modeling
# Predict on the testing rows and tabulate predictions against true labels.
# NOTE(review): modelfit is not defined in these notes - presumably the result
# of caret::train(); confirm.
# R is case-sensitive: the result was stored as `Prediction` but read back as
# `prediction`, and the label column is `Outcome`, not `outcome`
# (testing$outcome would be NULL).
prediction <- predict(modelfit, newdata = testing)
confusionMatrix(prediction, testing$Outcome)
Knn
#step4
Data modeling
# Predict on the testing rows with the second model and tabulate predictions
# against true labels.
# NOTE(review): modelfit2 is not defined in these notes; confirm where it is
# fitted.
# The label column is `Outcome` (capital O); testing$outcome would be NULL.
prediction2 <- predict(modelfit2, newdata = testing)
confusionMatrix(prediction2, testing$Outcome)
NB
Same 3 steps
# Package names are case-sensitive: the package is klaR, not klar.
library(klaR)
#Step4
Data modeling
# Predict on the testing rows with the third model and tabulate predictions
# against true labels.
# NOTE(review): modelfit3 is not defined in these notes; confirm where it is
# fitted.
# R is case-sensitive: the result was stored as `Prediction3` but read back as
# `prediction3`, and the label column is `Outcome`, not `outcome`.
prediction3 <- predict(modelfit3, newdata = testing)
confusionMatrix(prediction3, testing$Outcome)
#for Unsupervised model use packages [factoextra, nbClust, useful,descr]
# Print the built-in iris data, then store everything except its fifth column
# in irs and print that as well.
iris
irs <- iris[-5]
irs
(The graph is used to identify the number of clusters. Only the steeply changing part of the curve is
taken into account; the flat, horizontal part is not.)
(4 clusters)
#silhouette Method
# Silhouette method for choosing the number of k-means clusters on irs.
# The trailing `+` was removed: nothing is added to the plot, and a dangling
# `+` makes R treat the following line as part of this expression.
fviz_nbclust(irs, kmeans, method = "silhouette")
(2 clusters)
(6 clusters)
#Hartigan Plot
# Build the Hartigan plot from the FitKMeans() result on irs, keep it in h1
# and draw it.
h1 <- PlotHartigan(FitKMeans(irs))
plot(h1)