# R Studio
#
# Section: scalar data types (integer, character, complex, logical) and the
# special values Inf and NaN. Bare expressions are printed when the script is
# run line by line or with Rscript.

# NOTE(review): removes every object from the global environment. Acceptable
# for a stand-alone tutorial session, avoid in reusable scripts.
rm(list = ls())

# Assign a variable with an integer value
a <- 10L
a
is.integer(a) # to check whether the value is integer or not

# character type
# NOTE: the variable `str` shares its name with base::str(); calls such as
# str(x) still resolve to the function because R skips non-function objects
# when looking up a call.
str <- "R programming"
str
s <- "cse3505 -"
s
class(s)

# some useful functions
paste(s, str)
sprintf("%s has scored %d marks", "Sita", 90L)
substr(str, start = 5, stop = 10)
# sub() replaces the first match only; `str` contains no "e", so the value
# comes back unchanged.
sub("e", "C", str)
str
print(str)

# complex type
cmp <- 21 + 10i
sqrt(-1) # NaN with a warning: -1 is treated as a real number
sqrt(-1 + 0i)
sqrt(as.complex(-1)) # explicit type conversion

# logical type
lg <- TRUE
p <- TRUE
q <- FALSE
p & q
p | q
!p

# Obtain the class and type of the variable
class(a)
typeof(a)
class(str)
typeof(str)
class(cmp)
typeof(cmp)
class(lg)
typeof(lg)

# special number Inf representing infinity
1 / 0
1 / Inf
log(0) # natural log
# you can represent base value as 2nd argument
log(10, 2) # base 2
log(10, 10) # base 10

# NaN represents an undefined value (is.na() also reports it as missing)
0 / 0
## create vectors
x <- 1:20 # the colon operator always creates an integer vector
x
# class and length of a vector
class(x)
length(x)

# using c()
x <- c(0.1, 0.2) ## numeric vector
x <- c(TRUE, FALSE) ## logical vector
# T and F are ordinary variables that can be reassigned; prefer TRUE/FALSE.
x <- c(T, F) ## logical vector
x <- c("A", "B", "C") ## character vector
x <- c(1L, 2L, 15L, 27L) ## integer vector
x
x <- c(1 + 2i, 3) ## complex vector

# using vector()
x <- vector()
x
length(x)
class(x)
x <- vector("character", length = 10)
x

# Implicit type coercion - mixed objects
y <- c(1.5, "a") # character
y
y <- c(1.5, TRUE) # numeric
y
y <- c(TRUE, "a") # character
y

# Explicit type coercion
x <- 2.5
class(x)
as.integer(x)
x
x <- -1:5
x
class(x)
as.numeric(x)
as.logical(x)
as.character(x)
as.complex(x)

# Non-sensical coercion results in NAs (with a warning for as.numeric)
x <- c("a", "b", "c")
x
as.numeric(x)
as.logical(x)

# vector arithmetics
x <- c(1, 3, 5)
y <- c(2, 4, 6)
x + y
x - y
x * y
x / y
help(options)
?options
# Keep the previous settings so the change does not leak into later sections.
old_opts <- options(digits = 2)
x / y

# recycling rule: the shorter vector is repeated; R warns here because
# length 5 is not a multiple of length 3
y <- c(2, 4, 6, 8, 10)
x + y
options(old_opts)
# create matrices
m <- matrix()
m
m <- matrix(nrow = 3, ncol = 2)
m
attributes(m)
dim(m)
m <- matrix()
m <- matrix(1:6, nrow = 3, ncol = 2) # constructed column-wise
m <- matrix(1:6, nrow = 3, ncol = 2, byrow = TRUE) # constructed row-wise
m

# constructing from vector: assigning dim() reshapes in place, column-wise
m <- 1:6
dim(m) <- c(3, 2)
m

# constructing using column-binding
x <- 1:3
x
y <- 10:12
y
cbind(x, y)
# constructing using row-binding
rbind(x, y)

# matrix multiplication
x <- matrix(c(1, 2, 3, 4), nrow = 2, ncol = 2)
y <- matrix(c(10, 10, 10, 10), nrow = 2, ncol = 2)
x
y
x * y # does element-wise multiplication
x %*% y # does matrix multiplication
# x / y is element-wise division and x %/% y is element-wise integer
# division. R has no matrix-division operator; multiply by an inverse
# instead, e.g. x %*% solve(b) for an invertible b (the `y` above is
# singular, so it has no inverse).
x
t(x) # transpose of a matrix
solve(x) # inverse of a matrix
det(x) # determinant of a matrix
# creating a list: elements may have different types and lengths
x <- list(1, "a", TRUE, 1 + 3i, 6.7, c(10, 20, 30))
x

# factors
x <- factor(c("male", "female"))
x
x <- factor(c("low", "medium", "high", "low"))
table(x) # count per level
unclass(x) # underlying integer codes plus the levels attribute

# missing values: is.na() is TRUE for both NA and NaN, is.nan() only for NaN
x <- c(1, 2, NA, 5, NaN, 6)
is.na(x)
is.nan(x)
# Data frame ----
# NOTE(review): removes every object from the global environment.
rm(list = ls())

# table with the same type within a column and different types between
# columns, defined with the data.frame() function
id <- c(1, 2, 3)
name <- c("a", "b", "c")
marks <- c(50, 0, 25)
sample_df <- data.frame(id, name, marks)
sample_df

my_df <- data.frame(
  id = c(1, 2, 3),
  name = c("Ramu", "Raju", "Ravi"),
  marks = c(50, 40, 25)
)
my_df
# dimension of the data frame
dim(my_df)
# columns of the data frame
names(my_df)
# structure of the data frame
str(my_df)
# summary statistics of the data frame
summary(my_df)
head(my_df) # top 6 rows in the data frame
tail(my_df) # bottom 6 rows in the data frame

################ ADDING/Removing columns
# Ways to add a column
my_df
# initialize with 0
my_df$name
my_df$perf <- 0
my_df
my_df$perf <- c("very good", "good", "needs to improve")
my_df
# can use [[]], [], [,]
my_df[["perf"]] <- c("very good", "good", "needs to improve")
my_df["perf"] <- c("very good", "good", "needs to improve")
my_df[, "perf"] <- c("very good", "good", "needs to improve")
my_df[5] <- 0 # a new unnamed fifth column gets the name V5
my_df

# Ways to remove the column
my_df[5] <- NULL # by position; `my_df$V5 <- NULL` is the by-name equivalent
my_df
# The remaining spellings are shown on a scratch copy: `perf` must stay in
# my_df because the column positions and the rbind() below rely on it.
scratch_df <- my_df
scratch_df$perf <- NULL
scratch_df <- my_df
scratch_df[["perf"]] <- NULL
scratch_df <- my_df
scratch_df["perf"] <- NULL
scratch_df

# subsetting
df1 <- subset(my_df, select = c(id, marks))
df1
df1 <- subset(my_df, select = -marks)
df1
# View() needs a spreadsheet viewer, so only call it in interactive sessions.
if (interactive()) View(df1)

my_df$mark2 <- c(30, 20, 10)
my_df
# sum of all marks
sum(my_df$mark2)
# rowsum: selecting by name avoids depending on column positions
my_df$total <- rowSums(my_df[c("marks", "mark2")])
my_df
# max
max(my_df$total)
# index at which max value is present
which.max(my_df$total)
# name of the student who got the max mark
my_df[["name"]][which.max(my_df$total)]
my_df$name[which.max(my_df$total)]
my_df[which.max(my_df$total), 2]
my_df

# append a summary row holding the column means
my_df <- rbind(
  my_df,
  data.frame(
    id = 4,
    name = "avgscore",
    marks = mean(my_df$marks),
    perf = "meanperf",
    mark2 = mean(my_df$mark2),
    total = mean(my_df$total)
  )
)
getwd()
write.csv(my_df, "marks1.csv") # row names written as an extra first column
write.csv(my_df, "marks.csv", row.names = FALSE)
# R datasets
# Read back the file written by write.csv(my_df, "marks.csv", ...).
stu_marks <- read.csv("marks.csv")
str(stu_marks)
stu_marks
stu_marks$mark2
stu_marks[4] # fourth column, still a data frame
stu_marks[3, 3] # single cell: row 3, column 3
stu_marks[3, 5]
stu_marks
stu_marks[c(1, 3), c(2, 5)] # rows 1 and 3, columns 2 and 5

# negative indices drop columns 2 and 4
stu_temp <- stu_marks[c(-2, -4)]
stu_temp
row.names(stu_temp)
row.names(stu_temp) <- stu_marks$name
stu_temp
# one-column logical matrix: TRUE where mark2 exceeds 10
boolv <- stu_temp["mark2"] > 10
boolv
row.names(stu_temp)[boolv]
#--------------------------------------------------------------------
library(help = "datasets")
data(mtcars) # Loading mtcars data set
cars <- mtcars # Save the data into workspace

# Viewing data set
mtcars # Total data set in console
# View() needs a spreadsheet viewer, so only call it in interactive sessions.
if (interactive()) View(mtcars) # Viewing dataset in spreadsheet
head(mtcars) # Viewing top-6 observations (default: top-6)
tail(mtcars) # Viewing bottom 6 observations
str(mtcars) # Viewing data dictionary
names(mtcars) # Viewing column names

v1 <- mtcars$mpg # Assigning single variable from mtcars data to v1
v2 <- mtcars$cyl
v3 <- mtcars$disp
v4 <- mtcars$hp
newvar <- mtcars$disp + mtcars$hp
mtcars1 <- rbind(v1, v2, v3, v4) # Combined as rows (4 x 32 matrix)
mtcars1
mtcars2 <- cbind(v1, v2, v3, v4) # Combined as columns (32 x 4 matrix)
mtcars2

# create a variable obs_subset and have rows 4 to 10 in mtcars
obs_subset <- mtcars[4:10, ]
obs_subset
# create a variable var_subset and have only the columns 1,5,9
var_subset <- mtcars[, c(1, 5, 9)]
var_subset

# subsetting
# create a variable subset1 and have only mpg and cyl variables of mtcars
# using indexing
subset1 <- mtcars[, c(1, 2)]
head(subset1, 3)
# using subset()
subset2 <- subset(mtcars, select = c(mpg, cyl))
subset2
# create a variable subset3 and have only the rows where mpg>18
subset3 <- subset(mtcars, mpg > 18)
subset3
# create a variable subset4 and have only the rows where mpg>18 and cyl>5
subset4 <- subset(mtcars, mpg > 18 & cyl > 5)
subset4
# exclude mpg and cyl columns
subset4 <- subset(mtcars, mpg > 18 & cyl > 5, select = c(-mpg, -cyl))
subset4

# install.packages("MASS")
library(MASS)
data("survey")
# clear workspace
# NOTE(review): removes every object from the global environment.
rm(list = ls())

# Two equivalent ways to read the same comma-separated file; the second
# assignment overwrites the first.
loan <- read.csv("loans data.csv")
loan <- read.table("loans data.csv", header = TRUE, sep = ",")
dim(loan)
str(loan)
head(loan, 3)
tail(loan, 2)
summary(loan)

# missing values: is there any, and how many cells in total
any(is.na(loan))
sum(is.na(loan))

# two equivalent ways to keep only rows without any NA
loan_cln <- na.omit(loan)
nrow(loan)
nrow(loan_cln)
loan_cln2 <- loan[complete.cases(loan), ]
nrow(loan_cln2)
# Loading the dplyr package
library(dplyr)

# loading data
data("mtcars")
cars <- mtcars

# dimension of the data
dim(cars)
# structure of the data
str(cars)
# is.na(cars) # NA or NaN

# checking for missing values
any(is.na(cars))
sum(is.na(cars))

#################### Viewing data ########################
# fetching top 6 rows
head(cars)
# fetching last 6 rows
tail(cars)
# viewing data (needs a spreadsheet viewer: interactive sessions only)
if (interactive()) View(cars)
# summary
summary(cars)
cars
# tbl_df(cars)
as_tibble(cars)
glimpse(cars)

############ Subsetting Rows (Observations) #################
# filtering based on single condition
filter(cars, mpg > 25)
# filtering based on multiple condition
filter(cars, mpg > 25 & hp > 90)
# Remove duplicate rows
distinct(cars)
# Randomly select fraction of rows
sample_frac(cars, 0.2)
# Randomly select no. of rows
sample_n(cars, 5)
# selecting rows by position
slice(cars, 11:15)
slice_sample(cars, n = 5)
temp <- filter(cars, mpg > 25)
slice_sample(temp, n = 2)
# the same two steps written as a pipeline
cars %>%
  filter(mpg > 25) %>%
  slice_sample(n = 2)
# unique values in a column
unique(cars$cyl)
# no. of values under each unique category
table(cars$cyl)
# grouping: two random rows per cyl value
cars %>%
  group_by(cyl) %>%
  slice_sample(n = 2) %>%
  ungroup()

############ Subsetting Columns (variables) #################
# select() is namespace-qualified throughout so that another attached
# package exporting select() (e.g. MASS) cannot mask it.
# selecting single column
dplyr::select(cars, mpg)
cars %>%
  dplyr::select(mpg) %>%
  head(3)
# slice_sample(n=3)
# selecting multiple columns
dplyr::select(cars, mpg, cyl, gear)
dplyr::select(cars, c("mpg", "cyl", "gear"))
names(cars)
# select all columns between a range of columns (inclusive)
dplyr::select(cars, hp:am)
# combining filter and select - using pipe operator
cars %>%
  filter(mpg > 18) %>%
  dplyr::select(mpg, cyl) %>%
  head(3)
names(cars)
# selecting columns starting with 'd'
dplyr::select(cars, starts_with("d"))
# selecting columns ending with 't'
dplyr::select(cars, ends_with("t"))
# selecting columns containing 'g'
dplyr::select(cars, contains("g"))
# selecting columns matching regular expression
dplyr::select(cars, matches("..a."))
# Excluding certain columns
dplyr::select(cars, c(-mpg, -cyl))

############ Arranging data #################
# arrange the data in ascending order of mpg
arrange(cars, mpg)
# arrange the data in descending order of mpg
arrange(cars, desc(mpg))
# arrange the data in order based on more than one column
arrange(cars, mpg, disp)
arrange(cars, mpg, desc(disp))

############ Making new variables #################
# creating a new column
mutate(cars, newvar = disp - hp)
# combining functions
# create a new variable that sums up disp and hp, keep only the rows where
# mpg>25 & disp>90, and select only mpg, disp, hp and the new variable
cars %>%
  mutate(newvar2 = disp + hp) %>%
  filter(mpg > 25, disp > 90) %>%
  dplyr::select(mpg, disp, hp, newvar2) %>%
  sample_n(2)

############ summarizing data #################
# group_by() is typically combined with summarise(); it is applied on a
# categorical value
cars %>%
  group_by(cyl) %>%
  summarize(cnt = n()) # number of rows per unique cyl value
table(cars$cyl)
# computing max, min, standard deviation, mean and median per group
cars %>%
  group_by(cyl) %>%
  summarize(
    mx_mpg = max(mpg),
    mi_mpg = min(mpg),
    std_mpg = sd(mpg),
    mn = mean(mpg),
    md = median(mpg)
  )
# clear workspace
# NOTE(review): removes every object from the global environment.
rm(list = ls())

# Two equivalent ways to read the same comma-separated file; the second
# assignment overwrites the first. The read.table() call was missing here
# and is restored from the identical block earlier in the file.
loan <- read.csv("loans data.csv")
loan <- read.table("loans data.csv", header = TRUE, sep = ",")
dim(loan)
str(loan)
head(loan, 3)
tail(loan, 2)
summary(loan)

# missing values: is there any, and how many cells in total
any(is.na(loan))
sum(is.na(loan))

# two equivalent ways to keep only rows without any NA
loan_cln <- na.omit(loan)
nrow(loan)
nrow(loan_cln)
loan_cln2 <- loan[complete.cases(loan), ]
nrow(loan_cln2)
# Loading the dplyr package
library(dplyr)

# loading data
data("mtcars")
cars <- mtcars

# dimension of the data
dim(cars)
# structure of the data
str(cars)
# is.na(cars) # NA or NaN

# checking for missing values
any(is.na(cars))
sum(is.na(cars))

#################### Viewing data ########################
# first and last 6 rows
head(cars)
tail(cars)
# viewing data (needs a spreadsheet viewer: interactive sessions only)
if (interactive()) View(cars)
# summary
summary(cars)
cars
# tbl_df(cars)
as_tibble(cars)
glimpse(cars)

############ Subsetting Rows (Observations) #################
# filtering based on single condition
filter(cars, mpg > 25)
# filtering based on multiple condition
filter(cars, mpg > 25 & hp > 90)
# Remove duplicate rows
distinct(cars)
# Randomly select fraction of rows
sample_frac(cars, 0.2)
# Randomly select no. of rows
sample_n(cars, 5)
# selecting rows by position
slice(cars, 11:15)
slice_sample(cars, n = 5)
temp <- filter(cars, mpg > 25)
slice_sample(temp, n = 2)
# the same two steps written as a pipeline
cars %>%
  filter(mpg > 25) %>%
  slice_sample(n = 2)
# unique values in a column
unique(cars$cyl)
# no. of values under each unique category
table(cars$cyl)
# grouping: two random rows per cyl value
cars %>%
  group_by(cyl) %>%
  slice_sample(n = 2) %>%
  ungroup()

############ Subsetting Columns (variables) #################
# select() is namespace-qualified throughout so that another attached
# package exporting select() (e.g. MASS) cannot mask it.
# selecting single column
dplyr::select(cars, mpg)
cars %>%
  dplyr::select(mpg) %>%
  head(3)
# slice_sample(n=3)
# selecting multiple columns
dplyr::select(cars, mpg, cyl, gear)
dplyr::select(cars, c("mpg", "cyl", "gear"))
names(cars)
# select all columns between a range of columns (inclusive)
dplyr::select(cars, hp:am)
# combining filter and select - using pipe operator
cars %>%
  filter(mpg > 18) %>%
  dplyr::select(mpg, cyl) %>%
  head(3)
names(cars)
# selecting columns starting with 'd'
dplyr::select(cars, starts_with("d"))
# selecting columns ending with 't'
dplyr::select(cars, ends_with("t"))
# selecting columns containing 'g'
dplyr::select(cars, contains("g"))
# selecting columns matching regular expression
dplyr::select(cars, matches("..a."))
# Excluding certain columns
dplyr::select(cars, c(-mpg, -cyl))

############ Arranging data #################
# arrange the data in ascending order of mpg
arrange(cars, mpg)
# arrange the data in descending order of mpg
arrange(cars, desc(mpg))
# arrange the data in order based on more than one column
arrange(cars, mpg, disp)
arrange(cars, mpg, desc(disp))

############ Making new variables #################
# creating a new column
mutate(cars, newvar = disp - hp)
# combining functions
# create a new variable that sums up disp and hp, keep only the rows where
# mpg>25 & disp>90, and select only mpg, disp, hp and the new variable
cars %>%
  mutate(newvar2 = disp + hp) %>%
  filter(mpg > 25, disp > 90) %>%
  dplyr::select(mpg, disp, hp, newvar2) %>%
  sample_n(2)

############ summarizing data #################
# group_by() is typically combined with summarise(); it is applied on a
# categorical value
cars %>%
  group_by(cyl) %>%
  summarize(cnt = n()) # number of rows per unique cyl value
table(cars$cyl)
# computing max, min, standard deviation, mean and median per group
cars %>%
  group_by(cyl) %>%
  summarize(
    mx_mpg = max(mpg),
    mi_mpg = min(mpg),
    std_mpg = sd(mpg),
    mn = mean(mpg),
    md = median(mpg)
  )
# NOTE(review): removes every object from the global environment.
rm(list = ls())

# creating array from vectors
# The 9 values of c(v1, v2) are recycled to fill the 3 x 3 x 2 array, so
# both 3 x 3 slices hold the same numbers.
v1 <- c(1, 2, 3)
v2 <- c(4, 5, 6, 7, 8, 9)
A1 <- array(c(v1, v2), dim = c(3, 3, 2))
A1

# naming columns and rows
rname <- c("r1", "r2", "r3")
cname <- c("c1", "c2", "c3")
mname <- c("mat1", "mat2")
dimnames(A1) <- list(rname, cname, mname)
# the same result in a single call
A1 <- array(
  c(v1, v2),
  dim = c(3, 3, 2),
  dimnames = list(rname, cname, mname)
)
A1

# printing the second row of second matrix
A1[2, , 2]
A1["r2", , "mat2"]
# printing the second column of first matrix
A1[, 2, 1]
A1[, "c2", "mat1"]
# printing the element in the 2nd row and 3rd column of second matrix
A1[2, 3, 2]
# printing the second matrix
A1[, , 2]
A1[, , "mat2"]

# Manipulating array elements
M1 <- A1[, , 1]
M2 <- A1[, , 2]
M3 <- M1 + M2
M3
M1

# Aggregation on array elements
apply(M1, 1, sum) # 1 - along row
apply(M2, 2, sum) # 2 - along column
A1
apply(A1, 1, sum) # per row, across all columns and both matrices
apply(A1, 2, mean) # per column, across all rows and both matrices
# NOTE(review): removes every object from the global environment.
rm(list = ls())

# To create date / To represent date
d <- date() # current date and time as a character string
d
class(d)
# as.Date(d)
# to convert date string to date class
d <- as.Date("2022-8-25") # default format - year-month-day
class(d)
d
as.Date("2022-8-25 10:44:22") # the time part is ignored
as.Date("2022-8-25 21:15")
# to see the internal representation
unclass(d)

# to represent both date and time
as.POSIXct("2022-8-25")
pd <- as.POSIXct("2022-8-25 21:15")
pd
class(pd)
unclass(pd)
pd <- as.POSIXlt("2022-8-25")
pd
class(pd)
# getting meta using unclass()
unclass(pd)
names(unclass(pd))
pd <- as.POSIXlt("2022-8-17 21:15:30")
pd$sec
pd$hour
pd$min
pd$mday
pd$year
unlist(pd)

# if format is different
# NOTE: %B and %b match month names in the current locale.
as.Date("25/8/2022", format = "%d/%m/%Y")
date()
as.Date("August 25,2022", format = "%B %d,%Y")
as.Date("25Aug22", format = "%d%b%y")

# Checking the class
class(as.Date("2022-8-25 21:15"))
class(as.POSIXct("2022-8-25 21:15"))
class(as.POSIXlt("2022-8-25 21:15"))

# Getting date, time and zone
p <- Sys.Date() # only current date
class(p)
Sys.time() # current date, time and timezone
Sys.timezone()

# difference in dates
Sys.Date() - as.Date("1979-03-21")
difftime(Sys.Date(), as.Date("1979-03-21"), units = "weeks")

# getting weekdays and basic arithmetic
d <- as.Date("2022-8-17")
d
# to find weekday of the date
weekdays(d)
# add or subtract to create new date(s)
d + 1
d + 1:5
weekdays(d + 1:5)

# check for seq and rep
# using sequence
d
dt <- seq(d, by = "2 months", length.out = 6)
dt
# getting month and quarter
months(d)
months(dt)
quarters(dt)
# lubridate::today() # lubridate package
# ISOdate(2021,8,25)

?strptime
help("strptime")
datestring <- "August 17, 2022 04:20"
convertedForm <- strptime(datestring, "%B %d, %Y %H:%M")
class(convertedForm)
convertedForm

x <- as.Date("2020-01-01")
y <- strptime("25 Aug 2020 09:00:00", "%d %b %Y %H:%M:%S")
# Subtracting a POSIXlt from a Date mixes two incompatible classes and is
# expected to fail; try() shows the message without stopping the script.
try(x - y)
class(x)
class(y)
# convert to a common class first, then the subtraction works
x <- as.POSIXlt(x)
x - y

# different time zones
x <- as.POSIXct("2021-08-25 08:00:00") # interpreted in the local time zone
x
xgmt <- as.POSIXct("2021-08-25 08:00:00", tz = "GMT")
xgmt
xgmt - x

# if(!file.exists("data"))
#   dir.create("data")
# fileurl <- "https://data.baltimorecity.gov/api/views/dz54-2aru/rows.csv?accessType=DOWNLOAD"
# download.file(fileurl,destfile = "E:/sweetlin-personal/coursera/data/camera.csv")
# list.files("E:/sweetlin-personal/coursera/data")
# dateofdownload <- date()
# dateofdownload
# NOTE(review): removes every object from the global environment.
rm(list = ls())

# Reading flat file using read.table()
# The read.table() call was missing here and is restored from the identical
# calls elsewhere in the file.
loan <- read.table("loans data.csv", header = TRUE, sep = ",")
str(loan)
head(loan, 2)

# Reading flat file using read.csv()
loan1 <- read.csv("loans data.csv")
str(loan1)

# tab-separated file without a header row
df <- read.table("tabsepfile.txt", header = FALSE, sep = "\t")
str(df)
# slash-separated file: trim surrounding blanks and read "EMPTY" as NA
df <- read.table(
  "slashsepfile.txt",
  header = FALSE,
  sep = "/",
  strip.white = TRUE,
  na.strings = "EMPTY"
)
str(df)

# Reading Excel file
# you need to import xlsx package
# install.packages("xlsx")
# library(xlsx)
# loan <- read.xlsx("loan.xls",sheetIndex=1, header=TRUE)
# install.packages("XLConnect")
# library(XLConnect)
library(readxl)
# excel_sheets('E:/sweetlin-official/FALL 2020 -2021/CSE3505/R programs/loans data.xlsx')
excel_sheets("loans data.xlsx") # names of the sheets in the workbook
df <- read_excel("loans data.xlsx", sheet = "sample")
str(df)
# XLConnect, XLSX, readxl
# Reading XML file
# You need the XML package; install it only when it is missing instead of
# reinstalling on every run.
if (!requireNamespace("XML", quietly = TRUE)) {
  install.packages("XML")
}
library(XML)
# library(methods)
# install.packages("RCurl") # library(RCurl)
library(httr)

fileurl <- "https://www.w3schools.com/xml/simple.xml"
xmldata <- GET(fileurl)
# Hand the response body to the parser explicitly as text rather than
# passing the httr response object itself.
doc <- xmlTreeParse(
  content(xmldata, as = "text", encoding = "UTF-8"),
  asText = TRUE,
  useInternalNodes = TRUE
)
root <- xmlRoot(doc)
root
xmlName(root)
names(root)

# Accessing parts of xml file in the same way as list
root[[1]] # accessing 1st food
root[[1]][[1]] # accessing name of the 1st food

# Extracting parts of XML file - value of all nodes
xmlSApply(root, xmlValue)
# Values of every child of every food node. Stored under its own name so
# that `root` stays an XML node for the XPath queries below.
food_values <- xmlSApply(root, function(x) xmlSApply(x, xmlValue))
food_values

# Extracting individual nodes of XML file
xpathSApply(root, "//name", xmlValue)
xpathSApply(root, "//price", xmlValue)

# one row per food, one column per child element
xml_df <- data.frame(t(food_values), row.names = NULL)
str(xml_df)
# Reading JSON file
# Loading jsonlite package
library(jsonlite)

# fromJSON() accepts a URL and returns the parsed content
jdata <- fromJSON("https://api.github.com/users/jtleek/repos")
names(jdata)
class(jdata)
str(jdata)
head(jdata, 2)
# Extracting nested objects
names(jdata$owner)
jdata$owner$login

# converting a data frame to JSON text (nothing is written to disk)
data(iris)
str(iris)
head(iris, 2)
jfile <- toJSON(iris, pretty = TRUE)
cat(jfile)

# parsing the JSON text back into a data frame
irisdata <- fromJSON(jfile)
head(irisdata)
# clear workspace
# NOTE(review): removes every object from the global environment.
rm(list = ls())

############ Reading data #################
# using read.table()
loan_data <- read.table("loans data.csv", header = TRUE, sep = ",")
# using read.csv()
loan <- read.csv("loans data.csv")

dim(loan)
str(loan)
# view of the data in a table (needs a viewer: interactive sessions only)
if (interactive()) View(loan)
head(loan)
tail(loan)
# summary of the data
summary(loan)

############ Cleaning data #################
# checking for missing values in the data
any(is.na(loan)) # NA NaN
# checking for the total no. of missing values in the data
sum(is.na(loan))

# cleaning NA values: drop every row that has at least one NA
loan_clean <- na.omit(loan)
sum(is.na(loan_clean))
str(loan_clean)
loan_clean1 <- loan[complete.cases(loan), ] # boolean indexing
sum(is.na(loan_clean1))
# imputation - filling the missing values

# cleaning Amount.Requested column
# checking for the total no. of missing values in a particular column
sum(is.na(loan$Amount.Requested))
unique(loan$Amount.Requested)
# changing to numeric types (values that are not numbers become NA)
loan$Amount.Requested <- as.integer(loan$Amount.Requested)
str(loan)
unique(loan$Amount.Requested)
mean(loan$Amount.Requested, na.rm = TRUE)
median(loan$Amount.Requested, na.rm = TRUE)

# library(dplyr)
library(tidyverse)
# Decide whether to impute with mean or median
loan %>%
  summarize(
    avg = mean(Amount.Requested, na.rm = TRUE),
    med = median(Amount.Requested, na.rm = TRUE)
  )
# replace the missing values with the column median
loan <- loan %>%
  mutate(
    Amount.Requested = replace(
      Amount.Requested,
      is.na(Amount.Requested),
      median(Amount.Requested, na.rm = TRUE)
    )
  )
sum(is.na(loan$Amount.Requested))

# Rename a column
loan <- loan %>%
  rename(Amt_Req = Amount.Requested)
names(loan)
str(loan)

# cleaning Amount.Funded.By.Investors column
sum(is.na(loan$Amount.Funded.By.Investors))
unique(loan$Amount.Funded.By.Investors)
str(loan)
loan <- loan %>%
  rename(Amt_fund = Amount.Funded.By.Investors)
# convert the type to numeric
loan$Amt_fund <- as.numeric(loan$Amt_fund)
# checking for NA values
sum(is.na(loan$Amt_fund))
# check impute with mean or median
# Columns are referenced by bare name inside summarize(), matching the
# Amount.Requested step above.
loan %>%
  summarize(
    avg = mean(Amt_fund, na.rm = TRUE),
    md = median(Amt_fund, na.rm = TRUE)
  )
loan <- loan %>%
  mutate(
    Amt_fund = replace(
      Amt_fund,
      is.na(Amt_fund),
      median(Amt_fund, na.rm = TRUE)
    )
  )
sum(is.na(loan$Amt_fund))
str(loan)

# cleaning Interest.Rate column
sum(is.na(loan$Interest.Rate))
# cleaning unwanted substring in a chr column
loan <- loan %>%
  mutate(Interest.Rate = gsub("%", "", Interest.Rate))
head(loan$Interest.Rate, 2)
loan$Interest.Rate <- as.numeric(loan$Interest.Rate)
head(loan$Interest.Rate, 2)
str(loan)

# cleaning Loan.Length column
sum(is.na(loan$Loan.Length))
unique(loan$Loan.Length)
loan <- loan %>%
  mutate(Loan.Length = gsub(" months", "", Loan.Length))
loan$Loan.Length <- as.integer(loan$Loan.Length)
# filtering the rows with NA values
loan %>%
  filter(is.na(Loan.Length))
# drop the rows with NA values
loan <- loan %>%
  drop_na(Loan.Length)
# checking
sum(is.na(loan$Loan.Length))

# cleaning Employment.Length column
sum(is.na(loan$Employment.Length))
unique(loan$Employment.Length)
# Strip " year" / " years", the "< " prefix and the "+" suffix. Writing the
# unit as " years?" removes the whole word regardless of how the regex
# engine orders alternatives.
loan <- loan %>%
  mutate(Employment.Length = gsub(" years?|< |\\+", "", Employment.Length))
loan$Employment.Length <- as.integer(loan$Employment.Length)
# checking
sum(is.na(loan$Employment.Length))
table(loan$Employment.Length)
mean(table(loan$Employment.Length))
# NOTE(review): missing values are filled with the constant 2; the reason
# for choosing 2 is not visible in this script - confirm.
loan <- loan %>%
  mutate(
    Employment.Length = replace(
      Employment.Length,
      is.na(Employment.Length),
      2
    )
  )
# checking
sum(is.na(loan$Employment.Length))

# cleaning FICO.Range column
head(loan$FICO.Range, 2)
# split the range text into two new columns
loan <- loan %>%
  separate(FICO.Range, c("fico-low", "fico-high"))
str(loan)
names(loan)
# the new names contain "-", so they must be quoted with backticks
loan$`fico-high` <- as.integer(loan$`fico-high`)
loan$`fico-low` <- as.integer(loan$`fico-low`)
str(loan)
sum(is.na(loan$`fico-high`))
sum(is.na(loan$`fico-low`))
unique(loan$`fico-high`)
unique(loan$`fico-low`)
# statistical analysis - Numerical measure
str(faithful) # faithful - built-in data
head(faithful)

# Central tendency measure
mean(faithful$eruptions)
# median
median(faithful$eruptions)

# Measure of dispersion
range(faithful$eruptions) # returns c(min, max)
max(faithful$eruptions) - min(faithful$eruptions) # width of the range
# quartile
quantile(faithful$eruptions)
# Inter-quartile range
IQR(faithful$eruptions)
# percentile
quantile(faithful$eruptions, c(.27, .35, .65))
# variance
var(faithful$eruptions)
# standard deviation
sd(faithful$eruptions)
# covariance
cov(faithful$eruptions, faithful$waiting)
# correlation
cor(faithful$eruptions, faithful$waiting)
# moment - third central moment
# the second central moment of a population is its variance
library(e1071)
moment(faithful$eruptions, 3, center = TRUE)
# skewness
skewness(faithful$eruptions)
# kurtosis
kurtosis(faithful$eruptions)
# frequency distribution
# step1 - find range
range(faithful$eruptions)
# step2 - Break the range into non-overlapping sub-intervals by defining a
# sequence of equal distance break points.
breaks <- seq(1.5, 5.5, by = 0.5)
breaks
# step3 - Classify the eruption durations according to the half-unit-length
# sub-intervals with cut. right = FALSE makes the intervals closed on the
# left and open on the right: [a, b).
interval <- cut(faithful$eruptions, breaks, right = FALSE)
# step 4 - Compute the frequency of eruptions in each sub-interval with the
# table function.
Interval_freq <- table(interval)
Interval_freq
cbind(Interval_freq)

# relative frequency
relfreq <- Interval_freq / nrow(faithful)
# print with 2 significant digits; `old` keeps the previous setting
old <- options(digits = 2)
cbind(Interval_freq, relfreq)

# cumulative frequency
cumfreq <- cumsum(table(interval))
cumfreq
cbind(cumfreq)
# put the print precision back so later sections are not affected
options(old)
# NOTE(review): removes every object from the global environment.
rm(list = ls())
library(help = "graphics")
data("airquality")
str(airquality)

# to set the margin
par(mar = c(2, 2, 2, 2))
# 1D scatter plot
plot(airquality$Ozone)
# 2D scatter plot
plot(airquality$Ozone, airquality$Wind)
?plot
# type argument in plot
plot(airquality$Ozone, type = "l")
# title and axis labels arguments
plot(
  airquality$Ozone,
  main = "ozone levels",
  xlab = "index",
  ylab = "ozone"
)
# histogram
hist(airquality$Solar.R)
# boxplot
summary(airquality$Ozone)
boxplot(airquality$Ozone)
# multiple boxplot
boxplot(airquality[, 1:4], main = "multiple box plots")

# pie chart
unique(airquality$Wind)
table(airquality$Wind)
wind_freq <- table(airquality$Wind)
# TRUE for wind values that occur more than 8 times (a count threshold,
# not a wind-speed threshold)
wind_above8 <- wind_freq > 8
wind_freq
wind_above8
wind_above8data <- wind_freq[wind_above8]
wind_above8data
table(wind_above8)
pie(wind_above8data, radius = 1)
par(mar = c(1, 1, 1, 1))

# grid of charts: 2 rows x 3 columns
# par() returns the previous values of the settings it changes; they are
# restored after the grid so later plots are drawn one per page again.
grid_par <- par(mfrow = c(2, 3), mar = c(2, 2, 2, 1), las = 0, bty = "n")
plot(airquality$Ozone)
plot(airquality$Ozone, airquality$Wind)
plot(airquality$Ozone, type = "l")
barplot(airquality$Ozone, main = "Ozone levels", ylab = "ozone value")
hist(airquality$Solar.R)
boxplot(airquality$Ozone)
par(grid_par)
# lattice graph
library(lattice)
# density plot
densityplot(airquality$Ozone)
# scatter plot matrix
splom(airquality[c(1, 3, 4)])

# scatter plot depicting the combination of 2 variables
data("mtcars")
df <- mtcars
str(df)
par(mar = c(4, 4.5, 1, 1))
plot(df$wt, df$mpg)

# turn the numeric codes into labelled factors for conditioning
unique(df$cyl)
cyl_factor <- factor(
  df$cyl,
  levels = c(4, 6, 8),
  labels = c("4cyl", "6cyl", "8cyl")
)
unique(df$gear)
gear_factor <- factor(
  df$gear,
  levels = c(3, 4, 5),
  labels = c("3 gears", "4 gears", "5 gears")
)
# one panel per cylinder/gear combination
xyplot(
  df$mpg ~ df$wt | cyl_factor * gear_factor,
  main = "scatter plots: Cylinders and Gears",
  xlab = "weight of car",
  ylab = "miles per gallon"
)

gear_factor
freq_gear <- table(gear_factor)
freq_gear
barplot(freq_gear, col = c("red", "green", "blue"))
pie(
  freq_gear,
  labels = c("3 gears", "4 gears", "5 gears"),
  col = c("red", "green", "blue"),
  radius = 1
)
# NOTE(review): removes every object from the global environment.
rm(list = ls())
data("mtcars")
# install.packages("ggplot2")
library(ggplot2)
head(mtcars, 2)

# scatter plot
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_point()

# labelled factors used for colour/shape/fill below
unique(mtcars$cyl)
cyl_factor <- factor(
  mtcars$cyl,
  levels = c(4, 6, 8),
  labels = c("4cyl", "6cyl", "8cyl")
)
unique(mtcars$gear)
gear_factor <- factor(
  mtcars$gear,
  levels = c(3, 4, 5),
  labels = c("3 gears", "4 gears", "5 gears")
)

# scatter plot - multiple variables through color
ggplot(mtcars, aes(x = wt, y = mpg, color = gear_factor)) +
  geom_point()
# scatter plot - multiple variables through size
ggplot(mtcars, aes(x = wt, y = mpg, size = qsec)) +
  geom_point()
# scatter plot - multiple variables through both color and shape
# Fixed values (size = 4) go outside aes(); inside aes() they are treated
# as a data mapping and produce a spurious legend.
ggplot(mtcars, aes(x = wt, y = mpg, shape = gear_factor)) +
  geom_point(aes(color = cyl_factor), size = 4) +
  geom_point(color = "grey", size = 1.5)
ggplot(mtcars, aes(x = wt, y = mpg, shape = gear_factor)) +
  geom_point(aes(color = cyl_factor), size = 4) +
  geom_point(color = "grey", size = 1.5)
# scatter plot - adding best fit line
ggplot(mtcars, aes(x = wt, y = mpg)) +
  geom_point() +
  geom_smooth(method = "lm")

########### bar plot ###########
ggplot(mtcars, aes(x = gear_factor)) +
  geom_bar()
# red outline is a fixed value, so it is set on the geom, not in aes()
ggplot(mtcars, aes(x = gear_factor, fill = gear_factor)) +
  geom_bar(color = "red") +
  ggtitle("frequency plot of gear")
# flipping the bar direction
ggplot(mtcars, aes(x = gear_factor)) +
  geom_bar() +
  coord_flip()
# bar plot for 2 variables
ggplot(mtcars, aes(x = cyl_factor, fill = gear_factor)) +
  geom_bar(position = "stack")

#################### pie chart ############
# a single stacked bar drawn in polar coordinates
ggplot(mtcars, aes(x = "", y = mpg, fill = cyl_factor)) +
  geom_bar(width = 1, stat = "identity") +
  coord_polar("y", start = 0)
#################### histogram ###########
ggplot(mtcars, aes(x = hp)) +
  geom_histogram() +
  labs(title = "Distribution of hp", y = "frequency")
# setting the number of bins
ggplot(mtcars, aes(x = hp)) +
  geom_histogram(bins = 3) +
  labs(title = "Distribution of hp", y = "frequency")
# setting bin width
ggplot(mtcars, aes(x = hp)) +
  geom_histogram(binwidth = 30) +
  labs(title = "Distribution of hp", y = "frequency")
# with border and fill color
ggplot(mtcars, aes(x = hp)) +
  geom_histogram(binwidth = 30, color = "green", fill = "yellow") +
  labs(title = "Distribution of hp", y = "frequency")
# facets: one panel per cylinder group
# facet_wrap() takes a formula or vars(), not a bare vector. cyl_factor is
# the labelled cylinder factor created earlier in this script.
ggplot(mtcars, aes(x = hp)) +
  geom_histogram(color = "white", fill = "blue") +
  labs(title = "Distribution of hp", y = "frequency") +
  facet_wrap(~cyl_factor, ncol = 1)

################ Kernel density curve ############
ggplot(mtcars, aes(x = hp)) +
  geom_density() +
  labs(title = "Distribution of hp", x = "horse power", y = "density")
# with fill color
ggplot(mtcars, aes(x = hp)) +
  geom_density(fill = "blue", color = "red") +
  labs(title = "Distribution of hp", x = "horse power", y = "density")

############## Line plot ###############
library(dplyr)
d <- sample_n(mtcars, 10) # 10 random rows
ggplot(d, aes(x = wt, y = drat)) +
  geom_line()
d
# with varied thickness and color points
# Fixed size and colours are set on the geoms; inside aes() the strings
# "red"/"blue" would be treated as category labels, not colours.
ggplot(d, aes(x = wt, y = drat)) +
  geom_line(size = 2, color = "red") +
  geom_point(size = 2, color = "blue")

################# box plot ################
ggplot(mtcars, aes(x = mpg)) +
  geom_boxplot()
# multiple box plots
ggplot(mtcars, aes(x = cyl_factor, y = mpg)) +
  geom_boxplot()

R Studio

Uploaded by

Document Information

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

R Studio

Uploaded by

Copyright:

Available Formats

rm(list=ls())

#Assign a variable with an

#some useful functions

#Obtain the class and type of the variable

#special number Inf representing infinity

#you can represent base value as 2nd

#NaN represents a undefined value (also indicates a missing value)

#using c() x <- c(0.1,0.2)

#Non-sensical coercion results in NAs

dim(m) <- c(3,2)

#constructing using row-

#matrix multiplication x <-

#creating a List x <-

# Data frame ----------------------------------------------

my_df <- data.frame(id = c(1, 2, 3),

#dimension of the data frame

#columns of the data frame

#structure of the data frame

#summary statistics of the data frame

head(my_df) #top 6 rows in the data

################ ADDING/Removing columns

my_df$perf <- c("very good","good","needs

#can use [[]],[],[,] my_df[["perf"]] <-c("very

# Ways to remove the column

my_df[5] <- NULL

df1 <- subset(my_df,

#sum of all marks

#name of the student who got the

my_df my_df <-

stu_marks <- read.csv("marks.csv")

stu_temp <- stu_marks[c(-2,-4)]

data(mtcars) # Loading mtcars

# Viewing data set mtcars

head(mtcars) # Viewing top-6 observations (default:

mtcars1<-rbind(v1,v2,v3,v4) # Combined as rows

#create a variable obs_subset and have rows 4

#create a variable var_subset and have only the

#create a variable subset3 and have only the rows

#create a variable subset3 and have only the rows where

#exclude mpg and cyl columns subset4 <-

loan <- read.csv("loans data.csv") loan <-

#Loading the dplyr package

#dimension of the data

#structure of the data

#is.na(cars) #NA or NaN

#fetching top 6 rows

#fetching last 6 rows

############ Subsetting Rows (Observations) #################

#filtering based on single

#filtering based on multiple

#Remove duplicate rows

#Randomly select fraction of rows

#Randomly select no. of rows

#selecting rows by position

#unique values in a column

#no. of values under each unique category

############ Subsetting Columns (variables) #################

#selecting single column

#combining filter and select- using pipe operator

#selecting columns ending with 't'

#Excluding certain columns

############ Arranging data

#arrange the data in ascending order of mpg