# Page 1 of 25

# ---- Basic data types ------------------------------------------------------
# Walk-through of R's atomic types (integer, character, complex, logical),
# how to inspect them with class()/typeof(), and the special values Inf / NaN.

# Workspace reset left disabled: rm(list = ls()) silently deletes every object
# in the global environment. Restart the R session for a clean slate instead.
# rm(list = ls())

# Assign a variable with an integer value (the L suffix makes it an integer)
a <- 10L
a
is.integer(a) # to check whether the value is integer or not

# character type
str <- "R programming"
str
s <- "cse3505 -"
s
class(s)

# some useful functions
paste(s, str)
sprintf("%s has scored %d marks", "Sita", 90)
substr(str, start = 5, stop = 10)
sub("e", "C", str) # returns a modified copy; str itself is not changed

str
print(str)

# complex type
cmp <- 21 + 10i
sqrt(-1)             # NaN with a warning: -1 is a real (double) value here
sqrt(-1 + 0i)        # complex input gives a complex result
sqrt(as.complex(-1)) # explicit type conversion

# logical type
lg <- TRUE
p <- TRUE
q <- FALSE
p & q
p | q
!p

# Obtain the class and type of the variable
class(a)
typeof(a)
class(str)
typeof(str)
class(cmp)
typeof(cmp)
class(lg)
typeof(lg)

# special number Inf representing infinity
1 / 0
1 / Inf
log(0) # find natural log.

# you can represent base value as 2nd argument
log(10, 2)  # base 2
log(10, 10) # base 10

# NaN represents an undefined value (is.na() also reports it as missing)
0 / 0
# ---- Vectors ---------------------------------------------------------------
# Creating vectors, implicit/explicit type coercion, vector arithmetic and the
# recycling rule.

## create vectors
x <- 1:20 # the colon operator always creates an integer vector
x
# class and length of a vector
class(x)
length(x)

# using c()
x <- c(0.1, 0.2)         ## numeric vector
x <- c(TRUE, FALSE)      ## logical vector
x <- c(T, F)             ## logical vector (T/F are reassignable aliases;
                         ## prefer TRUE/FALSE in real code)
x <- c("A", "B", "C")    ## character vector
x <- c(1L, 2L, 15L, 27L) ## integer vector
x
x <- c(1 + 2i, 3)        ## complex vector

# using vector()
x <- vector()
x
length(x)
class(x)

x <- vector("character", length = 10)
x

# Implicit type coercion - mixed objects
y <- c(1.5, "a")  # character
y
y <- c(1.5, TRUE) # numeric
y

y <- c(TRUE, "a") # character
y

# Explicit type coercion
x <- 2.5
class(x)
as.integer(x)
x

x <- -1:5
x
class(x)
as.numeric(x)
as.logical(x)
as.character(x)
as.complex(x)

# Non-sensical coercion results in NAs
x <- c("a", "b", "c")
x
as.numeric(x)
as.logical(x)

# vector arithmetics
x <- c(1, 3, 5)
y <- c(2, 4, 6)
x + y
x - y
x * y
x / y
help(options)
?options
# Keep the previous settings so the print precision can be put back below.
old_opts <- options(digits = 2)

# recycling rule: the shorter vector is reused; R warns because 5 is not a
# multiple of 3
y <- c(2, 4, 6, 8, 10)
x + y

# Restore the print precision so later output is not silently truncated.
options(old_opts)

# ---- Matrices --------------------------------------------------------------
# Building matrices (matrix(), dim<-, cbind/rbind) and basic matrix algebra.

# create matrices
m <- matrix()
m

m <- matrix(nrow = 3, ncol = 2)
m
attributes(m)
dim(m)
m <- matrix()
m <- matrix(1:6, nrow = 3, ncol = 2) # constructed column-wise (the default)
m
m <- matrix(1:6, nrow = 3, ncol = 2, byrow = TRUE) # constructed row-wise
m

# constructing from vector: setting dim() reshapes the vector in place
m <- 1:6
dim(m) <- c(3, 2)
m

# constructing using column-binding
x <- 1:3
x
y <- 10:12
y
cbind(x, y)

# constructing using row-binding
rbind(x, y)

# matrix multiplication
x <- matrix(c(1, 2, 3, 4), nrow = 2, ncol = 2)
y <- matrix(c(10, 10, 10, 10), nrow = 2, ncol = 2)
x
y
x * y   # does element-wise multiplication
x %*% y # does matrix multiplication

# Note: x / y is element-wise division and x %/% y is element-wise *integer*
# division. There is no matrix-division operator; multiply by the inverse
# instead, e.g. x %*% solve(z) for an invertible z.

x
t(x)     # transpose of a matrix
solve(x) # inverse of a matrix
det(x)   # determinant of a matrix

# ---- Lists, factors and missing values -------------------------------------

# creating a list: elements may have different types and lengths
x <- list(1, "a", TRUE, 1 + 3i, 6.7, c(10, 20, 30))
x

# factors
x <- factor(c("male", "female"))
x
x <- factor(c("low", "medium", "high", "low"))
table(x)   # frequency of each level
unclass(x) # underlying integer codes plus the levels attribute

# missing values
x <- c(1, 2, NA, 5, NaN, 6)
is.na(x)  # TRUE for both NA and NaN
is.nan(x) # TRUE for NaN only

# Data frame -----------------------------------------------------------------
# Creating, inspecting, extending, subsetting and saving a small data frame.

# Workspace reset left disabled: rm(list = ls()) silently deletes every object
# in the global environment. Restart the R session for a clean slate instead.
# rm(list = ls())

# table with the same type within a column and different types between columns
# defined with a data.frame() function
id <- c(1, 2, 3)
name <- c("a", "b", "c")
marks <- c(50, 0, 25)
sample_df <- data.frame(id, name, marks)
sample_df

my_df <- data.frame(
  id = c(1, 2, 3),
  name = c("Ramu", "Raju", "Ravi"),
  marks = c(50, 40, 25)
)
my_df

# dimension of the data frame
dim(my_df)

# columns of the data frame
names(my_df)

# structure of the data frame
str(my_df)

# summary statistics of the data frame
summary(my_df)

head(my_df) # top 6 rows in the data frame
tail(my_df) # bottom 6 rows in the data frame

################ ADDING/Removing columns

# Ways to add a column
my_df
my_df$name

# initialize with 0
my_df$perf <- 0
my_df

my_df$perf <- c("very good", "good", "needs to improve")
my_df

# can use [[]], [], [,]
my_df[["perf"]] <- c("very good", "good", "needs to improve")
my_df["perf"] <- c("very good", "good", "needs to improve")
my_df[, "perf"] <- c("very good", "good", "needs to improve")
my_df[5] <- 0 # a column added by position is auto-named V5
my_df

# Ways to remove the column: assign NULL through any accessor
my_df[5] <- NULL  # by position
my_df$V5 <- NULL  # by name (already gone, so this is a harmless no-op)
my_df

# The same forms applied to `perf`. They are shown on a scratch copy because
# the steps below still need `perf` in my_df (it is part of the appended
# average row).
scratch <- my_df
scratch$perf <- NULL
scratch <- my_df
scratch[["perf"]] <- NULL
scratch <- my_df
scratch["perf"] <- NULL
scratch

# subsetting
df1 <- subset(my_df, select = c(id, marks))
df1
df1 <- subset(my_df, select = -marks)
df1
# View() needs an interactive session with a data viewer.
if (interactive()) View(df1)

my_df$mark2 <- c(30, 20, 10)
my_df

# sum of all marks
sum(my_df$mark2)

# rowsum (columns selected by name so the result does not depend on position)
my_df$total <- rowSums(my_df[c("marks", "mark2")])
my_df

# max
max(my_df$total)
# index at which max value is present
which.max(my_df$total)

# name of the student who got the max mark
my_df[["name"]][which.max(my_df$total)]
my_df$name[which.max(my_df$total)]
my_df[which.max(my_df$total), "name"]

my_df
# append a row holding the column averages
my_df <- rbind(
  my_df,
  data.frame(
    id = 4,
    name = "avgscore",
    marks = mean(my_df$marks),
    perf = "meanperf",
    mark2 = mean(my_df$mark2),
    total = mean(my_df$total)
  )
)
getwd()

write.csv(my_df, "marks1.csv")                    # with row names
write.csv(my_df, "marks.csv", row.names = FALSE)  # without row names

# reading the saved data back
stu_marks <- read.csv("marks.csv")
str(stu_marks)

stu_marks
stu_marks$mark2
stu_marks[4]
stu_marks[3, 3]
stu_marks[3, 5]
stu_marks
stu_marks[c(1, 3), c(2, 5)]

# negative indexes drop columns (here: name and perf)
stu_temp <- stu_marks[c(-2, -4)]
stu_temp

row.names(stu_temp)
row.names(stu_temp) <- stu_marks$name
stu_temp
boolv <- stu_temp["mark2"] > 10
boolv
row.names(stu_temp)[boolv]
# -----------------------------------------------------------------------------
# Built-in datasets: inspecting and subsetting mtcars with base R.
library(help = datasets)

data(mtcars)   # Loading mtcars data set
cars <- mtcars # Save the data into workspace

# Viewing data set
mtcars # Total data set in console
# View() needs an interactive session with a data viewer.
if (interactive()) View(mtcars) # Viewing dataset in spreadsheet

head(mtcars)  # Viewing top-6 observations (default: top-6)
tail(mtcars)  # Viewing bottom 6 observations
str(mtcars)   # Viewing data dictionary
names(mtcars) # Viewing column names
v1 <- mtcars$mpg # Assigning single variable from mtcars data to v1
v2 <- mtcars$cyl
v3 <- mtcars$disp
v4 <- mtcars$hp
newvar <- mtcars$disp + mtcars$hp

mtcars1 <- rbind(v1, v2, v3, v4) # Combined as rows (one row per vector)
mtcars1
mtcars2 <- cbind(v1, v2, v3, v4) # Combined as columns (one column per vector)
mtcars2

# create a variable obs_subset and have rows 4 to 10 in mtcars
obs_subset <- mtcars[4:10, ]
obs_subset

# create a variable var_subset and have only the columns 1,5,9
var_subset <- mtcars[, c(1, 5, 9)]
var_subset

# subsetting
# create a variable subset1 and have only mpg and cyl variables of mtcars
# using indexing
subset1 <- mtcars[, c(1, 2)]
head(subset1, 3)
# using subset()
subset2 <- subset(mtcars, select = c(mpg, cyl))
subset2

# create a variable subset3 and have only the rows where mpg>18
subset3 <- subset(mtcars, mpg > 18)
subset3

# create a variable subset4 and have only the rows where mpg>18 and cyl>5
subset4 <- subset(mtcars, mpg > 18 & cyl > 5)
subset4

# exclude mpg and cyl columns
subset4 <- subset(mtcars, mpg > 18 & cyl > 5, select = c(-mpg, -cyl))
subset4

# install.packages("MASS")
library(MASS)
data("survey")

# ---- Reading the loan data and checking for missing values ------------------
# Expects "loans data.csv" in the current working directory.

# clear workspace
# Left disabled: rm(list = ls()) silently deletes every object in the global
# environment. Restart the R session for a clean slate instead.
# rm(list = ls())

# Two equivalent ways to read the comma-separated file.
loan <- read.csv("loans data.csv")
loan <- read.table("loans data.csv", header = TRUE, sep = ",")

dim(loan)
str(loan)
head(loan, 3)
tail(loan, 2)
summary(loan)
any(is.na(loan)) # is there at least one missing value?
sum(is.na(loan)) # total number of missing values

# drop every row that contains a missing value
loan_cln <- na.omit(loan)
nrow(loan)
nrow(loan_cln)

# same result via logical row indexing
loan_cln2 <- loan[complete.cases(loan), ]
nrow(loan_cln2)

# ---- dplyr walkthrough on mtcars --------------------------------------------
# select() is always written as dplyr::select() because MASS (attached earlier
# in this script) also exports a select() function.

# Loading the dplyr package
library(dplyr)

# loading data
data("mtcars")
cars <- mtcars

# dimension of the data
dim(cars)

# structure of the data
str(cars)

# is.na(cars) # NA or NaN

# checking for missing values
any(is.na(cars))
sum(is.na(cars))

#################### Viewing data ########################

# fetching top 6 rows
head(cars)

# fetching last 6 rows
tail(cars)

# viewing data (View() needs an interactive session with a data viewer)
if (interactive()) View(cars)

# summary
summary(cars)

cars
# tbl_df(cars)
as_tibble(cars)

glimpse(cars)

############ Subsetting Rows (Observations) #################

# filtering based on single condition
filter(cars, mpg > 25)

# filtering based on multiple condition
filter(cars, mpg > 25 & hp > 90)

# Remove duplicate rows
distinct(cars)

# Randomly select fraction of rows
sample_frac(cars, 0.2)

# Randomly select no. of rows
sample_n(cars, 5)

# selecting rows by position
slice(cars, 11:15)

slice_sample(cars, n = 5)

temp <- filter(cars, mpg > 25)

slice_sample(temp, n = 2)

# the same two steps written as a pipeline
cars %>%
  filter(mpg > 25) %>%
  slice_sample(n = 2)

# unique values in a column
unique(cars$cyl)

# no. of values under each unique category
table(cars$cyl)

# grouping: sample two rows from every cyl group
cars %>%
  group_by(cyl) %>%
  slice_sample(n = 2)

############ Subsetting Columns (variables) #################

# selecting single column
dplyr::select(cars, mpg)

cars %>%
  dplyr::select(mpg) %>%
  head(3)
# slice_sample(n=3)

# selecting multiple columns
dplyr::select(cars, mpg, cyl, gear)
dplyr::select(cars, c("mpg", "cyl", "gear"))

names(cars)
# select all columns between a range of columns (inclusive)
dplyr::select(cars, hp:am)

# combining filter and select- using pipe operator
cars %>%
  filter(mpg > 18) %>%
  dplyr::select(mpg, cyl) %>%
  head(3)

names(cars)
# selecting columns starting with 'd'
dplyr::select(cars, starts_with("d"))

# selecting columns ending with 't'
dplyr::select(cars, ends_with("t"))

# selecting columns containing 'g'
dplyr::select(cars, contains("g"))

# selecting columns matching regular expression
dplyr::select(cars, matches("..a."))

# Excluding certain columns
dplyr::select(cars, c(-mpg, -cyl))

############ Arranging data #################

# arrange the data in ascending order of mpg
arrange(cars, mpg)

# arrange the data in descending order of mpg
arrange(cars, desc(mpg))

# arrange the data in order based on more than one column
arrange(cars, mpg, disp)

arrange(cars, mpg, desc(disp))

############ Making new variables #################

# creating a new column
mutate(cars, newvar = disp - hp)

# combining functions
# create a new variable that sums up disp and hp, keep only the rows where
# mpg>25 & disp>90, select only mpg, disp, hp, newvar2, and sample two rows
cars %>%
  mutate(newvar2 = disp + hp) %>%
  filter(mpg > 25, disp > 90) %>%
  dplyr::select(mpg, disp, hp, newvar2) %>%
  sample_n(2)

############ summarizing data #################
# group_by() is typically paired with summarise(); the grouping column should
# be a categorical variable.
cars %>%
  group_by(cyl) %>%
  summarize(cnt = n()) # number of rows for each unique cyl value

table(cars$cyl)

# computing max, min, standard dev, mean and median per group
cars %>%
  group_by(cyl) %>%
  summarize(
    mx_mpg = max(mpg),
    mi_mpg = min(mpg),
    std_mpg = sd(mpg),
    mn = mean(mpg),
    md = median(mpg)
  )

# ---- Reading the loan data and checking for missing values ------------------
# Expects "loans data.csv" in the current working directory.

# clear workspace
# Left disabled: rm(list = ls()) silently deletes every object in the global
# environment. Restart the R session for a clean slate instead.
# rm(list = ls())

# Two equivalent ways to read the comma-separated file.
loan <- read.csv("loans data.csv")
loan <- read.table("loans data.csv", header = TRUE, sep = ",")

dim(loan)
str(loan)
head(loan, 3)
tail(loan, 2)
summary(loan)
any(is.na(loan)) # is there at least one missing value?
sum(is.na(loan)) # total number of missing values

# drop every row that contains a missing value
loan_cln <- na.omit(loan)
nrow(loan)
nrow(loan_cln)

# same result via logical row indexing
loan_cln2 <- loan[complete.cases(loan), ]
nrow(loan_cln2)

# ---- dplyr walkthrough on mtcars --------------------------------------------
# select() is always written as dplyr::select() because MASS (attached earlier
# in this script) also exports a select() function.

# Loading the dplyr package
library(dplyr)

# loading data
data("mtcars")
cars <- mtcars

# dimension of the data
dim(cars)

# structure of the data
str(cars)

# is.na(cars) # NA or NaN

# checking for missing values
any(is.na(cars))
sum(is.na(cars))

#################### Viewing data ########################

# fetching top 6 rows
head(cars)

# fetching last 6 rows
tail(cars)

# viewing data (View() needs an interactive session with a data viewer)
if (interactive()) View(cars)

# summary
summary(cars)

cars

# tbl_df(cars)
as_tibble(cars)

glimpse(cars)

############ Subsetting Rows (Observations) #################

# filtering based on single condition
filter(cars, mpg > 25)

# filtering based on multiple condition
filter(cars, mpg > 25 & hp > 90)

# Remove duplicate rows
distinct(cars)

# Randomly select fraction of rows
sample_frac(cars, 0.2)

# Randomly select no. of rows
sample_n(cars, 5)

# selecting rows by position
slice(cars, 11:15)

slice_sample(cars, n = 5)

temp <- filter(cars, mpg > 25)

slice_sample(temp, n = 2)

# the same two steps written as a pipeline
cars %>%
  filter(mpg > 25) %>%
  slice_sample(n = 2)

# unique values in a column
unique(cars$cyl)

# no. of values under each unique category
table(cars$cyl)

# grouping: sample two rows from every cyl group
cars %>%
  group_by(cyl) %>%
  slice_sample(n = 2)

############ Subsetting Columns (variables) #################

# selecting single column
dplyr::select(cars, mpg)

cars %>%
  dplyr::select(mpg) %>%
  head(3)
# slice_sample(n=3)

# selecting multiple columns
dplyr::select(cars, mpg, cyl, gear)
dplyr::select(cars, c("mpg", "cyl", "gear"))

names(cars)
# select all columns between a range of columns (inclusive)
dplyr::select(cars, hp:am)

# combining filter and select- using pipe operator
cars %>%
  filter(mpg > 18) %>%
  dplyr::select(mpg, cyl) %>%
  head(3)

names(cars)
# selecting columns starting with 'd'
dplyr::select(cars, starts_with("d"))

# selecting columns ending with 't'
dplyr::select(cars, ends_with("t"))

# selecting columns containing 'g'
dplyr::select(cars, contains("g"))

# selecting columns matching regular expression
dplyr::select(cars, matches("..a."))

# Excluding certain columns
dplyr::select(cars, c(-mpg, -cyl))

############ Arranging data #################

# arrange the data in ascending order of mpg
arrange(cars, mpg)

# arrange the data in descending order of mpg
arrange(cars, desc(mpg))

# arrange the data in order based on more than one column
arrange(cars, mpg, disp)

arrange(cars, mpg, desc(disp))

############ Making new variables #################

# creating a new column
mutate(cars, newvar = disp - hp)

# combining functions
# create a new variable that sums up disp and hp, keep only the rows where
# mpg>25 & disp>90, select only mpg, disp, hp, newvar2, and sample two rows
cars %>%
  mutate(newvar2 = disp + hp) %>%
  filter(mpg > 25, disp > 90) %>%
  dplyr::select(mpg, disp, hp, newvar2) %>%
  sample_n(2)

############ summarizing data #################
# group_by() is typically paired with summarise(); the grouping column should
# be a categorical variable.
cars %>%
  group_by(cyl) %>%
  summarize(cnt = n()) # number of rows for each unique cyl value

table(cars$cyl)

# computing max, min, standard dev, mean and median per group
cars %>%
  group_by(cyl) %>%
  summarize(
    mx_mpg = max(mpg),
    mi_mpg = min(mpg),
    std_mpg = sd(mpg),
    mn = mean(mpg),
    md = median(mpg)
  )

# ---- Arrays ----------------------------------------------------------------
# Building a 3 x 3 x 2 array, naming its dimensions, indexing and aggregating.

# Workspace reset left disabled: rm(list = ls()) silently deletes every object
# in the global environment. Restart the R session for a clean slate instead.
# rm(list = ls())

# creating array from vectors
v1 <- c(1, 2, 3)
v2 <- c(4, 5, 6, 7, 8, 9)

# 9 values for 18 cells: the values are recycled to fill the second matrix
A1 <- array(c(v1, v2), dim = c(3, 3, 2))
A1

# naming columns and rows
rname <- c("r1", "r2", "r3")
cname <- c("c1", "c2", "c3")
mname <- c("mat1", "mat2")
dimnames(A1) <- list(rname, cname, mname)
# or supply the names at construction time
A1 <- array(
  c(v1, v2),
  dim = c(3, 3, 2),
  dimnames = list(rname, cname, mname)
)
A1

# printing the second row of second matrix
A1[2, , 2]
A1["r2", , "mat2"]

# printing the second column of first matrix
A1[, 2, 1]
A1[, "c2", "mat1"]

# printing the element in the 2nd row and 3rd column of second matrix
A1[2, 3, 2]

# printing the second matrix
A1[, , 2]
A1[, , "mat2"]

# Manipulating array elements
M1 <- A1[, , 1]
M2 <- A1[, , 2]
M3 <- M1 + M2
M3

M1
# Aggregation on array elements
apply(M1, 1, sum) # 1 - along row
apply(M2, 2, sum) # 2 - along column
A1
apply(A1, 1, sum)  # row totals across both matrices
apply(A1, 2, mean) # column means across both matrices

# ---- Dates and times --------------------------------------------------------
# Date, POSIXct and POSIXlt classes, parsing with format strings, and date
# arithmetic. Month-name parsing (%B / %b) assumes an English locale.

# Workspace reset left disabled: rm(list = ls()) silently deletes every object
# in the global environment. Restart the R session for a clean slate instead.
# rm(list = ls())

# To create date / To represent date
d <- date() # current date and time as a plain character string
d
class(d)

# as.Date(d)
# to convert date string to date class
d <- as.Date("2022-8-25") # default format - year-month-day
class(d)
d
as.Date("2022-8-25 10:44:22") # the time part is ignored
as.Date("2022-8-25 21:15")

# to see the internal representation
unclass(d)

# to represent both date and time
as.POSIXct("2022-8-25")
pd <- as.POSIXct("2022-8-25 21:15")
pd
class(pd)
unclass(pd)

pd <- as.POSIXlt("2022-8-25")
pd
class(pd)

# getting meta using unclass()
unclass(pd)
names(unclass(pd))

pd <- as.POSIXlt("2022-8-17 21:15:30")
pd$sec
pd$hour
pd$min
pd$mday
pd$year
unlist(pd)

# if format is different
as.Date("25/8/2022", format = "%d/%m/%Y")
date()
as.Date("August 25,2022", format = "%B %d,%Y")
as.Date("25Aug22", format = "%d%b%y")

# Checking the class
class(as.Date("2022-8-25 21:15"))
class(as.POSIXct("2022-8-25 21:15"))
class(as.POSIXlt("2022-8-25 21:15"))

# Getting date, time and zone
p <- Sys.Date() # only current date
class(p)
Sys.time() # current date, time and timezone
Sys.timezone()

# difference in dates
Sys.Date() - as.Date("1979-03-21")
difftime(Sys.Date(), as.Date("1979-03-21"), units = "weeks")

# getting weekdays and basic arithmetic
d <- as.Date("2022-8-17")
d
# to find weekday of the date
weekdays(d)

# add or subtract to create new date(s)
d + 1
d + 1:5
weekdays(d + 1:5)

# check for seq and rep
# using sequence
d
dt <- seq(d, by = "2 months", length.out = 6)
dt

# getting month and quarter
months(d)
months(dt)
quarters(dt)

# lubridate::today() # lubridate package
# ISOdate(2021,8,25)

?strptime
help("strptime")

datestring <- "August 17, 2022 04:20"
convertedForm <- strptime(datestring, "%B %d, %Y %H:%M")
class(convertedForm)
convertedForm

x <- as.Date("2020-01-01")
y <- strptime("25 Aug 2020 09:00:00", "%d %b %Y %H:%M:%S")
# Subtracting a POSIXlt from a Date fails because the classes are
# incompatible. try() shows the error without aborting the script.
try(x - y)

class(x)
class(y)

# convert to a common class first, then the subtraction works
x <- as.POSIXlt(x)
x - y

# different time zones
x <- as.POSIXct("2021-08-25 08:00:00") # local time zone
x

xgmt <- as.POSIXct("2021-08-25 08:00:00", tz = "GMT")
xgmt

xgmt - x

#if(!file.exists("data"))
# dir.create("data")

#fileurl <- "https://data.baltimorecity.gov/api/views/dz54-2aru/rows.csv?accessType=DOWNLOAD"


#download.file(fileurl,destfile = "E:/sweetlin-personal/coursera/data/camera.csv")
#list.files("E:/sweetlin-personal/coursera/data")

#dateofdownload <- date()


#dateofdownload
# ---- Reading flat files and Excel workbooks ---------------------------------
# Expects "loans data.csv", "tabsepfile.txt", "slashsepfile.txt" and
# "loans data.xlsx" in the current working directory.

# Workspace reset left disabled: rm(list = ls()) silently deletes every object
# in the global environment. Restart the R session for a clean slate instead.
# rm(list = ls())

# Reading flat file using read.table()
loan <- read.table("loans data.csv", header = TRUE, sep = ",")
str(loan)
head(loan, 2)

# Reading flat file using read.csv()
loan1 <- read.csv("loans data.csv")
str(loan1)

# tab-separated file without a header row
df <- read.table("tabsepfile.txt", header = FALSE, sep = "\t")
str(df)

# "/"-separated file: trim surrounding blanks and read "EMPTY" as NA
df <- read.table(
  "slashsepfile.txt",
  header = FALSE,
  sep = "/",
  strip.white = TRUE,
  na.strings = "EMPTY"
)
str(df)

# Reading Excel file
# you need to import xlsx package
# install.packages("xlsx")
# library(xlsx)
# loan <- read.xlsx("loan.xls", sheetIndex = 1, header = TRUE)

# install.packages("XLConnect")
# library(XLConnect)

library(readxl)
# excel_sheets('E:/sweetlin-official/FALL 2020 -2021/CSE3505/R programs/loans data.xlsx')
excel_sheets("loans data.xlsx") # list the sheet names in the workbook
df <- read_excel("loans data.xlsx", sheet = "sample")

str(df)

#XLConnect, XLSX, readxl

# Reading XML file
# Downloads a sample XML document over HTTPS with httr and parses it with the
# XML package.

# You need to install XML package (only installed when it is missing)
if (!requireNamespace("XML", quietly = TRUE)) {
  install.packages("XML")
}
library(XML)
# library(methods)
# install.packages("RCurl")
# library(RCurl)
library(httr)
fileurl <- "https://www.w3schools.com/xml/simple.xml"
xmldata <- GET(fileurl)
# Parse the response *body* as text; the response object itself is not XML.
doc <- xmlTreeParse(
  content(xmldata, as = "text", encoding = "UTF-8"),
  asText = TRUE,
  useInternalNodes = TRUE
)

root <- xmlRoot(doc)
root
xmlName(root)
names(root)

# Accessing parts of xml file in the same way as list
root[[1]]      # accessing 1st food
root[[1]][[1]] # accessing name of the 1st food

# Extracting parts of XML file - value of all nodes
xmlSApply(root, xmlValue)

# One column per food, one row per child element. Stored under its own name so
# `root` stays an XML node for the XPath queries below.
food_values <- xmlSApply(root, function(x) xmlSApply(x, xmlValue))

food_values

# Extracting individual nodes of XML file
xpathSApply(root, "//name", xmlValue)
xpathSApply(root, "//price", xmlValue)

# transpose so that each food becomes one row of the data frame
xml_df <- data.frame(t(food_values), row.names = NULL)
str(xml_df)

# Reading JSON file
# Fetches JSON from the GitHub API (network access required) and round-trips
# the built-in iris data through a JSON string.

# Loading jsonlite package
library(jsonlite)
jdata <- fromJSON("https://api.github.com/users/jtleek/repos")
names(jdata)

class(jdata)
str(jdata)
head(jdata, 2)
# Extracting nested objects
names(jdata$owner)
jdata$owner$login

# converting a data frame to JSON text
data(iris)
str(iris)
head(iris, 2)
jfile <- toJSON(iris, pretty = TRUE)
cat(jfile)

# reading the JSON text back into a data frame
irisdata <- fromJSON(jfile)
head(irisdata)

# clear workspace
# Left disabled: rm(list = ls()) silently deletes every object in the global
# environment. Restart the R session for a clean slate instead.
# rm(list = ls())

############ Reading data #################
# Expects "loans data.csv" in the current working directory.

# using read.table()
loan_data <- read.table("loans data.csv", header = TRUE, sep = ",")
# using read.csv()
loan <- read.csv("loans data.csv")

# dimension of the data
dim(loan)

# structure of the data
str(loan)

# view of the data in a table (View() needs an interactive session)
if (interactive()) View(loan)

# fetching top 6 rows
head(loan)

# fetching last 6 rows
tail(loan)

# summary of the data
summary(loan)

############ Cleaning data #################
# checking for missing values in the data
any(is.na(loan)) # NA NaN

# checking for the total no. of missing values in the data
sum(is.na(loan))

# cleaning NA values: drop every row that has at least one
loan_clean <- na.omit(loan)

sum(is.na(loan_clean))
str(loan_clean)

# the same result using boolean indexing
loan_clean1 <- loan[complete.cases(loan), ]

sum(is.na(loan_clean1))

# imputation - filling the missing values

# cleaning Amount.Requested Column
# checking for the total no. of missing values in a particular column
sum(is.na(loan$Amount.Requested))

unique(loan$Amount.Requested)

# changing to numeric types (values that are not numbers become NA)
loan$Amount.Requested <- as.integer(loan$Amount.Requested)
str(loan)

# unique values in a column
unique(loan$Amount.Requested)

mean(loan$Amount.Requested, na.rm = TRUE)
median(loan$Amount.Requested, na.rm = TRUE)

# library(dplyr)
library(tidyverse)
# Decide whether to impute with mean or median
loan %>%
  summarize(
    avg = mean(Amount.Requested, na.rm = TRUE),
    med = median(Amount.Requested, na.rm = TRUE)
  )

# replace the missing values with the column median
loan <- loan %>%
  mutate(
    Amount.Requested = replace(
      Amount.Requested,
      is.na(Amount.Requested),
      median(Amount.Requested, na.rm = TRUE)
    )
  )

sum(is.na(loan$Amount.Requested))

# Rename a column
loan <- loan %>%
  rename(Amt_Req = Amount.Requested)
names(loan)
str(loan)
# cleaning Amount.Funded.By.Investors column
sum(is.na(loan$Amount.Funded.By.Investors))

unique(loan$Amount.Funded.By.Investors)
str(loan)
loan <- loan %>%
  rename(Amt_fund = Amount.Funded.By.Investors)
# convert the type to numeric (values that are not numbers become NA)
loan$Amt_fund <- as.numeric(loan$Amt_fund)

# checking for NA values
sum(is.na(loan$Amt_fund))

# check impute with mean or median
# (columns are referenced by bare name inside summarize(), not via loan$)
loan %>%
  summarize(
    avg = mean(Amt_fund, na.rm = TRUE),
    md = median(Amt_fund, na.rm = TRUE)
  )

# replace the missing values with the column median
loan <- loan %>%
  mutate(
    Amt_fund = replace(
      Amt_fund,
      is.na(Amt_fund),
      median(Amt_fund, na.rm = TRUE)
    )
  )

sum(is.na(loan$Amt_fund))

str(loan)
# cleaning Interest.Rate column
sum(is.na(loan$Interest.Rate))
# cleaning unwanted substring in a chr column: strip the "%" sign
loan <- loan %>%
  mutate(Interest.Rate = gsub("%", "", Interest.Rate))
head(loan$Interest.Rate, 2)

# convert the cleaned text to numbers (one conversion is enough)
loan$Interest.Rate <- as.numeric(loan$Interest.Rate)

head(loan$Interest.Rate, 2)
str(loan)

# cleaning Loan.Length column
sum(is.na(loan$Loan.Length))

unique(loan$Loan.Length)

# strip the " months" suffix so only the number remains
loan <- loan %>%
  mutate(Loan.Length = gsub(" months", "", Loan.Length))

# values that are not numbers become NA
loan$Loan.Length <- as.integer(loan$Loan.Length)

sum(is.na(loan$Loan.Length))

unique(loan$Loan.Length)

# filtering the rows with NA values
loan %>%
  filter(is.na(Loan.Length))

# drop the rows with NA values
loan <- loan %>%
  drop_na(Loan.Length)

# checking
sum(is.na(loan$Loan.Length))

unique(loan$Loan.Length)

# cleaning Employment.Length column
sum(is.na(loan$Employment.Length))

unique(loan$Employment.Length)

# Strip " year"/" years", the "< " prefix and the "+" suffix so only the
# number remains. " years?" matches both forms regardless of how the regex
# engine orders alternatives.
loan <- loan %>%
  mutate(Employment.Length = gsub(" years?|< |\\+", "", Employment.Length))

# values that are not numbers become NA
loan$Employment.Length <- as.integer(loan$Employment.Length)

# checking
unique(loan$Employment.Length)
sum(is.na(loan$Employment.Length))

table(loan$Employment.Length)
mean(table(loan$Employment.Length))

# impute the missing values with the hard-coded value 2
loan <- loan %>%
  mutate(
    Employment.Length = replace(
      Employment.Length,
      is.na(Employment.Length),
      2
    )
  )
# checking
sum(is.na(loan$Employment.Length))
unique(loan$Employment.Length)

# cleaning FICO.Range column
head(loan$FICO.Range, 2)
# split the range text into separate low and high columns
loan <- loan %>%
  separate(FICO.Range, c("fico-low", "fico-high"))
str(loan)
names(loan)

# The new names contain "-", so they must be quoted with backticks.
loan$`fico-high` <- as.integer(loan$`fico-high`)
loan$`fico-low` <- as.integer(loan$`fico-low`)
str(loan)
sum(is.na(loan$`fico-high`))
sum(is.na(loan$`fico-low`))
unique(loan$`fico-high`)
unique(loan$`fico-low`)
# statistical analysis - Numerical measure
str(faithful) # faithful - built-in data
head(faithful)

# Central tendency measure
mean(faithful$eruptions)
# median
median(faithful$eruptions)

# Measure of dispersion
range(faithful$eruptions)
max(faithful$eruptions) - min(faithful$eruptions)
# quartile
quantile(faithful$eruptions)

# Inter-quartile range
IQR(faithful$eruptions)

# percentile
quantile(faithful$eruptions, c(.27, .35, .65))

# variance
var(faithful$eruptions)

# standard deviation
sd(faithful$eruptions)

# covariance
cov(faithful$eruptions, faithful$waiting)

# correlation
cor(faithful$eruptions, faithful$waiting)

# moment - third central moment
# the second central moment of a population is its variance
library(e1071)
moment(faithful$eruptions, 3, center = TRUE)

# skewness
skewness(faithful$eruptions)

# kurtosis
kurtosis(faithful$eruptions)

# frequency distribution
# step1 - find range
range(faithful$eruptions)

# step2 - Break the range into non-overlapping sub-intervals by defining a
# sequence of equal distance break points.
breaks <- seq(1.5, 5.5, by = 0.5)
breaks

# step3 - Classify the eruption durations according to the half-unit-length
# sub-intervals with cut. right = FALSE makes each interval closed on the left.
interval <- cut(faithful$eruptions, breaks, right = FALSE)

# step 4 - Compute the frequency of eruptions in each sub-interval with the
# table function.
Interval_freq <- table(interval)
Interval_freq
cbind(Interval_freq)

# relative frequency
relfreq <- Interval_freq / nrow(faithful)

# print with two significant digits, then put the previous setting back
old <- options(digits = 2)
cbind(Interval_freq, relfreq)
options(old)

# cumulative frequency
cumfreq <- cumsum(table(interval))
cumfreq
cbind(cumfreq)

# ---- Base graphics on airquality --------------------------------------------

# Workspace reset left disabled: rm(list = ls()) silently deletes every object
# in the global environment. Restart the R session for a clean slate instead.
# rm(list = ls())
library(help = graphics)
data("airquality")
str(airquality)

# Remember the graphics settings so they can be restored at the end; otherwise
# the margins and the 2 x 3 grid set below leak into every later plot.
old_par <- par(no.readonly = TRUE)

# to set the margin
par(mar = c(2, 2, 2, 2))

# 1D scatter plot
plot(airquality$Ozone)

# 2D scatter plot
plot(airquality$Ozone, airquality$Wind)

?plot

# type argument in plot
plot(airquality$Ozone, type = "l")

# title and axis labels arguments
plot(airquality$Ozone, main = "ozone levels", xlab = "index", ylab = "ozone")

# histogram
hist(airquality$Solar.R)

# boxplot
summary(airquality$Ozone)
boxplot(airquality$Ozone)

# multiple boxplot
boxplot(airquality[, 1:4], main = "multiple box plots")

# pie chart
unique(airquality$Wind)
table(airquality$Wind)

wind_freq <- table(airquality$Wind)
# TRUE for wind values that occur more than 8 times
wind_above8 <- wind_freq > 8
wind_freq
wind_above8
wind_above8data <- wind_freq[wind_above8]
wind_above8data
table(wind_above8)
pie(wind_above8data, radius = 1)
par(mar = c(1, 1, 1, 1))

# grid of charts: 2 rows x 3 columns, filled row by row
par(mfrow = c(2, 3), mar = c(2, 2, 2, 1), las = 0, bty = "n")
plot(airquality$Ozone)
plot(airquality$Ozone, airquality$Wind)
plot(airquality$Ozone, type = "l")
barplot(airquality$Ozone, main = "Ozone levels", ylab = "ozone value")
hist(airquality$Solar.R)
boxplot(airquality$Ozone)

# back to the settings that were active before this section
par(old_par)

# lattice graph
library(lattice)

# density plot
densityplot(airquality$Ozone)

# scatter plot matrix
splom(airquality[c(1, 3, 4)])

# scatter plot depicting the combination of 2 variables
data("mtcars")
df <- mtcars
str(df)
par(mar = c(4, 4.5, 1, 1))
plot(df$wt, df$mpg)

# turn the numeric codes into labelled factors for the panel headings
unique(df$cyl)
cyl_factor <- factor(
  df$cyl,
  levels = c(4, 6, 8),
  labels = c("4cyl", "6cyl", "8cyl")
)

unique(df$gear)
gear_factor <- factor(
  df$gear,
  levels = c(3, 4, 5),
  labels = c("3 gears", "4 gears", "5 gears")
)

# one panel per cylinder/gear combination
xyplot(
  df$mpg ~ df$wt | cyl_factor * gear_factor,
  main = "scatter plots: Cylinders and Gears",
  xlab = "weight of car",
  ylab = "miles per gallon"
)
gear_factor

freq_gear <- table(gear_factor)
freq_gear

barplot(freq_gear, col = c("red", "green", "blue"))
pie(
  freq_gear,
  labels = c("3 gears", "4 gears", "5 gears"),
  col = c("red", "green", "blue"),
  radius = 1
)

# ---- ggplot2: scatter plots -------------------------------------------------

# Workspace reset left disabled: rm(list = ls()) silently deletes every object
# in the global environment. Restart the R session for a clean slate instead.
# rm(list = ls())
data("mtcars")

# install.packages("ggplot2")
library(ggplot2)
head(mtcars, 2)

# scatter plot
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_point()

# labelled factors used for colour / shape / fill in the plots that follow
unique(mtcars$cyl)
cyl_factor <- factor(
  mtcars$cyl,
  levels = c(4, 6, 8),
  labels = c("4cyl", "6cyl", "8cyl")
)

unique(mtcars$gear)
gear_factor <- factor(
  mtcars$gear,
  levels = c(3, 4, 5),
  labels = c("3 gears", "4 gears", "5 gears")
)

# scatter plot - multiple variables through color
ggplot(mtcars, aes(x = wt, y = mpg, color = gear_factor)) +
  geom_point()

# scatter plot - multiple variables through size
ggplot(mtcars, aes(x = wt, y = mpg, size = qsec)) +
  geom_point()

# scatter plot - multiple variables through both color and shape
# The constant point size is set outside aes(); inside aes() it would be
# treated as a data mapping and add a meaningless legend.
ggplot(mtcars, aes(x = wt, y = mpg, shape = gear_factor)) +
  geom_point(aes(color = cyl_factor), size = 4) +
  geom_point(color = "grey", size = 1.5)

# scatter plot - adding best fit line
ggplot(mtcars, aes(x = wt, y = mpg)) +
  geom_point() +
  geom_smooth(method = "lm")

########### bar plot ###########
ggplot(mtcars, aes(x = gear_factor)) +
  geom_bar()

# fill by gear with a red outline; the constant outline colour is set outside
# aes() so it is used literally instead of being mapped through a colour scale
ggplot(mtcars, aes(x = gear_factor, fill = gear_factor)) +
  geom_bar(color = "red") +
  ggtitle("frequency plot of gear")

# flipping the bar direction
ggplot(mtcars, aes(x = gear_factor)) +
  geom_bar() +
  coord_flip()

# bar plot for 2 variables
ggplot(mtcars, aes(x = cyl_factor, fill = gear_factor)) +
  geom_bar(position = "stack")

#################### pie chart ############
# a pie chart is a single stacked bar drawn in polar coordinates
ggplot(mtcars, aes(x = "", y = mpg, fill = cyl_factor)) +
  geom_bar(width = 1, stat = "identity") +
  coord_polar("y", start = 0)

#################### histogram ###########
ggplot(mtcars, aes(x = hp)) +
  geom_histogram() +
  labs(title = "Distribution of hp", y = "frequency")

# setting the number of bins
ggplot(mtcars, aes(x = hp)) +
  geom_histogram(bins = 3) +
  labs(title = "Distribution of hp", y = "frequency")

# setting bin width
ggplot(mtcars, aes(x = hp)) +
  geom_histogram(binwidth = 30) +
  labs(title = "Distribution of hp", y = "frequency")

# with border and fill color
ggplot(mtcars, aes(x = hp)) +
  geom_histogram(binwidth = 30, color = "green", fill = "yellow") +
  labs(title = "Distribution of hp", y = "frequency")

# facets: one panel per cylinder class
# facet_wrap() needs a formula (or vars()) evaluated against the plot data, so
# the labelled factor is derived from the cyl column inside the formula.
ggplot(mtcars, aes(x = hp)) +
  geom_histogram(color = "white", fill = "blue") +
  labs(title = "Distribution of hp", y = "frequency") +
  facet_wrap(
    ~ factor(cyl, levels = c(4, 6, 8), labels = c("4cyl", "6cyl", "8cyl")),
    ncol = 1
  )

################ Kernel density curve ############
ggplot(mtcars, aes(x = hp)) +
  geom_density() +
  labs(title = "Distribution of hp", x = "horse power", y = "density")

# with fill color
ggplot(mtcars, aes(x = hp)) +
  geom_density(fill = "blue", color = "red") +
  labs(title = "Distribution of hp", x = "horse power", y = "density")

############## Line plot ###############
library(dplyr)
d <- sample_n(mtcars, 10) # random sample of 10 cars
ggplot(d, aes(x = wt, y = drat)) +
  geom_line()
d

# with varied thickness and color points
# Constants are set outside aes() so the line really is red and the points
# really are blue; inside aes() they would be mapped through the default
# colour scale instead.
# NOTE(review): recent ggplot2 releases may warn that `size` for lines is
# deprecated in favour of `linewidth` - confirm against the installed version.
ggplot(d, aes(x = wt, y = drat)) +
  geom_line(size = 2, color = "red") +
  geom_point(size = 2, color = "blue")

################# box plot ################
ggplot(mtcars, aes(x = mpg)) +
  geom_boxplot()
# multiple box plots
ggplot(mtcars, aes(x = cyl_factor, y = mpg)) +
  geom_boxplot()

# (Document-viewer footer: "You might also like")