# Professional Documents
# Culture Documents
# character type
# Note: the variable is named `str`, like the base function; calls such as
# str(x) still reach the function because R skips non-function objects when
# looking up a function name.
str <- "R programming"
str
s <- "cse3505 -"
s
class(s)
str
print(str)

# complex type
cmp <- 21 + 10i
sqrt(-1) # NaN with a warning: -1 is a double, not a complex number
sqrt(-1 + 0i)
sqrt(as.complex(-1)) # explicit type conversion

# logical type
lg <- TRUE
p <- TRUE
q <- FALSE
p & q
p | q
!p
# Compare class() (the high-level class attribute) with typeof() (the internal
# storage type) for each of the objects below.
# NOTE(review): `a` is not assigned anywhere in the visible source; its
# definition appears to be missing -- confirm before running.
class(a)
typeof(a)
# `str`, `cmp` and `lg` are presumably the character, complex and logical
# objects assigned earlier in the script.
class(str)
typeof(str)
class(cmp)
typeof(cmp)
class(lg)
typeof(lg)
# using vector()
x <- vector() # default mode is "logical", default length is 0
x
length(x)
class(x)
x <- vector("character", length = 10) # ten empty strings
x
# Implicit type coercion - mixed objects
# c() converts all elements to the most general type present.
y <- c(1.5, "a") # character
y
y <- c(1.5, TRUE) # numeric
y
y <- c(TRUE, "a") # character
y
# Explicit type coercion
x <- 2.5
class(x)
as.integer(x) # drops the fractional part
x             # x itself is unchanged
x <- -1:5
x
class(x)
as.numeric(x)
as.logical(x) # 0 becomes FALSE, every other value TRUE
as.character(x)
as.complex(x)
# vector arithmetics (element-wise)
x <- c(1, 3, 5)
y <- c(2, 4, 6)
x + y
x - y
x * y
x / y

# Look up and change the number of significant digits used when printing.
help(options)
?options
options(digits = 2) # stays in effect for the rest of the session

# recycling rule
# The shorter vector is reused from its start; R warns here because 5 is not
# a multiple of 3.
y <- c(2, 4, 6, 8, 10)
x + y
# create matrices
m <- matrix() # 1 x 1 matrix holding NA
m
m <- matrix(nrow = 3, ncol = 2) # 3 x 2 matrix filled with NA
m
attributes(m)
dim(m)
m <- matrix()
m <- matrix(1:6, nrow = 3, ncol = 2) # constructed column-wise
m <- matrix(1:6, nrow = 3, ncol = 2, byrow = TRUE) # constructed row-wise
m

# constructing from vector
# NOTE(review): only the vector assignment is visible; the step that gives it
# dimensions (for example a dim<- call) appears to be missing -- confirm.
m <- 1:6

# constructing using column-binding
x <- 1:3
x
y <- 10:12
y
cbind(x, y)
# `/` divides element-wise and `%/%` is element-wise integer division; neither
# is a matrix operation. Matrix multiplication is `%*%`, and multiplying by
# solve(y) plays the role of "matrix division".
# NOTE(review): `x` is presumably a square numeric matrix at this point; its
# definition is not in the visible source (the last visible `x` is 1:3, on
# which solve() and det() fail) -- confirm.
x
t(x)     # transpose of a matrix
solve(x) # inverse of a matrix
det(x)   # determinant of a matrix
# factors
x <- factor(c("male", "female"))
x
x <- factor(c("low", "medium", "high", "low"))
table(x)   # counts per level
unclass(x) # underlying integer codes plus the levels attribute
# missing values
x <- c(1, 2, NA, 5, NaN, 6)
is.na(x)  # TRUE for both NA and NaN
is.nan(x) # TRUE only for NaN
# A data frame is a table with the same type within a column and different
# types between columns; it is defined with the data.frame() function.
id <- c(1, 2, 3)
name <- c("a", "b", "c")
marks <- c(50, 0, 25)
sample_df <- data.frame(id, name, marks)
sample_df
# NOTE(review): `my_df` is not created in the visible source; presumably it is
# a copy of sample_df (id, name, marks) -- confirm.
my_df
my_df$name

# initialize with 0: a new column filled with a recycled constant
my_df$perf <- 0
my_df
my_df$mark2 <- c(30, 20, 10)
my_df

# rowsum over columns 3 and 5 (by position)
my_df$total <- rowSums(my_df[c(3, 5)])
my_df

# max
max(my_df$total)
# index at which max value is present
which.max(my_df$total)

# The first file also gets a row-names column; the second omits it.
write.csv(my_df, "marks1.csv")
write.csv(my_df, "marks.csv", row.names = FALSE)
# R datasets
# NOTE(review): `stu_marks` and `stu_temp` are not created in the visible
# source -- confirm where they are loaded/copied.
stu_marks
stu_marks$mark2 # one column as a vector
stu_marks[4]    # fourth column, still a data frame
stu_marks[3, 3] # single cell: row 3, column 3
stu_marks[3, 5]
stu_marks
stu_marks[c(1, 3), c(2, 5)] # rows 1 and 3, columns 2 and 5

# Use the student names as row names, then look rows up by a condition.
row.names(stu_temp)
row.names(stu_temp) <- stu_marks$name
stu_temp
boolv <- stu_temp["mark2"] > 10
boolv
row.names(stu_temp)[boolv]
#--------------------------------------------------------------------
# List the data sets that ship with the `datasets` package.
library(help = datasets)

# subsetting
# create a variable subset1 and have only mpg and cyl variables of mtcars
# using indexing
subset1 <- mtcars[, c(1, 2)]
head(subset1, 3)
# using subset()
subset2 <- subset(mtcars, select = c(mpg, cyl))
subset2
# Clear workspace: remove every object from the global environment.
rm(list = ls())

# Structure and summary of the `loan` data frame.
# NOTE(review): `loan` is not created in the visible source after the
# workspace is cleared -- confirm where it is loaded.
dim(loan)
str(loan)
head(loan, 3)
tail(loan, 2)
summary(loan)

# Are there missing values, and how many?
any(is.na(loan))
sum(is.na(loan))

# Two equivalent ways of keeping only the rows without any NA.
loan_cln <- na.omit(loan)
nrow(loan)
nrow(loan_cln)
loan_cln2 <- loan[complete.cases(loan), ]
nrow(loan_cln2)
# loading data
data("mtcars")
# Working copy; in this session the name masks the built-in `cars` data set.
cars <- mtcars
# dplyr provides %>%, filter(), slice_sample(), as_tibble() and glimpse().
# No library() call for it is visible before this point; without it, filter()
# would resolve to stats::filter(). Attaching it twice is harmless.
library(dplyr)

# viewing data
View(cars) # opens the data viewer (interactive sessions)
# summary
summary(cars)
cars
# tbl_df(cars)
as_tibble(cars)
glimpse(cars)

# Random rows, then random rows among the cars with mpg > 25.
slice_sample(cars, n = 5)
temp <- filter(cars, mpg > 25)
slice_sample(temp, n = 2)

# Same result shape as the two steps above, written as one pipeline.
cars %>%
  filter(mpg > 25) %>%
  slice_sample(n = 2)
# grouping: two random rows from each cyl group
cars %>%
  group_by(cyl) %>%
  slice_sample(n = 2)

# First three values of a single column.
cars %>%
  dplyr::select(mpg) %>%
  head(3)
# slice_sample(n=3)
# selecting multiple columns
dplyr::select(cars, mpg, cyl, gear)
dplyr::select(cars, c("mpg", "cyl", "gear"))
names(cars)
# select all columns between a range of columns (inclusive)
dplyr::select(cars, hp:am)
names(cars)
# selecting columns starting with 'd'
dplyr::select(cars, starts_with("d"))
# selecting columns containing 'g'
dplyr::select(cars, contains("g"))
# selecting columns matching regular expression
dplyr::select(cars, matches("..a."))
# Sort by mpg ascending, breaking ties by disp descending.
arrange(cars, mpg, desc(disp))

# combining functions
# create a new variable that sum up disp and hp and filter only
# the rows where mpg>25 & disp>90
# and select only mpg, disp, hp, newvar
# slice_sample() is the current replacement for sample_n() and matches the
# sampling calls used elsewhere in this script.
cars %>%
  mutate(newvar2 = disp + hp) %>%
  filter(mpg > 25, disp > 90) %>%
  dplyr::select(mpg, disp, hp, newvar2) %>%
  slice_sample(n = 2)

table(cars$cyl)

# computing max, min and standard dev (plus mean and median) of mpg per cyl
cars %>%
  group_by(cyl) %>%
  summarize(
    mx_mpg = max(mpg),
    mi_mpg = min(mpg),
    std_mpg = sd(mpg),
    mn = mean(mpg),
    md = median(mpg)
  )
# Clear workspace: remove every object from the global environment.
rm(list = ls())

# Structure and summary of the `loan` data frame.
# NOTE(review): `loan` is not created in the visible source after the
# workspace is cleared -- confirm where it is loaded.
dim(loan)
str(loan)
head(loan, 3)
tail(loan, 2)
summary(loan)

# Are there missing values, and how many?
any(is.na(loan))
sum(is.na(loan))

# Two equivalent ways of keeping only the rows without any NA.
loan_cln <- na.omit(loan)
nrow(loan)
nrow(loan_cln)
loan_cln2 <- loan[complete.cases(loan), ]
nrow(loan_cln2)
# loading data
data("mtcars")
# Working copy; in this session the name masks the built-in `cars` data set.
cars <- mtcars

# Check the copy for missing values.
any(is.na(cars))
sum(is.na(cars))
#################### Viewing data ########################
# First six rows (the head() default).
head(cars)
# Open the data viewer.
View(cars)
# Per-column summary statistics, then the full table.
summary(cars)
cars
# tbl_df(cars)
as_tibble(cars)
glimpse(cars)

# Five random rows.
slice_sample(cars, n = 5)

# Two random rows among the cars with mpg > 25, in two steps ...
temp <- filter(cars, mpg > 25)
slice_sample(temp, n = 2)

# ... and as a single pipeline.
cars %>% filter(mpg > 25) %>% slice_sample(n = 2)
# grouping: two random rows from each cyl group
cars %>%
  group_by(cyl) %>%
  slice_sample(n = 2)

# First three values of a single column.
cars %>%
  dplyr::select(mpg) %>%
  head(3)
# slice_sample(n=3)
names(cars)
# select all columns between a range of columns (inclusive)
dplyr::select(cars, hp:am)

# Rows with mpg > 18, keeping only mpg and cyl; show the first three.
cars %>%
  filter(mpg > 18) %>%
  dplyr::select(mpg, cyl) %>%
  head(3)

names(cars)
# selecting columns starting with 'd'
dplyr::select(cars, starts_with("d"))

# Sort by mpg ascending, breaking ties by disp descending.
arrange(cars, mpg, desc(disp))
# combining functions
# create a new variable that sum up disp and hp and filter only
# the rows where mpg>25 & disp>90
# and select only mpg, disp, hp, newvar
# slice_sample() is the current replacement for sample_n() and matches the
# sampling calls used elsewhere in this script.
cars %>%
  mutate(newvar2 = disp + hp) %>%
  filter(mpg > 25, disp > 90) %>%
  dplyr::select(mpg, disp, hp, newvar2) %>%
  slice_sample(n = 2)

table(cars$cyl)

# computing max, min and standard dev (plus mean and median) of mpg per cyl
cars %>%
  group_by(cyl) %>%
  summarize(
    mx_mpg = max(mpg),
    mi_mpg = min(mpg),
    std_mpg = sd(mpg),
    mn = mean(mpg),
    md = median(mpg)
  )
rm(list = ls())

# NOTE(review): `A1`, `M1` and `M2` are not created in the visible source
# after the workspace is cleared. Presumably M1/M2 are matrices and A1 is a
# 3-d array built from them with dimnames such as "c2" and "mat1" -- confirm.
A1[, "c2", "mat1"]
# printing the element in the 2nd row and 3rd column of second matrix
A1[2, 3, 2]
M1

# Aggregation on array elements
apply(M1, 1, sum) # 1 - along row
apply(M2, 2, sum) # 2 - along column
A1
apply(A1, 1, sum)
apply(A1, 2, mean)
rm(list = ls())

# as.Date(d)
# to convert date string to date class
d <- as.Date("2022-8-25") # default format - year-month-day
class(d)
d
# as.Date() keeps only the date; the time of day in the string is dropped.
as.Date("2022-8-25 10:44:22")
as.Date("2022-8-25 21:15")

# POSIXlt stores a date-time as a list of components.
pd <- as.POSIXlt("2022-8-25")
pd
class(pd)
pd <- as.POSIXlt("2022-8-17 21:15:30")
pd$sec
pd$hour
pd$min
pd$mday
pd$year # years since 1900
unlist(pd)

# Format codes for parsing and printing date-times.
?strptime
help("strptime")
# NOTE(review): `x` and `y` are not created in the visible source; presumably
# they are date-time objects of different classes -- confirm.
class(x)
class(y)
x <- as.POSIXlt(x)
x - y # time difference between the two

# A date-time with an explicit time zone, and its difference from x.
xgmt <- as.POSIXct("2021-08-25 08:00:00", tz = "GMT")
xgmt
xgmt - x
# if(!file.exists("data"))
#   dir.create("data")
# install.packages("XLConnect")
# library(XLConnect)
library(readxl)
# excel_sheets('E:/sweetlin-official/FALL 2020 -2021/CSE3505/R programs/loans data.xlsx')
# List the sheets of the workbook (path relative to the working directory),
# then read the sheet named "sample".
excel_sheets("loans data.xlsx")
df <- read_excel("loans data.xlsx", sheet = "sample")
str(df)
# NOTE(review): `doc` (a parsed XML document) and `jdata` (parsed JSON) are
# not created in the visible source, and no library() call for xmlRoot() /
# xmlName() is visible -- confirm.
root <- xmlRoot(doc)
root
xmlName(root)
names(root)
root

class(jdata)
str(jdata)
head(jdata, 2)
# Extracting nested objects
names(jdata$owner)
jdata$owner$login
# clear workspace
rm(list = ls())

# cleaning NA values
# NOTE(review): `loan` is not created in the visible source after the
# workspace is cleared -- confirm where it is loaded.
loan_clean <- na.omit(loan)
sum(is.na(loan_clean))
str(loan_clean)

# Inspect Amount.Requested: missing count, distinct values, and the two
# candidate imputation values (NAs excluded).
sum(is.na(loan$Amount.Requested))
unique(loan$Amount.Requested)
mean(loan$Amount.Requested, na.rm = TRUE)
median(loan$Amount.Requested, na.rm = TRUE)
# library(dplyr)
library(tidyverse)

# Decide whether to impute with mean or median
loan %>%
  summarize(
    avg = mean(Amount.Requested, na.rm = TRUE),
    med = median(Amount.Requested, na.rm = TRUE)
  )
sum(is.na(loan$Amount.Requested))

# Rename a column
loan <- loan %>%
  rename(Amt_Req = Amount.Requested)
names(loan)
str(loan)
# cleaning Amount.Funded.By.Investors column
sum(is.na(loan$Amount.Funded.By.Investors))
unique(loan$Amount.Funded.By.Investors)
str(loan)
loan <- loan %>%
  rename(Amt_fund = Amount.Funded.By.Investors)

# convert the type to numeric, then re-count the NAs after the conversion
loan$Amt_fund <- as.numeric(loan$Amt_fund)
sum(is.na(loan$Amt_fund))
str(loan)
# cleaning Interest.Rate column
sum(is.na(loan$Interest.Rate))

# cleaning unwanted substring in a chr column: strip every "%"
loan <- loan %>%
  mutate(Interest.Rate = gsub("%", "", Interest.Rate))
head(loan$Interest.Rate, 2)

# With the "%" removed, the column can be converted to numeric.
loan$Interest.Rate <- as.numeric(loan$Interest.Rate)
str(loan)
# Loan.Length: distinct values and missing count.
# NOTE(review): the same checks repeat with no cleaning step between them;
# the cleaning statements may be missing from the visible source -- confirm.
unique(loan$Loan.Length)
sum(is.na(loan$Loan.Length))
unique(loan$Loan.Length)
# checking
sum(is.na(loan$Loan.Length))
unique(loan$Loan.Length)

# Employment.Length: distinct values, missing count, frequency of each value
# and the average frequency.
unique(loan$Employment.Length)
# checking
unique(loan$Employment.Length)
sum(is.na(loan$Employment.Length))
table(loan$Employment.Length)
mean(table(loan$Employment.Length))

# FICO bounds: convert to integer. The backticks are required because the
# column names contain a hyphen.
loan$`fico-high` <- as.integer(loan$`fico-high`)
loan$`fico-low` <- as.integer(loan$`fico-low`)
str(loan)
sum(is.na(loan$`fico-high`))
sum(is.na(loan$`fico-low`))
unique(loan$`fico-high`)
unique(loan$`fico-low`)
# statistical analysis - Numerical measure
str(faithful) # faithful - built-in data
head(faithful)

# Measure of dispersion
range(faithful$eruptions)
max(faithful$eruptions) - min(faithful$eruptions)
# quartile
quantile(faithful$eruptions)
# Inter-quartile range
IQR(faithful$eruptions)
# percentile (27th, 35th and 65th)
quantile(faithful$eruptions, c(.27, .35, .65))
# variance
var(faithful$eruptions)
# standard deviation
sd(faithful$eruptions)
# covariance
cov(faithful$eruptions, faithful$waiting)
# correlation
cor(faithful$eruptions, faithful$waiting)
# NOTE(review): skewness() and kurtosis() are not base R functions and no
# library() call providing them is visible -- confirm which package is meant.
# skewness
skewness(faithful$eruptions)
# kurtosis
kurtosis(faithful$eruptions)

# frequency distribution
# step 1 - find range
range(faithful$eruptions)
# step 4 - Compute the frequency of eruptions in each sub-interval with the
# table function.
# NOTE(review): `interval` is not created in the visible source (steps 2 and 3
# appear to be missing) -- confirm.
Interval_freq <- table(interval)
Interval_freq
cbind(Interval_freq)

# relative frequency
relfreq <- Interval_freq / nrow(faithful)
# Print with two significant digits, then put the previous setting back;
# `old` holds the options that were in effect before the change.
old <- options(digits = 2)
cbind(Interval_freq, relfreq)
options(old)

# cumulative frequency
cumfreq <- cumsum(table(interval))
cumfreq
cbind(cumfreq)
rm(list = ls())

# List the contents of the base graphics package.
library(help = graphics)
data("airquality")
str(airquality)

# scatter plot of Ozone against Wind
plot(airquality$Ozone, airquality$Wind)
?plot
# histogram
hist(airquality$Solar.R)
# boxplot
summary(airquality$Ozone)
boxplot(airquality$Ozone)
# multiple boxplot (first four columns)
boxplot(airquality[, 1:4], main = "multiple box plots")
# pie chart
# NOTE(review): only the value inspection is visible here; the pie() call
# itself appears to be missing -- confirm.
unique(airquality$Wind)
table(airquality$Wind)
# grid of charts
# par() returns the previous values of the settings it changes; they are kept
# so the 2 x 3 layout does not leak into later base plots.
old_par <- par(mfrow = c(2, 3), mar = c(2, 2, 2, 1), las = 0, bty = "n")
plot(airquality$Ozone)
plot(airquality$Ozone, airquality$Wind)
plot(airquality$Ozone, type = "l")
barplot(airquality$Ozone, main = "Ozone levels", ylab = "ozone value")
hist(airquality$Solar.R)
boxplot(airquality$Ozone)
par(old_par)
# lattice graph: attach the lattice package, which provides densityplot()
library(lattice)
# density plot of the Ozone column of airquality
densityplot(airquality$Ozone)
rm(list = ls())
data("mtcars")
# install.packages("ggplot2")
library(ggplot2)
head(mtcars, 2)

# scatter plot
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_point()

# Turn the cylinder count into a labelled factor for use as a colour.
unique(mtcars$cyl)
cyl_factor <- factor(
  mtcars$cyl,
  levels = c(4, 6, 8),
  labels = c("4cyl", "6cyl", "8cyl")
)

# NOTE(review): `gear_factor` is not created in the visible source (presumably
# a factor built from mtcars$gear in the same way as cyl_factor) -- confirm.
# NOTE(review): `size = 4` sits inside aes(), so it is treated as a mapped
# value rather than a fixed point size -- confirm that this is intended.
ggplot(mtcars, aes(x = wt, y = mpg, shape = gear_factor)) +
  geom_point(aes(color = cyl_factor, size = 4)) +
  geom_point(color = "grey", size = 1.5)