You are on page 1 of 13

LAB ASSIGNMENT – 9

Data cleaning and summarizing with dplyr package:


install.packages("dplyr")

package ‘dplyr’ successfully unpacked and MD5 sums checked

The downloaded binary packages are in


C:\Users\Dell\AppData\Local\Temp\RtmpQVVXDp\downloaded_packages
> library("dplyr", lib.loc="~/R/win-library/3.5")
> install.packages("readxl")
package ‘rematch’ successfully unpacked and MD5 sums checked
package ‘hms’ successfully unpacked and MD5 sums checked
package ‘prettyunits’ successfully unpacked and MD5 sums checked
package ‘cellranger’ successfully unpacked and MD5 sums checked
package ‘Rcpp’ successfully unpacked and MD5 sums checked
package ‘progress’ successfully unpacked and MD5 sums checked
package ‘readxl’ successfully unpacked and MD5 sums checked

The downloaded binary packages are in


C:\Users\Dell\AppData\Local\Temp\RtmpQVVXDp\downloaded_packages
> library("readxl", lib.loc="~/R/win-library/3.5")
> detach("package:dplyr", unload = TRUE)
> library(dplyr)

Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:


filter, lag

The following objects are masked from ‘package:base’:

intersect, setdiff, setequal, union

> mydata<- read.csv("D:\\Users\\Dell\\MTCars.csv")


> mydata
mpg cyl disp hp drat wt qsec vs am gear carb
Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
>
> #SELECTING RANDOM N ROWS
>
> sample_n(mydata,3)
mpg cyl disp hp drat wt qsec vs am gear carb
Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
>
> #SELECTING RANDOM FRACTION OF ROWS.
> #REMOVE DUPLICATE ROWS BASED ON ALL THE VARIABLES (COMPLETE ROW)
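> # NOTE: "0,1" is read as size = 0 and replace = 1, so no rows are drawn; a 10% sample was likely intended: sample_frac(mydata,0.1)
> sample_frac(mydata,0,1)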
[1] mpg cyl disp hp drat wt qsec vs am gear carb
<0 rows> (or 0-length row.names)
> x1<-distinct(mydata)
> x1
mpg cyl disp hp drat wt qsec vs am gear carb
Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
>
> View(mydata)
> View(x1)

> #REMOVE DUPLICATE ROWS BASED ON A VARIABLE


> x2<-distinct(mydata,mpg,.keep_all=TRUE)
> x2
mpg cyl disp hp drat wt qsec vs am gear carb
Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
> x2<-distinct(mydata,mpg)
> x2
mpg
Mazda RX4 21.0
Datsun 710 22.8
Hornet 4 Drive 21.4
Hornet Sportabout 18.7
Valiant 18.1
>
> #REMOVE DUPLICATE ROWS BASED ON MULTIPLE VARIABLES
>
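> # NOTE: the comma before .keep_all is missing, so "cyl.keep_all=TRUE" creates a new column named cyl.keep_all instead of de-duplicating on cyl; the intended call is distinct(mydata,mpg,cyl,.keep_all=TRUE)
> x2<-distinct(mydata,mpg,cyl.keep_all=TRUE)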
> x2
mpg cyl.keep_all
Mazda RX4 21.0 TRUE
Datsun 710 22.8 TRUE
Hornet 4 Drive 21.4 TRUE
Hornet Sportabout 18.7 TRUE
Valiant 18.1 TRUE
>
> #SELECTING VARIABLES (OR COLUMNS)
>
> data<-head(mtcars)
> data2<-select(mydata,hp)
> data2
hp
Mazda RX4 110
Mazda RX4 Wag 110
Datsun 710 93
Hornet 4 Drive 110
Hornet Sportabout 175
Valiant 105
> data2<-select(mydata,hp,cyl)
> data2
hp cyl
Mazda RX4 110 6
Mazda RX4 Wag 110 6
Datsun 710 93 4
Hornet 4 Drive 110 6
Hornet Sportabout 175 8
Valiant 105 6
>
> data2<-select(mydata,hp,cyl:mpg)
> data2
hp cyl mpg
Mazda RX4 110 6 21.0
Mazda RX4 Wag 110 6 21.0
Datsun 710 93 4 22.8
Hornet 4 Drive 110 6 21.4
Hornet Sportabout 175 8 18.7
Valiant 105 6 18.1
>
> #DROPPING VARIABLES :
> mydata1<-select(mydata,-cyl,-mpg)
> mydata1
disp hp drat wt qsec vs am gear carb
Mazda RX4 160 110 3.90 2.620 16.46 0 1 4 4
Mazda RX4 Wag 160 110 3.90 2.875 17.02 0 1 4 4
Datsun 710 108 93 3.85 2.320 18.61 1 1 4 1
Hornet 4 Drive 258 110 3.08 3.215 19.44 1 0 3 1
Hornet Sportabout 360 175 3.15 3.440 17.02 0 0 3 2
Valiant 225 105 2.76 3.460 20.22 1 0 3 1
>
> #SELECTING OR DROPPING VARIABLES THAT START WITH 'M'
> mydata3<-select(mydata,starts_with("M"))
> mydata3
mpg
Mazda RX4 21.0
Mazda RX4 Wag 21.0
Datsun 710 22.8
Hornet 4 Drive 21.4
Hornet Sportabout 18.7
Valiant 18.1
> mydata3<-select(mydata,-starts_with("M"))
> mydata3
cyl disp hp drat wt qsec vs am gear carb
Mazda RX4 6 160 110 3.90 2.620 16.46 0 1 4 4
Mazda RX4 Wag 6 160 110 3.90 2.875 17.02 0 1 4 4
Datsun 710 4 108 93 3.85 2.320 18.61 1 1 4 1
Hornet 4 Drive 6 258 110 3.08 3.215 19.44 1 0 3 1
Hornet Sportabout 8 360 175 3.15 3.440 17.02 0 0 3 2
Valiant 6 225 105 2.76 3.460 20.22 1 0 3 1
>
> mydata4<-select(mydata,ends_with("M"))
> mydata4
am
Mazda RX4 1
Mazda RX4 Wag 1
Datsun 710 1
Hornet 4 Drive 0
Hornet Sportabout 0
Valiant 0
> mydata4<-select(mydata,-ends_with("M"))
> mydata4
mpg cyl disp hp drat wt qsec vs gear carb
Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 4 4
Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 4 4
Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 4 1
Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 3 1
Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 3 2
Valiant 18.1 6 225 105 2.76 3.460 20.22 1 3 1
> mydata5<-select(mydata,contains("M"))
> mydata5
mpg am
Mazda RX4 21.0 1
Mazda RX4 Wag 21.0 1
Datsun 710 22.8 1
Hornet 4 Drive 21.4 0
Hornet Sportabout 18.7 0
Valiant 18.1 0
> mydata5<-select(mydata,-contains("M"))
> mydata5
cyl disp hp drat wt qsec vs gear carb
Mazda RX4 6 160 110 3.90 2.620 16.46 0 4 4
Mazda RX4 Wag 6 160 110 3.90 2.875 17.02 0 4 4
Datsun 710 4 108 93 3.85 2.320 18.61 1 4 1
Hornet 4 Drive 6 258 110 3.08 3.215 19.44 1 3 1
Hornet Sportabout 8 360 175 3.15 3.440 17.02 0 3 2
Valiant 6 225 105 2.76 3.460 20.22 1 3 1
>
> mydata6<-select(mydata,matches("M"))
> mydata6
mpg am
Mazda RX4 21.0 1
Mazda RX4 Wag 21.0 1
Datsun 710 22.8 1
Hornet 4 Drive 21.4 0
Hornet Sportabout 18.7 0
Valiant 18.1 0
>
> mydata6<-select(mydata,-matches("M"))
> mydata6
cyl disp hp drat wt qsec vs gear carb
Mazda RX4 6 160 110 3.90 2.620 16.46 0 4 4
Mazda RX4 Wag 6 160 110 3.90 2.875 17.02 0 4 4
Datsun 710 4 108 93 3.85 2.320 18.61 1 4 1
Hornet 4 Drive 6 258 110 3.08 3.215 19.44 1 3 1
Hornet Sportabout 8 360 175 3.15 3.440 17.02 0 3 2
Valiant 6 225 105 2.76 3.460 20.22 1 3 1
>
> mydata7<-select(mydata,everything())
> mydata7
mpg cyl disp hp drat wt qsec vs am gear carb
Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
>
> mydata7<-select(mydata,-everything())
> mydata7
data frame with 0 columns and 6 rows
>
> mydata8<-select(mydata,one_of("mpg"))
> mydata8
mpg
Mazda RX4 21.0
Mazda RX4 Wag 21.0
Datsun 710 22.8
Hornet 4 Drive 21.4
Hornet Sportabout 18.7
Valiant 18.1
>
> mydata8<-select(mydata,-one_of("mpg"))
> mydata8
cyl disp hp drat wt qsec vs am gear carb
Mazda RX4 6 160 110 3.90 2.620 16.46 0 1 4 4
Mazda RX4 Wag 6 160 110 3.90 2.875 17.02 0 1 4 4
Datsun 710 4 108 93 3.85 2.320 18.61 1 1 4 1
Hornet 4 Drive 6 258 110 3.08 3.215 19.44 1 0 3 1
Hornet Sportabout 8 360 175 3.15 3.440 17.02 0 0 3 2
Valiant 6 225 105 2.76 3.460 20.22 1 0 3 1
>
> #REORDER VARIABLES :
> mydata9<-select(mydata,mpg,everything())
> mydata9
mpg cyl disp hp drat wt qsec vs am gear carb
Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
> #RENAME VARIABLES :
> mydata10<-rename(mydata,CPG=mpg)
> mydata10
CPG cyl disp hp drat wt qsec vs am gear carb
Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
> #CREATE JOIN OPERATION :
> df1<-data.frame(ID=c(1,2,3,4,5),
+ w=c('a',"b","c","d","e"),
+ x=c(1,1,0,0,1),
+ y=rnorm(5),
+ z=letters[1:5])
>
> df2<-data.frame(ID=c(6,7,8,9,10),
+ w=c('f','g','h','i','j'),
+ x=c(1,2,3,4,1),
+ y=rnorm(5),
+ z=letters[2:6])
> df3<-inner_join(df1,df2,by="ID")
> df3
[1] ID w.x x.x y.x z.x w.y x.y y.y z.y
<0 rows> (or 0-length row.names)
> df1<-data.frame(ID=c(1,2,3,4,5),
+ w=c('a','b','c','d','e'),
+ x=c(1,1,0,0,1),
+ y=rnorm(5),
+ z=letters[1:5])
> df3<-inner_join(df1,df2,by="ID")
> df3
[1] ID w.x x.x y.x z.x w.y x.y y.y z.y
<0 rows> (or 0-length row.names)
> df3<-left_join(df1,df2,by="ID")
> df3
ID w.x x.x y.x z.x w.y x.y y.y z.y
1 1 a 1 1.23880722 a <NA> NA NA <NA>
2 2 b 1 0.06740645 b <NA> NA NA <NA>
3 3 c 0 -1.10506875 c <NA> NA NA <NA>
4 4 d 0 1.26324291 d <NA> NA NA <NA>
5 5 e 1 1.08766362 e <NA> NA NA <NA>
>
> df3<-right_join(df1,df2,by="ID")
> df3
ID w.x x.x y.x z.x w.y x.y y.y z.y
1 6 <NA> NA NA <NA> f 1 -0.08988933 b
2 7 <NA> NA NA <NA> g 2 -1.45710936 c
3 8 <NA> NA NA <NA> h 3 -0.15047338 d
4 9 <NA> NA NA <NA> i 4 -0.73064022 e
5 10 <NA> NA NA <NA> j 1 -1.41407058 f
>
> df3<-full_join(df1,df2,by="ID")
> df3
ID w.x x.x y.x z.x w.y x.y y.y z.y
1 1 a 1 1.23880722 a <NA> NA NA <NA>
2 2 b 1 0.06740645 b <NA> NA NA <NA>
3 3 c 0 -1.10506875 c <NA> NA NA <NA>
4 4 d 0 1.26324291 d <NA> NA NA <NA>
5 5 e 1 1.08766362 e <NA> NA NA <NA>
6 6 <NA> NA NA <NA> f 1 -0.08988933 b
7 7 <NA> NA NA <NA> g 2 -1.45710936 c
8 8 <NA> NA NA <NA> h 3 -0.15047338 d
9 9 <NA> NA NA <NA> i 4 -0.73064022 e
10 10 <NA> NA NA <NA> j 1 -1.41407058 f
>
> #SUMMARISE SELECTED VARIABLES :
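> # NOTE: the column labelled mpg_mean below is computed from hp, so it is the mean of hp; use mean(mpg) for the mpg mean, or name the column hp_mean
> summarise(mydata,mpg_mean=mean(hp),hp_med=median(hp))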
mpg_mean hp_med
1 117.1667 110
>
> #SUMMARISE MULTIPLE VARIABLES :
> summarise_at(mydata,vars(hp,disp),funs(n(),mean,median))
hp_n disp_n hp_mean disp_mean hp_median disp_median
1 6 6 117.1667 211.8333 110 192.5
# Simple named list:
list(mean = mean, median = median)
# Auto named with `tibble::lst()`:
tibble::lst(mean, median)
# Using lambdas
list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))

> summarise_at(mydata,vars(hp,disp),funs(n(),missing=sum(is.na(.)),mean(.,na.rm=TRUE),median(.,na.rm=TRUE)))
hp_n disp_n hp_missing disp_missing hp_mean disp_mean hp_median
1 6 6 0 0 117.1667 211.8333 110
disp_median
1 192.5
>
> summarise_if(mydata,is.numeric,funs(n(),mean,median))
mpg_n cyl_n disp_n hp_n drat_n wt_n qsec_n vs_n am_n gear_n carb_n
1 6 6 6 6 6 6 6 6 6 6 6
mpg_mean cyl_mean disp_mean hp_mean drat_mean wt_mean qsec_mean
1 20.5 6 211.8333 117.1667 3.44 2.988333 18.12833
vs_mean am_mean gear_mean carb_mean mpg_median cyl_median disp_median
1 0.5 0.5 3.5 2.166667 21 6 192.5
hp_median drat_median wt_median qsec_median vs_median am_median
1 110 3.5 3.045 17.815 0.5 0.5
gear_median carb_median
1 3.5 1.5

Name: GHANA SYAM SAI VARMA


Reg.no: 20BCD7129

You might also like