Ex 2 - Data Preprocessing and Exploration - Fortune 500 Dataset With Comments

Ex-1—my-sol.
R
Admin
2019-09-21
getwd() # returns the present working directory
## [1] "D:/Academics/AY 2019-20/ODD SEM 2019/Predictive

Analytics/Datasets/Exercises/Ex1"
# keep the path on a single line - a line break inside the quotes would become
# part of the directory name and setwd() would fail
setwd("D:/Academics/AY 2019-20/ODD SEM 2019/Predictive Analytics/Datasets/Exercises/Ex1")
# setwd changes the working directory - change this to the folder
# which contains the data file.
fin <- read.csv("Future 500.csv")
# import csv file to object "fin" - you can give any name
head(fin, 24) # want to see the first 24 rows
## ID Name Industry Inception Employees State

## 1 1 Over-Hex Software 2006 25 TN
## 2 2 Unimattax IT Services 2009 36 PA
## 3 3 Greenfax Retail 2012 NA SC
## 4 4 Blacklane IT Services 2011 66 CA
## 5 5 Yearflex Software 2013 45 WI
## 6 6 Indigoplanet IT Services 2013 60 NJ
## 7 7 Treslam Financial Services 2009 116 MO
## 8 8 Rednimdox Construction 2013 73 NY
## 9 9 Lamtone IT Services 2009 55 CA
## 10 10 Stripfind Financial Services 2010 25 FL
## 11 11 Canecorporation Health 2012 6
## 12 12 Mattouch IT Services 2013 6 WA
## 13 13 Techdrill Health 2009 9 MS
## 14 14 Techline 2006 65 CA
## 15 15 Cityace 2010 25 CO
## 16 16 Kayelectronics Health 2009 687 NC
## 17 17 Ganzlax IT Services 2011 75 NJ
## 18 18 Trantraxlax Government Services 2011 35 VA
## 19 19 E-Zim Retail 2008 320 OH
## 20 20 Daltfase Software 2011 78 NC
## 21 21 Hotlane Government Services 2012 87 AL
## 22 22 Lathotline Health NA 103 VA
## 23 23 Lambam IT Services 2012 210 SC
## 24 24 Quozap Software 2004 21 NJ
## City Revenue Expenses Profit Growth
## 1 Franklin $9,684,527 1,130,700 Dollars 8553827 19%
## 2 Newtown Square $14,016,543 804,035 Dollars 13212508 20%
## 3 Greenville $9,746,272 1,044,375 Dollars 8701897 16%
## 4 Orange $15,359,369 4,631,808 Dollars 10727561 19%
## 5 Madison $8,567,910 4,374,841 Dollars 4193069 19%
## 6 Manalapan $12,805,452 4,626,275 Dollars 8179177 22%
## 7 Clayton $5,387,469 2,127,984 Dollars 3259485 17%
## 8 Woodside NA
## 9 San Ramon $11,757,018 6,482,465 Dollars 5274553 30%
## 10 Boca Raton $12,329,371 916,455 Dollars 11412916 20%
## 11 New York $10,597,009 7,591,189 Dollars 3005820 7%
## 12 Bellevue $14,026,934 7,429,377 Dollars 6597557 26%
## 13 Flowood $10,573,990 7,435,363 Dollars 3138627 8%
## 15 Louisville $9,254,614 6,249,498 Dollars 3005116 6%
## 16 Clayton $9,451,943 3,878,113 Dollars 5573830 4%
## 17 Iselin $14,001,180 11901180 18%
## 18 Suffolk $11,088,336 5,635,276 Dollars 5453060 7%
## 19 Monroe $10,746,451 4,762,319 Dollars 5984132 13%
## 20 Durham $10,410,628 6,196,409 Dollars 4214219 17%
## 21 Huntsville $7,978,332 5,686,574 Dollars 2291758 2%
## 22 McLean $9,418,303 7,567,233 Dollars 1851070 2%
## 23 Columbia $11,950,148 4,365,512 Dollars 7584636 20%
## 24 Collingswood $8,304,480 7,019,973 Dollars 1284507 20%
# after running this we find some blank cells that are not shown as NA

# this is because read.csv keeps an empty field in a text column as the
# empty string "" instead of marking it as missing
# how to fix this - go back to the data import line and change the code:
# passing na.strings = "" tells read.csv to treat "" (a blank cell) as NA
fin <- read.csv("Future 500.csv", na.strings="")
head(fin, 24)

## 11 11 Canecorporation Health 2012 6 <NA>
## 14 14 Techline <NA> 2006 65 CA
## 15 15 Cityace <NA> 2010 25 CO
## 19 19 E-Zim Retail 2008 320 OH
## 22 22 Lathotline Health NA 103 VA
## 1 Franklin $9,684,527 1,130,700 Dollars 8553827 19%
## 2 Newtown Square $14,016,543 804,035 Dollars 13212508 20%
## 3 Greenville $9,746,272 1,044,375 Dollars 8701897 16%
## 4 Orange $15,359,369 4,631,808 Dollars 10727561 19%
## 5 Madison $8,567,910 4,374,841 Dollars 4193069 19%
## 6 Manalapan $12,805,452 4,626,275 Dollars 8179177 22%
## 7 Clayton $5,387,469 2,127,984 Dollars 3259485 17%
## 8 Woodside <NA> <NA> NA <NA>
## 10 Boca Raton $12,329,371 916,455 Dollars 11412916 20%
## 11 New York $10,597,009 7,591,189 Dollars 3005820 7%
## 12 Bellevue $14,026,934 7,429,377 Dollars 6597557 26%
## 13 Flowood $10,573,990 7,435,363 Dollars 3138627 8%
## 15 Louisville $9,254,614 6,249,498 Dollars 3005116 6%
## 16 Clayton $9,451,943 3,878,113 Dollars 5573830 4%
## 17 Iselin $14,001,180 <NA> 11901180 18%
## 18 Suffolk $11,088,336 5,635,276 Dollars 5453060 7%
## 19 Monroe $10,746,451 4,762,319 Dollars 5984132 13%
## 20 Durham $10,410,628 6,196,409 Dollars 4214219 17%
## 21 Huntsville $7,978,332 5,686,574 Dollars 2291758 2%
## 22 McLean $9,418,303 7,567,233 Dollars 1851070 2%
## 23 Columbia $11,950,148 4,365,512 Dollars 7584636 20%
## 24 Collingswood $8,304,480 7,019,973 Dollars 1284507 20%
# note now blank spaces are displayed as NA

# note NA value in an integer will be like this NA
# note NA in a string or character will be displayed like this <NA>
tail(fin) # returns the last 6 rows

## 495 495 Rawfishcomplete Financial Services 2012 124 CA
## 496 496 Buretteadmirable IT Services 2009 93 ME
## 497 497 Inventtremendous Construction 2009 24 MN
## 498 498 Overviewparrot Retail 2011 7125 TX
## 499 499 Belaguerra IT Services 2010 140 MI
## 500 500 Allpossible IT Services 2011 24 CA
## 495 Los Angeles $10,624,949 2,951,178 Dollars 7673771 22%
## 496 Portland $15,407,450 2,833,136 Dollars 12574314 25%
## 497 Woodbury $9,144,857 4,755,995 Dollars 4388862 11%
## 498 Fort Worth $11,134,728 5,152,110 Dollars 5982618 12%
## 499 Troy $17,387,130 1,387,784 Dollars 15999346 23%
## 500 Los Angeles $11,949,706 689,161 Dollars 11260545 24%
str(fin) # returns the structure of the data
## 'data.frame': 500 obs. of 11 variables:

## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Name : Factor w/ 500 levels "Abstractedchocolat",..: 297 451 168 40
485 199 435 339 242 395 ...
## $ Industry : Factor w/ 7 levels "Construction",..: 7 5 6 5 7 5 2 1 5
2 ...
## $ Inception: int 2006 2009 2012 2011 2013 2013 2009 2013 2009 2010 ...
## $ Employees: int 25 36 NA 66 45 60 116 73 55 25 ...
## $ State : Factor w/ 42 levels "AL","AZ","CA",..: 36 33 35 3 41 27 22
29 3 8 ...
## $ City : Factor w/ 297 levels "Addison","Alexandria",..: 94 181 105
195 151 154 53 295 232 26 ...
## $ Revenue : Factor w/ 498 levels "$1,614,585","$1,835,717",..: 479 194
485 246 402 141 308 NA 96 117 ...
## $ Expenses : Factor w/ 497 levels "1,026,548 Dollars",..: 6 485 3 248 227
247 57 NA 402 495 ...
## $ Profit : int 8553827 13212508 8701897 10727561 4193069 8179177
3259485 NA 5274553 11412916 ...
## $ Growth : Factor w/ 32 levels "-2%","-3%","0%",..: 14 16 11 14 14 18
12 NA 26 16 ...
# The dataset has 500 observations of 11 variables.

# we want to convert ID, Inception from integer to Factor (Factors are
# categorical variables in R)
fin$ID <- as.factor(fin$ID)
fin$Inception <- as.factor(fin$Inception)
# Revenue, Expenses and Growth still carry "$", ",", "%" and " Dollars", so
# they cannot be treated as numbers yet. gsub() removes every match of a
# pattern; replacing with "" (the empty string) simply deletes it.
# "$" is a special character in a regular expression, so it is written "\\$".
# gsub() returns character vectors - the numeric conversion is a later step.
fin$Revenue <- gsub(",", "", gsub("\\$", "", fin$Revenue))
fin$Growth <- gsub("%", "", fin$Growth)
fin$Expenses <- gsub(" Dollars", "", gsub(",", "", fin$Expenses))
head(fin) # check all necessary changes are done
## ID Name Industry Inception Employees State City

## 1 1 Over-Hex Software 2006 25 TN Franklin
## 2 2 Unimattax IT Services 2009 36 PA Newtown Square
## 3 3 Greenfax Retail 2012 NA SC Greenville
## 4 4 Blacklane IT Services 2011 66 CA Orange
## 5 5 Yearflex Software 2013 45 WI Madison
## 6 6 Indigoplanet IT Services 2013 60 NJ Manalapan
## Revenue Expenses Profit Growth
## 1 9684527 1130700 8553827 19
## 2 14016543 804035 13212508 20
## 3 9746272 1044375 8701897 16
## 4 15359369 4631808 10727561 19
## 5 8567910 4374841 4193069 19
## 6 12805452 4626275 8179177 22
# now check the structure of data set

str(fin)

## $ ID : Factor w/ 500 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9
10 ...
485 199 435 339 242 395 ...
2 ...
## $ Inception: Factor w/ 16 levels "1999","2000",..: 8 11 14 13 15 15 11 15
11 12 ...
## $ Employees: int 25 36 NA 66 45 60 116 73 55 25 ...
29 3 8 ...
195 151 154 53 295 232 26 ...
## $ Revenue : chr "9684527" "14016543" "9746272" "15359369" ...
## $ Expenses : chr "1130700" "804035" "1044375" "4631808" ...
## $ Profit : int 8553827 13212508 8701897 10727561 4193069 8179177
3259485 NA 5274553 11412916 ...
## $ Growth : chr "19" "20" "16" "19" ...
# gsub() left Revenue, Expenses and Growth as character columns, but we need
# them to be numeric - convert the three columns in one step
fin[c("Revenue", "Expenses", "Growth")] <-
  lapply(fin[c("Revenue", "Expenses", "Growth")], as.numeric)
str(fin) # check again

## $ ID : Factor w/ 500 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9
10 ...
485 199 435 339 242 395 ...
2 ...
## $ Inception: Factor w/ 16 levels "1999","2000",..: 8 11 14 13 15 15 11 15
11 12 ...
## $ Employees: int 25 36 NA 66 45 60 116 73 55 25 ...
29 3 8 ...
195 151 154 53 295 232 26 ...
## $ Revenue : num 9684527 14016543 9746272 15359369 8567910 ...
## $ Expenses : num 1130700 804035 1044375 4631808 4374841 ...
## $ Profit : int 8553827 13212508 8701897 10727561 4193069 8179177
3259485 NA 5274553 11412916 ...
## $ Growth : num 19 20 16 19 19 22 17 NA 30 20 ...
# now we have changed all the data types

# to find all the missing values in the dataset use the complete.cases()
# function
# NA in R means data is missing
complete.cases(fin) # returns TRUE for a complete row and FALSE for a row
# with missing data.
## [1] TRUE TRUE FALSE TRUE TRUE TRUE TRUE FALSE TRUE TRUE FALSE
## [12] TRUE TRUE FALSE FALSE TRUE FALSE TRUE TRUE TRUE TRUE FALSE
## [23] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [34] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
## [78] TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE
## [265] TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [331] TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [375] TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE
## [496] TRUE TRUE TRUE TRUE TRUE
!complete.cases(fin) # returns the opposite of the above code (the NOT
# operator ! is used)
## [1] FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE
## [12] FALSE FALSE TRUE TRUE FALSE TRUE FALSE FALSE FALSE FALSE TRUE
## [23] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [34] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [78] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## [265] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [331] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [375] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [496] FALSE FALSE FALSE FALSE FALSE
# Next line of code is used to filter from the dataset only missing cases
fin[!complete.cases(fin),] # rows go before the comma, columns after it (like
# c[2,3] for row 2, column 3 of a matrix); leaving the column part empty
# keeps all columns

## 14 14 Techline <NA> 2006 65 CA
## 15 15 Cityace <NA> 2010 25 CO
## 22 22 Lathotline Health <NA> 103 VA
## 44 44 Ganzgreen Construction 2010 224 TN
## 84 84 Drilldrill Software 2010 30 <NA>
## 267 267 Circlechop Software 2010 14 <NA>
## 332 332 Westminster Financial Services 2010 NA MI
## 379 379 Stovepuck Retail 2013 73 <NA>
## 3 Greenville 9746272 1044375 8701897 16
## 8 Woodside NA NA NA NA
## 11 New York 10597009 7591189 3005820 7
## 14 San Ramon 13898119 5470303 8427816 23
## 15 Louisville 9254614 6249498 3005116 6
## 17 Iselin 14001180 NA 11901180 18
## 22 McLean 9418303 7567233 1851070 2
## 44 Franklin NA NA NA 9
## 84 San Francisco 7800620 2785799 5014821 17
## 267 San Francisco 9067070 5929828 3137242 20
## 332 Troy 11861652 5245126 6616526 15
## 379 New York 13814975 5904502 7910473 10
# now we have 12 rows with NA values - <NA> represents NA in a string or
# character column - NA represents a missing value in an integer or num column
# Let's fix the missing values in the Industry column first
# how to do that - delete rows 14 and 15, which are NA in Industry, as the data
# cannot be obtained
# before we start deleting rows let us create a backup of our dataset
fin.backup <- fin
fin <- fin[!is.na(fin$Industry),]
# the above code keeps every row whose value in the 3rd column (Industry) is
# not NA, keeps all columns, and stores the result back in the fin object
head(fin,24) # row no 14 and 15 are deleted.

## 19 19 E-Zim Retail 2008 320 OH
## 25 25 Tampware Construction 2011 13 TX
## 26 26 Dalthow Health 2000 20 GA
## 1 Franklin 9684527 1130700 8553827 19
## 2 Newtown Square 14016543 804035 13212508 20
## 3 Greenville 9746272 1044375 8701897 16
## 4 Orange 15359369 4631808 10727561 19
## 5 Madison 8567910 4374841 4193069 19
## 6 Manalapan 12805452 4626275 8179177 22
## 7 Clayton 5387469 2127984 3259485 17
## 9 San Ramon 11757018 6482465 5274553 30
## 10 Boca Raton 12329371 916455 11412916 20
## 11 New York 10597009 7591189 3005820 7
## 12 Bellevue 14026934 7429377 6597557 26
## 13 Flowood 10573990 7435363 3138627 8
## 16 Clayton 9451943 3878113 5573830 4
## 17 Iselin 14001180 NA 11901180 18
## 18 Suffolk 11088336 5635276 5453060 7
## 19 Monroe 10746451 4762319 5984132 13
## 20 Durham 10410628 6196409 4214219 17
## 21 Huntsville 7978332 5686574 2291758 2
## 22 McLean 9418303 7567233 1851070 2
## 23 Columbia 11950148 4365512 7584636 20
## 24 Collingswood 8304480 7019973 1284507 20
## 25 Houston 9785982 2910756 6875226 11
## 26 Dacula 10800718 7731820 3068898 7
# you can observe that after deleting 14 and 15 - the row nos are not reset
# i.e. after 13 you will find 16
# the next line of code reset the numbering
rownames(fin) <- NULL
head(fin,24) # check row nos are in order

## 17 19 E-Zim Retail 2008 320 OH
## 23 25 Tampware Construction 2011 13 TX
## 24 26 Dalthow Health 2000 20 GA
## 1 Franklin 9684527 1130700 8553827 19
## 2 Newtown Square 14016543 804035 13212508 20
## 3 Greenville 9746272 1044375 8701897 16
## 4 Orange 15359369 4631808 10727561 19
## 5 Madison 8567910 4374841 4193069 19
## 6 Manalapan 12805452 4626275 8179177 22
## 7 Clayton 5387469 2127984 3259485 17
## 9 San Ramon 11757018 6482465 5274553 30
## 10 Boca Raton 12329371 916455 11412916 20
## 11 New York 10597009 7591189 3005820 7
## 12 Bellevue 14026934 7429377 6597557 26
## 13 Flowood 10573990 7435363 3138627 8
## 14 Clayton 9451943 3878113 5573830 4
## 15 Iselin 14001180 NA 11901180 18
## 16 Suffolk 11088336 5635276 5453060 7
## 17 Monroe 10746451 4762319 5984132 13
## 18 Durham 10410628 6196409 4214219 17
## 19 Huntsville 7978332 5686574 2291758 2
## 20 McLean 9418303 7567233 1851070 2
## 21 Columbia 11950148 4365512 7584636 20
## 22 Collingswood 8304480 7019973 1284507 20
## 23 Houston 9785982 2910756 6875226 11
## 24 Dacula 10800718 7731820 3068898 7
fin[!complete.cases(fin),] # check again - the rows with missing values are
# reduced from 12 to 10

## 3 Greenville 9746272 1044375 8701897 16
## 11 New York 10597009 7591189 3005820 7
## 15 Iselin 14001180 NA 11901180 18
## 20 McLean 9418303 7567233 1851070 2
## 82 San Francisco 7800620 2785799 5014821 17
## 265 San Francisco 9067070 5929828 3137242 20
## 330 Troy 11861652 5245126 6616526 15
## 377 New York 13814975 5904502 7910473 10
# next we have to fix NA in employees

# now filter all rows with NA in employees column
fin[is.na(fin$Employees),] # we have two such rows i.e. row no 3 and 330

## 3 Greenville 9746272 1044375 8701897 16
## 330 Troy 11861652 5245126 6616526 15
# median imputation method
median(fin$Employees, na.rm = TRUE) # calculate median without considering NA
# values
## [1] 56
# now find median of employees in retail industry

# first filter all employees data in retail sector
# median imputation method
filter1 <- fin[fin$Industry == "Retail", "Employees"]
med.emp.retail <- median(filter1, na.rm=TRUE)
med.emp.retail
## [1] 28
# now replace the NA in Employees for the retail sector with the calculated
# median value; the column is addressed by name rather than by position (5) so
# the line still works if the column order changes
fin[is.na(fin$Employees) & fin$Industry == "Retail", "Employees"] <-
  med.emp.retail
fin[c(3,330),]

## 3 3 Greenfax Retail 2012 28 SC
## 3 Greenville 9746272 1044375 8701897 16
## 330 Troy 11861652 5245126 6616526 15
# now check whether na is replaced by median value 28

fin[3,]
## ID Name Industry Inception Employees State City Revenue

## 3 3 Greenfax Retail 2012 28 SC Greenville 9746272
## Expenses Profit Growth
## 3 1044375 8701897 16
# Repeat the same to fill median of employee in Financial services sector

filter2 <- fin[fin$Industry == "Financial Services", "Employees"]
median.emp.fs <- median(filter2, na.rm = TRUE)
median.emp.fs
## [1] 80
# fill the missing Employees value(s) in Financial Services with that
# sector's median; the target and "<-" must stay in one expression, otherwise
# R prints the subset and then fails on the dangling "<-"
fin[is.na(fin$Employees) & fin$Industry == "Financial Services", "Employees"] <-
  median.emp.fs
fin[330,]

## 330 332 Westminster Financial Services 2010 80 MI Troy
## 330 11861652 5245126 6616526 15
# check the total no of missing values
fin[!complete.cases(fin),]

## 11 New York 10597009 7591189 3005820 7
## 15 Iselin 14001180 NA 11901180 18
## 20 McLean 9418303 7567233 1851070 2
## 82 San Francisco 7800620 2785799 5014821 17
## 265 San Francisco 9067070 5929828 3137242 20
## 377 New York 13814975 5904502 7910473 10
# let us fill the state if city is new york, then state is NY

# if city is san francisco, then state is CA
# filter all the states with NA value and city New York
# filter all the states with NA value and city San Francisco
filter3 <- fin[is.na(fin$State) & fin$City == "New York", ]
filter3

## 11 11 Canecorporation Health 2012 6 <NA> New York
## 377 379 Stovepuck Retail 2013 73 <NA> New York
## 11 10597009 7591189 3005820 7
## 377 13814975 5904502 7910473 10
filter4 <- fin[is.na(fin$State) & fin$City == "San Francisco", ]

filter4

## 82 84 Drilldrill Software 2010 30 <NA> San Francisco
## 265 267 Circlechop Software 2010 14 <NA> San Francisco
## 82 7800620 2785799 5014821 17
## 265 9067070 5929828 3137242 20
# now replace respective NA in filter 3 with NY and filter4 with CA

# rows 11, 377, 82 and 265
fin[is.na(fin$State) & fin$City == "New York","State"] <- "NY"
fin[is.na(fin$State) & fin$City == "San Francisco","State"] <- "CA"
# check NA replaced by NY and CA
fin[c(11, 377, 82,265),]

## 11 11 Canecorporation Health 2012 6 NY New York
## 377 379 Stovepuck Retail 2013 73 NY New York
## 82 84 Drilldrill Software 2010 30 CA San Francisco
## 265 267 Circlechop Software 2010 14 CA San Francisco
## 11 10597009 7591189 3005820 7
## 377 13814975 5904502 7910473 10
## 82 7800620 2785799 5014821 17
## 265 9067070 5929828 3137242 20

## 8 8 Rednimdox Construction 2013 73 NY Woodside NA
## 15 17 Ganzlax IT Services 2011 75 NJ Iselin 14001180
## 20 22 Lathotline Health <NA> 103 VA McLean 9418303
## 42 44 Ganzgreen Construction 2010 224 TN Franklin NA
## 8 NA NA NA
## 15 NA 11901180 18
## 20 7567233 1851070 2
## 42 NA NA 9
# use the same median imputation method for the NA values in the Growth column

# (similar to what was done for the Employees data)
filter4 <- fin[fin$Industry == "Construction","Growth"]
filter4
## [1] NA 11 15 13 9 12 13 11 19 5 8 8 8 12 6 10 15 9 16 12 14 9 8
## [24] 5 10 12 8 15 11 9 7 11 12 8 9 5 7 12 9 5 9 11 8 7 12 8
## [47] 7 11 11 11
fin[is.na(fin$Growth) & fin$Industry == "Construction", "Growth"] <-

median(filter4, na.rm = TRUE)
# check the growth na was in row 8
fin[8,]

## 8 NA NA 10

## 8 NA NA 10
## 15 NA 11901180 18
## 20 7567233 1851070 2
## 42 NA NA 9
# now let us fix the missing Expenses, using expenses = revenue - profit
# (this fills row 15, IT Services; rows where Revenue and Profit are also NA
# simply stay NA and are handled in the next steps)
fin$Expenses[is.na(fin$Expenses)] <-
  with(fin[is.na(fin$Expenses), ], Revenue - Profit)
# check row 15
fin[15,]

## 15 2100000 11901180 18

## 8 NA NA 10
## 20 7567233 1851070 2
## 42 NA NA 9
# now we have only 3 rows left with NA values

# let us fix the missing Revenue in the Construction industry with the
# industry median (median imputation, as for Employees)
med.construction.revenue <-
  median(fin$Revenue[fin$Industry == "Construction"], na.rm = TRUE)
fin$Revenue[is.na(fin$Revenue) & fin$Industry == "Construction"] <-
  med.construction.revenue
# check change row 8, 42
fin[c(8,42),]

## 8 8 Rednimdox Construction 2013 73 NY Woodside 9055059
## 42 44 Ganzgreen Construction 2010 224 TN Franklin 9055059
## 8 NA NA 10
## 42 NA NA 9
# now change expenses for construction industry in the same way: impute the
# industry median wherever Expenses is still missing
med.construction.expenses <-
  median(fin$Expenses[fin$Industry == "Construction"], na.rm = TRUE)
fin$Expenses[is.na(fin$Expenses) & fin$Industry == "Construction"] <-
  med.construction.expenses
fin[c(8,42),]

## 8 4506976 NA 10
## 42 4506976 NA 9
# now in these rows profit = revenue - expenses
fin$Profit[is.na(fin$Profit)] <-
  with(fin[is.na(fin$Profit), ], Revenue - Expenses)
# check
fin[c(8,42),]

## 8 4506976 4548083 10
## 42 4506976 4548083 9

## 20 7567233 1851070 2
# now we are left with only one NA value which can be ignored - year of
# inception is not required for any analysis
# ............ NOW OUR DATA IS CLEAN ...........
# Now lets plot - use ggplot2

# Visualizing Plots
# Scatter plot classified by industry showing revenue, expenses and profit
library(ggplot2)
# if ggplot2 is not installed you may get a error
# then type run this code install.packages("ggplot2")
# then type run this library(ggplot2)
# now lets plot
# A scatterplot classified by industry showing revenue, expenses, profit.
# base plot object bound to the cleaned data; layers are added to it below
plot1 <- ggplot(data = fin)
# scatter layer: Revenue on x, Expenses on y, one colour per Industry and
# point size scaled by Profit
plot1 +
  geom_point(aes(x = Revenue, y = Expenses, colour = Industry, size = Profit))
# try to plot the other types of plots required
# Home work
# plot2 A scatterplot that includes industry trends for the expenses~revenue
# relationship.
# plot3 BoxPlots showing growth by industry
# Refer the cheat sheets circulated earlier

Ex 2 - Data Preprocessing and Exploration - Fortune 500 Dataset With Comments

Uploaded by

Document Information

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

Ex 2 - Data Preprocessing and Exploration - Fortune 500 Dataset With Comments

Uploaded by

Copyright:

Available Formats

Ex-1—my-sol.

## [1] "D:/Academics/AY 2019-20/ODD SEM 2019/Predictive

setwd("D:/Academics/AY 2019-20/ODD SEM 2019/Predictive

## ID Name Industry Inception Employees State

# we find after running this there are some blanks without NA

## ID Name Industry Inception Employees State

# note now blank spaces are displayed as NA

## ID Name Industry Inception Employees State

## 'data.frame': 500 obs. of 11 variables:

# The dataset has 500 observations of 11 variables.

## ID Name Industry Inception Employees State City

# now check the structure of data set

## 'data.frame': 500 obs. of 11 variables:

## 'data.frame': 500 obs. of 11 variables:

# now we have changed all the data types

!complete.cases(fin) # this will return the opposite of above code (NOT !

## ID Name Industry Inception Employees State

# now we have 12 rows with NA values - <NA> represent NA in strings or

## ID Name Industry Inception Employees State

## ID Name Industry Inception Employees State

fin[!complete.cases(fin),] # again check the missing values reduced from 12

## ID Name Industry Inception Employees State

# next we have to fix NA in employees

## ID Name Industry Inception Employees State

# now find median of employees in retail industry

## ID Name Industry Inception Employees State

# now check whether na is replaced by median value 28

## ID Name Industry Inception Employees State City Revenue

# Repeat the same to fill median of employee in Financial services sector

fin[is.na(fin$Employees) & fin$Industry == "Financial Services", "Employees"]

## ID Name Industry Inception Employees State City

## ID Name Industry Inception Employees State

# let us fill the state if city is new york, then state is NY

## ID Name Industry Inception Employees State City

filter4 <- fin[is.na(fin$State) & fin$City == "San Francisco", ]

## ID Name Industry Inception Employees State City

# now replace respective NA in filter 3 with NY and filter4 with CA

## ID Name Industry Inception Employees State City

## ID Name Industry Inception Employees State City Revenue

# median imputation method do for NA values in growth column

fin[is.na(fin$Growth) & fin$Industry == "Construction", "Growth"] <-

## ID Name Industry Inception Employees State City Revenue

## ID Name Industry Inception Employees State City Revenue

# now let us fix expenses for IT services industry

## ID Name Industry Inception Employees State City Revenue

## ID Name Industry Inception Employees State City Revenue

# now we have only 3 rows left with NA values

## ID Name Industry Inception Employees State City Revenue

# now change expenses for construction industry

## ID Name Industry Inception Employees State City Revenue

# now in these rows profit = revenue - expenses

## ID Name Industry Inception Employees State City Revenue

## ID Name Industry Inception Employees State City Revenue

# Now lets plot - use ggplot2

You might also like