You are on page 1of 17

Ex-1—my-sol.

R
Admin

2019-09-21
# Show the folder R is currently working in.
getwd() # returns the present working directory

## [1] "D:/Academics/AY 2019-20/ODD SEM 2019/Predictive


Analytics/Datasets/Exercises/Ex1"

# setwd() changes the working directory - change this to the folder
# which contains the data file.
# The path has to be one unbroken string: a line break inside the quotes
# becomes part of the path and setwd() then fails. file.path() joins the
# two pieces with "/" so the line stays short.
# NOTE(review): the path was reconstructed assuming the original wrapped at
# the space in "Predictive Analytics" - confirm against the real folder.
setwd(file.path(
  "D:/Academics/AY 2019-20/ODD SEM 2019",
  "Predictive Analytics/Datasets/Exercises/Ex1"
))
# Import the csv file into the object "fin" - you can give any name.
# stringsAsFactors = TRUE keeps text columns as factors, as in the str()
# output shown in this document (since R 4.0.0 the default is FALSE).
fin <- read.csv("Future 500.csv", stringsAsFactors = TRUE)
head(fin, 24) # want to see the first 24 rows

## ID Name Industry Inception Employees State


## 1 1 Over-Hex Software 2006 25 TN
## 2 2 Unimattax IT Services 2009 36 PA
## 3 3 Greenfax Retail 2012 NA SC
## 4 4 Blacklane IT Services 2011 66 CA
## 5 5 Yearflex Software 2013 45 WI
## 6 6 Indigoplanet IT Services 2013 60 NJ
## 7 7 Treslam Financial Services 2009 116 MO
## 8 8 Rednimdox Construction 2013 73 NY
## 9 9 Lamtone IT Services 2009 55 CA
## 10 10 Stripfind Financial Services 2010 25 FL
## 11 11 Canecorporation Health 2012 6
## 12 12 Mattouch IT Services 2013 6 WA
## 13 13 Techdrill Health 2009 9 MS
## 14 14 Techline 2006 65 CA
## 15 15 Cityace 2010 25 CO
## 16 16 Kayelectronics Health 2009 687 NC
## 17 17 Ganzlax IT Services 2011 75 NJ
## 18 18 Trantraxlax Government Services 2011 35 VA
## 19 19 E-Zim Retail 2008 320 OH
## 20 20 Daltfase Software 2011 78 NC
## 21 21 Hotlane Government Services 2012 87 AL
## 22 22 Lathotline Health NA 103 VA
## 23 23 Lambam IT Services 2012 210 SC
## 24 24 Quozap Software 2004 21 NJ
## City Revenue Expenses Profit Growth
## 1 Franklin $9,684,527 1,130,700 Dollars 8553827 19%
## 2 Newtown Square $14,016,543 804,035 Dollars 13212508 20%
## 3 Greenville $9,746,272 1,044,375 Dollars 8701897 16%
## 4 Orange $15,359,369 4,631,808 Dollars 10727561 19%
## 5 Madison $8,567,910 4,374,841 Dollars 4193069 19%
## 6 Manalapan $12,805,452 4,626,275 Dollars 8179177 22%
## 7 Clayton $5,387,469 2,127,984 Dollars 3259485 17%
## 8 Woodside NA
## 9 San Ramon $11,757,018 6,482,465 Dollars 5274553 30%
## 10 Boca Raton $12,329,371 916,455 Dollars 11412916 20%
## 11 New York $10,597,009 7,591,189 Dollars 3005820 7%
## 12 Bellevue $14,026,934 7,429,377 Dollars 6597557 26%
## 13 Flowood $10,573,990 7,435,363 Dollars 3138627 8%
## 14 San Ramon $13,898,119 5,470,303 Dollars 8427816 23%
## 15 Louisville $9,254,614 6,249,498 Dollars 3005116 6%
## 16 Clayton $9,451,943 3,878,113 Dollars 5573830 4%
## 17 Iselin $14,001,180 11901180 18%
## 18 Suffolk $11,088,336 5,635,276 Dollars 5453060 7%
## 19 Monroe $10,746,451 4,762,319 Dollars 5984132 13%
## 20 Durham $10,410,628 6,196,409 Dollars 4214219 17%
## 21 Huntsville $7,978,332 5,686,574 Dollars 2291758 2%
## 22 McLean $9,418,303 7,567,233 Dollars 1851070 2%
## 23 Columbia $11,950,148 4,365,512 Dollars 7584636 20%
## 24 Collingswood $8,304,480 7,019,973 Dollars 1284507 20%

# After running this we find some blank cells that are not shown as NA.
# This is because R reads an empty cell in a text column as the empty
# string "", not as a missing value.
# How to fix this: go back to the import line and pass na.strings = "" so
# that R treats blank cells as NA.
# stringsAsFactors = TRUE keeps text columns as factors, as in the str()
# output shown in this document (since R 4.0.0 the default is FALSE).
fin <- read.csv("Future 500.csv", na.strings = "", stringsAsFactors = TRUE)
head(fin, 24)

## ID Name Industry Inception Employees State


## 1 1 Over-Hex Software 2006 25 TN
## 2 2 Unimattax IT Services 2009 36 PA
## 3 3 Greenfax Retail 2012 NA SC
## 4 4 Blacklane IT Services 2011 66 CA
## 5 5 Yearflex Software 2013 45 WI
## 6 6 Indigoplanet IT Services 2013 60 NJ
## 7 7 Treslam Financial Services 2009 116 MO
## 8 8 Rednimdox Construction 2013 73 NY
## 9 9 Lamtone IT Services 2009 55 CA
## 10 10 Stripfind Financial Services 2010 25 FL
## 11 11 Canecorporation Health 2012 6 <NA>
## 12 12 Mattouch IT Services 2013 6 WA
## 13 13 Techdrill Health 2009 9 MS
## 14 14 Techline <NA> 2006 65 CA
## 15 15 Cityace <NA> 2010 25 CO
## 16 16 Kayelectronics Health 2009 687 NC
## 17 17 Ganzlax IT Services 2011 75 NJ
## 18 18 Trantraxlax Government Services 2011 35 VA
## 19 19 E-Zim Retail 2008 320 OH
## 20 20 Daltfase Software 2011 78 NC
## 21 21 Hotlane Government Services 2012 87 AL
## 22 22 Lathotline Health NA 103 VA
## 23 23 Lambam IT Services 2012 210 SC
## 24 24 Quozap Software 2004 21 NJ
## City Revenue Expenses Profit Growth
## 1 Franklin $9,684,527 1,130,700 Dollars 8553827 19%
## 2 Newtown Square $14,016,543 804,035 Dollars 13212508 20%
## 3 Greenville $9,746,272 1,044,375 Dollars 8701897 16%
## 4 Orange $15,359,369 4,631,808 Dollars 10727561 19%
## 5 Madison $8,567,910 4,374,841 Dollars 4193069 19%
## 6 Manalapan $12,805,452 4,626,275 Dollars 8179177 22%
## 7 Clayton $5,387,469 2,127,984 Dollars 3259485 17%
## 8 Woodside <NA> <NA> NA <NA>
## 9 San Ramon $11,757,018 6,482,465 Dollars 5274553 30%
## 10 Boca Raton $12,329,371 916,455 Dollars 11412916 20%
## 11 New York $10,597,009 7,591,189 Dollars 3005820 7%
## 12 Bellevue $14,026,934 7,429,377 Dollars 6597557 26%
## 13 Flowood $10,573,990 7,435,363 Dollars 3138627 8%
## 14 San Ramon $13,898,119 5,470,303 Dollars 8427816 23%
## 15 Louisville $9,254,614 6,249,498 Dollars 3005116 6%
## 16 Clayton $9,451,943 3,878,113 Dollars 5573830 4%
## 17 Iselin $14,001,180 <NA> 11901180 18%
## 18 Suffolk $11,088,336 5,635,276 Dollars 5453060 7%
## 19 Monroe $10,746,451 4,762,319 Dollars 5984132 13%
## 20 Durham $10,410,628 6,196,409 Dollars 4214219 17%
## 21 Huntsville $7,978,332 5,686,574 Dollars 2291758 2%
## 22 McLean $9,418,303 7,567,233 Dollars 1851070 2%
## 23 Columbia $11,950,148 4,365,512 Dollars 7584636 20%
## 24 Collingswood $8,304,480 7,019,973 Dollars 1284507 20%

# Blank cells are now displayed as missing values.
# A missing value in an integer/num column is printed as NA;
# a missing value in a factor or character column is printed as <NA>.
tail(fin, n = 6) # the last 6 rows (6 is also the default for n)

## ID Name Industry Inception Employees State


## 495 495 Rawfishcomplete Financial Services 2012 124 CA
## 496 496 Buretteadmirable IT Services 2009 93 ME
## 497 497 Inventtremendous Construction 2009 24 MN
## 498 498 Overviewparrot Retail 2011 7125 TX
## 499 499 Belaguerra IT Services 2010 140 MI
## 500 500 Allpossible IT Services 2011 24 CA
## City Revenue Expenses Profit Growth
## 495 Los Angeles $10,624,949 2,951,178 Dollars 7673771 22%
## 496 Portland $15,407,450 2,833,136 Dollars 12574314 25%
## 497 Woodbury $9,144,857 4,755,995 Dollars 4388862 11%
## 498 Fort Worth $11,134,728 5,152,110 Dollars 5982618 12%
## 499 Troy $17,387,130 1,387,784 Dollars 15999346 23%
## 500 Los Angeles $11,949,706 689,161 Dollars 11260545 24%
# Inspect the type of every column before any cleaning is done.
str(fin) # returns the structure of the data

## 'data.frame': 500 obs. of 11 variables:


## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Name : Factor w/ 500 levels "Abstractedchocolat",..: 297 451 168 40
485 199 435 339 242 395 ...
## $ Industry : Factor w/ 7 levels "Construction",..: 7 5 6 5 7 5 2 1 5
2 ...
## $ Inception: int 2006 2009 2012 2011 2013 2013 2009 2013 2009 2010 ...
## $ Employees: int 25 36 NA 66 45 60 116 73 55 25 ...
## $ State : Factor w/ 42 levels "AL","AZ","CA",..: 36 33 35 3 41 27 22
29 3 8 ...
## $ City : Factor w/ 297 levels "Addison","Alexandria",..: 94 181 105
195 151 154 53 295 232 26 ...
## $ Revenue : Factor w/ 498 levels "$1,614,585","$1,835,717",..: 479 194
485 246 402 141 308 NA 96 117 ...
## $ Expenses : Factor w/ 497 levels "1,026,548 Dollars",..: 6 485 3 248 227
247 57 NA 402 495 ...
## $ Profit : int 8553827 13212508 8701897 10727561 4193069 8179177
3259485 NA 5274553 11412916 ...
## $ Growth : Factor w/ 32 levels "-2%","-3%","0%",..: 14 16 11 14 14 18
12 NA 26 16 ...

# The dataset has 500 observations of 11 variables.

# Convert ID and Inception from integer to factor (factors are the
# categorical variables in R).
fin$ID <- as.factor(fin$ID)
fin$Inception <- as.factor(fin$Inception)

# Next we want Revenue, Expenses and Growth as numbers instead of factors.
# First remove the unwanted symbols ($ , % and " Dollars") with gsub();
# gsub() returns character vectors, the numeric conversion comes later.
# "$" is a special character in a regular expression (end of string), so it
# is escaped with a double backslash: "\\$". Replace it with nothing ("").
fin$Revenue <- gsub("\\$", "", fin$Revenue)
# Remove the "," thousands separators and store the result back in the
# Revenue column of fin.
fin$Revenue <- gsub(",", "", fin$Revenue)
# Repeat the same process for the unwanted symbols in Growth and Expenses.
fin$Growth <- gsub("%", "", fin$Growth)
fin$Expenses <- gsub(",", "", fin$Expenses)
fin$Expenses <- gsub(" Dollars", "", fin$Expenses)
head(fin) # check all necessary changes are done

## ID Name Industry Inception Employees State City


## 1 1 Over-Hex Software 2006 25 TN Franklin
## 2 2 Unimattax IT Services 2009 36 PA Newtown Square
## 3 3 Greenfax Retail 2012 NA SC Greenville
## 4 4 Blacklane IT Services 2011 66 CA Orange
## 5 5 Yearflex Software 2013 45 WI Madison
## 6 6 Indigoplanet IT Services 2013 60 NJ Manalapan
## Revenue Expenses Profit Growth
## 1 9684527 1130700 8553827 19
## 2 14016543 804035 13212508 20
## 3 9746272 1044375 8701897 16
## 4 15359369 4631808 10727561 19
## 5 8567910 4374841 4193069 19
## 6 12805452 4626275 8179177 22

# Now check the structure of the data set again: gsub() returns character
# vectors, so Revenue, Expenses and Growth should show up as chr.

str(fin)

## 'data.frame': 500 obs. of 11 variables:


## $ ID : Factor w/ 500 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9
10 ...
## $ Name : Factor w/ 500 levels "Abstractedchocolat",..: 297 451 168 40
485 199 435 339 242 395 ...
## $ Industry : Factor w/ 7 levels "Construction",..: 7 5 6 5 7 5 2 1 5
2 ...
## $ Inception: Factor w/ 16 levels "1999","2000",..: 8 11 14 13 15 15 11 15
11 12 ...
## $ Employees: int 25 36 NA 66 45 60 116 73 55 25 ...
## $ State : Factor w/ 42 levels "AL","AZ","CA",..: 36 33 35 3 41 27 22
29 3 8 ...
## $ City : Factor w/ 297 levels "Addison","Alexandria",..: 94 181 105
195 151 154 53 295 232 26 ...
## $ Revenue : chr "9684527" "14016543" "9746272" "15359369" ...
## $ Expenses : chr "1130700" "804035" "1044375" "4631808" ...
## $ Profit : int 8553827 13212508 8701897 10727561 4193069 8179177
3259485 NA 5274553 11412916 ...
## $ Growth : chr "19" "20" "16" "19" ...

# Revenue, Expenses and Growth are stored as character after using gsub(),
# but we want them to be numeric.
fin$Revenue <- as.numeric(fin$Revenue)
fin$Expenses <- as.numeric(fin$Expenses)
fin$Growth <- as.numeric(fin$Growth)
str(fin) # check again

## 'data.frame': 500 obs. of 11 variables:


## $ ID : Factor w/ 500 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9
10 ...
## $ Name : Factor w/ 500 levels "Abstractedchocolat",..: 297 451 168 40
485 199 435 339 242 395 ...
## $ Industry : Factor w/ 7 levels "Construction",..: 7 5 6 5 7 5 2 1 5
2 ...
## $ Inception: Factor w/ 16 levels "1999","2000",..: 8 11 14 13 15 15 11 15
11 12 ...
## $ Employees: int 25 36 NA 66 45 60 116 73 55 25 ...
## $ State : Factor w/ 42 levels "AL","AZ","CA",..: 36 33 35 3 41 27 22
29 3 8 ...
## $ City : Factor w/ 297 levels "Addison","Alexandria",..: 94 181 105
195 151 154 53 295 232 26 ...
## $ Revenue : num 9684527 14016543 9746272 15359369 8567910 ...
## $ Expenses : num 1130700 804035 1044375 4631808 4374841 ...
## $ Profit : int 8553827 13212508 8701897 10727561 4193069 8179177
3259485 NA 5274553 11412916 ...
## $ Growth : num 19 20 16 19 19 22 17 NA 30 20 ...

# Now we have changed all the data types.

# Find the rows with missing values using the complete.cases() function.
# NA in R means the data is missing.
# complete.cases() returns TRUE for a row with no missing values and FALSE
# for a row that has at least one NA.
complete.cases(fin)

## [1] TRUE TRUE FALSE TRUE TRUE TRUE TRUE FALSE TRUE TRUE FALSE
## [12] TRUE TRUE FALSE FALSE TRUE FALSE TRUE TRUE TRUE TRUE FALSE
## [23] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [34] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
## [45] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [56] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [67] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [78] TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE
## [89] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [100] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [111] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [122] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [133] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [144] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [155] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [166] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [177] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [188] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [199] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [210] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [221] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [232] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [243] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [254] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [265] TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [276] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [287] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [298] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [309] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [320] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [331] TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [342] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [353] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [364] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [375] TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE
## [386] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [397] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [408] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [419] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [430] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [441] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [452] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [463] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [474] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [485] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [496] TRUE TRUE TRUE TRUE TRUE

# The NOT operator (!) returns the opposite of the code above: TRUE now
# marks a row that has missing data.
!complete.cases(fin)

## [1] FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE
## [12] FALSE FALSE TRUE TRUE FALSE TRUE FALSE FALSE FALSE FALSE TRUE
## [23] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [34] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [45] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [56] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [67] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [78] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## [89] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [100] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [111] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [122] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [144] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [155] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [166] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [177] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [188] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [199] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [210] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [221] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [232] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [243] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [254] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [265] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [276] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [287] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [298] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [309] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [320] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [331] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [342] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [353] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [364] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [375] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [386] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [397] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [408] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [419] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [430] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [441] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [452] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [463] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [474] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [485] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [496] FALSE FALSE FALSE FALSE FALSE

# Filter from the dataset only the rows with missing values.
# The form is data[rows, columns], similar to picking row 2, column 3 of a
# matrix with m[2, 3]; leaving the column part empty keeps all columns.
fin[!complete.cases(fin), ]

## ID Name Industry Inception Employees State


## 3 3 Greenfax Retail 2012 NA SC
## 8 8 Rednimdox Construction 2013 73 NY
## 11 11 Canecorporation Health 2012 6 <NA>
## 14 14 Techline <NA> 2006 65 CA
## 15 15 Cityace <NA> 2010 25 CO
## 17 17 Ganzlax IT Services 2011 75 NJ
## 22 22 Lathotline Health <NA> 103 VA
## 44 44 Ganzgreen Construction 2010 224 TN
## 84 84 Drilldrill Software 2010 30 <NA>
## 267 267 Circlechop Software 2010 14 <NA>
## 332 332 Westminster Financial Services 2010 NA MI
## 379 379 Stovepuck Retail 2013 73 <NA>
## City Revenue Expenses Profit Growth
## 3 Greenville 9746272 1044375 8701897 16
## 8 Woodside NA NA NA NA
## 11 New York 10597009 7591189 3005820 7
## 14 San Ramon 13898119 5470303 8427816 23
## 15 Louisville 9254614 6249498 3005116 6
## 17 Iselin 14001180 NA 11901180 18
## 22 McLean 9418303 7567233 1851070 2
## 44 Franklin NA NA NA 9
## 84 San Francisco 7800620 2785799 5014821 17
## 267 San Francisco 9067070 5929828 3137242 20
## 332 Troy 11861652 5245126 6616526 15
## 379 New York 13814975 5904502 7910473 10

# Now we have 12 rows with NA values. <NA> represents a missing value in a
# factor or character column; NA represents a missing value in an integer
# or num column.
# Let's fix the missing values in the Industry column first.
# How to do that: delete rows 14 and 15, which have NA in Industry, because
# that data cannot be obtained.
# Before we start deleting rows, let us create a backup of our dataset.
fin.backup <- fin
# Keep every row whose Industry (the 3rd column) is not NA, with all
# columns, and store the result back in the fin object.
fin <- fin[!is.na(fin$Industry), ]
head(fin, 24) # rows 14 and 15 are deleted

## ID Name Industry Inception Employees State


## 1 1 Over-Hex Software 2006 25 TN
## 2 2 Unimattax IT Services 2009 36 PA
## 3 3 Greenfax Retail 2012 NA SC
## 4 4 Blacklane IT Services 2011 66 CA
## 5 5 Yearflex Software 2013 45 WI
## 6 6 Indigoplanet IT Services 2013 60 NJ
## 7 7 Treslam Financial Services 2009 116 MO
## 8 8 Rednimdox Construction 2013 73 NY
## 9 9 Lamtone IT Services 2009 55 CA
## 10 10 Stripfind Financial Services 2010 25 FL
## 11 11 Canecorporation Health 2012 6 <NA>
## 12 12 Mattouch IT Services 2013 6 WA
## 13 13 Techdrill Health 2009 9 MS
## 16 16 Kayelectronics Health 2009 687 NC
## 17 17 Ganzlax IT Services 2011 75 NJ
## 18 18 Trantraxlax Government Services 2011 35 VA
## 19 19 E-Zim Retail 2008 320 OH
## 20 20 Daltfase Software 2011 78 NC
## 21 21 Hotlane Government Services 2012 87 AL
## 22 22 Lathotline Health <NA> 103 VA
## 23 23 Lambam IT Services 2012 210 SC
## 24 24 Quozap Software 2004 21 NJ
## 25 25 Tampware Construction 2011 13 TX
## 26 26 Dalthow Health 2000 20 GA
## City Revenue Expenses Profit Growth
## 1 Franklin 9684527 1130700 8553827 19
## 2 Newtown Square 14016543 804035 13212508 20
## 3 Greenville 9746272 1044375 8701897 16
## 4 Orange 15359369 4631808 10727561 19
## 5 Madison 8567910 4374841 4193069 19
## 6 Manalapan 12805452 4626275 8179177 22
## 7 Clayton 5387469 2127984 3259485 17
## 8 Woodside NA NA NA NA
## 9 San Ramon 11757018 6482465 5274553 30
## 10 Boca Raton 12329371 916455 11412916 20
## 11 New York 10597009 7591189 3005820 7
## 12 Bellevue 14026934 7429377 6597557 26
## 13 Flowood 10573990 7435363 3138627 8
## 16 Clayton 9451943 3878113 5573830 4
## 17 Iselin 14001180 NA 11901180 18
## 18 Suffolk 11088336 5635276 5453060 7
## 19 Monroe 10746451 4762319 5984132 13
## 20 Durham 10410628 6196409 4214219 17
## 21 Huntsville 7978332 5686574 2291758 2
## 22 McLean 9418303 7567233 1851070 2
## 23 Columbia 11950148 4365512 7584636 20
## 24 Collingswood 8304480 7019973 1284507 20
## 25 Houston 9785982 2910756 6875226 11
## 26 Dacula 10800718 7731820 3068898 7

# After deleting rows 14 and 15 the row numbers are not reset,
# i.e. after 13 you will find 16.
# Setting the row names to NULL resets the numbering to 1, 2, 3, ...
rownames(fin) <- NULL
head(fin, 24) # check that the row numbers are in order

## ID Name Industry Inception Employees State


## 1 1 Over-Hex Software 2006 25 TN
## 2 2 Unimattax IT Services 2009 36 PA
## 3 3 Greenfax Retail 2012 NA SC
## 4 4 Blacklane IT Services 2011 66 CA
## 5 5 Yearflex Software 2013 45 WI
## 6 6 Indigoplanet IT Services 2013 60 NJ
## 7 7 Treslam Financial Services 2009 116 MO
## 8 8 Rednimdox Construction 2013 73 NY
## 9 9 Lamtone IT Services 2009 55 CA
## 10 10 Stripfind Financial Services 2010 25 FL
## 11 11 Canecorporation Health 2012 6 <NA>
## 12 12 Mattouch IT Services 2013 6 WA
## 13 13 Techdrill Health 2009 9 MS
## 14 16 Kayelectronics Health 2009 687 NC
## 15 17 Ganzlax IT Services 2011 75 NJ
## 16 18 Trantraxlax Government Services 2011 35 VA
## 17 19 E-Zim Retail 2008 320 OH
## 18 20 Daltfase Software 2011 78 NC
## 19 21 Hotlane Government Services 2012 87 AL
## 20 22 Lathotline Health <NA> 103 VA
## 21 23 Lambam IT Services 2012 210 SC
## 22 24 Quozap Software 2004 21 NJ
## 23 25 Tampware Construction 2011 13 TX
## 24 26 Dalthow Health 2000 20 GA
## City Revenue Expenses Profit Growth
## 1 Franklin 9684527 1130700 8553827 19
## 2 Newtown Square 14016543 804035 13212508 20
## 3 Greenville 9746272 1044375 8701897 16
## 4 Orange 15359369 4631808 10727561 19
## 5 Madison 8567910 4374841 4193069 19
## 6 Manalapan 12805452 4626275 8179177 22
## 7 Clayton 5387469 2127984 3259485 17
## 8 Woodside NA NA NA NA
## 9 San Ramon 11757018 6482465 5274553 30
## 10 Boca Raton 12329371 916455 11412916 20
## 11 New York 10597009 7591189 3005820 7
## 12 Bellevue 14026934 7429377 6597557 26
## 13 Flowood 10573990 7435363 3138627 8
## 14 Clayton 9451943 3878113 5573830 4
## 15 Iselin 14001180 NA 11901180 18
## 16 Suffolk 11088336 5635276 5453060 7
## 17 Monroe 10746451 4762319 5984132 13
## 18 Durham 10410628 6196409 4214219 17
## 19 Huntsville 7978332 5686574 2291758 2
## 20 McLean 9418303 7567233 1851070 2
## 21 Columbia 11950148 4365512 7584636 20
## 22 Collingswood 8304480 7019973 1284507 20
## 23 Houston 9785982 2910756 6875226 11
## 24 Dacula 10800718 7731820 3068898 7

# Check again: the rows with missing values are reduced from 12 to 10.
fin[!complete.cases(fin), ]

## ID Name Industry Inception Employees State


## 3 3 Greenfax Retail 2012 NA SC
## 8 8 Rednimdox Construction 2013 73 NY
## 11 11 Canecorporation Health 2012 6 <NA>
## 15 17 Ganzlax IT Services 2011 75 NJ
## 20 22 Lathotline Health <NA> 103 VA
## 42 44 Ganzgreen Construction 2010 224 TN
## 82 84 Drilldrill Software 2010 30 <NA>
## 265 267 Circlechop Software 2010 14 <NA>
## 330 332 Westminster Financial Services 2010 NA MI
## 377 379 Stovepuck Retail 2013 73 <NA>
## City Revenue Expenses Profit Growth
## 3 Greenville 9746272 1044375 8701897 16
## 8 Woodside NA NA NA NA
## 11 New York 10597009 7591189 3005820 7
## 15 Iselin 14001180 NA 11901180 18
## 20 McLean 9418303 7567233 1851070 2
## 42 Franklin NA NA NA 9
## 82 San Francisco 7800620 2785799 5014821 17
## 265 San Francisco 9067070 5929828 3137242 20
## 330 Troy 11861652 5245126 6616526 15
## 377 New York 13814975 5904502 7910473 10

# Next, fix the NA values in Employees.
# List all rows with NA in the Employees column:
# there are two such rows, row 3 and row 330.
fin[which(is.na(fin$Employees)), ]

## ID Name Industry Inception Employees State


## 3 3 Greenfax Retail 2012 NA SC
## 330 332 Westminster Financial Services 2010 NA MI
## City Revenue Expenses Profit Growth
## 3 Greenville 9746272 1044375 8701897 16
## 330 Troy 11861652 5245126 6616526 15
# Median imputation method.
# na.rm = TRUE calculates the median without considering the NA values.
median(fin$Employees, na.rm = TRUE)

## [1] 56

# Median imputation within an industry: find the median number of
# employees in the Retail industry.
# Take the Employees values of all Retail rows ...
filter1 <- fin$Employees[fin$Industry == "Retail"]
# ... and compute their median, ignoring NA.
med.emp.retail <- median(filter1, na.rm = TRUE)
print(med.emp.retail)

## [1] 28

# Replace the NA in Employees for the Retail sector with the calculated
# median value. The column is addressed by name rather than by position so
# the code keeps working if the column order changes.
fin[is.na(fin$Employees) & fin$Industry == "Retail", "Employees"] <-
  med.emp.retail
# Show the two rows that had NA in Employees.
fin[c(3, 330), ]

## ID Name Industry Inception Employees State


## 3 3 Greenfax Retail 2012 28 SC
## 330 332 Westminster Financial Services 2010 NA MI
## City Revenue Expenses Profit Growth
## 3 Greenville 9746272 1044375 8701897 16
## 330 Troy 11861652 5245126 6616526 15

# Check that the NA in row 3 was replaced by the Retail median value 28.

fin[3,]

## ID Name Industry Inception Employees State City Revenue


## 3 3 Greenfax Retail 2012 28 SC Greenville 9746272
## Expenses Profit Growth
## 3 1044375 8701897 16

# Repeat the same for the Financial Services sector:
# median of Employees over the Financial Services rows, ignoring NA.
filter2 <- fin$Employees[fin$Industry == "Financial Services"]
median.emp.fs <- median(filter2, na.rm = TRUE)
print(median.emp.fs)

## [1] 80

# Replace the NA in Employees for the Financial Services sector with the
# median. The "<-" must end the first line: if it starts a new line, R
# treats the first line as a complete expression and the assignment fails.
fin[is.na(fin$Employees) & fin$Industry == "Financial Services", "Employees"] <-
  median.emp.fs
# Check the row that had the NA.
fin[330, ]

## ID Name Industry Inception Employees State City


## 330 332 Westminster Financial Services 2010 80 MI Troy
## Revenue Expenses Profit Growth
## 330 11861652 5245126 6616526 15
# List the rows that still contain at least one missing value.
fin[!complete.cases(fin),]

## ID Name Industry Inception Employees State


## 8 8 Rednimdox Construction 2013 73 NY
## 11 11 Canecorporation Health 2012 6 <NA>
## 15 17 Ganzlax IT Services 2011 75 NJ
## 20 22 Lathotline Health <NA> 103 VA
## 42 44 Ganzgreen Construction 2010 224 TN
## 82 84 Drilldrill Software 2010 30 <NA>
## 265 267 Circlechop Software 2010 14 <NA>
## 377 379 Stovepuck Retail 2013 73 <NA>
## City Revenue Expenses Profit Growth
## 8 Woodside NA NA NA NA
## 11 New York 10597009 7591189 3005820 7
## 15 Iselin 14001180 NA 11901180 18
## 20 McLean 9418303 7567233 1851070 2
## 42 Franklin NA NA NA 9
## 82 San Francisco 7800620 2785799 5014821 17
## 265 San Francisco 9067070 5929828 3137242 20
## 377 New York 13814975 5904502 7910473 10

# Fill in the missing State from the City:
#   if the city is New York, the state is NY;
#   if the city is San Francisco, the state is CA.
# First look at the rows with NA in State and New York as City
# (the San Francisco rows are filtered in the next step).
filter3 <- fin[fin$City == "New York" & is.na(fin$State), ]
print(filter3)

## ID Name Industry Inception Employees State City


## 11 11 Canecorporation Health 2012 6 <NA> New York
## 377 379 Stovepuck Retail 2013 73 <NA> New York
## Revenue Expenses Profit Growth
## 11 10597009 7591189 3005820 7
## 377 13814975 5904502 7910473 10

# Rows with NA in State and San Francisco as City.
filter4 <- fin[fin$City == "San Francisco" & is.na(fin$State), ]
print(filter4)

## ID Name Industry Inception Employees State City


## 82 84 Drilldrill Software 2010 30 <NA> San Francisco
## 265 267 Circlechop Software 2010 14 <NA> San Francisco
## Revenue Expenses Profit Growth
## 82 7800620 2785799 5014821 17
## 265 9067070 5929828 3137242 20

# Replace the NA states found above: NY for the New York rows (11 and 377)
# and CA for the San Francisco rows (82 and 265).
fin[fin$City == "New York" & is.na(fin$State), "State"] <- "NY"
fin[fin$City == "San Francisco" & is.na(fin$State), "State"] <- "CA"
# Check that the NA values were replaced by NY and CA.
fin[c(11, 377, 82, 265), ]

## ID Name Industry Inception Employees State City


## 11 11 Canecorporation Health 2012 6 NY New York
## 377 379 Stovepuck Retail 2013 73 NY New York
## 82 84 Drilldrill Software 2010 30 CA San Francisco
## 265 267 Circlechop Software 2010 14 CA San Francisco
## Revenue Expenses Profit Growth
## 11 10597009 7591189 3005820 7
## 377 13814975 5904502 7910473 10
## 82 7800620 2785799 5014821 17
## 265 9067070 5929828 3137242 20

# List the rows that still have missing values after filling in State.
fin[!complete.cases(fin),]

## ID Name Industry Inception Employees State City Revenue


## 8 8 Rednimdox Construction 2013 73 NY Woodside NA
## 15 17 Ganzlax IT Services 2011 75 NJ Iselin 14001180
## 20 22 Lathotline Health <NA> 103 VA McLean 9418303
## 42 44 Ganzgreen Construction 2010 224 TN Franklin NA
## Expenses Profit Growth
## 8 NA NA NA
## 15 NA 11901180 18
## 20 7567233 1851070 2
## 42 NA NA 9

# Median imputation for the NA values in the Growth column,
# done the same way as for the Employees data.
# A new name (filter5) is used here so that the filter4 data frame created
# earlier is not overwritten by this numeric vector.
filter5 <- fin[fin$Industry == "Construction", "Growth"]
filter5

## [1] NA 11 15 13 9 12 13 11 19 5 8 8 8 12 6 10 15 9 16 12 14 9 8
## [24] 5 10 12 8 15 11 9 7 11 12 8 9 5 7 12 9 5 9 11 8 7 12 8
## [47] 7 11 11 11

# Replace the NA in Growth for Construction with the Construction median.
fin[is.na(fin$Growth) & fin$Industry == "Construction", "Growth"] <-
  median(filter5, na.rm = TRUE)
# The Growth NA was in row 8 - check it.
fin[8, ]

## ID Name Industry Inception Employees State City Revenue


## 8 8 Rednimdox Construction 2013 73 NY Woodside NA
## Expenses Profit Growth
## 8 NA NA 10

# List the rows that still have missing values after imputing Growth.
fin[!complete.cases(fin),]

## ID Name Industry Inception Employees State City Revenue


## 8 8 Rednimdox Construction 2013 73 NY Woodside NA
## 15 17 Ganzlax IT Services 2011 75 NJ Iselin 14001180
## 20 22 Lathotline Health <NA> 103 VA McLean 9418303
## 42 44 Ganzgreen Construction 2010 224 TN Franklin NA
## Expenses Profit Growth
## 8 NA NA 10
## 15 NA 11901180 18
## 20 7567233 1851070 2
## 42 NA NA 9

# Fix the missing Expenses of the IT Services company (row 15) from the
# other two figures: Expenses = Revenue - Profit.
# The formula is applied to every row with NA in Expenses; where Revenue or
# Profit is also NA the result is NA, so those rows stay missing for now.
fin[is.na(fin$Expenses), "Expenses"] <-
  fin$Revenue[is.na(fin$Expenses)] - fin$Profit[is.na(fin$Expenses)]
# Check row 15.
fin[15, ]

## ID Name Industry Inception Employees State City Revenue


## 15 17 Ganzlax IT Services 2011 75 NJ Iselin 14001180
## Expenses Profit Growth
## 15 2100000 11901180 18

# List the rows that still have missing values after deriving Expenses.
fin[!complete.cases(fin),]

## ID Name Industry Inception Employees State City Revenue


## 8 8 Rednimdox Construction 2013 73 NY Woodside NA
## 20 22 Lathotline Health <NA> 103 VA McLean 9418303
## 42 44 Ganzgreen Construction 2010 224 TN Franklin NA
## Expenses Profit Growth
## 8 NA NA 10
## 20 7567233 1851070 2
## 42 NA NA 9

# Only 3 rows with NA values are left now.
# Fix Revenue (and, in the next step, Expenses) for the Construction
# industry with median imputation.
med.construction.revenue <- median(
  fin$Revenue[fin$Industry == "Construction"],
  na.rm = TRUE
)
fin[fin$Industry == "Construction" & is.na(fin$Revenue), "Revenue"] <-
  med.construction.revenue
# Check the change in rows 8 and 42.
fin[c(8, 42), ]

## ID Name Industry Inception Employees State City Revenue


## 8 8 Rednimdox Construction 2013 73 NY Woodside 9055059
## 42 44 Ganzgreen Construction 2010 224 TN Franklin 9055059
## Expenses Profit Growth
## 8 NA NA 10
## 42 NA NA 9

# Now do the same for Expenses in the Construction industry:
# median of the Construction expenses, ignoring NA, written into the rows
# where Expenses is missing.
med.construction.expenses <- median(
  fin$Expenses[fin$Industry == "Construction"],
  na.rm = TRUE
)
fin[fin$Industry == "Construction" & is.na(fin$Expenses), "Expenses"] <-
  med.construction.expenses
fin[c(8, 42), ]

## ID Name Industry Inception Employees State City Revenue


## 8 8 Rednimdox Construction 2013 73 NY Woodside 9055059
## 42 44 Ganzgreen Construction 2010 224 TN Franklin 9055059
## Expenses Profit Growth
## 8 4506976 NA 10
## 42 4506976 NA 9

# For the rows with NA in Profit: Profit = Revenue - Expenses.
fin[is.na(fin$Profit), "Profit"] <-
  fin$Revenue[is.na(fin$Profit)] - fin$Expenses[is.na(fin$Profit)]
# Check rows 8 and 42.
fin[c(8, 42), ]

## ID Name Industry Inception Employees State City Revenue


## 8 8 Rednimdox Construction 2013 73 NY Woodside 9055059
## 42 44 Ganzgreen Construction 2010 224 TN Franklin 9055059
## Expenses Profit Growth
## 8 4506976 4548083 10
## 42 4506976 4548083 9

# List the rows that still have missing values after filling in Profit.
fin[!complete.cases(fin),]

## ID Name Industry Inception Employees State City Revenue


## 20 22 Lathotline Health <NA> 103 VA McLean 9418303
## Expenses Profit Growth
## 20 7567233 1851070 2

# Now we are left with only one NA value, which can be ignored - the year
# of inception is not required for any analysis.
# ............ NOW OUR DATA IS CLEAN ...........

# Now let's plot - use ggplot2.

# Visualizing plots
# If ggplot2 is not installed, library(ggplot2) gives an error. In that
# case run install.packages("ggplot2") once, then run library(ggplot2)
# again.
library(ggplot2)

# Plot 1: a scatterplot classified by industry showing revenue, expenses
# and profit.
plot1 <- ggplot(data = fin) # create the plot object plot1 from fin
# x axis = Revenue, y axis = Expenses; the points are coloured industry
# wise and the size of each point varies with the profit.
plot1 + geom_point(aes(x = Revenue,
                       y = Expenses,
                       colour = Industry,
                       size = Profit))

# Homework - try to plot the other types of plots required:
# plot2: a scatterplot that includes industry trends for the
#        expenses ~ revenue relationship.
# plot3: boxplots showing growth by industry.
# Refer to the cheat sheets circulated earlier.

You might also like