You are on page 1 of 4

pda-da-2.

karth

2024-03-30

library(tidyverse)

## Warning: package ’tidyverse’ was built under R version 4.3.3

## Warning: package ’ggplot2’ was built under R version 4.3.2

## Warning: package ’tibble’ was built under R version 4.3.2

## Warning: package ’tidyr’ was built under R version 4.3.2

## Warning: package ’readr’ was built under R version 4.3.3

## Warning: package ’purrr’ was built under R version 4.3.2

## Warning: package ’dplyr’ was built under R version 4.3.2

## Warning: package ’stringr’ was built under R version 4.3.3

## Warning: package ’forcats’ was built under R version 4.3.3

## Warning: package ’lubridate’ was built under R version 4.3.3

## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --


## v dplyr 1.1.4 v readr 2.1.5
## v forcats 1.0.0 v stringr 1.5.1
## v ggplot2 3.4.4 v tibble 3.2.1
## v lubridate 1.9.3 v tidyr 1.3.0
## v purrr 1.0.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## i Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# ggplot2 is already attached by tidyverse; loaded explicitly here as well.
library(ggplot2)

# Read the NYC dog licensing CSV into a data frame.
data <- read.csv("C://Users//karth//OneDrive//Desktop//R-CODES//fol//NYC_Dog_Licensing_Dataset.csv")

# Preview the first rows.
head(data)

1
## RowNumber AnimalName AnimalGender AnimalBirthMonth BreedName Borough
## 1 1753 SHADOW M 01/01/2000 12:00:00 AM Beagle Brooklyn
## 2 2415 ROCCO M 10/01/2011 12:00:00 AM Boxer Brooklyn
## 3 3328 LUIGI M 09/01/2005 12:00:00 AM Maltese Bronx
## 4 7537 PETUNIA F 08/01/2013 12:00:00 AM Pug Brooklyn
## 5 8487 ROMEO M 10/01/2008 12:00:00 AM Maltese Bronx
## 6 10503 BRANDY M 01/01/2004 12:00:00 AM Unknown Brooklyn
## ZipCode CommunityDistrict CensusTract2010 NTA CityCouncilDistrict
## 1 11236 318 1014 BK50 46
## 2 11210 314 756 BK43 45
## 3 10464 210 516 BX10 13
## 4 11221 304 419 BK78 34
## 5 10451 201 65 BX34 17
## 6 11225 309 800 BK60 40
## CongressionalDistrict StateSenatorialDistrict LicenseIssuedDate
## 1 8 19 12/29/2014
## 2 9 17 01/07/2015
## 3 14 34 01/17/2015
## 4 7 18 03/01/2015
## 5 15 32 03/09/2015
## 6 9 20 03/27/2015
## LicenseExpiredDate
## 1 01/30/2016
## 2 01/30/2016
## 3 02/02/2016
## 4 03/28/2016
## 5 03/09/2016
## 6 03/29/2016

summary(data)

## RowNumber AnimalName AnimalGender AnimalBirthMonth


## Min. : 1 Length:121949 Length:121949 Length:121949
## 1st Qu.: 30491 Class :character Class :character Class :character
## Median : 61250 Mode :character Mode :character Mode :character
## Mean : 61135
## 3rd Qu.: 91738
## Max. :122229
##
## BreedName Borough ZipCode CommunityDistrict
## Length:121949 Length:121949 Min. : 121 Min. :101.0
## Class :character Class :character 1st Qu.:10029 1st Qu.:108.0
## Mode :character Mode :character Median :10465 Median :302.0
## Mean :10678 Mean :265.2
## 3rd Qu.:11228 3rd Qu.:403.0
## Max. :94608 Max. :595.0
## NA’s :1 NA’s :3337
## CensusTract2010 NTA CityCouncilDistrict
## Length:121949 Length:121949 Min. : 1.00
## Class :character Class :character 1st Qu.: 6.00
## Mode :character Mode :character Median :22.00
## Mean :22.83
## 3rd Qu.:37.00
## Max. :51.00

2
## NA’s :3337
## CongressionalDistrict StateSenatorialDistrict LicenseIssuedDate
## Min. : 3.00 Min. :10.00 Length:121949
## 1st Qu.: 8.00 1st Qu.:18.00 Class :character
## Median :11.00 Median :25.00 Mode :character
## Mean :10.27 Mean :23.54
## 3rd Qu.:12.00 3rd Qu.:28.00
## Max. :16.00 Max. :36.00
## NA’s :3337 NA’s :3337
## LicenseExpiredDate
## Length:121949
## Class :character
## Mode :character
##
##
##
##

n_distinct(data$AnimalGender)

## [1] 4

# Look at the levels. This returns NULL here because AnimalGender was read as
# a character column, not a factor (see the summary above), so it has no
# levels attribute.
levels(data$AnimalGender)

## NULL

# Keep only rows whose gender is neither empty nor a single blank space.
data <- data[!(data$AnimalGender %in% c("", " ")), ]

n_distinct(data$AnimalGender)

## [1] 2

n_distinct(data$Borough)

## [1] 79

unique(data$Borough)

## [1] "Brooklyn" "Bronx" "Manhattan"


## [4] "Staten Island" "Queens" "BROOKLYN"
## [7] "STATEN IS" "QUEENS" "BRONX"
## [10] "MANHATTAN" "New York" "New york"
## [13] "ASTORIA" "New York " "Brooklyn "
## [16] "Bronx " "NEW YORK CITY" "bronx"
## [19] "New York " "Manhattan " "Flushing"
## [22] "elmhurst" "The Villages" "Rockaway Park"
## [25] "Fresh meadows" "Corona" "Elmhurst"
## [28] "Astoria" "fresh meadows" "NEW YORK"
## [31] "College Point" "new york" "MIDDLE VILG"

3
## [34] "Quens" "Jersey City" "San Francisco"
## [37] "Richmond Hill" "East Elmhurst" "Ridgewood"
## [40] "bronxville" "Hoboken" "NYC"
## [43] "Floral Park" "Elmhurst " "NY"
## [46] "Middle Village" "Potomac" "Ozone Park"
## [49] "Glendale" "Long Island City" "oakland gardens"
## [52] "ozone park" "Staten Island, NY" "Wappingers Falls, NY"
## [55] "Woodside" "B" "Glen Oaks"
## [58] "Woodside NY." "Kew Gardens" "staten island"
## [61] "BELLE HARBOR" "Jackson heights " "Lynbrook"
## [64] "ARVERNE" "Forest Hills" "cambria heights"
## [67] "Middletown" "MASPETH" "West Palm Beach"
## [70] "South Richmond Hil" "Briarwood " "SO RICHMOND"
## [73] "JACKSON HGTS" "Bayside" "kissimmee florida"
## [76] "queens" "WOODSIDE" "Santa Monica"
## [79] "Albany"

# STEP 1:
# Convert all borough names to lowercase so that case variants such as
# "QUEENS" / "Queens" / "queens" collapse into a single value.
data$Borough <- tolower(data$Borough)

# STEP 2:
# Remove values outside the NYC area.
# Borough is lowercase from this point on, so every literal compared against
# it must be lowercase too; the previous "Jersey City" comparison could never
# match and left those rows in the data.
# %in% is used instead of chained `==` filters: it never yields NA, so a
# missing Borough cannot turn into an all-NA row during subsetting.
# NOTE(review): Long Island City is a neighbourhood in Queens -- confirm that
# dropping it as "outside NYC" is intended.
outside_nyc <- c("long island city", "albany", "jersey city")
data <- data[!(data$Borough %in% outside_nyc), ]

# Expand the abbreviated borough name.
data$Borough[data$Borough == "staten is"] <- "staten island"

# No separate recode for "QUEENS" is needed: tolower() above has already
# folded it into "queens".

You might also like