-- Pre-requisite #######################################################
ADD JAR /opt/cloudera/parcels/CDH/lib/hive/lib/hive-hcatalog-core-1.1.0-cdh5.11.2.jar;
SET hive.exec.max.dynamic.partitions=100000;
SET hive.exec.max.dynamic.partitions.pernode=100000;
-- Required for the fully dynamic partitioned insert used later in this script:
SET hive.exec.dynamic.partition=true;
SET hive.exec.dynamic.partition.mode=nonstrict;
-- tpep_dropoff_datetime ----------------------------------------------
-- The drop-off may have happened the next day, hence the drop time is allowed to be
-- till the end of 1 Jan 2018 (records with drop-off >= 2018-01-02 are out of range)
select count(*) from HiveCaseStudy_Tushar_Siddhardha.Base_Data BD
where BD.tpep_dropoff_datetime < '2017-11-1 00:00:00.0' or
tpep_dropoff_datetime>='2018-01-02 00:00:00.0';
-- 7 records in total are out of range
-- passenger_count
--------------------------------------------------------------------------------
select passenger_count, count(*) from HiveCaseStudy_Tushar_Siddhardha.Base_Data BD
group by passenger_count;
-- 0 6824
-- 1 827498
-- 2 176872
-- 3 50693
-- 4 24951
-- 5 54568
-- 6 33146
-- 7 12
-- 8 3
-- 9 1
-- Passenger counts above 6 (i.e. 7, 8, 9) are rare; maybe a bigger car, or a driver's
-- manual entry mistake.
-- 192 appears in a single record and is definitely a manual entry error.
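-- A quick confirmation of the single 192 record (a sketch; this result was not captured above):
select count(*) from HiveCaseStudy_Tushar_Siddhardha.Base_Data BD where passenger_count=192;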
select 6824/1174568;
-- 0.0058, i.e. ~0.58%
-- A passenger count of 0 again seems like a disinterested driver not entering details,
-- or an empty parcel being sent in the cab.
-- Let's see which vendor is at fault here.
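-- Vendor-wise breakdown of zero passenger counts (a sketch, following the same pattern
-- as the vendor checks used later in this script):
select vendorid,count(*) from HiveCaseStudy_Tushar_Siddhardha.Base_Data BD
where passenger_count=0 group by vendorid;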
-- trip_distance -----------------------------------
select max(BD.trip_distance),min(BD.trip_distance)
from HiveCaseStudy_Tushar_Siddhardha.Base_Data BD;
-- max_trip_distance,min_trip_distance
-- 126.41, 0,
-- Data Dictionary:- The elapsed trip distance in miles reported by the taximeter.
select count(*) from HiveCaseStudy_Tushar_Siddhardha.Base_Data BD where
trip_distance<=0;
-- 7402 out of 1174568, a small set
select 7402/1174568;
-- 0.00630189, i.e. ~0.63%
-- we will ignore this data
select vendorid,count(*) from HiveCaseStudy_Tushar_Siddhardha.Base_Data BD where
trip_distance<=0 group by vendorid;
-- 1 4217
-- 2 3185
-- Both vendors seem to be equally responsible for this
-- ratecodeid -----------------------------------
select ratecodeid,count(*) from HiveCaseStudy_Tushar_Siddhardha.Base_Data BD
group by ratecodeid;
-- 1-6 are valid IDs as per the metadata; the 9 records with value 99 are incorrect
-- store_and_fwd_flag -----------------------------------
select store_and_fwd_flag,count(*) from HiveCaseStudy_Tushar_Siddhardha.Base_Data
BD group by store_and_fwd_flag;
-- the Y and N values are both fine
-- fare_amount
-- max_fare_amount,min_fare_amount,
-- 650, -200,
-- Data Dictionary :- The time-and-distance fare calculated by the meter.
select
percentile_approx(fare_amount,array(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.99))
from HiveCaseStudy_Tushar_Siddhardha.Base_Data BD;
-- [4.936277914662287, 5.952247700797058, 6.92251039282117, 7.970874767979148, 9.390659940336155, 10.927988505747127, 12.998791213590113, 16.812362001498627, 24.8522171486555, 51.958303868698714]
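-- The "2 558" result below presumably came from a vendor-wise negative-fare check along
-- these lines (a sketch; the original query was not preserved):
select vendorid,count(*) from HiveCaseStudy_Tushar_Siddhardha.Base_Data BD
where fare_amount<0 group by vendorid;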
-- 2 558
-- Vendor 2 accounts for the major portion of the corrupt data
-- extra ---------------------------------------------------------------------
-- max_extra,min_extra
-- 4.8, -10.6,
-- extra ranges between -10.6 and 4.8, as seen previously from the min and max query.
-- But the data dictionary says: Miscellaneous extras and surcharges.
-- Currently, this only includes the $0.50 and $1 rush hour and overnight charges.
-- Hence we will reject any other values; let's verify their count.
select count(*) from HiveCaseStudy_Tushar_Siddhardha.Base_Data BD where extra not
in (0,0.5,1);
-- 4856
select 4856/1174568;
-- 0.004134 (~0.41%); this data can be safely ignored
select vendorid,count(*) from HiveCaseStudy_Tushar_Siddhardha.Base_Data BD where
extra not in (0,0.5,1)
group by vendorid;
-- 1 1823
-- 2 3033
-- Both vendors seem to be at fault;
-- maybe the understanding w.r.t. this extra column is missing,
-- as this is the only column where vendor 1 also looks problematic.
-- mta_tax --------------------------------
-- max_mta_tax,min_mta_tax
-- 11.4, -0.5,
-- Data Dictionary :- $0.50 MTA tax that is automatically triggered based on the metered rate in use.
select count(*) from HiveCaseStudy_Tushar_Siddhardha.Base_Data BD where mta_tax
not in (0,0.5);
-- 548
select 548/1174568;
-- 0.0004, a smaller set; based on the data dictionary we will ignore these
select vendorid,count(*) from HiveCaseStudy_Tushar_Siddhardha.Base_Data BD
where mta_tax not in (0,0.5) group by vendorid;
-- 1 1
-- 2 547
-- Vendor 2 is again majorly at fault
-- tip_amount ---------------------------------------------------
-- max_tip_amount,min_tip_amount
-- 450, -1.16,
-- Data dictionary :- Tip amount – This field is automatically populated for credit card tips.
-- Cash tips are not included.
-- negative values
select count(*) from HiveCaseStudy_Tushar_Siddhardha.Base_Data BD where tip_amount
<0;
-- only 4 values are negative; they can easily be ignored
select vendorid,count(*) from HiveCaseStudy_Tushar_Siddhardha.Base_Data BD
where tip_amount <0 group by vendorid;
-- all belong to vendor 2
-- improvement_surcharge ---------------------------------------------------
-- max_improvement_surcharge,min_improvement_surcharge,
-- 1, -0.3
-- Data Dictionary :- $0.30 improvement surcharge assessed trips at the flag drop.
-- The improvement surcharge began being levied in 2015.
-- vendor wise
select vendorid,count(*) from HiveCaseStudy_Tushar_Siddhardha.Base_Data BD
where improvement_surcharge not in (0,0.3) group by vendorid;
-- All records belong to vendor 2
-- total_amount ---------------------------------------------------
-- max_total_amount,min_total_amount
-- 928.19, -200.8
/* Data Dictionary :- The total amount charged to passengers. Does not include cash tips.
Total amount can be negative and has similarly high values as fare_amount; we will check
this with similar queries. */
select count(*) from HiveCaseStudy_Tushar_Siddhardha.Base_Data BD where
total_amount<0;
-- 558 records; they can easily be ignored
-- Higher value check
select * from HiveCaseStudy_Tushar_Siddhardha.Base_Data BD where
total_amount>1000;
-- Again we have no business basis to reject values ranging up to 8000, but 390000 for
-- sure seems too high for a cab ride.
-- vendor wise
select vendorid,count(*) from HiveCaseStudy_Tushar_Siddhardha.Base_Data BD
where total_amount>1000 or total_amount<0 group by vendorid;
-- 2 558
-- Vendor 2 highly dominates the corrupt data section.
-- For this data it's mostly vendor 2 that is providing faulty data.
-- Below is the list of problematic data they have provided, column wise:
-- invalid values for 1. total_amount, 2. improvement_surcharge, 3. tolls_amount,
-- 4. tip_amount, 5. mta_tax, 6. fare_amount, 7. passenger_count, 8. pickup and
-- 9. drop-off time
-- ---------------------------------------------------
-- For the column extra, both vendors seem to be equally at fault.
-- Vendor 1 has a few tip amounts where the payment mode is not credit card.
--
-- But overall vendor 2 is definitely not providing correct data.
-----------------------------------------------------
use HiveCaseStudy_Tushar_Siddhardha;
-- Before answering the below questions, you need to create a clean, ORC
-- partitioned table for analysis.
-- Remove all the erroneous rows.
-- Our secondary partition is based on Vendor, although the questions don't call
-- for this one.
-- In general, if we were analysing this data freely, we would still want the
-- partitioning to be done this way.
-- drop table HiveCaseStudy_Tushar_Siddhardha.ParOrc_Data;
Create external table if not exists HiveCaseStudy_Tushar_Siddhardha.ParOrc_Data(
tpep_pickup_datetime timestamp,
tpep_dropoff_datetime timestamp,
passenger_count int,
trip_distance decimal(10,2),
RatecodeID int,
store_and_fwd_flag string,
PULocationID int,
DOLocationID int,
payment_type int,
fare_amount decimal(10,2),
extra decimal(10,2),
mta_tax decimal(10,2),
tip_amount decimal(10,2),
tolls_amount decimal(10,2),
improvement_surcharge decimal(10,2),
total_amount decimal(10,2)
)
partitioned by (Mnth int,VendorID int)
stored as orc location '/user/tusharchorghe29_gmail/Assignment_HIVE'
tblproperties ("orc.compress"="SNAPPY");
-- Posting data
insert overwrite table HiveCaseStudy_Tushar_Siddhardha.ParOrc_Data
partition(Mnth,VendorID)
select
tpep_pickup_datetime,
tpep_dropoff_datetime,
passenger_count,
trip_distance,
RatecodeID,
store_and_fwd_flag,
PULocationID,
DOLocationID,
payment_type,
fare_amount,
extra,
mta_tax,
tip_amount,
tolls_amount,
improvement_surcharge,
total_amount,
month(tpep_pickup_datetime) Mnth,
VendorID
from HiveCaseStudy_Tushar_Siddhardha.Base_Data BD
where (BD.tpep_pickup_datetime >='2017-11-1 00:00:00.0' and
tpep_pickup_datetime<'2018-01-01 00:00:00.0') and
( BD.tpep_dropoff_datetime >= '2017-11-1 00:00:00.0' and
tpep_dropoff_datetime<'2018-01-02 00:00:00.0') and
(BD.tpep_dropoff_datetime>BD.tpep_pickup_datetime) and
(passenger_count not in (0,192)) and
(trip_distance>0) and
(ratecodeid!=99) and
(fare_amount<=390000 and fare_amount>0 ) and
(extra in (0,0.5,1)) and
(mta_tax in (0,0.5)) and
((tip_amount >=0 and Payment_type=1) or (Payment_type!=1 and tip_amount=0)) and
( tolls_amount >=0) and
( improvement_surcharge in (0,0.3)) and
(total_amount<=390000 and total_amount>0 ) ;
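-- Sanity check on the cleaned table (a sketch; presumably returns 1153586, the
-- denominator used in the percentage queries below):
select count(*) from HiveCaseStudy_Tushar_Siddhardha.ParOrc_Data;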
-- 1. Compare the overall average fare per trip for November and December. -------------------------------------
select mnth,round(avg(total_amount),2),round(avg(fare_amount),2)
from HiveCaseStudy_Tushar_Siddhardha.ParOrc_Data group by mnth;
-- Month Avg_total_amt Avg_fare_amount
-- 12 15.89 12.7
-- 11 16.19 12.91
select 16.19-15.89, 12.91-12.70;
-- 0.30, 0.21
-- Overall, the month of November seems to be better considering total amount.
-- Also, the difference in average fare amount is on the lower side when compared to
-- total amount;
-- this signifies that extra taxes and charges also come into play during the
-- month of November.
-- 2. Explore the ‘number of passengers per trip’ - how many trips are made by each level of ‘Passenger_count’?
-- Do most people travel solo or with other people?
select passenger_count,round((count(*)*100/1153586),4) cnt
from HiveCaseStudy_Tushar_Siddhardha.ParOrc_Data group by passenger_count
order by cnt desc;
-- passenger_count   cnt
-- 1                 70.8242
-- 2                 15.1513
-- 5                 4.6843
-- 3                 4.3502
-- 6                 2.8504
-- 4                 2.1394
-- 7                 0.0003
-- Solo rides are the most common, dominant in fact, with almost 71% of the data
-- belonging to them.
-- Dual rides are the only other significant category, with a 15% share.
-- The rest are all marginal, below 5%.
-- Values for 9, 8, 7 are too small to be of any significance; maybe special limo
-- rides, or corrupt data.
select passenger_count,count(*) cnt
from HiveCaseStudy_Tushar_Siddhardha.ParOrc_Data where passenger_count in (9,8,7)
group by passenger_count
order by cnt desc;
-- 7 3
-- 5. Explore the ‘Extra’ (charge) variable - for what fraction of total trips is an
-- extra charge levied?
select extra,round((count(*)*100/1153586),4) cnt from (
select case when extra>0 then 1 else 0 end extra
from HiveCaseStudy_Tushar_Siddhardha.ParOrc_Data ) T
group by extra order by cnt desc;
-- Extra applied   %age records
-- 1               46.1454
-- 0               53.8546
-- The distribution is fairly even, with 46.15% of records having extra charges
-- applied, whereas 53.85% have no extra charges applied.
-- Analysis-II
-- 1. What is the correlation between the number of passengers on any given trip, and the tip paid per trip?
-- Do multiple travellers tip more compared to solo travellers? Hint: Use CORR(Col_1, Col_2)
select round(corr(passenger_count, tip_amount),4) from
HiveCaseStudy_Tushar_Siddhardha.ParOrc_Data;
-- -0.0053
-- The value is fairly small, although negative; it would be fair to say that
-- passenger count is unrelated to the tip amount paid.
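-- A direct solo-vs-group comparison of average tips (a sketch, not part of the
-- original run), using the same subquery pattern as the other checks in this script:
select traveller_type, round(avg(tip_amount),2) avg_tip
from (select case when passenger_count=1 then 'solo' else 'group' end traveller_type,
      tip_amount
      from HiveCaseStudy_Tushar_Siddhardha.ParOrc_Data) T
group by traveller_type;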
-- 2. Segregate the data into five segments of ‘tip paid’: [0-5), [5-10), [10-15), [15-20) and >=20.
-- Calculate the percentage share of each bucket (i.e. the fraction of trips falling in each bucket).
select Tip_range, round((count(*)*100/1153586),4) cnt
from (select
case when (tip_amount>=0 and tip_amount<5) then '[0-5)'
when (tip_amount>=5 and tip_amount<10) then '[5-10)'
when (tip_amount>=10 and tip_amount<15) then '[10-15)'
when (tip_amount>=15 and tip_amount<20) then '[15-20)'
when (tip_amount>=20) then '>=20' end Tip_range
from HiveCaseStudy_Tushar_Siddhardha.ParOrc_Data) T
group by Tip_range
order by cnt desc;
-- tip_range   cnt
-- [0-5)       92.4038
-- [5-10)      5.638
-- [10-15)     1.6829
-- [15-20)     0.1872
-- >=20        0.0881
-- The [0-5) range is the most prominent group with 92.4% of records; we already know
-- 25%+ of these are 0 values from the previous percentile-based check.
-- [5-10) represents a small fraction of 5.6%; the remaining buckets are almost
-- negligible, amounting to ~2% of the data.
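-- The 25%+ zero-tip claim can be re-verified with a percentile check (a sketch; the
-- original check was run in an earlier section of this case study):
select percentile_approx(tip_amount,array(0.25,0.50,0.75))
from HiveCaseStudy_Tushar_Siddhardha.ParOrc_Data;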
-- 5. Analyse the average speed of the most happening days of the year,
-- i.e. 31st December (New Year's Eve) and 25th December (Christmas),
-- and compare it with the overall average.
-- unix_timestamp differences are returned in seconds, hence we divide by 3600 to get
-- values in hours; since distance is specified in miles, our final value will be in
-- miles/hour.
-- Any trip that started on the 25th or 31st will be considered for the average
-- calculation, irrespective of the fact that it might have ended on the next day.
select IsHoliday, round(avg(speed),2) avg_speed from
(select case when ((tpep_pickup_datetime>='2017-12-25 00:00:00.0' and
tpep_pickup_datetime<'2017-12-26 00:00:00.0')
or (tpep_pickup_datetime>='2017-12-31 00:00:00.0' and tpep_pickup_datetime<'2018-01-01 00:00:00.0') ) then 1 else 0 end IsHoliday ,
trip_distance/((unix_timestamp(tpep_dropoff_datetime)-
unix_timestamp(tpep_pickup_datetime) )/3600) speed
from HiveCaseStudy_Tushar_Siddhardha.ParOrc_Data) T
group by IsHoliday
order by avg_speed desc;
-- 1 14.01
-- 0 10.95
select 14.01-10.95;
-- Comparing holiday vs non-holiday: during the holidays, the streets of New York are
-- at least clear(er),
-- as the cabs are running at a faster average speed by a margin of 3.06 miles/hour.
-- The non-festive day average is in sync with the November and December averages, at
-- around 10.95 miles/hour.
-- let's confirm the overall averages once
select round(avg(trip_distance/((unix_timestamp(tpep_dropoff_datetime)-
unix_timestamp(tpep_pickup_datetime) )/3600)),2) avg_speed
from HiveCaseStudy_Tushar_Siddhardha.ParOrc_Data;
-- 11.02 is the overall avg speed, as expected; the faster speeds on 25th and 31st Dec
-- lift the overall average by 0.07 miles/hour above the non-holiday average of 10.95.
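-- A per-day split of the two holidays (a sketch; the original run grouped them together):
select to_date(tpep_pickup_datetime) trip_day,
round(avg(trip_distance/((unix_timestamp(tpep_dropoff_datetime)-
unix_timestamp(tpep_pickup_datetime))/3600)),2) avg_speed
from HiveCaseStudy_Tushar_Siddhardha.ParOrc_Data
where to_date(tpep_pickup_datetime) in ('2017-12-25','2017-12-31')
group by to_date(tpep_pickup_datetime);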
-----------END-----------------------------