-- Pre-requisite #######################################################
ADD JAR /opt/cloudera/parcels/CDH/lib/hive/lib/hive-hcatalog-core-1.1.0-cdh5.11.2.jar;
SET hive.exec.max.dynamic.partitions=100000;
SET hive.exec.max.dynamic.partitions.pernode=100000;
-- Required for the fully dynamic partitioned insert used later in this script:
SET hive.exec.dynamic.partition=true;
SET hive.exec.dynamic.partition.mode=nonstrict;
-- tpep_dropoff_datetime ----------------------------------------------
-- The drop-off may have happened the next day, hence the drop time is allowed to be
-- till the end of 1 Jan 2018 (records with drop-off >= 2018-01-02 are out of range)
select count(*) from HiveCaseStudy_Tushar_Siddhardha.Base_Data BD
where BD.tpep_dropoff_datetime < '2017-11-1 00:00:00.0' or
tpep_dropoff_datetime>='2018-01-02 00:00:00.0';
-- 7 records in total are out of range
-- passenger_count
--------------------------------------------------------------------------------
select passenger_count, count(*) from HiveCaseStudy_Tushar_Siddhardha.Base_Data BD
group by passenger_count;
-- 0 6824
-- 1 827498
-- 2 176872
-- 3 50693
-- 4 24951
-- 5 54568
-- 6 33146
-- 7 12
-- 8 3
-- 9 1
-- Passenger counts above 6 (i.e. 7, 8, 9) are rare; maybe a bigger car, or a driver's
-- manual entry mistake.
-- 192 appears in a single record and is definitely a manual entry error.
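-- A quick confirmation of the single 192 record (a sketch; this result was not captured above):
select count(*) from HiveCaseStudy_Tushar_Siddhardha.Base_Data BD where passenger_count=192;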
select 6824/1174568;
-- 0.0058, i.e. ~0.58%
-- A passenger count of 0 again seems like a disinterested driver not entering details,
-- or an empty parcel being sent in the cab.
-- Let's see which vendor is at fault here.
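-- Vendor-wise breakdown of zero passenger counts (a sketch, following the same pattern
-- as the vendor checks used later in this script):
select vendorid,count(*) from HiveCaseStudy_Tushar_Siddhardha.Base_Data BD
where passenger_count=0 group by vendorid;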
-- trip_distance -----------------------------------
select max(BD.trip_distance),min(BD.trip_distance)
from HiveCaseStudy_Tushar_Siddhardha.Base_Data BD;
-- max_trip_distance,min_trip_distance
-- 126.41, 0,
-- Data Dictionary:- The elapsed trip distance in miles reported by the taximeter.
select count(*) from HiveCaseStudy_Tushar_Siddhardha.Base_Data BD where
trip_distance<=0;
-- 7402 out of 1174568, a small set
select 7402/1174568;
-- 0.00630189, i.e. ~0.63%
-- we will ignore this data
select vendorid,count(*) from HiveCaseStudy_Tushar_Siddhardha.Base_Data BD where
trip_distance<=0 group by vendorid;
-- 1 4217
-- 2 3185
-- Both vendors seem to be equally responsible for this
-- ratecodeid -----------------------------------
select ratecodeid,count(*) from HiveCaseStudy_Tushar_Siddhardha.Base_Data BD
group by ratecodeid;
-- 1-6 are valid IDs as per the metadata; the 9 records with value 99 are incorrect
-- store_and_fwd_flag -----------------------------------
select store_and_fwd_flag,count(*) from HiveCaseStudy_Tushar_Siddhardha.Base_Data
BD group by store_and_fwd_flag;
-- the Y and N values are both fine
-- fare_amount
-- max_fare_amount,min_fare_amount,
-- 650, -200,
-- Data Dictionary :- The time-and-distance fare calculated by the meter.
select
percentile_approx(fare_amount,array(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.99))
from HiveCaseStudy_Tushar_Siddhardha.Base_Data BD;
-- [4.936277914662287, 5.952247700797058, 6.92251039282117, 7.970874767979148, 9.390659940336155, 10.927988505747127, 12.998791213590113, 16.812362001498627, 24.8522171486555, 51.958303868698714]
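-- The "2 558" result below presumably came from a vendor-wise negative-fare check along
-- these lines (a sketch; the original query was not preserved):
select vendorid,count(*) from HiveCaseStudy_Tushar_Siddhardha.Base_Data BD
where fare_amount<0 group by vendorid;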
-- 2 558
-- Vendor 2 accounts for the major portion of the corrupt data
-- extra ---------------------------------------------------------------------
-- max_extra,min_extra
-- 4.8, -10.6,
-- extra ranges between -10.6 and 4.8, as seen previously from the min and max query.
-- But the data dictionary says: Miscellaneous extras and surcharges.
-- Currently, this only includes the $0.50 and $1 rush hour and overnight charges.
-- Hence we will reject any other values; let's verify their count.
select count(*) from HiveCaseStudy_Tushar_Siddhardha.Base_Data BD where extra not
in (0,0.5,1);
-- 4856
select 4856/1174568;
-- 0.004134 (~0.41%); this data can be safely ignored
select vendorid,count(*) from HiveCaseStudy_Tushar_Siddhardha.Base_Data BD where
extra not in (0,0.5,1)
group by vendorid;
-- 1 1823
-- 2 3033
-- Both vendors seem to be at fault;
-- maybe the understanding w.r.t. this extra column is missing,
-- as this is the only column where vendor 1 also looks problematic.
-- mta_tax --------------------------------
-- max_mta_tax,min_mta_tax
-- 11.4, -0.5,
-- Data Dictionary :- $0.50 MTA tax that is automatically triggered based on the metered rate in use.
select count(*) from HiveCaseStudy_Tushar_Siddhardha.Base_Data BD where mta_tax
not in (0,0.5);
-- 548
select 548/1174568;
-- 0.0004, a smaller set; based on the data dictionary we will ignore these
select vendorid,count(*) from HiveCaseStudy_Tushar_Siddhardha.Base_Data BD
where mta_tax not in (0,0.5) group by vendorid;
-- 1 1
-- 2 547
-- Vendor 2 is again majorly at fault
-- tip_amount ---------------------------------------------------
-- max_tip_amount,min_tip_amount
-- 450, -1.16,
-- Data dictionary :- Tip amount – This field is automatically populated for credit card tips.
-- Cash tips are not included.
-- negative values
select count(*) from HiveCaseStudy_Tushar_Siddhardha.Base_Data BD where tip_amount
<0;
-- only 4 values are negative; they can easily be ignored
select vendorid,count(*) from HiveCaseStudy_Tushar_Siddhardha.Base_Data BD
where tip_amount <0 group by vendorid;
-- all belong to vendor 2
-- improvement_surcharge ---------------------------------------------------
-- max_improvement_surcharge,min_improvement_surcharge,
-- 1, -0.3
-- Data Dictionary :- $0.30 improvement surcharge assessed trips at the flag drop.
-- The improvement surcharge began being levied in 2015.
-- vendor wise
select vendorid,count(*) from HiveCaseStudy_Tushar_Siddhardha.Base_Data BD
where improvement_surcharge not in (0,0.3) group by vendorid;
-- All records belong to vendor 2
-- total_amount ---------------------------------------------------
-- max_total_amount,min_total_amount
-- 928.19, -200.8
/* Data Dictionary :- The total amount charged to passengers. Does not include cash tips.
Total amount can be negative and has similarly high values as fare_amount; we will check
this with similar queries. */
select count(*) from HiveCaseStudy_Tushar_Siddhardha.Base_Data BD where
total_amount<0;
-- 558 records; they can easily be ignored
-- Higher value check
select * from HiveCaseStudy_Tushar_Siddhardha.Base_Data BD where
total_amount>1000;
-- Again we have no business basis to reject values ranging up to 8000, but 390000 for
-- sure seems too high for a cab ride.
-- vendor wise
select vendorid,count(*) from HiveCaseStudy_Tushar_Siddhardha.Base_Data BD
where total_amount>1000 or total_amount<0 group by vendorid;
-- 2 558
-- Vendor 2 highly dominates the corrupt data section.
-- For this data it's mostly vendor 2 that is providing faulty data.
-- Below is the list of problematic data they have provided, column wise:
-- invalid values for 1. total_amount, 2. improvement_surcharge, 3. tolls_amount,
-- 4. tip_amount, 5. mta_tax, 6. fare_amount, 7. passenger_count, 8. pickup and
-- 9. drop-off time
-- ---------------------------------------------------
-- For the column extra, both vendors seem to be equally at fault.
-- Vendor 1 has a few tip amounts where the payment mode is not credit card.
--
-- But overall vendor 2 is definitely not providing correct data.
-----------------------------------------------------
use HiveCaseStudy_Tushar_Siddhardha;
-- Before answering the below questions, you need to create a clean, ORC
-- partitioned table for analysis.
-- Remove all the erroneous rows.
-- Our secondary partition is based on Vendor, although the questions don't call
-- for this one.
-- In general, if we were analysing this data freely, we would still want the
-- partitioning to be done this way.
-- drop table HiveCaseStudy_Tushar_Siddhardha.ParOrc_Data;
Create external table if not exists HiveCaseStudy_Tushar_Siddhardha.ParOrc_Data(
tpep_pickup_datetime timestamp,
tpep_dropoff_datetime timestamp,
passenger_count int,
trip_distance decimal(10,2),
RatecodeID int,
store_and_fwd_flag string,
PULocationID int,
DOLocationID int,
payment_type int,
fare_amount decimal(10,2),
extra decimal(10,2),
mta_tax decimal(10,2),
tip_amount decimal(10,2),
tolls_amount decimal(10,2),
improvement_surcharge decimal(10,2),
total_amount decimal(10,2)
)
partitioned by (Mnth int,VendorID int)
stored as orc location '/user/tusharchorghe29_gmail/Assignment_HIVE'
tblproperties ("orc.compress"="SNAPPY");
-- Posting data
insert overwrite table HiveCaseStudy_Tushar_Siddhardha.ParOrc_Data
partition(Mnth,VendorID)
select
tpep_pickup_datetime,
tpep_dropoff_datetime,
passenger_count,
trip_distance,
RatecodeID,
store_and_fwd_flag,
PULocationID,
DOLocationID,
payment_type,
fare_amount,
extra,
mta_tax,
tip_amount,
tolls_amount,
improvement_surcharge,
total_amount,
month(tpep_pickup_datetime) Mnth,
VendorID
from HiveCaseStudy_Tushar_Siddhardha.Base_Data BD
where (BD.tpep_pickup_datetime >='2017-11-1 00:00:00.0' and
tpep_pickup_datetime<'2018-01-01 00:00:00.0') and
( BD.tpep_dropoff_datetime >= '2017-11-1 00:00:00.0' and
tpep_dropoff_datetime<'2018-01-02 00:00:00.0') and
(BD.tpep_dropoff_datetime>BD.tpep_pickup_datetime) and
(passenger_count not in (0,192)) and
(trip_distance>0) and
(ratecodeid!=99) and
(fare_amount<=390000 and fare_amount>0 ) and
(extra in (0,0.5,1)) and
(mta_tax in (0,0.5)) and
((tip_amount >=0 and Payment_type=1) or (Payment_type!=1 and tip_amount=0)) and
( tolls_amount >=0) and
( improvement_surcharge in (0,0.3)) and
(total_amount<=390000 and total_amount>0 ) ;
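-- Sanity check on the cleaned table (a sketch; presumably returns 1153586, the
-- denominator used in the percentage queries below):
select count(*) from HiveCaseStudy_Tushar_Siddhardha.ParOrc_Data;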
-- 1. Compare the overall average fare per trip for November and December. -------------------------------------
select mnth,round(avg(total_amount),2),round(avg(fare_amount),2)
from HiveCaseStudy_Tushar_Siddhardha.ParOrc_Data group by mnth;
-- Month Avg_total_amt Avg_fare_amount
-- 12 15.89 12.7
-- 11 16.19 12.91
select 16.19-15.89, 12.91-12.70;
-- 0.30, 0.21
-- Overall, the month of November seems to be better considering total amount.
-- Also, the difference in average fare amount is on the lower side when compared to
-- total amount;
-- this signifies that extra taxes and charges also come into play during the
-- month of November.
-- 2. Explore the ‘number of passengers per trip’ - how many trips are made by each level of ‘Passenger_count’?
-- Do most people travel solo or with other people?
select passenger_count,round((count(*)*100/1153586),4) cnt
from HiveCaseStudy_Tushar_Siddhardha.ParOrc_Data group by passenger_count
order by cnt desc;
-- passenger_count   cnt
-- 1                 70.8242
-- 2                 15.1513
-- 5                 4.6843
-- 3                 4.3502
-- 6                 2.8504
-- 4                 2.1394
-- 7                 0.0003
-- Solo rides are the most common, dominant in fact, with almost 71% of the data
-- belonging to them.
-- Dual rides are the only other significant category, with a 15% share.
-- The rest are all marginal, below 5%.
-- Values for 9, 8, 7 are too small to be of any significance; maybe special limo
-- rides, or corrupt data.
select passenger_count,count(*) cnt
from HiveCaseStudy_Tushar_Siddhardha.ParOrc_Data where passenger_count in (9,8,7)
group by passenger_count
order by cnt desc;
-- 7 3
-- 5. Explore the ‘Extra’ (charge) variable - for what fraction of total trips is an
-- extra charge levied?
select extra,round((count(*)*100/1153586),4) cnt from (
select case when extra>0 then 1 else 0 end extra
from HiveCaseStudy_Tushar_Siddhardha.ParOrc_Data ) T
group by extra order by cnt desc;
-- Extra applied   %age records
-- 1               46.1454
-- 0               53.8546
-- The distribution is fairly even, with 46.15% of records having extra charges
-- applied, whereas 53.85% have no extra charges applied.
-- Analysis-II
-- 1. What is the correlation between the number of passengers on any given trip, and the tip paid per trip?
-- Do multiple travellers tip more compared to solo travellers? Hint: Use CORR(Col_1, Col_2)
select round(corr(passenger_count, tip_amount),4) from
HiveCaseStudy_Tushar_Siddhardha.ParOrc_Data;
-- -0.0053
-- The value is fairly small, although negative; it would be fair to say that
-- passenger count is unrelated to the tip amount paid.
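-- A direct solo-vs-group comparison of average tips (a sketch, not part of the
-- original run), using the same subquery pattern as the other checks in this script:
select traveller_type, round(avg(tip_amount),2) avg_tip
from (select case when passenger_count=1 then 'solo' else 'group' end traveller_type,
      tip_amount
      from HiveCaseStudy_Tushar_Siddhardha.ParOrc_Data) T
group by traveller_type;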
-- 2. Segregate the data into five segments of ‘tip paid’: [0-5), [5-10), [10-15), [15-20) and >=20.
-- Calculate the percentage share of each bucket (i.e. the fraction of trips falling in each bucket).
select Tip_range, round((count(*)*100/1153586),4) cnt
from (select
case when (tip_amount>=0 and tip_amount<5) then '[0-5)'
when (tip_amount>=5 and tip_amount<10) then '[5-10)'
when (tip_amount>=10 and tip_amount<15) then '[10-15)'
when (tip_amount>=15 and tip_amount<20) then '[15-20)'
when (tip_amount>=20) then '>=20' end Tip_range
from HiveCaseStudy_Tushar_Siddhardha.ParOrc_Data) T
group by Tip_range
order by cnt desc;
-- tip_range   cnt
-- [0-5)       92.4038
-- [5-10)      5.638
-- [10-15)     1.6829
-- [15-20)     0.1872
-- >=20        0.0881
-- The [0-5) range is the most prominent group with 92.4% of records; we already know
-- 25%+ of these are 0 values from the previous percentile-based check.
-- [5-10) represents a small fraction of 5.6%; the remaining buckets are almost
-- negligible, amounting to ~2% of the data.
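-- The 25%+ zero-tip claim can be re-verified with a percentile check (a sketch; the
-- original check was run in an earlier section of this case study):
select percentile_approx(tip_amount,array(0.25,0.50,0.75))
from HiveCaseStudy_Tushar_Siddhardha.ParOrc_Data;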
-- 5. Analyse the average speed of the most happening days of the year,
-- i.e. 31st December (New Year's Eve) and 25th December (Christmas),
-- and compare it with the overall average.
-- unix_timestamp differences are returned in seconds, hence we divide by 3600 to get
-- values in hours; since distance is specified in miles, our final value will be in
-- miles/hour.
-- Any trip that started on the 25th or 31st will be considered for the average
-- calculation, irrespective of the fact that it might have ended on the next day.
select IsHoliday, round(avg(speed),2) avg_speed from
(select case when ((tpep_pickup_datetime>='2017-12-25 00:00:00.0' and
tpep_pickup_datetime<'2017-12-26 00:00:00.0')
or (tpep_pickup_datetime>='2017-12-31 00:00:00.0' and tpep_pickup_datetime<'2018-01-01 00:00:00.0') ) then 1 else 0 end IsHoliday ,
trip_distance/((unix_timestamp(tpep_dropoff_datetime)-
unix_timestamp(tpep_pickup_datetime) )/3600) speed
from HiveCaseStudy_Tushar_Siddhardha.ParOrc_Data) T
group by IsHoliday
order by avg_speed desc;
-- 1 14.01
-- 0 10.95
select 14.01-10.95;
-- Comparing holiday vs non-holiday: during the holidays, the streets of New York are
-- at least clear(er),
-- as the cabs are running at a faster average speed by a margin of 3.06 miles/hour.
-- The non-festive day average is in sync with the November and December averages, at
-- around 10.95 miles/hour.
-- let's confirm the overall averages once
select round(avg(trip_distance/((unix_timestamp(tpep_dropoff_datetime)-
unix_timestamp(tpep_pickup_datetime) )/3600)),2) avg_speed
from HiveCaseStudy_Tushar_Siddhardha.ParOrc_Data;
-- 11.02 is the overall avg speed, as expected; the faster speeds on 25th and 31st Dec
-- lift the overall average by 0.07 miles/hour above the non-holiday average of 10.95.
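-- A per-day split of the two holidays (a sketch; the original run grouped them together):
select to_date(tpep_pickup_datetime) trip_day,
round(avg(trip_distance/((unix_timestamp(tpep_dropoff_datetime)-
unix_timestamp(tpep_pickup_datetime))/3600)),2) avg_speed
from HiveCaseStudy_Tushar_Siddhardha.ParOrc_Data
where to_date(tpep_pickup_datetime) in ('2017-12-25','2017-12-31')
group by to_date(tpep_pickup_datetime);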
-----------END-----------------------------