You are on page 1 of 6

Assignment 7.1.a
In [93]: import os

import json

from pathlib import Path

import gzip

import hashlib

import shutil

import pandas as pd

import pygeohash

import s3fs

# Endpoint passed to s3fs when the source data is read.
endpoint_url = 'https://storage.budsc.midwest-datascience.com'

# All datasets produced by this notebook are written under ./results.
current_dir = Path(os.getcwd()).absolute()
results_dir = current_dir.joinpath('results')

# Remove any previous results directory, then recreate it empty.
if results_dir.exists():
    shutil.rmtree(results_dir)
results_dir.mkdir(parents=True, exist_ok=True)

def read_jsonl_data():
    """Read the gzipped routes JSONL file and return its parsed records.

    Opens ``data/processed/openflights/routes.jsonl.gz`` anonymously through
    the S3 endpoint configured in ``endpoint_url``, decompresses it, and
    decodes every line with ``json.loads``.

    Returns:
        list: one parsed JSON value per line of the source file.
    """
    s3 = s3fs.S3FileSystem(
        anon=True,
        client_kwargs={
            'endpoint_url': endpoint_url
        }
    )
    src_data_path = 'data/processed/openflights/routes.jsonl.gz'
    with s3.open(src_data_path, 'rb') as f_gz:
        with gzip.open(f_gz, 'rb') as f:
            # Iterate the stream directly rather than materialising every
            # line with readlines() before parsing.
            records = [json.loads(line) for line in f]
    return records

In [94]: from pyarrow.json import read_json

import pyarrow.parquet as pq

def create_parquet_dataset():
    """Convert the gzipped routes JSONL file into ``results/routes.parquet``.

    Reads ``data/processed/openflights/routes.jsonl.gz`` anonymously through
    the S3 endpoint in ``endpoint_url``, parses it with
    ``pyarrow.json.read_json``, prints the resulting table, and writes it to
    ``results_dir / 'routes.parquet'``.

    Returns:
        None
    """
    # Imported locally: a later cell rebinds the notebook-level name ``pq``
    # to a DataFrame, so relying on the global alias would break this
    # function if it were re-run after that cell.
    import pyarrow.parquet as parquet

    src_data_path = 'data/processed/openflights/routes.jsonl.gz'
    parquet_output_path = results_dir.joinpath('routes.parquet')
    s3 = s3fs.S3FileSystem(
        anon=True,
        client_kwargs={
            'endpoint_url': endpoint_url
        }
    )
    with s3.open(src_data_path, 'rb') as f_gz:
        with gzip.open(f_gz, 'rb') as f:
            # Arrow parses the JSON lines straight from the decompressed stream.
            table = read_json(f)

    print(table)
    parquet.write_table(table, parquet_output_path, compression='none')

create_parquet_dataset()

pyarrow.Table

airline: struct<airline_id: int64, name: string, alias: string, iata: string, icao:
string, callsign: string, country: string, active: bool>

child 0, airline_id: int64

child 1, name: string

child 2, alias: string

child 3, iata: string

child 4, icao: string

child 5, callsign: string

child 6, country: string

child 7, active: bool

src_airport: struct<airport_id: int64, name: string, city: string, country: string,


iata: string, icao: string, latitude: double, longitude: double, altitude: int64, ti
mezone: double, dst: string, tz_id: string, type: string, source: string>

child 0, airport_id: int64

child 1, name: string

child 2, city: string

child 3, country: string

child 4, iata: string

child 5, icao: string

child 6, latitude: double

child 7, longitude: double

child 8, altitude: int64

child 9, timezone: double

child 10, dst: string

child 11, tz_id: string

child 12, type: string

child 13, source: string

dst_airport: struct<airport_id: int64, name: string, city: string, country: string,


iata: string, icao: string, latitude: double, longitude: double, altitude: int64, ti
mezone: double, dst: string, tz_id: string, type: string, source: string>

child 0, airport_id: int64

child 1, name: string

child 2, city: string

child 3, country: string

child 4, iata: string

child 5, icao: string

child 6, latitude: double

child 7, longitude: double

child 8, altitude: int64

child 9, timezone: double

child 10, dst: string

child 11, tz_id: string

child 12, type: string

child 13, source: string

codeshare: bool

equipment: list<item: string>

child 0, item: string

# In [95]:
# NOTE(review): binding the DataFrame to `pq` replaces the
# `import pyarrow.parquet as pq` alias from cell 94; every later cell in this
# notebook uses `pq` as the DataFrame.
parquet_output_path = results_dir / 'routes.parquet'
pq = pd.read_parquet(parquet_output_path, engine='fastparquet')
print(pq.columns.tolist())

['codeshare', 'equipment', 'airline.airline_id', 'airline.name', 'airline.alias', 'a


irline.iata', 'airline.icao', 'airline.callsign', 'airline.country', 'airline.activ
e', 'src_airport.airport_id', 'src_airport.name', 'src_airport.city', 'src_airport.c
ountry', 'src_airport.iata', 'src_airport.icao', 'src_airport.latitude', 'src_airpor
t.longitude', 'src_airport.altitude', 'src_airport.timezone', 'src_airport.dst', 'sr
c_airport.tz_id', 'src_airport.type', 'src_airport.source', 'dst_airport.airport_i
d', 'dst_airport.name', 'dst_airport.city', 'dst_airport.country', 'dst_airport.iat
a', 'dst_airport.icao', 'dst_airport.latitude', 'dst_airport.longitude', 'dst_airpor
t.altitude', 'dst_airport.timezone', 'dst_airport.dst', 'dst_airport.tz_id', 'dst_ai
rport.type', 'dst_airport.source']

# In [96]:
# (first_letter, last_letter) pairs, one per key-value partition.
partitions = (
    ('A', 'A'), ('B', 'B'), ('C', 'D'), ('E', 'F'),
    ('G', 'H'), ('I', 'J'), ('K', 'L'), ('M', 'M'),
    ('N', 'N'), ('O', 'P'), ('Q', 'R'), ('S', 'T'),
    ('U', 'U'), ('V', 'V'), ('W', 'X'), ('Y', 'Z'),
)

# In [97]:
# One label per pair: 'A' when both letters are equal, otherwise 'C-D'.
# Derived from `partitions` so the labels cannot drift from the pairs.
partitions_keys = tuple(
    first if first == last else f'{first}-{last}'
    for first, last in partitions
)

# In [98]:
# Label -> (first_letter, last_letter) lookup used by get_key.
parts_k_v = dict(zip(partitions_keys, partitions))
print(parts_k_v)

{'A': ('A', 'A'), 'B': ('B', 'B'), 'C-D': ('C', 'D'), 'E-F': ('E', 'F'), 'G-H':
('G', 'H'), 'I-J': ('I', 'J'), 'K-L': ('K', 'L'), 'M': ('M', 'M'), 'N': ('N', 'N'),
'O-P': ('O', 'P'), 'Q-R': ('Q', 'R'), 'S-T': ('S', 'T'), 'U': ('U', 'U'), 'V': ('V',
'V'), 'W-X': ('W', 'X'), 'Y-Z': ('Y', 'Z')}

# In [99]:
def get_key(val):
    """Return the partition label for a leading key character.

    Args:
        val: value to look up, compared against the letter pairs stored in
            ``parts_k_v``.

    Returns:
        The first label in ``parts_k_v`` whose ``(first, last)`` pair
        contains ``val``, or the string ``"0"`` when no pair contains it.
    """
    # Membership is tested against the pair itself, so only the two listed
    # letters of each entry match; anything else falls through to "0".
    return next(
        (label for label, letters in parts_k_v.items() if val in letters),
        "0",
    )

# In [100]:
# Composite key: source IATA + destination IATA + airline IATA.
pq['key'] = pq['src_airport.iata'] + pq['dst_airport.iata'] + pq['airline.iata']
pq['partition_value'] = pq['key'].str[:1]
# Map the single column through get_key instead of a row-wise
# DataFrame.apply(axis=1), which builds a Series object for every row.
pq['kv_key'] = pq['partition_value'].map(get_key)

# In [101]:
# remove invalid keys
# NOTE(review): astype('float32', errors='ignore') is applied to the whole
# frame; the saved output below shows codeshare as 0.0 and airline.airline_id
# as float32, so it also converts boolean and integer columns. Confirm that
# is intended.
pq = pq[pq.kv_key != "0"].astype('float32', errors='ignore')

In [102… pq.head()

Out[102… codeshare equipment airline.airline_id airline.name airline.alias airline.iata airline.icao airline.

ANA All
0 0.0 [CR2] 410.0 Aerocondor Nippon 2B ARD AEROC
Airways

ANA All
1 0.0 [CR2] 410.0 Aerocondor Nippon 2B ARD AEROC
Airways

ANA All
2 0.0 [CR2] 410.0 Aerocondor Nippon 2B ARD AEROC
Airways

ANA All
3 0.0 [CR2] 410.0 Aerocondor Nippon 2B ARD AEROC
Airways

ANA All
4 0.0 [CR2] 410.0 Aerocondor Nippon 2B ARD AEROC
Airways

5 rows × 41 columns

In [104… import pyarrow as pa

import pyarrow.parquet as parpq

# Write the frame as a Parquet dataset under results/kv, one directory per
# distinct kv_key value.
pq_tab = pa.Table.from_pandas(pq)
parpq.write_to_dataset(
    pq_tab,
    root_path=results_dir.joinpath('kv'),
    partition_cols=['kv_key'],
)

Assignment 7.1.b
In [105… import hashlib

def hash_key(key):
    """Return the SHA-256 hex digest of ``key``.

    Args:
        key: any value; it is converted with ``str()`` and UTF-8 encoded
            before hashing.

    Returns:
        str: 64-character lowercase hexadecimal digest.
    """
    return hashlib.sha256(str(key).encode('utf-8')).hexdigest()

# In [106]:
# Composite key: source IATA + destination IATA + airline IATA.
pq['key'] = pq['src_airport.iata'] + pq['dst_airport.iata'] + pq['airline.iata']
# Hash the key column directly; a row-wise DataFrame.apply(axis=1) is not
# needed because hash_key only reads this one column.
pq['hashed'] = pq['key'].map(hash_key)
# The first hex character of the digest is the partition value.
pq['hash_key'] = pq['hashed'].str[:1]

# In [107]:
# Write the frame as a Parquet dataset under results/hash, one directory per
# distinct hash_key value.
pq_tab1 = pa.Table.from_pandas(pq)
parpq.write_to_dataset(
    pq_tab1,
    root_path=results_dir.joinpath('hash'),
    partition_cols=['hash_key'],
)

Assignment 7.1.c
# In [109]:
# Geohash of each datacenter, keyed by region name and encoded from its
# (latitude, longitude) pair.
datacenters = {
    'west': pygeohash.encode(45.5945645, -121.1786823),
    'central': pygeohash.encode(41.1544433, -96.0422378),
    'east': pygeohash.encode(39.08344, -77.6497145),
}
print(datacenters)

{'west': 'c21g6s0rs4c7', 'central': '9z7dnebnj8kb', 'east': 'dqby34cjw922'}

# In [110]:
def closest_datacenter(latitude, longitude):
    """Return the name of the datacenter nearest to a coordinate.

    The coordinate is geohash-encoded and compared against every entry of
    ``datacenters`` with ``pygeohash.geohash_approximate_distance``.

    Args:
        latitude: latitude passed to ``pygeohash.encode``.
        longitude: longitude passed to ``pygeohash.encode``.

    Returns:
        The key of ``datacenters`` with the smallest approximate distance.
        On ties the entry that appears first in ``datacenters`` wins; an
        empty ``datacenters`` yields ``''``.
    """
    origin = str(pygeohash.encode(latitude, longitude))
    # min() keeps the first minimum it sees, matching a strict "<" scan.
    return min(
        datacenters,
        key=lambda name: pygeohash.geohash_approximate_distance(
            origin, str(datacenters[name])
        ),
        default='',
    )

# In [113]:
# NOTE(review): the export cut this statement off after "lambd"; the lambda
# body and axis=1 below are a reconstruction. Confirm against the original
# notebook.
pq['datacenter'] = pq[['src_airport.latitude', 'src_airport.longitude']].apply(
    lambda row: closest_datacenter(
        row['src_airport.latitude'], row['src_airport.longitude']
    ),
    axis=1,
)

# In [114]:
# Write the frame as a Parquet dataset under results/geo, one directory per
# distinct datacenter value.
pq_tab2 = pa.Table.from_pandas(pq)
parpq.write_to_dataset(
    pq_tab2,
    root_path=results_dir.joinpath('geo'),
    partition_cols=['datacenter'],
)

Assignment 7.1.d
In [128… pq['airline.airline_id'].head()

Out[128… 0 410.0

1 410.0

2 410.0

3 410.0

4 410.0

Name: airline.airline_id, dtype: float32

# In [133]:
def balance_partitions(keys, num_partitions):
    """Split ``keys`` into contiguous chunks whose sums are roughly equal.

    Args:
        keys: sequence of numeric values (list, Series or array); order is
            preserved.
        num_partitions: number of chunks to produce; must be at least 1.

    Returns:
        list of ``num_partitions`` numpy arrays that concatenate back to
        ``keys``.

    Raises:
        ValueError: if ``num_partitions`` is less than 1.
    """
    # Local import: numpy is not among the imports at the top of the notebook.
    import numpy as np

    if num_partitions < 1:
        raise ValueError("num_partitions must be at least 1")

    # Callers pass a plain list, which has no cumsum(); work on an array.
    arr = np.asarray(keys)
    if arr.size == 0:
        # Nothing to balance: still return num_partitions (empty) chunks.
        return np.split(arr, np.zeros(num_partitions - 1, dtype=int))

    # running total of the values
    ac = arr.cumsum()
    # target sum per chunk
    partsum = ac[-1] // num_partitions
    # running total at which each of the first num_partitions - 1 chunks ends
    cum_part_sums = np.arange(1, num_partitions) * partsum
    # side='right' puts the element that reaches a target in the chunk it
    # completes; the default left side starts the next chunk one element
    # early and can leave the first chunk empty.
    # NOTE(review): searchsorted assumes `ac` is non-decreasing, which holds
    # only when the values are non-negative.
    inds = np.searchsorted(ac, cum_part_sums, side='right')
    # split into approximately equal-sum arrays
    return np.split(arr, inds)

# In [134]:
num_partitions = 7
keys = list(pq['airline.airline_id'])

# In [135]:
# NOTE(review): the saved output below is a list of {airline_id: partition}
# dicts, which the balance_partitions definition in cell 133 does not
# return; the output appears to come from a different version of the
# function.
print(balance_partitions(keys, num_partitions))

[{-1.0: 1}, {10.0: 1}, {21.0: 1}, {24.0: 1}, {28.0: 1}, {29.0: 1}, {32.0: 1}, {35.0:
1}, {42.0: 1}, {43.0: 1}, {55.0: 1}, {68.0: 1}, {83.0: 1}, {90.0: 1}, {96.0: 1}, {10
6.0: 1}, {109.0: 1}, {116.0: 1}, {125.0: 1}, {130.0: 1}, {132.0: 1}, {137.0: 1}, {13
9.0: 1}, {146.0: 1}, {153.0: 1}, {179.0: 1}, {197.0: 1}, {214.0: 1}, {218.0: 1}, {22
0.0: 1}, {221.0: 1}, {225.0: 1}, {231.0: 1}, {240.0: 1}, {241.0: 1}, {242.0: 1}, {24
6.0: 1}, {312.0: 1}, {316.0: 1}, {319.0: 1}, {321.0: 1}, {324.0: 1}, {328.0: 1}, {32
9.0: 1}, {330.0: 1}, {333.0: 1}, {336.0: 1}, {338.0: 1}, {341.0: 1}, {345.0: 1}, {38
6.0: 1}, {397.0: 1}, {410.0: 1}, {412.0: 1}, {426.0: 1}, {439.0: 1}, {442.0: 1}, {46
2.0: 1}, {470.0: 1}, {476.0: 1}, {477.0: 1}, {491.0: 1}, {502.0: 1}, {503.0: 1}, {50
8.0: 1}, {515.0: 1}, {524.0: 1}, {543.0: 1}, {563.0: 1}, {567.0: 1}, {569.0: 1}, {57
6.0: 1}, {595.0: 1}, {596.0: 1}, {603.0: 1}, {608.0: 1}, {622.0: 1}, {641.0: 1}, {68
3.0: 1}, {690.0: 2}, {692.0: 2}, {751.0: 2}, {753.0: 2}, {794.0: 2}, {807.0: 2}, {83
7.0: 2}, {879.0: 2}, {881.0: 2}, {882.0: 2}, {896.0: 2}, {897.0: 2}, {921.0: 2}, {97
0.0: 2}, {995.0: 2}, {998.0: 2}, {1006.0: 2}, {1008.0: 2}, {1034.0: 2}, {1048.0: 2},
{1057.0: 2}, {1066.0: 2}, {1073.0: 2}, {1109.0: 2}, {1173.0: 2}, {1191.0: 2}, {1203.
0: 2}, {1206.0: 2}, {1230.0: 2}, {1266.0: 2}, {1287.0: 2}, {1290.0: 2}, {1299.0: 2},
{1308.0: 2}, {1316.0: 2}, {1317.0: 2}, {1338.0: 2}, {1340.0: 2}, {1355.0: 2}, {1359.
0: 2}, {1392.0: 2}, {1401.0: 2}, {1403.0: 2}, {1422.0: 2}, {1434.0: 2}, {1441.0: 2},
{1463.0: 2}, {1469.0: 2}, {1472.0: 2}, {1478.0: 2}, {1492.0: 2}, {1500.0: 2}, {1508.
0: 2}, {1531.0: 2}, {1539.0: 2}, {1548.0: 2}, {1581.0: 2}, {1611.0: 2}, {1623.0: 2},
{1629.0: 2}, {1654.0: 2}, {1663.0: 2}, {1669.0: 2}, {1680.0: 2}, {1682.0: 2}, {1683.
0: 2}, {1729.0: 2}, {1750.0: 2}, {1756.0: 2}, {1758.0: 2}, {1767.0: 2}, {1769.0: 2},
{1775.0: 2}, {1790.0: 2}, {1792.0: 2}, {1829.0: 2}, {1844.0: 2}, {1868.0: 2}, {1886.
0: 2}, {1889.0: 3}, {1908.0: 3}, {1909.0: 3}, {1925.0: 3}, {1936.0: 3}, {1942.0: 3},
{1943.0: 3}, {1946.0: 3}, {1954.0: 3}, {1966.0: 3}, {2009.0: 3}, {2056.0: 3}, {2058.
0: 3}, {2091.0: 3}, {2094.0: 3}, {2104.0: 3}, {2117.0: 3}, {2143.0: 3}, {2150.0: 3},
{2183.0: 3}, {2193.0: 3}, {2217.0: 3}, {2218.0: 3}, {2220.0: 3}, {2222.0: 3}, {2226.
0: 3}, {2245.0: 3}, {2260.0: 3}, {2264.0: 3}, {2293.0: 3}, {2297.0: 3}, {2324.0: 3},
{2350.0: 3}, {2353.0: 3}, {2354.0: 3}, {2395.0: 3}, {2409.0: 3}, {2417.0: 3}, {2418.
0: 3}, {2419.0: 3}, {2420.0: 3}, {2421.0: 3}, {2439.0: 3}, {2468.0: 3}, {2520.0: 3},
{2524.0: 3}, {2538.0: 3}, {2541.0: 3}, {2547.0: 3}, {2548.0: 3}, {2575.0: 3}, {2585.
0: 3}, {2607.0: 3}, {2622.0: 3}, {2638.0: 3}, {2660.0: 3}, {2681.0: 3}, {2682.0: 3},
{2684.0: 3}, {2688.0: 3}, {2692.0: 3}, {2731.0: 3}, {2748.0: 3}, {2750.0: 3}, {2757.
0: 3}, {2765.0: 3}, {2773.0: 3}, {2774.0: 3}, {2822.0: 3}, {2825.0: 3}, {2826.0: 3},
{2835.0: 3}, {2850.0: 3}, {2857.0: 3}, {2881.0: 3}, {2896.0: 3}, {2916.0: 3}, {2922.
0: 3}, {2923.0: 3}, {2942.0: 4}, {2951.0: 4}, {2954.0: 4}, {2987.0: 4}, {2989.0: 4},
{2990.0: 4}, {2993.0: 4}, {2994.0: 4}, {3000.0: 4}, {3021.0: 4}, {3026.0: 4}, {3029.
0: 4}, {3052.0: 4}, {3081.0: 4}, {3090.0: 4}, {3097.0: 4}, {3123.0: 4}, {3126.0: 4},
{3148.0: 4}, {3163.0: 4}, {3179.0: 4}, {3197.0: 4}, {3200.0: 4}, {3201.0: 4}, {3210.
0: 4}, {3233.0: 4}, {3251.0: 4}, {3258.0: 4}, {3287.0: 4}, {3290.0: 4}, {3320.0: 4},
{3329.0: 4}, {3342.0: 4}, {3354.0: 4}, {3370.0: 4}, {3378.0: 4}, {3386.0: 4}, {3391.
0: 4}, {3392.0: 4}, {3393.0: 4}, {3432.0: 4}, {3437.0: 4}, {3463.0: 4}, {3490.0: 4},
{3498.0: 4}, {3534.0: 4}, {3539.0: 4}, {3545.0: 4}, {3547.0: 4}, {3574.0: 4}, {3589.
0: 4}, {3613.0: 4}, {3618.0: 4}, {3637.0: 4}, {3652.0: 4}, {3661.0: 4}, {3674.0: 4},
{3721.0: 4}, {3734.0: 4}, {3737.0: 4}, {3740.0: 4}, {3754.0: 4}, {3764.0: 4}, {3776.
0: 4}, {3778.0: 4}, {3781.0: 4}, {3783.0: 4}, {3788.0: 4}, {3805.0: 4}, {3811.0: 4},
{3826.0: 4}, {3834.0: 4}, {3835.0: 4}, {3850.0: 4}, {3856.0: 4}, {3857.0: 4}, {3865.
0: 4}, {3871.0: 4}, {3926.0: 4}, {3935.0: 5}, {3952.0: 5}, {3969.0: 5}, {3976.0: 5},
{4021.0: 5}, {4026.0: 5}, {4031.0: 5}, {4044.0: 5}, {4066.0: 5}, {4089.0: 5}, {4091.
0: 5}, {4165.0: 5}, {4178.0: 5}, {4234.0: 5}, {4248.0: 5}, {4255.0: 5}, {4259.0: 5},
{4292.0: 5}, {4296.0: 5}, {4304.0: 5}, {4305.0: 5}, {4311.0: 5}, {4319.0: 5}, {4329.
0: 5}, {4335.0: 5}, {4349.0: 5}, {4356.0: 5}, {4375.0: 5}, {4388.0: 5}, {4429.0: 5},
{4435.0: 5}, {4436.0: 5}, {4438.0: 5}, {4454.0: 5}, {4475.0: 5}, {4496.0: 5}, {4513.
0: 5}, {4521.0: 5}, {4533.0: 5}, {4547.0: 5}, {4550.0: 5}, {4559.0: 5}, {4573.0: 5},
{4599.0: 5}, {4608.0: 5}, {4609.0: 5}, {4611.0: 5}, {4687.0: 5}, {4691.0: 5}, {4735.
0: 5}, {4737.0: 5}, {4740.0: 5}, {4750.0: 5}, {4752.0: 5}, {4797.0: 5}, {4805.0: 5},
{4808.0: 5}, {4822.0: 5}, {4840.0: 5}, {4863.0: 5}, {4867.0: 5}, {4869.0: 5}, {4870.
0: 5}, {4897.0: 5}, {4936.0: 5}, {4937.0: 5}, {4940.0: 5}, {4947.0: 5}, {4951.0: 5},
{4965.0: 5}, {5002.0: 5}, {5013.0: 5}, {5016.0: 5}, {5038.0: 5}, {5039.0: 5}, {5041.
0: 5}, {5067.0: 5}, {5083.0: 5}, {5085.0: 5}, {5097.0: 6}, {5133.0: 6}, {5156.0: 6},
{5179.0: 6}, {5188.0: 6}, {5209.0: 6}, {5234.0: 6}, {5265.0: 6}, {5281.0: 6}, {5282.
0: 6}, {5297.0: 6}, {5309.0: 6}, {5325.0: 6}, {5331.0: 6}, {5333.0: 6}, {5347.0: 6},
{5354.0: 6}, {5360.0: 6}, {5368.0: 6}, {5399.0: 6}, {5416.0: 6}, {5439.0: 6}, {5461.
0: 6}, {5479.0: 6}, {5484.0: 6}, {5496.0: 6}, {5521.0: 6}, {5523.0: 6}, {5651.0: 6},
{5813.0: 6}, {5982.0: 6}, {6557.0: 6}, {8359.0: 6}, {8463.0: 6}, {8576.0: 6}, {8745.
0: 6}, {8809.0: 6}, {9082.0: 6}, {9531.0: 6}, {9541.0: 6}, {9620.0: 6}, {9666.0: 6},
{9764.0: 6}, {9784.0: 6}, {9809.0: 6}, {9810.0: 6}, {9818.0: 6}, {9828.0: 6}, {9829.
0: 6}, {10121.0: 6}, {10122.0: 6}, {10128.0: 6}, {10646.0: 6}, {10650.0: 6}, {10675.
0: 6}, {10737.0: 6}, {10739.0: 6}, {10741.0: 6}, {10758.0: 6}, {10765.0: 6}, {10776.
0: 6}, {10800.0: 6}, {10912.0: 6}, {10955.0: 6}, {11741.0: 6}, {11763.0: 6}, {11794.
0: 6}, {11806.0: 6}, {11808.0: 6}, {11811.0: 6}, {11814.0: 6}, {11838.0: 6}, {11857.
0: 6}, {11948.0: 6}, {11963.0: 6}, {12978.0: 6}, {13088.0: 6}, {13108.0: 6}, {13200.
0: 6}, {13335.0: 7}, {13704.0: 7}, {13757.0: 7}, {13899.0: 7}, {13983.0: 7}, {14061.
0: 7}, {14118.0: 7}, {14485.0: 7}, {14849.0: 7}, {15814.0: 7}, {15837.0: 7}, {15893.
0: 7}, {15999.0: 7}, {16120.0: 7}, {16133.0: 7}, {16136.0: 7}, {16149.0: 7}, {16150.
0: 7}, {16262.0: 7}, {16415.0: 7}, {16475.0: 7}, {16508.0: 7}, {16615.0: 7}, {16624.
0: 7}, {16660.0: 7}, {16707.0: 7}, {16725.0: 7}, {16726.0: 7}, {16844.0: 7}, {16882.
0: 7}, {16942.0: 7}, {16960.0: 7}, {16963.0: 7}, {17023.0: 7}, {17083.0: 7}, {17094.
0: 7}, {17095.0: 7}, {17099.0: 7}, {17408.0: 7}, {17519.0: 7}, {17675.0: 7}, {17885.
0: 7}, {17891.0: 7}, {18169.0: 7}, {18232.0: 7}, {18529.0: 7}, {18543.0: 7}, {18553.
0: 7}, {18700.0: 7}, {18732.0: 7}, {18825.0: 7}, {18828.0: 7}, {18944.0: 7}, {18946.
0: 7}, {19016.0: 7}, {19305.0: 7}, {19582.0: 7}, {19610.0: 7}, {19676.0: 7}, {19804.
0: 7}, {19810.0: 7}, {19944.0: 7}, {20004.0: 7}, {20047.0: 7}, {20160.0: 7}, {20270.
0: 7}, {20565.0: 7}, {20577.0: 7}, {20686.0: 7}, {20710.0: 7}, {20963.0: 7}, {20976.
0: 7}, {21012.0: 7}]

In [ ]:

You might also like