You are on page 1 of 10

How to Accelerate Large Data Analytics on Laptop | bodo.ai https://medium.com/bodo-ai/bodo-and-pandas-how-to-acceler...

1 of 10 11/16/2021, 9:04 AM
How to Accelerate Large Data Analytics on Laptop | bodo.ai https://medium.com/bodo-ai/bodo-and-pandas-how-to-acceler...

2 of 10 11/16/2021, 9:04 AM
How to Accelerate Large Data Analytics on Laptop | bodo.ai https://medium.com/bodo-ai/bodo-and-pandas-how-to-acceler...

%%px

3 of 10 11/16/2021, 9:04 AM
How to Accelerate Large Data Analytics on Laptop | bodo.ai https://medium.com/bodo-ai/bodo-and-pandas-how-to-acceler...

import pandas as pd
import time


def read_data():
    """Load the 2019 taxi Parquet file with pandas and report how long it took.

    Prints the elapsed wall-clock time in seconds.

    Returns:
        The DataFrame produced by ``pd.read_parquet``.
    """
    start_time = time.time()
    df_pandas = pd.read_parquet("nyc_taxi_2019.parquet")
    print("Reading time: ", time.time()-start_time, "seconds")
    return df_pandas


# Load once at cell level so the later cells can reuse the frame.
df_pandas = read_data()

Reading time: 228.91828107833862 seconds

1 %%px
2
3 import pandas as pd
4 import bodo
5 import time
6
7 @bodo.jit
8 def read_data():
9 start_time = time.time()
10 df_bodo = pd.read_parquet("taxi_2019.parquet")
11 print("Reading time: ", time.time()-start_time, "seconds")
12 return df_bodo
13

[stdout:0]
Reading time: 49.204203798999515 seconds

4 of 10 11/16/2021, 9:04 AM
How to Accelerate Large Data Analytics on Laptop | bodo.ai https://medium.com/bodo-ai/bodo-and-pandas-how-to-acceler...

groupby()

groupby()

groupby()

def grouping(df_pandas):
    """Run three group-by aggregations and report how long they took.

    Computes, in order:
      * the number of ``VendorID`` entries per ``passenger_count``,
      * the number of ``VendorID`` entries per ``payment_type``,
      * the mean ``fare_amount`` per ``passenger_count``.

    Prints the elapsed wall-clock time in seconds.

    Args:
        df_pandas: DataFrame with the columns ``passenger_count``,
            ``payment_type``, ``VendorID`` and ``fare_amount``.

    Returns:
        Tuple ``(rides_count, pt_count, fare_avg_ct)`` holding the three
        aggregation results in the order listed above.
    """
    start_time = time.time()
    rides_count = df_pandas.groupby("passenger_count")["VendorID"].count()
    pt_count = df_pandas.groupby("payment_type")["VendorID"].count()
    fare_avg_ct = df_pandas.groupby("passenger_count")["fare_amount"].mean()
    print("Execution time: ", time.time()-start_time, "seconds")
    return rides_count, pt_count, fare_avg_ct

# Run the aggregations on the frame loaded earlier and keep all three results.
rides_count, pt_count, fare_avg_ct = grouping(df_pandas)

Execution time: 21.10689616203308 seconds

5 of 10 11/16/2021, 9:04 AM
How to Accelerate Large Data Analytics on Laptop | bodo.ai https://medium.com/bodo-ai/bodo-and-pandas-how-to-acceler...

1 %%px
2
3 @bodo.jit
4 def grouping(df_bodo):
5 start_time = time.time()
6 rides_count = df_bodo.groupby("passenger_count")["VendorID"].count()
7 pt_count = df_bodo.groupby("payment_type")["VendorID"].count()
8 fare_avg_ct = df_bodo.groupby("passenger_count")["fare_amount"].mean()
9 print("Execution time: ", time.time()-start_time, "seconds")
10 return rides_count, pt_count, fare_avg_ct
11

[stdout:0]
Execution time: 5.038841708999826 seconds

def filter_rides(df_pandas):
    """Convert the pickup column to datetimes, filter late rides and save them.

    The ``tpep_pickup_datetime`` column of the passed frame is overwritten
    with its ``pd.to_datetime`` conversion, rows whose pickup time is greater
    than ``"2019-05-31"`` are written to ``rides_after_may.csv`` (without the
    index), and the elapsed wall-clock time in seconds is printed.

    Args:
        df_pandas: DataFrame with a ``tpep_pickup_datetime`` column.

    Returns:
        The same ``df_pandas`` object (all rows, with the converted column),
        not the filtered subset.
    """
    start_time = time.time()
    df_pandas["tpep_pickup_datetime"] = pd.to_datetime(df_pandas["tpep_pickup_datetime"])
    # NOTE(review): the bound is compared as 2019-05-31 00:00:00, so pickups
    # later on May 31 also pass the filter -- confirm that is intended.
    filtered_rides = df_pandas[df_pandas["tpep_pickup_datetime"]>"2019-05-31"]
    filtered_rides.to_csv("rides_after_may.csv", index=False)
    print("Execution time: ", time.time() - start_time, "seconds")
    return df_pandas

# Rebind the frame so later cells see the datetime-converted pickup column.
df_pandas = filter_rides(df_pandas)

6 of 10 11/16/2021, 9:04 AM
How to Accelerate Large Data Analytics on Laptop | bodo.ai https://medium.com/bodo-ai/bodo-and-pandas-how-to-acceler...

Execution time: 531.8111879825592 seconds

1 %%px
2
3 @bodo.jit
4 def filter_rides(df_bodo):
5 start_time = time.time()
6 df_bodo["tpep_pickup_datetime"] = pd.to_datetime(df_bodo["tpep_pickup_datetime"
7 filtered_rides = df_bodo[df_bodo["tpep_pickup_datetime"]>"2019-05-31"]
8 filtered_rides.to_csv("rides_after_may_bodo.csv", index=False)
9 print("Execution time: ", time.time() - start_time, "seconds")
10 return df_bodo
11

[stdout:0]
Execution time: 300.6326244680022 seconds

apply()

apply()

apply()

7 of 10 11/16/2021, 9:04 AM
How to Accelerate Large Data Analytics on Laptop | bodo.ai https://medium.com/bodo-ai/bodo-and-pandas-how-to-acceler...

def UDF(df_pandas):
    """Derive year, weekday and hour columns with per-element ``apply`` calls.

    Adds three columns to the passed frame, each computed by applying a
    lambda to every value of ``tpep_pickup_datetime``:
      * ``year``    -- ``t.year``
      * ``weekday`` -- ``t.weekday()``
      * ``hour``    -- ``t.hour``

    Prints the elapsed wall-clock time in seconds.

    Args:
        df_pandas: DataFrame whose ``tpep_pickup_datetime`` column holds
            datetime-like values exposing ``year``, ``weekday()`` and ``hour``.

    Returns:
        The same ``df_pandas`` object with the three columns added.
    """
    start_time = time.time()
    df_pandas["year"] = df_pandas["tpep_pickup_datetime"].apply(lambda t: t.year)
    df_pandas["weekday"] = df_pandas["tpep_pickup_datetime"].apply(lambda t: t.weekday())
    df_pandas["hour"] = df_pandas["tpep_pickup_datetime"].apply(lambda t: t.hour)
    # The recorded output ends in "seconds", so the unit is printed here too.
    print("Execution time: ", time.time()-start_time, "seconds")
    return df_pandas

# Rebind the frame so it carries the new year/weekday/hour columns.
df_pandas = UDF(df_pandas)

Execution time: 1075.6187331676483 seconds

1 %%px
2
3 @bodo.jit
4 def UDF(df_bodo):
5 start_time = time.time()
6 df_bodo["year"] = df_bodo["tpep_pickup_datetime"].apply(lambda t: t.year)
7 df_bodo["weekday"] = df_bodo["tpep_pickup_datetime"].apply(lambda t: t.weekday
8 df_bodo["hour"] = df_bodo["tpep_pickup_datetime"].apply(lambda t: t.hour)
9 print("Execution time: ", time.time()-start_time)
10 return df_bodo
11

[stdout:0]
Execution time: 21.2050147129994 seconds

bodo.jit()

8 of 10 11/16/2021, 9:04 AM
How to Accelerate Large Data Analytics on Laptop | bodo.ai https://medium.com/bodo-ai/bodo-and-pandas-how-to-acceler...

%%px

9 of 10 11/16/2021, 9:04 AM
How to Accelerate Large Data Analytics on Laptop | bodo.ai https://medium.com/bodo-ai/bodo-and-pandas-how-to-acceler...

10 of 10 11/16/2021, 9:04 AM

You might also like