You are on page 1of 16
5728725, 612 PM In [1]: In [2]: out [2]: In [3]: out [3]: ‘Aitlines Delay Prediction -Jupyter Notebook import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import nunpy as ap sns.set_theme(color_codes=True) df = pd.read_csv(‘airlines_delay.csv') df-head() Flight Time Length Airline AirportFrom AirportTo DayOfWeek Class 0 23130 12960 1410 © OL ATL HOU 1 0 1 6948.0 360.0 1480 00 cos ORD 40 2 12470 11700 1430 BB Bos LT 30 3 310 14100 3440 Us ose PHX 6 0 4 5630 6920 980 FL em ATL s 0 Exploratory Data Analysis sns.countplot(data=df, x="Airline", hue="Class") #HN Airlines are the only airline that often delayed than other airlines 200 75 150 12% 100 15 50 oy oO DL 00 B6 US FL WN CO AA YV EV XE 9E OH UA MQ AS F9 HA Airline R Length locahost 8888inotebooks/Aitines Delay Prediction jpynb 216 3726723, 612 PM {Ailes Delay Prediction -Jupyter Notebook In [5]: sns.barplot(data=df, x="Airline", Time") #lnost all of the airplane have the sane departure time Out[5]: 800 700 600 500 2 E 400 00 200 100 0 DL OO B6 US FL WN CO AA YV EV XE 9E OH UA MQ AS F9 HA Airline Data Preprocessing In [6]: #Remove flight ID df = df.drop("Flight', axis=1) df.head() out[s): Time Length Aine AirportFrom ArportTo DayOfWeek Class 0 1960 1410 OL a HOU ry 1 3600 1450 00 cos orD 40 2 11700 1430 86 os cur 30 3 1100 340 Us OSG PAK 6 0 46920 980 FL eM aTL 40 In [7]: df [‘AirLine* ] unique() out(7): array({'DL', 00°, "BS", "US', “FL', "WN', "CO", "AR', "W', EV", "XE", "96", ‘OH, “UA, locahost 8888inotebooks/Aitines Delay Prediction jpynb MQ’, 'AS', "FO", ‘HA'], dtypesobject) 36 5728725, 612 PM In [8]: out [8]: In [9]: out [9]: ‘Aitlines Delay Prediction - Jupyter Notebook from sklearn import preprocessing Label_encoder df[‘Airline* df[ Airline’ ].unique() array([ 5, 12, 9) 3,14, 8, 15, 4, dF['AirportFron® ].unique() array([‘ATL', "COS', "BOS", '0G6", “CRW, "LGB", "BIS', 'CLT", FLL", "SAN, "BHM", "ROC", “EYW', "IND", "JFK", ‘ORD', ‘PHL’, "BZN", ‘GRB", 'MBS', "BUF, 'RIC', "SEA", 'POX", PSE", "CPR", 'SNA', 'STL", MEM", "KOA", "ELP', 'SJU', GUC", 'NKE", "CAE", 'GRR', "ROD", 'OKC", ‘ITO", 'SIC*, ABQ’, 'SME", "PAV", ‘ABI", DAY", "RNO", "PVD", ‘ALB', LEX", "XNA", 'GIT", "CMH, “GRK", "NCI", ‘TXK', 'LRD', ‘Tus’, "ROA", ‘NOD", "INU", "BTV", "FCA", "GNV', 'RAP", "HOU", "BOI", "CRP*, 'BRO", “GeT', "KIN", 'ICT', "SAF", MAF", "HPN, ‘AVP", 'AZO", AMA, 'NHK', ‘ISP", 'CID', LET", 'TOL", "ECP", 'PSG", “FAL', "ASE", "BTR", 'BQK", FUG", 'TLH', "SOF", 'BFL", EWN", “ILM', ‘OTZ", 'SGU', “CME', "NFR", "JAC", 'DLH', FSM", "COD", *GGG", ‘DEQ’, “TYR', "EGE", "PIH', "VLD", ‘DAB, "HLN, 'PIA', 'SPI', "BGM", "MYR", "HRL", 'MKG", LNK", "BIL", "CYS", 'LCH', STWE, "LMT", *ACT*, 'PLN', “BLI', "SCE", 'MH', 'LYH', “STX", "FLO", "UTM", "CLL", locahost 8888inotebooks/Aitines Delay Prediction jpynb preprocessing. LabelEncoder() Jabel_encoder. fit_transfor(df[ ‘Airline"]) 1,17, 6, 16, @, 11, 13, "BMI, MSY", "EWR", "DEW", STAH', "LAX", "JAX", 'SAV', "DTW", 'STT', "AUS", 'DCA', "PBI', 'SFO', ‘MIA’, “DSM", "SBA', 'TYS', ‘MSP’, DEN’, "LaS', "TAD", "HNL", "BDL", "ev", PIT", "HSV", ‘SGF', "JAN, AEX’, "LGA", 'RSW', "FAR", 'LIT', ‘OMA’, ‘BNA’, "McO", "LBB", 'CSG', ‘OAK", "MSO", 'MFE', 'GEG', "MSN', "CHO", 'ONT', "LIK", "PSP", "G50", 'PSC', "SYR", ‘AVL’, "ABE", 'LWB', ‘ERI", ‘DAL", "SBP", 'CDV', ‘TUL’, ‘FSD’, "MDW", "FWA', "BUR", "PNS', "ATW", "SHV", "SMX", "RDM", "caK", "IDA’, "MQT', "VPS", "TRI", 'GSP', ‘HON’, “MLU', "MOB", 'BGR', 'SRQ', 'MLI', "SBN, "FAT', "ELM', "YUM", "COU", HRY’, "CEC", "CHA', "CHA, ACV", "MGM", "ROW", ‘OTH, 'CHX', “SWF', “BET’, "ABY', 'MTJ', ‘SCC’, "DRO", "GFK’, 'BKG', ‘AGS’, "BTM', 'MEI", "SIT", "MLB", "PAH', ‘occ, "IPL", ‘TV, ‘OAI", "SUN", ‘LSE’, ‘CIC’, ‘OME’, "BON', 'WRG', ‘BRW', SPS‘, "ACY, "ADK", "SIT", "YK", “GUM, "CDC', *ADQ", *HTS", "ABR’], dtype=obj) 10, 2, "BHI", “cle, "PH" “sic, ‘SATS ‘MoT’, "RDU", ‘NOT, tEw', 'PHE “TPA, “LAN ‘MKT ANC tENT 1RST, “oRF*, “CHS “Eus" EKO", ‘eu, “PH ‘err’, ‘oF, STEX', ‘DH "YAK" “EAU, ‘ITH, 1RKS" “LHS “PLE, 5728725, 612 PM ‘Aitlines Delay Prediction - Jupyter Notebook In [10]: df[‘AirportFrom’]= label_encoder.Fit_transform(df[ "AirportFrom’ ]) df[AirportFron* ].unique() out{10): array({ 16, 135, 27, a1, 22, o1, 205, 262, 227, us, 248, 240, 242, 168, 87, 51, 238, 199, 284, ue, 142, 222, 133, locahost 8888inotebooks/Aitines Delay Prediction jpynb 65, 154, 97, 246, 192, 268, 33, 101, 161, 172, 58, 129, 48, 93, 226, rR, 2u, 249, 88, 140, 164, 7, 219, 35, 147, 139, 285, 225, 146, 94, 1 227, 283, 279, 34, 137, 2B, 247, 229, 255, 3, 220, 281, 28, 8, 270, 203, 245, 148, 197, 67, 10, 231, 196, 152, 165, 107, 68, 193, 181, 100, 103, 218, 274, 287, 201, 23, 259, 105, 32, 58, 208, 73, 264, 159, 204, 178, 157, 8, 106, 38, 288, 141, 98, 275, 63, 108, 176, 86, 156, 144, 286, 198, 104, 213, 240, 268, 241, 143, 111, 290, 167, 42, 17, 54, 56, 292, 251, 272, 64, 257, 25, 37, 168, 59, 96, 243, 253, 43, 7, 174, 258, 195, 114, 92, 102, 256, 178, 198, 57, 24, 23, 113, 186, 200, 289, 31, 31) 80, 27, 183, 234, 221, 124, 173, 277, 61, 75, 15, 263, 138, 26, 98, 52, 122, 77, 212, 131, 39, 258, 45, 238, a4, 252, 132, 184, 155, 76, 120, 14, 230, 232, 20, 267, 45, 6, 62, 12, 291, 185, 266, 189, 69, 85, 261, 214, 254, 47, 70, 236, 224, 280, 175, 209, 2, 187, 41, 180, 179, 30, 74, 271, 235, 169, 160, 269, 216, 153, 233, 19, 202, 228, 273, 237, 109, 116, 278, 89, 36, 239, 145, u, 127, 166, 282, 125, 29, 18, 46, 134, v7, 99, 215, 2, 19, 191, 44, 151, 12a, 158, 66, 123, 82, 40, 218, 55, 163, 49, 60, 78, 117, 128, 158, 162, 53, 182, 149, 223, 136, 126, 276, 194, 95, 4, 81, 265, 206, 5, 5116 5728725, 612 PM In [11]: df[‘AirportTo" ].unique() out[11): array(["HOU', ‘ORD', "CLT", "PHX", "ATL "PBI", ‘NCO’, 'SFO", 'NKE', OAK", "TAH", "PSP", "DCA", 'LGA’, "Sle", 'G]T", "LAK", ‘ONT', 'RNO", "JAX", “CAE, ‘ST’, "DAY", "NIA", "CRW", "RDU", "PHL", ‘ABI', ‘MOB", "BNA', 'MCI', 'MSY", RIC’, 'LIH", 'ABQ", "es6", ‘SHV', "FAI", *sRQ", "ROC", "CHO", “PHM, CID", ‘TRI, “CEC', ‘TEX', 'MBS", Lea", 'MvR", ‘ALB, "NER", ‘BTR', 'FLG", "PS", ‘ROM", "BZN", STLH', "XNA", "CHAS, "ANA', 'EGE", ‘BET’, MLB, 'SHF', ‘ASE", "LeB", "CHS", "GN", "PNT", "EAU", ‘ILM’, "GRB", ‘FSD", "LRD', "LHS", “AGS', "CK", ACY, "CMI", "INU", SIDA’, 'MFE", "EKO", NTTH', ‘LYH', "BIL", "BIS", 'GFK", 'MTJ", "ROW', ‘PLN’, ‘TWF, MOD", 'STX', ‘OTZ", "ang', 'PSG', 'SIT", “CLL, ‘TOL", *S3T", locahost 8888inotebooks/Aitines Delay Prediction jpynb ‘Aitlines Delay Prediction - Jupyter Notebook "DEN", "CLE, "STL, ‘ves', "6sP" "MoM" 'MAT "SDF, "FLL, JAN’ "act, "ac", "ORF", "DSM", cou", “KIN "DLH', "SBP", "FA, BGM", "NOT", "BRM", "ROD", “ATW AVP’ "ABE", "ENT, "BON', "ELM HTS', reuc’, "BLI', "YUM", *syR", “FAY, "FAR", "eve", "RSW", "FAT, “sav, "Pv", PIT’, “rat, "sat, ‘GIF, “HRL Use", "psc, "cRP', "BFL, "Eu", MSO", “LAN *LcH, "SPL, "MGM" ERI’, "PAH, "SMX", "BOK", “oce', BTM’, “UWB "ABR" ], dtyp “BHM, “BwI', "sau", "MSP", "RKS", "Tpa', "1T0", "aK", “MT *oKc', "BME" "SBN", "EWAN, "Tus", "LET'S “CHAN *6s0", “TxK', "DRO", ‘ew, "ROK" "uN “IVK', *ROA', "occ", "Tyr", “unr "mer", “oTR', "evs", "PSE", "BGR", "DEW", “HEN "BDL", “BUF, "Bos", "SEA", "rap", *cos", ‘urs “ECP, ‘BV’, “IND', *oma’, “MRK “ELP', “MLE *oae", "koa", "WRG" "AVL “GRK" “OME “mKG" “IPL, “Bor', "DAB" "YAK" *oa', ‘cov', “coc', "scc'y “crc, ject) MEM’, “EWR’ “SAN, "Las"; anc’, "Lex", rer", “DAL, “TUL, “PHF, "RAP", "SG", “POx", “BuR', "AEX", "ces", "HSN", *nor", “DHN' *HSV", "SUN", ‘oTH, *HON", “EW, "FLO", “Tvc", “HUN "sce", “BKG", “ABY", "RST, “eum, "GRR", “aE, ‘pTw, "scu', "SNAY, “SMES, “HNL “Ts, "acy" “aus', "mR" sact', “CHT, “MLu' "CPR, "azo", ‘Ger’, “MAF, “sps', 'PIE', “spat, "1s, "BRO", “SAF, *cop", "ESM, “MM “cL, “IH, “VLD, 089", ‘umm, 66 5728725, 612 PM ‘Aitlines Delay Prediction - Jupyter Notebook In [12]: df[‘AinportTo' ]= label_encoder.fit_transform(df[ ‘AirportTo" ]) dF ‘AirportTo" ]-unique() out(12): array([129, 292, 85, 288, 157, 233, 162, 161, 247, 228, 131, 2, 82, 81, 45, 206, 107, 113, 137, 263, 228, 133, 225, In [13]: df-head() out (13): 208, 60, 45, 130, 227, 78, 99, 235, 262, 47, 182, 100, 279, 6, 2, 146, 139, 254, 56, 278, 158, 91, 179, 41, 68, 283, 266, 13, 25, 196, 210, 141, 165, 231, 34, 105, 178, 89, 176, 201, 239, 222, 49, 225, 286, 59, Time Length Airline 0 12960 4 3600 2 11700 3 100 4 6920 1410S 148012 103 sao 18 08 217, 96, 159, 35, 269, 48, 33, 221, 5, 209, 10, 103, 150, 88, 8, 106, 265, 64, 8, 250, 282, 249, 276, AirportFrom AirportTo DayOtWer 16, 148, 268, 14, 76, 65, 172, 32, 267, 122, 67, 151, 193, 23, 118, 86, 237, z 212, 57, 98, 240, 259, 16 6s 38 203 32 27, 184, 101, 264, 175, 7, 198, 42, 238, 280, 155, 224, 170, 102, 271, 138, 140, 62, 183, 29, 110, 77, 31, Check the Class Value locahost 8888inotebooks/Aifines Delay Prediction pynb 80, 202, 197, 207, 241, 285, 104, 230, 53, 181, 200, 128, 275, 93, 246, 39, 97, 149, 291, 112, 23, 9%, 31) 128 208 60 a7 16 177, 135, 23, 236, 143, 216, 228, 194, 145, 44, 122, 203, 298, 94, 168, 156, 242, 28, 127, 199, 49, 226, us, 58, 153, a7, 134, 1, 204, 78, 244, 188, 66, 195, 72, 19, 34, 144, 168, 92, 189, 37, 4, 257, 213, 273, 255, 121, 136, 190, 87, 256, 109, su, 166, 116, 248, 132, 115, 185, 11, 284, 142, 36, 287, 124, 173, 260, 261, 71, 128, 251, 215, 98, 205, 274, 52, 223, 24, 219, 192, 126, 63, 74, 169, 123, 191, 167, 253, 22, 114, 277, 183, 245, 18, 258, 214, 171, 187, 232, 83, 186, 152, 38, v7, 281, 28, 50, 270, 26, 79, 243, 154, 252, 68, 174, 234, 218, 61, 34, 111, 46, 289, 272, 164, 117, 180, 108, 95, 30, 211, 55, m6 5728725, 612 PM {Ailes Delay Prediction -Jupyter Notebook In [14]: sns.countplot(df[ ‘Class’ ]) df['Class"].value_counts() :\anaconda3\1ib\site-packages\seaborn\_decorators .py:36: FutureWarning: Pass the fo Llowing variable as a keyword arg: x. From version @.12, the only valid positional a rgument will be “data, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings .warn( out(i4): @ 299118 1 240264 Name: Class, dtype: inte4 300000 250000 200000 count 450000 400000 0000 Class In [15]: from sklearn.utils import resample ficreate tho different dataframe of majority and minority class df_majority = df[(df['Class']. df_minority = df[(df[ ‘Class’ ] # upsample minority class df_minority_upsampled = resample(df_minority, replace=True, —_# sample with replacement n_samples= 299118, # to match majority class random_state=0) # reproducible results # Combine majority class with upsampled minority class df_upsanpled = pd.concat([df_minority_upsampled, df_majority]) locahost 8888inotebooks/Aitines Delay Prediction jpynb ane 5728725, 612 PM {Ailes Delay Prediction -Jupyter Notebook In [16]: sns.countplot (4f_upsampled| ‘Class’ ]) df_upsanpled| ‘Class' ].value_counts() :\anaconda3\1ib\site-packages\seaborn\_decorators .py:36: FutureWarning: Pass the fo Llowing variable as a keyword arg: x. From version @.12, the only valid positional a rgument will be “data, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings .warn( out(i6): 1 299118 @ 299118 Name: Class, dtype: intea 300000 250000 200000 count 450000 400000 50000 Class Check the Outliers In [17]: #Remove Outlier using Z-Score Method import scipy.stats as stats Z = np.abs(stats.zscore(df_upsanpled)) data_clean = df upsampled[(z<3).all(axis = 1)] data_clean. shape out (17): (587262, 7) Attribute Correlation locahost 8888inotebooks/Aitines Delay Prediction jpynb 96 5728725, 612 PM ‘Aitlines Delay Prediction - Jupyter Notebook In [18]: sns-heatmap(data_clean.corr(), fm 2") Out[18]: -10 Time -08 Length Airline ba AirportFrom 04 AirportTo we DayOfWeek 00 Crass 02 Time Length Airline Class AitportFrom AirportTo DayOfWeek In [19]: X = data_clean.drop('Class', axis=1) y = data_clean['Class'] In [20]: #test size 2% and train size 80x from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score X_train, X test, y_train, y test = train_test_split(x,y, test_size -2,random_stat Decision Tree Classifier In [21]: from sklearn.tree import DecisionTreeClassifier dtree = DecisionTreeClassifier(randon_state=0) dtree.fit(x_train, y train) Out[21}: DecisionTreeClassifier(random_stat In [22]: y_pred = dtree.predict(x_test) print("Accuracy Score :", round(accuracy_score(y test, y_pred)*100 ,2), "x Accuracy Score : 68.09 % locahost 8888inotebooks/Aitines Delay Prediction jpynb 1016 5728725, 612 PM In [23]: In [34]: out [34]: In [35]: In [36]: In [27]: out [27]: In [28]: In [29]: ‘Aitlines Delay Prediction - Jupyter Notebook from sklearn.metrics import accuracy score, fi_score, precision_score, recall_score print("F-1 Score : *,(fi_score(y_test, y_pred))) print("Precision Score : ', (precision score(y test, y_pred))) print(*Recall Score : *,(recall_score(y test, y_pred))) F-1 Score : 0.6744489030173536 Precision Score : 0.686760413902892 Recall Score : @.6625710336353863 Random Forest Classifier from sklearn.ensenble import RandonForestClassifier rfc = RandonForestClassifier(random_state=2) rfc. Fit(Xtrain, y_train) RandonForestClassifier(randon_stat y_pred = rfc.predict(X_test) print(“Aaccuracy Score :*, round(accuracy_score(y test, y pred)*100 ,2), "X") Accuracy Score : 69.89 % from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score print('F-1 Score : ',(f1_score(y_test, y_pred))) print("Precision Score : ',(precision_score(y test, y_pred))) print(*Recall Score : ',(recal1_score(y test, y_pred))) F-1 Score : 0.7072126886366834 Precision Score : 0,6867303302144625 Recall Score : @.7289544190173893 Logistic Regression from sklearn.linear_model import LogisticRegression Ip = LogisticRegression(randon_state=0) Ir. fit(X_train, y_train) LogisticRegression(randon_state=@) y_pred = Ir.predict(x_test) print("Accuracy Score :", round(accuracy_score(y test, y_pred)*100 ,2), "%") Accuracy Score : 57.52 % from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score print(‘F-1 Score : ‘,(fi_score(y_test, y_pred))) print('Precision Score : ',(precision_score(y test, y pred))) print('Recall Score : *,(recall_score(y test, y_pred))) F-1 Score : @.572576568858428 Precision Score : @.5749062381722465 Recall Score : @.5702657041929043 locahost 8888inotebooks/Aitines Delay Prediction jpynb ne 5728725, 612 PM In [30]: out [36]: In [31]: In [32]: Iocahost 8888inotebooksiAiines De ‘Aitlines Delay Prediction - Jupyter Notebook AdaBoost Classifier from sklearn.ensenble import AdaBoostClassifier ade = AdaBoostClassifier(randon_state=0) ada.fit(X train, y_train) AdaBoostClassifier(random_state=0) y_pred = ada.predict(x_test) print("Accuracy Score :", round(accuracy_score(y test, y_pred)*100 ,2), “ey Accuracy Score : 62.38 % from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score print('F-1 Score : ',(f1_score(y test, y_pred))) print('Precision Score : ',(precision_score(y test, y_pred))) print('Recall Score : ',(recall_score(y_test, y_pred))) F-1 Score : @.6156910151935504 Precision Score : @.6277844778660613 Recall Score : @,6040546767009676 Visualize Random Forest Classifier 1 Prediction pyr 126 372823, 612 PM Aitines Delay Preto - pyter Notebook In [37]: from sklearn.metrics import confusion matrix cm = confusion matrix(y test, y_pred) plt.figure(figsize=(5,5)) sns-heatmap (data=cm, Linewidths= pit.ylabel( ‘Actual label’) plt.xlabel( ‘Predicted label’) all_sample_title = ‘accuracy Score: (0)' .format(rfe.score(x_test, y_test)*100) plt.title(all_sample title, size = 15) 5, annot=True,square = True, cmap = ‘Blues') Out[37]: Text(@.5, 1.0, ‘Accuracy Scor 69.8866780754855") Accuracy Score: 69.8866780754855 40000 e 35000 g a 30000 § — 25000 ~ 20000 Predicted label locahost 8888inotebooks/Aitines Delay Prediction jpynb 1916 5728725, 612 PM ‘Aitlines Delay Prediction - Jupyter Notebook In [38]: from sklearn.metrics import roc_curve, roc_auc y_pred_proba = rfc.predict_proba(x_test)[:Il:, d¥_actual_predicted = pd.concat({pd.DataFrane(np.array(y test), colunns=["y_actual"]) df_actual_predicted.index = y_test.index for, tpr, tr = roc_curve(df_actual_predicted{ y_actual'], éf_actual_predicted{ 'y_pred auc = roc_auc_score(df_actual_predicted['y_actual'], df_actual_predicted['y_pred_prob: plt.plot(fpr, tpr, label="AUC = %0.4f' Xauc) plt.plot(fpr, fpr, linestyle = '--', color='k') plt.xlabel('False Positive Rate’) plt.ylabel('True Positive Rate’) plt.title('ROC Curve’, size = 15) plt.legend() core Out[38]: ROC Curve 10 08 & True Positive Rate 02 00 or) 02 a4 06 08 1.0 False Positive Rate Iocahost 8888inotebooksiAiines De 1 Prediction pyr 146 5728725, 612 PM In [39]: out [39]: In [41]: {Ailes Delay Prediction -Jupyter Notebook ‘#Feature Importance imp_dt = pd.DataFrame({ “Feature Name": X_train.colunns, “Importance: rfc. feature_importances_ » Fi = imp_df.sort_values(by="Inportance", ascending=False) fi Feature Name Importance ° Time 0264597 5 Dayomeck 0.239242 Longin 0.177447 AiporFrom 0.110214 Aitline 0.108437 ArportTa 0.100123, #12 = fichead(1e) plt. figure(figsize=(10,8)) sns.barplot(data=fi2, x='Importance’, y="Feature Name‘) plt.title(‘Top Feature Importance Each Attributes (Random Forest)’, fontsize=18) plt.xlabel (‘Inportance', fontsize=16) plt.ylabel (“Feature Name’, fontsize=16) plt.show() Top Feature Importance Each Attributes (Random Forest) Time Davoreek 5 3 AipotFrom Feature Name ane ‘AaportTo ° 0 005 010 or 020 028 Importance locahost 8888inotebooks/Aitines Delay Prediction jpynb 1916 5728725, 612 PM {Ailes Delay Prediction -Jupyter Notebook locahost 8888inotebooks/Aitines Delay Prediction jpynb 16116

You might also like