Professional Documents
Culture Documents
"""AirplanesThatHitBirds.ipynb
#BY Christal O’Connell, Brian James, Kyle Oakes, and Nick D’Orazio
Report link:
https://docs.google.com/document/d/1T4C5eoncfHiw_rG8Fxh5gg41L_Xd_K_43C0EpAznW1s/edi
t?usp=sharing
Read the Kaggle CSV file. Please upload your Kaggle API key json file when
prompted. You can obtain one with a Kaggle account and "Creating an API Key" under
the "My Account" Settings.
"""
#1
!pip install kaggle
uploaded = files.upload()
for fn in uploaded.keys():
print('User uploaded file "{name}" with length {length} bytes'.format(
name=fn, length=len(uploaded[fn])))
# Then move kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600
~/.kaggle/kaggle.json
!unzip /content/wildlife-strikes.zip
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
from scipy.stats import zscore
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score
basepath = "/content/"
df = pd.read_csv(basepath + "database.csv")
#2
# Structural overview: column names, dtypes, and non-null counts.
df.info()
#3
# Total number of missing values across the whole dataset.
df.isna().sum().sum()
#4
# Summary statistics (count/mean/std/quartiles) for the numeric columns.
df.describe()
"""The data took to long to run so we cut it down to only the years after 2014.
Historic data older than 2014 will not be used."""
#5
# Keep only incidents with 2000 < Incident Year < 2015, on an independent copy
# so later column additions never touch the original frame.
year = df['Incident Year']
df2 = df[(year > 2000) & (year < 2015)].copy()
df2
#6
df2.info()
#7
# Total missing values remaining in the filtered frame.
df2.isna().sum().sum()
#8
df2.describe()
#9
#Nick
# The original copy-pasted the same three-line stanza for each column; one
# helper removes the duplication.
def unique_frame(frame, col):
    """Return a one-column DataFrame of `col`'s unique values, in order of first
    appearance. The RangeIndex doubles as an integer code for each value."""
    return pd.DataFrame(data=frame[col].unique(), columns=[col])

df_unique_operators = unique_frame(df2, 'Operator')
df_unique_operators
#10
df_unique_aircraft = unique_frame(df2, 'Aircraft')
df_unique_aircraft
#11
df_unique_airport = unique_frame(df2, 'Airport')
df_unique_airport
#12 - #14
# Integer-encode the categorical columns. The original repeated one dict/map
# stanza per column, shadowed the built-in `dict`, and referenced several
# unique-value frames that were never built (df_unique_aircraft_type,
# df_unique_engine_type, unique_warning, unique_flight_phase,
# unique_species_name, unique_flight_impact), which raised NameError.
# One helper applied in a loop fixes all of that.
def first_seen_codes(series):
    """Map each value in `series` to the integer index of its first appearance
    (identical to the value's row index in a first-seen unique-value frame)."""
    uniq = series.unique()
    mapping = pd.Series(range(len(uniq)), index=uniq).to_dict()
    return series.map(mapping)

categorical_cols = [
    'Operator', 'Aircraft', 'Airport', 'Aircraft Type', 'Engine Type',
    'State', 'Warning Issued', 'Flight Phase', 'Visibility',
    'Precipitation', 'Species Name', 'Flight Impact',
]
for col in categorical_cols:
    # Produces the same column names as before: 'Unique Operator Value', etc.
    df2['Unique {} Value'.format(col)] = first_seen_codes(df2[col])
#15
# Remove the casualty columns before the downstream analysis.
df2 = df2.drop(columns=['Fatalities', 'Injuries'])
df2.head(50)
df2.info()
# Per-month non-null counts for every column. (The original aliased df2 into a
# new name first, which copied nothing and added no value.)
df_counts = df2.groupby('Incident Month').count()
df_counts
# Per-month means. numeric_only=True is required: since pandas 2.0, .mean() on a
# frame with string columns raises TypeError instead of silently dropping them.
df_mean = df2.groupby('Incident Month').mean(numeric_only=True)
df_mean
# Incident counts per operator, most frequent first.
operator_value_counts = df2['Operator'].value_counts()
operator_value_counts
# Incident counts per aircraft model, most frequent first.
aircraft_value_counts = df2['Aircraft'].value_counts()
aircraft_value_counts
# Incident counts per airport, most frequent first.
airport_value_counts = df2['Airport'].value_counts()
airport_value_counts
# Pivot tables: incident counts per (Incident Month, category) pair. The
# original copy-pasted the same pivot_table call three times; one helper
# removes the duplication.
def monthly_pivot(frame, col):
    """Count incidents for each Incident Month, broken down by `col`."""
    return pd.pivot_table(data=frame,
                          index='Incident Month',
                          columns=col,
                          aggfunc='count')

pivot_aircraft_by_month = monthly_pivot(df2, 'Aircraft')
pivot_aircraft_by_month
pivot_operator_by_month = monthly_pivot(df2, 'Operator')
pivot_operator_by_month
pivot_airport_by_month = monthly_pivot(df2, 'Airport')
pivot_airport_by_month
"""Below is a set of scatterplots we've used to check for correlations among the
number of collisions and the columns 'Incident Month', 'Unique Operator Value',
'Unique Aircraft Value', and 'Unique Airport Value'.
Conclusions:
* There seems to be a positive correlation between 'Incident Month' and each
other column considered. This is likely due to an increase in the number of
collisions reported.
"""
# This is another violin idea: compare Military vs. non-military aircraft.
# To get the pairing, collapse 'Operator' into the two categories
# 'MILITARY' and 'COMMERCIAL'.
df5 = df2[['Record ID', 'Unique Operator Value', 'Operator', 'Incident Month',
           'Unique Airport Value']].copy()
# Relabel every non-MILITARY operator as 'COMMERCIAL'. The original chained
# .mask(..., inplace=True) on a column selection (SettingWithCopy-prone) and
# followed it with a no-op replace('MILITARY' -> 'MILITARY'); a plain
# assignment with .where() does the same job safely.
df5['Operator'] = df5['Operator'].where(df5['Operator'] == 'MILITARY',
                                        'COMMERCIAL')
fig = plt.gcf()
# 'column=' is not a valid violinplot argument and raised an error; removed.
# NOTE(review): scale= is deprecated in seaborn >= 0.13 (use density_norm=);
# kept for compatibility with the version this notebook was written against.
ax = sns.violinplot(data=df5, x="Operator", y="Incident Month", hue="Operator",
                    split=False, inner="box", dodge=False, scale="count")
# Linear regression on the strike records. The cells defining predictors0_2,
# target0_2, and the train/test split were lost from this export (every one of
# those names was undefined); they are reconstructed here from the coefficient
# printout at the bottom — TODO confirm against the report.
predictors0_2 = ['Distance', 'Speed', 'Unique Operator Value']
target0_2 = 'Height'  # presumably the target (see the commented pairplot) — confirm

# Copy df2 to df0_2 and drop rows with NA values in any predictor/target column.
df0_2 = df2.dropna(subset=predictors0_2 + [target0_2]).copy()
X = df0_2[predictors0_2].values
y = df0_2[target0_2].values

# Hold out a test set; the original fit used X_train/y_train without ever
# creating them. random_state pins the split for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)

reg = LinearRegression()
reg.fit(X_train, y_train)
predicted = reg.predict(X_test)

# Predicted vs. actual, with the ideal y = x line for reference.
# Keyword args are required: positional x/y were removed in seaborn >= 0.12.
sns.scatterplot(x=y_test, y=predicted)
lo = np.min([y_test, predicted])
hi = np.max([y_test, predicted])
plt.plot((lo, hi), (lo, hi),
         color='black', linewidth=2, linestyle='dashed')
plt.savefig("abc.png")
files.download("abc.png")
# sns.pairplot(df0_2[['Unique Operator Type Value','Distance','Speed','Height']])

r2 = reg.score(X_test, y_test)
print('r-squared value: {:.2f}'.format(r2))
print('Intercept: {:.2f}'.format(reg.intercept_))  # was mislabeled 'Intersection'
print('Distance: {:.2f}'.format(reg.coef_[0]))
print('Speed coef: {:.2f}'.format(reg.coef_[1]))
print('Unique Operator coef: {:.2f}'.format(reg.coef_[2]))