
# -*- coding: utf-8 -*-

"""AirplanesThatHitBirds.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1R0Bu4RKXvQ6pD4CHYGUMY2zgaPaen2eR

Machine Learning Strikes

# By Christal O’Connell, Brian James, Kyle Oakes, and Nick D’Orazio

Report link:
https://docs.google.com/document/d/1T4C5eoncfHiw_rG8Fxh5gg41L_Xd_K_43C0EpAznW1s/edit?usp=sharing

Read the Kaggle CSV file. Please upload your Kaggle API key JSON file when
prompted. You can obtain one with a Kaggle account by creating an API token
under the "Account" settings.
"""

#1
!pip install kaggle

from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
    print('User uploaded file "{name}" with length {length} bytes'.format(
        name=fn, length=len(uploaded[fn])))

# Then move kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

!kaggle datasets download -d faa/wildlife-strikes

!unzip /content/wildlife-strikes.zip

import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
from scipy.stats import zscore
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score

basepath = "/content/"
df = pd.read_csv(basepath + "database.csv")

"""Checking for NaN values"""

#2
df.info()

"""The sum of all NaN values in the DataFrame"""

#3
df.isna().sum().sum()

#4
df.describe()

"""The data took to long to run so we cut it down to only the years after 2014.
Historic data older than 2014 will not be used."""

#5
df2 = df[(df['Incident Year'] > 2000) & (df['Incident Year'] < 2015)].copy()
df2

"""Checked new row count and NaN values by column"""

#6
df2.info()

"""Checked reduced number of NaN values in the DataFrame"""

#7
df2.isna().sum().sum()

#8
df2.describe()

#9
#Nick
unique_operators = df2['Operator'].unique()
df_unique_operators = pd.DataFrame(data=unique_operators, columns=['Operator'])
df_unique_operators

#10
unique_aircraft = df2['Aircraft'].unique()
df_unique_aircraft = pd.DataFrame(data=unique_aircraft, columns=['Aircraft'])
df_unique_aircraft

#11
unique_airport = df2['Airport'].unique()
df_unique_airport = pd.DataFrame(data=unique_airport, columns=['Airport'])
df_unique_airport

operator_type = df2['Operator'].mask(df2['Operator'] != 'MILITARY', 'COMMERCIAL', inplace=False)
unique_operator_type = operator_type.unique()
df_unique_operator_type = pd.DataFrame(data=unique_operator_type, columns=['Operator Type'])
# Build a value -> integer-code dictionary and add the encoded column to df2.
# The same pattern is repeated below for the other categorical columns.
operator_type_dict = pd.Series(df_unique_operator_type.index,
                               index=df_unique_operator_type['Operator Type'].values).to_dict()
df2['Unique Operator Type Value'] = operator_type.map(operator_type_dict)
df2

#12
operator_dict = pd.Series(df_unique_operators.index,
                          index=df_unique_operators.Operator.values).to_dict()
df2['Unique Operator Value'] = df2['Operator'].map(operator_dict)

#13
aircraft_dict = pd.Series(df_unique_aircraft.index,
                          index=df_unique_aircraft.Aircraft.values).to_dict()
df2['Unique Aircraft Value'] = df2['Aircraft'].map(aircraft_dict)

#14
airport_dict = pd.Series(df_unique_airport.index,
                         index=df_unique_airport.Airport.values).to_dict()
df2['Unique Airport Value'] = df2['Airport'].map(airport_dict)

unique_aircraft_type = df2['Aircraft Type'].unique()
df_unique_aircraft_type = pd.DataFrame(data=unique_aircraft_type, columns=['Aircraft Type'])
aircraft_type_dict = pd.Series(df_unique_aircraft_type.index,
                               index=df_unique_aircraft_type['Aircraft Type'].values).to_dict()
df2['Unique Aircraft Type Value'] = df2['Aircraft Type'].map(aircraft_type_dict)

unique_aircraft_make = df2['Aircraft Make'].unique()
df_unique_aircraft_make = pd.DataFrame(data=unique_aircraft_make, columns=['Aircraft Make'])
aircraft_make_dict = pd.Series(df_unique_aircraft_make.index,
                               index=df_unique_aircraft_make['Aircraft Make'].values).to_dict()
df2['Unique Aircraft Make Value'] = df2['Aircraft Make'].map(aircraft_make_dict)

unique_engine_type = df2['Engine Type'].unique()
df_unique_engine_type = pd.DataFrame(data=unique_engine_type, columns=['Engine Type'])
engine_type_dict = pd.Series(df_unique_engine_type.index,
                             index=df_unique_engine_type['Engine Type'].values).to_dict()
df2['Unique Engine Type Value'] = df2['Engine Type'].map(engine_type_dict)

unique_state = df2['State'].unique()
df_unique_state = pd.DataFrame(data=unique_state, columns=['State'])
state_dict = pd.Series(df_unique_state.index,
                       index=df_unique_state['State'].values).to_dict()
df2['Unique State Value'] = df2['State'].map(state_dict)

unique_faa = df2['FAA Region'].unique()
df_faa = pd.DataFrame(data=unique_faa, columns=['FAA Region'])
faa_dict = pd.Series(df_faa.index, index=df_faa['FAA Region'].values).to_dict()
df2['Unique FAA Region Value'] = df2['FAA Region'].map(faa_dict)

unique_warning = df2['Warning Issued'].unique()
unique_warning = pd.DataFrame(data=unique_warning, columns=['Warning Issued'])
warning_dict = pd.Series(unique_warning.index,
                         index=unique_warning['Warning Issued'].values).to_dict()
df2['Unique Warning Issued Value'] = df2['Warning Issued'].map(warning_dict)

unique_flight_phase = df2['Flight Phase'].unique()
unique_flight_phase = pd.DataFrame(data=unique_flight_phase, columns=['Flight Phase'])
flight_phase_dict = pd.Series(unique_flight_phase.index,
                              index=unique_flight_phase['Flight Phase'].values).to_dict()
df2['Unique Flight Phase Value'] = df2['Flight Phase'].map(flight_phase_dict)

unique_visibility = df2['Visibility'].unique()
unique_visibility = pd.DataFrame(data=unique_visibility, columns=['Visibility'])
visibility_dict = pd.Series(unique_visibility.index,
                            index=unique_visibility['Visibility'].values).to_dict()
df2['Unique Visibility Value'] = df2['Visibility'].map(visibility_dict)

unique_precipitation = df2['Precipitation'].unique()
unique_precipitation = pd.DataFrame(data=unique_precipitation, columns=['Precipitation'])
precipitation_dict = pd.Series(unique_precipitation.index,
                               index=unique_precipitation['Precipitation'].values).to_dict()
df2['Unique Precipitation Value'] = df2['Precipitation'].map(precipitation_dict)

unique_species_name = df2['Species Name'].unique()
unique_species_name = pd.DataFrame(data=unique_species_name, columns=['Species Name'])
species_name_dict = pd.Series(unique_species_name.index,
                              index=unique_species_name['Species Name'].values).to_dict()
df2['Unique Species Name Value'] = df2['Species Name'].map(species_name_dict)

unique_flight_impact = df2['Flight Impact'].unique()
unique_flight_impact = pd.DataFrame(data=unique_flight_impact, columns=['Flight Impact'])
flight_impact_dict = pd.Series(unique_flight_impact.index,
                               index=unique_flight_impact['Flight Impact'].values).to_dict()
df2['Unique Flight Impact Value'] = df2['Flight Impact'].map(flight_impact_dict)
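"""The cells above build an order-of-appearance integer code for each categorical
column by hand. As a sketch only (not used below), pandas can produce the same kind
of encoding in one call per column with pd.factorize; note that factorize encodes
missing values as -1 rather than leaving them as NaN, so it is not a byte-for-byte
replacement for the manual mapping."""

for col in ['Operator', 'Aircraft', 'Airport', 'Flight Phase', 'Species Name']:
    codes, uniques = pd.factorize(df2[col])  # codes in order of first appearance
    print(col, '->', len(uniques), 'unique values')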


#15
df2 = df2.drop(['Fatalities', 'Injuries'], axis=1)
df2.head(50)

df2.info()

df_counts = df2.groupby('Incident Month').count()
df_counts

# numeric_only=True keeps only numeric columns, which recent pandas requires for mean().
df_mean = df2.groupby('Incident Month').mean(numeric_only=True)
df_mean

operator_value_counts = df2['Operator'].value_counts()
operator_value_counts

aircraft_value_counts = df2['Aircraft'].value_counts()
aircraft_value_counts

airport_value_counts = df2['Airport'].value_counts()
airport_value_counts

month_value_counts = df2['Incident Month'].value_counts()
month_value_counts

pivot_aircraft_by_month = pd.pivot_table(data=df2,
                                         index='Incident Month',
                                         columns='Aircraft',
                                         aggfunc='count')
pivot_aircraft_by_month

pivot_operator_by_month = pd.pivot_table(data=df2,
                                         index='Incident Month',
                                         columns='Operator',
                                         aggfunc='count')
pivot_operator_by_month

pivot_airport_by_month = pd.pivot_table(data=df2,
                                        index='Incident Month',
                                        columns='Airport',
                                        aggfunc='count')
pivot_airport_by_month

"""Below is a set of scatterplots we've used to check for correlations among the
number of collisions and the columns 'Incident Month', 'Unique Operator Value',
'Unique Aircraft Value', and 'Unique Airport Value'.
Conclusions:
* There seems to be a positive correlation between 'Incident Month' and each
other column considered. This is likely due to an increase in the number of
collisions reported.
"""

sns.pairplot(df2, vars=['Record ID', 'Incident Month', 'Unique Operator Value',
                        'Unique Aircraft Value', 'Unique Airport Value'], kind="scatter");

#sns.jointplot(data=df2, x="Incident Month", y="Unique Aircraft Value", kind="kde")
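"""As a numeric companion to the scatterplots above, a sketch of the pairwise
correlation coefficients for the same columns. The 'Unique ... Value' columns are
arbitrary integer codes, so these correlations should be read with caution."""

df2[['Incident Month', 'Unique Operator Value', 'Unique Aircraft Value',
     'Unique Airport Value']].corr()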

df3 = df[['Operator', 'Aircraft', 'Airport ID', 'Incident Month', 'Incident Year',
          'Incident Day']].copy()
# The following few lines would only need to be present if we were predicting
# Aircraft or Airport ID.
#df3 = df3.loc[df['Aircraft'] != 'UNKNOWN']
#df3 = df3.loc[df['Airport ID'] != 'UNKN']
#df3 = df3.loc[df['Airport ID'] != 'ZZZZ']
df3 = df3.dropna(axis=0)  # dropna returns a new DataFrame, so assign it back
df3.info()
df3['Incident Month'].value_counts().sort_index()
df3.isna().sum().sum()
df3

collisions = df3['Incident Month'].value_counts().sort_index()

ax = sns.barplot(x=collisions.index, y=collisions)
ax.set_title('Number of Collisions by Month')
ax.set_ylabel('Number of Collisions')
ax.set_xlabel('Month')
plt.show()

# This is another violin idea. The only pairing I could think of is military vs.
# non-military aircraft. To get this, I replace the 'Operator' strings with either
# 'MILITARY' or 'COMMERCIAL'.
df5 = df2[['Record ID', 'Unique Operator Value', 'Operator', 'Incident Month',
           'Unique Airport Value']].copy()
# Assign the masked result back; Series.mask(..., inplace=True) on a column
# selection may not update the DataFrame.
df5['Operator'] = df5['Operator'].mask(df5['Operator'] != 'MILITARY', 'COMMERCIAL')
fig = plt.gcf()
ax = sns.violinplot(data=df5, x="Operator", y="Incident Month", hue="Operator",
                    split=False, inner="box", dodge=False, scale="count")

ax.legend(title= 'Type of Aircraft', loc='upper center', edgecolor='red', ncol=2)


fig.set_size_inches(12, 8)
plt.show();

# Predict the month that a military or commercial aircraft hit a bird


# kNN Classification
predictors = ['Unique Operator Type Value', 'Unique Airport Value']
#'Unique Operator Type Value', 'Unique Airport Value'
target = 'Incident Month'
X = df2[predictors].values
y = df2[target].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35,
                                                    random_state=0)
print("************ Before scaling training data: ************")
print(X_train[:5])
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Print the first 5 rows of X_train again to make sure the data is scaled.
print("************ After scaling training data: ************")
print(X_train[:5])
# kNN Classification
knn = KNeighborsClassifier(n_neighbors=15)
knn.fit(X_train, y_train)
predicted = knn.predict(X_test)
# Baseline Accuracy
baseline_accuracy = pd.Series(y_train).value_counts().iloc[0] / y_train.size
print('baseline accuracy: {0:.3f}'.format(baseline_accuracy))
# kNN Classification Accuracy
accuracy = knn.score(X_test, y_test)
print('classification accuracy: {0:.3f}'.format(accuracy))
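"""cross_val_score was imported above but not used. As a rough check on the single
train/test split, here is a sketch of 5-fold cross-validation for the same kNN
classifier; the pipeline (an addition, via sklearn.pipeline.make_pipeline) rescales
inside each fold so the test fold never leaks into the scaler."""

from sklearn.pipeline import make_pipeline

knn_cv = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=15))
cv_scores = cross_val_score(knn_cv, X, y, cv=5)
print('cross-validated accuracy: {0:.3f} (+/- {1:.3f})'.format(cv_scores.mean(),
                                                               cv_scores.std()))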

# Make predictions and check mean squared error on the test set

knn = KNeighborsRegressor()
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)
# Use the regressor's own predictions (not the classifier's) for the metrics.
mse = ((predictions - y_test)**2).mean()
accuracy = (predictions == y_test).mean()
print('regression accuracy: {0:.3f}'.format(accuracy))
print("MSE: ", mse)

"""Linear Regression 'Height' vs 'Unique Operator Type Value'"""

df2[['Unique Operator Type Value','Distance','Speed','Height']]


predictors0_2 = ['Distance', 'Speed', 'Unique Operator Type Value']
target0_2 = 'Height'

# Copy df2 to df0_2 and drop rows with na values in any predictor/target columns
df0_2 = df2.copy()
for predictor in predictors0_2:
    df0_2 = df0_2[df0_2[predictor].notna()]
df0_2 = df0_2[df0_2[target0_2].notna()]

X = df0_2[predictors0_2].values
y = df0_2[target0_2].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                    random_state=0)

reg = LinearRegression()
reg.fit(X_train,y_train)

predicted = reg.predict(X_test)
# Keyword arguments: newer seaborn no longer accepts two positional data vectors.
sns.scatterplot(x=y_test, y=predicted)
plt.plot((np.min([y_test, predicted]), np.max([y_test, predicted])),
         (np.min([y_test, predicted]), np.max([y_test, predicted])),
         color='black', linewidth=2, linestyle='dashed')
plt.savefig("abc.png")
files.download("abc.png")
# sns.pairplot(df0_2[['Unique Operator Type Value','Distance','Speed','Height']])

r2 = reg.score(X_test, y_test)
print('r-squared value: {:.2f}'.format(r2))

rmse = np.sqrt(((reg.predict(X_test) - y_test)**2).mean())
print("RMSE: {:.2f}".format(rmse))

print('Intercept: {:.2f}'.format(reg.intercept_))
print('Distance coef: {:.2f}'.format(reg.coef_[0]))
print('Speed coef: {:.2f}'.format(reg.coef_[1]))
print('Unique Operator coef: {:.2f}'.format(reg.coef_[2]))
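"""PolynomialFeatures was imported at the top but never used. A sketch of how a
degree-2 polynomial fit on the same predictors could be compared against the
linear model (same train/test split as above):"""

poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

poly_reg = LinearRegression()
poly_reg.fit(X_train_poly, y_train)
print('degree-2 r-squared: {:.2f}'.format(poly_reg.score(X_test_poly, y_test)))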
