
# -*- coding: utf-8 -*-

"""AirplanesThatHitBirds.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1R0Bu4RKXvQ6pD4CHYGUMY2zgaPaen2eR

Machine Learning Strikes

# By Christal O’Connell, Brian James, Kyle Oakes, and Nick D’Orazio

Report link:
https://docs.google.com/document/d/1T4C5eoncfHiw_rG8Fxh5gg41L_Xd_K_43C0EpAznW1s/edit?usp=sharing

Read the Kaggle CSV file. Please upload your Kaggle API key JSON file when
prompted. You can obtain one with a Kaggle account by creating an API token
under the "Account" settings.
"""

#1
!pip install kaggle

from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
    print('User uploaded file "{name}" with length {length} bytes'.format(
        name=fn, length=len(uploaded[fn])))

# Then move kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

!kaggle datasets download -d faa/wildlife-strikes

!unzip /content/wildlife-strikes.zip

import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
from scipy.stats import zscore
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score

basepath = "/content/"
df = pd.read_csv(basepath + "database.csv")

"""Checking for NaN values"""

#2
df.info()

"""The sum of all NaN values in the DataFrame"""

#3
df.isna().sum().sum()

#4
df.describe()

"""The data took to long to run so we cut it down to only the years after 2014.
Historic data older than 2014 will not be used."""

#5
df2 = df[(df['Incident Year'] > 2000) & (df['Incident Year'] < 2015)].copy()
df2

"""Checked new row count and NaN values by column"""

#6
df2.info()

"""Checked reduced number of NaN values in the DataFrame"""

#7
df2.isna().sum().sum()

#8
df2.describe()

#9
#Nick
unique_operators = df2['Operator'].unique()
df_unique_operators = pd.DataFrame(data=unique_operators, columns=['Operator'])
df_unique_operators

#10
unique_aircraft = df2['Aircraft'].unique()
df_unique_aircraft = pd.DataFrame(data=unique_aircraft, columns=['Aircraft'])
df_unique_aircraft

#11
unique_airport = df2['Airport'].unique()
df_unique_airport = pd.DataFrame(data=unique_airport, columns=['Airport'])
df_unique_airport

operator_type = df2['Operator'].mask(df2['Operator'] != 'MILITARY', 'COMMERCIAL', inplace=False)
unique_operator_type = operator_type.unique()
df_unique_operator_type = pd.DataFrame(data=unique_operator_type, columns=['Operator Type'])
# Build a value -> integer-code dictionary and add the encoded column to df2.
# The same pattern is repeated below for the other categorical columns.
operator_type_dict = pd.Series(df_unique_operator_type.index,
                               index=df_unique_operator_type['Operator Type'].values).to_dict()
df2['Unique Operator Type Value'] = operator_type.map(operator_type_dict)
df2

#12
operator_dict = pd.Series(df_unique_operators.index,
                          index=df_unique_operators.Operator.values).to_dict()
df2['Unique Operator Value'] = df2['Operator'].map(operator_dict)

#13
aircraft_dict = pd.Series(df_unique_aircraft.index,
                          index=df_unique_aircraft.Aircraft.values).to_dict()
df2['Unique Aircraft Value'] = df2['Aircraft'].map(aircraft_dict)

#14
airport_dict = pd.Series(df_unique_airport.index,
                         index=df_unique_airport.Airport.values).to_dict()
df2['Unique Airport Value'] = df2['Airport'].map(airport_dict)

unique_aircraft_type = df2['Aircraft Type'].unique()
df_unique_aircraft_type = pd.DataFrame(data=unique_aircraft_type, columns=['Aircraft Type'])
aircraft_type_dict = pd.Series(df_unique_aircraft_type.index,
                               index=df_unique_aircraft_type['Aircraft Type'].values).to_dict()
df2['Unique Aircraft Type Value'] = df2['Aircraft Type'].map(aircraft_type_dict)

unique_aircraft_make = df2['Aircraft Make'].unique()
df_unique_aircraft_make = pd.DataFrame(data=unique_aircraft_make, columns=['Aircraft Make'])
aircraft_make_dict = pd.Series(df_unique_aircraft_make.index,
                               index=df_unique_aircraft_make['Aircraft Make'].values).to_dict()
df2['Unique Aircraft Make Value'] = df2['Aircraft Make'].map(aircraft_make_dict)

unique_engine_type = df2['Engine Type'].unique()
df_unique_engine_type = pd.DataFrame(data=unique_engine_type, columns=['Engine Type'])
engine_type_dict = pd.Series(df_unique_engine_type.index,
                             index=df_unique_engine_type['Engine Type'].values).to_dict()
df2['Unique Engine Type Value'] = df2['Engine Type'].map(engine_type_dict)

unique_state = df2['State'].unique()
df_unique_state = pd.DataFrame(data=unique_state, columns=['State'])
state_dict = pd.Series(df_unique_state.index,
                       index=df_unique_state['State'].values).to_dict()
df2['Unique State Value'] = df2['State'].map(state_dict)

unique_faa = df2['FAA Region'].unique()
df_faa = pd.DataFrame(data=unique_faa, columns=['FAA Region'])
faa_dict = pd.Series(df_faa.index, index=df_faa['FAA Region'].values).to_dict()
df2['Unique FAA Region Value'] = df2['FAA Region'].map(faa_dict)

unique_warning = df2['Warning Issued'].unique()
unique_warning = pd.DataFrame(data=unique_warning, columns=['Warning Issued'])
warning_dict = pd.Series(unique_warning.index,
                         index=unique_warning['Warning Issued'].values).to_dict()
df2['Unique Warning Issued Value'] = df2['Warning Issued'].map(warning_dict)

unique_flight_phase = df2['Flight Phase'].unique()
unique_flight_phase = pd.DataFrame(data=unique_flight_phase, columns=['Flight Phase'])
flight_phase_dict = pd.Series(unique_flight_phase.index,
                              index=unique_flight_phase['Flight Phase'].values).to_dict()
df2['Unique Flight Phase Value'] = df2['Flight Phase'].map(flight_phase_dict)

unique_visibility = df2['Visibility'].unique()
unique_visibility = pd.DataFrame(data=unique_visibility, columns=['Visibility'])
visibility_dict = pd.Series(unique_visibility.index,
                            index=unique_visibility['Visibility'].values).to_dict()
df2['Unique Visibility Value'] = df2['Visibility'].map(visibility_dict)

unique_precipitation = df2['Precipitation'].unique()
unique_precipitation = pd.DataFrame(data=unique_precipitation, columns=['Precipitation'])
precipitation_dict = pd.Series(unique_precipitation.index,
                               index=unique_precipitation['Precipitation'].values).to_dict()
df2['Unique Precipitation Value'] = df2['Precipitation'].map(precipitation_dict)

unique_species_name = df2['Species Name'].unique()
unique_species_name = pd.DataFrame(data=unique_species_name, columns=['Species Name'])
species_name_dict = pd.Series(unique_species_name.index,
                              index=unique_species_name['Species Name'].values).to_dict()
df2['Unique Species Name Value'] = df2['Species Name'].map(species_name_dict)

unique_flight_impact = df2['Flight Impact'].unique()
unique_flight_impact = pd.DataFrame(data=unique_flight_impact, columns=['Flight Impact'])
flight_impact_dict = pd.Series(unique_flight_impact.index,
                               index=unique_flight_impact['Flight Impact'].values).to_dict()
df2['Unique Flight Impact Value'] = df2['Flight Impact'].map(flight_impact_dict)
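"""The cells above build an order-of-appearance integer code for each categorical
column by hand. As a sketch only (not used below), pandas can produce the same kind
of encoding in one call per column with pd.factorize; note that factorize encodes
missing values as -1 rather than leaving them as NaN, so it is not a byte-for-byte
replacement for the manual mapping."""

for col in ['Operator', 'Aircraft', 'Airport', 'Flight Phase', 'Species Name']:
    codes, uniques = pd.factorize(df2[col])  # codes in order of first appearance
    print(col, '->', len(uniques), 'unique values')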


#15
df2 = df2.drop(['Fatalities', 'Injuries'], axis=1)
df2.head(50)

df2.info()

df_counts = df2.groupby('Incident Month').count()
df_counts

# numeric_only=True keeps only numeric columns, which recent pandas requires for mean().
df_mean = df2.groupby('Incident Month').mean(numeric_only=True)
df_mean

operator_value_counts = df2['Operator'].value_counts()
operator_value_counts

aircraft_value_counts = df2['Aircraft'].value_counts()
aircraft_value_counts

airport_value_counts = df2['Airport'].value_counts()
airport_value_counts

month_value_counts = df2['Incident Month'].value_counts()
month_value_counts

pivot_aircraft_by_month = pd.pivot_table(data=df2,
                                         index='Incident Month',
                                         columns='Aircraft',
                                         aggfunc='count')
pivot_aircraft_by_month

pivot_operator_by_month = pd.pivot_table(data=df2,
                                         index='Incident Month',
                                         columns='Operator',
                                         aggfunc='count')
pivot_operator_by_month

pivot_airport_by_month = pd.pivot_table(data=df2,
                                        index='Incident Month',
                                        columns='Airport',
                                        aggfunc='count')
pivot_airport_by_month

"""Below is a set of scatterplots we've used to check for correlations among the
number of collisions and the columns 'Incident Month', 'Unique Operator Value',
'Unique Aircraft Value', and 'Unique Airport Value'.
Conclusions:
* There seems to be a positive correlation between 'Incident Month' and each
other column considered. This is likely due to an increase in the number of
collisions reported.
"""

sns.pairplot(df2, vars=['Record ID', 'Incident Month', 'Unique Operator Value',
                        'Unique Aircraft Value', 'Unique Airport Value'], kind="scatter");

#sns.jointplot(data=df2, x="Incident Month", y="Unique Aircraft Value", kind="kde")
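"""As a numeric companion to the scatterplots above, a sketch of the pairwise
correlation coefficients for the same columns. The 'Unique ... Value' columns are
arbitrary integer codes, so these correlations should be read with caution."""

df2[['Incident Month', 'Unique Operator Value', 'Unique Aircraft Value',
     'Unique Airport Value']].corr()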

df3 = df[['Operator', 'Aircraft', 'Airport ID', 'Incident Month', 'Incident Year',
          'Incident Day']].copy()
# The following few lines would only need to be present if we were predicting
# Aircraft or Airport ID.
#df3 = df3.loc[df['Aircraft'] != 'UNKNOWN']
#df3 = df3.loc[df['Airport ID'] != 'UNKN']
#df3 = df3.loc[df['Airport ID'] != 'ZZZZ']
df3 = df3.dropna(axis=0)  # dropna returns a new DataFrame, so assign it back
df3.info()
df3['Incident Month'].value_counts().sort_index()
df3.isna().sum().sum()
df3

collisions = df3['Incident Month'].value_counts().sort_index()

ax = sns.barplot(x=collisions.index, y=collisions)
ax.set_title('Number of Collisions by Month')
ax.set_ylabel('Number of Collisions')
ax.set_xlabel('Month')
plt.show()

# This is another violin idea. The only pairing I could think of is military vs.
# non-military aircraft. To get this, I replace the 'Operator' strings with either
# 'MILITARY' or 'COMMERCIAL'.
df5 = df2[['Record ID', 'Unique Operator Value', 'Operator', 'Incident Month',
           'Unique Airport Value']].copy()
# Assign the masked result back; Series.mask(..., inplace=True) on a column
# selection may not update the DataFrame.
df5['Operator'] = df5['Operator'].mask(df5['Operator'] != 'MILITARY', 'COMMERCIAL')
fig = plt.gcf()
ax = sns.violinplot(data=df5, x="Operator", y="Incident Month", hue="Operator",
                    split=False, inner="box", dodge=False, scale="count")

ax.legend(title= 'Type of Aircraft', loc='upper center', edgecolor='red', ncol=2)


fig.set_size_inches(12, 8)
plt.show();

# Predict the month that a military or commercial aircraft hit a bird


# kNN Classification
predictors = ['Unique Operator Type Value', 'Unique Airport Value']
#'Unique Operator Type Value', 'Unique Airport Value'
target = 'Incident Month'
X = df2[predictors].values
y = df2[target].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35,
                                                    random_state=0)
print("************ Before scaling training data: ************")
print(X_train[:5])
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Print the first 5 rows of X_train again to make sure the data is scaled.
print("************ After scaling training data: ************")
print(X_train[:5])
# kNN Classification
knn = KNeighborsClassifier(n_neighbors=15)
knn.fit(X_train, y_train)
predicted = knn.predict(X_test)
# Baseline Accuracy
baseline_accuracy = pd.Series(y_train).value_counts().iloc[0] / y_train.size
print('baseline accuracy: {0:.3f}'.format(baseline_accuracy))
# kNN Classification Accuracy
accuracy = knn.score(X_test, y_test)
print('classification accuracy: {0:.3f}'.format(accuracy))
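"""cross_val_score was imported above but not used. As a rough check on the single
train/test split, here is a sketch of 5-fold cross-validation for the same kNN
classifier; the pipeline (an addition, via sklearn.pipeline.make_pipeline) rescales
inside each fold so the test fold never leaks into the scaler."""

from sklearn.pipeline import make_pipeline

knn_cv = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=15))
cv_scores = cross_val_score(knn_cv, X, y, cv=5)
print('cross-validated accuracy: {0:.3f} (+/- {1:.3f})'.format(cv_scores.mean(),
                                                               cv_scores.std()))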

# Make predictions and check mean squared error on the test set

knn = KNeighborsRegressor()
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)
# Use the regressor's own predictions (not the classifier's) for the metrics.
mse = ((predictions - y_test)**2).mean()
accuracy = (predictions == y_test).mean()
print('regression accuracy: {0:.3f}'.format(accuracy))
print("MSE: ", mse)

"""Linear Regression 'Height' vs 'Unique Operator Type Value'"""

df2[['Unique Operator Type Value','Distance','Speed','Height']]


predictors0_2 = ['Distance', 'Speed', 'Unique Operator Type Value']
target0_2 = 'Height'

# Copy df2 to df0_2 and drop rows with na values in any predictor/target columns
df0_2 = df2.copy()
for predictor in predictors0_2:
    df0_2 = df0_2[df0_2[predictor].notna()]
df0_2 = df0_2[df0_2[target0_2].notna()]

X = df0_2[predictors0_2].values
y = df0_2[target0_2].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                    random_state=0)

reg = LinearRegression()
reg.fit(X_train,y_train)

predicted = reg.predict(X_test)
# Keyword arguments: newer seaborn no longer accepts two positional data vectors.
sns.scatterplot(x=y_test, y=predicted)
plt.plot((np.min([y_test, predicted]), np.max([y_test, predicted])),
         (np.min([y_test, predicted]), np.max([y_test, predicted])),
         color='black', linewidth=2, linestyle='dashed')
plt.savefig("abc.png")
files.download("abc.png")
# sns.pairplot(df0_2[['Unique Operator Type Value','Distance','Speed','Height']])

r2 = reg.score(X_test, y_test)
print('r-squared value: {:.2f}'.format(r2))

rmse = np.sqrt(((reg.predict(X_test) - y_test)**2).mean())
print("RMSE: {:.2f}".format(rmse))

print('Intercept: {:.2f}'.format(reg.intercept_))
print('Distance coef: {:.2f}'.format(reg.coef_[0]))
print('Speed coef: {:.2f}'.format(reg.coef_[1]))
print('Unique Operator coef: {:.2f}'.format(reg.coef_[2]))
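"""PolynomialFeatures was imported at the top but never used. A sketch of how a
degree-2 polynomial fit on the same predictors could be compared against the
linear model (same train/test split as above):"""

poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

poly_reg = LinearRegression()
poly_reg.fit(X_train_poly, y_train)
print('degree-2 r-squared: {:.2f}'.format(poly_reg.score(X_test_poly, y_test)))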
