Professional Documents
Culture Documents
ipynb - Colaboratory
Rainfall prediction
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
get_ipython().run_line_magic('matplotlib', 'inline')
The dataset contains about 10 years of daily weather observations from numerous weather stations across Australia.
In this notebook we will train a model to predict whether it will rain the next day.
# reading data
# Load the weather observations from a CSV file (relative path, so it is
# resolved against the notebook's current working directory).
data = pd.read_csv("weather_data.csv")
# Trailing expression of the cell: displays the (rows, columns) tuple.
data.shape
(145460, 23)
data.head()
https://colab.research.google.com/drive/1XM89JsPOm5UTWy8aQCcyBupVlhFp3m3Y#printMode=true 1/10
21/11/2023, 00:01 Rainfall_prediction.ipynb - Colaboratory
2008-
0 Albury 13.4 22.9 0.6 NaN NaN
12-01
2008-
Exploration
1
12-02
Albury 7.4 25.1 0.0 NaN NaN WN
2008-
2 Albury 12.9 25.7 0.0 NaN NaN WS
print(data.info())
12-03
2008-'pandas.core.frame.DataFrame'>
<class
3 Albury 9.2 28.0 0.0 NaN NaN N
12-04
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 23 columns):
2008-
4# Column Albury Non-Null
17.5 Count
32.3 Dtype1.0 NaN NaN
12-05
--- ------ -------------- -----
5 0rows Date
× 23 columns 145460 non-null object
1 Location 145460 non-null object
2 MinTemp 143975 non-null float64
3 MaxTemp 144199 non-null float64
4 Rainfall 142199 non-null float64
5 Evaporation 82670 non-null float64
6 Sunshine 75625 non-null float64
7 WindGustDir 135134 non-null object
8 WindGustSpeed 135197 non-null float64
9 WindDir9am 134894 non-null object
10 WindDir3pm 141232 non-null object
11 WindSpeed9am 143693 non-null float64
12 WindSpeed3pm 142398 non-null float64
13 Humidity9am 142806 non-null float64
14 Humidity3pm 140953 non-null float64
15 Pressure9am 130395 non-null float64
16 Pressure3pm 130432 non-null float64
17 Cloud9am 89572 non-null float64
18 Cloud3pm 86102 non-null float64
19 Temp9am 143693 non-null float64
20 Temp3pm 141851 non-null float64
21 RainToday 142199 non-null object
22 RainTomorrow 142193 non-null object
dtypes: float64(16), object(7)
memory usage: 25.5+ MB
None
data.describe()
data.describe(include=[object])
unique 3436 49 16 16 16 2 2
data.RainTomorrow.unique()
data.RainToday.unique()
https://colab.research.google.com/drive/1XM89JsPOm5UTWy8aQCcyBupVlhFp3m3Y#printMode=true 2/10
21/11/2023, 00:01 Rainfall_prediction.ipynb - Colaboratory
data.WindGustDir.unique()
array(['W', 'WNW', 'WSW', 'NE', 'NNW', 'N', 'NNE', 'SW', nan, 'ENE',
'SSE', 'S', 'NW', 'SE', 'ESE', 'E', 'SSW'], dtype=object)
data.WindGustDir.value_counts()
W 9915
SE 9418
N 9313
SSE 9216
E 9181
S 9168
WSW 9069
SW 8967
SSW 8736
WNW 8252
NW 8122
ENE 8104
ESE 7372
NE 7133
NNW 6620
NNE 6548
Name: WindGustDir, dtype: int64
data['RainTomorrow'].value_counts().plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x7f4d94962b10>
# Distribution of the MaxTemp column: histogram with a KDE curve on top.
# `sns.distplot` is deprecated; `histplot` with `kde=True` and a
# density-scaled y-axis is the documented replacement for it.
sns.histplot(data['MaxTemp'], kde=True, stat='density')
<matplotlib.axes._subplots.AxesSubplot at 0x7f4d9493be10>
https://colab.research.google.com/drive/1XM89JsPOm5UTWy8aQCcyBupVlhFp3m3Y#printMode=true 3/10
21/11/2023, 00:01 Rainfall_prediction.ipynb - Colaboratory
sns.boxplot(x='RainTomorrow', y="Temp3pm", data=data)
<matplotlib.axes._subplots.AxesSubplot at 0x7f4d947cac50>
<matplotlib.axes._subplots.AxesSubplot at 0x7f4d947c0d90>
Create features
# Parse the date strings (ISO style, e.g. "2008-12-01") into datetimes.
# '%Y' is the four-digit year. The previous '%y' (two-digit year) did not match
# the data and only worked because `infer_datetime_format=True` fell back to
# format inference; that argument is deprecated, so the format is now explicit.
data['Date'] = pd.to_datetime(data['Date'], format='%Y-%m-%d')

# Split the timestamp into separate numeric calendar columns.
data['year'] = data['Date'].dt.year
data['month'] = data['Date'].dt.month
data['day'] = data['Date'].dt.day
Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm ... Pre
1 Albury 7.4 25.1 0.0 NaN NaN WNW 44.0 NNW WSW ...
2 Albury 12.9 25.7 0.0 NaN NaN WSW 46.0 W WSW ...
5 rows × 25 columns
Location 0
WindGustDir 10326
WindDir9am 10566
WindDir3pm 4228
RainToday 3261
RainTomorrow 3267
dtype: int64
https://colab.research.google.com/drive/1XM89JsPOm5UTWy8aQCcyBupVlhFp3m3Y#printMode=true 4/10
21/11/2023, 00:01 Rainfall_prediction.ipynb - Colaboratory
# categorical feature mode imputation
# Restrict the work to categorical columns that actually contain missing values.
categorical_features_with_null = [
    feature for feature in categorical_features if data[feature].isnull().any()
]
for each_feature in categorical_features_with_null:
    # mode() returns a Series (several entries on ties); use the first one.
    mode_val = data[each_feature].mode()[0]
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # a column selection: the in-place form operates on an intermediate object
    # and is not guaranteed to update `data` under pandas copy-on-write.
    data[each_feature] = data[each_feature].fillna(mode_val)
MinTemp 1485
MaxTemp 1261
Rainfall 3261
Evaporation 62790
Sunshine 69835
WindGustSpeed 10263
WindSpeed9am 1767
WindSpeed3pm 3062
Humidity9am 2654
Humidity3pm 4507
Pressure9am 15065
Pressure3pm 15028
Cloud9am 55888
Cloud3pm 59358
Temp9am 1767
Temp3pm 3609
year 0
month 0
day 0
dtype: int64
Outlier treatment
sns.boxplot(x='WindGustSpeed',data=data)
<matplotlib.axes._subplots.AxesSubplot at 0x7f4d946d4490>
data.WindGustSpeed.mean()
40.03523007167319
data.WindGustSpeed.mean()
39.83779225870396
sns.boxplot(x='WindGustSpeed',data=data)
https://colab.research.google.com/drive/1XM89JsPOm5UTWy8aQCcyBupVlhFp3m3Y#printMode=true 5/10
21/11/2023, 00:01 Rainfall_prediction.ipynb - Colaboratory
<matplotlib.axes._subplots.AxesSubplot at 0x7f4d946cc110>
Feature engineering
data.head()
MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustSpeed WindSpeed9am WindSpeed3pm Humidity9am Humidity3pm
0 13.4 22.9 0.6 5.318667 7.611178 44.0 20.0 24.0 71.0 22.0
1 7.4 25.1 0.0 5.318667 7.611178 44.0 4.0 22.0 44.0 25.0
2 12.9 25.7 0.0 5.318667 7.611178 46.0 19.0 26.0 38.0 30.0
3 9.2 28.0 0.0 5.318667 7.611178 24.0 11.0 9.0 45.0 16.0
4 17.5 32.3 1.0 5.318667 7.611178 41.0 7.0 20.0 82.0 33.0
# plt.figure(figsize=(20,20))
# sns.heatmap(data.corr(), linewidths=0.5, annot=False, fmt=".2f", cmap = 'viridis')
# Separate the target column from the predictors.
y = data['RainTomorrow']
X = data.drop(columns=['RainTomorrow'])
y_train.value_counts()
0 90857
1 25511
Name: RainTomorrow, dtype: int64
y_test.value_counts()
0 22726
1 6366
Name: RainTomorrow, dtype: int64
X_test = scaler.transform(X_test)
LogisticRegression(random_state=0, solver='liblinear')
https://colab.research.google.com/drive/1XM89JsPOm5UTWy8aQCcyBupVlhFp3m3Y#printMode=true 6/10
21/11/2023, 00:01 Rainfall_prediction.ipynb - Colaboratory
from sklearn.metrics import roc_curve

# Class-probability estimates for the test split; column 1 is used as the
# score for the positive class when building the ROC curve.
y_pred_logreg_proba = classifier_logreg.predict_proba(X_test)
fpr, tpr, thresholds = roc_curve(y_test, y_pred_logreg_proba[:, 1])

# ROC curve (green) against the chance diagonal (dashed black).
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(fpr, tpr, '-g', linewidth=1)
ax.plot([0, 1], [0, 1], 'k--')
ax.set_title('ROC curve for Logistic Regression Model')
ax.set_xlabel("False Positive Rate")
ax.set_ylabel('True Positive Rate')
plt.show()

# Threshold at which (TPR - FPR) is largest; left as the trailing expression
# so the notebook displays it.
best_index = np.argmax(tpr - fpr)
thresholds[best_index]
0.18556315429146203
# Scores for class 1 are in the second column of the predict_proba output.
pred_proba = y_pred_logreg_proba[:, 1]
# Hard labels: 1 where the score exceeds 0.18, else 0.
# NOTE(review): 0.18 is presumably the rounded (TPR - FPR)-maximising threshold
# computed from the ROC curve - confirm.
preds = (pred_proba > 0.18).astype(int)
https://colab.research.google.com/drive/1XM89JsPOm5UTWy8aQCcyBupVlhFp3m3Y#printMode=true 7/10
21/11/2023, 00:01 Rainfall_prediction.ipynb - Colaboratory
Decision trees
# Decision-tree classifier: max_depth=8 caps the depth of the tree and
# random_state=0 fixes the estimator's random seed.
classifier_dt = DecisionTreeClassifier(max_depth=8,random_state=0)
# Fit on the training split; as the trailing expression of the cell, the
# fitted estimator's repr is what the notebook displays.
classifier_dt.fit(X_train, y_train)
DecisionTreeClassifier(max_depth=8, random_state=0)
array([[0.95227766, 0.04772234],
[0.69072165, 0.30927835],
[0.90505079, 0.09494921],
...,
[0.62937063, 0.37062937],
[0.83574879, 0.16425121],
[0.96869176, 0.03130824]])
# Second column of the decision tree's probability output (class 1 scores).
# NOTE(review): the source variable is named "logreg" but, given the "_dt"
# suffix and this section, presumably holds the decision-tree output - confirm.
pred_proba_dt = y_pred_logreg_proba_dt[:, 1]
# Hard labels: 1 where the score exceeds 0.70, else 0.
preds_dt = (pred_proba_dt > 0.70).astype(int)
https://colab.research.google.com/drive/1XM89JsPOm5UTWy8aQCcyBupVlhFp3m3Y#printMode=true 8/10
21/11/2023, 00:01 Rainfall_prediction.ipynb - Colaboratory
Random Forest
from sklearn.metrics import roc_curve

# Class-probability estimates from the random-forest model on the test split;
# the second column (class 1) is the score used for the ROC curve and labels.
y_pred_logreg_proba_rf = classifier_rf.predict_proba(X_test)
pred_proba_rf = y_pred_logreg_proba_rf[:, 1]
fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_test, pred_proba_rf)

# ROC curve (green) against the chance diagonal (dashed black).
fig_rf, ax_rf = plt.subplots(figsize=(6, 4))
ax_rf.plot(fpr_rf, tpr_rf, '-g', linewidth=1)
ax_rf.plot([0, 1], [0, 1], 'k--')
ax_rf.set_title('ROC curve for Random forest Model')
ax_rf.set_xlabel("False Positive Rate")
ax_rf.set_ylabel('True Positive Rate')
plt.show()

# Hard labels: 1 where the score exceeds 0.70, else 0.
preds_rf = (pred_proba_rf > 0.70).astype(int)
https://colab.research.google.com/drive/1XM89JsPOm5UTWy8aQCcyBupVlhFp3m3Y#printMode=true 9/10
21/11/2023, 00:01 Rainfall_prediction.ipynb - Colaboratory
https://colab.research.google.com/drive/1XM89JsPOm5UTWy8aQCcyBupVlhFp3m3Y#printMode=true 10/10