You are on page 1 of 21

# Importing Necessary Libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

#Importing dataset
ipl_df = pd.read_csv('/content/ipl_data.csv')
print(f"Dataset successfully Imported of Shape : {ipl_df.shape}")

Dataset successfully Imported of Shape : (76014, 15)

# First 5 rows of the data


ipl_df.head()

{"summary":"{\n \"name\": \"ipl_df\",\n \"rows\": 76014,\n


\"fields\": [\n {\n \"column\": \"mid\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
178,\n \"min\": 1,\n \"max\": 617,\n
\"num_unique_values\": 617,\n \"samples\": [\n 50,\n
582,\n 83\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"date\",\n \"properties\": {\n \"dtype\": \"object\",\n
\"num_unique_values\": 442,\n \"samples\": [\n \"2014-
05-02\",\n \"2012-05-15\",\n \"2009-05-18\"\n
],\n \"semantic_type\": \"\",\n \"description\": \"\"\n
}\n },\n {\n \"column\": \"venue\",\n \"properties\":
{\n \"dtype\": \"category\",\n \"num_unique_values\":
35,\n \"samples\": [\n \"Shaheed Veer Narayan Singh
International Stadium\",\n \"Buffalo Park\",\n \"Dr.
Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"bat_team\",\n \"properties\":
{\n \"dtype\": \"category\",\n \"num_unique_values\":
14,\n \"samples\": [\n \"Pune Warriors\",\n
\"Rising Pune Supergiants\",\n \"Kolkata Knight Riders\"\n
],\n \"semantic_type\": \"\",\n \"description\": \"\"\n
}\n },\n {\n \"column\": \"bowl_team\",\n
\"properties\": {\n \"dtype\": \"category\",\n
\"num_unique_values\": 14,\n \"samples\": [\n \"Kochi
Tuskers Kerala\",\n \"Rising Pune Supergiants\",\n
\"Royal Challengers Bangalore\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"batsman\",\n \"properties\":
{\n \"dtype\": \"category\",\n \"num_unique_values\":
411,\n \"samples\": [\n \"A Nehra\",\n \"A
Symonds\",\n \"DJ Bravo\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"bowler\",\n \"properties\":
{\n \"dtype\": \"category\",\n \"num_unique_values\":
329,\n \"samples\": [\n \"IK Pathan\",\n \"AB
McDonald\",\n \"JM Kemp\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"runs\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 48,\n \"min\": 0,\n
\"max\": 263,\n \"num_unique_values\": 252,\n
\"samples\": [\n 106,\n 21,\n 97\
n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"wickets\",\n \"properties\": {\n \"dtype\": \"number\",\
n \"std\": 2,\n \"min\": 0,\n \"max\": 10,\n
\"num_unique_values\": 11,\n \"samples\": [\n 5,\n
0,\n 9\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"overs\",\n \"properties\": {\n \"dtype\": \"number\",\n
\"std\": 5.772586873852113,\n \"min\": 0.0,\n \"max\":
19.6,\n \"num_unique_values\": 140,\n \"samples\": [\n
17.6,\n 11.1,\n 5.1\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"runs_last_5\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
14,\n \"min\": 0,\n \"max\": 113,\n
\"num_unique_values\": 102,\n \"samples\": [\n 37,\n
18,\n 74\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"wickets_last_5\",\n \"properties\": {\n \"dtype\":
\"number\",\n \"std\": 1,\n \"min\": 0,\n
\"max\": 7,\n \"num_unique_values\": 8,\n \"samples\":
[\n 1,\n 5,\n 0\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"striker\",\n \"properties\":
{\n \"dtype\": \"number\",\n \"std\": 20,\n
\"min\": 0,\n \"max\": 175,\n \"num_unique_values\":
155,\n \"samples\": [\n 95,\n 160,\n
80\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"non-striker\",\n \"properties\": {\n \"dtype\":
\"number\",\n \"std\": 10,\n \"min\": 0,\n
\"max\": 109,\n \"num_unique_values\": 88,\n
\"samples\": [\n 69,\n 0,\n 18\n ],\
n \"semantic_type\": \"\",\n \"description\": \"\"\n
}\n },\n {\n \"column\": \"total\",\n \"properties\":
{\n \"dtype\": \"number\",\n \"std\": 29,\n
\"min\": 67,\n \"max\": 263,\n \"num_unique_values\":
138,\n \"samples\": [\n 132,\n 115,\n
154\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n }\n ]\
n}","type":"dataframe","variable_name":"ipl_df"}

# Describing the dataset


ipl_df.describe()

{"summary":"{\n \"name\": \"ipl_df\",\n \"rows\": 8,\n \"fields\":


[\n {\n \"column\": \"mid\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 6155.813307577099,\n
\"min\": 1.0,\n \"max\": 17478.0,\n
\"num_unique_values\": 8,\n \"samples\": [\n
71.40788419727657,\n 72.0,\n 17478.0\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"runs\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 6150.2944315706645,\n
\"min\": 0.0,\n \"max\": 17477.0,\n
\"num_unique_values\": 8,\n \"samples\": [\n
74.54814899582308,\n 70.0,\n 17477.0\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"wickets\",\n \"properties\":
{\n \"dtype\": \"number\",\n \"std\":
6177.954883141603,\n \"min\": 0.0,\n \"max\": 17477.0,\n
\"num_unique_values\": 8,\n \"samples\": [\n
2.6637866910797046,\n 2.0,\n 17477.0\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"overs\",\n \"properties\": {\
n \"dtype\": \"number\",\n \"std\": 6175.826215003261,\n
\"min\": 0.0,\n \"max\": 17477.0,\n
\"num_unique_values\": 8,\n \"samples\": [\n
9.75828803570407,\n 9.6,\n 17477.0\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"runs_last_5\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
6167.329477872403,\n \"min\": 0.0,\n \"max\": 17477.0,\n
\"num_unique_values\": 8,\n \"samples\": [\n
32.71310865709218,\n 34.0,\n 17477.0\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"wickets_last_5\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
6178.484772156044,\n \"min\": 0.0,\n \"max\": 17477.0,\n
\"num_unique_values\": 7,\n \"samples\": [\n 17477.0,\
n 1.1984322252102764,\n 2.0\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"striker\",\n \"properties\":
{\n \"dtype\": \"number\",\n \"std\":
6165.786654907889,\n \"min\": 0.0,\n \"max\": 17477.0,\n
\"num_unique_values\": 8,\n \"samples\": [\n
24.57475539280197,\n 20.0,\n 17477.0\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"non-striker\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
6174.403089883773,\n \"min\": 0.0,\n \"max\": 17477.0,\n
\"num_unique_values\": 8,\n \"samples\": [\n
7.979802025519254,\n 4.0,\n 17477.0\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"total\",\n \"properties\": {\
n \"dtype\": \"number\",\n \"std\": 6129.802688655688,\n
\"min\": 28.746713170397086,\n \"max\": 17477.0,\n
\"num_unique_values\": 8,\n \"samples\": [\n
159.5532986210448,\n 161.0,\n 17477.0\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n }\n ]\n}","type":"dataframe"}

# Information about Each Column


ipl_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17478 entries, 0 to 17477
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 mid 17478 non-null int64
1 date 17478 non-null object
2 venue 17478 non-null object
3 bat_team 17478 non-null object
4 bowl_team 17478 non-null object
5 batsman 17478 non-null object
6 bowler 17478 non-null object
7 runs 17477 non-null float64
8 wickets 17477 non-null float64
9 overs 17477 non-null float64
10 runs_last_5 17477 non-null float64
11 wickets_last_5 17477 non-null float64
12 striker 17477 non-null float64
13 non-striker 17477 non-null float64
14 total 17477 non-null float64
dtypes: float64(8), int64(1), object(6)
memory usage: 2.0+ MB

# Number of Unique Values in each column


ipl_df.nunique()

mid 142
date 102
venue 20
bat_team 8
bowl_team 8
batsman 214
bowler 155
runs 225
wickets 11
overs 139
runs_last_5 85
wickets_last_5 7
striker 129
non-striker 59
total 84
dtype: int64

# Data types of all columns


ipl_df.dtypes

mid int64
date object
venue object
bat_team object
bowl_team object
batsman object
bowler object
runs float64
wickets float64
overs float64
runs_last_5 float64
wickets_last_5 float64
striker float64
non-striker float64
total float64
dtype: object

#Wickets Distribution
sns.displot(ipl_df['wickets'],kde=False,bins=10)
plt.title("Wickets Distribution")

plt.show()
#Runs Distribution
sns.displot(ipl_df['total'],kde=False,bins=10)
plt.title("Runs Distribution")

plt.show()
# Names of all columns
ipl_df.columns

Index(['mid', 'date', 'venue', 'bat_team', 'bowl_team', 'batsman',


'bowler',
'runs', 'wickets', 'overs', 'runs_last_5', 'wickets_last_5',
'striker',
'non-striker', 'total'],
dtype='object')

# Columns this notebook treats as irrelevant and removes before modelling.
irrelevant = ['mid', 'date', 'venue', 'batsman', 'bowler', 'striker',
              'non-striker']
print(f'Before Removing Irrelevant Columns : {ipl_df.shape}')
ipl_df = ipl_df.drop(irrelevant, axis=1)  # Drop Irrelevant Columns
print(f'After Removing Irrelevant Columns : {ipl_df.shape}')
ipl_df.head()

# Define Consistent Teams


# Teams kept for modelling; rows involving any other team are discarded.
const_teams = [
    'Kolkata Knight Riders',
    'Chennai Super Kings',
    'Rajasthan Royals',
    'Mumbai Indians',
    'Kings XI Punjab',
    'Royal Challengers Bangalore',
    'Delhi Daredevils',
    'Sunrisers Hyderabad',
]

print(f'Before Removing Inconsistent Teams : {ipl_df.shape}')

# Keep a row only when BOTH the batting and the bowling side are in the list.
ipl_df = ipl_df[(ipl_df['bat_team'].isin(const_teams)) &
                (ipl_df['bowl_team'].isin(const_teams))]
# The label previously read "Irrelevant Columns", which described the earlier
# column-dropping step rather than this team filter.
print(f'After Removing Inconsistent Teams : {ipl_df.shape}')
print(f"Consistent Teams : \n{ipl_df['bat_team'].unique()}")
ipl_df.head()

Before Removing Inconsistent Teams : (76014, 8)


After Removing Irrelevant Columns : (53811, 8)
Consistent Teams :
['Kolkata Knight Riders' 'Chennai Super Kings' 'Rajasthan Royals'
'Mumbai Indians' 'Kings XI Punjab' 'Royal Challengers Bangalore'
'Delhi Daredevils' 'Sunrisers Hyderabad']

{"summary":"{\n \"name\": \"ipl_df\",\n \"rows\": 53811,\n


\"fields\": [\n {\n \"column\": \"bat_team\",\n
\"properties\": {\n \"dtype\": \"category\",\n
\"num_unique_values\": 8,\n \"samples\": [\n \"Chennai
Super Kings\",\n \"Royal Challengers Bangalore\",\n
\"Kolkata Knight Riders\"\n ],\n \"semantic_type\":
\"\",\n \"description\": \"\"\n }\n },\n {\n
\"column\": \"bowl_team\",\n \"properties\": {\n
\"dtype\": \"category\",\n \"num_unique_values\": 8,\n
\"samples\": [\n \"Kings XI Punjab\",\n \"Chennai
Super Kings\",\n \"Royal Challengers Bangalore\"\n ],\
n \"semantic_type\": \"\",\n \"description\": \"\"\n
}\n },\n {\n \"column\": \"runs\",\n \"properties\":
{\n \"dtype\": \"number\",\n \"std\": 49,\n
\"min\": 0,\n \"max\": 246,\n \"num_unique_values\":
239,\n \"samples\": [\n 66,\n 21,\n
20\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"wickets\",\n \"properties\": {\n \"dtype\": \"number\",\
n \"std\": 2,\n \"min\": 0,\n \"max\": 10,\n
\"num_unique_values\": 11,\n \"samples\": [\n 5,\n
0,\n 9\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"overs\",\n \"properties\": {\n \"dtype\": \"number\",\n
\"std\": 5.773906711218136,\n \"min\": 0.0,\n \"max\":
19.6,\n \"num_unique_values\": 140,\n \"samples\": [\n
17.6,\n 11.1,\n 5.1\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"runs_last_5\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
15,\n \"min\": 0,\n \"max\": 94,\n
\"num_unique_values\": 94,\n \"samples\": [\n 67,\n
45,\n 42\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"wickets_last_5\",\n \"properties\": {\n \"dtype\":
\"number\",\n \"std\": 1,\n \"min\": 0,\n
\"max\": 7,\n \"num_unique_values\": 8,\n \"samples\":
[\n 1,\n 5,\n 0\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"total\",\n \"properties\": {\
n \"dtype\": \"number\",\n \"std\": 30,\n
\"min\": 67,\n \"max\": 246,\n \"num_unique_values\":
128,\n \"samples\": [\n 101,\n 149,\n
187\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n }\n ]\
n}","type":"dataframe","variable_name":"ipl_df"}

print(f'Before Removing Overs : {ipl_df.shape}')


ipl_df = ipl_df[ipl_df['overs'] >= 5.0]
print(f'After Removing Overs : {ipl_df.shape}')
ipl_df.head()

Before Removing Overs : (53811, 8)


After Removing Overs : (40108, 8)

{"summary":"{\n \"name\": \"ipl_df\",\n \"rows\": 40108,\n


\"fields\": [\n {\n \"column\": \"bat_team\",\n
\"properties\": {\n \"dtype\": \"category\",\n
\"num_unique_values\": 8,\n \"samples\": [\n \"Chennai
Super Kings\",\n \"Royal Challengers Bangalore\",\n
\"Kolkata Knight Riders\"\n ],\n \"semantic_type\":
\"\",\n \"description\": \"\"\n }\n },\n {\n
\"column\": \"bowl_team\",\n \"properties\": {\n
\"dtype\": \"category\",\n \"num_unique_values\": 8,\n
\"samples\": [\n \"Kings XI Punjab\",\n \"Chennai
Super Kings\",\n \"Royal Challengers Bangalore\"\n ],\
n \"semantic_type\": \"\",\n \"description\": \"\"\n
}\n },\n {\n \"column\": \"runs\",\n \"properties\":
{\n \"dtype\": \"number\",\n \"std\": 40,\n
\"min\": 13,\n \"max\": 246,\n \"num_unique_values\":
226,\n \"samples\": [\n 71,\n 20,\n
43\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"wickets\",\n \"properties\": {\n \"dtype\": \"number\",\
n \"std\": 1,\n \"min\": 0,\n \"max\": 10,\n
\"num_unique_values\": 11,\n \"samples\": [\n 5,\n
0,\n 9\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"overs\",\n \"properties\": {\n \"dtype\": \"number\",\n
\"std\": 4.32300139521351,\n \"min\": 5.0,\n \"max\":
19.6,\n \"num_unique_values\": 105,\n \"samples\": [\n
10.1,\n 15.6,\n 15.5\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"runs_last_5\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
11,\n \"min\": 10,\n \"max\": 94,\n
\"num_unique_values\": 84,\n \"samples\": [\n 93,\n
59,\n 36\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"wickets_last_5\",\n \"properties\": {\n \"dtype\":
\"number\",\n \"std\": 1,\n \"min\": 0,\n
\"max\": 7,\n \"num_unique_values\": 8,\n \"samples\":
[\n 1,\n 5,\n 0\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"total\",\n \"properties\": {\
n \"dtype\": \"number\",\n \"std\": 29,\n
\"min\": 67,\n \"max\": 246,\n \"num_unique_values\":
128,\n \"samples\": [\n 101,\n 149,\n
187\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n }\n ]\
n}","type":"dataframe","variable_name":"ipl_df"}

from seaborn import heatmap

# bat_team / bowl_team are still string columns at this point, so restrict the
# correlation to numeric columns explicitly. Relying on the implicit default
# emits a FutureWarning and stops working once the default becomes False.
heatmap(data=ipl_df.corr(numeric_only=True), annot=True)

<ipython-input-28-8fde0e13fc28>:2: FutureWarning: The default value of


numeric_only in DataFrame.corr is deprecated. In a future version, it
will default to False. Select only valid columns or specify the value
of numeric_only to silence this warning.
heatmap(data=ipl_df.corr(), annot=True)

<Axes: >
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

le = LabelEncoder()
# Replace the team names in both columns by integer codes. The encoder is
# re-fitted for each column, so the codes only line up across the two columns
# because both contain the same set of teams.
for col in ['bat_team', 'bowl_team']:
    ipl_df[col] = le.fit_transform(ipl_df[col])
ipl_df.head()

{"summary":"{\n \"name\": \"ipl_df\",\n \"rows\": 40108,\n


\"fields\": [\n {\n \"column\": \"bat_team\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
2,\n \"min\": 0,\n \"max\": 7,\n
\"num_unique_values\": 8,\n \"samples\": [\n 0,\n
6,\n 3\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"bowl_team\",\n \"properties\": {\n \"dtype\":
\"number\",\n \"std\": 2,\n \"min\": 0,\n
\"max\": 7,\n \"num_unique_values\": 8,\n \"samples\":
[\n 2,\n 0,\n 6\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"runs\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 40,\n \"min\": 13,\n
\"max\": 246,\n \"num_unique_values\": 226,\n
\"samples\": [\n 71,\n 20,\n 43\
n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"wickets\",\n \"properties\": {\n \"dtype\": \"number\",\
n \"std\": 1,\n \"min\": 0,\n \"max\": 10,\n
\"num_unique_values\": 11,\n \"samples\": [\n 5,\n
0,\n 9\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"overs\",\n \"properties\": {\n \"dtype\": \"number\",\n
\"std\": 4.32300139521351,\n \"min\": 5.0,\n \"max\":
19.6,\n \"num_unique_values\": 105,\n \"samples\": [\n
10.1,\n 15.6,\n 15.5\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"runs_last_5\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
11,\n \"min\": 10,\n \"max\": 94,\n
\"num_unique_values\": 84,\n \"samples\": [\n 93,\n
59,\n 36\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"wickets_last_5\",\n \"properties\": {\n \"dtype\":
\"number\",\n \"std\": 1,\n \"min\": 0,\n
\"max\": 7,\n \"num_unique_values\": 8,\n \"samples\":
[\n 1,\n 5,\n 0\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"total\",\n \"properties\": {\
n \"dtype\": \"number\",\n \"std\": 29,\n
\"min\": 67,\n \"max\": 246,\n \"num_unique_values\":
128,\n \"samples\": [\n 101,\n 149,\n
187\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n }\n ]\
n}","type":"dataframe","variable_name":"ipl_df"}

from sklearn.compose import ColumnTransformer

# One-hot encode the first two columns (bat_team, bowl_team) and pass the
# remaining columns through unchanged, after the encoded ones.
columnTransformer = ColumnTransformer(
    [('encoder', OneHotEncoder(), [0, 1])],
    remainder='passthrough',
)

ipl_df = np.array(columnTransformer.fit_transform(ipl_df))

# Column names for the transformed array: 8 batting-team indicators,
# 8 bowling-team indicators, then the passthrough columns.
cols = [
    'batting_team_Chennai Super Kings',
    'batting_team_Delhi Daredevils',
    'batting_team_Kings XI Punjab',
    'batting_team_Kolkata Knight Riders',
    'batting_team_Mumbai Indians',
    'batting_team_Rajasthan Royals',
    'batting_team_Royal Challengers Bangalore',
    'batting_team_Sunrisers Hyderabad',
    'bowling_team_Chennai Super Kings',
    'bowling_team_Delhi Daredevils',
    'bowling_team_Kings XI Punjab',
    'bowling_team_Kolkata Knight Riders',
    'bowling_team_Mumbai Indians',
    'bowling_team_Rajasthan Royals',
    'bowling_team_Royal Challengers Bangalore',
    'bowling_team_Sunrisers Hyderabad',
    'runs', 'wickets', 'overs',
    'runs_last_5', 'wickets_last_5', 'total',
]
df = pd.DataFrame(ipl_df, columns=cols)

# Encoded Data
df.head()

{"type":"dataframe","variable_name":"df"}

features = df.drop(['total'], axis=1)


labels = df['total']

from sklearn.model_selection import train_test_split

# 80/20 shuffled split. No random_state is passed, so the partition (and every
# score computed from it) changes from run to run.
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, test_size=0.20, shuffle=True
)
print(
    f"Training Set : {train_features.shape}\n"
    f"Testing Set : {test_features.shape}"
)

Training Set : (32086, 21)


Testing Set : (8022, 21)

models = dict()

from sklearn.tree import DecisionTreeRegressor


tree = DecisionTreeRegressor()
# Train Model
tree.fit(train_features, train_labels)

DecisionTreeRegressor()

# Evaluate Model
# Scores are kept as strings: the first 5 characters are shown, while the
# untruncated test score is stored in `models` for the comparison plot.
train_score_tree = str(tree.score(train_features, train_labels) * 100)
test_score_tree = str(tree.score(test_features, test_labels) * 100)
print(
    f'Train Score : {train_score_tree[:5]}%\n'
    f'Test Score : {test_score_tree[:5]}%'
)
models["tree"] = test_score_tree

Train Score : 99.98%


Test Score : 87.48%

from sklearn.metrics import (
    mean_absolute_error as mae,
    mean_squared_error as mse,
)

# Predict once and reuse the result for all three metrics.
tree_predictions = tree.predict(test_features)
tree_mse = mse(test_labels, tree_predictions)
print("---- Decision Tree Regressor - Model Evaluation ----")
print("Mean Absolute Error (MAE): {}".format(
    mae(test_labels, tree_predictions)))
print("Mean Squared Error (MSE): {}".format(tree_mse))
print("Root Mean Squared Error (RMSE): {}".format(np.sqrt(tree_mse)))
---- Decision Tree Regressor - Model Evaluation ----
Mean Absolute Error (MAE): 3.855958613812017
Mean Squared Error (MSE): 113.42043754674644
Root Mean Squared Error (RMSE): 10.64990317076857

from sklearn.linear_model import LinearRegression


linreg = LinearRegression()
# Train Model
linreg.fit(train_features, train_labels)

LinearRegression()

# Evaluate Model
# Scores are kept as strings: the first 5 characters are shown, while the
# untruncated test score is stored in `models` for the comparison plot.
train_score_linreg = str(linreg.score(train_features, train_labels) * 100)
test_score_linreg = str(linreg.score(test_features, test_labels) * 100)
print(
    f'Train Score : {train_score_linreg[:5]}%\n'
    f'Test Score : {test_score_linreg[:5]}%'
)
models["linreg"] = test_score_linreg

Train Score : 65.85%


Test Score : 66.14%

# Predict once and reuse the result for all three metrics.
linreg_predictions = linreg.predict(test_features)
linreg_mse = mse(test_labels, linreg_predictions)
print("---- Linear Regression - Model Evaluation ----")
print("Mean Absolute Error (MAE): {}".format(
    mae(test_labels, linreg_predictions)))
print("Mean Squared Error (MSE): {}".format(linreg_mse))
print("Root Mean Squared Error (RMSE): {}".format(np.sqrt(linreg_mse)))

---- Linear Regression - Model Evaluation ----


Mean Absolute Error (MAE): 13.13286821326581
Mean Squared Error (MSE): 306.74069830363675
Root Mean Squared Error (RMSE): 17.51401434005456

from sklearn.ensemble import RandomForestRegressor


forest = RandomForestRegressor()
# Train Model
forest.fit(train_features, train_labels)

RandomForestRegressor()

# Evaluate Model
# Scores are kept as strings: the first 5 characters are shown, while the
# untruncated test score is stored in `models` for the comparison plot.
train_score_forest = str(forest.score(train_features, train_labels)*100)
test_score_forest = str(forest.score(test_features, test_labels)*100)
print(
    f'Train Score : {train_score_forest[:5]}%\n'
    f'Test Score : {test_score_forest[:5]}%'
)
models["forest"] = test_score_forest
Train Score : 99.04%
Test Score : 93.78%

# Predict once and reuse the result for all three metrics.
forest_predictions = forest.predict(test_features)
forest_mse = mse(test_labels, forest_predictions)
print("---- Random Forest Regression - Model Evaluation ----")
print("Mean Absolute Error (MAE): {}".format(
    mae(test_labels, forest_predictions)))
print("Mean Squared Error (MSE): {}".format(forest_mse))
print("Root Mean Squared Error (RMSE): {}".format(np.sqrt(forest_mse)))

---- Random Forest Regression - Model Evaluation ----


Mean Absolute Error (MAE): 4.4322614269093314
Mean Squared Error (MSE): 56.283472061009306
Root Mean Squared Error (RMSE): 7.502231138868577

from sklearn.svm import SVR


svm = SVR()
# Train Model
svm.fit(train_features, train_labels)

SVR()

# NOTE(review): this cell repeats the SVR import/fit cell directly above it
# verbatim; running both fits a fresh SVR twice on the same training data.
from sklearn.svm import SVR


svm = SVR()
# Train Model
svm.fit(train_features, train_labels)

SVR()

# Scores are kept as strings: the first 5 characters are shown, while the
# untruncated test score is stored in `models` for the comparison plot.
train_score_svm = str(svm.score(train_features, train_labels)*100)
test_score_svm = str(svm.score(test_features, test_labels)*100)
print(
    f'Train Score : {train_score_svm[:5]}%\n'
    f'Test Score : {test_score_svm[:5]}%'
)
models["svm"] = test_score_svm

Train Score : 57.34%


Test Score : 57.79%

# Predict once and reuse the result for all three metrics.
svm_predictions = svm.predict(test_features)
svm_mse = mse(test_labels, svm_predictions)
print("---- Support Vector Regression - Model Evaluation ----")
print("Mean Absolute Error (MAE): {}".format(
    mae(test_labels, svm_predictions)))
print("Mean Squared Error (MSE): {}".format(svm_mse))
print("Root Mean Squared Error (RMSE): {}".format(np.sqrt(svm_mse)))

---- Support Vector Regression - Model Evaluation ----


Mean Absolute Error (MAE): 14.746880132948581
Mean Squared Error (MSE): 382.4076559758673
Root Mean Squared Error (RMSE): 19.555246251987402
from xgboost import XGBRegressor
xgb = XGBRegressor()
# Train Model
xgb.fit(train_features, train_labels)

XGBRegressor(base_score=None, booster=None, callbacks=None,


colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None,
early_stopping_rounds=None,
enable_categorical=False, eval_metric=None,
feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None,
max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan,
monotone_constraints=None,
multi_strategy=None, n_estimators=None, n_jobs=None,
num_parallel_tree=None, random_state=None, ...)

# Scores are kept as strings: the first 5 characters are shown, while the
# untruncated test score is stored in `models` for the comparison plot.
train_score_xgb = str(xgb.score(train_features, train_labels)*100)
test_score_xgb = str(xgb.score(test_features, test_labels)*100)
print(
    f'Train Score : {train_score_xgb[:5]}%\n'
    f'Test Score : {test_score_xgb[:5]}%'
)
models["xgb"] = test_score_xgb

Train Score : 88.63%


Test Score : 85.09%

# Predict once and reuse the result for all three metrics.
xgb_predictions = xgb.predict(test_features)
xgb_mse = mse(test_labels, xgb_predictions)
print("---- XGB Regression - Model Evaluation ----")
print("Mean Absolute Error (MAE): {}".format(
    mae(test_labels, xgb_predictions)))
print("Mean Squared Error (MSE): {}".format(xgb_mse))
print("Root Mean Squared Error (RMSE): {}".format(np.sqrt(xgb_mse)))

---- XGB Regression - Model Evaluation ----


Mean Absolute Error (MAE): 8.323914393555997
Mean Squared Error (MSE): 135.04082317310105
Root Mean Squared Error (RMSE): 11.620706655496518

from sklearn.neighbors import KNeighborsRegressor


knr = KNeighborsRegressor()
# Train Model
knr.fit(train_features, train_labels)

KNeighborsRegressor()
# Scores are kept as strings: the first 5 characters are shown, while the
# untruncated test score is stored in `models` for the comparison plot.
train_score_knr = str(knr.score(train_features, train_labels)*100)
test_score_knr = str(knr.score(test_features, test_labels)*100)
print(
    f'Train Score : {train_score_knr[:5]}%\n'
    f'Test Score : {test_score_knr[:5]}%'
)
models["knr"] = test_score_knr

Train Score : 86.83%


Test Score : 77.01%

# Predict once and reuse the result for all three metrics.
knr_predictions = knr.predict(test_features)
knr_mse = mse(test_labels, knr_predictions)
print("---- KNR - Model Evaluation ----")
print("Mean Absolute Error (MAE): {}".format(
    mae(test_labels, knr_predictions)))
print("Mean Squared Error (MSE): {}".format(knr_mse))
print("Root Mean Squared Error (RMSE): {}".format(np.sqrt(knr_mse)))

---- KNR - Model Evaluation ----


Mean Absolute Error (MAE): 10.000822737471953
Mean Squared Error (MSE): 208.29277985539767
Root Mean Squared Error (RMSE): 14.432351847685728

import matplotlib.pyplot as plt


model_names = list(models.keys())
accuracy = list(map(float, models.values()))
# creating the bar plot
plt.bar(model_names, accuracy)

<BarContainer object of 6 artists>


# Order of the one-hot team indicators, shared by the batting and the bowling
# block. It must stay identical to the order of the encoded training columns.
_TEAM_ORDER = (
    'Chennai Super Kings',
    'Delhi Daredevils',
    'Kings XI Punjab',
    'Kolkata Knight Riders',
    'Mumbai Indians',
    'Rajasthan Royals',
    'Royal Challengers Bangalore',
    'Sunrisers Hyderabad',
)


def _one_hot_team(team, role):
    """Return the one-hot indicator list for *team*, or raise ValueError."""
    if team not in _TEAM_ORDER:
        raise ValueError(
            f"Unknown {role} team {team!r}; expected one of {_TEAM_ORDER}"
        )
    return [int(team == name) for name in _TEAM_ORDER]


def score_predict(batting_team, bowling_team, runs, wickets, overs,
                  runs_last_5, wickets_last_5, model=None):
    """Predict the final innings total from the current match state.

    Args:
        batting_team: Name of the batting side; must be in ``_TEAM_ORDER``.
        bowling_team: Name of the bowling side; must be in ``_TEAM_ORDER``.
        runs: Runs scored so far.
        wickets: Wickets lost so far.
        overs: Overs bowled so far.
        runs_last_5: Runs scored in the last five overs.
        wickets_last_5: Wickets lost in the last five overs.
        model: Fitted regressor exposing ``predict``. When omitted, the
            module-level ``forest`` model is used.

    Returns:
        The predicted total, rounded to the nearest integer.

    Raises:
        ValueError: If either team name is not one of the known teams.
            Previously such a name was skipped silently, producing a feature
            row of the wrong length.
    """
    if model is None:
        # Looked up at call time so a re-trained `forest` is picked up and the
        # function can be defined before the model exists.
        model = forest

    feature_row = (
        _one_hot_team(batting_team, 'batting')
        + _one_hot_team(bowling_team, 'bowling')
        + [runs, wickets, overs, runs_last_5, wickets_last_5]
    )
    # The model expects a 2-D input: one row holding the 21 features.
    pred = model.predict(np.array([feature_row]))
    return int(round(pred[0]))

batting_team='Delhi Daredevils'
bowling_team='Chennai Super Kings'
score = score_predict(batting_team, bowling_team, overs=10.2, runs=68,
wickets=3, runs_last_5=29, wickets_last_5=1)
print(f'Predicted Score : {score} || Actual Score : 147')

Predicted Score : 151 || Actual Score : 147

/usr/local/lib/python3.10/dist-packages/sklearn/base.py:439:
UserWarning: X does not have valid feature names, but
RandomForestRegressor was fitted with feature names
warnings.warn(

batting_team='Mumbai Indians'
bowling_team='Kings XI Punjab'
score = score_predict(batting_team, bowling_team, overs=12.3,
runs=113, wickets=2, runs_last_5=55, wickets_last_5=0)
print(f'Predicted Score : {score} || Actual Score : 176')

Predicted Score : 188 || Actual Score : 176

/usr/local/lib/python3.10/dist-packages/sklearn/base.py:439:
UserWarning: X does not have valid feature names, but
RandomForestRegressor was fitted with feature names
warnings.warn(

batting_team="Kings XI Punjab"
bowling_team="Rajasthan Royals"
score =score_predict(batting_team, bowling_team, overs=14.0, runs=118,
wickets=1, runs_last_5=45, wickets_last_5=0)
print(f'Predicted Score : {score} || Actual Score : 185')

Predicted Score : 175 || Actual Score : 185


/usr/local/lib/python3.10/dist-packages/sklearn/base.py:439:
UserWarning: X does not have valid feature names, but
RandomForestRegressor was fitted with feature names
warnings.warn(

batting_team="Kolkata Knight Riders"


bowling_team="Chennai Super Kings"
score = score_predict(batting_team, bowling_team, overs=18.0,
runs=150, wickets=4, runs_last_5=57, wickets_last_5=1)
print(f'Predicted Score : {score} || Actual Score : 172')

Predicted Score : 172 || Actual Score : 172

/usr/local/lib/python3.10/dist-packages/sklearn/base.py:439:
UserWarning: X does not have valid feature names, but
RandomForestRegressor was fitted with feature names
warnings.warn(

batting_team='Delhi Daredevils'
bowling_team='Mumbai Indians'
score = score_predict(batting_team, bowling_team, overs=18.0, runs=96,
wickets=8, runs_last_5=18, wickets_last_5=4)
print(f'Predicted Score : {score} || Actual Score : 110')

Predicted Score : 108 || Actual Score : 110

/usr/local/lib/python3.10/dist-packages/sklearn/base.py:439:
UserWarning: X does not have valid feature names, but
RandomForestRegressor was fitted with feature names
warnings.warn(

batting_team='Kings XI Punjab'
bowling_team='Chennai Super Kings'
score = score_predict(batting_team, bowling_team, overs=18.0,
runs=129, wickets=6, runs_last_5=34, wickets_last_5=2)
print(f'Predicted Score : {score} || Actual Score : 153')

Predicted Score : 148 || Actual Score : 153

/usr/local/lib/python3.10/dist-packages/sklearn/base.py:439:
UserWarning: X does not have valid feature names, but
RandomForestRegressor was fitted with feature names
warnings.warn(

batting_team='Sunrisers Hyderabad'
bowling_team='Royal Challengers Bangalore'
score = score_predict(batting_team, bowling_team, overs=10.5, runs=67,
wickets=3, runs_last_5=29, wickets_last_5=1)
print(f'Predicted Score : {score} || Actual Score : 146')

Predicted Score : 153 || Actual Score : 146


/usr/local/lib/python3.10/dist-packages/sklearn/base.py:439:
UserWarning: X does not have valid feature names, but
RandomForestRegressor was fitted with feature names
warnings.warn(

import pickle

filename = "ml_model.pkl"
# Serialize the trained random forest. The context manager guarantees the
# file is flushed and closed even if pickling fails part-way.
with open(filename, "wb") as model_file:
    pickle.dump(forest, model_file)

You might also like