
Linear versus Polynomial Regression

Import Library
import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

import seaborn as sns

Import Dataset
df = pd.read_csv('https://github.com/ybifoundation/Dataset/raw/main/MPG.csv')

Explore Data
df.head()

    mpg  cylinders  displacement  horsepower  weight  acceleration  model_year origin                       name
0  18.0          8         307.0       130.0    3504          12.0          70    usa  chevrolet chevelle malibu
1  15.0          8         350.0       165.0    3693          11.5          70    usa          buick skylark 320
2  18.0          8         318.0       150.0    3436          11.0          70    usa         plymouth satellite
3  16.0          8         304.0       150.0    3433          12.0          70    usa              amc rebel sst
4  17.0          8         302.0       140.0    3449          10.5          70    usa                ford torino

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64
 2   displacement  398 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        398 non-null    int64
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64
 7   origin        398 non-null    object
 8   name          398 non-null    object
dtypes: float64(4), int64(3), object(2)
memory usage: 28.1+ KB

df.dropna(inplace=True)   # drop the 6 rows where horsepower is NaN (398 -> 392)

df['model_year'] = pd.to_datetime(df['model_year'], format='%y')
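Here format='%y' parses the two-digit model_year codes (70-82) into full timestamps, so 70 becomes 1970-01-01. A quick illustration of the same idea on toy values:

pd.to_datetime(['70', '82'], format='%y')
# DatetimeIndex(['1970-01-01', '1982-01-01'], dtype='datetime64[ns]', freq=None)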

df.describe(include='all')

/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:1: FutureWarning: Treating dateti
"""Entry point for launching an IPython kernel.

               mpg   cylinders  displacement  horsepower       weight  acceleration           model_year
count   392.000000  392.000000    392.000000  392.000000   392.000000    392.000000                  392
unique         NaN         NaN           NaN         NaN          NaN           NaN                   13
top            NaN         NaN           NaN         NaN          NaN           NaN  1973-01-01 00:00:00
freq           NaN         NaN           NaN         NaN          NaN           NaN                   40
first          NaN         NaN           NaN         NaN          NaN           NaN  1970-01-01 00:00:00
last           NaN         NaN           NaN         NaN          NaN           NaN  1982-01-01 00:00:00
mean     23.445918    5.471939    194.411990  104.469388  2977.584184     15.541327                  NaN
std       7.805007    1.705783    104.644004   38.491160   849.402560      2.758864                  NaN
min       9.000000    3.000000     68.000000   46.000000  1613.000000      8.000000                  NaN
25%      17.000000    4.000000    105.000000   75.000000  2225.250000     13.775000                  NaN
50%      22.750000    4.000000    151.000000   93.500000  2803.500000     15.500000                  NaN
75%      29.000000    8.000000    275.750000  126.000000  3614.750000     17.025000                  NaN

df.corr()

                   mpg  cylinders  displacement  horsepower    weight  acceleration
mpg           1.000000  -0.777618     -0.805127   -0.778427 -0.832244      0.423329
cylinders    -0.777618   1.000000      0.950823    0.842983  0.897527     -0.504683
displacement -0.805127   0.950823      1.000000    0.897257  0.932994     -0.543800
horsepower   -0.778427   0.842983      0.897257    1.000000  0.864538     -0.689196
weight       -0.832244   0.897527      0.932994    0.864538  1.000000     -0.416839
acceleration  0.423329  -0.504683     -0.543800   -0.689196 -0.416839      1.000000

df['origin'].value_counts()

usa       245
japan      79
europe     68
Name: origin, dtype: int64


df.replace({'origin' : {'usa':0, 'japan':1, 'europe': 2}}, inplace=True)
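Mapping origin to the integer codes 0/1/2 imposes an arbitrary ordering on a nominal category. That is harmless here, since origin is not used as a feature below, but a common alternative would be one-hot encoding, e.g. pd.get_dummies(df, columns=['origin']).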

Visualize Data
sns.pairplot(df, x_vars=['displacement', 'horsepower', 'weight', 'acceleration', 'mpg'], y_vars=['mpg'])
<seaborn.axisgrid.PairGrid at 0x7f8745b9a390>

Define y and X
df.columns

Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'model_year', 'origin', 'name'],
      dtype='object')

y = df['mpg']

X = df[['horsepower', 'weight']]

# For each X, calculate VIF and save in dataframe

from statsmodels.stats.outliers_influence import variance_inflation_factor

vif = pd.DataFrame()

vif["VIF Factor"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

vif["features"] = X.columns

vif.round(1)

   VIF Factor    features
0        32.2  horsepower
1        32.2      weight
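A VIF of about 32 for each feature confirms that horsepower and weight are strongly collinear. variance_inflation_factor regresses column i on the remaining columns of X exactly as supplied (here without an intercept, since no constant column was added) and returns 1/(1 - R²). A minimal sketch that reproduces the 32.2 by hand:

# Sketch: the 'horsepower' VIF by hand. No-intercept fit, matching the
# constant-free X passed to variance_inflation_factor above.
xi = X.values[:, 0]                       # horsepower
xo = X.values[:, 1:]                      # weight
beta, *_ = np.linalg.lstsq(xo, xi, rcond=None)
resid = xi - xo @ beta
r2 = 1 - (resid @ resid) / (xi @ xi)      # uncentered R^2 (no intercept)
print(1 / (1 - r2))                       # ~32.2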

Train Test Split Data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, random_state = 2529)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((274, 2), (118, 2), (274,), (118,))
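train_size=0.7 keeps 274 of the 392 remaining rows for training (0.7 × 392 ≈ 274) and holds out the other 118 for testing; random_state=2529 only makes the split reproducible.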

Scaling Data
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
X_train = ss.fit_transform(X_train)

X_test = ss.transform(X_test)   # transform only: reuse the mean and std learned from the training split

Linear Regression Model
from sklearn.linear_model import LinearRegression

lr = LinearRegression()

lr.fit(X_train, y_train)

LinearRegression()
lr.intercept_

23.577737226277375
lr.coef_

array([-1.83276106, -4.89794393])

Predict Test Data
y_pred = lr.predict(X_test)
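Because the inputs were standardized, the two coefficients are directly comparable: weight (-4.90) contributes more than horsepower (-1.83), and both push mpg down. As a quick sanity check (a sketch, assuming X_test is still the scaled array from above), a manual dot product reproduces the first prediction:

# Manual prediction for the first test row: intercept + features . coefficients
print(lr.intercept_ + X_test[0] @ lr.coef_)
print(y_pred[0])   # should match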

Model Accuracy

from sklearn.metrics import mean_absolute_percentage_error, r2_score

mean_absolute_percentage_error(y_test, y_pred)

0.15224796342732086

r2_score(y_test, y_pred)

0.7032406165122396
Significant Variables
import statsmodels.api as sm
X = sm.add_constant(X)
model = sm.OLS(y, X)
results = model.fit()
print(results.summary())

                            OLS Regression Results
==============================================================================
Dep. Variable:                    mpg   R-squared:                       0.706
Model:                            OLS   Adj. R-squared:                  0.705
Method:                 Least Squares   F-statistic:                     467.9
Date:                Fri, 22 Jul 2022   Prob (F-statistic):          3.06e-104
Time:                        02:33:16   Log-Likelihood:                -1121.0
No. Observations:                 392   AIC:                             2248.
Df Residuals:                     389   BIC:                             2260.
Df Model:                           2
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         45.6402      0.793     57.540      0.000      44.081      47.200
horsepower    -0.0473      0.011     -4.267      0.000      -0.069      -0.026
weight        -0.0058      0.001    -11.535      0.000      -0.007      -0.005
==============================================================================
Omnibus:                       35.336   Durbin-Watson:                   0.858
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               45.973
Skew:                           0.683   Prob(JB):                     1.04e-10
Kurtosis:                       3.974   Cond. No.                     1.15e+04
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.15e+04. This might indicate that there are
strong multicollinearity or other numerical problems.

/usr/local/lib/python3.7/dist-packages/statsmodels/tsa/tsatools.py:117: FutureWarning: In a future version of pandas all arguments of con
  x = pd.concat(x[::order], 1)
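Both predictors are significant at any conventional level (P>|t| ≈ 0.000), and the large condition number echoes the VIF result: horsepower and weight carry overlapping information. The same statistics can also be pulled programmatically from the fitted results, for example:

print(results.pvalues)
print(results.conf_int())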

Polynomial Regression
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2)
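For two inputs, degree=2 expands [x1, x2] into [1, x1, x2, x1², x1·x2, x2²], which is exactly the six columns named below. A toy check on a single row:

# Degree-2 expansion of [2, 3] -> [1, 2, 3, 4, 6, 9]
PolynomialFeatures(degree=2).fit_transform([[2, 3]])
# array([[1., 2., 3., 4., 6., 9.]])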

X_train2 = poly.fit_transform(X_train)

X_train2 = pd.DataFrame(X_train2, columns=['bias', 'horsepower', 'weight', 'square of horsepower', 'horsepower*weight', 'square of weight'])

X_train2

     bias  horsepower    weight  square of horsepower  horsepower*weight  square of weight
0     1.0    3.010404  1.923330              9.062532           5.789999          3.699196
1     1.0   -0.903062 -1.150355              0.815521           1.038842          1.323317
2     1.0   -0.687147  0.704394              0.472171          -0.484022          0.496171
3     1.0    0.176515  1.273465              0.031157           0.224785          1.621713
4     1.0    0.446409 -0.049683              0.199281          -0.022179          0.002468
..    ...         ...       ...                   ...                ...               ...
269   1.0    0.716303  1.317960              0.513090           0.944059          1.737019
270   1.0    1.795880  1.396412              3.225185           2.507789          1.949967
271   1.0   -0.363274 -0.306116              0.131968           0.111204          0.093707
272   1.0   -0.903062 -1.051997              0.815521           0.950019          1.106698
273   1.0   -0.093379 -0.418525              0.008720           0.039082          0.175163

274 rows × 6 columns

X_test2 = poly.fit_transform(X_test)

X_test2 = pd.DataFrame(X_test2, columns=['bias', 'horsepower', 'weight', 'square of horsepower', 'horsepower*weight', 'square of weight'])
lr.fit(X_train2, y_train)

LinearRegression()
lr.intercept_

array([22.09857341])
lr.coef_

array([[ 0.        , -3.81240238, -4.16928337, -0.04228002,  2.21938086,
        -0.41596167]])
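The bias column gets a coefficient of exactly 0 because LinearRegression already fits its own intercept (22.10), making the constant column redundant. Passing include_bias=False when constructing PolynomialFeatures would drop that column at the source:

poly = PolynomialFeatures(degree=2, include_bias=False)   # omit the redundant constant column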

y_pred_poly = lr.predict(X_test2)

Model Accuracy
from sklearn.metrics import mean_absolute_percentage_error, r2_score
mean_absolute_percentage_error(y_test, y_pred_poly)
0.13053963468368052
r2_score(y_test, y_pred_poly)
0.7427946235072436
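Putting the two models side by side, the degree-2 features cut test MAPE from 0.152 to 0.131 and raise R² from 0.703 to 0.743, so the added curvature does help on this data. A minimal recap cell:

# Side-by-side test-set comparison of the linear and polynomial fits
print('Linear:     MAPE = %.3f, R2 = %.3f' % (
    mean_absolute_percentage_error(y_test, y_pred), r2_score(y_test, y_pred)))
print('Polynomial: MAPE = %.3f, R2 = %.3f' % (
    mean_absolute_percentage_error(y_test, y_pred_poly), r2_score(y_test, y_pred_poly)))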