
In [20]:

%matplotlib inline

In [21]:

# Numerical libraries

import numpy as np

# Import Linear Regression machine learning library

from sklearn.linear_model import LinearRegression

# to handle data in the form of rows and columns

import pandas as pd

# importing plotting libraries

import matplotlib.pyplot as plt

import matplotlib.style

plt.style.use('classic')

# importing seaborn for statistical plots

import seaborn as sns

In [22]:

mpg_df = pd.read_csv("car-mpg.csv")


In [23]:

mpg_df.head(50)


Out[23]:

mpg cyl disp hp wt acc yr origin car_type car_name

0 18.0 8 307.0 130 3504 12.0 70 1 0 chevrolet chevelle malibu

1 15.0 8 350.0 165 3693 11.5 70 1 0 buick skylark 320

2 18.0 8 318.0 150 3436 11.0 70 1 0 plymouth satellite

3 16.0 8 304.0 150 3433 12.0 70 1 0 amc rebel sst

4 17.0 8 302.0 140 3449 10.5 70 1 0 ford torino

5 15.0 8 429.0 198 4341 10.0 70 1 0 ford galaxie 500

6 14.0 8 454.0 220 4354 9.0 70 1 0 chevrolet impala

7 14.0 8 440.0 215 4312 8.5 70 1 0 plymouth fury iii

8 14.0 8 455.0 225 4425 10.0 70 1 0 pontiac catalina

9 15.0 8 390.0 190 3850 8.5 70 1 0 amc ambassador dpl

10 15.0 8 383.0 170 3563 10.0 70 1 0 dodge challenger se

11 14.0 8 340.0 160 3609 8.0 70 1 0 plymouth 'cuda 340

12 15.0 8 400.0 150 3761 9.5 70 1 0 chevrolet monte carlo

13 14.0 8 455.0 225 3086 10.0 70 1 0 buick estate wagon (sw)

14 24.0 4 113.0 95 2372 15.0 70 3 1 toyota corona mark ii

15 22.0 6 198.0 95 2833 15.5 70 1 0 plymouth duster

16 18.0 6 199.0 97 2774 15.5 70 1 0 amc hornet

17 21.0 6 200.0 85 2587 16.0 70 1 0 ford maverick

18 27.0 4 97.0 88 2130 14.5 70 3 1 datsun pl510

19 26.0 4 97.0 46 1835 20.5 70 2 1 volkswagen 1131 deluxe sedan

20 25.0 4 110.0 87 2672 17.5 70 2 1 peugeot 504

21 24.0 4 107.0 90 2430 14.5 70 2 1 audi 100 ls

22 25.0 4 104.0 95 2375 17.5 70 2 1 saab 99e

23 26.0 4 121.0 113 2234 12.5 70 2 1 bmw 2002

24 21.0 6 199.0 90 2648 15.0 70 1 0 amc gremlin

25 10.0 8 360.0 215 4615 14.0 70 1 0 ford f250

26 10.0 8 307.0 200 4376 15.0 70 1 0 chevy c20

27 11.0 8 318.0 210 4382 13.5 70 1 0 dodge d200

28 9.0 8 304.0 193 4732 18.5 70 1 0 hi 1200d

29 27.0 4 97.0 88 2130 14.5 71 3 1 datsun pl510

30 28.0 4 140.0 90 2264 15.5 71 1 1 chevrolet vega 2300

31 25.0 4 113.0 95 2228 14.0 71 3 1 toyota corona

32 25.0 4 98.0 ? 2046 19.0 71 1 1 ford pinto

33 19.0 6 232.0 100 2634 13.0 71 1 0 amc gremlin

34 16.0 6 225.0 105 3439 15.5 71 1 0 plymouth satellite custom

35 17.0 6 250.0 100 3329 15.5 71 1 0 chevrolet chevelle malibu

36 19.0 6 250.0 88 3302 15.5 71 1 0 ford torino 500


37 18.0 6 232.0 100 3288 15.5 71 1 0 amc matador

38 14.0 8 350.0 165 4209 12.0 71 1 0 chevrolet impala

39 14.0 8 400.0 175 4464 11.5 71 1 0 pontiac catalina brougham

40 14.0 8 351.0 153 4154 13.5 71 1 0 ford galaxie 500

41 14.0 8 318.0 150 4096 13.0 71 1 0 plymouth fury iii

42 12.0 8 383.0 180 4955 11.5 71 1 0 dodge monaco (sw)

43 13.0 8 400.0 170 4746 12.0 71 1 0 ford country squire (sw)

44 13.0 8 400.0 175 5140 12.0 71 1 0 pontiac safari (sw)

45 18.0 6 258.0 110 2962 13.5 71 1 0 amc hornet sportabout (sw)

46 22.0 4 140.0 72 2408 19.0 71 1 1 chevrolet vega (sw)

47 19.0 6 250.0 100 3282 15.0 71 1 0 pontiac firebird

48 18.0 6 250.0 88 3139 14.5 71 1 0 ford mustang

49 23.0 4 122.0 86 2220 14.0 71 1 1 mercury capri 2000

In [24]:

# drop the car name column as it is useless for the model

mpg_df = mpg_df.drop('car_name', axis=1)

In [25]:

mpg_df

Out[25]:

mpg cyl disp hp wt acc yr origin car_type

0 18.0 8 307.0 130 3504 12.0 70 1 0

1 15.0 8 350.0 165 3693 11.5 70 1 0

2 18.0 8 318.0 150 3436 11.0 70 1 0

3 16.0 8 304.0 150 3433 12.0 70 1 0

4 17.0 8 302.0 140 3449 10.5 70 1 0

... ... ... ... ... ... ... ... ... ...

393 27.0 4 140.0 86 2790 15.6 82 1 1

394 44.0 4 97.0 52 2130 24.6 82 2 1

395 32.0 4 135.0 84 2295 11.6 82 1 1

396 28.0 4 120.0 79 2625 18.6 82 1 1

397 31.0 4 119.0 82 2720 19.4 82 1 1

398 rows × 9 columns


In [26]:

# Replace the numbers in the categorical 'origin' column with the actual region names

mpg_df['origin'] = mpg_df['origin'].replace({1: 'america', 2: 'europe', 3: 'asia'})

In [27]:

mpg_df

Out[27]:

mpg cyl disp hp wt acc yr origin car_type

0 18.0 8 307.0 130 3504 12.0 70 america 0

1 15.0 8 350.0 165 3693 11.5 70 america 0

2 18.0 8 318.0 150 3436 11.0 70 america 0

3 16.0 8 304.0 150 3433 12.0 70 america 0

4 17.0 8 302.0 140 3449 10.5 70 america 0

... ... ... ... ... ... ... ... ... ...

393 27.0 4 140.0 86 2790 15.6 82 america 1

394 44.0 4 97.0 52 2130 24.6 82 europe 1

395 32.0 4 135.0 84 2295 11.6 82 america 1

396 28.0 4 120.0 79 2625 18.6 82 america 1

397 31.0 4 119.0 82 2720 19.4 82 america 1

398 rows × 9 columns

In [28]:

# Convert the categorical variable into dummy/indicator variables. As many columns will be created as there are distinct values

# This is also known as one-hot encoding. The new columns will be origin_america, origin_asia and origin_europe

mpg_df = pd.get_dummies(mpg_df, columns=['origin'])
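One of the three dummy columns is always redundant: if a car is neither origin_asia nor origin_europe, it must be origin_america. A minimal sketch of the alternative, assuming it replaces the get_dummies call above (the drop_first flag tells pandas to drop one level per encoded column):

# sketch: one-hot encode 'origin' while dropping the first level to avoid
# the redundant column (the "dummy variable trap"); would replace the call above
mpg_df_alt = pd.get_dummies(mpg_df, columns=['origin'], drop_first=True)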


In [29]:

mpg_df

Out[29]:

mpg cyl disp hp wt acc yr car_type origin_america origin_asia origin_europe

0 18.0 8 307.0 130 3504 12.0 70 0 1 0 0

1 15.0 8 350.0 165 3693 11.5 70 0 1 0 0

2 18.0 8 318.0 150 3436 11.0 70 0 1 0 0

3 16.0 8 304.0 150 3433 12.0 70 0 1 0 0

4 17.0 8 302.0 140 3449 10.5 70 0 1 0 0

... ... ... ... ... ... ... ... ... ... ... ...

393 27.0 4 140.0 86 2790 15.6 82 1 1 0 0

394 44.0 4 97.0 52 2130 24.6 82 1 0 0 1

395 32.0 4 135.0 84 2295 11.6 82 1 1 0 0

396 28.0 4 120.0 79 2625 18.6 82 1 1 0 0

397 31.0 4 119.0 82 2720 19.4 82 1 1 0 0

398 rows × 11 columns

In [31]:

# Let's analyze the distribution of the dependent (mpg) column and the other attributes

mpg_df.describe().transpose()

Out[31]:

count mean std min 25% 50% 75% max

mpg 398.0 23.514573 7.815984 9.0 17.500 23.0 29.000 46.6

cyl 398.0 5.454774 1.701004 3.0 4.000 4.0 8.000 8.0

disp 398.0 193.425879 104.269838 68.0 104.250 148.5 262.000 455.0

wt 398.0 2970.424623 846.841774 1613.0 2223.750 2803.5 3608.000 5140.0

acc 398.0 15.568090 2.757689 8.0 13.825 15.5 17.175 24.8

yr 398.0 76.010050 3.697627 70.0 73.000 76.0 79.000 82.0

car_type 398.0 0.530151 0.499718 0.0 0.000 1.0 1.000 1.0

origin_america 398.0 0.625628 0.484569 0.0 0.000 1.0 1.000 1.0

origin_asia 398.0 0.198492 0.399367 0.0 0.000 0.0 0.000 1.0

origin_europe 398.0 0.175879 0.381197 0.0 0.000 0.0 0.000 1.0


In [32]:

mpg_df.dtypes

Out[32]:

mpg float64

cyl int64

disp float64

hp object

wt int64

acc float64

yr int64

car_type int64

origin_america uint8

origin_asia uint8

origin_europe uint8

dtype: object

In [33]:

# Note: the hp column is missing from the describe output. That indicates something is not right with that column

In [34]:

# Check if the hp column contains anything other than digits

# run the isdigit() check on the 'hp' column of the mpg_df dataframe. The result will be True or False for every row

# capture the result in a temp dataframe and do a frequency count using value_counts()

# There are six records with non-digit values in the 'hp' column

temp = pd.DataFrame(mpg_df.hp.str.isdigit())  # store True if the string is made of digits, else False, in the hp column of the temp dataframe

temp[temp['hp'] == False]  # from temp take only those rows where hp is False

Out[34]:

hp

32 False

126 False

330 False

336 False

354 False

374 False
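Before deciding how to treat these records, it can help to inspect them in full. A small sketch using the index of the False rows captured above:

# sketch: pull the complete records whose 'hp' value is non-numeric
bad_rows = temp[temp['hp'] == False].index
mpg_df.loc[bad_rows]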

In [36]:

# On inspecting records 32, 126 etc., we find "?" in the hp column

# Replace them with nan; the records containing nan can then be dropped or imputed

mpg_df = mpg_df.replace('?', np.nan)


In [37]:

#Let us see if we can get those records with nan

mpg_df[mpg_df.isnull().any(axis=1)]

Out[37]:

mpg cyl disp hp wt acc yr car_type origin_america origin_asia origin_europe

32 25.0 4 98.0 NaN 2046 19.0 71 1 1 0 0

126 21.0 6 200.0 NaN 2875 17.0 74 0 1 0 0

330 40.9 4 85.0 NaN 1835 17.3 80 1 0 0 1

336 23.6 4 140.0 NaN 2905 14.3 80 1 1 0 0

354 34.5 4 100.0 NaN 2320 15.8 81 1 0 0 1

374 23.0 4 151.0 NaN 3035 20.5 82 1 1 0 0
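A quick per-column count confirms that hp is the only affected attribute; a small sketch:

# sketch: count missing values per column (expected: hp -> 6, all others -> 0)
mpg_df.isnull().sum()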

In [38]:

# There are various ways to handle missing values: drop the rows, replace missing values with median values, etc.

In [39]:

# Of the 398 rows, 6 have NaN in the hp column. We could drop those 6 rows, but that is not a good idea under all situations

# note: hp is missing from describe because of the non-numeric values in the column

#mpg_df = mpg_df.dropna()


In [40]:

# instead of dropping the rows, let's replace the missing values with the median value

# note the missing parentheses below: mpg_df.median returns the bound method itself rather than computing the medians; mpg_df.median() would compute them

mpg_df.median

Out[40]:

<bound method DataFrame.median of       mpg  cyl   disp   hp    wt   acc  yr  car_type  origin_america  origin_asia  origin_europe

0    18.0    8  307.0  130  3504  12.0  70         0               1            0              0

1    15.0    8  350.0  165  3693  11.5  70         0               1            0              0

2    18.0    8  318.0  150  3436  11.0  70         0               1            0              0

3    16.0    8  304.0  150  3433  12.0  70         0               1            0              0

4    17.0    8  302.0  140  3449  10.5  70         0               1            0              0

..    ...  ...    ...  ...   ...   ...  ..       ...             ...          ...            ...

393  27.0    4  140.0   86  2790  15.6  82         1               1            0              0

394  44.0    4   97.0   52  2130  24.6  82         1               0            0              1

395  32.0    4  135.0   84  2295  11.6  82         1               1            0              0

396  28.0    4  120.0   79  2625  18.6  82         1               1            0              0

397  31.0    4  119.0   82  2720  19.4  82         1               1            0              0

[398 rows x 11 columns]>

In [41]:

# replace the missing values in 'hp' with the median value of 'hp'. Note: we do not need to specify the column names

# every column's missing value is replaced with that column's median respectively (axis=0 means column-wise)

#mpg_df = mpg_df.fillna(mpg_df.median())

mpg_df = mpg_df.apply(lambda x: x.fillna(x.median()), axis=0)


In [42]:

mpg_df.dtypes

Out[42]:

mpg float64

cyl int64

disp float64

hp object

wt int64

acc float64

yr int64

car_type int64

origin_america uint8

origin_asia uint8

origin_europe uint8

dtype: object

In [43]:

mpg_df['hp'] = mpg_df['hp'].astype('float64')  # converting the hp column from object / string type to float

In [45]:

mpg_df.describe()

Out[45]:

mpg cyl disp hp wt acc yr

count 398.000000 398.000000 398.000000 398.000000 398.000000 398.000000 398.000000

mean 23.514573 5.454774 193.425879 104.304020 2970.424623 15.568090 76.010050

std 7.815984 1.701004 104.269838 38.222625 846.841774 2.757689 3.697627

min 9.000000 3.000000 68.000000 46.000000 1613.000000 8.000000 70.000000

25% 17.500000 4.000000 104.250000 76.000000 2223.750000 13.825000 73.000000

50% 23.000000 4.000000 148.500000 93.500000 2803.500000 15.500000 76.000000

75% 29.000000 8.000000 262.000000 125.000000 3608.000000 17.175000 79.000000

max 46.600000 8.000000 455.000000 230.000000 5140.000000 24.800000 82.000000


In [46]:

# Let us do a correlation analysis among the different dimensions and also each dimension with the dependent dimension

# This is done using the scatter matrix function which creates a dashboard reflecting useful information about the dimensions

# The result can be stored as a .png file and opened in, say, Paint to get a larger view

mpg_df_attr = mpg_df.iloc[:, 0:10]

#axes = pd.plotting.scatter_matrix(mpg_df_attr)

#plt.tight_layout()

#plt.savefig('d:\greatlakes\mpg_pairpanel.png')

sns.pairplot(mpg_df_attr, diag_kind='kde')  # to plot density curve instead of histogram

#sns.pairplot(mpg_df_attr)  # to plot histogram, the default

Out[46]:

<seaborn.axisgrid.PairGrid at 0x1817e50cfd0>
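A numeric complement to the pairplot is a correlation heatmap, which condenses the same pairwise relationships into single coefficients; a small sketch:

# sketch: Pearson correlations between all attributes as an annotated heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(mpg_df_attr.corr(), annot=True, fmt='.2f', cmap='coolwarm')
plt.show()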


In [47]:

# The data distribution across various dimensions except 'acc' does not look normal

# Close observation between 'mpg' and other attributes indicates the relationship is not really linear

# The relation between 'mpg' and 'hp' shows heteroscedasticity... which will impact model accuracy

# How about 'mpg' vs 'yr'? Surprising to see a positive relation

In [50]:

# Copy all the predictor variables into the X dataframe. Since 'mpg' is the dependent variable, drop it

X = mpg_df.drop('mpg', axis=1)

X = X.drop(['origin_america', 'origin_asia', 'origin_europe'], axis=1)

# Copy the 'mpg' column alone into the y dataframe. This is the dependent variable

y = mpg_df[['mpg']]

In [51]:

# Let us break the X and y dataframes into a training set and a test set. For this we will use

# the sklearn package's data-splitting function, which is based on random sampling

from sklearn.model_selection import train_test_split

In [52]:

# Split X and y into training and test sets in a 70:30 ratio

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)

In [53]:

# invoke the LinearRegression function and find the best-fit model on the training data

regression_model = LinearRegression()

regression_model.fit(X_train, y_train)

Out[53]:

LinearRegression()


In [54]:

# Let us explore the coefficients for each of the independent attributes

for idx, col_name in enumerate(X_train.columns):
    print("The coefficient for {} is {}".format(col_name, regression_model.coef_[0][idx]))

The coefficient for cyl is 1.8637178336269913

The coefficient for disp is 0.010066050755003018

The coefficient for hp is -0.039229006295896617

The coefficient for wt is -0.006414996525192761

The coefficient for acc is 0.011723809488668087

The coefficient for yr is 0.7588184849759366

The coefficient for car_type is 6.62652133924925
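The same coefficients are easier to compare as a labelled pandas Series; a small sketch:

# sketch: coefficients as a Series indexed by column name, sorted by value
coef_series = pd.Series(regression_model.coef_[0], index=X_train.columns)
print(coef_series.sort_values())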

In [55]:

# Let us check the intercept for the model

intercept = regression_model.intercept_[0]

print("The intercept for our model is {}".format(intercept))

The intercept for our model is -26.693360125832104

In [56]:

regression_model.score(X_train, y_train)

Out[56]:

0.825809118133759

In [57]:

# Model score - R^2 or coefficient of determination

# R^2 = 1 - RSS/TSS (RSS: residual sum of squares; TSS: total sum of squares)

regression_model.score(X_test, y_test)

Out[57]:

0.8404952015294237
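The score can be verified by computing R^2 = 1 - RSS/TSS by hand; a small sketch:

# sketch: recompute the test-set R^2 from its definition to confirm it
# matches regression_model.score(X_test, y_test)
pred = regression_model.predict(X_test)
rss = np.sum((y_test.values - pred) ** 2)                   # residual sum of squares
tss = np.sum((y_test.values - y_test.values.mean()) ** 2)   # total sum of squares
print(1 - rss / tss)  # ~0.8405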


In [58]:

# R^2 is not a reliable metric as it always increases with the addition of more attributes, even if the attributes have no

# influence on the predicted variable. Instead we use adjusted R^2, which removes the statistical chance that improves R^2

# Scikit-learn does not provide a facility for adjusted R^2... so we use

# statsmodels, a library that gives results similar to

# what you obtain in the R language

# This library expects the X and y to be given in one single dataframe

data_train = pd.concat([X_train, y_train], axis=1)

data_train.head()

Out[58]:

cyl disp hp wt acc yr car_type mpg

350 4 105.0 63.0 2215 14.9 81 1 34.7

59 4 97.0 54.0 2254 23.5 72 1 23.0

120 4 121.0 112.0 2868 15.5 73 1 19.0

12 8 400.0 150.0 3761 9.5 70 0 15.0

349 4 91.0 68.0 1985 16.0 81 1 34.1
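For reference, adjusted R^2 can also be computed by hand from the plain R^2, the sample size n and the number of predictors p, via adj R^2 = 1 - (1 - R^2)(n - 1)/(n - p - 1); a sketch against the training fit above:

# sketch: adjusted R^2 from the training-set R^2
n, p = X_train.shape                           # observations, predictors
r2 = regression_model.score(X_train, y_train)
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
print(adj_r2)  # should be close to the 0.821 that statsmodels reports below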

In [59]:

import statsmodels.formula.api as smf

lm1 = smf.ols(formula= 'mpg ~ cyl+disp+hp+wt+acc+yr+car_type', data = data_train).fit()

lm1.params

Out[59]:

Intercept -26.693360

cyl 1.863718

disp 0.010066

hp -0.039229

wt -0.006415

acc 0.011724

yr 0.758818

car_type 6.626521

dtype: float64


In [60]:

print(lm1.summary()) #Inferential statistics


                            OLS Regression Results
==============================================================================
Dep. Variable:                    mpg   R-squared:                       0.826
Model:                            OLS   Adj. R-squared:                  0.821
Method:                 Least Squares   F-statistic:                     182.9
Date:                Wed, 31 Mar 2021   Prob (F-statistic):           1.41e-98
Time:                        15:17:36   Log-Likelihood:                -725.17
No. Observations:                 278   AIC:                             1466.
Df Residuals:                     270   BIC:                             1495.
Df Model:                           7
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    -26.6934      5.597     -4.769      0.000     -37.713     -15.674
cyl            1.8637      0.517      3.606      0.000       0.846       2.881
disp           0.0101      0.009      1.123      0.262      -0.008       0.028
hp            -0.0392      0.016     -2.420      0.016      -0.071      -0.007
wt            -0.0064      0.001     -7.865      0.000      -0.008      -0.005
acc            0.0117      0.114      0.103      0.918      -0.212       0.236
yr             0.7588      0.060     12.668      0.000       0.641       0.877
car_type       6.6265      1.041      6.364      0.000       4.577       8.677
==============================================================================
Omnibus:                       35.838   Durbin-Watson:                   2.082
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               68.579
Skew:                           0.693   Prob(JB):                     1.28e-15
Kurtosis:                       5.000   Cond. No.                     8.64e+04
==============================================================================

Notes:

[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

[2] The condition number is large, 8.64e+04. This might indicate that there are

strong multicollinearity or other numerical problems.


In [61]:

# Let us check the mean squared error by predicting the value of y for the test cases and

# subtracting it from the actual y for the test cases

mse = np.mean((regression_model.predict(X_test)-y_test)**2)

In [62]:

# the square root of the mean squared error is the RMSE, i.e. the typical deviation between predicted and actual values

import math

math.sqrt(mse)

Out[62]:

3.0538103653849573
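scikit-learn's metrics module offers the same computation; a sketch, assuming a scikit-learn version recent enough to support the squared=False flag, which returns the RMSE directly:

# sketch: RMSE via sklearn instead of computing it by hand
from sklearn.metrics import mean_squared_error

print(mean_squared_error(y_test, regression_model.predict(X_test), squared=False))  # ~3.05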

In [63]:

# so the predictions deviate from the real mpg by about 3.0 (rounded) mpg on average

In [64]:

# Model score - R^2 or coefficient of determination

# R^2 = 1 - RSS/TSS

regression_model.score(X_test, y_test)

Out[64]:

0.8404952015294237

In [65]:

# predict mileage (mpg) for the test-set attributes, which the model did not see during training

y_pred = regression_model.predict(X_test)


In [66]:

# Since this is regression, plot the predicted y values vs the actual y values for the test data

# A good model's predictions will be close to the actuals, leading to high R and R^2 values

#plt.rcParams['figure.dpi'] = 500

plt.scatter(y_test['mpg'], y_pred)

Out[66]:

<matplotlib.collections.PathCollection at 0x18104b9ddc0>
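A 45-degree reference line makes it easier to judge how far the points stray from a perfect prediction; a sketch (the reference line is an addition, not part of the original plot):

# sketch: same scatter with a y = x reference line
plt.scatter(y_test['mpg'], y_pred)
lims = [y_test['mpg'].min(), y_test['mpg'].max()]
plt.plot(lims, lims, 'r--')  # points on this line are perfect predictions
plt.xlabel('actual mpg')
plt.ylabel('predicted mpg')
plt.show()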

In [67]:

# How do we improve the model? The test R^2 is about 0.84; how do we improve it?

# The independent attributes have different units and scales of measurement

# It is always good practice to scale all the dimensions using z-scores or some other method to address the problem of different scales


In [68]:

from scipy.stats import zscore

X_train_scaled = X_train.apply(zscore)

X_test_scaled = X_test.apply(zscore)

y_train_scaled = y_train.apply(zscore)
y_test_scaled = y_test.apply(zscore)
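Note that applying zscore to the train and test sets independently standardizes each with its own mean and standard deviation. The more common practice is to fit the scaling statistics on the training data only and reuse them on the test data; a sketch with scikit-learn's StandardScaler:

# sketch: fit scaling statistics on the training set only, then apply the
# same transform to the test set (avoids test-set information leakage)
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)
X_train_std = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
X_test_std = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)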

In [69]:

# invoke the LinearRegression function and find the best-fit model on the scaled training data

regression_model = LinearRegression()

regression_model.fit(X_train_scaled, y_train_scaled)

Out[69]:

LinearRegression()

In [70]:

# this cell merely constructs a LinearRegression estimator with its default parameters for display; it is not fitted or assigned
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

Out[70]:

LinearRegression()

In [71]:

# Let us explore the coefficients for each of the independent attributes

for idx, col_name in enumerate(X_train.columns):
    print("The coefficient for {} is {}".format(col_name, regression_model.coef_[0][idx]))

The coefficient for cyl is 0.3957470005920464

The coefficient for disp is 0.1305719525443726

The coefficient for hp is -0.18704682143280665

The coefficient for wt is -0.6881521871765556

The coefficient for acc is 0.004130471087879466

The coefficient for yr is 0.35562759880850775

The coefficient for car_type is 0.4197709075092129

In [72]:

intercept = regression_model.intercept_[0]

print("The intercept for our model is {}".format(intercept))

The intercept for our model is -2.8520553264365213e-18


In [73]:

# Model score - R^2 or coefficient of determination

# R^2 = 1 - RSS/TSS

regression_model.score(X_test_scaled, y_test_scaled)

Out[73]:

0.8460575288663481

In [74]:

# Let us check the mean squared error by predicting the value of y for the test cases and

# subtracting it from the actual y for the test cases

mse = np.mean((regression_model.predict(X_test_scaled)-y_test_scaled)**2)

In [75]:

# the square root of the mean squared error is the RMSE, i.e. the typical deviation between predicted and actual values (here in scaled units)

import math

math.sqrt(mse)

Out[75]:

0.39235503199736316
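Because y was standardized, this RMSE is in standard-deviation units of mpg, not in mpg. Multiplying by the standard deviation of the test-set target roughly recovers the error in mpg; a sketch (zscore standardizes with the population standard deviation, hence ddof=0):

# sketch: convert the scaled RMSE back into mpg units
rmse_mpg = math.sqrt(mse['mpg']) * y_test['mpg'].std(ddof=0)
print(rmse_mpg)  # roughly comparable to the ~3.05 mpg RMSE of the unscaled model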

In [76]:

# predict mileage (mpg) for the scaled test-set attributes, which the model did not see during training

y_pred = regression_model.predict(X_test_scaled)


In [77]:

# Since this is regression, plot the predicted y values vs the actual y values for the test data

# A good model's predictions will be close to the actuals, leading to high R and R^2 values

plt.scatter(y_test_scaled['mpg'], y_pred)

Out[77]:

<matplotlib.collections.PathCollection at 0x18104bfc880>

In [78]:

from statsmodels.stats.outliers_influence import variance_inflation_factor

In [79]:

vif = [variance_inflation_factor(X.values, ix) for ix in range(X.shape[1])]


In [80]:

for column, v in zip(X.columns, vif):
    print(column, "--->", v)

cyl ---> 172.09167529137474

disp ---> 87.05808335183303

hp ---> 71.23983108333236

wt ---> 139.1665144189037

acc ---> 69.82068667385671

yr ---> 166.95012233353933

car_type ---> 12.993508077923245
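These VIFs are very high; values above roughly 5-10 are usually taken to flag multicollinearity. A common remedy is to iteratively drop the feature with the highest VIF and recompute; a minimal sketch, assuming a cutoff of 10 (the cutoff is an assumption, not from this notebook):

# sketch: iteratively drop the predictor with the highest VIF until all
# remaining VIFs fall below an assumed threshold of 10
X_reduced = X.copy()
while X_reduced.shape[1] > 1:
    vifs = pd.Series(
        [variance_inflation_factor(X_reduced.values, i) for i in range(X_reduced.shape[1])],
        index=X_reduced.columns)
    if vifs.max() <= 10:
        break
    X_reduced = X_reduced.drop(vifs.idxmax(), axis=1)  # drop the worst offender
print(vifs)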

In [ ]:
