
In [20]:

%matplotlib inline

In [21]:

# Numerical libraries

import numpy as np

# Import Linear Regression machine learning library

from sklearn.linear_model import LinearRegression

# to handle data in the form of rows and columns

import pandas as pd

# importing plotting libraries

import matplotlib.pyplot as plt

import matplotlib.style

plt.style.use('classic')

# importing seaborn for statistical plots

import seaborn as sns

In [22]:

mpg_df = pd.read_csv("car-mpg.csv")


In [23]:

mpg_df.head(50)


Out[23]:

mpg cyl disp hp wt acc yr origin car_type car_name

0 18.0 8 307.0 130 3504 12.0 70 1 0 chevrolet chevelle malibu

1 15.0 8 350.0 165 3693 11.5 70 1 0 buick skylark 320

2 18.0 8 318.0 150 3436 11.0 70 1 0 plymouth satellite

3 16.0 8 304.0 150 3433 12.0 70 1 0 amc rebel sst

4 17.0 8 302.0 140 3449 10.5 70 1 0 ford torino

5 15.0 8 429.0 198 4341 10.0 70 1 0 ford galaxie 500

6 14.0 8 454.0 220 4354 9.0 70 1 0 chevrolet impala

7 14.0 8 440.0 215 4312 8.5 70 1 0 plymouth fury iii

8 14.0 8 455.0 225 4425 10.0 70 1 0 pontiac catalina

9 15.0 8 390.0 190 3850 8.5 70 1 0 amc ambassador dpl

10 15.0 8 383.0 170 3563 10.0 70 1 0 dodge challenger se

11 14.0 8 340.0 160 3609 8.0 70 1 0 plymouth 'cuda 340

12 15.0 8 400.0 150 3761 9.5 70 1 0 chevrolet monte carlo

13 14.0 8 455.0 225 3086 10.0 70 1 0 buick estate wagon (sw)

14 24.0 4 113.0 95 2372 15.0 70 3 1 toyota corona mark ii

15 22.0 6 198.0 95 2833 15.5 70 1 0 plymouth duster

16 18.0 6 199.0 97 2774 15.5 70 1 0 amc hornet

17 21.0 6 200.0 85 2587 16.0 70 1 0 ford maverick

18 27.0 4 97.0 88 2130 14.5 70 3 1 datsun pl510

19 26.0 4 97.0 46 1835 20.5 70 2 1 volkswagen 1131 deluxe sedan

20 25.0 4 110.0 87 2672 17.5 70 2 1 peugeot 504

21 24.0 4 107.0 90 2430 14.5 70 2 1 audi 100 ls

22 25.0 4 104.0 95 2375 17.5 70 2 1 saab 99e

23 26.0 4 121.0 113 2234 12.5 70 2 1 bmw 2002

24 21.0 6 199.0 90 2648 15.0 70 1 0 amc gremlin

25 10.0 8 360.0 215 4615 14.0 70 1 0 ford f250

26 10.0 8 307.0 200 4376 15.0 70 1 0 chevy c20

27 11.0 8 318.0 210 4382 13.5 70 1 0 dodge d200

28 9.0 8 304.0 193 4732 18.5 70 1 0 hi 1200d

29 27.0 4 97.0 88 2130 14.5 71 3 1 datsun pl510

30 28.0 4 140.0 90 2264 15.5 71 1 1 chevrolet vega 2300

31 25.0 4 113.0 95 2228 14.0 71 3 1 toyota corona

32 25.0 4 98.0 ? 2046 19.0 71 1 1 ford pinto

33 19.0 6 232.0 100 2634 13.0 71 1 0 amc gremlin

34 16.0 6 225.0 105 3439 15.5 71 1 0 plymouth satellite custom

35 17.0 6 250.0 100 3329 15.5 71 1 0 chevrolet chevelle malibu

36 19.0 6 250.0 88 3302 15.5 71 1 0 ford torino 500


37 18.0 6 232.0 100 3288 15.5 71 1 0 amc matador

38 14.0 8 350.0 165 4209 12.0 71 1 0 chevrolet impala

39 14.0 8 400.0 175 4464 11.5 71 1 0 pontiac catalina brougham

40 14.0 8 351.0 153 4154 13.5 71 1 0 ford galaxie 500

41 14.0 8 318.0 150 4096 13.0 71 1 0 plymouth fury iii

42 12.0 8 383.0 180 4955 11.5 71 1 0 dodge monaco (sw)

43 13.0 8 400.0 170 4746 12.0 71 1 0 ford country squire (sw)

44 13.0 8 400.0 175 5140 12.0 71 1 0 pontiac safari (sw)

45 18.0 6 258.0 110 2962 13.5 71 1 0 amc hornet sportabout (sw)

46 22.0 4 140.0 72 2408 19.0 71 1 1 chevrolet vega (sw)

47 19.0 6 250.0 100 3282 15.0 71 1 0 pontiac firebird

48 18.0 6 250.0 88 3139 14.5 71 1 0 ford mustang

49 23.0 4 122.0 86 2220 14.0 71 1 1 mercury capri 2000

In [24]:

# drop the car name column as it is useless for the model

mpg_df = mpg_df.drop('car_name', axis=1)

In [25]:

mpg_df

Out[25]:

mpg cyl disp hp wt acc yr origin car_type

0 18.0 8 307.0 130 3504 12.0 70 1 0

1 15.0 8 350.0 165 3693 11.5 70 1 0

2 18.0 8 318.0 150 3436 11.0 70 1 0

3 16.0 8 304.0 150 3433 12.0 70 1 0

4 17.0 8 302.0 140 3449 10.5 70 1 0

... ... ... ... ... ... ... ... ... ...

393 27.0 4 140.0 86 2790 15.6 82 1 1

394 44.0 4 97.0 52 2130 24.6 82 2 1

395 32.0 4 135.0 84 2295 11.6 82 1 1

396 28.0 4 120.0 79 2625 18.6 82 1 1

397 31.0 4 119.0 82 2720 19.4 82 1 1

398 rows × 9 columns


In [26]:

# Replace the numbers in the categorical 'origin' column with the actual region names

mpg_df['origin'] = mpg_df['origin'].replace({1: 'america', 2: 'europe', 3: 'asia'})

In [27]:

mpg_df

Out[27]:

mpg cyl disp hp wt acc yr origin car_type

0 18.0 8 307.0 130 3504 12.0 70 america 0

1 15.0 8 350.0 165 3693 11.5 70 america 0

2 18.0 8 318.0 150 3436 11.0 70 america 0

3 16.0 8 304.0 150 3433 12.0 70 america 0

4 17.0 8 302.0 140 3449 10.5 70 america 0

... ... ... ... ... ... ... ... ... ...

393 27.0 4 140.0 86 2790 15.6 82 america 1

394 44.0 4 97.0 52 2130 24.6 82 europe 1

395 32.0 4 135.0 84 2295 11.6 82 america 1

396 28.0 4 120.0 79 2625 18.6 82 america 1

397 31.0 4 119.0 82 2720 19.4 82 america 1

398 rows × 9 columns

In [28]:

# Convert the categorical variable into dummy/indicator variables. As many columns will be created as there are distinct values

# This is also known as one-hot encoding. The new columns will be origin_america, origin_asia and origin_europe

mpg_df = pd.get_dummies(mpg_df, columns=['origin'])
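One of the three dummy columns is always redundant: if a car is neither origin_asia nor origin_europe, it must be origin_america. A minimal sketch of the alternative, assuming it replaces the get_dummies call above (the drop_first flag tells pandas to drop one level per encoded column):

# sketch: one-hot encode 'origin' while dropping the first level to avoid
# the redundant column (the "dummy variable trap"); would replace the call above
mpg_df_alt = pd.get_dummies(mpg_df, columns=['origin'], drop_first=True)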


In [29]:

mpg_df

Out[29]:

mpg cyl disp hp wt acc yr car_type origin_america origin_asia origin_europe

0 18.0 8 307.0 130 3504 12.0 70 0 1 0 0

1 15.0 8 350.0 165 3693 11.5 70 0 1 0 0

2 18.0 8 318.0 150 3436 11.0 70 0 1 0 0

3 16.0 8 304.0 150 3433 12.0 70 0 1 0 0

4 17.0 8 302.0 140 3449 10.5 70 0 1 0 0

... ... ... ... ... ... ... ... ... ... ... ...

393 27.0 4 140.0 86 2790 15.6 82 1 1 0 0

394 44.0 4 97.0 52 2130 24.6 82 1 0 0 1

395 32.0 4 135.0 84 2295 11.6 82 1 1 0 0

396 28.0 4 120.0 79 2625 18.6 82 1 1 0 0

397 31.0 4 119.0 82 2720 19.4 82 1 1 0 0

398 rows × 11 columns

In [31]:

# Let's analyze the distribution of the dependent (mpg) column and the other attributes

mpg_df.describe().transpose()

Out[31]:

count mean std min 25% 50% 75% max

mpg 398.0 23.514573 7.815984 9.0 17.500 23.0 29.000 46.6

cyl 398.0 5.454774 1.701004 3.0 4.000 4.0 8.000 8.0

disp 398.0 193.425879 104.269838 68.0 104.250 148.5 262.000 455.0

wt 398.0 2970.424623 846.841774 1613.0 2223.750 2803.5 3608.000 5140.0

acc 398.0 15.568090 2.757689 8.0 13.825 15.5 17.175 24.8

yr 398.0 76.010050 3.697627 70.0 73.000 76.0 79.000 82.0

car_type 398.0 0.530151 0.499718 0.0 0.000 1.0 1.000 1.0

origin_america 398.0 0.625628 0.484569 0.0 0.000 1.0 1.000 1.0

origin_asia 398.0 0.198492 0.399367 0.0 0.000 0.0 0.000 1.0

origin_europe 398.0 0.175879 0.381197 0.0 0.000 0.0 0.000 1.0


In [32]:

mpg_df.dtypes

Out[32]:

mpg float64

cyl int64

disp float64

hp object

wt int64

acc float64

yr int64

car_type int64

origin_america uint8

origin_asia uint8

origin_europe uint8

dtype: object

In [33]:

# Note: the hp column is missing from the describe output. That indicates something is not right with that column

In [34]:

# Check if the hp column contains anything other than digits

# run the isdigit() check on the 'hp' column of the mpg_df dataframe. The result will be True or False for every row

# capture the result in a temp dataframe and do a frequency count using value_counts()

# There are six records with non-digit values in the 'hp' column

temp = pd.DataFrame(mpg_df.hp.str.isdigit())  # store True if the string is made of digits, else False, in the hp column of the temp dataframe

temp[temp['hp'] == False]  # from temp take only those rows where hp is False

Out[34]:

hp

32 False

126 False

330 False

336 False

354 False

374 False
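Before deciding how to treat these records, it can help to inspect them in full. A small sketch using the index of the False rows captured above:

# sketch: pull the complete records whose 'hp' value is non-numeric
bad_rows = temp[temp['hp'] == False].index
mpg_df.loc[bad_rows]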

In [36]:

# On inspecting records 32, 126 etc., we find "?" in the hp column

# Replace them with nan; the records containing nan can then be dropped or imputed

mpg_df = mpg_df.replace('?', np.nan)


In [37]:

#Let us see if we can get those records with nan

mpg_df[mpg_df.isnull().any(axis=1)]

Out[37]:

mpg cyl disp hp wt acc yr car_type origin_america origin_asia origin_europe

32 25.0 4 98.0 NaN 2046 19.0 71 1 1 0 0

126 21.0 6 200.0 NaN 2875 17.0 74 0 1 0 0

330 40.9 4 85.0 NaN 1835 17.3 80 1 0 0 1

336 23.6 4 140.0 NaN 2905 14.3 80 1 1 0 0

354 34.5 4 100.0 NaN 2320 15.8 81 1 0 0 1

374 23.0 4 151.0 NaN 3035 20.5 82 1 1 0 0
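A quick per-column count confirms that hp is the only affected attribute; a small sketch:

# sketch: count missing values per column (expected: hp -> 6, all others -> 0)
mpg_df.isnull().sum()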

In [38]:

# There are various ways to handle missing values: drop the rows, replace missing values with median values, etc.

In [39]:

# Of the 398 rows, 6 have NaN in the hp column. We could drop those 6 rows, but that is not a good idea under all situations

# note: hp is missing from describe because of the non-numeric values in the column

#mpg_df = mpg_df.dropna()


In [40]:

# instead of dropping the rows, let's replace the missing values with the median value

# note the missing parentheses below: mpg_df.median returns the bound method itself rather than computing the medians; mpg_df.median() would compute them

mpg_df.median

Out[40]:

<bound method DataFrame.median of       mpg  cyl   disp   hp    wt   acc  yr  car_type  origin_america  origin_asia  origin_europe

0    18.0    8  307.0  130  3504  12.0  70         0               1            0              0

1    15.0    8  350.0  165  3693  11.5  70         0               1            0              0

2    18.0    8  318.0  150  3436  11.0  70         0               1            0              0

3    16.0    8  304.0  150  3433  12.0  70         0               1            0              0

4    17.0    8  302.0  140  3449  10.5  70         0               1            0              0

..    ...  ...    ...  ...   ...   ...  ..       ...             ...          ...            ...

393  27.0    4  140.0   86  2790  15.6  82         1               1            0              0

394  44.0    4   97.0   52  2130  24.6  82         1               0            0              1

395  32.0    4  135.0   84  2295  11.6  82         1               1            0              0

396  28.0    4  120.0   79  2625  18.6  82         1               1            0              0

397  31.0    4  119.0   82  2720  19.4  82         1               1            0              0

[398 rows x 11 columns]>

In [41]:

# replace the missing values in 'hp' with the median value of 'hp'. Note: we do not need to specify the column names

# every column's missing value is replaced with that column's median respectively (axis=0 means column-wise)

#mpg_df = mpg_df.fillna(mpg_df.median())

mpg_df = mpg_df.apply(lambda x: x.fillna(x.median()), axis=0)


In [42]:

mpg_df.dtypes

Out[42]:

mpg float64

cyl int64

disp float64

hp object

wt int64

acc float64

yr int64

car_type int64

origin_america uint8

origin_asia uint8

origin_europe uint8

dtype: object

In [43]:

mpg_df['hp'] = mpg_df['hp'].astype('float64')  # converting the hp column from object / string type to float

In [45]:

mpg_df.describe()

Out[45]:

mpg cyl disp hp wt acc yr

count 398.000000 398.000000 398.000000 398.000000 398.000000 398.000000 398.000000

mean 23.514573 5.454774 193.425879 104.304020 2970.424623 15.568090 76.010050

std 7.815984 1.701004 104.269838 38.222625 846.841774 2.757689 3.697627

min 9.000000 3.000000 68.000000 46.000000 1613.000000 8.000000 70.000000

25% 17.500000 4.000000 104.250000 76.000000 2223.750000 13.825000 73.000000

50% 23.000000 4.000000 148.500000 93.500000 2803.500000 15.500000 76.000000

75% 29.000000 8.000000 262.000000 125.000000 3608.000000 17.175000 79.000000

max 46.600000 8.000000 455.000000 230.000000 5140.000000 24.800000 82.000000


In [46]:

# Let us do a correlation analysis among the different dimensions and also each dimension with the dependent dimension

# This is done using the scatter matrix function which creates a dashboard reflecting useful information about the dimensions

# The result can be stored as a .png file and opened in, say, Paint to get a larger view

mpg_df_attr = mpg_df.iloc[:, 0:10]

#axes = pd.plotting.scatter_matrix(mpg_df_attr)

#plt.tight_layout()

#plt.savefig('d:\greatlakes\mpg_pairpanel.png')

sns.pairplot(mpg_df_attr, diag_kind='kde')  # to plot density curve instead of histogram

#sns.pairplot(mpg_df_attr)  # to plot histogram, the default

Out[46]:

<seaborn.axisgrid.PairGrid at 0x1817e50cfd0>
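A numeric complement to the pairplot is a correlation heatmap, which condenses the same pairwise relationships into single coefficients; a small sketch:

# sketch: Pearson correlations between all attributes as an annotated heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(mpg_df_attr.corr(), annot=True, fmt='.2f', cmap='coolwarm')
plt.show()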


In [47]:

# The data distribution across various dimensions except 'acc' does not look normal

# Close observation between 'mpg' and other attributes indicates the relationship is not really linear

# The relation between 'mpg' and 'hp' shows heteroscedasticity... which will impact model accuracy

# How about 'mpg' vs 'yr'? Surprising to see a positive relation

In [50]:

# Copy all the predictor variables into the X dataframe. Since 'mpg' is the dependent variable, drop it

X = mpg_df.drop('mpg', axis=1)

X = X.drop(['origin_america', 'origin_asia', 'origin_europe'], axis=1)

# Copy the 'mpg' column alone into the y dataframe. This is the dependent variable

y = mpg_df[['mpg']]

In [51]:

# Let us break the X and y dataframes into a training set and a test set. For this we will use

# the sklearn package's data-splitting function, which is based on random sampling

from sklearn.model_selection import train_test_split

In [52]:

# Split X and y into training and test sets in a 70:30 ratio

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)

In [53]:

# invoke the LinearRegression function and find the best-fit model on the training data

regression_model = LinearRegression()

regression_model.fit(X_train, y_train)

Out[53]:

LinearRegression()


In [54]:

# Let us explore the coefficients for each of the independent attributes

for idx, col_name in enumerate(X_train.columns):
    print("The coefficient for {} is {}".format(col_name, regression_model.coef_[0][idx]))

The coefficient for cyl is 1.8637178336269913

The coefficient for disp is 0.010066050755003018

The coefficient for hp is -0.039229006295896617

The coefficient for wt is -0.006414996525192761

The coefficient for acc is 0.011723809488668087

The coefficient for yr is 0.7588184849759366

The coefficient for car_type is 6.62652133924925
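The same coefficients are easier to compare as a labelled pandas Series; a small sketch:

# sketch: coefficients as a Series indexed by column name, sorted by value
coef_series = pd.Series(regression_model.coef_[0], index=X_train.columns)
print(coef_series.sort_values())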

In [55]:

# Let us check the intercept for the model

intercept = regression_model.intercept_[0]

print("The intercept for our model is {}".format(intercept))

The intercept for our model is -26.693360125832104

In [56]:

regression_model.score(X_train, y_train)

Out[56]:

0.825809118133759

In [57]:

# Model score - R^2 or coefficient of determination

# R^2 = 1 - RSS/TSS (RSS: residual sum of squares; TSS: total sum of squares)

regression_model.score(X_test, y_test)

Out[57]:

0.8404952015294237
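The score can be verified by computing R^2 = 1 - RSS/TSS by hand; a small sketch:

# sketch: recompute the test-set R^2 from its definition to confirm it
# matches regression_model.score(X_test, y_test)
pred = regression_model.predict(X_test)
rss = np.sum((y_test.values - pred) ** 2)                   # residual sum of squares
tss = np.sum((y_test.values - y_test.values.mean()) ** 2)   # total sum of squares
print(1 - rss / tss)  # ~0.8405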


In [58]:

# R^2 is not a reliable metric as it always increases with the addition of more attributes, even if the attributes have no

# influence on the predicted variable. Instead we use adjusted R^2, which removes the statistical chance that improves R^2

# Scikit-learn does not provide a facility for adjusted R^2... so we use

# statsmodels, a library that gives results similar to

# what you obtain in the R language

# This library expects the X and y to be given in one single dataframe

data_train = pd.concat([X_train, y_train], axis=1)

data_train.head()

Out[58]:

cyl disp hp wt acc yr car_type mpg

350 4 105.0 63.0 2215 14.9 81 1 34.7

59 4 97.0 54.0 2254 23.5 72 1 23.0

120 4 121.0 112.0 2868 15.5 73 1 19.0

12 8 400.0 150.0 3761 9.5 70 0 15.0

349 4 91.0 68.0 1985 16.0 81 1 34.1
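For reference, adjusted R^2 can also be computed by hand from the plain R^2, the sample size n and the number of predictors p, via adj R^2 = 1 - (1 - R^2)(n - 1)/(n - p - 1); a sketch against the training fit above:

# sketch: adjusted R^2 from the training-set R^2
n, p = X_train.shape                           # observations, predictors
r2 = regression_model.score(X_train, y_train)
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
print(adj_r2)  # should be close to the 0.821 that statsmodels reports below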

In [59]:

import statsmodels.formula.api as smf

lm1 = smf.ols(formula= 'mpg ~ cyl+disp+hp+wt+acc+yr+car_type', data = data_train).fit()

lm1.params

Out[59]:

Intercept -26.693360

cyl 1.863718

disp 0.010066

hp -0.039229

wt -0.006415

acc 0.011724

yr 0.758818

car_type 6.626521

dtype: float64


In [60]:

print(lm1.summary()) #Inferential statistics


                            OLS Regression Results
==============================================================================
Dep. Variable:                    mpg   R-squared:                       0.826
Model:                            OLS   Adj. R-squared:                  0.821
Method:                 Least Squares   F-statistic:                     182.9
Date:                Wed, 31 Mar 2021   Prob (F-statistic):           1.41e-98
Time:                        15:17:36   Log-Likelihood:                -725.17
No. Observations:                 278   AIC:                             1466.
Df Residuals:                     270   BIC:                             1495.
Df Model:                           7
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    -26.6934      5.597     -4.769      0.000     -37.713     -15.674
cyl            1.8637      0.517      3.606      0.000       0.846       2.881
disp           0.0101      0.009      1.123      0.262      -0.008       0.028
hp            -0.0392      0.016     -2.420      0.016      -0.071      -0.007
wt            -0.0064      0.001     -7.865      0.000      -0.008      -0.005
acc            0.0117      0.114      0.103      0.918      -0.212       0.236
yr             0.7588      0.060     12.668      0.000       0.641       0.877
car_type       6.6265      1.041      6.364      0.000       4.577       8.677
==============================================================================
Omnibus:                       35.838   Durbin-Watson:                   2.082
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               68.579
Skew:                           0.693   Prob(JB):                     1.28e-15
Kurtosis:                       5.000   Cond. No.                     8.64e+04
==============================================================================

Notes:

[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

[2] The condition number is large, 8.64e+04. This might indicate that there are

strong multicollinearity or other numerical problems.


In [61]:

# Let us check the mean squared error by predicting the value of y for the test cases and

# subtracting it from the actual y for the test cases

mse = np.mean((regression_model.predict(X_test)-y_test)**2)

In [62]:

# the square root of the mean squared error is the RMSE, i.e. the typical deviation between predicted and actual values

import math

math.sqrt(mse)

Out[62]:

3.0538103653849573
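scikit-learn's metrics module offers the same computation; a sketch, assuming a scikit-learn version recent enough to support the squared=False flag, which returns the RMSE directly:

# sketch: RMSE via sklearn instead of computing it by hand
from sklearn.metrics import mean_squared_error

print(mean_squared_error(y_test, regression_model.predict(X_test), squared=False))  # ~3.05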

In [63]:

# so the predictions deviate from the real mpg by about 3.0 (rounded) mpg on average

In [64]:

# Model score - R^2 or coefficient of determination

# R^2 = 1 - RSS/TSS

regression_model.score(X_test, y_test)

Out[64]:

0.8404952015294237

In [65]:

# predict mileage (mpg) for the test-set attributes, which the model did not see during training

y_pred = regression_model.predict(X_test)


In [66]:

# Since this is regression, plot the predicted y values vs the actual y values for the test data

# A good model's predictions will be close to the actuals, leading to high R and R^2 values

#plt.rcParams['figure.dpi'] = 500

plt.scatter(y_test['mpg'], y_pred)

Out[66]:

<matplotlib.collections.PathCollection at 0x18104b9ddc0>
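A 45-degree reference line makes it easier to judge how far the points stray from a perfect prediction; a sketch (the reference line is an addition, not part of the original plot):

# sketch: same scatter with a y = x reference line
plt.scatter(y_test['mpg'], y_pred)
lims = [y_test['mpg'].min(), y_test['mpg'].max()]
plt.plot(lims, lims, 'r--')  # points on this line are perfect predictions
plt.xlabel('actual mpg')
plt.ylabel('predicted mpg')
plt.show()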

In [67]:

# How do we improve the model? The test R^2 is about 0.84; how do we improve it?

# The independent attributes have different units and scales of measurement

# It is always good practice to scale all the dimensions using z-scores or some other method to address the problem of different scales


In [68]:

from scipy.stats import zscore

X_train_scaled = X_train.apply(zscore)

X_test_scaled = X_test.apply(zscore)

y_train_scaled = y_train.apply(zscore)
y_test_scaled = y_test.apply(zscore)
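Note that applying zscore to the train and test sets independently standardizes each with its own mean and standard deviation. The more common practice is to fit the scaling statistics on the training data only and reuse them on the test data; a sketch with scikit-learn's StandardScaler:

# sketch: fit scaling statistics on the training set only, then apply the
# same transform to the test set (avoids test-set information leakage)
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)
X_train_std = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
X_test_std = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)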

In [69]:

# invoke the LinearRegression function and find the best-fit model on the scaled training data

regression_model = LinearRegression()

regression_model.fit(X_train_scaled, y_train_scaled)

Out[69]:

LinearRegression()

In [70]:

# this cell merely constructs a LinearRegression estimator with its default parameters for display; it is not fitted or assigned
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

Out[70]:

LinearRegression()

In [71]:

# Let us explore the coefficients for each of the independent attributes

for idx, col_name in enumerate(X_train.columns):
    print("The coefficient for {} is {}".format(col_name, regression_model.coef_[0][idx]))

The coefficient for cyl is 0.3957470005920464

The coefficient for disp is 0.1305719525443726

The coefficient for hp is -0.18704682143280665

The coefficient for wt is -0.6881521871765556

The coefficient for acc is 0.004130471087879466

The coefficient for yr is 0.35562759880850775

The coefficient for car_type is 0.4197709075092129

In [72]:

intercept = regression_model.intercept_[0]

print("The intercept for our model is {}".format(intercept))

The intercept for our model is -2.8520553264365213e-18


In [73]:

# Model score - R^2 or coefficient of determination

# R^2 = 1 - RSS/TSS

regression_model.score(X_test_scaled, y_test_scaled)

Out[73]:

0.8460575288663481

In [74]:

# Let us check the mean squared error by predicting the value of y for the test cases and

# subtracting it from the actual y for the test cases

mse = np.mean((regression_model.predict(X_test_scaled)-y_test_scaled)**2)

In [75]:

# the square root of the mean squared error is the RMSE, i.e. the typical deviation between predicted and actual values (here in scaled units)

import math

math.sqrt(mse)

Out[75]:

0.39235503199736316
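Because y was standardized, this RMSE is in standard-deviation units of mpg, not in mpg. Multiplying by the standard deviation of the test-set target roughly recovers the error in mpg; a sketch (zscore standardizes with the population standard deviation, hence ddof=0):

# sketch: convert the scaled RMSE back into mpg units
rmse_mpg = math.sqrt(mse['mpg']) * y_test['mpg'].std(ddof=0)
print(rmse_mpg)  # roughly comparable to the ~3.05 mpg RMSE of the unscaled model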

In [76]:

# predict mileage (mpg) for the scaled test-set attributes, which the model did not see during training

y_pred = regression_model.predict(X_test_scaled)


In [77]:

# Since this is regression, plot the predicted y values vs the actual y values for the test data

# A good model's predictions will be close to the actuals, leading to high R and R^2 values

plt.scatter(y_test_scaled['mpg'], y_pred)

Out[77]:

<matplotlib.collections.PathCollection at 0x18104bfc880>

In [78]:

from statsmodels.stats.outliers_influence import variance_inflation_factor

In [79]:

vif = [variance_inflation_factor(X.values, ix) for ix in range(X.shape[1])]


In [80]:

for column, v in zip(X.columns, vif):
    print(column, "--->", v)

cyl ---> 172.09167529137474

disp ---> 87.05808335183303

hp ---> 71.23983108333236

wt ---> 139.1665144189037

acc ---> 69.82068667385671

yr ---> 166.95012233353933

car_type ---> 12.993508077923245
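These VIFs are very high; values above roughly 5-10 are usually taken to flag multicollinearity. A common remedy is to iteratively drop the feature with the highest VIF and recompute; a minimal sketch, assuming a cutoff of 10 (the cutoff is an assumption, not from this notebook):

# sketch: iteratively drop the predictor with the highest VIF until all
# remaining VIFs fall below an assumed threshold of 10
X_reduced = X.copy()
while X_reduced.shape[1] > 1:
    vifs = pd.Series(
        [variance_inflation_factor(X_reduced.values, i) for i in range(X_reduced.shape[1])],
        index=X_reduced.columns)
    if vifs.max() <= 10:
        break
    X_reduced = X_reduced.drop(vifs.idxmax(), axis=1)  # drop the worst offender
print(vifs)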

In [ ]:
