You are on page 1of 8

Ashish Chavan Practical No.4 Roll no.

11 (TA1)

Import libraries
In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
In [4]:
boston = load_boston()
boston.keys()
Out[4]:
dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename',
'data_module'])
In [5]:
x = pd.DataFrame(boston.data, columns=boston.feature_names)
y = pd.DataFrame(boston.target, columns=['MEDV'])
In [6]:
x.head()
Out[6]:
INDU CHA NO AG RA TA PTRATI LSTA
CRIM ZN RM DIS B
S S X E D X O T

0.0063 18. 0.53 6.57 4.090 296. 396.9


0 2.31 0.0 65.2 1.0 15.3 4.98
2 0 8 5 0 0 0

0.0273 0.46 6.42 4.967 242. 396.9


1 0.0 7.07 0.0 78.9 2.0 17.8 9.14
1 9 1 1 0 0

0.0272 0.46 7.18 4.967 242. 392.8


2 0.0 7.07 0.0 61.1 2.0 17.8 4.03
9 9 5 1 0 3

0.0323 0.45 6.99 6.062 222. 394.6


3 0.0 2.18 0.0 45.8 3.0 18.7 2.94
7 8 8 2 0 3

0.0690 0.45 7.14 6.062 222. 396.9


4 0.0 2.18 0.0 54.2 3.0 18.7 5.33
5 8 7 2 0 0

In [7]:
x.shape, y.shape
Out[7]:
Ashish Chavan Practical No.4 Roll no.11 (TA1)

((506, 13), (506, 1))

Basic stats
In [8]:
x.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 CRIM 506 non-null float64
1 ZN 506 non-null float64
2 INDUS 506 non-null float64
3 CHAS 506 non-null float64
4 NOX 506 non-null float64
5 RM 506 non-null float64
6 AGE 506 non-null float64
7 DIS 506 non-null float64
8 RAD 506 non-null float64
9 TAX 506 non-null float64
10 PTRATIO 506 non-null float64
11 B 506 non-null float64
12 LSTAT 506 non-null float64
dtypes: float64(13)
memory usage: 51.5 KB
In [9]:
x.describe()
Out[9]:
PTR
CRI IND CHA LST
ZN NOX RM AGE DIS RAD TAX ATI B
M US S AT
O

co 506.0 506.0 506.0 506.0 506.0 506.0 506.0 506.0 506.0 506.0 506.0 506.0 506.0
un 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000
t 0 0 0 0 0 0 0 0 0 0 0 0 0

m 408.2 356.6
3.613 11.36 11.13 0.069 0.554 6.284 68.57 3.795 9.549 18.45 12.65
ea 3715 7403
524 3636 6779 170 695 634 4901 043 407 5534 3063
n 4 2

168.5
st 8.601 23.32 6.860 0.253 0.115 0.702 28.14 2.105 8.707 2.164 91.29 7.141
3711
d 545 2453 353 994 878 617 8861 710 259 946 4864 062
6

187.0
mi 0.006 0.000 0.460 0.000 0.385 3.561 2.900 1.129 1.000 12.60 0.320 1.730
0000
n 320 000 000 000 000 000 000 600 000 0000 000 000
0

25 0.082 0.000 5.190 0.000 0.449 5.885 45.02 2.100 4.000 279.0 17.40 375.3 6.950
% 045 000 000 000 000 500 5000 175 000 0000 0000 7750
Ashish Chavan Practical No.4 Roll no.11 (TA1)

PTR
CRI IND CHA LST
ZN NOX RM AGE DIS RAD TAX ATI B
M US S AT
O

0 0 000

330.0 391.4
50 0.256 0.000 9.690 0.000 0.538 6.208 77.50 3.207 5.000 19.05 11.36
0000 4000
% 510 000 000 000 000 500 0000 450 000 0000 0000
0 0

666.0 396.2
75 3.677 12.50 18.10 0.000 0.624 6.623 94.07 5.188 24.00 20.20 16.95
0000 2500
% 083 0000 0000 000 000 500 5000 425 0000 0000 5000
0 0

100.0 100.0 711.0 396.9


m 88.97 27.74 1.000 0.871 8.780 12.12 24.00 22.00 37.97
0000 0000 0000 0000
ax 6200 0000 000 000 000 6500 0000 0000 0000
0 0 0 0

In [10]:
y.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 1 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 MEDV 506 non-null float64
dtypes: float64(1)
memory usage: 4.1 KB
In [11]:
y.describe()
Out[11]:
MEDV

count 506.000000

mean 22.532806

std 9.197104

min 5.000000

25% 17.025000

50% 21.200000

75% 25.000000
Ashish Chavan Practical No.4 Roll no.11 (TA1)

MEDV

max 50.000000

In [12]:
x.isnull().sum()
Out[12]:
CRIM 0
ZN 0
INDUS 0
CHAS 0
NOX 0
RM 0
AGE 0
DIS 0
RAD 0
TAX 0
PTRATIO 0
B 0
LSTAT 0
dtype: int64
In [13]:
y.isnull().sum()
Out[13]:
MEDV 0
dtype: int64
In [14]:
df = x
df["target"] = y
df.head()
Out[14]:
CRI Z IND CH NO AG RA TA PTRAT LST targ
RM DIS B
M N US AS X E D X IO AT et

0.006 18. 0.53 6.57 65. 4.09 296. 396.


0 2.31 0.0 1.0 15.3 4.98 24.0
32 0 8 5 2 00 0 90

0.027 0.46 6.42 78. 4.96 242. 396.


1 0.0 7.07 0.0 2.0 17.8 9.14 21.6
31 9 1 9 71 0 90

0.027 0.46 7.18 61. 4.96 242. 392.


2 0.0 7.07 0.0 2.0 17.8 4.03 34.7
29 9 5 1 71 0 83

0.032 0.45 6.99 45. 6.06 222. 394.


3 0.0 2.18 0.0 3.0 18.7 2.94 33.4
37 8 8 8 22 0 63
Ashish Chavan Practical No.4 Roll no.11 (TA1)

CRI Z IND CH NO AG RA TA PTRAT LST targ


RM DIS B
M N US AS X E D X IO AT et

0.069 0.45 7.14 54. 6.06 222. 396.


4 0.0 2.18 0.0 3.0 18.7 5.33 36.2
05 8 7 2 22 0 90

In [15]:
plt.figure(figsize=(15,10))
sns.heatmap(df.corr(), annot=True)
plt.show()

Considering only 'RM' and 'LSTAT' by considering correlation and multi-


collinearity of other features

In [51]:
df = df[['RM', 'LSTAT', 'target']]
In [52]:
sns.pairplot(df)
plt.show()
Ashish Chavan Practical No.4 Roll no.11 (TA1)

In [63]:
x = df[['RM', 'LSTAT']]
y = df['target']

Scale the data


In [64]:
scaler = StandardScaler()
In [65]:
x = scaler.fit_transform(x)

Split the data


In [67]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3,
shuffle=True)
Ashish Chavan Practical No.4 Roll no.11 (TA1)

In [68]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape
Out[68]:
((354, 2), (152, 2), (354,), (152,))

Linear Regression Modelling


In [69]:
model = LinearRegression(n_jobs=-1)
In [70]:
model.fit(x_train, y_train)
Out[70]:
LinearRegression(n_jobs=-1)

Make predictions
In [71]:
y_pred = model.predict(x_test)
In [72]:
mean_absolute_error(y_test, y_pred)
Out[72]:
3.701010266760501
In [73]:
mean_squared_error(y_test, y_pred)
Out[73]:
30.5001478179898
In [74]:
sns.regplot(y_test, y_pred, color='red')
plt.show()
Ashish Chavan Practical No.4 Roll no.11 (TA1)

You might also like