Professional Documents
Culture Documents
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
Exploring Dataset
# Read the housing data set from the Colab session's /content directory.
csv_path = '/content/Housing.csv'
df = pd.read_csv(csv_path)
# Bare expression: the notebook renders the frame as this cell's output.
df
price area bedrooms bathrooms stories mainroad guestroom basement hotwaterheating aircondit
1767150 2400 3 1 1 no no no no
1750000 2910 3 1 1 no no no no
545 rows × 13 columns
# Count how often each distinct row (all columns taken together) occurs.
# Selecting every column first (df[df.columns]) only produced a copy of the
# same frame, so value_counts() is called on df directly.
df.value_counts()
price area bedrooms bathrooms stories mainroad guestroom basement hotwaterheating airconditioning parking prefarea
furnishingstatus
1750000 2910 3 1 1 0 0 0 0 0 0 0
0 1
5229000 7085 3 1 1 1 1 1 0 0 2 1
1 1
5110000 11410 2 1 2 1 0 0 0 0 0 1
0 1
5145000 3410 3 1 2 0 0 0 0 1 0 0
1 1
7980 3 1 1 1 0 0 0 0 1 1
1 1
..
3675000 3630 2 1 1 1 0 0 0 1 0 0
2 1
3600 2 1 1 1 0 0 0 0 0 0
0 1
3640000 5960 3 1 2 1 1 1 0 0 0 0
2 1
4280 2 1 1 1 0 0 0 1 2 0
1 1
13300000 7420 4 2 3 1 0 0 0 1 2 1
0 1
Length: 545, dtype: int64
df.shape
(545, 13)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
https://colab.research.google.com/drive/1_eJyUtCOeeskUN6uNmHQYel6QJP98u0l#scrollTo=eLxfZUqG37F9&printMode=true 1/6
2/10/23, 9:34 PM housing_21070126112 - Colaboratory
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 price 545 non-null int64
1 area 545 non-null int64
2 bedrooms 545 non-null int64
3 bathrooms 545 non-null int64
4 stories 545 non-null int64
5 mainroad 545 non-null object
6 guestroom 545 non-null object
7 basement 545 non-null object
8 hotwaterheating 545 non-null object
9 airconditioning 545 non-null object
10 parking 545 non-null int64
11 prefarea 545 non-null object
12 furnishingstatus 545 non-null object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB
df.isnull().sum()
price 0
area 0
bedrooms 0
bathrooms 0
stories 0
mainroad 0
guestroom 0
basement 0
hotwaterheating 0
airconditioning 0
parking 0
prefarea 0
furnishingstatus 0
dtype: int64
Encoding
# Replace the text-valued flag columns with integer codes.
# One encoder instance is reused and refitted for each column in turn, so
# after this cell it holds the classes of the last column processed.
label_encoder = LabelEncoder()
flag_columns = (
    'mainroad',
    'guestroom',
    'basement',
    'hotwaterheating',
    'airconditioning',
    'prefarea',
)
for column_name in flag_columns:
    df[column_name] = label_encoder.fit_transform(df[column_name])
df
price area bedrooms bathrooms stories mainroad guestroom basement hotwaterheating airconditi
13300000 7420 4 2 3 1 0 0 0
12250000 8960 4 4 4 1 0 0 0
12250000 9960 3 2 2 1 0 1 0
12215000 7500 4 2 2 1 0 1 0
11410000 7420 4 1 2 1 1 1 0
1820000 3000 2 1 1 1 0 1 0
1767150 2400 3 1 1 0 0 0 0
1750000 3620 2 1 1 1 0 0 0
1750000 2910 3 1 1 0 0 0 0
1750000 3850 3 1 2 1 0 0 0
545 rows × 13 columns
# Integer-encode the furnishing status, refitting the shared encoder on it.
furnishing_codes = label_encoder.fit_transform(df['furnishingstatus'])
df['furnishingstatus'] = furnishing_codes
df
https://colab.research.google.com/drive/1_eJyUtCOeeskUN6uNmHQYel6QJP98u0l#scrollTo=eLxfZUqG37F9&printMode=true 2/6
2/10/23, 9:34 PM housing_21070126112 - Colaboratory
price area bedrooms bathrooms stories mainroad guestroom basement hotwaterheating airconditi
13300000 7420 4 2 3 1 0 0 0
12250000 8960 4 4 4 1 0 0 0
12250000 9960 3 2 2 1 0 1 0
12215000 7500 4 2 2 1 0 1 0
11410000 7420 4 1 2 1 1 1 0
1820000 3000 2 1 1 1 0 1 0
1767150 2400 3 1 1 0 0 0 0
1750000 3620 2 1 1 1 0 0 0
1750000 2910 3 1 1 0 0 0 0
1750000 3850 3 1 2 1 0 0 0
EDA-visualisation
545 rows × 13 columns
# Annotated heatmap of the pairwise column correlations on a 14x14 figure.
fig, ax = plt.subplots(figsize=(14, 14))
# The keyword previously contained a stray "·" character ("annot·=True"),
# which is not valid Python; it is plain `annot=True`.
sns.heatmap(df.corr(), annot=True)
<matplotlib.axes._subplots.AxesSubplot at 0x7fa992ac0c10>
# Joint plot of price against area, coloured by furnishing status.
sns.jointplot(
    data=df,
    x="area",
    y="price",
    hue="furnishingstatus",
    palette='dark:salmon_r',
)
https://colab.research.google.com/drive/1_eJyUtCOeeskUN6uNmHQYel6QJP98u0l#scrollTo=eLxfZUqG37F9&printMode=true 3/6
2/10/23, 9:34 PM housing_21070126112 - Colaboratory
<seaborn.axisgrid.JointGrid at 0x7fa98ec74190>
# Scatter of price against area; marker colour encodes the parking count and
# marker style encodes the guestroom flag.
# x and y are passed by keyword together with data=df: recent seaborn
# releases no longer accept them positionally, and this matches the
# data=/x=/y= form used by the other plots in this notebook.
sns.scatterplot(
    data=df,
    x="area",
    y="price",
    hue="parking",
    style="guestroom",
    palette="rainbow",
)
plt.show()
# Line plot of area against parking count, one line per furnishing status.
sns.lineplot(
    data=df,
    x="parking",
    y="area",
    hue="furnishingstatus",
)
<matplotlib.axes._subplots.AxesSubplot at 0x7fa98f10e5e0>
df.columns
# Feature matrix and target for the price regression.
# The printed source line was cut off after 'prefarea' and lacked its closing
# brackets; the coefficient listing printed further down shows exactly these
# eleven features (ending with 'prefarea'), so the list is closed there.
feature_columns = [
    'area',
    'bedrooms',
    'bathrooms',
    'stories',
    'mainroad',
    'guestroom',
    'basement',
    'hotwaterheating',
    'airconditioning',
    'parking',
    'prefarea',
]
X = df[feature_columns]
y = df['price']

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# Fit an ordinary least-squares model on the training split only.
lr = LinearRegression()
lr.fit(X_train, y_train)
LinearRegression()
0.6490698391713673
# Report the fitted intercept, then pair every feature name with its weight.
print("Coefficients: ")
print("Intercept: ", lr.intercept_)
# Iterating a DataFrame yields its column labels, so X.columns gives the
# same names in the same order.
[(feature_name, weight) for feature_name, weight in zip(X.columns, lr.coef_)]
Coefficients:
Intercept: -230797.8185174279
[('area', 252.74190153064856),
('bedrooms', 99525.89859410445),
('bathrooms', 1132688.6894603525),
('stories', 414361.566065939),
('mainroad', 458206.3753303046),
('guestroom', 339809.57305214717),
('basement', 518163.65271511185),
('hotwaterheating', 645620.7904595259),
('airconditioning', 698145.6458023346),
('parking', 308089.73272037157),
('prefarea', 533164.4687235687)]
# Predict prices for the held-out rows and show them next to the true values.
y_pred_lr = lr.predict(X_test)
print("Prediction for test set:{}".format(y_pred_lr))
comparison = {'Actual value': y_test, 'Predicted value': y_pred_lr}
lr_diff = pd.DataFrame(comparison)
lr_diff
https://colab.research.google.com/drive/1_eJyUtCOeeskUN6uNmHQYel6QJP98u0l#scrollTo=eLxfZUqG37F9&printMode=true 5/6
2/10/23, 9:34 PM housing_21070126112 - Colaboratory
# Output of the `lr_diff` cell above. The printout had interleaved these table
# rows with the metrics code below; they are kept here as recovered (the
# header and the first rows are not present in the printout):
#    77   6650000  6.908581e+06
#    90   6440000  4.449253e+06
#   ...       ...           ...
#    17   8960000  7.911441e+06
#   402   3500000  3.459633e+06
#    66   6930000  6.781578e+06
#   238   4613000  5.377642e+06
#   272   4340000  4.577112e+06
#   180 rows × 2 columns

# Kept from the original cell; placed before the first use of `metrics`.
from sklearn import metrics

# Error metrics of the predictions on the held-out test split.
meanAbErr = metrics.mean_absolute_error(y_test, y_pred_lr)
meanSqErr = metrics.mean_squared_error(y_test, y_pred_lr)
# RMSE is the square root of the MSE already computed above.
rootMeanSqErr = np.sqrt(meanSqErr)

# NOTE(review): this R^2 is scored on the full data set (X, y), which includes
# the training rows, while the error metrics below use only the test split.
# Consider lr.score(X_test, y_test) if a held-out R^2 is intended.
print("R squared {:.2f}".format(lr.score(X,y)*100))
print("Mean Absolute Error: ",meanAbErr)
print("Mean Square Error: ",meanSqErr)
print("Root Mean Square Error",rootMeanSqErr)

# Recorded output of this cell:
#   R squared 66.73
#   Mean Absolute Error: 902928.2604905332
#   Mean Square Error: 1503987779975.0256
#   Root Mean Square Error 1226371.795164511
https://colab.research.google.com/drive/1_eJyUtCOeeskUN6uNmHQYel6QJP98u0l#scrollTo=eLxfZUqG37F9&printMode=true 6/6