You are on page 1of 16

Importing necessary Python Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

Read Data Set

data = pd.read_csv("/content/House_Rent_Dataset.csv")

See top 5 rows

data.head()

Posted Area Furnishing Tenant Point of


BHK Rent Size Floor Area Locality City Bathroom
On Type Status Preferred Contact

2022-05- Ground out Super Contact


0 2 10000 1100 Bandel Kolkata Unfurnished Bachelors/Family 2
18 of 2 Area Owner

2022-05- Super Phool Bagan, Semi- Contact


1 2 20000 800 1 out of 3 Kolkata Bachelors/Family 1
13 Area Kankurgachi Furnished Owner

2022-05- Super Salt Lake City Sector Semi- Contact


2 2 17000 1000 1 out of 3 Kolkata Bachelors/Family 1
16 Area 2 Furnished Owner

2022-07- Super Contact


3 2 10000 800 1 out of 2 Dumdum Park Kolkata Unfurnished Bachelors/Family 1
04 Area Owner

2022-05- Carpet Contact


4 2 7500 850 1 out of 2 South Dum Dum Kolkata Unfurnished Bachelors 1
09 Area Owner

see column data type and some info

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4746 entries, 0 to 4745
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Posted On 4746 non-null object
1 BHK 4746 non-null int64
2 Rent 4746 non-null int64
3 Size 4746 non-null int64
4 Floor 4746 non-null object
5 Area Type 4746 non-null object
6 Area Locality 4746 non-null object
7 City 4746 non-null object
8 Furnishing Status 4746 non-null object
9 Tenant Preferred 4746 non-null object
10 Bathroom 4746 non-null int64
11 Point of Contact 4746 non-null object
dtypes: int64(4), object(8)
memory usage: 445.1+ KB

See the percentage of missing values in each column

data.isna().sum() / data.shape[0]

Posted On 0.0
BHK 0.0
Rent 0.0
Size 0.0
Floor 0.0
Area Type 0.0
Area Locality 0.0
City 0.0
Furnishing Status 0.0
Tenant Preferred 0.0
Bathroom 0.0
Point of Contact 0.0
dtype: float64

Check whether the data contains duplicated rows

data.duplicated().any()

False

see quick info of numeric values

# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
data.describe()

BHK Rent Size Bathroom

count 4746.000000 4.746000e+03 4746.000000 4746.000000

mean 2.083860 3.499345e+04 967.490729 1.965866

std 0.832256 7.810641e+04 634.202328 0.884532

min 1.000000 1.200000e+03 10.000000 1.000000

25% 2.000000 1.000000e+04 550.000000 1.000000

50% 2.000000 1.600000e+04 850.000000 2.000000

75% 3.000000 3.300000e+04 1200.000000 2.000000

max 6.000000 3.500000e+06 8000.000000 10.000000

see quick info of category values

data.describe(include = object)

Posted On Floor Area Type Area Locality City Furnishing Status Tenant Preferred Point of Contact

count 4746 4746 4746 4746 4746 4746 4746 4746

unique 81 480 3 2235 6 3 3 3

top 2022-07-06 1 out of 2 Super Area Bandra West Mumbai Semi-Furnished Bachelors/Family Contact Owner

freq 311 379 2446 37 972 2251 3444 3216

Univariate Analysis & Visualizations

Create a function that visualizes a categorical column using a count plot

def count_plot(column_name, hue = None, rotation = 0):
    """Draw a count plot of one column of the module-level ``data`` frame.

    Bars are ordered by how often each value occurs (most frequent first)
    and every bar is annotated with its count.

    Parameters:
        column_name: name of the column in ``data`` to plot on the x-axis.
        hue: optional column name used to split each bar by a second variable.
        rotation: rotation, in degrees, of the x-axis tick labels.
    """
    # value_counts() sorts by descending frequency, which fixes the bar order.
    graph = sns.countplot(x = column_name, data = data, hue = hue, order = data[column_name].value_counts().index)

    # One container per hue level; label every bar with its height.
    for container in graph.containers:
        graph.bar_label(container)

    plt.xticks(rotation = rotation)
    plt.show()

Create a function that visualizes a categorical column using a pie plot

def pie_plot(column_name):
    """Draw a pie chart of the value frequencies of one column of ``data``.

    Parameters:
        column_name: name of the column in the module-level ``data`` frame.
    """
    # Compute the frequencies once and reuse them for both sizes and labels.
    counts = data[column_name].value_counts()

    # autopct prints each slice's share of the total as a percentage.
    plt.pie(counts, labels = counts.index, shadow = True, autopct = "%1.1f%%")
    plt.show()

Create a function that visualizes numeric columns using a box plot

def box_plot(x_axis = None, y_axis = None):
    """Draw a box plot from the module-level ``data`` frame.

    Parameters:
        x_axis: optional column name used to group the boxes along the x-axis.
        y_axis: optional column name whose values are summarised by the boxes.
    """
    sns.boxplot(x = x_axis, y = y_axis, data = data)

    # Label each axis with the column it actually shows; the y-axis of a box
    # plot carries the column's values, not a frequency.
    if x_axis is not None:
        plt.xlabel(x_axis)
    if y_axis is not None:
        plt.ylabel(y_axis)
    plt.show()

Discovering Rent column

# see distribution

box_plot(y_axis = "Rent")
Remove rows whose Rent value is greater than 2,000,000

# Keep only rows with Rent <= 2,000,000. The .copy() makes the filtered frame
# independent of the original, so the column assignments made later on `data`
# do not raise pandas' SettingWithCopyWarning.
data = data[data['Rent'] <= 2000000].copy()

see distribution after remove outlier

box_plot(y_axis = "Rent")

Discovering Posted On column

see unique values

data["Posted On"].unique()
array(['2022-05-18', '2022-05-13', '2022-05-16', '2022-07-04',
'2022-05-09', '2022-04-29', '2022-06-21', '2022-06-07',
'2022-06-20', '2022-05-23', '2022-05-14', '2022-05-05',
'2022-06-01', '2022-05-17', '2022-06-09', '2022-07-02',
'2022-06-14', '2022-06-15', '2022-05-28', '2022-05-22',
'2022-06-18', '2022-06-25', '2022-06-22', '2022-05-21',
'2022-06-26', '2022-06-16', '2022-06-29', '2022-05-10',
'2022-05-12', '2022-06-03', '2022-05-31', '2022-06-10',
'2022-06-24', '2022-04-30', '2022-05-27', '2022-06-06',
'2022-05-20', '2022-05-26', '2022-07-03', '2022-04-23',
'2022-06-19', '2022-07-06', '2022-06-04', '2022-05-30',
'2022-06-08', '2022-05-03', '2022-05-07', '2022-06-27',
'2022-05-15', '2022-04-28', '2022-06-28', '2022-06-23',
'2022-06-13', '2022-05-04', '2022-04-25', '2022-05-06',
'2022-06-30', '2022-07-01', '2022-06-17', '2022-06-11',
'2022-05-11', '2022-06-12', '2022-06-05', '2022-05-25',
'2022-05-24', '2022-05-19', '2022-04-24', '2022-04-27',
'2022-05-08', '2022-04-26', '2022-06-02', '2022-05-29',
'2022-07-07', '2022-07-08', '2022-07-09', '2022-07-05',
'2022-05-01', '2022-04-13', '2022-05-02', '2022-07-10',
'2022-07-11'], dtype=object)

Convert the column to datetime

data["Posted On"] = pd.to_datetime(data["Posted On"])

create new columns "Year", "Month"

data["Year"] = data["Posted On"].dt.year

data["Month"] = data["Posted On"].dt.month

Create a function that finds the season

def find_seasons(month_number):
    """Map a month number to a season code.

    Months 12, 1, 2 -> 1; months 3, 4, 5 -> 2; months 6, 7, 8 -> 3;
    months 9, 10, 11 -> 4. Any other value yields None.
    """
    season_by_month = {
        12: 1, 1: 1, 2: 1,
        3: 2, 4: 2, 5: 2,
        6: 3, 7: 3, 8: 3,
        9: 4, 10: 4, 11: 4,
    }
    return season_by_month.get(month_number)

Create the Season column

data["Season"] = data["Month"].apply(find_seasons)

Now drop the "Posted On" column

data.drop(columns = "Posted On", inplace = True)

see sample of data

data.sample(3)

Area Area Furnishing Tenant Point of


BHK Rent Size Floor City Bathroom Year Month Season
Type Locality Status Preferred Contact

Gandhi
Ground Carpet Contact
394 2 7000 450 Colony, Netaji Kolkata Unfurnished Bachelors/Family 1 2022 6 3
out of 4 Area Owner
Nagar

Ground Super Contact


4500 2 10500 1050 Hastinapuram Hyderabad Unfurnished Bachelors/Family 2 2022 5 2
out of 3 Area Owner

6 out of Carpet Contact


2209 3 280000 2800 Rajajinagar Bangalore Furnished Bachelors/Family 3 2022 7 3
30 Area Agent

see unique values in "Year" column

data["Year"].unique()

array([2022])

see unique values in "Month" column

data["Month"].unique()
array([5, 7, 4, 6])

see unique values in "Season" column

data["Season"].unique()

array([2, 3])

number of unique values is small, which is appropriate with pie plot

pie_plot("Season")

Discovering BHK column

see unique value

data["BHK"].unique()

array([2, 1, 3, 6, 4, 5])

number of unique values is relatively large, count plot more suitable for it

# first set figure size


plt.figure(figsize = (15,6))

# call function i create it in cell 9


count_plot(column_name = "BHK")

Discovering Size column

box_plot(y_axis = "Size")
Discovering Floor column

# see unique values

data["Floor"].unique()[:50]

array(['Ground out of 2', '1 out of 3', '1 out of 2', 'Ground out of 1',
'Ground out of 4', '1 out of 4', '1 out of 1', 'Ground out of 3',
'2 out of 3', '4 out of 5', '2 out of 2', '2 out of 5',
'4 out of 14', '3 out of 3', '5 out of 5', '4 out of 4',
'7 out of 8', '2 out of 4', '3 out of 4', '1 out of 5',
'8 out of 5', 'Ground out of 6', '2 out of 1',
'Upper Basement out of 4', 'Ground out of 5', '3 out of 5',
'11 out of 19', '5 out of 10', '11 out of 14',
'Lower Basement out of 2', '2 out of 7', '4 out of 10',
'7 out of 10', '2 out of 13', '6 out of 7', '4 out of 7',
'14 out of 14', '43 out of 78', '2 out of 8', '13 out of 18',
'5 out of 12', '18 out of 24', '3 out of 7', '17 out of 31',
'11 out of 21', '7 out of 19', '14 out of 23', '9 out of 20',
'Upper Basement out of 9', '19 out of 24'], dtype=object)

Create a function that returns the floor level

def finde_level(sentence):
    """Return the floor level found at the start of a "Floor" value.

    The level is returned as a string. It is the first word of the value,
    or the first two words when the second word is "Basement".

    Examples:
        '2 out of 3'              -> '2'
        'Upper Basement out of 9' -> 'Upper Basement'
        'Ground'                  -> 'Ground'  (value holding only the level)

    Parameters:
        sentence: string taken from the "Floor" column.
    """
    words = sentence.split(" ")

    # Some values contain only the level (a single word), so check the length
    # explicitly instead of catching the IndexError with a bare except.
    if len(words) > 1 and words[1] == "Basement":
        return " ".join(words[:2])
    return words[0]

create function that return number of levels

def finde_levels_number(sentence):
    """Return the total number of floors from a "Floor" value, as a string.

    The result is the text after the last space (e.g. '2 out of 3' -> '3').
    A value with no space at all yields "1".

    Parameters:
        sentence: string taken from the "Floor" column.
    """
    _, separator, last_word = sentence.rpartition(" ")
    if not separator:
        # Single-word value: no total is given.
        return "1"
    return last_word

create new column level

data["Level"] = data["Floor"].apply(finde_level)

create new column Floors numbers

data["Floors numbers"] = data["Floor"].apply(finde_levels_number)


Now drop the "Floor" column

data.drop(columns = "Floor", inplace = True)

see sample of data

data.sample(3)

Area Area Furnishing Tenant Point of Floors


BHK Rent Size City Bathroom Year Month Season Level
Type Locality Status Preferred Contact numbers

Carpet Thakur Contact


987 1 22000 430 Mumbai Unfurnished Family 1 2022 6 3 4
Area Complex Agent

Super Moodalaplya, Contact


1657 2 10000 535 Bangalore Unfurnished Bachelors/Family 1 2022 5 2 3
Area Vijayanagar Owner

Super Contact
3359 2 21000 1100 Navalur Chennai Furnished Bachelors/Family 2 2022 6 3 2 16
Area Owner

See the unique values in the "Level" column

data["Level"].unique()

array(['Ground', '1', '2', '4', '3', '5', '7', '8', 'Upper Basement',
'11', 'Lower Basement', '6', '14', '43', '13', '18', '17', '9',
'19', '60', '34', '12', '26', '25', '53', '16', '10', '39', '32',
'47', '28', '20', '15', '65', '40', '37', '22', '21', '30', '35',
'33', '44', '41', '46', '27', '45', '48', '50', '24', '23', '29',
'49', '36', '76'], dtype=object)

Replace 'Lower Basement', 'Ground' and 'Upper Basement' with 0

# Map the non-numeric level labels to 0. The result is assigned back to the
# column: calling replace(..., inplace = True) on data["Level"] is a chained
# in-place operation that does not update `data` under pandas Copy-on-Write.
data["Level"] = data["Level"].replace({"Ground" : 0 , "Lower Basement" : 0, "Upper Basement" : 0})

number of unique values is relatively large, count plot more suitable for it

# first set figure size


plt.figure(figsize = (15,6))

# call function i create it in cell 9


count_plot(column_name = "Level", rotation = 90)

see unique values in "Floors numbers" column

data["Floors numbers"].unique()

array(['2', '3', '1', '4', '5', '14', '8', '6', '19', '10', '7', '13',
'78', '18', '12', '24', '31', '21', '23', '20', '9', '22', '58',
'16', '66', '48', '40', '44', '42', '41', '60', '32', '30', '29',
'89', '15', '11', '28', '17', '45', '35', '75', '38', '51', '43',
'25', '27', '26', '76', '36', '37', '55', '68', '77', '50', '59',
'62', '39', '52', '54', '33', '46', '85', '71', '81', '34'],
dtype=object)

number of unique values is relatively large, count plot more suitable for it

# first set figure size


plt.figure(figsize = (15,6))
# call function i create it in cell 9
count_plot(column_name = "Floors numbers", rotation = 90)

Discovering Area Type column

# see unique values

data["Area Type"].unique()

array(['Super Area', 'Carpet Area', 'Built Area'], dtype=object)

number of unique values is small, which is appropriate with pie plot

pie_plot("Area Type")

Discovering City column

# see unique values

data["City"].unique()

array(['Kolkata', 'Mumbai', 'Bangalore', 'Delhi', 'Chennai', 'Hyderabad'],


dtype=object)

number of unique values is relatively large, count plot more suitable for it

# first set figure size


plt.figure(figsize = (15,6))

# call function i create it in cell 9


count_plot(column_name = "City")
Discovering Furnishing Status column

# see unique values

data["Furnishing Status"].unique()

array(['Unfurnished', 'Semi-Furnished', 'Furnished'], dtype=object)

number of unique values is small, which is appropriate with pie plot

pie_plot("Furnishing Status")

Discovering Tenant Preferred column

# see unique values

data["Tenant Preferred"].unique()

array(['Bachelors/Family', 'Bachelors', 'Family'], dtype=object)

number of unique values is relatively large, count plot more suitable for it

# first set figure size


plt.figure(figsize = (15,6))

# call function i create it in cell 9


count_plot(column_name = "Tenant Preferred")
Discovering Bathroom column

# see unique values

data["Bathroom"].unique()

array([ 2, 1, 3, 5, 4, 6, 7, 10])

number of unique values is relatively large, count plot more suitable for it

# first set figure size


plt.figure(figsize = (15,6))

# call function i create it in cell 9


count_plot(column_name = "Bathroom")

Discovering Point of Contact column

# see unique values

data["Point of Contact"].unique()

array(['Contact Owner', 'Contact Agent', 'Contact Builder'], dtype=object)

number of unique values is small, which is appropriate with pie plot

pie_plot("Point of Contact")
Bivariate Analysis & Visualizations

# What is "BHK" and "Rent" distribution

# set figure size


plt.figure(figsize = (15,6))

# call function i create it in cell 11


box_plot(x_axis = "BHK", y_axis = "Rent")

What is "Bathroom" and "Rent" distribution

# set figure size


plt.figure(figsize = (15,6))

# call function i create it in cell 11


box_plot(x_axis = "Bathroom", y_axis = "Rent")
What is " Area Type" and "Rent" Distribution

# set figure size


plt.figure(figsize = (15,6))

# call function i create it in cell 11


box_plot(x_axis = "Area Type", y_axis = "Rent")

What is "City" and "Rent" Distribution

# set figure size


plt.figure(figsize = (15,6))

# call function i create it in cell 11


box_plot(x_axis = "City", y_axis = "Rent")
What is "Furnishing Status" and "Rent" Distribution

# set figure size


plt.figure(figsize = (15,6))

# call function i create it in cell 11


box_plot(x_axis = "Furnishing Status", y_axis = "Rent")

What is "Tenant Preferred" and "Rent" Distribution

# set figure size


plt.figure(figsize = (15,6))

# call function i create it in cell 11


box_plot(x_axis = "Tenant Preferred", y_axis = "Rent")
see all columns

data.columns

Index(['BHK', 'Rent', 'Size', 'Area Type', 'Area Locality', 'City',


'Furnishing Status', 'Tenant Preferred', 'Bathroom', 'Point of Contact',
'Year', 'Month', 'Season', 'Level', 'Floors numbers'],
dtype='object')

convert some columns data type

data["Level"] = data["Level"].astype(int)

data["Floors numbers"] = data["Floors numbers"].astype(int)

first visualize correlation matrix between numerical columns

plt.figure(figsize = (10,6))
sns.heatmap(data.select_dtypes(exclude = object).corr(), annot = True, fmt = ".2f", linewidths = 0.2)
plt.show()

Now I will drop some columns, because they have a weak relationship with Rent

# Drop the columns exactly once; repeating this call raises KeyError because
# the columns no longer exist after the first drop.
data.drop(columns = ["Year", "Month", "Season", "Area Locality"], inplace = True)

see sample of data

data.sample(3)

BHK Rent Size Area Type City Furnishing Status Tenant Preferred Bathroom Point of Contact Level Floors numbers

3788 2 9000 843 Super Area Chennai Semi-Furnished Bachelors/Family 2 Contact Owner 2 3

778 4 230000 3700 Carpet Area Mumbai Furnished Bachelors/Family 4 Contact Agent 5 20

1046 3 40000 1200 Super Area Mumbai Semi-Furnished Family 2 Contact Owner 2 4

MODELING

import libraries to model

from sklearn.preprocessing import LabelEncoder


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error,r2_score

create object from labelencoder

encoder = LabelEncoder()

Loop over the categorical columns to encode them

# Encode every object-typed column as integer labels. A separate fitted encoder
# is kept per column so each mapping can still be inverted later; re-fitting a
# single shared encoder would keep only the last column's classes.
encoders = {}
for column in data.select_dtypes(include = object).columns.tolist():
    encoders[column] = LabelEncoder()
    data[column] = encoders[column].fit_transform(data[column])

see sample of data

data.sample(3)

BHK Rent Size Area Type City Furnishing Status Tenant Preferred Bathroom Point of Contact Level Floors numbers

3091 2 15000 2800 1 1 2 2 2 2 0 1

43 2 8500 700 1 4 1 1 1 2 4 14

461 2 12500 1250 2 4 1 1 2 2 1 2

calculate base 10 logarithm of Rent column

data["Rent"] = np.log10(data["Rent"])

Spliting Data

Target = data["Rent"]
training = data.drop(columns = ["Rent"])

Splitting into Train and Test dataset

X_train, X_test, y_train, y_test = train_test_split(training, Target, test_size = 0.3, random_state = 42)

see shape of splited data

print("x_train shape: ", X_train.shape)


print("x_test shape: ", X_test.shape)
print("y_train shape: ", y_train.shape)
print("y_test shape: ", y_test.shape)

x_train shape: (3321, 10)


x_test shape: (1424, 10)
y_train shape: (3321,)
y_test shape: (1424,)

Scaling train data using min max scaler

scaler = MinMaxScaler()
X_train= scaler.fit_transform(X_train)
X_test= scaler.transform(X_test)

create object from RandomForestRegressor

# Fix the seed so the fitted forest, and therefore the reported scores, are
# reproducible between runs (same seed as the train/test split).
model = RandomForestRegressor(random_state = 42)
fit model

model.fit(X_train,y_train)

▾ RandomForestRegressor
RandomForestRegressor()

Calculate the score of the model on the training data

model.score(X_train, y_train)

0.969403280429433

see predicted values

predict = model.predict(X_test)

plt.scatter(y_test, predict)
plt.show()

see score

r2_score(y_test,predict)

0.8203406677569746

see mean absolute error

mean_absolute_error(y_test,predict)

0.1284177940495873

Loading [MathJax]/jax/output/CommonHTML/fonts/TeX/fontdata.js

You might also like