Professional Documents
Culture Documents
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
data = pd.read_csv("/content/House_Rent_Dataset.csv")
data.head()
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4746 entries, 0 to 4745
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Posted On 4746 non-null object
1 BHK 4746 non-null int64
2 Rent 4746 non-null int64
3 Size 4746 non-null int64
4 Floor 4746 non-null object
5 Area Type 4746 non-null object
6 Area Locality 4746 non-null object
7 City 4746 non-null object
8 Furnishing Status 4746 non-null object
9 Tenant Preferred 4746 non-null object
10 Bathroom 4746 non-null int64
11 Point of Contact 4746 non-null object
dtypes: int64(4), object(8)
memory usage: 445.1+ KB
data.isna().sum() / data.shape[0]
Posted On 0.0
BHK 0.0
Rent 0.0
Size 0.0
Floor 0.0
Area Type 0.0
Area Locality 0.0
City 0.0
Furnishing Status 0.0
Tenant Preferred 0.0
Bathroom 0.0
Point of Contact 0.0
dtype: float64
data.duplicated().any()
False
data.describe()
data.describe()
data.describe(include = object)
Posted On Floor Area Type Area Locality City Furnishing Status Tenant Preferred Point of Contact
top 2022-07-06 1 out of 2 Super Area Bandra West Mumbai Semi-Furnished Bachelors/Family Contact Owner
plt.xticks(rotation = rotation)
plt.show()
def pie_plot(column_name):
# see distribution
box_plot(y_axis = "Rent")
Remove the rows whose Rent value is greater than 2000000.
box_plot(y_axis = "Rent")
data["Posted On"].unique()
array(['2022-05-18', '2022-05-13', '2022-05-16', '2022-07-04',
'2022-05-09', '2022-04-29', '2022-06-21', '2022-06-07',
'2022-06-20', '2022-05-23', '2022-05-14', '2022-05-05',
'2022-06-01', '2022-05-17', '2022-06-09', '2022-07-02',
'2022-06-14', '2022-06-15', '2022-05-28', '2022-05-22',
'2022-06-18', '2022-06-25', '2022-06-22', '2022-05-21',
'2022-06-26', '2022-06-16', '2022-06-29', '2022-05-10',
'2022-05-12', '2022-06-03', '2022-05-31', '2022-06-10',
'2022-06-24', '2022-04-30', '2022-05-27', '2022-06-06',
'2022-05-20', '2022-05-26', '2022-07-03', '2022-04-23',
'2022-06-19', '2022-07-06', '2022-06-04', '2022-05-30',
'2022-06-08', '2022-05-03', '2022-05-07', '2022-06-27',
'2022-05-15', '2022-04-28', '2022-06-28', '2022-06-23',
'2022-06-13', '2022-05-04', '2022-04-25', '2022-05-06',
'2022-06-30', '2022-07-01', '2022-06-17', '2022-06-11',
'2022-05-11', '2022-06-12', '2022-06-05', '2022-05-25',
'2022-05-24', '2022-05-19', '2022-04-24', '2022-04-27',
'2022-05-08', '2022-04-26', '2022-06-02', '2022-05-29',
'2022-07-07', '2022-07-08', '2022-07-09', '2022-07-05',
'2022-05-01', '2022-04-13', '2022-05-02', '2022-07-10',
'2022-07-11'], dtype=object)
def find_seasons(month_number):
data["Season"] = data["Month"].apply(find_seasons)
data.sample(3)
Gandhi
Ground Carpet Contact
394 2 7000 450 Colony, Netaji Kolkata Unfurnished Bachelors/Family 1 2022 6 3
out of 4 Area Owner
Nagar
data["Year"].unique()
array([2022])
data["Month"].unique()
array([5, 7, 4, 6])
data["Season"].unique()
array([2, 3])
pie_plot("Season")
data["BHK"].unique()
array([2, 1, 3, 6, 4, 5])
The number of unique values is relatively large, so a count plot is more suitable for it.
box_plot(y_axis = "Size")
Exploring the Floor column
data["Floor"].unique()[:50]
array(['Ground out of 2', '1 out of 3', '1 out of 2', 'Ground out of 1',
'Ground out of 4', '1 out of 4', '1 out of 1', 'Ground out of 3',
'2 out of 3', '4 out of 5', '2 out of 2', '2 out of 5',
'4 out of 14', '3 out of 3', '5 out of 5', '4 out of 4',
'7 out of 8', '2 out of 4', '3 out of 4', '1 out of 5',
'8 out of 5', 'Ground out of 6', '2 out of 1',
'Upper Basement out of 4', 'Ground out of 5', '3 out of 5',
'11 out of 19', '5 out of 10', '11 out of 14',
'Lower Basement out of 2', '2 out of 7', '4 out of 10',
'7 out of 10', '2 out of 13', '6 out of 7', '4 out of 7',
'14 out of 14', '43 out of 78', '2 out of 8', '13 out of 18',
'5 out of 12', '18 out of 24', '3 out of 7', '17 out of 31',
'11 out of 21', '7 out of 19', '14 out of 23', '9 out of 20',
'Upper Basement out of 9', '19 out of 24'], dtype=object)
def finde_level(sentence):
    """Extract the flat's own level from a ``Floor`` column string.

    The value is returned as a string; no numeric conversion is done here.

    Examples:
        '2 out of 3'               -> '2'
        'Upper Basement out of 9'  -> 'Upper Basement'
        'Ground'                   -> 'Ground'

    Some entries contain only the level (no "out of N" part), so the
    number of words is checked explicitly instead of relying on a
    catch-all ``except``.

    Args:
        sentence: One raw string from the ``Floor`` column.

    Returns:
        The level as a string; two words for "<Upper|Lower> Basement"
        style entries, otherwise the first word. An empty or
        whitespace-only input yields ''.
    """
    # split() with no argument ignores leading/trailing/repeated whitespace,
    # so ' 3  out of 5' still yields '3' rather than an empty token.
    words = sentence.split()
    if not words:
        return ""
    # Basement levels are described with two words, e.g. 'Upper Basement'.
    if len(words) > 1 and words[1] == "Basement":
        return " ".join(words[:2])
    return words[0]
def finde_levels_number(sentence):
    """Extract the building's total number of floors from a ``Floor`` string.

    Examples:
        '2 out of 3'               -> '3'
        'Upper Basement out of 9'  -> '9'
        'Ground'                   -> '1'

    Args:
        sentence: One raw string from the ``Floor`` column.

    Returns:
        The last word of the string (as a string) when it has more than
        one word; otherwise the fallback "1", used for entries that give
        only the level without an "out of N" part.
    """
    # split() with no argument drops stray whitespace, so a trailing space
    # cannot turn the last token into an empty string.
    words = sentence.split()
    if len(words) > 1:
        return words[-1]
    return "1"
data["Level"] = data["Floor"].apply(finde_level)
data.sample(3)
Super Contact
3359 2 21000 1100 Navalur Chennai Furnished Bachelors/Family 2 2022 6 3 2 16
Area Owner
data["Level"].unique()
array(['Ground', '1', '2', '4', '3', '5', '7', '8', 'Upper Basement',
'11', 'Lower Basement', '6', '14', '43', '13', '18', '17', '9',
'19', '60', '34', '12', '26', '25', '53', '16', '10', '39', '32',
'47', '28', '20', '15', '65', '40', '37', '22', '21', '30', '35',
'33', '44', '41', '46', '27', '45', '48', '50', '24', '23', '29',
'49', '36', '76'], dtype=object)
The number of unique values is relatively large, so a count plot is more suitable for it.
data["Floors numbers"].unique()
array(['2', '3', '1', '4', '5', '14', '8', '6', '19', '10', '7', '13',
'78', '18', '12', '24', '31', '21', '23', '20', '9', '22', '58',
'16', '66', '48', '40', '44', '42', '41', '60', '32', '30', '29',
'89', '15', '11', '28', '17', '45', '35', '75', '38', '51', '43',
'25', '27', '26', '76', '36', '37', '55', '68', '77', '50', '59',
'62', '39', '52', '54', '33', '46', '85', '71', '81', '34'],
dtype=object)
The number of unique values is relatively large, so a count plot is more suitable for it.
data["Area Type"].unique()
pie_plot("Area Type")
data["City"].unique()
The number of unique values is relatively large, so a count plot is more suitable for it.
data["Furnishing Status"].unique()
pie_plot("Furnishing Status")
data["Tenant Preferred"].unique()
The number of unique values is relatively large, so a count plot is more suitable for it.
data["Bathroom"].unique()
array([ 2, 1, 3, 5, 4, 6, 7, 10])
The number of unique values is relatively large, so a count plot is more suitable for it.
data["Point of Contact"].unique()
pie_plot("Point of Contact")
Bivariate Analysis & Visualizations
data.columns
data["Level"] = data["Level"].astype(int)
plt.figure(figsize = (10,6))
sns.heatmap(data.select_dtypes(exclude = object).corr(), annot = True, fmt = ".2f", linewidths = 0.2)
plt.show()
Now I will drop some columns, because they have a weak relationship with Rent.
data.sample(3)
BHK Rent Size Area Type City Furnishing Status Tenant Preferred Bathroom Point of Contact Level Floors numbers
3788 2 9000 843 Super Area Chennai Semi-Furnished Bachelors/Family 2 Contact Owner 2 3
778 4 230000 3700 Carpet Area Mumbai Furnished Bachelors/Family 4 Contact Agent 5 20
1046 3 40000 1200 Super Area Mumbai Semi-Furnished Family 2 Contact Owner 2 4
MODELING
encoder = LabelEncoder()
data.sample(3)
BHK Rent Size Area Type City Furnishing Status Tenant Preferred Bathroom Point of Contact Level Floors numbers
43 2 8500 700 1 4 1 1 1 2 4 14
data["Rent"] = np.log10(data["Rent"])
Splitting Data
# Separate the target column (Rent, already log10-transformed above) from the features.
Target = data["Rent"]
training = data.drop(columns = ["Rent"])
# Hold out 30% of the rows for evaluation; random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(training, Target, test_size = 0.3, random_state = 42)
# Fit the scaler on the training split only, then reuse the fitted scaler on the test split.
scaler = MinMaxScaler()
X_train= scaler.fit_transform(X_train)
X_test= scaler.transform(X_test)
model = RandomForestRegressor()
fit model
model.fit(X_train,y_train)
▾ RandomForestRegressor
RandomForestRegressor()
model.score(X_train, y_train)
0.969403280429433
predict = model.predict(X_test)
plt.scatter(y_test, predict)
plt.show()
see score
r2_score(y_test,predict)
0.8203406677569746
mean_absolute_error(y_test,predict)
0.1284177940495873
Loading [MathJax]/jax/output/CommonHTML/fonts/TeX/fontdata.js