
# -*- coding: utf-8 -*-

"""Real Estate.ipynb

Automatically generated by
Colaboratory.

Original file is located at

https://colab.research.google.com/drive/1Fe9_8xkXtzrvDqFz_a3MLU1uUWBfjqzt

# **WEEK 1: Data
Import and Preperation**
"""

# Commented out IPython magic to ensure Python compatibility.
# import libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize
import warnings

warnings.filterwarnings('ignore')

# %matplotlib inline

"""1. import
data"""

#importing datasets
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# check the number of rows and columns in the train dataset
train.shape

# print the column names of the train dataset
train.columns

# check the number of rows and columns in the test dataset
test.shape

# print the column names of the test dataset
test.columns

# finding duplicate rows in the train dataset
train_dup = train[train.duplicated()]

# check the number of rows and columns in the train duplicates dataframe
train_dup.shape

# finding duplicate rows in the test dataset
test_dup = test[test.duplicated()]

# check the number of rows and columns in the test duplicates dataframe
test_dup.shape

# deleting duplicate rows in the train dataset
train.drop_duplicates(keep='first', inplace=True)

print(train)

# checking the number of rows and columns after deleting duplicates in the train dataset
train.shape

# deleting duplicate rows in the test dataset
test.drop_duplicates(keep='first', inplace=True)

# checking the number of rows and columns after deleting duplicates in the test dataset
test.shape

# combine train and test into one dataframe to perform EDA
train['source'] = 'train'
test['source'] = 'test'
df = pd.concat([test, train])
print(df.shape)

# check the train dataset
train.head()

# preview the test dataset
test.head()

# preview the combined dataset
df.head()

# Removing outliers (done later, on the numeric columns only, after missing-value treatment)
# from scipy import stats
# z_scores = np.abs(stats.zscore(df))
# threshold = 3
# df_clean = df[(z_scores < threshold).all(axis=1)]

"""2. Figure out the


primary key and look for the requirement of indexing."""

# Get a summary of the dataset
df.describe()

# Count the number of missing values in the UID column
print(df['UID'].isna().sum())

"""**UID is the primary


key.**"""

# setting the index to UID (assign the result so the new index persists)
df = df.set_index(keys='UID')
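
# A quick sanity check (a sketch, not in the original notebook) that UID behaves like
# a primary key, i.e. every row has a distinct UID:
print(df.index.is_unique)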

"""3. Gauge the fill rate of the variables and devise plans for missing
value treatment. Please explain explicitly the reason for the treatment chosen for each
variable."""

# checking null values
df.isnull().sum()

# calculating the percentage of missing values in the combined dataset
missing_value = df.isnull().sum() * 100 / len(df)
df_missing_value = pd.DataFrame(missing_value, columns=['Percentage of missing value'])

df_missing_value

# dropping the columns 'BLOCKID' & 'SUMLEVEL'
df = df.drop('BLOCKID', axis=1)
df = df.drop('SUMLEVEL', axis=1)

sns.heatmap(df.corr())

df.info()

df['debt']

df['pct_own'].unique()

df.isnull().sum().sort_values(ascending=False)

df.dtypes.unique()

# impute the remaining missing values with the column medians (the median is robust to outliers and skew)
df.fillna(df.median(), inplace=True)

df.isnull().sum().sort_values(ascending=False)

# Identify the columns with numeric values
numeric_cols = df.select_dtypes(include=np.number).columns

from scipy import stats

# Identify and remove outliers using the z-score method (keep rows with |z| < 3 in every numeric column)
z_scores = np.abs(stats.zscore(df[numeric_cols]))
threshold = 3
data = df[(z_scores < threshold).all(axis=1)]

# Show the cleaned dataset
print(data)

data.shape

"""# **Exploratory Data Analysis (EDA)**

4.
Perform debt analysis.

### a) Explore the top 2,500 locations where the percentage of


households with a second mortgage is the highest and percent ownership is above 10 percent.
Visualize using geo-map. You may keep the upper limit for the percent of households with a
second mortgage to 50 percent
"""

# pct_own above 10% with second_mortgage capped at 50%, then take the top 2,500 rows by second_mortgage
df_new = data[(data['pct_own'] > 0.1) & (data['second_mortgage'] < 0.5)].nlargest(2500, 'second_mortgage')

df_new.sort_index()

import folium

# create a map centered on the USA
map_mortgage = folium.Map(location=[37.0902, -95.7129], zoom_start=4)

# add one marker per location to the map
for index, row in df_new.iterrows():
    folium.Marker([row['lat'], row['lng']]).add_to(map_mortgage)

map_mortgage
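
# Rendering 2,500 individual markers can be slow; a hedged alternative (same df_new,
# lat and lng columns) uses folium's MarkerCluster plugin to group nearby points:
from folium.plugins import MarkerCluster

map_clustered = folium.Map(location=[37.0902, -95.7129], zoom_start=4)
cluster = MarkerCluster().add_to(map_clustered)
for _, row in df_new.iterrows():
    folium.Marker([row['lat'], row['lng']]).add_to(cluster)
map_clustered
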
"""### b) Use the following bad
debt equation: Bad Debt = P (Second Mortgage ? Home Equity Loan) Bad Debt = second_mortgage +
home_equity - home_equity_second_mortgage"""

data['bad_debt'] = data['second_mortgage'] + data['home_equity'] - data['home_equity_second_mortgage']
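
# Section d) below also asks about "good debt", which the original code represents with the
# raw 'debt' column. One common derivation (an assumption, not stated in the source) is:
data['good_debt'] = data['debt'] - data['bad_debt']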

"""### c) Create pie charts to show overall


debt and bad debt"""

plt.pie(x=[data['debt'].sum(), data['bad_debt'].sum()],
        explode=None,
        labels=['Overall Debt', 'Bad Debt'],
        colors=['blue', 'yellow'],
        autopct='%1.1f%%',
        startangle=90)
plt.title('Overall Debt vs Bad Debt')
plt.show()

"""### d) Create Box and whisker plot and analyze the


distribution for 2nd mortgage, home equity, good debt, and bad debt for different
cities"""

# Assigning the top 5 cities (by value counts) to separate dataframes
df_city_Brooklyn = data.loc[data['city'] == 'Brooklyn']
df_city_Chicago = data.loc[data['city'] == 'Chicago']
df_city_LosAngeles = data.loc[data['city'] == 'Los Angeles']
df_city_Houston = data.loc[data['city'] == 'Houston']
df_city_Philadelphia = data.loc[data['city'] == 'Philadelphia']

# Combining the top 5 cities into a single dataframe for the box-and-whisker plots
df_city = pd.concat([df_city_Brooklyn, df_city_Chicago, df_city_LosAngeles, df_city_Houston, df_city_Philadelphia])

# box-and-whisker plots of 'second_mortgage', 'home_equity', 'debt' & 'bad_debt' by 'city' in df_city
plt.figure(figsize=(10, 8))

plt.subplot(2, 2, 1)
sns.boxplot(x='city', y='second_mortgage', data=df_city)
plt.subplot(2, 2, 2)
sns.boxplot(x='city', y='home_equity', data=df_city)
plt.subplot(2, 2, 3)
sns.boxplot(x='city', y='debt', data=df_city)
plt.subplot(2, 2, 4)
sns.boxplot(x='city', y='bad_debt', data=df_city)

plt.show()

"""### e) Create a collated income distribution chart


for family income, house hold income, and remaining
income"""

data['remaining_mean'] = data['family_mean'] -
data['hi_mean']

# Income distribution chart
plt.hist([data['family_mean'], data['hi_mean'], data['remaining_mean']],
         bins=50, label=['Family Income', 'Household Income', 'Remaining Income'])
plt.legend(loc='upper right')
plt.xlabel('Income')
plt.ylabel('Frequency')
plt.show()

"""# **Week 2:
Exploratory Data Analysis (EDA)**

1. Perform EDA and come out with insights into population


density and age. You may have to derive new fields (make sure to weight averages for accurate
measurements):

### a) Use pop and ALand variables to create a new field called population
density
"""

data['pop_density'] = data['pop'] / data['ALand']

data

"""### b) Use male_age_median, female_age_median, male_pop,


and female_pop to create a new field called median age """

data['median_age'] =
(data['male_age_median']*data['male_pop'] + data['female_age_median']*data['female_pop']) /
(data['male_pop'] + data['female_pop'])

data.head()

"""### c) Visualize the


findings using appropriate chart
type"""

plt.figure(figsize=(15,15))
sns.scatterplot(x='pop_density',
y='median_age', hue='state', data=data)
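
# pop_density is usually heavily right-skewed (land area dwarfs population counts), so a
# log-scaled x-axis can make the relationship easier to read -- a sketch using the same data:
plt.figure(figsize=(15, 8))
sns.scatterplot(x='pop_density', y='median_age', hue='state', data=data)
plt.xscale('log')
plt.show()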

"""2. Create bins for population into a


new variable by selecting appropriate class interval so that the number of categories don’t
exceed 5 for the ease of analysis."""

data['pop_bin'] = pd.cut(data['pop'],
bins=5, labels=['Very Low', 'Low', 'Medium', 'High', 'Very
High'])
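
# pd.cut creates equal-width bins, so a skewed 'pop' column can leave most rows in the
# lowest bin (the value_counts below will show the split). A hedged alternative is
# quantile-based binning with pd.qcut, which gives roughly equal-sized groups
# (pop_bin_q is a hypothetical extra column, not part of the original analysis):
data['pop_bin_q'] = pd.qcut(data['pop'], q=5,
                            labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'])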

data['pop_bin'].value_counts()

"""### a) Analyze the married,


separated, and divorced population for these population brackets"""

pop_groups
= data.groupby('pop_bin')[['married', 'separated',
'divorced']].count()

pop_groups.head()

"""### b) Visualize using appropriate


chart
type"""

plt.figure(figsize=(6, 4))

plt.subplot(2, 2, 1)
sns.histplot(data=pop_groups, x='married')
plt.subplot(2, 2, 2)
sns.histplot(data=pop_groups, x='separated')
plt.subplot(2, 2, 3)
sns.histplot(data=pop_groups, x='divorced')

plt.show()

plt.figure(figsize=(10, 7))
plt.hist([pop_groups['married'], pop_groups['separated'], pop_groups['divorced']],
         bins=50, label=['Married', 'Separated', 'Divorced'])
plt.legend(loc='upper right')
plt.xlabel('Population Bins')
plt.ylabel('Count')
plt.show()

"""### 3. Please detail your observations


for rent as a percentage of income at an overall level, and for different
states."""

data['income_mean'] = data['family_mean'] + data['hi_mean']

data['rent_percent'] = data['rent_mean'] * 100 / data['income_mean']

pd.set_option('display.precision', 2)

data.groupby(by='state')['rent_percent'].agg('mean')
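
# The question also asks for the overall level; a short sketch comparing the overall mean
# with the states where rent takes the largest share of income (same rent_percent column):
print('Overall rent as % of income:', round(data['rent_percent'].mean(), 2))
print(data.groupby('state')['rent_percent'].mean().sort_values(ascending=False).head())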

"""#### 4. Perform correlation analysis for all the relevant


variables by creating a heatmap. Describe your findings."""

df_corr =
data[['COUNTYID','STATEID','ALand','AWater','pop','rent_mean','family_mean','hc_mean','hc_mortg
age_mean','second_mortgage','home_equity',
'debt','hs_degree','bad_debt','pop_density','median_age','income_mean','rent_percent']]

plt.fi
gure(figsize=(20,20))
sns.heatmap(df_corr.corr(),cmap='cubehelix',annot=True)

""&quo
t;# **Week 3: Data Pre-processing**

### 1. The economic multivariate data has a significant


number of measured variables. The goal is to find where the measured variables depend on a
number of smaller unobserved common factors or latent variables. 2. Each variable is assumed to
be dependent upon a linear combination of the common factors, and the coefficients are known as
loadings. Each measured variable also includes a component due to independent random
variability, known as “specific variance” because it is specific to one variable. Obtain
the common factors and then plot the loadings. Use factor analysis to find latent variables in
our dataset and gain insight into the linear relationships in the data. Following are
the list of latent variables:

• Highschool graduation rates

• Median population age


Second mortgage statistics

• Percent own

• Bad debt expense


"""

!pip install factor_analyzer

from factor_analyzer import FactorAnalyzer
from sklearn.preprocessing import StandardScaler

# We have prior knowledge that certain columns in the dataset correspond directly to the
# latent variables listed above. To focus on the remaining measured variables and identify
# their underlying structure, we drop those columns; this isolates the effect of the latent
# variables on the remaining measured variables.

data_numeric = data.drop(['hs_degree', 'median_age', 'second_mortgage', 'pct_own', 'bad_debt'], axis=1)

data_numeric = data_numeric.select_dtypes(include=('int', 'float'))

# check for missing values
data_numeric.isnull().sum().any()

# Scale the data to zero mean and unit variance
scaler = StandardScaler()
data_numeric_scaled = scaler.fit_transform(data_numeric)

# Use EFA (exploratory factor analysis) with 5 factors, varimax rotation, and the principal method
fa = FactorAnalyzer(n_factors=5, rotation='varimax', method='principal')

# Fit the scaled dataset with the factor analyzer
fa.fit(data_numeric_scaled)

# interpret the results: the factor loadings matrix
factors = fa.loadings_
factors
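
# The brief above asks to plot the loadings rather than only print the matrix. A minimal
# sketch (assuming fa and data_numeric from the cells above) shows them as a heatmap
# labelled with the original column names:
loadings_df = pd.DataFrame(fa.loadings_,
                           index=data_numeric.columns,
                           columns=[f'Factor{i + 1}' for i in range(5)])
plt.figure(figsize=(8, 20))
sns.heatmap(loadings_df, cmap='coolwarm', annot=True)
plt.title('Factor loadings')
plt.show()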

"""# **WEEK 4: Data Modeling**

1. Build a linear
Regression model to predict the total monthly expenditure for home mortgages loan. Please refer
‘deplotment_RE.xlsx’. Column hc_mortgage_mean is predicted variable. This is the mean
monthly mortgage and owner costs of specified geographical location. Note: Exclude loans from
prediction model which have NaN (Not a Number) values for hc_mortgage_mean.

### a) Run a model


at a Nation level. If the accuracy levels and R square are not satisfactory proceed to below
step.
"""

# check for categorical and object datatype columns
df_cat = data.select_dtypes(include=('object', 'category'))

# checking the correlation of the target variable hc_mortgage_mean with the predictor variables/features
data.corr()['hc_mortgage_mean']

# one-hot encoding of all the object and categorical columns
df_encod = pd.get_dummies(df_cat)

df_encod

sns.distplot(data_numeric.hc_mortgage_mean)

"""## **hc_motgage has a positive skew**"""

# Calculate the correlation matrix
corr_matrix = data.corr()

# Select the target variable
target_var = 'hc_mortgage_mean'

# Sort the correlation matrix by the correlations with the target variable, in descending order
sorted_corr = corr_matrix[target_var].sort_values(ascending=False)

# Choose the five features with the highest correlation with the target variable
highest_corr_features = sorted_corr[1:6].index.tolist()

print(highest_corr_features)

sorted_corr

# count the features in sorted_corr with a correlation of at least 0.5
num_pos = 0
for corr in sorted_corr:
    if corr >= 0.5:
        num_pos += 1

print(num_pos)

# count the features in sorted_corr with a correlation of at most -0.5
num_neg = 0
for corr in sorted_corr:
    if corr <= -0.5:
        num_neg += 1

print(num_neg)

corr_features = sorted_corr[1:18].index.tolist()

print(corr_features)

corr_features

# count the features in sorted_corr with a correlation higher than 0.7
num = 0
for corr in sorted_corr:
    if corr > 0.7:
        num += 1

num

mul_colinear = sorted_corr[12:]

mul_colinear

# label encoding for the object and categorical datatype columns
from sklearn.preprocessing import LabelEncoder

for col in data.select_dtypes(include=('object', 'category')).columns:
    encoder = LabelEncoder()
    data[col] = encoder.fit_transform(data[col])

data.isnull().sum().any()

data

# separating the feature columns from the target variable
feature = data.drop('hc_mortgage_mean', axis=1)

target = data['hc_mortgage_mean']

# Calculate the correlation coefficient of each feature with the target
correlations = feature.corrwith(target)

# sort the correlations in descending order
correlations = correlations.sort_values(ascending=False)

correlations

# Predicting hc_mortgage_mean from the features most correlated with it
x = data[corr_features]
y = data['hc_mortgage_mean']

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=23)

from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(x_train, y_train)

y_pred = lr.predict(x_test)

from sklearn.metrics import mean_squared_error, r2_score
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

rmse

r2 = r2_score(y_test, y_pred)
print(r2)

"""Here accuracy of r2
is not good.

### b) Run another model at State level. There are 52 states in


USA.
"""

# check which state IDs are available
data['STATEID'].value_counts()

# for a selection of state IDs: create a subset for each ID, split it into train and test sets,
# fit the model on that subset, and check the resulting R2 and RMSE values using a for loop

lr2 = LinearRegression()

state_ids = [3, 30, 8, 36, 41, 33, 11, 20]

for i in state_ids:
    x_state_df = data[data['STATEID'] == i][corr_features]
    y_state_df = data[data['STATEID'] == i]['hc_mortgage_mean']
    x_train2, x_test2, y_train2, y_test2 = train_test_split(x_state_df, y_state_df, test_size=0.3, random_state=23)

    lr2.fit(x_train2, y_train2)
    y_pred2 = lr2.predict(x_test2)
    print('Result of state id =', i)
    print("Overall R2 score of linear regression model", r2_score(y_test2, y_pred2))
    print("Overall RMSE of linear regression model", np.sqrt(mean_squared_error(y_test2, y_pred2)))

# saving the final dataset to an Excel file
file_name = 'final_data.xlsx'
data.to_excel(file_name)

"""# **Conclusion:**
# **As r2
values are good we can say our model is good to test, we can test it on test
data**
"""

