
# -*- coding: utf-8 -*-

"""Real Estate.ipynb

Automatically generated by
Colaboratory.

Original file is located at

https://colab.research.google.com/drive/1Fe9_8xkXtzrvDqFz_a3MLU1uUWBfjqzt

# **WEEK 1: Data
Import and Preperation**
"""

# Commented out IPython magic to ensure Python compatibility.
# import libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize
import warnings

warnings.filterwarnings('ignore')

# %matplotlib inline

"""1. import
data"""

#importing datasets
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# check the number of rows and columns in the train dataset
train.shape

# print the column names of the train dataset
train.columns

# check the number of rows and columns in the test dataset
test.shape

# print the column names of the test dataset
test.columns

# finding duplicate rows in the train dataset
train_dup = train[train.duplicated()]

# check the number of rows and columns in the train duplicates dataframe
train_dup.shape

# finding duplicate rows in the test dataset
test_dup = test[test.duplicated()]

# check the number of rows and columns in the test duplicates dataframe
test_dup.shape

# deleting duplicate rows in the train dataset
train.drop_duplicates(keep='first', inplace=True)

print(train)

# checking the number of rows and columns after deleting duplicates in the train dataset
train.shape

# deleting duplicate rows in the test dataset
test.drop_duplicates(keep='first', inplace=True)

# checking the number of rows and columns after deleting duplicates in the test dataset
test.shape

# combine train and test into one dataframe to perform EDA
train['source'] = 'train'
test['source'] = 'test'
df = pd.concat([test, train])
print(df.shape)

# check the train dataset
train.head()

# preview the test dataset
test.head()

# preview the combined dataset
df.head()

# Removing outliers (done later, on the numeric columns only, after missing-value treatment)
# from scipy import stats
# z_scores = np.abs(stats.zscore(df))
# threshold = 3
# df_clean = df[(z_scores < threshold).all(axis=1)]

"""2. Figure out the


primary key and look for the requirement of indexing."""

# Get a summary of the dataset
df.describe()

# Count the number of missing values in the UID column
print(df['UID'].isna().sum())

"""**UID is the primary


key.**"""

# setting the index to UID (assign the result so the new index persists)
df = df.set_index(keys='UID')
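
# A quick sanity check (a sketch, not in the original notebook) that UID behaves like
# a primary key, i.e. every row has a distinct UID:
print(df.index.is_unique)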

"""3. Gauge the fill rate of the variables and devise plans for missing
value treatment. Please explain explicitly the reason for the treatment chosen for each
variable."""

# checking null values
df.isnull().sum()

# calculating the percentage of missing values in the combined dataset
missing_value = df.isnull().sum() * 100 / len(df)
df_missing_value = pd.DataFrame(missing_value, columns=['Percentage of missing value'])

df_missing_value

# dropping the columns 'BLOCKID' & 'SUMLEVEL'
df = df.drop('BLOCKID', axis=1)
df = df.drop('SUMLEVEL', axis=1)

sns.heatmap(df.corr())

df.info()

df['debt']

df['pct_own'].unique()

df.isnull().sum().sort_values(ascending=False)

df.dtypes.unique()

# impute the remaining missing values with the column medians (the median is robust to outliers and skew)
df.fillna(df.median(), inplace=True)

df.isnull().sum().sort_values(ascending=False)

# Identify the columns with numeric values
numeric_cols = df.select_dtypes(include=np.number).columns

from scipy import stats

# Identify and remove outliers using the z-score method (keep rows with |z| < 3 in every numeric column)
z_scores = np.abs(stats.zscore(df[numeric_cols]))
threshold = 3
data = df[(z_scores < threshold).all(axis=1)]

# Show the cleaned dataset
print(data)

data.shape

"""# **Exploratory Data Analysis (EDA)**

4.
Perform debt analysis.

### a) Explore the top 2,500 locations where the percentage of


households with a second mortgage is the highest and percent ownership is above 10 percent.
Visualize using geo-map. You may keep the upper limit for the percent of households with a
second mortgage to 50 percent
"""

# pct_own above 10% with second_mortgage capped at 50%, then take the top 2,500 rows by second_mortgage
df_new = data[(data['pct_own'] > 0.1) & (data['second_mortgage'] < 0.5)].nlargest(2500, 'second_mortgage')

df_new.sort_index()

import folium

# create a map centered on the USA
map_mortgage = folium.Map(location=[37.0902, -95.7129], zoom_start=4)

# add one marker per location to the map
for index, row in df_new.iterrows():
    folium.Marker([row['lat'], row['lng']]).add_to(map_mortgage)

map_mortgage
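
# Rendering 2,500 individual markers can be slow; a hedged alternative (same df_new,
# lat and lng columns) uses folium's MarkerCluster plugin to group nearby points:
from folium.plugins import MarkerCluster

map_clustered = folium.Map(location=[37.0902, -95.7129], zoom_start=4)
cluster = MarkerCluster().add_to(map_clustered)
for _, row in df_new.iterrows():
    folium.Marker([row['lat'], row['lng']]).add_to(cluster)
map_clustered
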
"""### b) Use the following bad
debt equation: Bad Debt = P (Second Mortgage ? Home Equity Loan) Bad Debt = second_mortgage +
home_equity - home_equity_second_mortgage"""

data['bad_debt'] = data['second_mortgage'] + data['home_equity'] - data['home_equity_second_mortgage']
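
# Section d) below also asks about "good debt", which the original code represents with the
# raw 'debt' column. One common derivation (an assumption, not stated in the source) is:
data['good_debt'] = data['debt'] - data['bad_debt']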

"""### c) Create pie charts to show overall


debt and bad debt"""

plt.pie(x=[data['debt'].sum(), data['bad_debt'].sum()],
        explode=None,
        labels=['Overall Debt', 'Bad Debt'],
        colors=['blue', 'yellow'],
        autopct='%1.1f%%',
        startangle=90)
plt.title('Overall Debt vs Bad Debt')
plt.show()

"""### d) Create Box and whisker plot and analyze the


distribution for 2nd mortgage, home equity, good debt, and bad debt for different
cities"""

# Assigning the top 5 cities (by value counts) to separate dataframes
df_city_Brooklyn = data.loc[data['city'] == 'Brooklyn']
df_city_Chicago = data.loc[data['city'] == 'Chicago']
df_city_LosAngeles = data.loc[data['city'] == 'Los Angeles']
df_city_Houston = data.loc[data['city'] == 'Houston']
df_city_Philadelphia = data.loc[data['city'] == 'Philadelphia']

# Combining the top 5 cities into a single dataframe for the box-and-whisker plots
df_city = pd.concat([df_city_Brooklyn, df_city_Chicago, df_city_LosAngeles, df_city_Houston, df_city_Philadelphia])

# box-and-whisker plots of 'second_mortgage', 'home_equity', 'debt' & 'bad_debt' by 'city' in df_city
plt.figure(figsize=(10, 8))

plt.subplot(2, 2, 1)
sns.boxplot(x='city', y='second_mortgage', data=df_city)
plt.subplot(2, 2, 2)
sns.boxplot(x='city', y='home_equity', data=df_city)
plt.subplot(2, 2, 3)
sns.boxplot(x='city', y='debt', data=df_city)
plt.subplot(2, 2, 4)
sns.boxplot(x='city', y='bad_debt', data=df_city)

plt.show()

"""### e) Create a collated income distribution chart


for family income, house hold income, and remaining
income"""

data['remaining_mean'] = data['family_mean'] -
data['hi_mean']

# Income distribution chart
plt.hist([data['family_mean'], data['hi_mean'], data['remaining_mean']],
         bins=50, label=['Family Income', 'Household Income', 'Remaining Income'])
plt.legend(loc='upper right')
plt.xlabel('Income')
plt.ylabel('Frequency')
plt.show()

"""# **Week 2:
Exploratory Data Analysis (EDA)**

1. Perform EDA and come out with insights into population


density and age. You may have to derive new fields (make sure to weight averages for accurate
measurements):

### a) Use pop and ALand variables to create a new field called population
density
"""

data['pop_density'] = data['pop'] / data['ALand']

data

"""### b) Use male_age_median, female_age_median, male_pop,


and female_pop to create a new field called median age """

data['median_age'] =
(data['male_age_median']*data['male_pop'] + data['female_age_median']*data['female_pop']) /
(data['male_pop'] + data['female_pop'])

data.head()

"""### c) Visualize the


findings using appropriate chart
type"""

plt.figure(figsize=(15,15))
sns.scatterplot(x='pop_density',
y='median_age', hue='state', data=data)
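
# pop_density is usually heavily right-skewed (land area dwarfs population counts), so a
# log-scaled x-axis can make the relationship easier to read -- a sketch using the same data:
plt.figure(figsize=(15, 8))
sns.scatterplot(x='pop_density', y='median_age', hue='state', data=data)
plt.xscale('log')
plt.show()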

"""2. Create bins for population into a


new variable by selecting appropriate class interval so that the number of categories don’t
exceed 5 for the ease of analysis."""

data['pop_bin'] = pd.cut(data['pop'],
bins=5, labels=['Very Low', 'Low', 'Medium', 'High', 'Very
High'])
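
# pd.cut creates equal-width bins, so a skewed 'pop' column can leave most rows in the
# lowest bin (the value_counts below will show the split). A hedged alternative is
# quantile-based binning with pd.qcut, which gives roughly equal-sized groups
# (pop_bin_q is a hypothetical extra column, not part of the original analysis):
data['pop_bin_q'] = pd.qcut(data['pop'], q=5,
                            labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'])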

data['pop_bin'].value_counts()

"""### a) Analyze the married,


separated, and divorced population for these population brackets"""

pop_groups
= data.groupby('pop_bin')[['married', 'separated',
'divorced']].count()

pop_groups.head()

"""### b) Visualize using appropriate


chart
type"""

plt.figure(figsize=(6, 4))

plt.subplot(2, 2, 1)
sns.histplot(data=pop_groups, x='married')
plt.subplot(2, 2, 2)
sns.histplot(data=pop_groups, x='separated')
plt.subplot(2, 2, 3)
sns.histplot(data=pop_groups, x='divorced')

plt.show()

plt.figure(figsize=(10, 7))
plt.hist([pop_groups['married'], pop_groups['separated'], pop_groups['divorced']],
         bins=50, label=['Married', 'Separated', 'Divorced'])
plt.legend(loc='upper right')
plt.xlabel('Population Bins')
plt.ylabel('Count')
plt.show()

"""### 3. Please detail your observations


for rent as a percentage of income at an overall level, and for different
states."""

data['income_mean'] = data['family_mean'] + data['hi_mean']

data['rent_percent'] = data['rent_mean'] * 100 / data['income_mean']

pd.set_option('display.precision', 2)

data.groupby(by='state')['rent_percent'].agg('mean')
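
# The question also asks for the overall level; a short sketch comparing the overall mean
# with the states where rent takes the largest share of income (same rent_percent column):
print('Overall rent as % of income:', round(data['rent_percent'].mean(), 2))
print(data.groupby('state')['rent_percent'].mean().sort_values(ascending=False).head())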

"""#### 4. Perform correlation analysis for all the relevant


variables by creating a heatmap. Describe your findings."""

df_corr =
data[['COUNTYID','STATEID','ALand','AWater','pop','rent_mean','family_mean','hc_mean','hc_mortg
age_mean','second_mortgage','home_equity',
'debt','hs_degree','bad_debt','pop_density','median_age','income_mean','rent_percent']]

plt.fi
gure(figsize=(20,20))
sns.heatmap(df_corr.corr(),cmap='cubehelix',annot=True)

""&quo
t;# **Week 3: Data Pre-processing**

### 1. The economic multivariate data has a significant


number of measured variables. The goal is to find where the measured variables depend on a
number of smaller unobserved common factors or latent variables. 2. Each variable is assumed to
be dependent upon a linear combination of the common factors, and the coefficients are known as
loadings. Each measured variable also includes a component due to independent random
variability, known as “specific variance” because it is specific to one variable. Obtain
the common factors and then plot the loadings. Use factor analysis to find latent variables in
our dataset and gain insight into the linear relationships in the data. Following are
the list of latent variables:

• Highschool graduation rates

• Median population age


Second mortgage statistics

• Percent own

• Bad debt expense


"""

!pip install factor_analyzer

from factor_analyzer import FactorAnalyzer
from sklearn.preprocessing import StandardScaler

# We have prior knowledge that certain columns in the dataset correspond directly to the
# latent variables listed above. To focus on the remaining measured variables and identify
# their underlying structure, we drop those columns; this isolates the effect of the latent
# variables on the remaining measured variables.

data_numeric = data.drop(['hs_degree', 'median_age', 'second_mortgage', 'pct_own', 'bad_debt'], axis=1)

data_numeric = data_numeric.select_dtypes(include=('int', 'float'))

# check for missing values
data_numeric.isnull().sum().any()

# Scale the data to zero mean and unit variance
scaler = StandardScaler()
data_numeric_scaled = scaler.fit_transform(data_numeric)

# Use EFA (exploratory factor analysis) with 5 factors, varimax rotation, and the principal method
fa = FactorAnalyzer(n_factors=5, rotation='varimax', method='principal')

# Fit the scaled dataset with the factor analyzer
fa.fit(data_numeric_scaled)

# interpret the results: the factor loadings matrix
factors = fa.loadings_
factors
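
# The brief above asks to plot the loadings rather than only print the matrix. A minimal
# sketch (assuming fa and data_numeric from the cells above) shows them as a heatmap
# labelled with the original column names:
loadings_df = pd.DataFrame(fa.loadings_,
                           index=data_numeric.columns,
                           columns=[f'Factor{i + 1}' for i in range(5)])
plt.figure(figsize=(8, 20))
sns.heatmap(loadings_df, cmap='coolwarm', annot=True)
plt.title('Factor loadings')
plt.show()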

"""# **WEEK 4: Data Modeling**

1. Build a linear
Regression model to predict the total monthly expenditure for home mortgages loan. Please refer
‘deplotment_RE.xlsx’. Column hc_mortgage_mean is predicted variable. This is the mean
monthly mortgage and owner costs of specified geographical location. Note: Exclude loans from
prediction model which have NaN (Not a Number) values for hc_mortgage_mean.

### a) Run a model


at a Nation level. If the accuracy levels and R square are not satisfactory proceed to below
step.
"""

# check for categorical and object datatype columns
df_cat = data.select_dtypes(include=('object', 'category'))

# checking the correlation of the target variable hc_mortgage_mean with the predictor variables/features
data.corr()['hc_mortgage_mean']

# one-hot encoding of all the object and categorical columns
df_encod = pd.get_dummies(df_cat)

df_encod

sns.distplot(data_numeric.hc_mortgage_mean)

"""## **hc_motgage has a positive skew**"""

# Calculate the correlation matrix
corr_matrix = data.corr()

# Select the target variable
target_var = 'hc_mortgage_mean'

# Sort the correlation matrix by the correlations with the target variable, in descending order
sorted_corr = corr_matrix[target_var].sort_values(ascending=False)

# Choose the five features with the highest correlation with the target variable
highest_corr_features = sorted_corr[1:6].index.tolist()

print(highest_corr_features)

sorted_corr

# count the features in sorted_corr with a correlation of at least 0.5
num_pos = 0
for corr in sorted_corr:
    if corr >= 0.5:
        num_pos += 1

print(num_pos)

# count the features in sorted_corr with a correlation of at most -0.5
num_neg = 0
for corr in sorted_corr:
    if corr <= -0.5:
        num_neg += 1

print(num_neg)

corr_features = sorted_corr[1:18].index.tolist()

print(corr_features)

corr_features

# count the features in sorted_corr with a correlation higher than 0.7
num = 0
for corr in sorted_corr:
    if corr > 0.7:
        num += 1

num

mul_colinear = sorted_corr[12:]

mul_colinear

# label encoding for the object and categorical datatype columns
from sklearn.preprocessing import LabelEncoder

for col in data.select_dtypes(include=('object', 'category')).columns:
    encoder = LabelEncoder()
    data[col] = encoder.fit_transform(data[col])

data.isnull().sum().any()

data

# separating the feature columns from the target variable
feature = data.drop('hc_mortgage_mean', axis=1)

target = data['hc_mortgage_mean']

# Calculate the correlation coefficient of each feature with the target
correlations = feature.corrwith(target)

# sort the correlations in descending order
correlations = correlations.sort_values(ascending=False)

correlations

# Predicting hc_mortgage_mean from the features most correlated with it
x = data[corr_features]
y = data['hc_mortgage_mean']

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=23)

from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(x_train, y_train)

y_pred = lr.predict(x_test)

from sklearn.metrics import mean_squared_error, r2_score
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

rmse

r2 = r2_score(y_test, y_pred)
print(r2)

"""Here accuracy of r2
is not good.

### b) Run another model at State level. There are 52 states in


USA.
"""

# check which state IDs are available
data['STATEID'].value_counts()

# for a selection of state IDs: create a subset for each ID, split it into train and test sets,
# fit the model on that subset, and check the resulting R2 and RMSE values using a for loop

lr2 = LinearRegression()

state_ids = [3, 30, 8, 36, 41, 33, 11, 20]

for i in state_ids:
    x_state_df = data[data['STATEID'] == i][corr_features]
    y_state_df = data[data['STATEID'] == i]['hc_mortgage_mean']
    x_train2, x_test2, y_train2, y_test2 = train_test_split(x_state_df, y_state_df, test_size=0.3, random_state=23)

    lr2.fit(x_train2, y_train2)
    y_pred2 = lr2.predict(x_test2)
    print('Result of state id =', i)
    print("Overall R2 score of linear regression model", r2_score(y_test2, y_pred2))
    print("Overall RMSE of linear regression model", np.sqrt(mean_squared_error(y_test2, y_pred2)))

# saving the final dataset to an Excel file
file_name = 'final_data.xlsx'
data.to_excel(file_name)

"""# **Conclusion:**
# **As r2
values are good we can say our model is good to test, we can test it on test
data**
"""

