"""Real Estate.ipynb
Automatically generated by
Colaboratory.
https://colab.research.google.com/drive/1Fe9_8xkXtzrvDqFz_a3MLU1uUWBfjqzt
# **WEEK 1: Data
Import and Preperation**
"""
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

warnings.filterwarnings('ignore')
# %matplotlib inline
"""1. import
data"""
# importing the train and test datasets
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
# list the columns in the train dataset
train.columns
# finding duplicate rows in the train dataset
train_dup = train[train.duplicated()]
print(train_dup)

# deleting duplicate rows in the train dataset (this step is implied by the
# shape check below but did not survive the export)
train.drop_duplicates(keep='first', inplace=True)

# checking the number of rows and columns after deleting duplicates in the train dataset
train.shape
# deleting duplicate rows in the test dataset
test.drop_duplicates(keep='first', inplace=True)
# checking the number of rows and columns after deleting duplicates in the test dataset
test.shape
# previewing the first rows of the test dataset
test.head()
# df is used for the remaining analysis; its construction did not survive the
# export, so we assume it is a working copy of the train data
df = train.copy()
df.head()
# Removing outliers
# from scipy import stats
# z_scores = np.abs(stats.zscore(df))
# threshold = 3
# df_clean = df[(z_scores < threshold).all(axis=1)]
"""3. Gauge the fill rate of the variables and devise plans for missing
value treatment. Please explain explicitly the reason for the treatment chosen for each
variable."""
# calculating the percentage of missing values in each column of df
missing_value = df.isnull().sum() * 100 / len(df)
df_missing_value = pd.DataFrame(missing_value, columns=['Percentage of missing value'])
df_missing_value
# dropping 'BLOCKID' & 'SUMLEVEL': identifier-style columns that carry no
# predictive information for this analysis
df = df.drop(['BLOCKID', 'SUMLEVEL'], axis=1)
sns.heatmap(df.corr())
df.info()
df['debt']
df['pct_own'].unique()
df.isnull().sum().sort_values(ascending=False)
df.dtypes.unique()
# impute the remaining missing values with the column medians, since the
# median is robust to the outliers still present in the data
df.fillna(df.median(), inplace=True)
df.isnull().sum().sort_values(ascending=False)
# Identify and remove outliers using z-scores on the numeric columns
# (numeric_cols was not defined in the export; reconstructed here)
numeric_cols = df.select_dtypes(include='number').columns
z_scores = np.abs(stats.zscore(df[numeric_cols]))
threshold = 3
data = df[(z_scores < threshold).all(axis=1)]
data.shape
"""4. Perform debt analysis."""
df_new.sort_index()

# plot each location as a marker on a folium map (the map construction and
# the row loop did not survive the export; reconstructed here)
import folium
map_mortgage = folium.Map(location=[df_new['lat'].mean(), df_new['lng'].mean()], zoom_start=4)
for _, row in df_new.iterrows():
    folium.Marker([row['lat'], row['lng']]).add_to(map_mortgage)
map_mortgage
"""### b) Use the following bad
debt equation: Bad Debt = P (Second Mortgage ? Home Equity Loan) Bad Debt = second_mortgage +
home_equity - home_equity_second_mortgage"""
data['bad_debt'] = data['second_mortgage'] + data['home_equity'] - data['home_equity_second_mortgage']
plt.pie(x=[data['debt'].sum(), data['bad_debt'].sum()],
        explode=None,
        labels=['Overall Debt', 'Bad Debt'],
        colors=['blue', 'yellow'],
        autopct='%1.1f%%',
        startangle=90)
plt.title('Overall Debt vs Bad Debt')
plt.show()
# box and whisker plots for 'second_mortgage', 'home_equity', 'debt' & 'bad_debt'
# w.r.t. 'city' as in df_city (assumed to be a subset of `data` restricted to a
# few cities; its construction did not survive the export)
plt.figure(figsize=(10,8))
plt.subplot(2,2,1)
sns.boxplot(x='city', y='second_mortgage', data=df_city)
plt.subplot(2,2,2)
sns.boxplot(x='city', y='home_equity', data=df_city)
plt.subplot(2,2,3)
sns.boxplot(x='city', y='debt', data=df_city)
plt.subplot(2,2,4)
sns.boxplot(x='city', y='bad_debt', data=df_city)
plt.show()
data['remaining_mean'] = data['family_mean'] - data['hi_mean']
"""# **Week 2:
Exploratory Data Analysis (EDA)**
### a) Use pop and ALand variables to create a new field called population
density
"""
data['pop_density'] = data['pop'] / data['ALand']
data
# population-weighted median age combining the male and female medians
data['median_age'] = (data['male_age_median']*data['male_pop'] + data['female_age_median']*data['female_pop']) / (data['male_pop'] + data['female_pop'])
data.head()
plt.figure(figsize=(15,15))
sns.scatterplot(x='pop_density', y='median_age', hue='state', data=data)
data['pop_bin'] = pd.cut(data['pop'], bins=5, labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'])
data['pop_bin'].value_counts()
pop_groups = data.groupby('pop_bin')[['married', 'separated', 'divorced']].count()
pop_groups.head()
plt.figure(figsize=(6,4))
plt.subplot(2,2,1)
sns.histplot(data=pop_groups, x='married')
plt.subplot(2,2,2)
sns.histplot(data=pop_groups, x='separated')
plt.subplot(2,2,3)
sns.histplot(data=pop_groups, x='divorced')
plt.show()
plt.figure(figsize=(10,7))
plt.hist([pop_groups['married'], pop_groups['separated'], pop_groups['divorced']], bins=50, label=['Married', 'Separated', 'Divorced'])
plt.legend(loc='upper right')
plt.xlabel('Population Bins')
plt.ylabel('Count')
plt.show()
data['income_mean'] = data['family_mean'] + data['hi_mean']
pd.set_option('display.precision',2)
data.groupby(by='state')['rent_percent'].agg('mean')
df_corr = data[['COUNTYID','STATEID','ALand','AWater','pop','rent_mean','family_mean','hc_mean','hc_mortgage_mean','second_mortgage','home_equity',
                'debt','hs_degree','bad_debt','pop_density','median_age','income_mean','rent_percent']]
plt.figure(figsize=(20,20))
sns.heatmap(df_corr.corr(), cmap='cubehelix', annot=True)
""&quo
t;# **Week 3: Data Pre-processing**
•
Second mortgage statistics
• Percent own
!pip install factor_analyzer
data_numeric = data.drop(['hs_degree','median_age','second_mortgage', 'pct_own', 'bad_debt'], axis=1)
data_numeric = data_numeric.select_dtypes(include=('int','float'))
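# factor_analyzer is installed above, but the fitting code did not survive the
# export. A minimal sketch, assuming varimax rotation and an illustrative
# n_factors=5 (both assumptions, not from the original notebook):
from factor_analyzer import FactorAnalyzer

fa = FactorAnalyzer(n_factors=5, rotation='varimax')
fa.fit(data_numeric.dropna())
# inspect the loading of each numeric column on the extracted factors
pd.DataFrame(fa.loadings_, index=data_numeric.columns)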
"""1. Build a linear regression model to predict the total monthly expenditure for home mortgage loans. Please refer to 'deplotment_RE.xlsx'. The column hc_mortgage_mean is the predicted variable: the mean monthly mortgage and owner costs for a specified geographical location. Note: exclude loans with NaN (Not a Number) values for hc_mortgage_mean from the prediction model."""
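# Per the note above, rows with missing hc_mortgage_mean must be excluded.
# That step is not in the export; a minimal sketch:
data = data.dropna(subset=['hc_mortgage_mean'])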
# encoding all the object and categorical columns (df_cat is assumed to be the
# categorical subset of the data; its selection did not survive the export)
df_cat = data.select_dtypes(include='object')
df_encod = pd.get_dummies(df_cat)
df_encod
# distribution of the target variable (sns.distplot is deprecated; histplot used instead)
sns.histplot(data_numeric.hc_mortgage_mean, kde=True)
# Calculate the correlation matrix
corr_matrix = data.corr()

# sort the correlations with the target variable (this step did not survive the export)
sorted_corr = corr_matrix['hc_mortgage_mean'].sort_values(ascending=False)

# Choose the features with the highest correlation with the target variable
highest_corr_features = sorted_corr[1:6].index.tolist()
print(highest_corr_features)
sorted_corr
# finding the number of features in sorted_corr with correlation between 0.5 and 1
num_pos = 0
for corr in sorted_corr:
    if corr >= 0.5:
        num_pos += 1
print(num_pos)
# finding the number of features in sorted_corr with correlation between -0.5 and -1
num_neg = 0
for corr in sorted_corr:
    if corr <= -0.5:
        num_neg += 1
print(num_neg)
corr_features = sorted_corr[1:18].index.tolist()
print(corr_features)
# finding the number of features in sorted_corr with correlation higher than 0.7
num = 0
for corr in sorted_corr:
    if corr > 0.7:
        num += 1
num
mul_colinear = sorted_corr[12:]
mul_colinear
# confirm there are no remaining missing values
data.isnull().sum().any()
data
# separating the feature columns from the target variable
feature = data.drop('hc_mortgage_mean', axis=1)
target = data['hc_mortgage_mean']
# Predicting hc_mortgage_mean from the selected correlated features
x = data[corr_features]
y = data['hc_mortgage_mean']

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=23)
# fit the linear regression model (the fitting step did not survive the export)
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(x_train, y_train)
y_pred = lr.predict(x_test)
from sklearn.metrics import mean_squared_error, r2_score
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
rmse
r2 = r2_score(y_test, y_pred)
print(r2)
"""Here accuracy of r2
is not good.
data['STATEID'].value_counts()
# take any state ID and create a new dataset for that ID,
# fit a train/test split and run the model for that dataset,
# then check the new R2 and RMSE values,
# using a for loop
lr2 = LinearRegression()
state_ids = [3, 30, 8, 36, 41, 33, 11, 20]
for i in state_ids:
    x_state_df = data[data['STATEID'] == i][corr_features]
    y_state_df = data[data['STATEID'] == i]['hc_mortgage_mean']
    x_train2, x_test2, y_train2, y_test2 = train_test_split(x_state_df, y_state_df, test_size=0.3, random_state=23)
    lr2.fit(x_train2, y_train2)
    y_pred2 = lr2.predict(x_test2)
    print('Result of state id =', i)
    print("Overall R2 score of linear regression model", r2_score(y_test2, y_pred2))
    print("RMSE of linear regression model", np.sqrt(mean_squared_error(y_test2, y_pred2)))
# saving the final dataset to an Excel file
file_name = 'final_data.xlsx'
data.to_excel(file_name)
"""# **Conclusion:**
# **As r2
values are good we can say our model is good to test, we can test it on test
data**
"""