You are on page 1 of 29

using_XGBoost

May 25, 2023

[1]: from xgboost import XGBRegressor


from catboost import CatBoostRegressor

[2]: import numpy as np


import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_absolute_error
pd.set_option('display.max_columns', None)

[3]: df_train = pd.read_csv('train.csv', index_col='Id')


df_train.head()

[3]: MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \


Id
1 60 RL 65.0 8450 Pave NaN Reg
2 20 RL 80.0 9600 Pave NaN Reg
3 60 RL 68.0 11250 Pave NaN IR1
4 70 RL 60.0 9550 Pave NaN IR1
5 60 RL 84.0 14260 Pave NaN IR1

LandContour Utilities LotConfig LandSlope Neighborhood Condition1 \


Id
1 Lvl AllPub Inside Gtl CollgCr Norm
2 Lvl AllPub FR2 Gtl Veenker Feedr
3 Lvl AllPub Inside Gtl CollgCr Norm
4 Lvl AllPub Corner Gtl Crawfor Norm
5 Lvl AllPub FR2 Gtl NoRidge Norm

Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt \


Id
1 Norm 1Fam 2Story 7 5 2003

1
2 Norm 1Fam 1Story 6 8 1976
3 Norm 1Fam 2Story 7 5 2001
4 Norm 1Fam 2Story 7 5 1915
5 Norm 1Fam 2Story 8 5 2000

YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType \


Id
1 2003 Gable CompShg VinylSd VinylSd BrkFace
2 1976 Gable CompShg MetalSd MetalSd None
3 2002 Gable CompShg VinylSd VinylSd BrkFace
4 1970 Gable CompShg Wd Sdng Wd Shng None
5 2000 Gable CompShg VinylSd VinylSd BrkFace

MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure \


Id
1 196.0 Gd TA PConc Gd TA No
2 0.0 TA TA CBlock Gd TA Gd
3 162.0 Gd TA PConc Gd TA Mn
4 0.0 TA TA BrkTil TA Gd No
5 350.0 Gd TA PConc Gd TA Av

BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF \


Id
1 GLQ 706 Unf 0 150 856
2 ALQ 978 Unf 0 284 1262
3 GLQ 486 Unf 0 434 920
4 ALQ 216 Unf 0 540 756
5 GLQ 655 Unf 0 490 1145

Heating HeatingQC CentralAir Electrical 1stFlrSF 2ndFlrSF LowQualFinSF \


Id
1 GasA Ex Y SBrkr 856 854 0
2 GasA Ex Y SBrkr 1262 0 0
3 GasA Ex Y SBrkr 920 866 0
4 GasA Gd Y SBrkr 961 756 0
5 GasA Ex Y SBrkr 1145 1053 0

GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr \


Id
1 1710 1 0 2 1 3
2 1262 0 1 2 0 3
3 1786 1 0 2 1 3
4 1717 1 0 1 0 3
5 2198 1 0 2 1 4

KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu \


Id

2
1 1 Gd 8 Typ 0 NaN
2 1 TA 6 Typ 1 TA
3 1 Gd 6 Typ 1 TA
4 1 Gd 7 Typ 1 Gd
5 1 Gd 9 Typ 1 TA

GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual \


Id
1 Attchd 2003.0 RFn 2 548 TA
2 Attchd 1976.0 RFn 2 460 TA
3 Attchd 2001.0 RFn 2 608 TA
4 Detchd 1998.0 Unf 3 642 TA
5 Attchd 2000.0 RFn 3 836 TA

GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch \


Id
1 TA Y 0 61 0 0
2 TA Y 298 0 0 0
3 TA Y 0 42 0 0
4 TA Y 0 35 272 0
5 TA Y 192 84 0 0

ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold \


Id
1 0 0 NaN NaN NaN 0 2 2008
2 0 0 NaN NaN NaN 0 5 2007
3 0 0 NaN NaN NaN 0 9 2008
4 0 0 NaN NaN NaN 0 2 2006
5 0 0 NaN NaN NaN 0 12 2008

SaleType SaleCondition SalePrice


Id
1 WD Normal 208500
2 WD Normal 181500
3 WD Normal 223500
4 WD Abnorml 140000
5 WD Normal 250000

[4]: df_test = pd.read_csv('test.csv', index_col='Id')


df_test.head()

[4]: MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \


Id
1461 20 RH 80.0 11622 Pave NaN Reg
1462 20 RL 81.0 14267 Pave NaN IR1
1463 60 RL 74.0 13830 Pave NaN IR1
1464 60 RL 78.0 9978 Pave NaN IR1

3
1465 120 RL 43.0 5005 Pave NaN IR1

LandContour Utilities LotConfig LandSlope Neighborhood Condition1 \


Id
1461 Lvl AllPub Inside Gtl NAmes Feedr
1462 Lvl AllPub Corner Gtl NAmes Norm
1463 Lvl AllPub Inside Gtl Gilbert Norm
1464 Lvl AllPub Inside Gtl Gilbert Norm
1465 HLS AllPub Inside Gtl StoneBr Norm

Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt \


Id
1461 Norm 1Fam 1Story 5 6 1961
1462 Norm 1Fam 1Story 6 6 1958
1463 Norm 1Fam 2Story 5 5 1997
1464 Norm 1Fam 2Story 6 6 1998
1465 Norm TwnhsE 1Story 8 5 1992

YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType \


Id
1461 1961 Gable CompShg VinylSd VinylSd None
1462 1958 Hip CompShg Wd Sdng Wd Sdng BrkFace
1463 1998 Gable CompShg VinylSd VinylSd None
1464 1998 Gable CompShg VinylSd VinylSd BrkFace
1465 1992 Gable CompShg HdBoard HdBoard None

MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond \


Id
1461 0.0 TA TA CBlock TA TA
1462 108.0 TA TA CBlock TA TA
1463 0.0 TA TA PConc Gd TA
1464 20.0 TA TA PConc TA TA
1465 0.0 Gd TA PConc Gd TA

BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 \


Id
1461 No Rec 468.0 LwQ 144.0
1462 No ALQ 923.0 Unf 0.0
1463 No GLQ 791.0 Unf 0.0
1464 No GLQ 602.0 Unf 0.0
1465 No ALQ 263.0 Unf 0.0

BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical \


Id
1461 270.0 882.0 GasA TA Y SBrkr
1462 406.0 1329.0 GasA TA Y SBrkr
1463 137.0 928.0 GasA Gd Y SBrkr

4
1464 324.0 926.0 GasA Ex Y SBrkr
1465 1017.0 1280.0 GasA Ex Y SBrkr

1stFlrSF 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath \


Id
1461 896 0 0 896 0.0 0.0
1462 1329 0 0 1329 0.0 0.0
1463 928 701 0 1629 0.0 0.0
1464 926 678 0 1604 0.0 0.0
1465 1280 0 0 1280 0.0 0.0

FullBath HalfBath BedroomAbvGr KitchenAbvGr KitchenQual \


Id
1461 1 0 2 1 TA
1462 1 1 3 1 Gd
1463 2 1 3 1 TA
1464 2 1 3 1 Gd
1465 2 0 2 1 Gd

TotRmsAbvGrd Functional Fireplaces FireplaceQu GarageType GarageYrBlt \


Id
1461 5 Typ 0 NaN Attchd 1961.0
1462 6 Typ 0 NaN Attchd 1958.0
1463 6 Typ 1 TA Attchd 1997.0
1464 7 Typ 1 Gd Attchd 1998.0
1465 5 Typ 0 NaN Attchd 1992.0

GarageFinish GarageCars GarageArea GarageQual GarageCond PavedDrive \


Id
1461 Unf 1.0 730.0 TA TA Y
1462 Unf 1.0 312.0 TA TA Y
1463 Fin 2.0 482.0 TA TA Y
1464 Fin 2.0 470.0 TA TA Y
1465 RFn 2.0 506.0 TA TA Y

WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch \


Id
1461 140 0 0 0 120
1462 393 36 0 0 0
1463 212 34 0 0 0
1464 360 36 0 0 0
1465 0 82 0 0 144

PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType \


Id
1461 0 NaN MnPrv NaN 0 6 2010 WD
1462 0 NaN NaN Gar2 12500 6 2010 WD

5
1463 0 NaN MnPrv NaN 0 3 2010 WD
1464 0 NaN NaN NaN 0 6 2010 WD
1465 0 NaN NaN NaN 0 1 2010 WD

SaleCondition
Id
1461 Normal
1462 Normal
1463 Normal
1464 Normal
1465 Normal

[5]: sns.displot(data=df_train,x='SalePrice',kde=False )
plt.show()

# In[6]:
# Numeric (non-object) columns of the training set, used for the boxplots below.
numerical_data = df_train.select_dtypes(exclude='object')

# In[7]:
# Apply the theme once, before any figure exists, so that every boxplot
# (including the first one) is drawn with the same style.
sns.set_theme(style="whitegrid")
for column in numerical_data:
    # One wide boxplot per numeric column to eyeball its spread and outliers.
    fig, ax = plt.subplots(figsize=(16, 4))
    sns.boxplot(data=numerical_data, x=column, ax=ax)
    plt.show()
    # Release the figure so the loop does not accumulate open figures.
    plt.close(fig)

7
8
9
10
11
12
13
14
15
16
[8]: feat_train_not_test = [col for col in df_train.columns if col not in df_test.
↪columns and col != 'SalePrice']

print(feat_train_not_test)

[]

[9]: feat_test_not_train = [col for col in df_test.columns if col not in df_train.


↪columns and col != 'SalePrice']

# Show the columns that exist only in the test set (the list built in this cell).
print(feat_test_not_train)

[]

[10]: df_merge = pd.concat([df_test.assign(ind = 'test'), df_train.assign(ind =␣


↪'train')])

df_merge.head()

[10]: MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \


Id
1461 20 RH 80.0 11622 Pave NaN Reg
1462 20 RL 81.0 14267 Pave NaN IR1
1463 60 RL 74.0 13830 Pave NaN IR1

17
1464 60 RL 78.0 9978 Pave NaN IR1
1465 120 RL 43.0 5005 Pave NaN IR1

LandContour Utilities LotConfig LandSlope Neighborhood Condition1 \


Id
1461 Lvl AllPub Inside Gtl NAmes Feedr
1462 Lvl AllPub Corner Gtl NAmes Norm
1463 Lvl AllPub Inside Gtl Gilbert Norm
1464 Lvl AllPub Inside Gtl Gilbert Norm
1465 HLS AllPub Inside Gtl StoneBr Norm

Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt \


Id
1461 Norm 1Fam 1Story 5 6 1961
1462 Norm 1Fam 1Story 6 6 1958
1463 Norm 1Fam 2Story 5 5 1997
1464 Norm 1Fam 2Story 6 6 1998
1465 Norm TwnhsE 1Story 8 5 1992

YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType \


Id
1461 1961 Gable CompShg VinylSd VinylSd None
1462 1958 Hip CompShg Wd Sdng Wd Sdng BrkFace
1463 1998 Gable CompShg VinylSd VinylSd None
1464 1998 Gable CompShg VinylSd VinylSd BrkFace
1465 1992 Gable CompShg HdBoard HdBoard None

MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond \


Id
1461 0.0 TA TA CBlock TA TA
1462 108.0 TA TA CBlock TA TA
1463 0.0 TA TA PConc Gd TA
1464 20.0 TA TA PConc TA TA
1465 0.0 Gd TA PConc Gd TA

BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 \


Id
1461 No Rec 468.0 LwQ 144.0
1462 No ALQ 923.0 Unf 0.0
1463 No GLQ 791.0 Unf 0.0
1464 No GLQ 602.0 Unf 0.0
1465 No ALQ 263.0 Unf 0.0

BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical \


Id
1461 270.0 882.0 GasA TA Y SBrkr
1462 406.0 1329.0 GasA TA Y SBrkr

18
1463 137.0 928.0 GasA Gd Y SBrkr
1464 324.0 926.0 GasA Ex Y SBrkr
1465 1017.0 1280.0 GasA Ex Y SBrkr

1stFlrSF 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath \


Id
1461 896 0 0 896 0.0 0.0
1462 1329 0 0 1329 0.0 0.0
1463 928 701 0 1629 0.0 0.0
1464 926 678 0 1604 0.0 0.0
1465 1280 0 0 1280 0.0 0.0

FullBath HalfBath BedroomAbvGr KitchenAbvGr KitchenQual \


Id
1461 1 0 2 1 TA
1462 1 1 3 1 Gd
1463 2 1 3 1 TA
1464 2 1 3 1 Gd
1465 2 0 2 1 Gd

TotRmsAbvGrd Functional Fireplaces FireplaceQu GarageType GarageYrBlt \


Id
1461 5 Typ 0 NaN Attchd 1961.0
1462 6 Typ 0 NaN Attchd 1958.0
1463 6 Typ 1 TA Attchd 1997.0
1464 7 Typ 1 Gd Attchd 1998.0
1465 5 Typ 0 NaN Attchd 1992.0

GarageFinish GarageCars GarageArea GarageQual GarageCond PavedDrive \


Id
1461 Unf 1.0 730.0 TA TA Y
1462 Unf 1.0 312.0 TA TA Y
1463 Fin 2.0 482.0 TA TA Y
1464 Fin 2.0 470.0 TA TA Y
1465 RFn 2.0 506.0 TA TA Y

WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch \


Id
1461 140 0 0 0 120
1462 393 36 0 0 0
1463 212 34 0 0 0
1464 360 36 0 0 0
1465 0 82 0 0 144

PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType \


Id
1461 0 NaN MnPrv NaN 0 6 2010 WD

19
1462 0 NaN NaN Gar2 12500 6 2010 WD
1463 0 NaN MnPrv NaN 0 3 2010 WD
1464 0 NaN NaN NaN 0 6 2010 WD
1465 0 NaN NaN NaN 0 1 2010 WD

SaleCondition ind SalePrice


Id
1461 Normal test NaN
1462 Normal test NaN
1463 Normal test NaN
1464 Normal test NaN
1465 Normal test NaN

[11]: pd.set_option('display.max_rows', 100)


df_merge.isna().sum()

[11]: MSSubClass 0
MSZoning 4
LotFrontage 486
LotArea 0
Street 0
Alley 2721
LotShape 0
LandContour 0
Utilities 2
LotConfig 0
LandSlope 0
Neighborhood 0
Condition1 0
Condition2 0
BldgType 0
HouseStyle 0
OverallQual 0
OverallCond 0
YearBuilt 0
YearRemodAdd 0
RoofStyle 0
RoofMatl 0
Exterior1st 1
Exterior2nd 1
MasVnrType 24
MasVnrArea 23
ExterQual 0
ExterCond 0
Foundation 0
BsmtQual 81
BsmtCond 82

20
BsmtExposure 82
BsmtFinType1 79
BsmtFinSF1 1
BsmtFinType2 80
BsmtFinSF2 1
BsmtUnfSF 1
TotalBsmtSF 1
Heating 0
HeatingQC 0
CentralAir 0
Electrical 1
1stFlrSF 0
2ndFlrSF 0
LowQualFinSF 0
GrLivArea 0
BsmtFullBath 2
BsmtHalfBath 2
FullBath 0
HalfBath 0
BedroomAbvGr 0
KitchenAbvGr 0
KitchenQual 1
TotRmsAbvGrd 0
Functional 2
Fireplaces 0
FireplaceQu 1420
GarageType 157
GarageYrBlt 159
GarageFinish 159
GarageCars 1
GarageArea 1
GarageQual 159
GarageCond 159
PavedDrive 0
WoodDeckSF 0
OpenPorchSF 0
EnclosedPorch 0
3SsnPorch 0
ScreenPorch 0
PoolArea 0
PoolQC 2909
Fence 2348
MiscFeature 2814
MiscVal 0
MoSold 0
YrSold 0
SaleType 1

21
SaleCondition 0
ind 0
SalePrice 1459
dtype: int64

[12]: #Get a list of all the categorical features that have the keyword 'Qual' OR␣
↪'Cond' OR 'Qu' OR 'QC' in the feature name

feat_rating_Qual = [col for col in df_merge.columns if 'Qual' in col and␣


↪df_merge[col].dtypes=='object']

feat_rating_Cond = [col for col in df_merge.columns if 'Cond' in col and col␣


↪not in ['Condition1', 'Condition2', 'SaleCondition'] and df_merge[col].

↪dtypes=='object']

feat_rating_Qu = [col for col in df_merge.columns if 'Qu' in col and col not in␣
↪feat_rating_Qual and df_merge[col].dtypes=='object']

feat_rating_QC = [col for col in df_merge.columns if 'QC' in col and␣


↪df_merge[col].dtypes=='object']

cat_feat_with_rating = feat_rating_Qual + feat_rating_Cond + feat_rating_Qu +␣


↪feat_rating_QC

print(cat_feat_with_rating)

['ExterQual', 'BsmtQual', 'KitchenQual', 'GarageQual', 'ExterCond', 'BsmtCond',


'GarageCond', 'FireplaceQu', 'HeatingQC', 'PoolQC']

[13]: cat_feat_with_legit_na = ['Alley', 'BsmtExposure', 'BsmtFinType1',␣


↪'BsmtFinType2', 'GarageType', 'GarageFinish', 'Fence', 'MiscFeature']

df_merge[cat_feat_with_legit_na].head()

[13]: Alley BsmtExposure BsmtFinType1 BsmtFinType2 GarageType GarageFinish \


Id
1461 NaN No Rec LwQ Attchd Unf
1462 NaN No ALQ Unf Attchd Unf
1463 NaN No GLQ Unf Attchd Fin
1464 NaN No GLQ Unf Attchd Fin
1465 NaN No ALQ Unf Attchd RFn

Fence MiscFeature
Id
1461 MnPrv NaN
1462 NaN Gar2
1463 MnPrv NaN
1464 NaN NaN
1465 NaN NaN

[14]: ordinal_cat_feat = cat_feat_with_rating + cat_feat_with_legit_na


df_merge[ordinal_cat_feat].head()

22
[14]: ExterQual BsmtQual KitchenQual GarageQual ExterCond BsmtCond GarageCond \
Id
1461 TA TA TA TA TA TA TA
1462 TA TA Gd TA TA TA TA
1463 TA Gd TA TA TA TA TA
1464 TA TA Gd TA TA TA TA
1465 Gd Gd Gd TA TA TA TA

FireplaceQu HeatingQC PoolQC Alley BsmtExposure BsmtFinType1 \


Id
1461 NaN TA NaN NaN No Rec
1462 NaN TA NaN NaN No ALQ
1463 TA Gd NaN NaN No GLQ
1464 Gd Ex NaN NaN No GLQ
1465 NaN Ex NaN NaN No ALQ

BsmtFinType2 GarageType GarageFinish Fence MiscFeature


Id
1461 LwQ Attchd Unf MnPrv NaN
1462 Unf Attchd Unf NaN Gar2
1463 Unf Attchd Fin MnPrv NaN
1464 Unf Attchd Fin NaN NaN
1465 Unf Attchd RFn NaN NaN

[15]: df_merge[ordinal_cat_feat] = df_merge[ordinal_cat_feat].fillna("Missing")


df_merge[ordinal_cat_feat].isna().sum()

[15]: ExterQual 0
BsmtQual 0
KitchenQual 0
GarageQual 0
ExterCond 0
BsmtCond 0
GarageCond 0
FireplaceQu 0
HeatingQC 0
PoolQC 0
Alley 0
BsmtExposure 0
BsmtFinType1 0
BsmtFinType2 0
GarageType 0
GarageFinish 0
Fence 0
MiscFeature 0
dtype: int64

23
# In[16]:
# Categorical (object dtype) columns, leaving out the train/test marker 'ind'.
cat_cols = [col for col in df_merge if df_merge[col].dtypes == 'object' and col != 'ind']

# Columns not already filled with "Missing" as part of ordinal_cat_feat.
remain_cat_col = [col for col in cat_cols if col not in ordinal_cat_feat]

# Impute each remaining categorical column with its most frequent value.
for col in remain_cat_col:
    df_merge[col] = df_merge[col].fillna(df_merge[col].mode()[0])

# In[17]:
# Handle missing values in numerical features by replacing them with the mean value.
# SalePrice is the target and is NaN for every test row by construction, so it
# is excluded here: imputing it would give the test rows a fabricated target.
num_cols = [
    col for col in df_merge.columns
    if df_merge[col].dtypes != 'object' and col != 'SalePrice'
]
df_merge[num_cols] = df_merge[num_cols].fillna(df_merge[num_cols].mean())

# In[18]:
# Re-express the three year columns as ages relative to the year of sale.
sale_year = df_merge['YrSold']
garage_year = df_merge['GarageYrBlt'].astype('int')
df_merge['GarageYrBlt'] = sale_year - garage_year
for year_col in ('YearBuilt', 'YearRemodAdd'):
    df_merge[year_col] = sale_year - df_merge[year_col]
# The sale date columns are not used as features after this point.
df_merge = df_merge.drop(columns=["YrSold", "MoSold"])

# In[19]:
# TotalBsmtSF (total square feet of basement area)
#   = BsmtFinSF1 (type 1 finished square feet)
#   + BsmtFinSF2 (type 2 finished square feet)
#   + BsmtUnfSF (unfinished square feet of basement area)
# TotalBsmtSF itself is kept; only the two finished parts are merged below.

# Basement finished area
bsmt_fin_1 = df_merge['BsmtFinSF1']
bsmt_fin_2 = df_merge['BsmtFinSF2']
df_merge['BsmtFinSF'] = bsmt_fin_1 + bsmt_fin_2
df_merge = df_merge.drop(columns=["BsmtFinSF1", "BsmtFinSF2"])

# Total floor square feet
first_floor = df_merge['1stFlrSF']
second_floor = df_merge['2ndFlrSF']
df_merge['TotalFlrSF'] = first_floor + second_floor
df_merge = df_merge.drop(columns=["1stFlrSF", "2ndFlrSF"])

# In[20]:
# Total bathroom count; each half bath contributes 0.5.
full_bath = df_merge['FullBath']
half_bath = 0.5 * df_merge['HalfBath']
bsmt_full_bath = df_merge['BsmtFullBath']
bsmt_half_bath = 0.5 * df_merge['BsmtHalfBath']
df_merge['Total_Bath'] = full_bath + half_bath + bsmt_full_bath + bsmt_half_bath
df_merge = df_merge.drop(
    columns=["FullBath", "HalfBath", "BsmtFullBath", "BsmtHalfBath"]
)

# In[21]:
# Numeric feature columns (the SalePrice target is left out).
num_cols = [
    col for col in df_merge.columns
    if df_merge[col].dtypes != 'object' and col != 'SalePrice'
]
skew_df = pd.DataFrame({'Feature': num_cols})

# In[22]:
# Sample skewness of every numeric feature.
skew_df['Skew'] = [
    scipy.stats.skew(df_merge[feature]) for feature in skew_df['Feature']
]

# In[23]:
skew_df['Absolute Skew'] = skew_df['Skew'].abs()

# In[24]:
# A feature counts as skewed when its absolute skewness is at least 0.5.
skew_df['Skewed'] = skew_df['Absolute Skew'] >= 0.5

24
[25]: skew_df

[25]: Feature Skew Absolute Skew Skewed


0 MSSubClass 1.375457 1.375457 True
1 LotFrontage 1.645574 1.645574 True
2 LotArea 12.822431 12.822431 True
3 OverallQual 0.197110 0.197110 False
4 OverallCond 0.570312 0.570312 True
5 YearBuilt 0.598609 0.598609 True
6 YearRemodAdd 0.450226 0.450226 False
7 MasVnrArea 2.611549 2.611549 True
8 BsmtUnfSF 0.919508 0.919508 True
9 TotalBsmtSF 1.162484 1.162484 True
10 LowQualFinSF 12.088761 12.088761 True
11 GrLivArea 1.269358 1.269358 True
12 BedroomAbvGr 0.326324 0.326324 False
13 KitchenAbvGr 4.302254 4.302254 True
14 TotRmsAbvGrd 0.758367 0.758367 True
15 Fireplaces 0.733495 0.733495 True
16 GarageYrBlt 0.386423 0.386423 False
17 GarageCars -0.218298 0.218298 False
18 GarageArea 0.241218 0.241218 False
19 WoodDeckSF 1.842433 1.842433 True
20 OpenPorchSF 2.535114 2.535114 True
21 EnclosedPorch 4.003891 4.003891 True
22 3SsnPorch 11.376065 11.376065 True
23 ScreenPorch 3.946694 3.946694 True
24 PoolArea 16.898328 16.898328 True
25 MiscVal 21.947195 21.947195 True
26 BsmtFinSF 1.178603 1.178603 True
27 TotalFlrSF 1.252036 1.252036 True
28 Total_Bath 0.493840 0.493840 False

[26]: df_merge[num_cols].describe()

[26]: MSSubClass LotFrontage LotArea OverallQual OverallCond \


count 2919.000000 2919.000000 2919.000000 2919.000000 2919.000000
mean 57.137718 69.305795 10168.114080 6.089072 5.564577
std 42.517628 21.312345 7886.996359 1.409947 1.113131
min 20.000000 21.000000 1300.000000 1.000000 1.000000
25% 20.000000 60.000000 7478.000000 5.000000 5.000000
50% 50.000000 69.305795 9453.000000 6.000000 5.000000
75% 70.000000 78.000000 11570.000000 7.000000 6.000000
max 190.000000 313.000000 215245.000000 10.000000 9.000000

YearBuilt YearRemodAdd MasVnrArea BsmtUnfSF TotalBsmtSF \


count 2919.000000 2919.000000 2919.000000 2919.000000 2919.000000

25
mean 36.479959 23.528263 102.201312 560.772104 1051.777587
std 30.336182 20.892061 178.626089 439.468337 440.690726
min -1.000000 -2.000000 0.000000 0.000000 0.000000
25% 7.000000 4.000000 0.000000 220.000000 793.000000
50% 35.000000 15.000000 0.000000 467.000000 990.000000
75% 54.500000 43.000000 163.500000 805.000000 1302.000000
max 136.000000 60.000000 1600.000000 2336.000000 6110.000000

LowQualFinSF GrLivArea BedroomAbvGr KitchenAbvGr TotRmsAbvGrd \


count 2919.000000 2919.000000 2919.000000 2919.000000 2919.000000
mean 4.694416 1500.759849 2.860226 1.044536 6.451524
std 46.396825 506.051045 0.822693 0.214462 1.569379
min 0.000000 334.000000 0.000000 0.000000 2.000000
25% 0.000000 1126.000000 2.000000 1.000000 5.000000
50% 0.000000 1444.000000 3.000000 1.000000 6.000000
75% 0.000000 1743.500000 3.000000 1.000000 7.000000
max 1064.000000 5642.000000 8.000000 3.000000 15.000000

Fireplaces GarageYrBlt GarageCars GarageArea WoodDeckSF \


count 2919.000000 2919.000000 2919.000000 2919.000000 2919.000000
mean 0.597122 29.685509 1.766621 472.874572 93.709832
std 0.646129 24.908326 0.761494 215.357904 126.526589
min 0.000000 -200.000000 0.000000 0.000000 0.000000
25% 0.000000 7.000000 1.000000 320.000000 0.000000
50% 1.000000 29.000000 2.000000 480.000000 0.000000
75% 1.000000 46.000000 2.000000 576.000000 168.000000
max 4.000000 114.000000 5.000000 1488.000000 1424.000000

OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea \


count 2919.000000 2919.000000 2919.000000 2919.000000 2919.000000
mean 47.486811 23.098321 2.602261 16.062350 2.251799
std 67.575493 64.244246 25.188169 56.184365 35.663946
min 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.000000 0.000000 0.000000 0.000000
50% 26.000000 0.000000 0.000000 0.000000 0.000000
75% 70.000000 0.000000 0.000000 0.000000 0.000000
max 742.000000 1012.000000 508.000000 576.000000 800.000000

MiscVal BsmtFinSF TotalFlrSF Total_Bath


count 2919.000000 2919.000000 2919.000000 2919.000000
mean 50.825968 491.005483 1496.065433 2.218712
std 567.402211 477.123276 503.675790 0.808943
min 0.000000 0.000000 334.000000 1.000000
25% 0.000000 0.000000 1120.000000 1.500000
50% 0.000000 458.000000 1440.000000 2.000000
75% 0.000000 805.500000 1742.500000 2.500000
max 17000.000000 5644.000000 5642.000000 7.000000

26
# In[27]:
# Log-transform every feature flagged as skewed in skew_df.
for column in skew_df.query("Skewed == True")['Feature'].values:
    # The age features computed as YrSold minus a year can be negative: the
    # describe() output shows YearBuilt with a minimum of -1, and YearBuilt is
    # flagged as skewed. log1p(-1) is -inf (and NaN below -1), so clip at zero
    # first to keep the transform finite. Non-negative columns are unaffected.
    df_merge[column] = np.log1p(df_merge[column].clip(lower=0))

[28]: df_merge[cat_feat_with_rating]

[28]: ExterQual BsmtQual KitchenQual GarageQual ExterCond BsmtCond GarageCond \


Id
1461 TA TA TA TA TA TA TA
1462 TA TA Gd TA TA TA TA
1463 TA Gd TA TA TA TA TA
1464 TA TA Gd TA TA TA TA
1465 Gd Gd Gd TA TA TA TA
… … … … … … … …
1456 TA Gd TA TA TA TA TA
1457 TA Gd TA TA TA TA TA
1458 Ex TA Gd TA Gd Gd TA
1459 TA TA Gd TA TA TA TA
1460 Gd TA TA TA TA TA TA

FireplaceQu HeatingQC PoolQC


Id
1461 Missing TA Missing
1462 Missing TA Missing
1463 TA Gd Missing
1464 Gd Ex Missing
1465 Missing Ex Missing
… … … …
1456 TA Ex Missing
1457 TA TA Missing
1458 Gd Ex Missing
1459 Missing Gd Missing
1460 Missing Gd Missing

[2919 rows x 10 columns]

# In[29]:
# Every quality/condition rating column shares the same scale.
rating_scale = {"Missing": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
for col in cat_feat_with_rating:
    df_merge[col] = df_merge[col].map(rating_scale)

# In[30]:
# Exclude the 'Alley', 'MiscFeature' and 'GarageType' features as they are not ordinal.
bsmt_fin_type_scale = {
    "Missing": 0, "Unf": 1, "LwQ": 2, "Rec": 3, "BLQ": 4, "ALQ": 5, "GLQ": 6,
}
legit_na_scales = {
    'BsmtExposure': {"Missing": 0, "No": 1, "Mn": 2, "Av": 3, "Gd": 4},
    'BsmtFinType1': bsmt_fin_type_scale,
    'BsmtFinType2': bsmt_fin_type_scale,
    'GarageFinish': {"Missing": 0, "Unf": 1, "RFn": 2, "Fin": 3},
    'Fence': {"Missing": 0, "MnWw": 1, "GdWo": 2, "MnPrv": 3, "GdPrv": 4},
}
for col, scale in legit_na_scales.items():
    # astype('int') raises if a value is absent from the scale (map yields NaN).
    df_merge[col] = df_merge[col].map(scale).astype('int')

# In[31]:
# Remaining features encoded as integers on a per-column scale.
other_ordinal_scales = {
    'LotShape': {"IR3": 1, "IR2": 2, "IR1": 3, "Reg": 4},
    'LandContour': {"Low": 1, "Bnk": 2, "HLS": 3, "Lvl": 4},
    'Utilities': {"ELO": 1, "NoSeWa": 2, "NoSewr": 3, "AllPub": 4},
    'LandSlope': {"Sev": 1, "Mod": 2, "Gtl": 3},
    'CentralAir': {"N": 0, "Y": 1},
    'PavedDrive': {"N": 0, "P": 1, "Y": 2},
}
for col, scale in other_ordinal_scales.items():
    df_merge[col] = df_merge[col].map(scale).astype('int')

[32]: cat_remaining_to_encode = [col for col in df_merge.columns if df_merge[col].


↪dtypes=='object' and col !='ind']

print(cat_remaining_to_encode)

['MSZoning', 'Street', 'Alley', 'LotConfig', 'Neighborhood', 'Condition1',


'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
'Exterior2nd', 'MasVnrType', 'Foundation', 'Heating', 'Electrical',
'Functional', 'GarageType', 'MiscFeature', 'SaleType', 'SaleCondition']

# In[33]:
# One-hot encode the remaining categorical columns (first level of each is
# dropped), then swap the original columns for their dummy columns.
df_merge_dummies = pd.get_dummies(
    df_merge[cat_remaining_to_encode],
    drop_first=True,
)
df_merge = pd.concat(
    [df_merge.drop(columns=cat_remaining_to_encode), df_merge_dummies],
    axis=1,
)

# In[34]:
# Split the merged frame back into test and train rows using the 'ind' marker.
# drop() is used without inplace=True so that each result is a new, independent
# frame; dropping in place on a filtered slice of df_merge is the pattern that
# triggers pandas' SettingWithCopyWarning.
is_test = df_merge["ind"].eq("test")
is_train = df_merge["ind"].eq("train")
test = df_merge[is_test].drop(columns=["SalePrice", "ind"])
train = df_merge[is_train].drop(columns=["ind"])

# In[35]:
# The model is fitted on the natural log of the sale price.
log_target = np.log(train['SalePrice'])
train = train.drop(columns=["SalePrice"])

[36]: catboost_params = {
'iterations': 5000,
'learning_rate': 0.02,
'depth': 4,
'eval_metric':'RMSE',
'early_stopping_rounds': 20

28
}
xgboost_params = {
'n_estimators': 5000,
'learning_rate': 0.02,
'colsample_bytree': 0.5,
'subsample': 0.5,
'min_child_weight': 2,
'early_stopping_rounds': 20
}

[37]: models = {
"catboost": CatBoostRegressor(**catboost_params, verbose=0),
#"xgb": XGBRegressor(**xgboost_params, verbose=0)
}

[38]: for name, model in models.items():


model.fit(train, log_target)
print(name + " trained.")

catboost trained.

# In[39]:
# 10-fold cross-validation of every model on the log-price target.
kf = KFold(n_splits=10)
results = {}
for name, model in models.items():
    # Scores come back as negated MSE, one value per fold.
    neg_mse_per_fold = cross_val_score(
        model,
        train,
        log_target,
        scoring='neg_mean_squared_error',
        cv=kf,
    )
    # RMSE in log space; exponentiating turns it into a multiplicative
    # error factor on the price scale (1.0 would be a perfect fit).
    rmse_log_per_fold = np.sqrt(-neg_mse_per_fold)
    result = np.exp(rmse_log_per_fold)
    results[name] = result

[40]: for name, result in results.items():


print("----------\n" + name)
print(np.mean(result))
print(np.std(result))

----------
catboost
1.1241005620323778
0.018378204411854795

[41]: #Combine predictions


final_predictions = models['catboost'].predict(test)

[42]: # Save test predictions to file


output = pd.DataFrame({'Id': test.index,
'SalePrice': np.exp(final_predictions)})
output.to_csv('submission.csv', index=False)

[ ]:

29

You might also like