Professional Documents
Culture Documents
1
2 Norm 1Fam 1Story 6 8 1976
3 Norm 1Fam 2Story 7 5 2001
4 Norm 1Fam 2Story 7 5 1915
5 Norm 1Fam 2Story 8 5 2000
2
1 1 Gd 8 Typ 0 NaN
2 1 TA 6 Typ 1 TA
3 1 Gd 6 Typ 1 TA
4 1 Gd 7 Typ 1 Gd
5 1 Gd 9 Typ 1 TA
3
1465 120 RL 43.0 5005 Pave NaN IR1
4
1464 324.0 926.0 GasA Ex Y SBrkr
1465 1017.0 1280.0 GasA Ex Y SBrkr
5
1463 0 NaN MnPrv NaN 0 3 2010 WD
1464 0 NaN NaN NaN 0 6 2010 WD
1465 0 NaN NaN NaN 0 1 2010 WD
SaleCondition
Id
1461 Normal
1462 Normal
1463 Normal
1464 Normal
1465 Normal
# Cell [5]: plot the distribution of the 'SalePrice' column of df_train.
# kde=False keeps the plot to bars only, with no density curve overlaid.
sns.displot(data=df_train, x='SalePrice', kde=False)
plt.show()
6
# Cell [7]: draw one box plot per item yielded by iterating `numerucal_data`
# (column labels, if it is a DataFrame -- TODO confirm). The misspelled name
# is defined outside this cell, so it is kept unchanged here.
sns.set_theme(style="whitegrid")  # loop-invariant, so applied once up front
for column in numerucal_data:
    plt.figure(figsize=(16, 4))  # a new wide, short figure for each plot
    sns.boxplot(data=numerucal_data, x=column)
7
8
9
10
11
12
13
14
15
16
# Cell [8]: list the columns that exist in df_train but not in df_test,
# leaving 'SalePrice' out of the comparison.
feat_train_not_test = [
    col
    for col in df_train.columns
    if col not in df_test.columns and col != 'SalePrice'
]
print(feat_train_not_test)
[]
print(feat_train_not_test)
[]
df_merge.head()
17
1464 60 RL 78.0 9978 Pave NaN IR1
1465 120 RL 43.0 5005 Pave NaN IR1
18
1463 137.0 928.0 GasA Gd Y SBrkr
1464 324.0 926.0 GasA Ex Y SBrkr
1465 1017.0 1280.0 GasA Ex Y SBrkr
19
1462 0 NaN NaN Gar2 12500 6 2010 WD
1463 0 NaN MnPrv NaN 0 3 2010 WD
1464 0 NaN NaN NaN 0 6 2010 WD
1465 0 NaN NaN NaN 0 1 2010 WD
[11]: MSSubClass 0
MSZoning 4
LotFrontage 486
LotArea 0
Street 0
Alley 2721
LotShape 0
LandContour 0
Utilities 2
LotConfig 0
LandSlope 0
Neighborhood 0
Condition1 0
Condition2 0
BldgType 0
HouseStyle 0
OverallQual 0
OverallCond 0
YearBuilt 0
YearRemodAdd 0
RoofStyle 0
RoofMatl 0
Exterior1st 1
Exterior2nd 1
MasVnrType 24
MasVnrArea 23
ExterQual 0
ExterCond 0
Foundation 0
BsmtQual 81
BsmtCond 82
20
BsmtExposure 82
BsmtFinType1 79
BsmtFinSF1 1
BsmtFinType2 80
BsmtFinSF2 1
BsmtUnfSF 1
TotalBsmtSF 1
Heating 0
HeatingQC 0
CentralAir 0
Electrical 1
1stFlrSF 0
2ndFlrSF 0
LowQualFinSF 0
GrLivArea 0
BsmtFullBath 2
BsmtHalfBath 2
FullBath 0
HalfBath 0
BedroomAbvGr 0
KitchenAbvGr 0
KitchenQual 1
TotRmsAbvGrd 0
Functional 2
Fireplaces 0
FireplaceQu 1420
GarageType 157
GarageYrBlt 159
GarageFinish 159
GarageCars 1
GarageArea 1
GarageQual 159
GarageCond 159
PavedDrive 0
WoodDeckSF 0
OpenPorchSF 0
EnclosedPorch 0
3SsnPorch 0
ScreenPorch 0
PoolArea 0
PoolQC 2909
Fence 2348
MiscFeature 2814
MiscVal 0
MoSold 0
YrSold 0
SaleType 1
21
SaleCondition 0
ind 0
SalePrice 1459
dtype: int64
[12]: # Get a list of all the categorical features that have the keyword 'Qual' OR
# 'Cond' OR 'Qu' OR 'QC' in the feature name
↪dtypes=='object']
# Object-dtype columns whose name contains 'Qu' and that are not already
# present in feat_rating_Qual.
feat_rating_Qu = [
    col
    for col in df_merge.columns
    if 'Qu' in col
    and col not in feat_rating_Qual
    and df_merge[col].dtypes == 'object'
]
print(cat_feat_with_rating)
df_merge[cat_feat_with_legit_na].head()
Fence MiscFeature
Id
1461 MnPrv NaN
1462 NaN Gar2
1463 MnPrv NaN
1464 NaN NaN
1465 NaN NaN
22
[14]: ExterQual BsmtQual KitchenQual GarageQual ExterCond BsmtCond GarageCond \
Id
1461 TA TA TA TA TA TA TA
1462 TA TA Gd TA TA TA TA
1463 TA Gd TA TA TA TA TA
1464 TA TA Gd TA TA TA TA
1465 Gd Gd Gd TA TA TA TA
[15]: ExterQual 0
BsmtQual 0
KitchenQual 0
GarageQual 0
ExterCond 0
BsmtCond 0
GarageCond 0
FireplaceQu 0
HeatingQC 0
PoolQC 0
Alley 0
BsmtExposure 0
BsmtFinType1 0
BsmtFinType2 0
GarageType 0
GarageFinish 0
Fence 0
MiscFeature 0
dtype: int64
23
# Cell [16]: names of every object-dtype column of df_merge except 'ind'.
cat_cols = [
    col
    for col in df_merge
    if df_merge[col].dtypes == 'object' and col != 'ind'
]
# Cell [17]: handle missing values in numerical features by replacing them
# with the column mean.
# NOTE(review): the missing-value summary printed earlier lists SalePrice with
# 1459 NaNs; if that column is still non-object here, those rows are
# mean-filled as well -- confirm that is intended.
num_cols = [col for col in df_merge.columns if df_merge[col].dtypes != 'object']
column_means = df_merge[num_cols].mean()  # one mean per numerical column
df_merge[num_cols] = df_merge[num_cols].fillna(column_means)
24
[25]: skew_df
[26]: df_merge[num_cols].describe()
25
mean 36.479959 23.528263 102.201312 560.772104 1051.777587
std 30.336182 20.892061 178.626089 439.468337 440.690726
min -1.000000 -2.000000 0.000000 0.000000 0.000000
25% 7.000000 4.000000 0.000000 220.000000 793.000000
50% 35.000000 15.000000 0.000000 467.000000 990.000000
75% 54.500000 43.000000 163.500000 805.000000 1302.000000
max 136.000000 60.000000 1600.000000 2336.000000 6110.000000
26
# Cell [27]: replace each feature flagged as skewed in skew_df with log(1 + x).
# NOTE(review): the describe() output shows numerical columns with minima of
# -1 and -2; np.log1p gives -inf at -1 and NaN below it -- confirm none of
# those columns carry Skewed == True.
for column in skew_df.query("Skewed == True")['Feature'].values:
    df_merge[column] = np.log1p(df_merge[column])
[28]: df_merge[cat_feat_with_rating]
# Cell [30]: integer-encode ordered categorical features.
# Exclude 'Alley', 'MiscFeature' and 'GarageType' as they are not ordinal.
bsmt_fin_type_levels = {
    "Missing": 0, "Unf": 1, "LwQ": 2, "Rec": 3, "BLQ": 4, "ALQ": 5, "GLQ": 6,
}
bsmt_ordinal_levels = {
    'BsmtExposure': {"Missing": 0, "No": 1, "Mn": 2, "Av": 3, "Gd": 4},
    # Both finish-type columns share one level table.
    'BsmtFinType1': bsmt_fin_type_levels,
    'BsmtFinType2': bsmt_fin_type_levels,
}
for column, levels in bsmt_ordinal_levels.items():
    # A value absent from `levels` maps to NaN, which makes astype('int') raise.
    df_merge[column] = df_merge[column].map(levels).astype('int')
27
# Integer-encode further categorical columns, one explicit level table each.
ordinal_levels = {
    'GarageFinish': {"Missing": 0, "Unf": 1, "RFn": 2, "Fin": 3},
    'Fence': {"Missing": 0, "MnWw": 1, "GdWo": 2, "MnPrv": 3, "GdPrv": 4},
    'LandContour': {"Low": 1, "Bnk": 2, "HLS": 3, "Lvl": 4},
    'Utilities': {"ELO": 1, "NoSeWa": 2, "NoSewr": 3, "AllPub": 4},
    'LandSlope': {"Sev": 1, "Mod": 2, "Gtl": 3},
    'CentralAir': {"N": 0, "Y": 1},
    'PavedDrive': {"N": 0, "P": 1, "Y": 2},
}
for column, levels in ordinal_levels.items():
    # A value absent from `levels` maps to NaN, which makes astype('int') raise.
    df_merge[column] = df_merge[column].map(levels).astype('int')
# Print the contents of cat_remaining_to_encode (defined outside this view).
print(cat_remaining_to_encode)
# Drop those columns from df_merge in place (axis=1 selects columns).
df_merge.drop(cat_remaining_to_encode,axis=1,inplace=True)
# Append the columns of df_merge_dummies side by side.
# presumably df_merge_dummies is the one-hot encoding of the dropped columns,
# built on a line not shown here -- verify
df_merge = pd.concat([df_merge,df_merge_dummies],axis=1)
# Cell [36]: keyword arguments unpacked into CatBoostRegressor(**catboost_params)
# where the `models` dict is built.
# NOTE(review): early stopping normally needs a validation set at fit time;
# the visible cross_val_score call does not supply one -- confirm this
# setting has any effect.
catboost_params = {
    'iterations': 5000,
    'learning_rate': 0.02,
    'depth': 4,
    'eval_metric': 'RMSE',
    'early_stopping_rounds': 20,
}
# Hyper-parameters meant for XGBRegressor(**xgboost_params); the only visible
# use is the commented-out "xgb" entry of the `models` dict.
xgboost_params = dict(
    n_estimators=5000,
    learning_rate=0.02,
    colsample_bytree=0.5,
    subsample=0.5,
    min_child_weight=2,
    early_stopping_rounds=20,
)
# Cell [37]: estimators to evaluate, keyed by name. verbose=0 is passed to
# CatBoostRegressor alongside the shared parameter dict.
models = {
    "catboost": CatBoostRegressor(**catboost_params, verbose=0),
    # "xgb": XGBRegressor(**xgboost_params, verbose=0)
}
catboost trained.
# Cell [39]: score every entry of `models` with 10-fold cross-validation and
# keep the per-fold scores in `results`, keyed by model name.
results = {}
kf = KFold(n_splits=10)
for name, model in models.items():
    # The scorer returns negated MSE, so flip the sign before the square root.
    neg_mse = cross_val_score(
        model, train, log_target, scoring='neg_mean_squared_error', cv=kf
    )
    fold_rmse = np.sqrt(-neg_mse)
    # presumably log_target is a log-transformed price, so exp() of the RMSE
    # reads as a multiplicative error factor -- TODO confirm
    result = np.exp(fold_rmse)
    results[name] = result
----------
catboost
1.1241005620323778
0.018378204411854795
[ ]:
29