Professional Documents
Culture Documents
Airbnb Inc is an online marketplace for arranging or offering lodging, primarily homestays, or tourism experiences. Airbnb has close to 150
million customers across the world. Price is the most important factor customers consider when booking a property.
Strategic pricing of the properties is important to avoid losing customers to the competitors.
We have data on 74111 Airbnb properties across nations. Based on this data, build simple and multiple linear regression models to predict
the strategic price of a newly listed property on Airbnb.
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import matplotlib.pyplot as plt
import matplotlib.style
Importing data
# Load the raw Airbnb listings from the CSV file into a DataFrame.
listings_csv = "AirBNB.csv"
df = pd.read_csv(listings_csv)
EDA
# Preview the first rows to get a feel for the data structure.
df.head()
# NOTE(review): the original comment here said the id column is dropped as
# useless for the model, but no drop statement appears in this section —
# confirm whether that step was lost or the CSV no longer has an id column.
df.head()
# Column dtypes and non-null counts.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74111 entries, 0 to 74110
Data columns (total 10 columns):
room_type 74106 non-null object
accommodates 74108 non-null float64
bathrooms 73908 non-null float64
cancellation_policy 74103 non-null object
cleaning_fee 74107 non-null object
instant_bookable 74111 non-null object
review_scores_rating 57389 non-null float64
bedrooms 74019 non-null float64
beds 73980 non-null float64
log_price 74111 non-null float64
dtypes: float64(6), object(4)
memory usage: 5.7+ MB
df.describe(include="all")
ROOM_TYPE : 3
Shared room 2163
Private room 30635
Entire home/apt 41308
Name: room_type, dtype: int64
CANCELLATION_POLICY : 3
moderate 19060
flexible 22543
strict 32500
Name: cancellation_policy, dtype: int64
CLEANING_FEE : 2
False 19708
True 54399
Name: cleaning_fee, dtype: int64
INSTANT_BOOKABLE : 2
t 19451
f 54660
Name: instant_bookable, dtype: int64
# NOTE(review): astype() returns a new Series and the result is not assigned,
# so this line leaves df unchanged — confirm the intended conversion.
df.instant_bookable.astype("object")
df.head()
# Are there any missing values ?
df.isnull().sum()
accommodates 3
bathrooms 203
instant_bookable 0
review_scores_rating 16722
bedrooms 92
beds 131
log_price 0
room_type_Private room 0
room_type_Shared room 0
cancellation_policy_moderate 0
cancellation_policy_strict 0
cleaning_fee_True 0
dtype: int64
# Fill the missing values of every non-object column with that column's mean.
non_object_cols = df.columns[df.dtypes != 'object']
for col_name in non_object_cols:
    col_mean = df[col_name].mean()
    df[col_name] = df[col_name].fillna(col_mean)
df.isnull().sum()
accommodates 0
bathrooms 0
instant_bookable 0
review_scores_rating 0
bedrooms 0
beds 0
log_price 0
room_type_Private room 0
room_type_Shared room 0
cancellation_policy_moderate 0
cancellation_policy_strict 0
cleaning_fee_True 0
dtype: int64
df.head()
df.describe()
# Are there any duplicates? Report them, drop them in place, then re-check.
dups = df.duplicated()
dup_count = dups.sum()
print('Number of duplicate rows = %d' % (dup_count))

print('Before', df.shape)
df.drop_duplicates(inplace=True)
print('After', df.shape)

# After the drop this count is expected to be zero.
dups = df.duplicated()
dup_count = dups.sum()
print('Number of duplicate rows = %d' % (dup_count))
Correlation Plot
# Annotated heatmap of the pairwise correlations among the first six columns.
corr_matrix = df.iloc[:, 0:6].corr()
sns.heatmap(corr_matrix, annot=True)
plt.show()
The variables 'accommodates', 'bedrooms', 'bathrooms' and 'beds' are correlated with each other.
df.head()
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 54113 entries, 0 to 74110
Data columns (total 12 columns):
accommodates 54113 non-null float64
bathrooms 54113 non-null float64
instant_bookable 54113 non-null bool
review_scores_rating 54113 non-null float64
bedrooms 54113 non-null float64
beds 54113 non-null float64
log_price 54113 non-null float64
room_type_Private room 54113 non-null uint8
room_type_Shared room 54113 non-null uint8
cancellation_policy_moderate 54113 non-null uint8
cancellation_policy_strict 54113 non-null uint8
cleaning_fee_True 54113 non-null uint8
dtypes: bool(1), float64(6), uint8(5)
memory usage: 3.2 MB
Outlier Checks
# Box plots of the continuous variables: every column whose dtype is
# neither uint8 nor bool.
col_dtypes = df.dtypes
is_uint8_or_bool = (col_dtypes == 'uint8') | (col_dtypes == 'bool')
cont = col_dtypes[~is_uint8_or_bool].index

plt.figure(figsize=(10, 10))
df[cont].boxplot(vert=0)
plt.title('With Outliers', fontsize=16)
plt.show()
def remove_outlier(col, whisker=1.5):
    """Return the IQR-based lower and upper bounds for *col*.

    Despite its name, this function does not remove anything: it only
    computes the fences ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR``.
    The caller decides what to do with values outside them.

    Args:
        col: One-dimensional array-like of numeric values (passed to
            ``np.percentile``). It is not modified.
        whisker: Multiplier applied to the IQR; the default of 1.5 keeps
            the previously hard-coded behaviour.

    Returns:
        Tuple ``(lower_range, upper_range)``.
    """
    # np.percentile does not need pre-sorted input, so no sorting is done
    # here (a bare ``sorted(col)`` would only build and discard a list).
    q1, q3 = np.percentile(col, [25, 75])
    iqr = q3 - q1
    lower_range = q1 - (whisker * iqr)
    upper_range = q3 + (whisker * iqr)
    return lower_range, upper_range
# Cap every continuous column at its IQR fences (values beyond a fence are
# set to the fence), then redraw the box plots.
for col_name in cont:
    lower_fence, upper_fence = remove_outlier(df[col_name])
    df[col_name] = df[col_name].clip(lower=lower_fence, upper=upper_fence)

plt.figure(figsize=(10, 10))
df[cont].boxplot(vert=0)
plt.title('After Outlier Removal', fontsize=16)
plt.show()
Data Distribution
# Pairwise scatter plots of the continuous variables, with a KDE on the diagonal.
df_attr = df[cont]
sns.pairplot(data=df_attr, diag_kind='kde')
plt.show()
Train-Test Split
from sklearn.model_selection import train_test_split

target_col = 'log_price'
# Predictors: every column except the target.
X = df.drop(columns=target_col)
# Target, kept as a one-column DataFrame.
y = df[[target_col]]
X.head()
# 75:25 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)
# Import statsmodels and add the constant column to both the training and
# the test predictors before building the regression models.
import statsmodels.api as sm
X_train=sm.add_constant(X_train)# adds the constant term (beta0) column to the training predictors
X_test=sm.add_constant(X_test)
# Let us build the model
# Start by checking the variance inflation factor (VIF) of each predictor.
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif_values = [
    variance_inflation_factor(X_train.values, col_pos)
    for col_pos in range(X_train.shape[1])
]
vif_series1 = pd.Series(vif_values, index=X_train.columns)
print("VIF values: \n\n{}\n".format(vif_series1))
As a few predictors have VIF values > 2, there is some multicollinearity in the data.
Among the collinear predictors, we remove those whose removal has the least impact on the adjusted R2.
X_train.columns
# 1) Refit without predictor 'accommodates' (VIF > 2) and report the fit.
X_train2 = X_train.drop(columns=["accommodates"])
olsmod_1 = sm.OLS(y_train, X_train2)
olsres_1 = olsmod_1.fit()

r_squared = np.round(olsres_1.rsquared, 3)
adj_r_squared = np.round(olsres_1.rsquared_adj, 3)
print("R-squared:", r_squared, "\nAdjusted R-squared:", adj_r_squared)
# Difference between two fit scores — presumably the adjusted R-squared with
# and without 'accommodates'; TODO confirm which run produced 0.508.
0.508-0.495
# 'accommodates' seems to be an important predictor, so we choose not to drop it.
# 2) Removing predictor 'beds' as VIF>2
# NOTE(review): the refit without 'beds' is not present in this section.
0.508-0.507
# The effect on the adjusted R2 is very small, so we can choose to drop 'beds'.
# Drop 'beds' from the training predictors and rebuild the OLS model.
X_train = X_train.drop(["beds"], axis=1)
olsmod_5 = sm.OLS(y_train, X_train)
# Fit the model: olsres_5 is printed further down but was never created.
olsres_5 = olsmod_5.fit()

# Re-check the VIF of the remaining predictors after removing 'beds'.
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif_series1 = pd.Series(
    [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])],
    index=X_train.columns,
)
print("VIF values: \n\n{}\n".format(vif_series1))
Now that we do not have multicollinearity in our data, the p-values of the coefficients have become reliable and we
can remove the non-significant predictor variables.
print(olsres_5.summary())
As observed above, the predictor 'cancellation_policy_strict' has a p-value > 0.05, so we remove it and rebuild
the model
After dropping the features causing strong multicollinearity and the statistically insignificant ones, our model
performance hasn't dropped sharply . This shows that these variables did not have much predictive power.
Testing the Assumptions of Linear Regression
# let us plot the fitted values vs residuals
import statsmodels.stats.api as sms
# Write out the fitted linear regression equation on a single line:
#   log_price = <first param> + <coef> * ( <column> ) + ...
Equation = "log_price ="
fitted_params = olsres_6.params
# Use .iloc for positional access: params is indexed by column name, and
# integer keys on a label-indexed Series are deprecated as positions.
# The first parameter is printed on its own, without a column name.
terms = [str(fitted_params.iloc[0])]
terms.extend(
    "{} * ( {} )".format(coef, col_name)
    for coef, col_name in zip(fitted_params.iloc[1:], X_train5.columns[1:])
)
# Joining the terms also terminates the line properly when there is only
# a single parameter.
print(Equation, " + ".join(terms))
Observation
###
A 1-unit increase in the number of bathrooms leads to a 0.2 increase in the log price, holding the other predictors fixed
A 1-unit increase in the number of bedrooms leads to a 0.14 increase in the log price, holding the other predictors fixed
If instant booking is available for a property, the log price decreases by 0.06
If a cleaning fee is charged for a property, the log price decreases by 0.08
Predictions
X_train5.columns
X_test.columns
# Keep only the test columns that the final model was trained on.
# NOTE(review): the statements defining X_test2, y_pred_train and y_pred_test
# were missing here although the names are used below; they are reconstructed
# from the surrounding comments and the olsres_6 / X_train5 names used for the
# regression equation — confirm against the original notebook.
X_test2 = X_test[X_train5.columns]
# Predictions on the train and test sets
y_pred_train = olsres_6.predict(X_train5)
y_pred_test = olsres_6.predict(X_test2)
# To check model performance
from sklearn.metrics import mean_absolute_error, mean_squared_error

# RMSE on the train data
rmse1 = np.sqrt(mean_squared_error(y_train, y_pred_train))
rmse1
# RMSE on the test data
rmse2 = np.sqrt(mean_squared_error(y_test, y_pred_test))
rmse2
# Create the sklearn LinearRegression model and fit it on the training data.
# (The model has to exist and be fitted before its coefficients are read.)
regression_model = LinearRegression()
regression_model.fit(X_train5, y_train)
# Coefficient of each independent attribute; coef_ is indexed as [0][idx],
# so its first row is paired with the column names.
for col_name, coef in zip(X_train5.columns, regression_model.coef_[0]):
    print("The coefficient for {} is {}".format(col_name, coef))
# Intercept of the model
intercept = regression_model.intercept_[0]
print("The intercept for our model is {}".format(intercept))
# R square on training data
regression_model.score(X_train5, y_train)
0.5073800293325945
50% of the variation in the log_price is explained by the predictors in the model for train set
# R square on testing data
regression_model.score(X_test2, y_test)
0.5011325551678224
# RMSE on training data. The model is reused as-is: refitting it on the same
# data before every prediction is redundant work.
predicted_train = regression_model.predict(X_train5)
np.sqrt(metrics.mean_squared_error(y_train, predicted_train))
0.5164036750698143
# RMSE on testing data. The model is reused as-is: refitting it on the
# training data before every prediction is redundant work.
predicted_test = regression_model.predict(X_test2)
np.sqrt(metrics.mean_squared_error(y_test, predicted_test))
0.5202270256722598
# Create the sklearn LinearRegression model and fit it once on the training data.
regression_model = LinearRegression()
regression_model.fit(X_train5, y_train)
# Coefficient of each independent attribute; coef_ is indexed as [0][idx],
# so its first row is paired with the column names.
for col_name, coef in zip(X_train5.columns, regression_model.coef_[0]):
    print("The coefficient for {} is {}".format(col_name, coef))
# Intercept of the model
intercept = regression_model.intercept_[0]
print("The intercept for our model is {}".format(intercept))
# R square on training data
regression_model.score(X_train5, y_train)
# 50% of the variation in the log_price is explained by the predictors in the model for train set
# R square on testing data
regression_model.score(X_test2, y_test)
# RMSE on training data; the model fitted above is reused instead of being
# refitted before each prediction.
predicted_train = regression_model.predict(X_train5)
np.sqrt(metrics.mean_squared_error(y_train, predicted_train))
# RMSE on testing data
predicted_test = regression_model.predict(X_test2)
np.sqrt(metrics.mean_squared_error(y_test, predicted_test))