You are on page 1of 13

7/3/2021 temp-162530801090469818

In [20]:

import numpy as np

import pandas as pd

import seaborn as sns

from sklearn.linear_model import LinearRegression

from sklearn import metrics

import matplotlib.pyplot as plt

import matplotlib.style

In [21]:

# reading the CSV file into pandas dataframe

df = pd.read_csv("AirBNB.csv")

C:\Users\hp\anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3
146: DtypeWarning: Columns (5) have mixed types.Specify dtype option on im
port or set low_memory=False.

has_raised = await self.run_ast_nodes(code_ast.body, cell_name,

EDA

In [22]:

# Check top few records to get a feel of the data structure

df.head()

Out[22]:

id room_type accommodates bathrooms cancellation_policy cleaning_fee instan

Entire
0 6901257 3.0 1.0 strict True
home/apt

Entire
1 6304928 7.0 1.0 strict True
home/apt

Entire
2 7919400 5.0 1.0 moderate True
home/apt

Entire
3 13418779 4.0 1.0 flexible True
home/apt

Entire
4 3808709 2.0 1.0 moderate True
home/apt

In [6]:

# drop the id column as it is useless for the model

df = df.drop('id', axis=1)

https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/notebook2.html?t=1625308112294 1/13
7/3/2021 temp-162530801090469818

In [23]:

df.head()

Out[23]:

id room_type accommodates bathrooms cancellation_policy cleaning_fee instan

Entire
0 6901257 3.0 1.0 strict True
home/apt

Entire
1 6304928 7.0 1.0 strict True
home/apt

Entire
2 7919400 5.0 1.0 moderate True
home/apt

Entire
3 13418779 4.0 1.0 flexible True
home/apt

Entire
4 3808709 2.0 1.0 moderate True
home/apt

In [24]:

df.info()

<class 'pandas.core.frame.DataFrame'>

RangeIndex: 74111 entries, 0 to 74110

Data columns (total 11 columns):

# Column Non-Null Count Dtype

--- ------ -------------- -----

0 id 74111 non-null int64

1 room_type 74106 non-null object

2 accommodates 74108 non-null float64

3 bathrooms 73908 non-null float64

4 cancellation_policy 74103 non-null object

5 cleaning_fee 74107 non-null object

6 instant_bookable 74111 non-null object

7 review_scores_rating 57389 non-null float64

8 bedrooms 74019 non-null float64

9 beds 73980 non-null float64

10 log_price 74111 non-null float64

dtypes: float64(6), int64(1), object(4)

memory usage: 6.2+ MB

https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/notebook2.html?t=1625308112294 2/13
7/3/2021 temp-162530801090469818

In [25]:

df.describe(include="all")

Out[25]:

id room_type accommodates bathrooms cancellation_policy cleaning

count 7.411100e+04 74106 74108.000000 73908.000000 74103 74

unique NaN 3 NaN NaN 3

Entire
top NaN NaN NaN strict
home/apt

freq NaN 41308 NaN NaN 32500 54

mean 1.126662e+07 NaN 3.155125 1.235272 NaN

std 6.081735e+06 NaN 2.153603 0.582054 NaN

min 3.440000e+02 NaN 1.000000 0.000000 NaN

25% 6.261964e+06 NaN 2.000000 1.000000 NaN

50% 1.225415e+07 NaN 2.000000 1.000000 NaN

75% 1.640226e+07 NaN 4.000000 1.000000 NaN

max 2.123090e+07 NaN 16.000000 8.000000 NaN

unique values for categorical variables

https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/notebook2.html?t=1625308112294 3/13
7/3/2021 temp-162530801090469818

In [26]:

for column in df.columns:

if df[column].dtype == 'object':

print(column.upper(),': ',df[column].nunique())

print(df[column].value_counts().sort_values())

print('\n')

ROOM_TYPE : 3

Shared room 2163

Private room 30635

Entire home/apt 41308

Name: room_type, dtype: int64

CANCELLATION_POLICY : 3

moderate 19060

flexible 22543

strict 32500

Name: cancellation_policy, dtype: int64

CLEANING_FEE : 2

False 19708

True 54399

Name: cleaning_fee, dtype: int64

INSTANT_BOOKABLE : 2

t 19451

f 54660

Name: instant_bookable, dtype: int64

Converting Instant_bookable from 'f' and 't' to boolean

In [27]:

df.instant_bookable.replace(['f','t'],[False,True],inplace=True )

Converting categorical to dummy variables

In [29]:

df = pd.get_dummies(df, columns=['room_type','cancellation_policy','cleaning_fee'],drop
_first=True)

https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/notebook2.html?t=1625308112294 4/13
7/3/2021 temp-162530801090469818

In [30]:

df.head()

Out[30]:

id accommodates bathrooms instant_bookable review_scores_rating bedrooms

0 6901257 3.0 1.0 False 100.0 1.0

1 6304928 7.0 1.0 True 93.0 3.0

2 7919400 5.0 1.0 True 92.0 1.0

3 13418779 4.0 1.0 False NaN 2.0

4 3808709 2.0 1.0 True 40.0 0.0

In [31]:

# Are there any missing values ?

df.isnull().sum()

Out[31]:

id 0

accommodates 3

bathrooms 203

instant_bookable 0

review_scores_rating 16722

bedrooms 92

beds 131

log_price 0

room_type_Private room 0

room_type_Shared room 0

cancellation_policy_moderate 0

cancellation_policy_strict 0

cleaning_fee_True 0

dtype: int64

Imputing missing values

https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/notebook2.html?t=1625308112294 5/13
7/3/2021 temp-162530801090469818

In [32]:

for column in df.columns:

if df[column].dtype != 'object':

mean = df[column].mean()

df[column] = df[column].fillna(mean)

df.isnull().sum()

Out[32]:

id 0

accommodates 0

bathrooms 0

instant_bookable 0

review_scores_rating 0

bedrooms 0

beds 0

log_price 0

room_type_Private room 0

room_type_Shared room 0

cancellation_policy_moderate 0

cancellation_policy_strict 0

cleaning_fee_True 0

dtype: int64

In [33]:

df.head()

Out[33]:

id accommodates bathrooms instant_bookable review_scores_rating bedrooms

0 6901257 3.0 1.0 False 100.000000 1.0

1 6304928 7.0 1.0 True 93.000000 3.0

2 7919400 5.0 1.0 True 92.000000 1.0

3 13418779 4.0 1.0 False 94.067365 2.0

4 3808709 2.0 1.0 True 40.000000 0.0

https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/notebook2.html?t=1625308112294 6/13
7/3/2021 temp-162530801090469818

In [39]:

df.describe()

Out[39]:

id accommodates bathrooms review_scores_rating bedrooms

count 7.411100e+04 74111.000000 74111.000000 74111.000000 74111.000000 74111

mean 1.126662e+07 3.155125 1.235272 94.067365 1.265797 1

std 6.081735e+06 2.153559 0.581256 6.896002 0.851620 1

min 3.440000e+02 1.000000 0.000000 20.000000 0.000000 0

25% 6.261964e+06 2.000000 1.000000 93.000000 1.000000 1

50% 1.225415e+07 2.000000 1.000000 94.067365 1.000000 1

75% 1.640226e+07 4.000000 1.000000 99.000000 1.000000 2

max 2.123090e+07 16.000000 8.000000 100.000000 10.000000 18

Checking for duplicates

In [42]:

# Are there any duplicates ?

dups = df.duplicated()

print('Number of duplicate rows = %d' % (dups.sum()))

#df[dups]

Number of duplicate rows = 0

In [43]:

print('Before',df.shape)

df.drop_duplicates(inplace=True)

print('After',df.shape)

Before (74111, 13)

After (74111, 13)

In [44]:

dups = df.duplicated()

print('Number of duplicate rows = %d' % (dups.sum()))

Number of duplicate rows = 0

Correlation Plot

https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/notebook2.html?t=1625308112294 7/13
7/3/2021 temp-162530801090469818

In [45]:

sns.heatmap(df.iloc[:, 0:6].corr(),annot=True)

plt.show()

The variable 'Accommodates', 'bedrooms','bathrooms', 'beds' are having correlation with each other.

In [46]:

df.head()

Out[46]:

id accommodates bathrooms instant_bookable review_scores_rating bedrooms

0 6901257 3.0 1.0 False 100.000000 1.0

1 6304928 7.0 1.0 True 93.000000 3.0

2 7919400 5.0 1.0 True 92.000000 1.0

3 13418779 4.0 1.0 False 94.067365 2.0

4 3808709 2.0 1.0 True 40.000000 0.0

https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/notebook2.html?t=1625308112294 8/13
7/3/2021 temp-162530801090469818

In [47]:

df.info()

<class 'pandas.core.frame.DataFrame'>

Int64Index: 74111 entries, 0 to 74110

Data columns (total 13 columns):

# Column Non-Null Count Dtype

--- ------ -------------- -----

0 id 74111 non-null int64

1 accommodates 74111 non-null float64

2 bathrooms 74111 non-null float64

3 instant_bookable 74111 non-null bool

4 review_scores_rating 74111 non-null float64

5 bedrooms 74111 non-null float64

6 beds 74111 non-null float64

7 log_price 74111 non-null float64

8 room_type_Private room 74111 non-null uint8

9 room_type_Shared room 74111 non-null uint8

10 cancellation_policy_moderate 74111 non-null uint8

11 cancellation_policy_strict 74111 non-null uint8

12 cleaning_fee_True 74111 non-null uint8

dtypes: bool(1), float64(6), int64(1), uint8(5)

memory usage: 4.9 MB

Outlier Checks

https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/notebook2.html?t=1625308112294 9/13
7/3/2021 temp-162530801090469818

In [49]:

# construct box plot for continuous variables

cont=df.dtypes[(df.dtypes!='uint8') & (df.dtypes!='bool')].index

plt.figure(figsize=(10,10))

df[cont].boxplot(vert=0)

plt.title('With Outliers',fontsize=16)

plt.show()

https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/notebook2.html?t=1625308112294 10/13
7/3/2021 temp-162530801090469818

https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/notebook2.html?t=1625308112294 11/13
7/3/2021 temp-162530801090469818

In [50]:

def remove_outlier(col):

sorted(col)
Q1,Q3=np.percentile(col,[25,75])

IQR=Q3-Q1

lower_range= Q1-(1.5 * IQR)

upper_range= Q3+(1.5 * IQR)

return lower_range, upper_range

In [51]:

for column in df[cont].columns:

lr,ur=remove_outlier(df[column])

df[column]=np.where(df[column]>ur,ur,df[column])

df[column]=np.where(df[column]<lr,lr,df[column])

In [52]:

plt.figure(figsize=(10,10))

df[cont].boxplot(vert=0)

plt.title('After Outlier Removal',fontsize=16)

plt.show()

Data Distribution

https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/notebook2.html?t=1625308112294 12/13
7/3/2021 temp-162530801090469818

In [53]:

df_attr = (df[cont])

sns.pairplot(df_attr, diag_kind='kde')

plt.show()

C:\Users\hp\anaconda3\lib\site-packages\seaborn\distributions.py:305: User
Warning: Dataset has 0 variance; skipping density estimate.

warnings.warn(msg, UserWarning)

C:\Users\hp\anaconda3\lib\site-packages\seaborn\distributions.py:305: User
Warning: Dataset has 0 variance; skipping density estimate.

warnings.warn(msg, UserWarning)

https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/notebook2.html?t=1625308112294 13/13

You might also like