Statisitics Project 4

7/3/2021 temp-162530801090469818
In [20]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import matplotlib.pyplot as plt
import matplotlib.style
In [21]:
# reading the CSV file into pandas dataframe
df = pd.read_csv("AirBNB.csv")
C:\Users\hp\anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3
146: DtypeWarning: Columns (5) have mixed types.Specify dtype option on im
port or set low_memory=False.
has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
EDA
In [22]:
# Check top few records to get a feel of the data structure
df.head()
Out[22]:
id room_type accommodates bathrooms cancellation_policy cleaning_fee instan
Entire
0 6901257 3.0 1.0 strict True
home/apt
Entire
1 6304928 7.0 1.0 strict True
home/apt
Entire
2 7919400 5.0 1.0 moderate True
home/apt
Entire
3 13418779 4.0 1.0 flexible True
home/apt
Entire
4 3808709 2.0 1.0 moderate True
home/apt
In [6]:
# drop the id column as it is useless for the model
df = df.drop('id', axis=1)
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/notebook2.html?t=1625308112294 1/13
7/3/2021 temp-162530801090469818
In [23]:
df.head()
Out[23]:
id room_type accommodates bathrooms cancellation_policy cleaning_fee instan
Entire
0 6901257 3.0 1.0 strict True
home/apt
Entire
1 6304928 7.0 1.0 strict True
home/apt
Entire
2 7919400 5.0 1.0 moderate True
home/apt
Entire
3 13418779 4.0 1.0 flexible True
home/apt
Entire
4 3808709 2.0 1.0 moderate True
home/apt
In [24]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74111 entries, 0 to 74110
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 74111 non-null int64
1 room_type 74106 non-null object
2 accommodates 74108 non-null float64
3 bathrooms 73908 non-null float64
4 cancellation_policy 74103 non-null object
5 cleaning_fee 74107 non-null object
6 instant_bookable 74111 non-null object
7 review_scores_rating 57389 non-null float64
8 bedrooms 74019 non-null float64
9 beds 73980 non-null float64
10 log_price 74111 non-null float64
dtypes: float64(6), int64(1), object(4)
memory usage: 6.2+ MB
7/3/2021 temp-162530801090469818
In [25]:
df.describe(include="all")
Out[25]:
id room_type accommodates bathrooms cancellation_policy cleaning
count 7.411100e+04 74106 74108.000000 73908.000000 74103 74
unique NaN 3 NaN NaN 3
Entire
top NaN NaN NaN strict
home/apt
freq NaN 41308 NaN NaN 32500 54
mean 1.126662e+07 NaN 3.155125 1.235272 NaN
std 6.081735e+06 NaN 2.153603 0.582054 NaN
min 3.440000e+02 NaN 1.000000 0.000000 NaN
25% 6.261964e+06 NaN 2.000000 1.000000 NaN
50% 1.225415e+07 NaN 2.000000 1.000000 NaN
75% 1.640226e+07 NaN 4.000000 1.000000 NaN
max 2.123090e+07 NaN 16.000000 8.000000 NaN
unique values for categorical variables
7/3/2021 temp-162530801090469818
In [26]:
for column in df.columns:
if df[column].dtype == 'object':
print(column.upper(),': ',df[column].nunique())
print(df[column].value_counts().sort_values())
print('\n')
ROOM_TYPE : 3
Shared room 2163
Private room 30635
Entire home/apt 41308
Name: room_type, dtype: int64
CANCELLATION_POLICY : 3
moderate 19060
flexible 22543
strict 32500
Name: cancellation_policy, dtype: int64
CLEANING_FEE : 2
False 19708
True 54399
Name: cleaning_fee, dtype: int64
INSTANT_BOOKABLE : 2
t 19451
f 54660
Name: instant_bookable, dtype: int64
Converting Instant_bookable from 'f' and 't' to boolean
In [27]:
df.instant_bookable.replace(['f','t'],[False,True],inplace=True )
Converting categorical to dummy variables
In [29]:
df = pd.get_dummies(df, columns=['room_type','cancellation_policy','cleaning_fee'],drop
_first=True)
7/3/2021 temp-162530801090469818
In [30]:
df.head()
Out[30]:
id accommodates bathrooms instant_bookable review_scores_rating bedrooms
0 6901257 3.0 1.0 False 100.0 1.0
1 6304928 7.0 1.0 True 93.0 3.0
2 7919400 5.0 1.0 True 92.0 1.0
3 13418779 4.0 1.0 False NaN 2.0
4 3808709 2.0 1.0 True 40.0 0.0
In [31]:
# Are there any missing values ?
df.isnull().sum()
Out[31]:
id 0
accommodates 3
bathrooms 203
instant_bookable 0
review_scores_rating 16722
bedrooms 92
beds 131
log_price 0
room_type_Private room 0
room_type_Shared room 0
cancellation_policy_moderate 0
cancellation_policy_strict 0
cleaning_fee_True 0
dtype: int64
Imputing missing values
7/3/2021 temp-162530801090469818
In [32]:
for column in df.columns:
if df[column].dtype != 'object':
mean = df[column].mean()
df[column] = df[column].fillna(mean)
df.isnull().sum()
Out[32]:
id 0
accommodates 0
bathrooms 0
instant_bookable 0
review_scores_rating 0
bedrooms 0
beds 0
log_price 0
room_type_Private room 0
room_type_Shared room 0
cancellation_policy_moderate 0
cancellation_policy_strict 0
cleaning_fee_True 0
dtype: int64
In [33]:
df.head()
Out[33]:
0 6901257 3.0 1.0 False 100.000000 1.0
1 6304928 7.0 1.0 True 93.000000 3.0
2 7919400 5.0 1.0 True 92.000000 1.0
3 13418779 4.0 1.0 False 94.067365 2.0
4 3808709 2.0 1.0 True 40.000000 0.0
7/3/2021 temp-162530801090469818
In [39]:
df.describe()
Out[39]:
id accommodates bathrooms review_scores_rating bedrooms
count 7.411100e+04 74111.000000 74111.000000 74111.000000 74111.000000 74111
mean 1.126662e+07 3.155125 1.235272 94.067365 1.265797 1
std 6.081735e+06 2.153559 0.581256 6.896002 0.851620 1
min 3.440000e+02 1.000000 0.000000 20.000000 0.000000 0
25% 6.261964e+06 2.000000 1.000000 93.000000 1.000000 1
50% 1.225415e+07 2.000000 1.000000 94.067365 1.000000 1
75% 1.640226e+07 4.000000 1.000000 99.000000 1.000000 2
max 2.123090e+07 16.000000 8.000000 100.000000 10.000000 18
Checking for duplicates
In [42]:
# Are there any duplicates ?
dups = df.duplicated()
print('Number of duplicate rows = %d' % (dups.sum()))
#df[dups]
Number of duplicate rows = 0
In [43]:
print('Before',df.shape)
df.drop_duplicates(inplace=True)
print('After',df.shape)
Before (74111, 13)
After (74111, 13)
In [44]:
dups = df.duplicated()
print('Number of duplicate rows = %d' % (dups.sum()))
Number of duplicate rows = 0
Correlation Plot
7/3/2021 temp-162530801090469818
In [45]:
sns.heatmap(df.iloc[:, 0:6].corr(),annot=True)
plt.show()
The variable 'Accommodates', 'bedrooms','bathrooms', 'beds' are having correlation with each other.
In [46]:
df.head()
Out[46]:
0 6901257 3.0 1.0 False 100.000000 1.0
1 6304928 7.0 1.0 True 93.000000 3.0
2 7919400 5.0 1.0 True 92.000000 1.0
3 13418779 4.0 1.0 False 94.067365 2.0
4 3808709 2.0 1.0 True 40.000000 0.0
7/3/2021 temp-162530801090469818
In [47]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 74111 entries, 0 to 74110
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 74111 non-null int64
1 accommodates 74111 non-null float64
2 bathrooms 74111 non-null float64
3 instant_bookable 74111 non-null bool
4 review_scores_rating 74111 non-null float64
5 bedrooms 74111 non-null float64
6 beds 74111 non-null float64
7 log_price 74111 non-null float64
8 room_type_Private room 74111 non-null uint8
9 room_type_Shared room 74111 non-null uint8
10 cancellation_policy_moderate 74111 non-null uint8
11 cancellation_policy_strict 74111 non-null uint8
12 cleaning_fee_True 74111 non-null uint8
dtypes: bool(1), float64(6), int64(1), uint8(5)
memory usage: 4.9 MB
Outlier Checks
7/3/2021 temp-162530801090469818
In [49]:
# construct box plot for continuous variables
cont=df.dtypes[(df.dtypes!='uint8') & (df.dtypes!='bool')].index
plt.figure(figsize=(10,10))
df[cont].boxplot(vert=0)
plt.title('With Outliers',fontsize=16)
plt.show()
7/3/2021 temp-162530801090469818
7/3/2021 temp-162530801090469818
In [50]:
def remove_outlier(col):
sorted(col)
Q1,Q3=np.percentile(col,[25,75])
IQR=Q3-Q1
lower_range= Q1-(1.5 * IQR)
upper_range= Q3+(1.5 * IQR)
return lower_range, upper_range
In [51]:
for column in df[cont].columns:
lr,ur=remove_outlier(df[column])
df[column]=np.where(df[column]>ur,ur,df[column])
df[column]=np.where(df[column]<lr,lr,df[column])
In [52]:
plt.figure(figsize=(10,10))
df[cont].boxplot(vert=0)
plt.title('After Outlier Removal',fontsize=16)
plt.show()
Data Distribution
7/3/2021 temp-162530801090469818
In [53]:
df_attr = (df[cont])
sns.pairplot(df_attr, diag_kind='kde')
plt.show()
C:\Users\hp\anaconda3\lib\site-packages\seaborn\distributions.py:305: User
Warning: Dataset has 0 variance; skipping density estimate.
warnings.warn(msg, UserWarning)
C:\Users\hp\anaconda3\lib\site-packages\seaborn\distributions.py:305: User
Warning: Dataset has 0 variance; skipping density estimate.
warnings.warn(msg, UserWarning)

Statisitics Project 4

Uploaded by

Document Information

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

Statisitics Project 4

Uploaded by

Copyright:

Available Formats

7/3/2021 temp-162530801090469818

import seaborn as sns

from sklearn.linear_model import LinearRegression

from sklearn import metrics

import matplotlib.pyplot as plt

# reading the CSV file into pandas dataframe

has_raised = await self.run_ast_nodes(code_ast.body, cell_name,

# Check top few records to get a feel of the data structure

id room_type accommodates bathrooms cancellation_policy cleaning_fee instan

# drop the id column as it is useless for the model

id room_type accommodates bathrooms cancellation_policy cleaning_fee instan

RangeIndex: 74111 entries, 0 to 74110

Data columns (total 11 columns):

# Column Non-Null Count Dtype

--- ------ -------------- -----

0 id 74111 non-null int64

1 room_type 74106 non-null object

2 accommodates 74108 non-null float64

3 bathrooms 73908 non-null float64

4 cancellation_policy 74103 non-null object

5 cleaning_fee 74107 non-null object

6 instant_bookable 74111 non-null object

7 review_scores_rating 57389 non-null float64

8 bedrooms 74019 non-null float64

9 beds 73980 non-null float64

10 log_price 74111 non-null float64

dtypes: float64(6), int64(1), object(4)

memory usage: 6.2+ MB

id room_type accommodates bathrooms cancellation_policy cleaning

count 7.411100e+04 74106 74108.000000 73908.000000 74103 74

unique NaN 3 NaN NaN 3

freq NaN 41308 NaN NaN 32500 54

mean 1.126662e+07 NaN 3.155125 1.235272 NaN

std 6.081735e+06 NaN 2.153603 0.582054 NaN

min 3.440000e+02 NaN 1.000000 0.000000 NaN

25% 6.261964e+06 NaN 2.000000 1.000000 NaN

50% 1.225415e+07 NaN 2.000000 1.000000 NaN

75% 1.640226e+07 NaN 4.000000 1.000000 NaN

max 2.123090e+07 NaN 16.000000 8.000000 NaN

unique values for categorical variables

for column in df.columns:

Shared room 2163

Private room 30635

Entire home/apt 41308

Name: room_type, dtype: int64

Name: cancellation_policy, dtype: int64

Name: cleaning_fee, dtype: int64

Name: instant_bookable, dtype: int64

Converting Instant_bookable from 'f' and 't' to boolean

Converting categorical to dummy variables

id accommodates bathrooms instant_bookable review_scores_rating bedrooms

0 6901257 3.0 1.0 False 100.0 1.0

1 6304928 7.0 1.0 True 93.0 3.0

2 7919400 5.0 1.0 True 92.0 1.0

3 13418779 4.0 1.0 False NaN 2.0

4 3808709 2.0 1.0 True 40.0 0.0

# Are there any missing values ?

Imputing missing values

for column in df.columns:

id accommodates bathrooms instant_bookable review_scores_rating bedrooms

0 6901257 3.0 1.0 False 100.000000 1.0

1 6304928 7.0 1.0 True 93.000000 3.0

2 7919400 5.0 1.0 True 92.000000 1.0

3 13418779 4.0 1.0 False 94.067365 2.0