Professional Documents
Culture Documents
Statisitics Project 4
Statisitics Project 4
In [20]:
import numpy as np
import pandas as pd
import matplotlib.style
In [21]:
df = pd.read_csv("AirBNB.csv")
C:\Users\hp\anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3
146: DtypeWarning: Columns (5) have mixed types.Specify dtype option on im
port or set low_memory=False.
EDA
In [22]:
df.head()
Out[22]:
Entire
0 6901257 3.0 1.0 strict True
home/apt
Entire
1 6304928 7.0 1.0 strict True
home/apt
Entire
2 7919400 5.0 1.0 moderate True
home/apt
Entire
3 13418779 4.0 1.0 flexible True
home/apt
Entire
4 3808709 2.0 1.0 moderate True
home/apt
In [6]:
df = df.drop('id', axis=1)
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/notebook2.html?t=1625308112294 1/13
7/3/2021 temp-162530801090469818
In [23]:
df.head()
Out[23]:
Entire
0 6901257 3.0 1.0 strict True
home/apt
Entire
1 6304928 7.0 1.0 strict True
home/apt
Entire
2 7919400 5.0 1.0 moderate True
home/apt
Entire
3 13418779 4.0 1.0 flexible True
home/apt
Entire
4 3808709 2.0 1.0 moderate True
home/apt
In [24]:
df.info()
<class 'pandas.core.frame.DataFrame'>
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/notebook2.html?t=1625308112294 2/13
7/3/2021 temp-162530801090469818
In [25]:
df.describe(include="all")
Out[25]:
Entire
top NaN NaN NaN strict
home/apt
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/notebook2.html?t=1625308112294 3/13
7/3/2021 temp-162530801090469818
In [26]:
if df[column].dtype == 'object':
print(column.upper(),': ',df[column].nunique())
print(df[column].value_counts().sort_values())
print('\n')
ROOM_TYPE : 3
CANCELLATION_POLICY : 3
moderate 19060
flexible 22543
strict 32500
CLEANING_FEE : 2
False 19708
True 54399
INSTANT_BOOKABLE : 2
t 19451
f 54660
In [27]:
df.instant_bookable.replace(['f','t'],[False,True],inplace=True )
In [29]:
df = pd.get_dummies(df, columns=['room_type','cancellation_policy','cleaning_fee'],drop
_first=True)
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/notebook2.html?t=1625308112294 4/13
7/3/2021 temp-162530801090469818
In [30]:
df.head()
Out[30]:
In [31]:
df.isnull().sum()
Out[31]:
id 0
accommodates 3
bathrooms 203
instant_bookable 0
review_scores_rating 16722
bedrooms 92
beds 131
log_price 0
room_type_Private room 0
room_type_Shared room 0
cancellation_policy_moderate 0
cancellation_policy_strict 0
cleaning_fee_True 0
dtype: int64
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/notebook2.html?t=1625308112294 5/13
7/3/2021 temp-162530801090469818
In [32]:
if df[column].dtype != 'object':
mean = df[column].mean()
df[column] = df[column].fillna(mean)
df.isnull().sum()
Out[32]:
id 0
accommodates 0
bathrooms 0
instant_bookable 0
review_scores_rating 0
bedrooms 0
beds 0
log_price 0
room_type_Private room 0
room_type_Shared room 0
cancellation_policy_moderate 0
cancellation_policy_strict 0
cleaning_fee_True 0
dtype: int64
In [33]:
df.head()
Out[33]:
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/notebook2.html?t=1625308112294 6/13
7/3/2021 temp-162530801090469818
In [39]:
df.describe()
Out[39]:
In [42]:
dups = df.duplicated()
#df[dups]
In [43]:
print('Before',df.shape)
df.drop_duplicates(inplace=True)
print('After',df.shape)
In [44]:
dups = df.duplicated()
Correlation Plot
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/notebook2.html?t=1625308112294 7/13
7/3/2021 temp-162530801090469818
In [45]:
sns.heatmap(df.iloc[:, 0:6].corr(),annot=True)
plt.show()
The variable 'Accommodates', 'bedrooms','bathrooms', 'beds' are having correlation with each other.
In [46]:
df.head()
Out[46]:
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/notebook2.html?t=1625308112294 8/13
7/3/2021 temp-162530801090469818
In [47]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Outlier Checks
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/notebook2.html?t=1625308112294 9/13
7/3/2021 temp-162530801090469818
In [49]:
plt.figure(figsize=(10,10))
df[cont].boxplot(vert=0)
plt.title('With Outliers',fontsize=16)
plt.show()
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/notebook2.html?t=1625308112294 10/13
7/3/2021 temp-162530801090469818
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/notebook2.html?t=1625308112294 11/13
7/3/2021 temp-162530801090469818
In [50]:
def remove_outlier(col):
sorted(col)
Q1,Q3=np.percentile(col,[25,75])
IQR=Q3-Q1
In [51]:
lr,ur=remove_outlier(df[column])
df[column]=np.where(df[column]>ur,ur,df[column])
df[column]=np.where(df[column]<lr,lr,df[column])
In [52]:
plt.figure(figsize=(10,10))
df[cont].boxplot(vert=0)
plt.show()
Data Distribution
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/notebook2.html?t=1625308112294 12/13
7/3/2021 temp-162530801090469818
In [53]:
df_attr = (df[cont])
sns.pairplot(df_attr, diag_kind='kde')
plt.show()
C:\Users\hp\anaconda3\lib\site-packages\seaborn\distributions.py:305: User
Warning: Dataset has 0 variance; skipping density estimate.
warnings.warn(msg, UserWarning)
C:\Users\hp\anaconda3\lib\site-packages\seaborn\distributions.py:305: User
Warning: Dataset has 0 variance; skipping density estimate.
warnings.warn(msg, UserWarning)
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/notebook2.html?t=1625308112294 13/13