Professional Documents
Culture Documents
Week 5 LAB
Week 5 LAB
Adding attributes
In [3]: # importing pandas as pd
import pandas as pd
# Build the events table: one row per event with its date and cost.
df = pd.DataFrame(
    dict(
        Date=['10/2/2011', '11/2/2011', '12/2/2011', '13/2/2011'],
        Event=['Music', 'Poetry', 'Theatre', 'Comedy'],
        Cost=[10000, 5000, 15000, 2000],
    )
)
# Plain-text dump of the frame.
print(df)
# In [28]: employee table (name and salary) printed below.
df = pd.DataFrame(
    dict(
        Name=['John', 'Ted', 'Dove', 'Brad', 'Rex'],
        Salary=[44000, 35000, 75000, 20000, 6000],
    )
)
# Plain-text dump of the frame.
print(df)
Name Salary
0 John 44000
1 Ted 35000
2 Dove 75000
3 Brad 20000
4 Rex 6000
# Add a derived column by mapping every Salary through `salary_stats`.
# NOTE(review): `salary_stats` is defined in a cell not shown here — confirm
# whether it is a function or a lookup table.
df['salary_stats'] = df.Salary.map(salary_stats)
# Bare expression so the notebook renders the augmented frame.
df
0 A High School
1 B Masters
2 C Doctorate
3 D Bachelors
4 E Masters
5 F High School
Binary encoding
# In [86]: one-hot ("binary") encoding — one indicator column per distinct
# value found in data['Education'].
# NOTE(review): `data` is created in a cell not shown here.
education_data = pd.get_dummies(data['Education'])
print(education_data)
Ranking Transformation
# In [80]: ranking (ordinal) encoding of the Education column.
# Each level is replaced by its rank, 1 (lowest) to 4 (highest).
education_map = {
    'High School': 1,
    'Bachelors': 2,
    'Masters': 3,
    'Doctorate': 4,
}
# Series.map yields NaN for any label that is not a key of education_map.
education_data = data['Education'].map(education_map)
# Build the encoded frame with assign() instead of writing back into `data`.
# Overwriting data['Education'] made this cell non-repeatable: a second run
# (or any later cell that maps the original string labels) would look up
# integers in a string-keyed dict and produce all-NaN.
data_ranked = data.assign(Education=education_data)
data_ranked
0 A 1
1 B 3
2 C 4
3 D 2
4 E 3
5 F 1
# In [84]: numeric encoding of the Education column using larger values per
# level (12 / 16 / 18 / 21).
# NOTE(review): these look like years of schooling — confirm.
education_map = {
    'High School': 12,
    'Bachelors': 16,
    'Masters': 18,
    'Doctorate': 21,
}
# Series.map yields NaN for any label that is not a key of education_map.
education_data = data['Education'].map(education_map)
# Build the encoded frame with assign() instead of writing back into `data`,
# so the original string labels survive and the cell can be re-run safely.
data_years = data.assign(Education=education_data)
data_years
0 A 12
1 B 18
2 C 21
3 D 16
4 E 18
5 F 12
DataFrame 1:
Name Country Role
0 Pankaj India CEO
1 Meghna India CTO
2 Lisa USA CTO
DataFrame 2:
ID Name
0 1 Pankaj
1 2 Anupam
2 3 Amit
DataFrame 3:
Name Country Role
0 Priya India COO
C:\Users\YASH\AppData\Local\Temp\ipykernel_13020\2048347772.py:1: FutureWarning: T
he frame.append method is deprecated and will be removed from pandas in a future v
ersion. Use pandas.concat instead.
a_df=df1.append(df2, ignore_index=True)
Out[50]: Name Country Role ID
Result:
Name Country Role ID
0 Pankaj India CEO 1
# Join df1 and df2 on the 'ID' column (pandas' default join type) and print it.
print(pd.merge(df1, df2, on='ID'))
# Same join keyed on 'Name'. The leading '\n' argument emits a newline,
# followed by print's default space separator, before the frame.
print('\n', pd.merge(df1, df2, on='Name'))
# Load the Titanic table from a CSV file in the current working directory.
titanic_df = pd.read_csv(filepath_or_buffer="titanic.csv")
# Bare expression so the notebook renders the loaded frame.
titanic_df
Out[8]: survived pclass sex age sibsp parch fare embarked class who adult_male
... ... ... ... ... ... ... ... ... ... ... ..
In [10]: titanic_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 survived 891 non-null int64
1 pclass 891 non-null int64
2 sex 891 non-null object
3 age 714 non-null float64
4 sibsp 891 non-null int64
5 parch 891 non-null int64
6 fare 891 non-null float64
7 embarked 889 non-null object
8 class 891 non-null object
9 who 891 non-null object
10 adult_male 891 non-null bool
11 deck 203 non-null object
12 embark_town 889 non-null object
13 alive 891 non-null object
14 alone 891 non-null bool
dtypes: bool(2), float64(2), int64(4), object(7)
memory usage: 92.4+ KB
In [11]: titanic_df.isnull()
localhost:8888/nbconvert/html/Downloads/MissingValues.ipynb?download=false 1/9
12/15/22, 4:06 PM MissingValues
Out[11]: survived pclass sex age sibsp parch fare embarked class who adult_male deck
0 False False False False False False False False False False False True
1 False False False False False False False False False False False False
2 False False False False False False False False False False False True
3 False False False False False False False False False False False False
4 False False False False False False False False False False False True
... ... ... ... ... ... ... ... ... ... ... ... ...
886 False False False False False False False False False False False True
887 False False False False False False False False False False False False
888 False False False True False False False False False False False True
889 False False False False False False False False False False False False
890 False False False False False False False False False False False True
<AxesSubplot:>
Out[12]:
In [13]: msno.matrix(titanic_df)
<AxesSubplot:>
Out[13]:
localhost:8888/nbconvert/html/Downloads/MissingValues.ipynb?download=false 2/9
12/15/22, 4:06 PM MissingValues
survived 0
Out[14]:
pclass 0
sex 0
age 177
sibsp 0
parch 0
fare 0
embarked 2
class 0
who 0
adult_male 0
deck 688
embark_town 2
alive 0
alone 0
dtype: int64
# In [15]: listwise deletion — keep only the rows that have no missing value
# in any column. titanic_df itself is left unchanged.
df = titanic_df.dropna(axis='index')
# Missing-value count per column after the drop.
df.isna().sum()
survived 0
Out[15]:
pclass 0
sex 0
age 0
sibsp 0
parch 0
fare 0
embarked 0
class 0
who 0
adult_male 0
deck 0
embark_town 0
alive 0
alone 0
dtype: int64
In [16]: df.info()
localhost:8888/nbconvert/html/Downloads/MissingValues.ipynb?download=false 3/9
12/15/22, 4:06 PM MissingValues
<class 'pandas.core.frame.DataFrame'>
Int64Index: 182 entries, 1 to 889
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 survived 182 non-null int64
1 pclass 182 non-null int64
2 sex 182 non-null object
3 age 182 non-null float64
4 sibsp 182 non-null int64
5 parch 182 non-null int64
6 fare 182 non-null float64
7 embarked 182 non-null object
8 class 182 non-null object
9 who 182 non-null object
10 adult_male 182 non-null bool
11 deck 182 non-null object
12 embark_town 182 non-null object
13 alive 182 non-null object
14 alone 182 non-null bool
dtypes: bool(2), float64(2), int64(4), object(7)
memory usage: 20.3+ KB
# In [18]: drop the 'deck' column rather than dropping rows.
# titanic_df itself is left unchanged.
df = titanic_df.drop(columns=['deck'])
# Missing-value count per remaining column.
df.isnull().sum()
survived 0
Out[18]:
pclass 0
sex 0
age 177
sibsp 0
parch 0
fare 0
embarked 2
class 0
who 0
adult_male 0
embark_town 2
alive 0
alone 0
dtype: int64
localhost:8888/nbconvert/html/Downloads/MissingValues.ipynb?download=false 4/9
12/15/22, 4:06 PM MissingValues
If you can make an educated guess about a missing value, you can replace it with
a fixed value of your choice using the following code.
In [20]: titanic_df['deck'].unique()
0
Out[23]:
In [24]: titanic_df
Out[24]: survived pclass sex age sibsp parch fare embarked class who adult_male
... ... ... ... ... ... ... ... ... ... ... ..
29.69911764705882
localhost:8888/nbconvert/html/Downloads/MissingValues.ipynb?download=false 5/9
12/15/22, 4:06 PM MissingValues
0 22.000000
Out[25]:
1 38.000000
2 26.000000
3 35.000000
4 35.000000
...
886 27.000000
887 19.000000
888 29.699118
889 26.000000
890 32.000000
Name: age, Length: 891, dtype: float64
In [27]: titanic_df['deck']
0 C
Out[27]:
1 C
2 C
3 C
4 C
..
886 C
887 B
888 C
889 C
890 C
Name: deck, Length: 891, dtype: object
localhost:8888/nbconvert/html/Downloads/MissingValues.ipynb?download=false 6/9
12/15/22, 4:06 PM MissingValues
0 22.0
Out[28]:
1 38.0
2 26.0
3 35.0
4 35.0
...
886 27.0
887 19.0
888 28.0
889 26.0
890 32.0
Name: age, Length: 891, dtype: float64
Out[29]: survived pclass sex age sibsp parch fare embarked class who adult_male
... ... ... ... ... ... ... ... ... ... ... ..
localhost:8888/nbconvert/html/Downloads/MissingValues.ipynb?download=false 7/9
12/15/22, 4:06 PM MissingValues
Out[30]: survived pclass sex age sibsp parch fare embarked class who adult_male
... ... ... ... ... ... ... ... ... ... ... ...
Out[31]: survived pclass sex age sibsp parch fare embarked class who adult_male
... ... ... ... ... ... ... ... ... ... ... ..
localhost:8888/nbconvert/html/Downloads/MissingValues.ipynb?download=false 8/9
12/15/22, 4:06 PM MissingValues
Out[33]: survived pclass sex age sibsp parch fare embarked class who adult_male
... ... ... ... ... ... ... ... ... ... ... ...
localhost:8888/nbconvert/html/Downloads/MissingValues.ipynb?download=false 9/9
12/15/22, 4:07 PM Numerosity_reduction
(3150, 9)
0 2655
1 495
Name: Churn, dtype: int64
(1000, 9)
In [6]: customer_df_rs
Out[6]: Distinct
Call Subscription Seconds Frequency Frequency
Complains Called Status Churn
Failure Length of Use of use of SMS
Numbers
2001 0 0 37 0 0 0 0 0 0
943 0 0 24 2515 50 0 23 1 0
1611 4 0 37 2048 54 33 19 0 0
403 8 0 35 4018 58 0 30 1 0
1301 0 0 43 273 9 15 8 0 0
... ... ... ... ... ... ... ... ... ...
2182 8 0 40 703 13 16 6 1 0
709 5 0 38 4325 82 0 22 1 0
1227 5 0 38 2043 40 37 33 0 1
In [49]: print(customer_df_rs.Churn.value_counts())
localhost:8888/nbconvert/html/Downloads/Numerosity_reduction.ipynb?download=false 1/3
12/15/22, 4:07 PM Numerosity_reduction
0 856
1 144
Name: Churn, dtype: int64
Stratified sampling
Example – Stratified sampling for imbalanced dataset
# In [59]: stratified sample of about s rows that keeps each Churn class at
# its original proportion.
n, s = len(customer_df), 1000
print(n, s)
# Fraction of every class that goes into the sample.
r = s / n
print('Ratio of each Churn class in sample:', r)
# Draw round(len(group) * r) rows from each Churn group. The end of this
# expression was cut off in the exported page; this form reproduces the
# printed class counts (843 and 157).
# random_state is fixed so a re-run draws the same rows.
sample_df = customer_df.groupby('Churn').apply(
    lambda sdf: sdf.sample(round(len(sdf) * r), random_state=42)
)
print(sample_df.Churn.value_counts())
3150 1000
Ratio of each Churn class in sample: 0.31746031746031744
0 843
1 157
Name: Churn, dtype: int64
In [60]: customer_df.Churn.value_counts().plot.bar()
<AxesSubplot:>
Out[60]:
In [61]: sample_df.Churn.value_counts().plot.bar()
<AxesSubplot:>
Out[61]:
localhost:8888/nbconvert/html/Downloads/Numerosity_reduction.ipynb?download=false 2/3
12/15/22, 4:07 PM Numerosity_reduction
Random Over/Under-sampling
# In [65]: balanced sample — the same number of rows from every Churn class.
n, s = len(customer_df), 500
# Split the target size evenly across the classes present (500 over the two
# Churn classes gives 250 each) instead of hard-coding the per-class count.
per_class = s // customer_df['Churn'].nunique()
sample_df = customer_df.groupby('Churn').apply(
    # Sample with replacement only when a class has fewer rows than requested
    # (over-sampling); larger classes are under-sampled without replacement.
    # random_state is fixed so a re-run draws the same rows.
    lambda sdf: sdf.sample(per_class, replace=len(sdf) < per_class, random_state=42)
)
print(sample_df.Churn.value_counts())
0 250
1 250
Name: Churn, dtype: int64
In [66]: sample_df.Churn.value_counts().plot.bar()
<AxesSubplot:>
Out[66]:
In [67]: sample_df
Out[67]: Distinct
Call Subscription Seconds Frequency Frequency
Complains Called Status
Failure Length of Use of use of SMS
Numbers
Churn
410 4 0 35 5738 87 0 7 1
1455 4 0 33 3290 68 14 21 1
2086 0 0 33 1360 38 31 18 0
... ... ... ... ... ... ... ... ... ...
1 2178 8 0 40 498 11 12 6 1
1599 0 0 7 0 0 0 0 1
1476 2 1 30 2505 27 0 9 0
1382 0 1 28 0 0 0 0 1
368 2 0 40 180 8 11 5 0
localhost:8888/nbconvert/html/Downloads/Numerosity_reduction.ipynb?download=false 3/3
12/15/22, 4:08 PM OutlierDetection
Out[3]: survived pclass sex age sibsp parch fare embarked class who adult_male
... ... ... ... ... ... ... ... ... ... ... ..
localhost:8888/nbconvert/html/Downloads/OutlierDetection.ipynb?download=false 1/5
12/15/22, 4:08 PM OutlierDetection
<AxesSubplot:>
Out[6]:
localhost:8888/nbconvert/html/Downloads/OutlierDetection.ipynb?download=false 2/5
12/15/22, 4:08 PM OutlierDetection
<AxesSubplot:ylabel='Frequency'>
Out[9]:
localhost:8888/nbconvert/html/Downloads/OutlierDetection.ipynb?download=false 3/5
12/15/22, 4:08 PM OutlierDetection
<class 'pandas.core.frame.DataFrame'>
Int64Index: 880 entries, 0 to 890
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 survived 880 non-null int64
1 pclass 880 non-null int64
2 sex 880 non-null object
3 age 703 non-null float64
4 sibsp 880 non-null int64
5 parch 880 non-null int64
6 fare 880 non-null float64
7 embarked 878 non-null object
8 class 880 non-null object
9 who 880 non-null object
10 adult_male 880 non-null bool
11 deck 198 non-null object
12 embark_town 878 non-null object
13 alive 880 non-null object
14 alone 880 non-null bool
dtypes: bool(2), float64(2), int64(4), object(7)
memory usage: 98.0+ KB
In [14]: titanic_df.info()
localhost:8888/nbconvert/html/Downloads/OutlierDetection.ipynb?download=false 4/5
12/15/22, 4:08 PM OutlierDetection
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 survived 891 non-null int64
1 pclass 891 non-null int64
2 sex 891 non-null object
3 age 714 non-null float64
4 sibsp 891 non-null int64
5 parch 891 non-null int64
6 fare 891 non-null float64
7 embarked 889 non-null object
8 class 891 non-null object
9 who 891 non-null object
10 adult_male 891 non-null bool
11 deck 203 non-null object
12 embark_town 889 non-null object
13 alive 891 non-null object
14 alone 891 non-null bool
dtypes: bool(2), float64(2), int64(4), object(7)
memory usage: 92.4+ KB
# In [19]: replace age outliers with the mean age.
# Series.mean skips missing values by default.
m = titanic_df['age'].mean()
print('mean:', m)
# NOTE(review): lower_bound / upper_bound are computed in a cell not shown
# here (presumably IQR fences — confirm).
# Comparisons with NaN are False, so missing ages are left untouched.
is_outlier = (titanic_df['age'] < lower_bound) | (titanic_df['age'] > upper_bound)
# One vectorized pass instead of a replace() call per outlier value.
titanic_df['age'] = titanic_df['age'].mask(is_outlier, m)
mean: 29.081737106607342
In [26]: q1 = titanic_df["age"].quantile(0.25)
# In [27]: replace age outliers with the median age.
m = titanic_df['age'].median()
print(m)
# NOTE(review): lower_bound / upper_bound are computed in a cell not shown
# here (presumably IQR fences — confirm).
# Comparisons with NaN are False, so missing ages are left untouched.
is_outlier = (titanic_df['age'] < lower_bound) | (titanic_df['age'] > upper_bound)
# One vectorized pass instead of a replace() call per outlier value.
titanic_df['age'] = titanic_df['age'].mask(is_outlier, m)
28.0
localhost:8888/nbconvert/html/Downloads/OutlierDetection.ipynb?download=false 5/5