You are on page 1 of 12

data-cleaning-and-pre-processing-1

March 24, 2024

0.1 Importing the required libraries and modules


[1]: import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np

[2]: # DataFrame of Students

# Build the student records column by column. Every list holds exactly one
# value per student (15 entries), ordered by Student ID.
# NOTE: the 'Subject' values were truncated in this export; they are restored
# here from the rendered DataFrame table shown directly below this cell.
dataframe = pd.DataFrame({
    'Student ID': list(range(1, 16)),
    'Subject': [
        'Mathematics', 'Science', 'Science', 'Art', 'Art',
        'Art', 'Mathematics', 'History', 'Mathematics', 'Mathematics',
        'Art', 'Art', 'Art', 'Mathematics', 'History',
    ],
    'Mid-Term Scores': [12, 9, 17, 19, 20, 11, 15, 12, 9, 4, 20, 19, 17, 19, 18],
    'Grade': [
        'B', 'C', 'A', 'A', 'A', 'B', 'B', 'B', 'C', 'F', 'A', 'A', 'A', 'A', 'A',
    ],
    'Gender': [
        'Female', 'Male', 'Male', 'Female', 'Male',
        'Female', 'Male', 'Female', 'Female', 'Female',
        'Male', 'Male', 'Male', 'Male', 'Male',
    ],
    'Final Score': [42, 23, 64, 88, 30, 86, 95, 78, 75, 43, 20, 43, 53, 64, 21],
})

[23]: dataframe

[23]: Student ID Subject Mid-Term Scores Grade Gender Final Score


0 1 Mathematics 12 B Female 42
1 2 Science 9 C Male 23
2 3 Science 17 A Male 64
3 4 Art 19 A Female 88
4 5 Art 20 A Male 30
5 6 Art 11 B Female 86
6 7 Mathematics 15 B Male 95
7 8 History 12 B Female 78
8 9 Mathematics 9 C Female 75
9 10 Mathematics 4 F Female 43
10 11 Art 20 A Male 20
11 12 Art 19 A Male 43

1
12 13 Art 17 A Male 53
13 14 Mathematics 19 A Male 64
14 15 History 18 A Male 21

1 Visual Data Exploration


1.1 1. Pie chart
[4]: dataframe['Subject'].unique()

[4]: array(['Mathematics', 'Science', 'Art', 'History'], dtype=object)

[5]: #get the value counts for each category in the Subject variable
dataframe['Subject'].value_counts()

[5]: Art 6
Mathematics 5
Science 2
History 2
Name: Subject, dtype: int64

[6]: #from the above include the labels in descending order of count
# Count the students per subject; value_counts() returns the counts sorted in
# descending order, indexed by subject name.
x = dataframe["Subject"].value_counts()

# Take the wedge labels from the counts' own index so every label is
# guaranteed to sit on its own wedge. A hand-typed list can silently drift
# out of order (for example when two subjects have the same count).
labels = x.index.tolist()

plt.pie(x, labels=labels, autopct='%1.2f%%')
plt.legend(labels, loc="best")
plt.axis('equal')  # equal axis scaling so the pie is drawn as a circle
plt.title("Subject")
plt.show()

2
1.2 2. Bar/Column chart
[7]: dataframe["Subject"].value_counts(normalize=False).plot.bar(title="Subject")␣
↪# if you want to normalize the data, use normalize=True

[7]: <AxesSubplot:title={'center':'Subject'}>

3
1.3 3. Simple Boxplot (Uni Variate)
[8]: b_plot = dataframe.boxplot(column = 'Mid-Term Scores', color = 'blue' )
b_plot.plot()
plt.show()

4
1.4 4. Boxplot with 2 variables (bivariate)
[9]: sns.catplot(x="Subject",y='Final Score',kind="box",data=dataframe)

[9]: <seaborn.axisgrid.FacetGrid at 0x23a3d6a24f0>

5
1.5 5. Boxplot with 3 variables (trivariate)
[10]: sns.boxplot(x="Subject",y='Final Score', hue='Gender', data=dataframe)

[10]: <AxesSubplot:xlabel='Subject', ylabel='Final Score'>

6
1.6 6. Scatter plot
[11]: sns.scatterplot(data=dataframe, x="Mid-Term Scores", y="Final Score")

[11]: <AxesSubplot:xlabel='Mid-Term Scores', ylabel='Final Score'>

7
1.7 7. Scatter plot with 3 variables
[12]: sns.scatterplot(data=dataframe, x="Mid-Term Scores", y="Final Score",␣
↪hue="Gender")

[12]: <AxesSubplot:xlabel='Mid-Term Scores', ylabel='Final Score'>

8
1.8 8. Histogram
[13]: sns.histplot(dataframe['Mid-Term Scores'])
plt.title('Mid-Term Scores')
plt.xlabel('Mid-Term Scores')
plt.ylabel("Frequency")
plt.show()

9
2 Contingency table
[25]: data_crosstab = pd.crosstab([dataframe.Grade, dataframe.Gender],dataframe.
↪Subject, margins = True)

print(data_crosstab)

Subject Art History Mathematics Science All


Grade Gender
A Female 1 0 0 0 1
Male 4 1 1 1 7
B Female 1 1 1 0 3
Male 0 0 1 0 1
C Female 0 0 1 0 1
Male 0 0 0 1 1
F Female 0 0 1 0 1
All 6 2 5 2 15

10
3 Exploratory Statistical Analysis - Summary Statistics
[15]: #defining data types
dataframe.dtypes

[15]: Student ID int64


Subject object
Mid-Term Scores int64
Grade object
Gender object
Final Score int64
dtype: object

[16]: #for Numerical data - default gives numerical data output

# Student ID is only a unique row identifier, so it is left out of the
# summary statistics.
df = dataframe.drop(columns=['Student ID'])

# Summary statistics for the remaining columns.
df.describe()

[16]: Mid-Term Scores Final Score


count 15.000000 15.000000
mean 14.733333 55.000000
std 4.934813 25.679341
min 4.000000 20.000000
25% 11.500000 36.000000
50% 17.000000 53.000000
75% 19.000000 76.500000
max 20.000000 95.000000

[17]: # convert the object data types to category


# Cast each text column to the pandas 'category' dtype, one column at a time.
for column in ('Subject', 'Grade', 'Gender'):
    dataframe[column] = dataframe[column].astype('category')

# Display the dtypes to confirm the conversion.
dataframe.dtypes

[17]: Student ID int64


Subject category
Mid-Term Scores int64
Grade category
Gender category
Final Score int64
dtype: object

[18]: #for categorical data


dataframe.describe(include=['category'])

11
[18]: Subject Grade Gender
count 15 15 15
unique 4 4 2
top Art A Male
freq 6 8 9

[26]: dataframe.describe(include='all')

[26]: Student ID Subject Mid-Term Scores Grade Gender Final Score


count 15.000000 15 15.000000 15 15 15.000000
unique NaN 4 NaN 4 2 NaN
top NaN Art NaN A Male NaN
freq NaN 6 NaN 8 9 NaN
mean 8.000000 NaN 14.733333 NaN NaN 55.000000
std 4.472136 NaN 4.934813 NaN NaN 25.679341
min 1.000000 NaN 4.000000 NaN NaN 20.000000
25% 4.500000 NaN 11.500000 NaN NaN 36.000000
50% 8.000000 NaN 17.000000 NaN NaN 53.000000
75% 11.500000 NaN 19.000000 NaN NaN 76.500000
max 15.000000 NaN 20.000000 NaN NaN 95.000000

[19]: # Homework : how to get the percentile values?

[ ]:

12

You might also like