You are on page 1 of 15

import pandas as pd

# data = pd.read_excel('Sample_sheet.xlsx')
# Load the sample sheet into a DataFrame.
# A raw string keeps the Windows backslashes literal (no invalid '\S' / '\D'
# escapes), and the path is kept on one line so that a trailing backslash is
# not swallowed as a line continuation inside the string literal.
data = pd.read_csv(
    r'D:\Study_python\DataAnalysis1 (1)\Sample_sheet.csv',
    encoding='utf-8',
)
data

S. No. Names Years of Experience Domain Relevant


Experience \
0 1 John 8 Automotive
6.0
1 2 Jason 5 Entertainment
4.0
2 3 Maria 10 Banking
3.0
3 4 Jacob 12 Insurance
10.0
4 5 Sarah 15 Logistics
5.0
5 6 Angelina 3 Travel
3.0
6 7 Krishna 0 NaN
NaN
7 8 Adam 8 Food
5.0
8 9 Deepika 15 IT
13.0
9 10 Alan 2 Space
1.0

Income(USD) Marital Status Number of siblings


0 20000.0 Single 3
1 15000.0 Married 3
2 18000.0 Single 3
3 24000.0 Single 3
4 8000.0 Married 3
5 9500.0 Married 3
6 NaN Single 3
7 7500.0 Single 3
8 19500.0 Married 3
9 12500.0 Married 3

S. No. Names Years of Experience Domain Relevant


Experience \
0 1 John 8 Automotive
6.0
1 2 Jason 5 Entertainment
4.0
2 3 Maria 10 Banking
3.0
3 4 Jacob 12 Insurance
10.0
4 5 Sarah 15 Logistics
5.0
5 6 Angelina 3 Travel
3.0
6 7 Krishna 0 NaN
NaN
7 8 Adam 8 Food
5.0
8 9 Deepika 15 IT
13.0
9 10 Alan 2 Space
1.0

Income(USD) Marital Status Number of siblings


0 20000.0 Single 3
1 15000.0 Married 3
2 18000.0 Single 3
3 24000.0 Single 3
4 8000.0 Married 3
5 9500.0 Married 3
6 NaN Single 3
7 7500.0 Single 3
8 19500.0 Married 3
9 12500.0 Married 3

print('Shape of data :', data.shape)

Shape of data : (10, 8)


Shape of data : (10, 8)

print('Size of data :', data.size)

Size of data : 80
Size of data : 80

print('Data types : \n', data.dtypes)

Data types :
S. No. int64
Names object
Years of Experience int64
Domain object
Relevant Experience float64
Income(USD) float64
Marital Status object
Number of siblings int64
dtype: object
Data types :
S. No. int64
Names object
Years of Experience int64
Domain object
Relevant Experience float64
Income(USD) float64
Marital Status object
Number of siblings int64
dtype: object

print('Statistical data analysis :\n',data.describe())

Statistical data analysis :


S. No. Years of Experience Relevant Experience
Income(USD) \
count 10.00000 10.000000 9.000000
9.000000
mean 5.50000 7.800000 5.555556
14888.888889
std 3.02765 5.287301 3.745368
5883.120865
min 1.00000 0.000000 1.000000
7500.000000
25% 3.25000 3.500000 3.000000
9500.000000
50% 5.50000 8.000000 5.000000
15000.000000
75% 7.75000 11.500000 6.000000
19500.000000
max 10.00000 15.000000 13.000000
24000.000000

Number of siblings
count 10.0
mean 3.0
std 0.0
min 3.0
25% 3.0
50% 3.0
75% 3.0
max 3.0
Statistical data analysis :
S. No. Years of Experience Relevant Experience
Income(USD) \
count 10.00000 10.000000 9.000000
9.000000
mean 5.50000 7.800000 5.555556
14888.888889
std 3.02765 5.287301 3.745368
5883.120865
min 1.00000 0.000000 1.000000
7500.000000
25% 3.25000 3.500000 3.000000
9500.000000
50% 5.50000 8.000000 5.000000
15000.000000
75% 7.75000 11.500000 6.000000
19500.000000
max 10.00000 15.000000 13.000000
24000.000000

Number of siblings
count 10.0
mean 3.0
std 0.0
min 3.0
25% 3.0
50% 3.0
75% 3.0
max 3.0

# Drop the column
# NOTE: drop() returns a new DataFrame; `data` itself is left unchanged here
# because the result is neither assigned back nor dropped with inplace=True.


data.drop('Number of siblings', axis = 1)

S. No. Names Years of Experience Domain Relevant


Experience \
0 1 John 8 Automotive
6.0
1 2 Jason 5 Entertainment
4.0
2 3 Maria 10 Banking
3.0
3 4 Jacob 12 Insurance
10.0
4 5 Sarah 15 Logistics
5.0
5 6 Angelina 3 Travel
3.0
6 7 Krishna 0 NaN
NaN
7 8 Adam 8 Food
5.0
8 9 Deepika 15 IT
13.0
9 10 Alan 2 Space
1.0

Income(USD) Marital Status


0 20000.0 Single
1 15000.0 Married
2 18000.0 Single
3 24000.0 Single
4 8000.0 Married
5 9500.0 Married
6 NaN Single
7 7500.0 Single
8 19500.0 Married
9 12500.0 Married

#Drop the row


data.drop(2)
# Use this to permanently drop
# data.drop(2, inplace=True)

S. No. Names Years of Experience Domain Relevant


Experience \
0 1 John 8 Automotive
6.0
1 2 Jason 5 Entertainment
4.0
3 4 Jacob 12 Insurance
10.0
4 5 Sarah 15 Logistics
5.0
5 6 Angelina 3 Travel
3.0
6 7 Krishna 0 NaN
NaN
7 8 Adam 8 Food
5.0
8 9 Deepika 15 IT
13.0
9 10 Alan 2 Space
1.0

Income(USD) Marital Status Number of siblings


0 20000.0 Single 3
1 15000.0 Married 3
3 24000.0 Single 3
4 8000.0 Married 3
5 9500.0 Married 3
6 NaN Single 3
7 7500.0 Single 3
8 19500.0 Married 3
9 12500.0 Married 3

data

S. No. Names Years of Experience Domain Relevant


Experience \
0 1 John 8 Automotive
6.0
1 2 Jason 5 Entertainment
4.0
2 3 Maria 10 Banking
3.0
3 4 Jacob 12 Insurance
10.0
4 5 Sarah 15 Logistics
5.0
5 6 Angelina 3 Travel
3.0
6 7 Krishna 0 NaN
NaN
7 8 Adam 8 Food
5.0
8 9 Deepika 15 IT
13.0
9 10 Alan 2 Space
1.0

Income(USD) Marital Status Number of siblings


0 20000.0 Single 3
1 15000.0 Married 3
2 18000.0 Single 3
3 24000.0 Single 3
4 8000.0 Married 3
5 9500.0 Married 3
6 NaN Single 3
7 7500.0 Single 3
8 19500.0 Married 3
9 12500.0 Married 3

#List all values and unique values


print('All the values :', data['Years of Experience'].values)
print('All the unique values :', data['Years of Experience'].unique())

print('All the values :', data['Marital Status'].values)


print('All the unique values :', data['Marital Status'].unique())

data

# Correlation of dataframe
# The frame still contains string columns (Names, Domain, Marital Status);
# numeric_only=True restricts the computation to numeric columns so that
# corr() does not raise on newer pandas versions.
data.drop('S. No.', axis=1).corr(numeric_only=True)

data

Exercise: Add a column to your dataframe that is negatively correlated with
Income, and then compute the correlation matrix.
data['Expenditure'] = [1000, 1520, 1200, 800, 2000, 1800, 0, 1769,
450, 150]
data
S. No. Names Years of Experience Domain Relevant
Experience \
0 1 John 8 Automotive
6.0
1 2 Jason 5 Entertainment
4.0
2 3 Maria 10 Banking
3.0
3 4 Jacob 12 Insurance
10.0
4 5 Sarah 15 Logistics
5.0
5 6 Angelina 3 Travel
3.0
6 7 Krishna 0 NaN
NaN
7 8 Adam 8 Food
5.0
8 9 Deepika 15 IT
13.0
9 10 Alan 2 Space
1.0

Income(USD) Marital Status Number of siblings Expenditure


0 20000.0 Single 3 1000
1 15000.0 Married 3 1520
2 18000.0 Single 3 1200
3 24000.0 Single 3 800
4 8000.0 Married 3 2000
5 9500.0 Married 3 1800
6 NaN Single 3 0
7 7500.0 Single 3 1769
8 19500.0 Married 3 450
9 12500.0 Married 3 150

# Correlation matrix of the numeric columns, excluding the row counter and the
# constant 'Number of siblings' column. numeric_only=True skips the string
# columns, which would otherwise make corr() raise on newer pandas versions.
correlation = data.drop(
    ['S. No.', 'Number of siblings'], axis=1
).corr(numeric_only=True)
correlation

                     Years of Experience  Relevant Experience  Income(USD)  Expenditure
Years of Experience             1.000000             0.735342     0.299787     0.307885
Relevant Experience             0.735342             1.000000     0.576118    -0.284077
Income(USD)                     0.299787             0.576118     1.000000    -0.613952
Expenditure                     0.307885            -0.284077    -0.613952     1.000000

#Saving data to csv file


correlation.to_csv('correlation.csv')

Missing Values
data

S. No. Names Years of Experience Domain Relevant


Experience \
0 1 John 8 Automotive
6.0
1 2 Jason 5 Entertainment
4.0
2 3 Maria 10 Banking
3.0
3 4 Jacob 12 Insurance
10.0
4 5 Sarah 15 Logistics
5.0
5 6 Angelina 3 Travel
3.0
6 7 Krishna 0 NaN
NaN
7 8 Adam 8 Food
5.0
8 9 Deepika 15 IT
13.0
9 10 Alan 2 Space
1.0

Income(USD) Marital Status Number of siblings Expenditure


0 20000.0 Single 3 1000
1 15000.0 Married 3 1520
2 18000.0 Single 3 1200
3 24000.0 Single 3 800
4 8000.0 Married 3 2000
5 9500.0 Married 3 1800
6 NaN Single 3 0
7 7500.0 Single 3 1769
8 19500.0 Married 3 450
9 12500.0 Married 3 150

# Detection of missing values

data.isna().any()
S. No. False
Names False
Years of Experience False
Domain True
Relevant Experience True
Income(USD) True
Marital Status False
Number of siblings False
Expenditure False
dtype: bool

# Number of missing values across columns


data.isna().sum()

S. No. 0
Names 0
Years of Experience 0
Domain 1
Relevant Experience 1
Income(USD) 1
Marital Status 0
Number of siblings 0
Expenditure 0
dtype: int64

Treatment of missing values


#Using a constant value

data['Domain'].fillna('Government')

0 Automotive
1 Entertainment
2 Banking
3 Insurance
4 Logistics
5 Travel
6 Government
7 Food
8 IT
9 Space
Name: Domain, dtype: object

data['Income(USD)'].fillna(3000)

0 20000.0
1 15000.0
2 18000.0
3 24000.0
4 8000.0
5 9500.0
6 3000.0
7 7500.0
8 19500.0
9 12500.0
Name: Income(USD), dtype: float64

#Using a mean of the series

data['Income(USD)'].fillna(data['Income(USD)'].mean())

0 20000.000000
1 15000.000000
2 18000.000000
3 24000.000000
4 8000.000000
5 9500.000000
6 14888.888889
7 7500.000000
8 19500.000000
9 12500.000000
Name: Income(USD), dtype: float64

#Using a median of the series

data['Income(USD)'].fillna(data['Income(USD)'].median())

0 20000.0
1 15000.0
2 18000.0
3 24000.0
4 8000.0
5 9500.0
6 15000.0
7 7500.0
8 19500.0
9 12500.0
Name: Income(USD), dtype: float64

from sklearn.impute import SimpleImputer


imp_constant = SimpleImputer(strategy='constant', fill_value=1111)
imp_mean = SimpleImputer(strategy='mean')
imp_median = SimpleImputer(strategy='median')
imp_mode = SimpleImputer(strategy='most_frequent')

data[['Income(USD)']]

Income(USD)
0 20000.0
1 15000.0
2 18000.0
3 24000.0
4 8000.0
5 9500.0
6 NaN
7 7500.0
8 19500.0
9 12500.0

imp_constant.fit_transform(data[['Income(USD)']])

array([[20000.],
[15000.],
[18000.],
[24000.],
[ 8000.],
[ 9500.],
[ 1111.],
[ 7500.],
[19500.],
[12500.]])

imp_mean.fit_transform(data[['Income(USD)']])

array([[20000. ],
[15000. ],
[18000. ],
[24000. ],
[ 8000. ],
[ 9500. ],
[14888.88888889],
[ 7500. ],
[19500. ],
[12500. ]])

imp_mean.statistics_

array([14888.88888889])

imp_mean.fit_transform(data[['Relevant Experience']])
imp_mean.statistics_

array([5.55555556])

imp_median.fit_transform(data[['Income(USD)']])

array([[20000.],
[15000.],
[18000.],
[24000.],
[ 8000.],
[ 9500.],
[15000.],
[ 7500.],
[19500.],
[12500.]])
imp_mode.fit_transform(data[['Income(USD)']])

array([[20000.],
[15000.],
[18000.],
[24000.],
[ 8000.],
[ 9500.],
[ 7500.],
[ 7500.],
[19500.],
[12500.]])

data

S. No. Names Years of Experience Domain Relevant


Experience \
0 1 John 8 Automotive
6.0
1 2 Jason 5 Entertainment
4.0
2 3 Maria 10 Banking
3.0
3 4 Jacob 12 Insurance
10.0
4 5 Sarah 15 Logistics
5.0
5 6 Angelina 3 Travel
3.0
6 7 Krishna 0 NaN
NaN
7 8 Adam 8 Food
5.0
8 9 Deepika 15 IT
13.0
9 10 Alan 2 Space
1.0

Income(USD) Marital Status Number of siblings Expenditure


0 20000.0 Single 3 1000
1 15000.0 Married 3 1520
2 18000.0 Single 3 1200
3 24000.0 Single 3 800
4 8000.0 Married 3 2000
5 9500.0 Married 3 1800
6 NaN Single 3 0
7 7500.0 Single 3 1769
8 19500.0 Married 3 450
9 12500.0 Married 3 150
# Permanent replacement of missing values
# Assigning the imputed result back to the column overwrites the NaN in `data`.
# The 'most_frequent' imputer is used here; the output below shows row 6 of
# Income(USD) filled with 7500.0.
data['Income(USD)'] = imp_mode.fit_transform(data[['Income(USD)']])
data

S. No. Names Years of Experience Domain Relevant


Experience \
0 1 John 8 Automotive
6.0
1 2 Jason 5 Entertainment
4.0
2 3 Maria 10 Banking
3.0
3 4 Jacob 12 Insurance
10.0
4 5 Sarah 15 Logistics
5.0
5 6 Angelina 3 Travel
3.0
6 7 Krishna 0 NaN
NaN
7 8 Adam 8 Food
5.0
8 9 Deepika 15 IT
13.0
9 10 Alan 2 Space
1.0

Income(USD) Marital Status Number of siblings Expenditure


0 20000.0 Single 3 1000
1 15000.0 Married 3 1520
2 18000.0 Single 3 1200
3 24000.0 Single 3 800
4 8000.0 Married 3 2000
5 9500.0 Married 3 1800
6 7500.0 Single 3 0
7 7500.0 Single 3 1769
8 19500.0 Married 3 450
9 12500.0 Married 3 150

# Row with the missing values


data[data['Domain'].isna()]

S. No. Names Years of Experience Domain Relevant Experience \


6 7 Krishna 0 NaN NaN

Income(USD) Marital Status Number of siblings Expenditure


6 7500.0 Single 3 0

Reading from JSON


import json

# Small sample record to round-trip through a JSON file.
my_dictionary = {
    "Name": "Mahesh",
    "Course": "Machine Learning",
    "Profession": "IT Admin",
    "Experience": "4",
}

# Serialise the record to a JSON string.
my_json = json.dumps(my_dictionary)

# Write the JSON text to disk; the context manager closes the file for us.
with open("sample.json", "w", encoding="utf-8") as file:
    file.write(my_json)

# With orient='index' each key is read as a row label; transposing turns the
# keys into columns, giving a single-row frame.
pd.read_json('sample.json', orient='index').T

Name Course Profession Experience


0 Mahesh Machine Learning IT Admin 4

Detecting Duplicates
# Toy frame with repeated names, built row by row, used to demonstrate
# duplicate detection and removal.
data_ = pd.DataFrame(
    [
        ('Krishna', 'January', 29),
        ('Adam', 'March', 44),
        ('Adam', 'March', 45),
        ('Alan', 'May', 12),
        ('Krishna', 'December', 39),
    ],
    columns=['Name', 'DOB', 'Age'],
)

data_

Name DOB Age


0 Krishna January 29
1 Adam March 44
2 Adam March 45
3 Alan May 12
4 Krishna December 39

data_['Name']

0 Krishna
1 Adam
2 Adam
3 Alan
4 Krishna
Name: Name, dtype: object

data_['Name'].duplicated()

0 False
1 False
2 True
3 False
4 True
Name: Name, dtype: bool

data_['Name'].drop_duplicates()
0 Krishna
1 Adam
3 Alan
Name: Name, dtype: object

data_.drop_duplicates()

Name DOB Age


0 Krishna January 29
1 Adam March 44
2 Adam March 45
3 Alan May 12
4 Krishna December 39

data_

Name DOB Age


0 Krishna January 29
1 Adam March 44
2 Adam March 45
3 Alan May 12
4 Krishna December 39

data_.drop_duplicates(subset=['Name','DOB'], keep = 'first')

Name DOB Age


0 Krishna January 29
1 Adam March 44
3 Alan May 12
4 Krishna December 39

# Select rows with exactly 8 years of experience AND marital status 'Single'.
# The column key must be the unbroken string 'Marital Status'; each comparison
# is parenthesised because & binds more tightly than ==.
data[(data['Years of Experience'] == 8) & (data['Marital Status'] == 'Single')]

S. No. Names Years of Experience Domain Relevant Experience


\
0 1 John 8 Automotive 6.0

7 8 Adam 8 Food 5.0

Income(USD) Marital Status Number of siblings Expenditure


0 20000.0 Single 3 1000
7 7500.0 Single 3 1769

You might also like