Professional Documents
Culture Documents
In [1]:
import pandas as pd
df=pd.read_csv("dataset.csv")
In [2]:
import pandas as pd
In [3]:
df.head(7)
Out[3]:
In [4]:
df.shape
Out[4]:
(8287, 15)
In [5]:
to_drop=['Corporate Author','Corporate Contributors','Former owner','Engraver','Edition df.drop(co
localhost:8888/notebooks/Documents/21dce026/practical-1.ipynb# 1/11
7/19/23, 9:32 PM practical-1 - Jupyter Notebook
In [6]:
df.head()
Out[6]:
Place of
Identifier Issua
Publication Date of Publisher Title Author Contributors
Publication t
Walter
S. Tinsley
0 206 London 1879 [1878] Forbes. [A FORBES, monogra
& Co. novel.] By A. A. A. Walter.
A
Welsh
Sketches, Appleyard,
3 472 London 1851 James A., E.
chiefly Ernest monogra
Darling S.
ecclesiastical, Silvanus.
to the...
[The World in
Wertheim
4 480 London 1857 which I live, A., E. BROOME,
& and my place monogra
S. John Henry.
Macintosh in it...
In [7]:
df['Identifier'].is_unique
Out[7]:
True
In [8]:
df.set_index('Identifier',inplace=True)
df.loc[206]
Out[8]:
Place of Publication London
Date of Publication 1879 [1878]
Publisher S. Tinsley & Co.
Title Walter Forbes. [A novel.] By A. A
Author A. A.
Contributors FORBES, Walter.
Issuance type monographic
Flickr URL http://www.flickr.com/photos/britishlibrary/ta...
(http://www.flickr.com/photos/britishlibrary/ta...)
Shelfmarks British Library HMNTS 12641.b.30.
Name: 206, dtype: object
localhost:8888/notebooks/Documents/21dce026/practical-1.ipynb# 2/11
7/19/23, 9:32 PM practical-1 - Jupyter Notebook
In [9]:
df.dtypes
Out[9]:
In [10]:
df.dtypes
Out[10]:
In [11]:
regex=r'^(\d{4})'
date_num=df['Date of Publication'].str.extract(regex,expand=False)
In [12]:
date_num.head()
Out[12]:
Identifier
206 1879
216 1868
218 1869
472 1851
480 1857
Name: Date of Publication, dtype: object
localhost:8888/notebooks/Documents/21dce026/practical-1.ipynb# 3/11
7/19/23, 9:32 PM practical-1 - Jupyter Notebook
In [13]:
df['Date of Publication']=pd.to_numeric(date_num,downcast='integer') df['Date of Publication'].dty
Out[13]:
dtype('float64')
In [14]:
df['Date of Publication'].isnull().sum()
Out[14]:
977
In [15]:
df['Date of Publication'].isnull().sum()/len(df)
Out[15]:
0.11789549897429709
In [16]:
df=df[df['Date of Publication'].isnull()!=True] df.head()
Out[16]:
Identifier
Walter
S. Tinsley Forbes. [A FORBES,
206 London 1879.0 & Co. novel.] By A. A. A. Walter. monographi
A
Welsh
Sketches, Appleyard,
James A., E.
472 London 1851.0 chiefly Ernest monographi
Darling S.
ecclesiastical, Silvanus.
to the...
[The World in
Wertheim which I live, A., E. BROOME,
480 London 1857.0 & and my place monographi
S. John Henry.
Macintosh in it...
localhost:8888/notebooks/Documents/21dce026/practical-1.ipynb# 4/11
7/19/23, 9:32 PM practical-1 - Jupyter Notebook
In [17]:
df['Date of Publication'].isnull().sum()
Out[17]:
In [18]:
book_index=[4157862,4159587] df.loc[book_index]
Out[18]:
Identifier
An historical,
topographical
Newcastle Mackenzie Mackenzie, ROSS, M. - of
4159587 upon Tyne 1834.0 and Durham monogr
& Dent E. (Eneas)
descriptive
v...
In [19]:
import numpy as np
In [20]:
In [ ]:
localhost:8888/notebooks/Documents/21dce026/practical-1.ipynb# 5/11
7/19/23, 9:32 PM practical-1 - Jupyter Notebook
In [21]:
null_values=df.isnull().sum()
print(null_values)
Place of Publication 0
Date of Publication 0
Publisher 3736
Title 0
Author 1534
Contributors 0
Issuance type 0
Flickr URL 0
Shelfmarks 0
dtype: int64
In [22]:
df["Author"]=df["Author"].str.strip().str.title()
In [23]:
df["Author"].fillna("Unknown",inplace=True)
In [24]:
print(df.isnull().sum())
Place of Publication 0
Date of Publication 0
Publisher 3736
Title 0
Author 0
Contributors 0
Issuance type 0
Flickr URL 0
Shelfmarks 0
dtype: int64
In [25]:
df["Publisher"]=df["Publisher"].str.strip().str.title()
In [26]:
df["Publisher"].fillna(df["Place of Publication"],inplace=True)
In [27]:
df["Date of Publication"].fillna("NA",inplace=True)
localhost:8888/notebooks/Documents/21dce026/practical-1.ipynb# 6/11
7/19/23, 9:32 PM practical-1 - Jupyter Notebook
In [28]:
print(df.isnull().sum())
Place of Publication 0
Date of Publication 0
Publisher 0
Title 0
Author 0
Contributors 0
Issuance type 0
Flickr URL 0
Shelfmarks 0
dtype: int64
In [29]:
df_olympics=pd.read_csv("olympics.csv",header=1)
In [30]:
print(df_olympics.isnull().sum())
Unnamed: 0 0
? Summer 0
01 ! 0
02 ! 0
03 ! 0
Total 0
? Winter 0
01 !.1 0
02 !.1 0
03 !.1 0
Total.1 0
? Games 0
01 !.2 0
02 !.2 0
03 !.2 0
Combined total 0
dtype: int64
localhost:8888/notebooks/Documents/21dce026/practical-1.ipynb# 7/11
7/19/23, 9:32 PM practical-1 - Jupyter Notebook
In [31]:
new_col={
'Unnamed: 0':'Country',
'? Summer':'Summer Olympics', '01 !':'Gold',
'02 !': 'Silver',
'03 !':'Bronze',
'? Winter':'Winter Olympics', '01 !.1':'Gold.1',
'02 !.1':'Silver.1',
'03 !.1':'Bronze.1',
'01 !.2':'Gold.2',
'02 !.2':'Silver.2',
'03 !.2':'Bronze.2',
'? Games':'#Games'
}
df_olympics.rename(columns=new_col,inplace=True)
In [32]:
df_olympics.head()
Out[32]:
Summer
Country
Olympics Winter
Gold Silver Bronze Total Gold.1 Silver.1 Bronze.1
Olympics
Afghanistan
0 13 0 0 2 2 0 0 0 0
(AFG)
Algeria
1 12 5 2 8 15 3 0 0 0
(ALG)
Argentina
2 23 18 24 28 70 18 0 0 0
(ARG)
Armenia
3 5 1 2 9 12 6 0 0 0
(ARM)
Australasia
4 (ANZ) 2 3 4 5 12 0 0 0 0
[ANZ]
In [33]:
z_score=(df_olympics-df_olympics.mean()/df_olympics.std())
C:\Users\Shruti gohil\AppData\Local\Temp\ipykernel_10596\119135984.py:1: F
utureWarning: The default value of numeric_only in DataFrame.mean is depre
cated. In a future version, it will default to False. In addition, specify
ing 'numeric_only=None' is deprecated. Select only valid columns or specif
y the value of numeric_only to silence this warning.
z_score=(df_olympics-df_olympics.mean()/df_olympics.std())
C:\Users\Shruti gohil\AppData\Local\Temp\ipykernel_10596\119135984.py:1: F
utureWarning: The default value of numeric_only in DataFrame.std is deprec
ated. In a future version, it will default to False. In addition, specifyi
ng 'numeric_only=None' is deprecated. Select only valid columns or specify
the value of numeric_only to silence this warning.
z_score=(df_olympics-df_olympics.mean()/df_olympics.std())
localhost:8888/notebooks/Documents/21dce026/practical-1.ipynb# 8/11
7/19/23, 9:32 PM practical-1 - Jupyter Notebook
In [34]:
df_olympics.dtypes
Out[34]:
Country object
Summer Olympics int64
Gold int64
Silver int64
Bronze int64
Total int64
Winter Olympics int64
Gold.1 int64
Silver.1 int64
Bronze.1 int64
Total.1 int64
#Games int64
Gold.2 int64
Silver.2 int64
Bronze.2 int64
Combined total int64
dtype: object
In [35]:
to_drop=['Country']
df_new=df_olympics.drop(columns=to_drop,inplace=False)
In [36]:
z_score=(df_new-df_new.mean()/df_new.std())
In [37]:
outliers=df_new[(z_score>3).any(axis=1)]
localhost:8888/notebooks/Documents/21dce026/practical-1.ipynb# 9/11
7/19/23, 9:32 PM practical-1 - Jupyter Notebook
In [38]:
outliers
Out[38]:
Summe Winte
Gold Silver Bronze Total Gold.1 Silver.1 Bronze.1 Total.1 #Games Gold.
r r
Olympic Olympic
s s
0 13 0 0 2 2 0 0 0 0 0 13
1 12 5 2 8 15 3 0 0 0 0 15
2 23 18 24 28 70 18 0 0 0 0 41 1
3 5 1 2 9 12 6 0 0 0 0 11
4 2 3 4 5 12 0 0 0 0 0 2
... ... ... ... ... ... ... ... ... ... ... ... .
141 16 26 29 28 83 14 0 3 1 4 30 2
143 12 0 1 1 2 0 0 0 0 0 12
144 12 3 4 1 8 1 0 0 0 0 13
In [39]:
z_score
Out[39]:
Summer Winter
Gold Silver Bronze Total Gold.1
Olympics Olympics
localhost:8888/notebooks/Documents/21dce026/practical-1.ipynb# 10/11
7/19/23, 9:32 PM practical-1 - Jupyter Notebook
In [ ]:
In [ ]:
localhost:8888/notebooks/Documents/21dce026/practical-1.ipynb# 11/11