You are on page 1of 11

7/19/23, 9:32 PM practical-1 - Jupyter Notebook

In [1]:
import pandas as pd
df=pd.read_csv("dataset.csv")

In [2]:
import pandas as pd

In [3]:

df.head(7)

Out[3]: 

Editio Place Date of Author Contributors


Identifier Publisher Title
n of Publication Corporate
Statemen Publicatio Author
t n
Walte
S. Tinsley r Forbes. [A
1879 [1878] FORBES,
0 206 NaN London A. A.
& Co. novel.] By NaN
A. Walter.
A
All for Greed. BLAZE DE
London; A., A. BURY, Marie
Virtue [A novel. The
1 216 NaN Virtue 1868 & NaN
Co dedication
& A. Pauline Rose
. signed... - Baroness
Yorston
Love the BLAZE DE
Bradbury, A., A. BURY, Marie
Avenger. By
2 218 NaN London 1869 Evans & NaN
the author of
Co A. Pauline Rose
“All for Gr... 
. - Baroness
Welsh
 

In [4]:
df.shape

Out[4]:

(8287, 15)

In [5]:
to_drop=['Corporate Author','Corporate Contributors','Former owner','Engraver','Edition df.drop(co

localhost:8888/notebooks/Documents/21dce026/practical-1.ipynb# 1/11
7/19/23, 9:32 PM practical-1 - Jupyter Notebook

In [6]:
df.head()

Out[6]:

Place of
Identifier Issua
Publication Date of Publisher Title Author Contributors
Publication t

Walter
S. Tinsley
0 206 London 1879 [1878] Forbes. [A FORBES, monogra
& Co. novel.] By A. A. A. Walter.
A

London; All for Greed. BLAZE DE


1 216 Virtue & [A novel. The A., A. BURY, Marie
Virtue & 1868 Co. monogra
dedication A. Pauline Rose
Yorston
signed... - Baroness

Bradbury, Love the BLAZE DE


2 218 London 1869 Avenger. By A., A. BURY, Marie
Evans & monogra
the author of A. Pauline Rose
Co.
“All for Gr... - Baroness

Welsh
Sketches, Appleyard,
3 472 London 1851 James A., E.
chiefly Ernest monogra
Darling S.
ecclesiastical, Silvanus.
to the...

[The World in
Wertheim
4 480 London 1857 which I live, A., E. BROOME,
& and my place monogra
S. John Henry.
Macintosh in it...



In [7]:

df['Identifier'].is_unique

Out[7]:
True

In [8]:

df.set_index('Identifier',inplace=True)
df.loc[206]

Out[8]:
Place of Publication London
Date of Publication 1879 [1878]
Publisher S. Tinsley & Co.
Title Walter Forbes. [A novel.] By A. A
Author A. A.
Contributors FORBES, Walter.
Issuance type monographic
Flickr URL http://www.flickr.com/photos/britishlibrary/ta...
(http://www.flickr.com/photos/britishlibrary/ta...)
Shelfmarks British Library HMNTS 12641.b.30.
Name: 206, dtype: object

localhost:8888/notebooks/Documents/21dce026/practical-1.ipynb# 2/11
7/19/23, 9:32 PM practical-1 - Jupyter Notebook

In [9]:
df.dtypes

Out[9]:

Place of Publication object


Date of Publication object
Publisher object
Title object
Author object
Contributors object
Issuance type object
Flickr URL object
Shelfmarks object
dtype: object

In [10]:
df.dtypes

Out[10]:

Place of Publication object


Date of Publication object
Publisher object
Title object
Author object
Contributors object
Issuance type object
Flickr URL object
Shelfmarks object
dtype: object

In [11]:
regex=r'^(\d{4})'
date_num=df['Date of Publication'].str.extract(regex,expand=False)

In [12]:
date_num.head()

Out[12]:

Identifier
206 1879
216 1868
218 1869
472 1851
480 1857
Name: Date of Publication, dtype: object

localhost:8888/notebooks/Documents/21dce026/practical-1.ipynb# 3/11
7/19/23, 9:32 PM practical-1 - Jupyter Notebook

In [13]:
df['Date of Publication']=pd.to_numeric(date_num,downcast='integer') df['Date of Publication'].dty

Out[13]:

dtype('float64')

In [14]:
df['Date of Publication'].isnull().sum()

Out[14]:

977

In [15]:
df['Date of Publication'].isnull().sum()/len(df)

Out[15]:

0.11789549897429709

In [16]:
df=df[df['Date of Publication'].isnull()!=True] df.head()

Out[16]:

Place of Date of Issuanc


Publisher Title Author Contributors
Publication Publication typ

Identifier

Walter
S. Tinsley Forbes. [A FORBES,
206 London 1879.0 & Co. novel.] By A. A. A. Walter. monographi
A

London; All for Greed. BLAZE DE


216 Virtue & Virtue & [A novel. The A., A. BURY, Marie
1868.0 Co. dedication A. Pauline Rose monographi
Yorston signed... - Baroness

Love the BLAZE DE


Bradbury,
Avenger. By A., A. BURY, Marie
218 London 1869.0 Evans & monographi
the author of A. Pauline Rose
Co.
“All for Gr... - Baroness

Welsh
Sketches, Appleyard,
James A., E.
472 London 1851.0 chiefly Ernest monographi
Darling S.
ecclesiastical, Silvanus.
to the...

[The World in
Wertheim which I live, A., E. BROOME,
480 London 1857.0 & and my place monographi
S. John Henry.
Macintosh in it...



localhost:8888/notebooks/Documents/21dce026/practical-1.ipynb# 4/11
7/19/23, 9:32 PM practical-1 - Jupyter Notebook

In [17]:
df['Date of Publication'].isnull().sum()

Out[17]:

In [18]:
book_index=[4157862,4159587] df.loc[book_index]

Out[18]:

Place of Date of Issu


Publication Publication Publisher Title Author Contributors

Identifier

Local FORDYCE, SYKES,


Newcastle- Records; or, T. - Printer, John -
4157862 1867.0 T. Fordyce Historical of Bookseller, of monogr
upon-Tyne
Register of Newcastle- Newcastle-
rema... upon-Tyne upon-Tyne

An historical,
topographical
Newcastle Mackenzie Mackenzie, ROSS, M. - of
4159587 upon Tyne 1834.0 and Durham monogr
& Dent E. (Eneas)
descriptive
v...



In [19]:
import numpy as np

In [20]:

['Place of Publication']=np.where(df['Place of Publication'].str.contains('Newcastle'),'N


 

In [ ]:

localhost:8888/notebooks/Documents/21dce026/practical-1.ipynb# 5/11
7/19/23, 9:32 PM practical-1 - Jupyter Notebook

In [21]:

null_values=df.isnull().sum()
print(null_values)

Place of Publication 0
Date of Publication 0
Publisher 3736
Title 0
Author 1534
Contributors 0
Issuance type 0
Flickr URL 0
Shelfmarks 0
dtype: int64

In [22]:
df["Author"]=df["Author"].str.strip().str.title()

In [23]:
df["Author"].fillna("Unknown",inplace=True)

In [24]:

print(df.isnull().sum())

Place of Publication 0
Date of Publication 0
Publisher 3736
Title 0
Author 0
Contributors 0
Issuance type 0
Flickr URL 0
Shelfmarks 0
dtype: int64

In [25]:
df["Publisher"]=df["Publisher"].str.strip().str.title()

In [26]:
df["Publisher"].fillna(df["Place of Publication"],inplace=True)

In [27]:
df["Date of Publication"].fillna("NA",inplace=True)

localhost:8888/notebooks/Documents/21dce026/practical-1.ipynb# 6/11
7/19/23, 9:32 PM practical-1 - Jupyter Notebook

In [28]:
print(df.isnull().sum())

Place of Publication 0
Date of Publication 0
Publisher 0
Title 0
Author 0
Contributors 0
Issuance type 0
Flickr URL 0
Shelfmarks 0
dtype: int64

In [29]:
df_olympics=pd.read_csv("olympics.csv",header=1)

In [30]:
print(df_olympics.isnull().sum())

Unnamed: 0 0
? Summer 0
01 ! 0
02 ! 0
03 ! 0
Total 0
? Winter 0
01 !.1 0
02 !.1 0
03 !.1 0
Total.1 0
? Games 0
01 !.2 0
02 !.2 0
03 !.2 0
Combined total 0
dtype: int64

localhost:8888/notebooks/Documents/21dce026/practical-1.ipynb# 7/11
7/19/23, 9:32 PM practical-1 - Jupyter Notebook

In [31]:
new_col={
'Unnamed: 0':'Country',
'? Summer':'Summer Olympics', '01 !':'Gold',
'02 !': 'Silver',
'03 !':'Bronze',
'? Winter':'Winter Olympics', '01 !.1':'Gold.1',
'02 !.1':'Silver.1',
'03 !.1':'Bronze.1',
'01 !.2':'Gold.2',
'02 !.2':'Silver.2',
'03 !.2':'Bronze.2',
'? Games':'#Games'
}
df_olympics.rename(columns=new_col,inplace=True)

In [32]:
df_olympics.head()

Out[32]:

Summer
Country
Olympics Winter
Gold Silver Bronze Total Gold.1 Silver.1 Bronze.1
Olympics
Afghanistan
0 13 0 0 2 2 0 0 0 0
(AFG)
Algeria
1 12 5 2 8 15 3 0 0 0
(ALG)
Argentina
2 23 18 24 28 70 18 0 0 0
(ARG)
Armenia
3 5 1 2 9 12 6 0 0 0
(ARM)

Australasia
4 (ANZ) 2 3 4 5 12 0 0 0 0
[ANZ]

 

In [33]:
z_score=(df_olympics-df_olympics.mean()/df_olympics.std())

C:\Users\Shruti gohil\AppData\Local\Temp\ipykernel_10596\119135984.py:1: F
utureWarning: The default value of numeric_only in DataFrame.mean is depre
cated. In a future version, it will default to False. In addition, specify
ing 'numeric_only=None' is deprecated. Select only valid columns or specif
y the value of numeric_only to silence this warning.
z_score=(df_olympics-df_olympics.mean()/df_olympics.std())
C:\Users\Shruti gohil\AppData\Local\Temp\ipykernel_10596\119135984.py:1: F
utureWarning: The default value of numeric_only in DataFrame.std is deprec
ated. In a future version, it will default to False. In addition, specifyi
ng 'numeric_only=None' is deprecated. Select only valid columns or specify
the value of numeric_only to silence this warning.
z_score=(df_olympics-df_olympics.mean()/df_olympics.std())

localhost:8888/notebooks/Documents/21dce026/practical-1.ipynb# 8/11
7/19/23, 9:32 PM practical-1 - Jupyter Notebook

In [34]:
df_olympics.dtypes

Out[34]:

Country object
Summer Olympics int64
Gold int64
Silver int64
Bronze int64
Total int64
Winter Olympics int64
Gold.1 int64
Silver.1 int64
Bronze.1 int64
Total.1 int64
#Games int64
Gold.2 int64
Silver.2 int64
Bronze.2 int64
Combined total int64
dtype: object

In [35]:
to_drop=['Country']
df_new=df_olympics.drop(columns=to_drop,inplace=False)

In [36]:
z_score=(df_new-df_new.mean()/df_new.std())

In [37]:
outliers=df_new[(z_score>3).any(axis=1)]

localhost:8888/notebooks/Documents/21dce026/practical-1.ipynb# 9/11
7/19/23, 9:32 PM practical-1 - Jupyter Notebook

In [38]:

outliers

Out[38]: 

Summe Winte
Gold Silver Bronze Total Gold.1 Silver.1 Bronze.1 Total.1 #Games Gold.
r r
Olympic Olympic
s s
0 13 0 0 2 2 0 0 0 0 0 13

1 12 5 2 8 15 3 0 0 0 0 15

2 23 18 24 28 70 18 0 0 0 0 41 1

3 5 1 2 9 12 6 0 0 0 0 11

4 2 3 4 5 12 0 0 0 0 0 2

... ... ... ... ... ... ... ... ... ... ... ... .

141 16 26 29 28 83 14 0 3 1 4 30 2

143 12 0 1 1 2 0 0 0 0 0 12 
144 12 3 4 1 8 1 0 0 0 0 13
 

In [39]:

z_score

Out[39]:

Summer Winter
Gold Silver Bronze Total Gold.1
Olympics Olympics

0 11.094527 -0.161333 -0.162696 1.836615 1.837416 -0.901455 -0.161482

1 10.094527 4.838667 1.837304 7.836615 14.837416 2.098545 -0.161482

2 21.094527 17.838667 23.837304 27.836615 69.837416 17.098545 -0.161482

3 3.094527 0.838667 1.837304 8.836615 11.837416 5.098545 -0.161482

4 0.094527 2.838667 3.837304 4.836615 11.837416 -0.901455 -0.161482

... ... ... ... ... ... ... ...

142 -0.905473 -0.161333 0.837304 1.836615 2.837416 -0.901455 -0.161482

143 10.094527 -0.161333 0.837304 0.836615 1.837416 -0.901455 -0.161482

144 10.094527 2.838667 3.837304 0.836615 7.837416 0.098545 -0.161482

145 1.094527 7.838667 4.837304 3.836615 16.837416 -0.901455 -0.161482

146 25.094527 4808.838667 4774.837304 5129.836615 14713.837416 21.098545 958.838518

147 rows × 15 columns




localhost:8888/notebooks/Documents/21dce026/practical-1.ipynb# 10/11
7/19/23, 9:32 PM practical-1 - Jupyter Notebook

In [ ]:

In [ ]:

localhost:8888/notebooks/Documents/21dce026/practical-1.ipynb# 11/11

You might also like