Professional Documents
Culture Documents
Lab Assignment 3
Lab Assignment 3
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
In [2]:
df = pd.read_csv(r'https://raw.githubusercontent.com/realpython/python-data-cleaning/master
# Print the frame's class, column dtypes and non-null counts.
df.info()
<class 'pandas.core.frame.DataFrame'>
In [3]:
# Descriptive statistics for the DataFrame's columns.
df.describe()
Out[3]:
In [4]:
# Preview the first rows of the DataFrame.
df.head()
Out[4]:
Walter
S. Tinsley Forbes. [A FORBES,
0 206 NaN London 1879 [1878] A. A.
& Co. novel.] By A. Walter.
A
Welsh
Sketches, Appleyard,
James A., E.
3 472 NaN London 1851 chiefly Ernest
Darling S.
ecclesiastical, Silvanus.
to the...
In [5]:
# Preview the last rows of the DataFrame.
df.tail()
Out[5]:
The History
GLOVER,
M. Mozley and Gazetteer
8283 4158128 NaN Derby 1831, 32 Stephen - NO
& Son of the County
of Derby
of Der...
LYSONS,
Magna
Daniel -
T. Cadell Britannia;
M.A.,
8284 4159563 NaN London [1806]-22 and W. being a Matth
F.R.S., and
Davies concise S
LYSONS
topographical...
(Sam...
An historical,
Newcastle Mackenzie topographical Mackenzie,
8285 4159587 NaN 1834
upon Tyne & Dent and descriptive E. (Eneas)
v...
Collectanea
Topographica
8286 4160339 NaN London 1834-43 NaN et NaN Bul
Genealogica.
[Firs...
In [6]:
# Method 1
df.drop(columns_with_nan, inplace = True, axis = 1) # axis = 1 (or axis = 'columns') is ver
# when inplace = True => the data is modified in place, it will return nothing and the dataf
# when inplace = False => new dataframe is created
# Method 2
# df d ( l l ith i l T ) # d l i l
In [7]:
True
In [8]:
Place of Date of
Publisher Title Author
Publication Publication
Identifier
Walter Forbes.
S. Tinsley
206 London 1879 [1878] [A novel.] By A. A. A. http://www.flickr.com/photo
& Co.
A
Love the
Bradbury,
Avenger. By
218 London 1869 Evans & A., A. A. http://www.flickr.com/photo
the author of
Co.
“All for Gr...
Welsh
Sketches,
James
472 London 1851 chiefly A., E. S. http://www.flickr.com/photo
Darling
ecclesiastical,
to the...
[The World in
Wertheim
which I live,
480 London 1857 & A., E. S. http://www.flickr.com/photo
and my place
Macintosh
in it...
The History
GLOVER,
M. Mozley and Gazetteer
4158128 Derby 1831, 32 Stephen - http://www.flickr.com/photo
& Son of the County
of Derby
of Der...
LYSONS,
Magna
Daniel -
T. Cadell Britannia;
M.A.,
4159563 London [1806]-22 and W. being a http://www.flickr.com/photo
F.R.S., and
Davies concise
LYSONS
topographical...
(Sam...
An historical,
Newcastle Mackenzie topographical Mackenzie,
4159587 1834 http://www.flickr.com/photo
upon Tyne & Dent and descriptive E. (Eneas)
v...
Collectanea
Topographica
4160339 London 1834-43 NaN et NaN http://www.flickr.com/photo
Genealogica.
[Firs...
In [9]:
object 6
dtype: int64
In [10]:
Identifier
1905 1888
2836 1897
2854 1865
2956 1860-63
...
4158088 1838
4158128 1831, 32
4159563 [1806]-22
4159587 1834
4160339 1834-43
In [11]:
# r'^(\d{4})' => regex captures four digits at the very beginning of a string.
# Entries that do not start with four digits (e.g. '[1806]-22') produce NaN.
# expand = False returns a Series instead of a one-column DataFrame.
extr = df['Date of Publication'].str.extract(r'^(\d{4})', expand = False)
extr
Out[11]:
Identifier
206 1879
216 1868
218 1869
472 1851
480 1857
...
4158088 1838
4158128 1831
4159563 NaN
4159587 1834
4160339 1834
In [12]:
# Convert the extracted year strings to numbers; missing years become NumPy's NaN,
# which is why the resulting column dtype is float rather than int.
df['Date of Publication'] = pd.to_numeric(extr)
df['Date of Publication'].dtype
Out[12]:
dtype('float64')
In [13]:
# Next we clean 'Place of Publication', since this column holds string objects.
df['Place of Publication']
Out[13]:
Identifier
206 London
218 London
472 London
480 London
...
4158088 London
4158128 Derby
4159563 London
4160339 London
In [14]:
# Inspect the single record whose Identifier index label is 4157862.
df.loc[4157862]
Out[14]:
Publisher T. Fordyce
In [15]:
df.loc[4159587]
# t th t th h l i t h h h i th f th l hil th
Out[15]:
In [16]:
In [17]:
Identifier
206 London
216 London
218 London
472 London
480 London
...
4158088 London
4158128 Derby
4159563 London
4160339 London
In [18]:
# t t h th '[ dit]' b t i i th hi h b t k d t f
Out[18]:
['Alabama[edit]\n',
'Alaska[edit]\n',
'Arizona[edit]\n',
'Arkansas[edit]\n',
In [19]:
i it t [1 10]
Out[19]:
('Alabama[edit]\n',
In [20]:
In [21]:
def get_citystate(item):
    """Strip a trailing annotation from a town/state string.

    The text is cut at the first '(' if one is present; otherwise at the
    first '['; otherwise it is returned unchanged. Characters before the
    delimiter (including any whitespace directly in front of it) are kept
    as they are.

    Parameters
    ----------
    item : object
        A single cell value. Non-string values (e.g. NaN) are passed
        through untouched so the function is safe to use element-wise.

    Returns
    -------
    object
        The truncated string, or ``item`` itself when there is nothing to cut.
    """
    # Element-wise application may hand us missing values; `in` would raise on them.
    if not isinstance(item, str):
        return item
    if '(' in item:
        return item[:item.find('(')]
    elif '[' in item:
        return item[:item.find('[')]
    else:
        return item
In [22]:
# Apply get_citystate to every cell of the frame (element-wise), then preview the result.
# NOTE(review): newer pandas releases deprecate DataFrame.applymap in favour of
# DataFrame.map — confirm the installed version before switching.
towns_df = towns_df.applymap(get_citystate)
towns_df.head()
Out[22]:
0 Alabama Auburn
1 Alabama Florence
2 Alabama Jacksonville
3 Alabama Livingston
4 Alabama Montevallo
In [23]:
olympics_df = pd.read_csv(r'https://raw.githubusercontent.com/realpython/python-data-cleani
# Display the raw frame to inspect its header rows.
olympics_df
Out[23]:
0 1 2 3 4 5 6 7 8 9 10 11 12
? ? ?
0 NaN 01 ! 02 ! 03 ! Total 01 ! 02 ! 03 ! Total 01
Summer Winter Games
Afghanistan
1 13 0 0 2 2 0 0 0 0 0 13 0
(AFG)
Algeria
2 12 5 2 8 15 3 0 0 0 0 15 5
(ALG)
Argentina
3 23 18 24 28 70 18 0 0 0 0 41 18
(ARG)
Armenia
4 5 1 2 9 12 6 0 0 0 0 11 1
(ARM)
... ... ... ... ... ... ... ... ... ... ... ... ... ..
Independent
Olympic
143 1 0 1 2 3 0 0 0 0 0 1 0
Participants
(IOP) [IOP]
Zambia
144 (ZAM) 12 0 1 1 2 0 0 0 0 0 12 0
[ZAM]
Zimbabwe
145 12 3 4 1 8 1 0 0 0 0 13 3
(ZIM) [ZIM]
Mixed team
146 3 8 5 4 17 0 0 0 0 0 3 8
(ZZX) [ZZX]
147 Totals 27 4809 4775 5130 14714 22 959 958 948 2865 49 5768
In [24]:
# if we were to go to the source of this dataset, we’d see that NaN above should really be
# like “Country”, ? Summer is supposed to represent “Summer Games”, 01 ! should be “Gold”,
# skipping one row and setting the header as the first (0-indexed) row
olympics_df = pd.read_csv(r'https://raw.githubusercontent.com/realpython/python-data-cleani
header = 1)
# Display the frame again to check the column names picked up from the header row.
olympics_df
Out[24]:
Unnamed: ? ? 01 02 03 ?
01 ! 02 ! 03 ! Total Total.1
0 Summer Winter !.1 !.1 !.1 Games
Afghanistan
0 13 0 0 2 2 0 0 0 0 0 13
(AFG)
Algeria
1 12 5 2 8 15 3 0 0 0 0 15
(ALG)
Argentina
2 23 18 24 28 70 18 0 0 0 0 41
(ARG)
Armenia
3 5 1 2 9 12 6 0 0 0 0 11
(ARM)
Australasia
4 2 3 4 5 12 0 0 0 0 0 2
(ANZ) [ANZ]
... ... ... ... ... ... ... ... ... ... ... ... ...
Independent
Olympic
142 1 0 1 2 3 0 0 0 0 0 1
Participants
(IOP) [IOP]
Zambia
143 (ZAM) 12 0 1 1 2 0 0 0 0 0 12
[ZAM]
Zimbabwe
144 12 3 4 1 8 1 0 0 0 0 13
(ZIM) [ZIM]
Mixed team
145 3 8 5 4 17 0 0 0 0 0 3
(ZZX) [ZZX]
146 Totals 27 4809 4775 5130 14714 22 959 958 948 2865 49 5
In [25]:
# Dictionary that maps the current column names (keys) to more usable ones (values).
# The '.1' / '.2' suffixes mirror the suffixes on the repeated medal columns
# ('01 !', '01 !.1', '01 !.2', ...) in the loaded frame.
new_names = {'Unnamed: 0': 'Country',
             '? Summer': 'Summer Olympics',
             '01 !': 'Gold',
             '02 !': 'Silver',
             '03 !': 'Bronze',
             '? Winter': 'Winter Olympics',
             '01 !.1': 'Gold.1',
             '02 !.1': 'Silver.1',
             '03 !.1': 'Bronze.1',
             '? Games': '# Games',
             '01 !.2': 'Gold.2',
             '02 !.2': 'Silver.2',
             '03 !.2': 'Bronze.2'}
In [26]:
Summer Winter
Country Gold Silver Bronze Total Gold.1 Silver.1 Bronze.1 Tot
Olympics Olympics
Afghanistan
0 13 0 0 2 2 0 0 0 0
(AFG)
Algeria
1 12 5 2 8 15 3 0 0 0
(ALG)
Argentina
2 23 18 24 28 70 18 0 0 0
(ARG)
Armenia
3 5 1 2 9 12 6 0 0 0
(ARM)
Australasia
4 (ANZ) 2 3 4 5 12 0 0 0 0
[ANZ]
Seaborn Practice
In [27]:
In [28]:
Content
App Category Rating Reviews Size Installs Type Price
Rating
Photo
Editor &
Candy
0 ART_AND_DESIGN 4.1 159 19M 10,000+ Free 0 Everyone
Camera &
Grid &
ScrapBook
Coloring
1 book ART_AND_DESIGN 3.9 967 14M 500,000+ Free 0 Everyone
moana
U
Launcher
Lite –
2 FREE Live ART_AND_DESIGN 4.7 87510 8.7M 5,000,000+ Free 0 Everyone
Cool
Themes,
Hide ...
Sketch -
3 Draw & ART_AND_DESIGN 4.5 215644 25M 50,000,000+ Free 0 Teen
Paint
Pixel Draw
- Number
4 Art ART_AND_DESIGN 4.3 967 2.8M 100,000+ Free 0 Everyone
D
Coloring
Book
1. Displot
In [29]:
# Histogram of app ratings (20 bins) with a KDE curve overlaid.
# A kernel density estimate (KDE) plot is a method for visualizing
# the distribution of observations in a dataset.
sns.displot(pstore.Rating, bins = 20, kde = True)
plt.title('Distribution of app ratings')
plt.show()
In [30]:
# sns.set_style('darkgrid')
# sns.set_context('paper')
In [31]:
Everyone 8714
Teen 1208
Unrated 2
In [32]:
Everyone 8714
Teen 1208
In [33]:
# Pie chart of how many apps fall into each 'Content Rating' category.
pstore['Content Rating'].value_counts().plot.pie()
plt.legend()
plt.show()
In [34]:
# Horizontal bar chart of how many apps fall into each 'Content Rating' category.
# sns.set_style(style = None)
# sns.set_context(context = None)
pstore['Content Rating'].value_counts().plot.barh()
plt.show()
3. Scatter Plot
In [35]:
4. Pair Plot
In [36]:
In [37]:
5. Heat maps
In [38]: