# You are on page 1 of 4  (PDF page marker; commented out so the script parses)

# ## Data Cleaning

# Standard imports

# In[2]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as pp
# IPython magic: render matplotlib figures inline in the notebook.
get_ipython().run_line_magic('matplotlib', 'inline')
# In[3]:
# billboard = pd.read_csv('billboard.csv')

# In Unix or macOS, we can easily find out the encoding with a command-line
# utility, `file`.  In Windows, you may have to open the file with an editor.

# In[4]:
get_ipython().system('file billboard.csv')

# In[5]:
# The file is not UTF-8 (hence the commented-out plain read above), so we
# pass the detected encoding explicitly.
billboard = pd.read_csv('billboard.csv', encoding='latin-1')
# In Pandas, we can use head to see the first few lines of a table.
# In[6]:
# Bare expression statement: in a notebook this displays the DataFrame.
billboard
# Let's look at the name of the columns.
# In[7]:
billboard.columns
# We can plot the evolution of the ranking for any given song.

# In[8]:
pp.plot(billboard.loc[0, 'x1st.week':'x76th.week'])

# In[9]:
pp.plot(billboard.loc[1, 'x1st.week':'x76th.week'])

# We're going to tell plot explicitly to use a range of one through 76.

# In[10]:
pp.plot(range(1, 77), billboard.loc[2, 'x1st.week':'x76th.week'],
        color='#880088', alpha=1.0)

# In[11]:
# Overlay every song's trajectory with translucent lines.  Iterating over
# the index generalizes the original hard-coded range(0, 317) to any number
# of rows (the index here is the default RangeIndex, so labels == positions).
for i in billboard.index:
    pp.plot(range(1, 77), billboard.loc[i, 'x1st.week':'x76th.week'],
            color='#880088', alpha=0.5)
# Plot several songs at once: we can iterate over rows with the DataFrame
# method iterrows, which yields both the index and the content of the row.

# In[12]:
for index, row in billboard.iterrows():
    pp.plot(range(1, 77), row['x1st.week':'x76th.week'],
            color='#ff00ff', alpha=0.1)
# One problem with this DataFrame is that the rankings are not very usable
# for analysis because they're divided over multiple columns.
#
# It would make more sense to have each ranking in a separate row with the
# week number as a variable.
#
# Technically, this is called melting each row into multiple ones, each of
# which represents a single ranking.

# In[13]:
# Keep the identifying columns plus the first three weekly rankings.
bshort = billboard[['artist.inverted', 'track', 'time', 'date.entered',
                    'x1st.week', 'x2nd.week', 'x3rd.week']]

# In[14]:
bshort.head()

# We should also rename the columns to better, simpler, and more consistent
# variable names.  We can do that by assigning directly to the columns
# attribute.

# In[15]:
bshort.columns = ['artist', 'track', 'time', 'date.entered', 'wk1', 'wk2', 'wk3']
bshort.head()
# We get to melting using the Pandas melt method.
#
# Melt requires that we specify which should be the identifier variables
# that are repeated for several rows.  In our case, we need artist, track,
# time, and date.entered.
#
# Next we tell melt which should be the values, or observation columns.
# In this case, week one, week two, and week three.
#
# Next, what should be the name of the column that holds the type of
# observation: it would be week.
#
# And what should be the name of the column that holds the value of the
# observation: it should be rank.

# In[16]:
# Keyword arguments make the role of each melt() argument explicit.
bmelt = bshort.melt(id_vars=['artist', 'track', 'time', 'date.entered'],
                    value_vars=['wk1', 'wk2', 'wk3'],
                    var_name='week', value_name='rank')
bmelt.head()
# To see that this works, it's better to select a specific song.
#
# Boolean-mask indexing selects all rows for one song; it is equivalent to
# the Pandas query method with a string expression.

# In[17]:
bmelt[bmelt['track'] == "Independent Women Part I"]

# In[18]:
bmelt[bmelt['artist'] == "Savage Garden"]
# Converting the week to a number:
#
# We can do this using the apply method on the Pandas series for the column
# week.  Apply takes a Python function, and we define it on-the-fly using
# lambda: take the characters after the 'wk' prefix and turn them into an
# integer.

# In[19]:
# int(s[2:]) generalizes the original int(s[2]) so that labels with more
# than one digit (e.g. 'wk10') would also parse; for 'wk1'..'wk3' the
# result is identical.
bmelt['week'] = bmelt['week'].apply(lambda s: int(s[2:]))

# In[20]:
bmelt.head()
# Once we have true dates, we can do date arithmetic and apply, for
# instance, Timedeltas, obtaining the correct date for each ranking in
# week one, two, and three.

# In[21]:
bmelt['date.entered'] = pd.to_datetime(bmelt['date.entered'])

# In[22]:
bmelt['date.entered'][0]

# In[23]:
bmelt['date.entered'][0] + pd.Timedelta('7 days')

# In[24]:
# date = date.entered + 7 days * (week - 1): week 1 keeps the entry date.
bmelt['date'] = bmelt['date.entered'] + pd.Timedelta('7 days') * (bmelt['week'] - 1)

# In[25]:
bmelt.head()

# In[26]:
# week is an integer column after the conversion above, so compare against
# the number 2; the original 'week == "2"' compared int to str and selected
# no rows.
bmelt.query('week == 2')
# At this point, we may as well drop the column date.entered.  We need to
# tell Pandas that we're working with the columns, so axis 1.

# In[27]:
bmelt.drop(['date.entered'], axis=1, inplace=True)

# We sort both the columns and the rows: artist, track, time, date, week,
# and rank.

# In[28]:
# Take an explicit copy before the in-place sort; sorting a column-slice
# with inplace=True raises pandas' SettingWithCopyWarning.
bfinal = bmelt[['artist', 'track', 'time', 'date', 'week', 'rank']].copy()
bfinal.sort_values(['artist', 'track'], inplace=True)

# In[29]:
bfinal.head()
# The artist name, the track name, and the track length appear in multiple
# rows.  The correct way to remove the redundancy is to create a separate
# table with track data and link it to the rankings using an index.
#
# In the context of relational databases, this is called data normalization.

# In[72]:
tracks = bfinal[['artist', 'track', 'time']].drop_duplicates()
# NOTE(review): head() keeps only the first five distinct tracks —
# presumably to keep the demo output small, but it also limits the later
# merges to those five songs.  Confirm this truncation is intentional.
tracks = tracks.head()
print(tracks)
print('')
print(tracks.loc[tracks.track == 'Loser'])

# In[31]:
tracks.info()
# The index is unique, and we need to carry it explicitly as a column.  So
# we reset the index and rename it id.  We can assign directly to
# index.name.

# In[32]:
tracks.index.name = 'id'
tracksid = tracks.reset_index()
tracksid.head()
# We now perform a database-style join operation between the new table of
# tracks and the Billboard table.  We do so by matching by artist and track.

# In[33]:
pd.merge(tracksid, bfinal, on=['track', 'artist']).head()

# In[34]:
# Matching on time as well keeps it as a single join key instead of
# producing suffixed duplicate columns.
pd.merge(tracksid, bfinal, on=['track', 'artist', 'time']).head()

# In[35]:
# The normalized ranking table: track attributes live in tracksid, linked
# through the id column, so they can be dropped here.
tidy = pd.merge(tracksid, bfinal,
                on=['track', 'artist', 'time']).drop(['artist', 'track', 'time'], axis=1)
tidy.head()
# In[36]:
# Two equivalent row selections: a query string and a boolean mask.
tidy.query('week == 1')

# In[37]:
tidy[tidy.week == 1]

# In[38]:
# Row label of the best (lowest) rank among the week-one entries.
week_one = tidy[tidy.week == 1]
week_one['rank'].idxmin()

# In[39]:
# The full ranking row for that label.
tidy.loc[week_one['rank'].idxmin()]

# In[40]:
tracksid[tracksid.id == 1]

# In[41]:
# Look up the track whose id matches the week-one number-one ranking.
best_id = tidy.loc[week_one['rank'].idxmin()]['id']
tracksid[tracksid.id == best_id]

# In[42]:
tracksid.info()

# You might also like  (trailing PDF artifact; commented out so the script parses)