WEBINTEL GUIDED LAB ACTIVITY Introduction To Pandas

In [ ]: # pandas is a software library written for the Python programming language for data manipulation and analysis.
# In particular, it offers data structures and operations

# for manipulating numerical tables and time series.
import pandas
In [ ]: # A DataFrame is two-dimensional, size-mutable, potentially heterogeneous tabular data.

# The data structure also contains labeled axes (rows and columns).
df = pandas.read_csv('gapminder.tsv', sep='\t')
In [ ]: df.head() # print the first 5 records
In [ ]: df.columns # print the columns name
In [ ]: df.index # print the indexes
In [ ]: df.values # print the values
In [ ]: type(df) # print the type of your data frame
In [ ]: # The shape attribute of pandas.DataFrame stores the number of rows and columns
# as a tuple (number of rows, number of columns).
df.shape
In [ ]: df.shape() # raises TypeError: shape is a tuple attribute, not a method -- use df.shape
In [ ]: # The info() function is used to print a concise summary of a DataFrame.

df.info()
In [ ]: df['country'] # Read the specific column
In [ ]: # create a new variable that stores all the data from the country column (a single column is a Series)
country_df = df['country']
In [ ]: country_df.head() # print the first 5 records
In [ ]: # Series is a one-dimensional labeled array capable of

#holding data of any type (integer, string, float, python objects, etc.).
type(country_df) # print the type of your data frame
In [ ]: # create a new data frame that stores only the following columns
subset = df[['country', 'continent', 'year']]
In [ ]: subset.head() # print the first 5 records
In [ ]: pandas.__version__ # print the version of your pandas
In [ ]: # loc is label-based, which means that you have to specify rows and columns based
# on their row and column labels
df.loc[2]
In [ ]: df.loc[[2, 0]]
In [ ]: # iloc is integer index based, so you have to specify rows and columns by their integer index
df.iloc[2]
In [ ]: df.head() # print the first 5 records
In [ ]: # ix indexer was an early addition to the library that allowed for flexibility selecting rows and columns
# by either integer location or by label. DEPRICATED
df.ix[2]
In [ ]: # create a new data frame that stores all the observations / rows from the columns year and pop
subset = df.loc[:, ['year', 'pop']]
In [ ]: subset.head() # print the first 5 records
In [ ]: # display the year and pop then locate the year which is equal to 1967
df.loc[df['year'] == 1967, ['year', 'pop']]
In [ ]: # display the year and pop then locate the year which is equal to 1967 and pop > 1,000,000
df.loc[(df['year'] == 1967) & (df['pop'] > 1_000_000),
['year', 'pop']]
In [ ]: # display the rows using for loop

for index, row in df.iterrows():
print(index, row)
In [ ]: # display the rows using for loop with specify column name
for index, row in df.iterrows():
print(index, row['country'])
In [ ]: # The describe() method is used for calculating some statistical data like percentile,
# mean and std of the numerical values of the Series or DataFrame.
df.describe()
In [ ]: # display the data frame sorted in descending order by the column 'country'
df.sort_values('country', ascending = False)
In [ ]: # display the data frame using the two columns 'country' (A-Z) and 'pop' (High - Low)
df.sort_values(['country','pop'], ascending = [1,0])
In [ ]: # adding a new column to your data frame

df['new_continent'] = df['continent']
In [ ]: df.head()
In [ ]: # dropping a specific column

df = df.drop(columns = ['new_continent'])
df.head()
In [ ]: # adding a new column to your data frame

df['new_continent'] = df.iloc[:,1]
df
In [ ]: # Rearranging data frame columns

df = df[['country','continent','new_continent','year','lifeExp','pop','gdpPercap']]
df
In [ ]: # create and save a new csv file

df.to_csv('modified.csv')
In [ ]: # create and save a new csv file without index

df.to_csv('modified.csv', index= False)
In [ ]: # create and save a new excel file without index

df.to_excel('modified.xlsx', index= False)
In [ ]: # create and save a new text file without index, separated by tabs
df.to_csv('modified.txt', index= False, sep = '\t')
In [ ]: # filter the data frame with country that contains 'Afg'

df.loc[df['country'].str.contains('Afg')]
In [ ]: # filter the data frame to countries that do not contain 'Afg'
df.loc[~df['country'].str.contains('Afg')]
In [ ]: # filter the data frame using a regular expression

import re
df.loc[df['country'].str.contains('Afg|Alb', regex=True)]
In [ ]: # filter the data frame using a case-insensitive regular expression (re.I)

import re
df.loc[df['country'].str.contains('afg|alb', flags = re.I, regex=True)]
In [ ]: # Changing the value of column new_continent with content of Asia to Asya

df.loc[df['new_continent'] == 'Asia', 'new_continent'] = 'Asya'
df
In [ ]: df.loc[df['pop'] > 1000000, ['new_continent','continent']] = 'Hello Word'

df
In [ ]: df.loc[df['pop'] > 1000000, ['new_continent','continent']] = ['Hello', 'World']

df
In [ ]: df = pandas.read_csv('modified.csv')
In [ ]: df
In [ ]: #Aggregate Statistics Using Groupby function

df.groupby(['continent']).mean()

df.groupby(['continent']).mean().sort_values('pop', ascending = False)

df.groupby(['continent']).mean().sort_values('gdpPercap', ascending = False)

df.groupby(['continent']).count()
In [ ]: df['count'] = 1
df
In [ ]: df.groupby(['continent']).count()['count']
In [ ]:

WEBINTEL GUIDED LAB ACTIVITY Introduction To Pandas

Uploaded by

Document Information

Original Description:

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

WEBINTEL GUIDED LAB ACTIVITY Introduction To Pandas

Uploaded by

Copyright:

Available Formats

In [ ]: # a software library written for the Python programming language for data manipulation and analysis.

# In particular, it offers data structures and operations

In [ ]: # Two-dimensional, size-mutable, potentially heterogeneous tabular data.

In [ ]: df.head() # print the first 5 records

In [ ]: df.columns # print the columns name

In [ ]: df.index # print the indexes

In [ ]: df.values # print the values

In [ ]: type(df) # print the type of your data frame

In [ ]: # The info() function is used to print a concise summary of a DataFrame.

In [ ]: df['country'] # Read the specific column

In [ ]: country_df.head() # print the first 5 records

In [ ]: # Series is a one-dimensional labeled array capable of

In [ ]: subset.head() # print the first 5 records

In [ ]: pandas.__version__ # print the version of your pandas

In [ ]: df.head() # print the first 5 records

In [ ]: subset.head() # print the first 5 records

In [ ]: # display the rows using for loop

In [ ]: # adding a new column to ur data frame

In [ ]: # dropping a specific columns

In [ ]: # adding a new column to ur data frame

In [ ]: # Rearrangin data frame columns

In [ ]: # create and save a new csv file

In [ ]: # create and save a new csv file without index

In [ ]: # create and save a new excel file without index

In [ ]: # filter the data frame with country that contains 'Afg'

In [ ]: # filter the data frame usign re

In [ ]: # filter the data frame usign re

In [ ]: # Changing the value of column new_continent with content of Asia to Asya

In [ ]: df.loc[df['pop'] > 1000000, ['new_continent','continent']] = 'Hello Word'

In [ ]: df.loc[df['pop'] > 1000000, ['new_continent','continent']] = ['Hello', 'World']

In [ ]: #Aggregate Statistics Using Groupby function

In [ ]: #Aggregate Statistics Using Groupby function

In [ ]: #Aggregate Statistics Using Groupby function

In [ ]: #Aggregate Statistics Using Groupby function

You might also like

In [ ]: pandas.version # print the version of your pandas