You are on page 1 of 1

In [ ]: # pandas is a software library written for the Python programming language for data manipulation and analysis.

# In particular, it offers data structures and operations


# for manipulating numerical tables and time series.
import pandas

In [ ]: # A DataFrame is two-dimensional, size-mutable, potentially heterogeneous tabular data.


# The data structure also contains labeled axes (rows and columns).
df = pandas.read_csv('gapminder.tsv', sep='\t')

In [ ]: df.head() # print the first 5 records

In [ ]: df.columns # print the columns name

In [ ]: df.index # print the indexes

In [ ]: df.values # print the values

In [ ]: type(df) # print the type of your data frame

In [ ]: # The shape attribute of pandas.DataFrame stores the number of rows and columns
# as a tuple (number of rows, number of columns).
df.shape

In [ ]: df.shape()

In [ ]: # The info() function is used to print a concise summary of a DataFrame.


df.info()

In [ ]: df['country'] # Read the specific column

In [ ]: # create a new Series that stores all the data from the 'country' column
country_df = df['country']

In [ ]: country_df.head() # print the first 5 records

In [ ]: # Series is a one-dimensional labeled array capable of


#holding data of any type (integer, string, float, python objects, etc.).
type(country_df) # print the type of your data frame

In [ ]: # create a new data frame that stores all the data from the following columns
subset = df[['country', 'continent', 'year']]

In [ ]: subset.head() # print the first 5 records

In [ ]: pandas.__version__ # print the version of your pandas

In [ ]: # loc is label-based, which means that you have to specify rows and columns based
# on their row and column labels
df.loc[2]

In [ ]: df.loc[[2, 0]]

In [ ]: # iloc is integer index based, so you have to specify rows and columns by their integer index
df.iloc[2]

In [ ]: df.head() # print the first 5 records

In [ ]: # The ix indexer was an early addition to the library that allowed flexible selection of rows and columns
# by either integer location or by label. DEPRECATED (removed in pandas 1.0; use loc or iloc instead)
# .ix no longer exists in pandas >= 1.0 (AttributeError); .loc is the
# label-based replacement, and .iloc should be used for positional access
df.loc[2]

In [ ]: # create a new data frame that stores all the observations / rows from the columns year and pop
subset = df.loc[:, ['year', 'pop']]

In [ ]: subset.head() # print the first 5 records

In [ ]: # display the year and pop columns for the rows where year is equal to 1967
df.loc[df['year'] == 1967, ['year', 'pop']]

In [ ]: # display the year and pop columns for the rows where year is equal to 1967 and pop > 1,000,000
df.loc[(df['year'] == 1967) & (df['pop'] > 1_000_000),
['year', 'pop']]

In [ ]: # display the rows using a for loop


# iterrows() yields (index label, row) pairs; the loop body must be
# indented or Python raises IndentationError
for index, row in df.iterrows():
    print(index, row)

In [ ]: # display the rows using a for loop, printing only a specified column
# iterrows() yields (index label, row) pairs; only the 'country' value of
# each row is printed. The loop body must be indented to be valid Python.
for index, row in df.iterrows():
    print(index, row['country'])

In [ ]: # The describe() method is used for calculating some statistical data like percentile,
# mean and std of the numerical values of the Series or DataFrame.
df.describe()

In [ ]: # display the data frame sorted in descending order by the column 'country'
df.sort_values('country', ascending = False)

In [ ]: # display the data frame using the two columns 'country' (A-Z) and 'pop' (High - Low)
# one sort direction per key: 'country' ascending, 'pop' descending
df.sort_values(['country', 'pop'], ascending=[True, False])

In [ ]: # adding a new column to your data frame


df['new_continent'] = df['continent']

In [ ]: df.head()

In [ ]: # dropping a specific column


df = df.drop(columns = ['new_continent'])
df.head()

In [ ]: # adding a new column to your data frame


df['new_continent'] = df.iloc[:,1]
df

In [ ]: # Rearranging data frame columns


df = df[['country','continent','new_continent','year','lifeExp','pop','gdpPercap']]
df

In [ ]: # create and save a new csv file


df.to_csv('modified.csv')

In [ ]: # create and save a new csv file without index


df.to_csv('modified.csv', index= False)

In [ ]: # create and save a new excel file without index


df.to_excel('modified.xlsx', index= False)

In [ ]: # create and save a new text file without index, separated by tabs
df.to_csv('modified.txt', index= False, sep = '\t')

In [ ]: # filter the data frame with country that contains 'Afg'


df.loc[df['country'].str.contains('Afg')]

In [ ]: # filter the data frame with country that does not contain 'Afg'
df.loc[~df['country'].str.contains('Afg')]

In [ ]: # filter the data frame using a regular expression


import re
df.loc[df['country'].str.contains('Afg|Alb', regex=True)]

In [ ]: # filter the data frame using a regular expression (case-insensitive via re.I)


import re
df.loc[df['country'].str.contains('afg|alb', flags = re.I, regex=True)]

In [ ]: # Changing the value of column new_continent with content of Asia to Asya


df.loc[df['new_continent'] == 'Asia', 'new_continent'] = 'Asya'
df

In [ ]: df.loc[df['pop'] > 1000000, ['new_continent','continent']] = 'Hello Word'


df

In [ ]: df.loc[df['pop'] > 1000000, ['new_continent','continent']] = ['Hello', 'World']


df

In [ ]: df = pandas.read_csv('modified.csv')

In [ ]: df

In [ ]: #Aggregate Statistics Using Groupby function


# numeric_only=True restricts the mean to numeric columns; string columns
# such as 'country' cannot be averaged and raise TypeError on pandas >= 2.0
df.groupby(['continent']).mean(numeric_only=True)

In [ ]: #Aggregate Statistics Using Groupby function


# numeric_only=True restricts the mean to numeric columns (string columns
# raise TypeError on pandas >= 2.0); continents are then ordered by mean
# population, largest first
df.groupby(['continent']).mean(numeric_only=True).sort_values('pop', ascending=False)

In [ ]: #Aggregate Statistics Using Groupby function


# numeric_only=True restricts the mean to numeric columns (string columns
# raise TypeError on pandas >= 2.0); continents are then ordered by mean
# gdpPercap, largest first
df.groupby(['continent']).mean(numeric_only=True).sort_values('gdpPercap', ascending=False)

In [ ]: #Aggregate Statistics Using Groupby function


df.groupby(['continent']).count()

In [ ]: df['count'] = 1
df

In [ ]: df.groupby(['continent']).count()['count']

In [ ]:

You might also like