
KRISHNA KANT PANDEY

19BBTCS069

B.Tech CSE, VII A

School of Engineering & Technology

Data Analysis using Python (DAP)

4BCS702
*DAP Lab Exp.No.1*

Python Data Structures: Develop an application that uses different python data structures

1. Dictionaries
In [ ]:

# Creating a Dictionary

# Empty Dict can be created using

Roles = {}

# Dictionary with Elements can be created using

Roles = {"Owner":1, "Admin":2, "Manager":3, "Moderator":4}

print(Roles)

{'Owner': 1, 'Admin': 2, 'Manager': 3, 'Moderator': 4}

In [ ]:

#Displaying Keys

Roles.keys()

Out[ ]:

dict_keys(['Owner', 'Admin', 'Manager', 'Moderator'])

In [ ]:

#Displaying Values

Roles.values()

Out[ ]:

dict_values([1, 2, 3, 4])

In [ ]:

#Displaying Dictionary Items

Roles.items()

Out[ ]:

dict_items([('Owner', 1), ('Admin', 2), ('Manager', 3), ('Moderator', 4)])

In [ ]:

#Updating Values using Key

Roles['Moderator'] = 5

print(Roles)

{'Owner': 1, 'Admin': 2, 'Manager': 3, 'Moderator': 5}

In [ ]:

#Updating Values using update() function

Roles.update({'Moderator':4})

print(Roles)

{'Owner': 1, 'Admin': 2, 'Manager': 3, 'Moderator': 4}

In [ ]:

#Deleting an item from Dict

del Roles['Manager'] # Or We May use Roles.pop('Manager')

print(Roles)

{'Owner': 1, 'Admin': 2, 'Moderator': 4}

In [ ]:

#Emptying all items in the dictionary using clear()

Roles.clear()

print(Roles)

{}
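Two other handy dictionary operations, not exercised above, are get() for lookups with a default and looping over items(). A minimal sketch, reusing the Roles dictionary (rebuilt here, since clear() emptied it):

# Rebuild the dictionary emptied by clear() above
Roles = {"Owner": 1, "Admin": 2, "Manager": 3, "Moderator": 4}

# get() returns a default instead of raising KeyError for a missing key
print(Roles.get("Admin"))       # 2
print(Roles.get("Guest", 0))    # 0, because "Guest" is not a key

# Iterating over key/value pairs
for role, level in Roles.items():
    print(role, level)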

2. Lists
In [ ]:

#Creating Lists

# empty list

my_list = []

print(my_list)

# list of integers

my_list = [1, 2, 3]

print(my_list)

# list with mixed data types

my_list = [1, "Hello", 3.4]

print(my_list)

# nested list

my_list = ["mouse", [8, 4, 6], ['a']]

print(my_list)

[]

[1, 2, 3]

[1, 'Hello', 3.4]

['mouse', [8, 4, 6], ['a']]

In [ ]:

my_list = ['p', 'r', 'o', 'b', 'e']

# first item

print(my_list[0])

# third item

print(my_list[2])

# fifth item

print(my_list[4])

# Nested List

n_list = ["Happy", [2, 0, 1, 5]]

# Nested indexing

print(n_list[0][1])

print(n_list[1][3])

In [ ]:

# Negative indexing in lists

my_list = ['p','r','o','b','e']

# last item

print(my_list[-1])

# fifth last item

print(my_list[-5])

In [ ]:

# Correcting mistaken values in a list

odd = [2, 4, 6, 8]

# change the 1st item

odd[0] = 1

print(odd)

# change 2nd to 4th items

odd[1:4] = [3, 5, 7]

print(odd)

[1, 4, 6, 8]

[1, 3, 5, 7]

In [ ]:

# Concatenating and repeating lists

odd = [1, 3, 5]

print(odd + [9, 7, 5])

print(["re"] * 3)

[1, 3, 5, 9, 7, 5]

['re', 're', 're']

In [ ]:

# Demonstration of list insert() method

odd = [1, 9]

odd.insert(1,3)

print(odd)

odd[2:2] = [5, 7]

print(odd)

[1, 3, 9]

[1, 3, 5, 7, 9]

In [ ]:

# Deleting list items

my_list = ['p', 'r', 'o', 'b', 'l', 'e', 'm']

# delete one item

del my_list[2]

print(my_list)

# delete multiple items

del my_list[1:5]

print(my_list)

# delete the entire list

del my_list

['p', 'r', 'b', 'l', 'e', 'm']

['p', 'm']

In [ ]:

my_list = ['p','r','o','b','l','e','m']

my_list.remove('p')

# Output: ['r', 'o', 'b', 'l', 'e', 'm']

print(my_list)

# Output: 'o'

print(my_list.pop(1))

# Output: ['r', 'b', 'l', 'e', 'm']

print(my_list)

# Output: 'm'

print(my_list.pop())

# Output: ['r', 'b', 'l', 'e']

print(my_list)

my_list.clear()

# Output: []

print(my_list)

['r', 'o', 'b', 'l', 'e', 'm']

['r', 'b', 'l', 'e', 'm']

['r', 'b', 'l', 'e']

[]
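As a small supplement to the list operations above, a list comprehension builds a list from any iterable in a single expression; a minimal sketch:

# List comprehensions: build lists in one expression
squares = [x ** 2 for x in range(5)]
print(squares)    # [0, 1, 4, 9, 16]

evens = [x for x in range(10) if x % 2 == 0]
print(evens)      # [0, 2, 4, 6, 8]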

3. Sets
In [ ]:

# Creating Sets

# set of integers

my_set = {1, 2, 3}

print(my_set)

# set of mixed datatypes

my_set = {1.0, "Hello", (1, 2, 3)}

print(my_set)

{1, 2, 3}

{1.0, 'Hello', (1, 2, 3)}

In [ ]:

# set cannot have duplicates

my_set = {1, 2, 3, 4, 3, 2}

print(my_set)

# we can make set from a list

my_set = set([1, 2, 3, 2])

print(my_set)

{1, 2, 3, 4}

{1, 2, 3}

In [ ]:

# Distinguishing a set from a dictionary when creating an empty one

# initialize a with {}

a = {}

# check data type of a

print(type(a))

# initialize a with set()

a = set()

# check data type of a

print(type(a))

<class 'dict'>

<class 'set'>

In [ ]:

# initialize my_set

my_set = {1, 3}

print(my_set)

# add an element

my_set.add(2)

print(my_set)

# add multiple elements

my_set.update([2, 3, 4])

print(my_set)

# add list and set

my_set.update([4, 5], {1, 6, 8})

print(my_set)

{1, 3}

{1, 2, 3}

{1, 2, 3, 4}

{1, 2, 3, 4, 5, 6, 8}

In [ ]:

# Difference between discard() and remove()

# initialize my_set

my_set = {1, 3, 4, 5, 6}

print(my_set)

# discard an element

# Output: {1, 3, 5, 6}

my_set.discard(4)

print(my_set)

# remove an element

# Output: {1, 3, 5}

my_set.remove(6)

print(my_set)

# discard an element

# not present in my_set

# Output: {1, 3, 5}

my_set.discard(2)

print(my_set)

{1, 3, 4, 5, 6}

{1, 3, 5, 6}

{1, 3, 5}

{1, 3, 5}

In [ ]:

# initialize my_set

# Output: set of unique elements

my_set = set("HelloWorld")

print(my_set)

# pop an element

# Output: random element

print(my_set.pop())

# pop another element

my_set.pop()

print(my_set)

# clear my_set

# Output: set()

my_set.clear()

print(my_set)

{'r', 'H', 'e', 'l', 'd', 'W', 'o'}

{'e', 'l', 'd', 'W', 'o'}

set()

In [ ]:

# Set union method

# initialize A and B

A = {1, 2, 3, 4, 5}

B = {4, 5, 6, 7, 8}

# use | operator

# Output: {1, 2, 3, 4, 5, 6, 7, 8}

print(A | B)

{1, 2, 3, 4, 5, 6, 7, 8}

In [ ]:

# Intersection of sets

# initialize A and B

A = {1, 2, 3, 4, 5}

B = {4, 5, 6, 7, 8}

# use & operator

# Output: {4, 5}

print(A & B)

{4, 5}

In [ ]:

# Difference of two sets

# initialize A and B

A = {1, 2, 3, 4, 5}

B = {4, 5, 6, 7, 8}

# use - operator on A

# Output: {1, 2, 3}

print(A - B)

{1, 2, 3}

In [ ]:

# Symmetric difference of two sets

# initialize A and B

A = {1, 2, 3, 4, 5}

B = {4, 5, 6, 7, 8}

# use ^ operator

# Output: {1, 2, 3, 6, 7, 8}

print(A ^ B)

{1, 2, 3, 6, 7, 8}

In [ ]:

# in keyword in a set

# initialize my_set

my_set = set("apple")

# check if 'a' is present

# Output: True

print('a' in my_set)

# check if 'p' is present

# Output: False

print('p' not in my_set)

True

False
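A related type not covered above is frozenset, an immutable variant of set that supports the same read-only operations and, being hashable, can serve as a dictionary key; a short sketch:

# frozenset: an immutable, hashable set
fs = frozenset([1, 2, 3])
print(2 in fs)    # True
print(fs | {4})   # frozenset({1, 2, 3, 4}); set operators still work
# fs.add(4) would raise AttributeError, since frozensets cannot be modified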


4. Tuples
In [ ]:

# Creating Tuples

# Empty tuple

my_tuple = ()

print(my_tuple)

# Tuple having integers

my_tuple1 = (1, 2, 3)

print(my_tuple1)

# tuple with mixed datatypes

my_tuple2 = (1, "Hello", 3.4)

print(my_tuple2)

# nested tuple

my_tuple3 = ("mouse", [8, 4, 6], (1, 2, 3))

print(my_tuple3)

()

(1, 2, 3)

(1, 'Hello', 3.4)

('mouse', [8, 4, 6], (1, 2, 3))

In [ ]:

# Tuple Packing (A tuple can also be created without using parentheses.)

my_tuple4 = 3, 4.6, "dog"

print(my_tuple4)

#Tuple Unpacking

a, b, c = my_tuple4

print("")

print(a)

print(b)

print(c)

(3, 4.6, 'dog')

3

4.6

dog

In [ ]:

my_tuple = ("hello")

print(type(my_tuple))

# Creating a tuple having one element

my_tuple = ("hello",)

print(type(my_tuple))

# Parentheses are optional

my_tuple = "hello",

print(type(my_tuple))

<class 'str'>

<class 'tuple'>

<class 'tuple'>

In [ ]:

# Accessing tuple elements using indexing

my_tuple = ('p','e','r','m','i','t')

print(my_tuple[0])

print(my_tuple[5])

# nested tuple

n_tuple = ("mouse", [8, 4, 6], (1, 2, 3))

# nested index

print(n_tuple[0][3])

print(n_tuple[1][1])

In [ ]:

# Negative indexing for accessing tuple elements

my_tuple = ('p', 'e', 'r', 'm', 'i', 't')

print(my_tuple[-1])

print(my_tuple[-6])

In [ ]:

#Deleting Tuple

del my_tuple

print(my_tuple)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-33-197bc7ed8198> in <module>
      1 #Deleting Tuple
      2 del my_tuple
----> 3 print(my_tuple)

NameError: name 'my_tuple' is not defined
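The traceback above shows that del removes the whole tuple object. Individual elements cannot be reassigned either, since tuples are immutable, although a mutable element nested inside (such as a list) can still be changed in place; a brief sketch:

# Tuples are immutable, but nested mutable objects are not
t = ("mouse", [8, 4, 6])
try:
    t[0] = "keyboard"
except TypeError as err:
    print("Error:", err)    # 'tuple' object does not support item assignment
t[1][0] = 9                 # allowed: the list inside the tuple is mutable
print(t)                    # ('mouse', [9, 4, 6])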

In [ ]:

my_tuple = ('a', 'p', 'p', 'l', 'e',)

print(my_tuple.count('p'))

print(my_tuple.index('l'))

In [ ]:

# Membership test in tuple

my_tuple = ('a', 'p', 'p', 'l', 'e',)

# In operation

print('a' in my_tuple)

print('b' in my_tuple)

# Not in operation

print('g' not in my_tuple)

True

False

True

*DAP Lab Exp.No.2*

Python String Functions: Write a program to explore different string functions

In [ ]:

# Creating Strings in Python

# defining strings in Python

# all of the following are equivalent

my_string = 'Hello'

print(my_string)

my_string = "Hello"

print(my_string)

my_string = '''Hello'''

print(my_string)

# triple quotes string can extend multiple lines

my_string = """Hello, Welcome to

CMR University"""

print(my_string)

Hello

Hello

Hello

Hello, Welcome to

CMR University

In [ ]:

# capitalize() function

string = "cmr university soet"

print(string.capitalize())

Cmr university soet

In [ ]:

# lower() function

string = "Cmr University SoET"

print(string.lower())

cmr university soet

In [ ]:

# casefold() function

string = "Cmr University SoET"

print(string.casefold())

cmr university soet

In [ ]:

# upper() function

string = "cmr university is one of the top university in karnataka"

print(string.upper())

CMR UNIVERSITY IS ONE OF THE TOP UNIVERSITY IN KARNATAKA

In [ ]:

# title() function

string = "cmr university soet"

print(string.title())

Cmr University Soet

In [ ]:

# count() function

string = "Cmr University SoET"

print(string.count('i'))

In [ ]:

# find() function

string = "Cmr University SoET"

print(string.find('U'))

In [ ]:

# replace() function

string = "Cmr University SoET"

print(string.replace("Cmr", "CMR"))

CMR University SoET

In [1]:

# swapcase() function

string = "kRiShNa KaNt PaNdEy"

print(string.swapcase())

KrIsHnA kAnT pAnDeY

In [2]:

# join() function

string_tuple = ("Krishna", "Kant", "Pandey")

string = " "

print(string.join(string_tuple))

Krishna Kant Pandey
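Two more commonly used string functions, strip() and split(), fit this exploration as well; a minimal sketch:

# strip() removes surrounding whitespace; split() breaks a string into a list
string = "  Data Analysis using Python  "
print(string.strip())        # 'Data Analysis using Python'
print(string.split())        # ['Data', 'Analysis', 'using', 'Python']
print("a,b,c".split(","))    # ['a', 'b', 'c']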

*DAP Lab Exp.No.3*

Regular Expression: Demonstrate usage of regular expression

In [1]:

#Regular expressions provide a flexible way to search or match (often more complex) string patterns in text.

#Python’s built-in re module is responsible for applying regular expressions to strings

import re

In [16]:

# Let's match a basic regex pattern

pattern = '^k.....a$'

string = 'krishna'

result = re.match(pattern, string)

if result:
    print("Search Successful")
else:
    print("Search Unsuccessful")

Search Successful
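To make the pattern explicit: ^ anchors the match at the start of the string, each . matches exactly one character, and $ anchors at the end, so '^k.....a$' matches only seven-character strings that start with 'k' and end with 'a'. A quick sketch over a few test strings:

# The same anchored pattern applied to strings of different lengths
for s in ['krishna', 'kanchana', 'kerala']:
    print(s, bool(re.match('^k.....a$', s)))
# krishna True (7 characters), kanchana False (8), kerala False (6)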

In [3]:

# The regex describing one or more whitespace characters is \s+:

text = "foo bar\t baz \tqux"

re.split(r'\s+', text)

Out[3]:

['foo', 'bar', 'baz', 'qux']

In [4]:

regex = re.compile(r'\s+')

In [5]:

regex.split(text)

Out[5]:

['foo', 'bar', 'baz', 'qux']

In [6]:

regex.findall(text)

Out[6]:

[' ', '\t ', ' \t']


In [7]:

text = """Prabhakar prabhakar.k@cmr.edu.in

Parameswaran parameswaran.t@cmr.edu.in

Sathiyaraj sathiyaraj.r@cmr.edu.in

Mariyappan mariyappan.k@cmr.edu.in

"""

pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.([A-Z0-9.-]+)+\.[A-Z]{2,4}'

regex = re.compile(pattern, flags=re.IGNORECASE)

In [8]:

regex.findall(text)

Out[8]:

['edu', 'edu', 'edu', 'edu']

In [9]:

m = regex.search(text)

m

Out[9]:

<re.Match object; span=(10, 32), match='prabhakar.k@cmr.edu.in'>

In [10]:

print(regex.match(text))

None

In [11]:

print(regex.sub('CMRU', text))

Prabhakar CMRU

Parameswaran CMRU

Sathiyaraj CMRU

Mariyappan CMRU

In [12]:

pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z0-9.-]+)\.([A-Z]{2,4})'

regex = re.compile(pattern, flags=re.IGNORECASE)

In [13]:

m = regex.match('krishna.19bcs@cmr.edu.in')

m.groups()

Out[13]:

('krishna.19bcs', 'cmr', 'edu', 'in')


In [14]:

regex.findall(text)

Out[14]:

[('prabhakar.k', 'cmr', 'edu', 'in'),

('parameswaran.t', 'cmr', 'edu', 'in'),

('sathiyaraj.r', 'cmr', 'edu', 'in'),

('mariyappan.k', 'cmr', 'edu', 'in')]

In [15]:

print(regex.sub(r'Username: \1, Prefix: \2, Domain: \3, Suffix: \4', text))

Prabhakar Username: prabhakar.k, Prefix: cmr, Domain: edu, Suffix: in

Parameswaran Username: parameswaran.t, Prefix: cmr, Domain: edu, Suffix: in

Sathiyaraj Username: sathiyaraj.r, Prefix: cmr, Domain: edu, Suffix: in

Mariyappan Username: mariyappan.k, Prefix: cmr, Domain: edu, Suffix: in

*DAP Lab Exp.No.4*

Demonstrate the usage of the NumPy library

In [ ]:

import numpy as np

Creating ndarrays

In [ ]:

# generate some random data

data = np.random.randn(2, 3)

data

Out[ ]:

array([[-0.71743415, -1.90053641, -0.0862186 ],

[ 0.80863996, -0.23353389, 0.51979667]])

In [ ]:

data1 = [6, 7.5, 8, 0, 1]

In [ ]:

arr1 = np.array(data1)

arr1

Out[ ]:

array([6. , 7.5, 8. , 0. , 1. ])

In [ ]:

#Nested Sequences will be converted into multidimensional array

data2 = [[1,2,3,4],[5,6,7,8]]

In [ ]:

arr2 = np.array(data2)

arr2

Out[ ]:

array([[1, 2, 3, 4],

[5, 6, 7, 8]])

In [ ]:

arr2.ndim

Out[ ]:

2
In [ ]:

arr2.shape

Out[ ]:

(2, 4)

In [ ]:

np.zeros(10)

Out[ ]:

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [ ]:

np.zeros((3,6))

Out[ ]:

array([[0., 0., 0., 0., 0., 0.],

[0., 0., 0., 0., 0., 0.],

[0., 0., 0., 0., 0., 0.]])

In [ ]:

np.empty((2,3,2))

Out[ ]:

array([[[2.05050840e-316, 2.57353024e-316],

[3.95252517e-323, 2.57353024e-316],

[3.95252517e-323, 2.57353024e-316]],

[[4.44659081e-323, 0.00000000e+000],

[2.57355554e-316, 2.57355554e-316],

[2.57355554e-316, 0.00000000e+000]]])

In [ ]:

np.arange(15)

Out[ ]:

array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])

Data Types for ndarrays

In [ ]:

arr1 = np.array([1, 2, 3], dtype=np.float64)

arr2 = np.array([1, 2, 3], dtype=np.int32)

In [ ]:

arr1.dtype

Out[ ]:

dtype('float64')

In [ ]:

arr2.dtype

Out[ ]:

dtype('int32')

In [ ]:

arr = np.array([3.7, -1.2, -2.6, 0.5, 12.9, 10.1])

In [ ]:

arr

Out[ ]:

array([ 3.7, -1.2, -2.6, 0.5, 12.9, 10.1])

In [ ]:

arr.astype(np.int32)

Out[ ]:

array([ 3, -1, -2, 0, 12, 10], dtype=int32)

Arithmetic with NumPy Arrays

In [ ]:

arr = np.array([[1., 2., 3.], [4., 5., 6.]])

In [ ]:

arr

Out[ ]:

array([[1., 2., 3.],

[4., 5., 6.]])

In [ ]:

arr * arr

Out[ ]:

array([[ 1., 4., 9.],

[16., 25., 36.]])


In [ ]:

arr - arr

Out[ ]:

array([[0., 0., 0.],

[0., 0., 0.]])

In [ ]:

1 / arr

Out[ ]:

array([[1. , 0.5 , 0.33333333],

[0.25 , 0.2 , 0.16666667]])

In [ ]:

arr ** 0.5

Out[ ]:

array([[1. , 1.41421356, 1.73205081],

[2. , 2.23606798, 2.44948974]])

In [ ]:

arr2 = np.array([[0., 4., 1.], [7., 2., 12.]])

In [ ]:

arr2

Out[ ]:

array([[ 0., 4., 1.],

[ 7., 2., 12.]])

In [ ]:

arr2 > arr

Out[ ]:

array([[False, True, False],

[ True, False, True]])

Basic Indexing and Slicing

In [ ]:

arr = np.arange(10)

In [ ]:

arr

Out[ ]:

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [ ]:

arr[5]

Out[ ]:

5
In [ ]:

arr[5:8]

Out[ ]:

array([5, 6, 7])

In [ ]:

arr[5:8] = 50

In [ ]:

arr

Out[ ]:

array([ 0, 1, 2, 3, 4, 50, 50, 50, 8, 9])

In [ ]:

arr_slice = arr[5:8]

In [ ]:

arr_slice

Out[ ]:

array([50, 50, 50])

In [ ]:

arr_slice[1] = 12345

In [ ]:

arr

Out[ ]:

array([ 0, 1, 2, 3, 4, 50, 12345, 50, 8,

9])
In [ ]:

arr_slice[:] = 64

In [ ]:

arr

Out[ ]:

array([ 0, 1, 2, 3, 4, 64, 64, 64, 8, 9])
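The cells above demonstrate that NumPy slices are views: assigning through arr_slice changed arr itself. When an independent copy is needed, copy() must be called explicitly; a brief sketch on a fresh array (base is just an illustrative name):

# copy() breaks the view relationship
base = np.arange(10)
safe_slice = base[5:8].copy()
safe_slice[:] = 99
print(base)        # unchanged: [0 1 2 3 4 5 6 7 8 9]
print(safe_slice)  # [99 99 99]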

In [ ]:

arr2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])

In [ ]:

arr2d[2]

Out[ ]:

array([7, 8, 9])

In [ ]:

arr2d[0][2]

Out[ ]:

3
In [ ]:

arr2d[0,2]

Out[ ]:

3
In [ ]:

arr3d = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]])

In [ ]:

arr3d

Out[ ]:

array([[[ 1, 2, 3],

[ 4, 5, 6]],

[[ 7, 8, 9],

[10, 11, 12]]])


In [ ]:

arr3d[0]

Out[ ]:

array([[1, 2, 3],

[4, 5, 6]])

In [ ]:

old_values = arr3d[0].copy()

In [ ]:

arr3d[0] = 42

In [ ]:

arr3d

Out[ ]:

array([[[42, 42, 42],

[42, 42, 42]],

[[ 7, 8, 9],

[10, 11, 12]]])

In [ ]:

arr3d[0] = old_values

In [ ]:

arr3d

Out[ ]:

array([[[ 1, 2, 3],

[ 4, 5, 6]],

[[ 7, 8, 9],

[10, 11, 12]]])

In [ ]:

arr3d[1, 0]

Out[ ]:

array([7, 8, 9])

In [ ]:

x = arr3d[1]

In [ ]:

x

Out[ ]:

array([[ 7, 8, 9],

[10, 11, 12]])

In [ ]:

x[0]

Out[ ]:

array([7, 8, 9])

In [ ]:

arr

Out[ ]:

array([ 0, 1, 2, 3, 4, 64, 64, 64, 8, 9])

In [ ]:

arr[1:6]

Out[ ]:

array([ 1, 2, 3, 4, 64])

In [ ]:

arr2d

Out[ ]:

array([[1, 2, 3],

[4, 5, 6],

[7, 8, 9]])

In [ ]:

arr2d[:2]

Out[ ]:

array([[1, 2, 3],

[4, 5, 6]])

In [ ]:

arr2d[:2, 1:]

Out[ ]:

array([[2, 3],

[5, 6]])
In [ ]:

arr2d[1, :2]

Out[ ]:

array([4, 5])

In [ ]:

arr2d[:2, 2]

Out[ ]:

array([3, 6])

In [ ]:

arr2d[:, :1]

Out[ ]:

array([[1],

[4],

[7]])

In [ ]:

arr2d[:2, 1:] = 0

In [ ]:

arr2d

Out[ ]:

array([[1, 0, 0],

[4, 0, 0],

[7, 8, 9]])

Boolean Indexing

In [ ]:

names = np.array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'])

In [ ]:

data = np.random.randn(7, 4)

In [ ]:

names

Out[ ]:

array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'], dtype='<U4')


In [ ]:

data

Out[ ]:

array([[-1.7954001 , -0.27238636, 0.26087237, 0.51739584],

[-0.22170597, 0.69652897, -0.76271219, -0.4202649 ],

[ 0.40017537, 0.56459636, 0.82344516, -0.88931561],

[-0.55276083, -0.66219038, 1.51469514, 0.74176921],

[-0.91520788, 1.16605257, 2.3109382 , 1.24915827],

[ 0.51929941, 1.298917 , -1.18915305, -1.07543908],

[ 0.02639373, 0.91449301, -1.52633874, 0.7044852 ]])

In [ ]:

names == 'Bob'

Out[ ]:

array([ True, False, False, True, False, False, False])

In [ ]:

data[names == 'Bob']

Out[ ]:

array([[-1.7954001 , -0.27238636, 0.26087237, 0.51739584],

[-0.55276083, -0.66219038, 1.51469514, 0.74176921]])

In [ ]:

data[names == 'Bob', 2:]

Out[ ]:

array([[0.26087237, 0.51739584],

[1.51469514, 0.74176921]])
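Boolean conditions can also be negated with ~ and combined with & and | (the Python keywords and/or do not work elementwise on arrays); a hedged sketch reusing names and data from above:

# Negating and combining boolean conditions
print(data[~(names == 'Bob')])               # rows where the name is not Bob
mask = (names == 'Bob') | (names == 'Will')
print(data[mask])                            # rows for either Bob or Will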

Fancy Indexing

In [ ]:

arr = np.empty((8, 4))

In [ ]:

for i in range(8):
    arr[i] = i

In [ ]:

arr

Out[ ]:

array([[0., 0., 0., 0.],

[1., 1., 1., 1.],

[2., 2., 2., 2.],

[3., 3., 3., 3.],

[4., 4., 4., 4.],

[5., 5., 5., 5.],

[6., 6., 6., 6.],

[7., 7., 7., 7.]])

In [ ]:

arr[[4, 3, 0, 6]]

Out[ ]:

array([[4., 4., 4., 4.],

[3., 3., 3., 3.],

[0., 0., 0., 0.],

[6., 6., 6., 6.]])

In [ ]:

arr[[-3, -5, -7]]

Out[ ]:

array([[5., 5., 5., 5.],

[3., 3., 3., 3.],

[1., 1., 1., 1.]])

In [ ]:

arr = np.arange(32).reshape((8, 4))

In [ ]:

arr

Out[ ]:

array([[ 0, 1, 2, 3],

[ 4, 5, 6, 7],

[ 8, 9, 10, 11],

[12, 13, 14, 15],

[16, 17, 18, 19],

[20, 21, 22, 23],

[24, 25, 26, 27],

[28, 29, 30, 31]])


In [ ]:

arr[[1, 5, 7, 2], [0, 3, 1, 2]]

Out[ ]:

array([ 4, 23, 29, 10])

In [ ]:

arr[[1, 5, 7, 2]][:, [0, 3, 1, 2]]

Out[ ]:

array([[ 4, 7, 5, 6],

[20, 23, 21, 22],

[28, 31, 29, 30],

[ 8, 11, 9, 10]])
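Unlike slicing, fancy indexing always copies the selected data into a new array, so modifying the result leaves the original untouched; a small sketch continuing from arr above (sub is just an illustrative name):

# Fancy indexing returns a copy, not a view
sub = arr[[1, 5]]
sub[:] = 0
print(arr[1])   # still [4 5 6 7]: arr is unchanged
print(sub)      # [[0 0 0 0]
                #  [0 0 0 0]]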

Transposing Arrays and Swapping Axes

In [ ]:

arr = np.arange(15).reshape((3, 5))

In [ ]:

arr

Out[ ]:

array([[ 0, 1, 2, 3, 4],

[ 5, 6, 7, 8, 9],

[10, 11, 12, 13, 14]])

In [ ]:

arr.T

Out[ ]:

array([[ 0, 5, 10],

[ 1, 6, 11],

[ 2, 7, 12],

[ 3, 8, 13],

[ 4, 9, 14]])

In [ ]:

arr = np.random.randn(6, 3)

In [ ]:

arr

Out[ ]:

array([[-0.46558131, 0.90974406, -0.32690968],

[ 0.5625897 , -0.04384863, 0.05917592],

[ 0.81749623, 0.60133116, 0.63612664],

[ 0.20116749, -0.33649087, 0.50276219],

[ 1.53614011, 1.29664476, -0.12714641],

[ 0.22035524, 0.57704734, -0.20811359]])

In [ ]:

np.dot(arr.T, arr)

Out[ ]:

array([[ 3.65032445, 2.09464975, 0.56549172],

[ 2.09464975, 3.31865349, -0.37160616],

[ 0.56549172, -0.37160616, 0.82727613]])

In [ ]:

arr = np.arange(16).reshape((2, 2, 4))

In [ ]:

arr

Out[ ]:

array([[[ 0, 1, 2, 3],

[ 4, 5, 6, 7]],

[[ 8, 9, 10, 11],

[12, 13, 14, 15]]])

In [ ]:

arr.transpose((1, 0, 2))

Out[ ]:

array([[[ 0, 1, 2, 3],

[ 8, 9, 10, 11]],

[[ 4, 5, 6, 7],

[12, 13, 14, 15]]])

In [ ]:

arr

Out[ ]:

array([[[ 0, 1, 2, 3],

[ 4, 5, 6, 7]],

[[ 8, 9, 10, 11],

[12, 13, 14, 15]]])


In [ ]:

arr.swapaxes(1, 2)

Out[ ]:

array([[[ 0, 4],

[ 1, 5],

[ 2, 6],

[ 3, 7]],

[[ 8, 12],

[ 9, 13],

[10, 14],

[11, 15]]])
*DAP Lab Exp.No.5*

Demonstrate the usage of the pandas library

In [ ]:

import pandas as pd

from pandas import Series, DataFrame

import numpy as np

In [ ]:

obj = pd.Series([4, 7, -5, 3])

In [ ]:

obj

Out[ ]:

0 4

1 7

2 -5

3 3

dtype: int64

In [ ]:

data = {'state': ['Assam', 'Assam', 'Assam', 'Delhi', 'Delhi', 'Delhi'],

'year': [2000, 2001, 2002, 2001, 2002, 2003],

'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

frame = pd.DataFrame(data)

frame

Out[ ]:

state year pop

0 Assam 2000 1.5

1 Assam 2001 1.7

2 Assam 2002 3.6

3 Delhi 2001 2.4

4 Delhi 2002 2.9

5 Delhi 2003 3.2

In [ ]:

obj = pd.Series(range(3), index=['a', 'b', 'c'])

index = obj.index

index

Out[ ]:

Index(['a', 'b', 'c'], dtype='object')


In [ ]:

index[1:]

Out[ ]:

Index(['b', 'c'], dtype='object')

In [ ]:

labels = pd.Index(np.arange(3))

labels

Out[ ]:

Int64Index([0, 1, 2], dtype='int64')

In [ ]:

obj2 = pd.Series([1.5, -2.5, 0], index=labels)

obj2

Out[ ]:

0 1.5

1 -2.5

2 0.0

dtype: float64

In [ ]:

obj2.index is labels

Out[ ]:

True

In [ ]:

frame

Out[ ]:

state year pop

0 Assam 2000 1.5

1 Assam 2001 1.7

2 Assam 2002 3.6

3 Delhi 2001 2.4

4 Delhi 2002 2.9

5 Delhi 2003 3.2

In [ ]:

frame.columns

Out[ ]:

Index(['state', 'year', 'pop'], dtype='object')


In [ ]:

'Assam' in frame.columns

Out[ ]:

False

In [ ]:

2003 in frame.index

Out[ ]:

False

In [ ]:

dup_labels = pd.Index(['foo', 'foo', 'bar', 'bar'])

In [ ]:

dup_labels

Out[ ]:

Index(['foo', 'foo', 'bar', 'bar'], dtype='object')

In [ ]:

obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d','b', 'a', 'c'])

In [ ]:

obj

Out[ ]:

d 4.5

b 7.2

a -5.3

c 3.6

dtype: float64

In [ ]:

obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])

In [ ]:

obj2

Out[ ]:

a -5.3

b 7.2

c 3.6

d 4.5

e NaN

dtype: float64
In [ ]:

obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])

obj3

Out[ ]:

0 blue

2 purple

4 yellow

dtype: object

In [ ]:

obj3.reindex(range(6), method='ffill')

Out[ ]:

0 blue

1 blue

2 purple

3 purple

4 yellow

5 yellow

dtype: object

In [ ]:

frame = pd.DataFrame(np.arange(9).reshape((3, 3)), index=['a', 'c', 'd'], columns=['Assam', 'Kolkata', 'Bengaluru'])

In [ ]:

frame

Out[ ]:

Assam Kolkata Bengaluru

a 0 1 2

c 3 4 5

d 6 7 8

In [ ]:

frame2 = frame.reindex(['a', 'b', 'c', 'd'])

frame2

Out[ ]:

Assam Kolkata Bengaluru

a 0.0 1.0 2.0

b NaN NaN NaN

c 3.0 4.0 5.0

d 6.0 7.0 8.0


In [ ]:

states = ['Kolkata', 'Goa', 'Bengaluru']

In [ ]:

frame.reindex(columns=states)

Out[ ]:

Kolkata Goa Bengaluru

a 1 NaN 2

c 4 NaN 5

d 7 NaN 8

Dropping Entries from an Axis

In [ ]:

obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])

In [ ]:

obj

Out[ ]:

a 0.0

b 1.0

c 2.0

d 3.0

e 4.0

dtype: float64

In [ ]:

new_obj = obj.drop('c')

In [ ]:

new_obj

Out[ ]:

a 0.0

b 1.0

d 3.0

e 4.0

dtype: float64
In [ ]:

obj.drop(['d', 'c'])

Out[ ]:

a 0.0

b 1.0

e 4.0

dtype: float64

In [ ]:

data = pd.DataFrame(np.arange(16).reshape((4, 4)), index=['Ohio', 'Colorado', 'Utah', 'New York'], columns=['one', 'two', 'three', 'four'])

In [ ]:

data

Out[ ]:

one two three four

Ohio 0 1 2 3

Colorado 4 5 6 7

Utah 8 9 10 11

New York 12 13 14 15

In [ ]:

data.drop(['Colorado', 'Ohio'])

Out[ ]:

one two three four

Utah 8 9 10 11

New York 12 13 14 15

In [ ]:

data.drop('two', axis=1)

Out[ ]:

one three four

Ohio 0 2 3

Colorado 4 6 7

Utah 8 10 11

New York 12 14 15
In [ ]:

data.drop(['two', 'four'], axis='columns')

Out[ ]:

one three

Ohio 0 2

Colorado 4 6

Utah 8 10

New York 12 14

In [ ]:

obj.drop('c', inplace=True)

In [ ]:

obj

Out[ ]:

a 0.0

b 1.0

d 3.0

e 4.0

dtype: float64

Indexing, Selection, and Filtering

In [ ]:

obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])

obj

Out[ ]:

a 0.0

b 1.0

c 2.0

d 3.0

dtype: float64

In [ ]:

obj['b']

Out[ ]:

1.0

In [ ]:

obj[1]

Out[ ]:

1.0
In [ ]:

obj[2:4]

Out[ ]:

c 2.0

d 3.0

dtype: float64

In [ ]:

obj[['b', 'a', 'd']]

Out[ ]:

b 1.0

a 0.0

d 3.0

dtype: float64

In [ ]:

obj[[1, 3]]

Out[ ]:

b 1.0

d 3.0

dtype: float64

In [ ]:

obj[obj < 2]

Out[ ]:

a 0.0

b 1.0

dtype: float64

In [ ]:

obj['b':'c']

Out[ ]:

b 1.0

c 2.0

dtype: float64

In [ ]:

obj['b':'c'] = 5

In [ ]:

obj

Out[ ]:

a 0.0

b 5.0

c 5.0

d 3.0

dtype: float64

In [ ]:

data = pd.DataFrame(np.arange(16).reshape((4, 4)), index=['Ohio', 'Colorado', 'Utah', 'New York'], columns=['one', 'two', 'three', 'four'])

In [ ]:

data

Out[ ]:

one two three four

Ohio 0 1 2 3

Colorado 4 5 6 7

Utah 8 9 10 11

New York 12 13 14 15

In [ ]:

data['two']

Out[ ]:

Ohio 1

Colorado 5

Utah 9

New York 13

Name: two, dtype: int64

In [ ]:

data[['three', 'one']]

Out[ ]:

three one

Ohio 2 0

Colorado 6 4

Utah 10 8

New York 14 12
In [ ]:

data[:2]

Out[ ]:

one two three four

Ohio 0 1 2 3

Colorado 4 5 6 7

In [ ]:

data[data['three'] > 5]

Out[ ]:

one two three four

Colorado 4 5 6 7

Utah 8 9 10 11

New York 12 13 14 15

In [ ]:

data < 5

Out[ ]:

one two three four

Ohio True True True True

Colorado True False False False

Utah False False False False

New York False False False False

In [ ]:

data[data < 5] = 0

data

Out[ ]:

one two three four

Ohio 0 0 0 0

Colorado 0 5 6 7

Utah 8 9 10 11

New York 12 13 14 15
In [ ]:

data.loc['Colorado', ['two', 'three']]

Out[ ]:

two 5

three 6

Name: Colorado, dtype: int64

In [ ]:

data.iloc[2, [3, 0, 1]]

Out[ ]:

four 11

one 8

two 9

Name: Utah, dtype: int64

In [ ]:

data.iloc[:, :3][data.three > 5]

Out[ ]:

one two three

Colorado 0 5 6

Utah 8 9 10

New York 12 13 14

In [ ]:

data.loc[:'Utah', 'two']

Out[ ]:

Ohio 0

Colorado 5

Utah 9

Name: two, dtype: int64

In [ ]:

data.iloc[2]

Out[ ]:

one 8

two 9

three 10

four 11

Name: Utah, dtype: int64


In [ ]:

data.iloc[[1, 2], [3, 0, 1]]

Out[ ]:

four one two

Colorado 7 0 5

Utah 11 8 9

Integer Indexes

In [ ]:

ser = pd.Series(np.arange(3.))

In [ ]:

ser

Out[ ]:

0 0.0

1 1.0

2 2.0

dtype: float64

In [ ]:

ser2 = pd.Series(np.arange(3.), index=['a', 'b', 'c'])

In [ ]:

ser2[-1]

Out[ ]:

2.0

In [ ]:

ser[:1]

Out[ ]:

0 0.0

dtype: float64

In [ ]:

ser.loc[:1]

Out[ ]:

0 0.0

1 1.0

dtype: float64
In [ ]:

ser.iloc[:1]

Out[ ]:

0 0.0

dtype: float64
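The contrast above is the key takeaway: ser2[-1] works because its index holds string labels, so -1 falls back to positional access, whereas on the integer-indexed ser the same expression is ambiguous (and raises a KeyError in recent pandas versions). loc (label-based, endpoint-inclusive) and iloc (position-based, endpoint-exclusive) make the intent unambiguous; a short sketch:

# Explicit label-based vs. position-based access on the integer-indexed ser
print(ser.iloc[-1])   # 2.0 -- always positional
print(ser.loc[0])     # 0.0 -- always label-based
# plain ser[-1] would raise KeyError here, because -1 is not a label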
*DAP Lab Exp.No.6*

USA.gov Data from Bitly:

In 2011, the URL shortening service Bitly partnered with the US government website USA.gov to provide a feed of anonymous data gathered from users who shorten links ending with .gov or .mil. At the time, a live feed as well as hourly snapshots were available as downloadable text files.

1. Load the data file
2. Convert a JSON string into a Python dictionary object
3. Counting Time Zones in Pure Python
4. Counting Time Zones in Pandas
5. Visualize this data using matplotlib and Seaborn

In [ ]:

from google.colab import drive

drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount,


call drive.mount("/content/drive/", force_remount=True).

In [ ]:

from numpy.random import randn

import numpy as np

np.random.seed(123)

import os

import matplotlib.pyplot as plt

import pandas as pd

plt.rc('figure', figsize=(10, 6))

np.set_printoptions(precision=4)

pd.options.display.max_rows = 20

USA.gov Data from Bitly

1. Loading the dataset(data file)

In [ ]:

path = "/content/drive/MyDrive/example.txt"

open(path).readline()

Out[ ]:

'{ "a": "Mozilla\\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\\/535.11 (KHTM


L, like Gecko) Chrome\\/17.0.963.78 Safari\\/535.11", "c": "US", "nk": 1,
"tz": "America\\/New_York", "gr": "MA", "g": "A6qOVH", "h": "wfLQtf", "l":
"orofrog", "al": "en-US,en;q=0.8", "hh": "1.usa.gov", "r": "http:\\/\\/ww
w.facebook.com\\/l\\/7AQEFzjSi\\/1.usa.gov\\/wfLQtf", "u": "http:\\/\\/ww
w.ncbi.nlm.nih.gov\\/pubmed\\/22415991", "t": 1331923247, "hc": 133182291
8, "cy": "Danvers", "ll": [ 42.576698, -70.954903 ] }\n'
2. Converting a JSON string into a Python dictionary object

In [ ]:

import json

records = [json.loads(line) for line in open(path)]

In [ ]:

records[0]

Out[ ]:

{'a': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.78 Safari/535.11',

'c': 'US',

'nk': 1,

'tz': 'America/New_York',

'gr': 'MA',

'g': 'A6qOVH',

'h': 'wfLQtf',

'l': 'orofrog',

'al': 'en-US,en;q=0.8',

'hh': '1.usa.gov',

'r': 'http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/wfLQtf',

'u': 'http://www.ncbi.nlm.nih.gov/pubmed/22415991',

't': 1331923247,

'hc': 1331822918,

'cy': 'Danvers',

'll': [42.576698, -70.954903]}

3. Counting Time Zones in Pure Python

In [ ]:

time_zones = [rec['tz'] for rec in records if 'tz' in rec]

time_zones[:10]

Out[ ]:

['America/New_York',

'America/Denver',

'America/New_York',

'America/Sao_Paulo',

'America/New_York',

'America/New_York',

'Europe/Warsaw',

'',

'',

'']
In [ ]:

def get_counts(sequence):
    counts = {}
    for x in sequence:
        if x in counts:
            counts[x] += 1
        else:
            counts[x] = 1
    return counts

In [ ]:

from collections import defaultdict

def get_counts2(sequence):
    counts = defaultdict(int)  # values will initialize to 0
    for x in sequence:
        counts[x] += 1
    return counts

In [ ]:

counts = get_counts(time_zones)

counts['America/New_York']

len(time_zones)

Out[ ]:

3440

In [ ]:

def top_counts(count_dict, n=10):
    value_key_pairs = [(count, tz) for tz, count in count_dict.items()]
    value_key_pairs.sort()
    return value_key_pairs[-n:]

In [ ]:

top_counts(counts)

Out[ ]:

[(33, 'America/Sao_Paulo'),

(35, 'Europe/Madrid'),

(36, 'Pacific/Honolulu'),

(37, 'Asia/Tokyo'),

(74, 'Europe/London'),

(191, 'America/Denver'),

(382, 'America/Los_Angeles'),

(400, 'America/Chicago'),

(521, ''),

(1251, 'America/New_York')]
In [ ]:

from collections import Counter

counts = Counter(time_zones)

counts.most_common(10)

Out[ ]:

[('America/New_York', 1251),

('', 521),

('America/Chicago', 400),

('America/Los_Angeles', 382),

('America/Denver', 191),

('Europe/London', 74),

('Asia/Tokyo', 37),

('Pacific/Honolulu', 36),

('Europe/Madrid', 35),

('America/Sao_Paulo', 33)]

4. Counting Time Zones with Pandas


In [ ]:

frame = pd.DataFrame(records)

frame.info()

frame['tz'][:10]

<class 'pandas.core.frame.DataFrame'>

RangeIndex: 3560 entries, 0 to 3559

Data columns (total 18 columns):

# Column Non-Null Count Dtype

--- ------ -------------- -----

0 a 3440 non-null object

1 c 2919 non-null object

2 nk 3440 non-null float64

3 tz 3440 non-null object

4 gr 2919 non-null object

5 g 3440 non-null object

6 h 3440 non-null object

7 l 3440 non-null object

8 al 3094 non-null object

9 hh 3440 non-null object

10 r 3440 non-null object

11 u 3440 non-null object

12 t 3440 non-null float64

13 hc 3440 non-null float64

14 cy 2919 non-null object

15 ll 2919 non-null object

16 _heartbeat_ 120 non-null float64

17 kw 93 non-null object

dtypes: float64(4), object(14)

memory usage: 500.8+ KB

Out[ ]:

0 America/New_York

1 America/Denver

2 America/New_York

3 America/Sao_Paulo

4 America/New_York

5 America/New_York

6 Europe/Warsaw

7

8

9

Name: tz, dtype: object


In [ ]:

tz_counts = frame['tz'].value_counts()
tz_counts[:10]

Out[ ]:

America/New_York 1251

521

America/Chicago 400

America/Los_Angeles 382

America/Denver 191

Europe/London 74

Asia/Tokyo 37

Pacific/Honolulu 36

Europe/Madrid 35

America/Sao_Paulo 33

Name: tz, dtype: int64

In [ ]:

clean_tz = frame['tz'].fillna('Missing')

clean_tz[clean_tz == ''] = 'Unknown'

tz_counts = clean_tz.value_counts()

tz_counts[:10]

Out[ ]:

America/New_York 1251

Unknown 521

America/Chicago 400

America/Los_Angeles 382

America/Denver 191

Missing 120

Europe/London 74

Asia/Tokyo 37

Pacific/Honolulu 36

Europe/Madrid 35

Name: tz, dtype: int64

5. Visualize this data using matplotlib and Seaborn

In [ ]:

plt.figure(figsize=(10, 4))

Out[ ]:

<Figure size 720x288 with 0 Axes>

<Figure size 720x288 with 0 Axes>


In [ ]:

import seaborn as sns

subset = tz_counts[:10]

sns.barplot(y=subset.index, x=subset.values)

Out[ ]:

<matplotlib.axes._subplots.AxesSubplot at 0x7f9bcc9e56d0>

In [ ]:

frame['a'][1]

frame['a'][50]

frame['a'][51][:50] # long line

Out[ ]:

'Mozilla/5.0 (Linux; U; Android 2.2.2; en-us; LG-P9'

In [ ]:

results = pd.Series([x.split()[0] for x in frame.a.dropna()])

results[:5]

results.value_counts()[:8]

Out[ ]:

Mozilla/5.0 2594

Mozilla/4.0 601

GoogleMaps/RochesterNY 121

Opera/9.80 34

TEST_INTERNET_AGENT 24

GoogleProducer 21

Mozilla/6.0 5

BlackBerry8520/5.0.0.681 4

dtype: int64

In [ ]:

cframe = frame[frame.a.notnull()]

In [ ]:

cframe = cframe.copy()

In [ ]:

cframe['os'] = np.where(cframe['a'].str.contains('Windows'), 'Windows', 'Not Windows')

cframe['os'][:5]

Out[ ]:

0 Windows

1 Not Windows

2 Windows

3 Not Windows

4 Windows

Name: os, dtype: object

In [ ]:

by_tz_os = cframe.groupby(['tz', 'os'])

In [ ]:

agg_counts = by_tz_os.size().unstack().fillna(0)

agg_counts[:10]

Out[ ]:

os Not Windows Windows

tz

245.0 276.0

Africa/Cairo 0.0 3.0

Africa/Casablanca 0.0 1.0

Africa/Ceuta 0.0 2.0

Africa/Johannesburg 0.0 1.0

Africa/Lusaka 0.0 1.0

America/Anchorage 4.0 1.0

America/Argentina/Buenos_Aires 1.0 0.0

America/Argentina/Cordoba 0.0 1.0

America/Argentina/Mendoza 0.0 1.0


In [ ]:

# Use to sort in ascending order

indexer = agg_counts.sum(1).argsort()

indexer[:10]

Out[ ]:

tz

24

Africa/Cairo 20

Africa/Casablanca 21

Africa/Ceuta 92

Africa/Johannesburg 87

Africa/Lusaka 53

America/Anchorage 54

America/Argentina/Buenos_Aires 57

America/Argentina/Cordoba 26

America/Argentina/Mendoza 55

dtype: int64

In [ ]:

count_subset = agg_counts.take(indexer[-10:])

count_subset

Out[ ]:

os Not Windows Windows

tz

America/Sao_Paulo 13.0 20.0

Europe/Madrid 16.0 19.0

Pacific/Honolulu 0.0 36.0

Asia/Tokyo 2.0 35.0

Europe/London 43.0 31.0

America/Denver 132.0 59.0

America/Los_Angeles 130.0 252.0

America/Chicago 115.0 285.0

245.0 276.0

America/New_York 339.0 912.0


In [ ]:

agg_counts.sum(1).nlargest(10)

Out[ ]:

tz

America/New_York 1251.0

521.0

America/Chicago 400.0

America/Los_Angeles 382.0

America/Denver 191.0

Europe/London 74.0

Asia/Tokyo 37.0

Pacific/Honolulu 36.0

Europe/Madrid 35.0

America/Sao_Paulo 33.0

dtype: float64

In [ ]:

plt.figure()

Out[ ]:

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

In [ ]:

# Rearrange the data for plotting

count_subset = count_subset.stack()

count_subset.name = 'total'

count_subset = count_subset.reset_index()

count_subset[:10]

sns.barplot(x='total', y='tz', hue='os', data=count_subset)

Out[ ]:

<matplotlib.axes._subplots.AxesSubplot at 0x7f9bcc464b90>
In [ ]:

def norm_total(group):
    group['normed_total'] = group.total / group.total.sum()
    return group

results = count_subset.groupby('tz').apply(norm_total)

In [ ]:

plt.figure()

Out[ ]:

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

In [ ]:

sns.barplot(x='normed_total', y='tz', hue='os', data=results)

Out[ ]:

<matplotlib.axes._subplots.AxesSubplot at 0x7f9bd8dae650>

In [ ]:

g = count_subset.groupby('tz')

results2 = count_subset.total / g.total.transform('sum')

*DAP Lab Exp.No.7*

MovieLens 1M Dataset:

GroupLens Research provides a number of collections of movie ratings data collected from users of
MovieLens in the late 1990s and early 2000s. The data provide movie ratings, movie metadata (genres and
year), and demographic data about the users (age, zip code, gender identification, and occupation). Such
data is often of interest in the development of recommendation systems based on machine learning
algorithms. The MovieLens 1M dataset contains 1 million ratings collected from 6,000 users on 4,000
movies. It’s spread across three tables: ratings, user information, and movie information.

1. Load each table into a pandas data frame object using pandas.read_table
2. Merge ratings with users and then merge that result with the movies data
3. Calculate mean movie ratings for each film grouped by gender
4. Find top films among female viewers
5. Find the movies that are most divisive between male and female viewers

In [ ]:

from google.colab import drive

drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount,


call drive.mount("/content/drive/", force_remount=True).

In [ ]:

import pandas as pd

pd.options.display.max_rows = 10

In [ ]:

unames = ['user_id', 'gender', 'age', 'occupation', 'zip']

users = pd.read_table('/content/drive/MyDrive/ml-1m/users.dat', sep='::', header=None, names=unames, engine='python')

In [ ]:

rnames = ['user_id', 'movie_id', 'rating', 'timestamp']

ratings = pd.read_table('/content/drive/MyDrive/ml-1m/ratings.dat', sep='::', header=None, names=rnames, engine='python')

In [ ]:

mnames = ['movie_id', 'title', 'genres']

movies = pd.read_table('/content/drive/MyDrive/ml-1m/movies.dat', sep='::', header=None, names=mnames, encoding='latin-1', engine='python')

In [ ]:

users[:5]

Out[ ]:

user_id gender age occupation zip

0 1 F 1 10 48067

1 2 M 56 16 70072

2 3 M 25 15 55117

3 4 M 45 7 02460

4 5 M 25 20 55455

In [ ]:

ratings[:5]

Out[ ]:

user_id movie_id rating timestamp

0 1 1193 5 978300760

1 1 661 3 978302109

2 1 914 3 978301968

3 1 3408 4 978300275

4 1 2355 5 978824291

In [ ]:

movies[:5]

Out[ ]:

movie_id title genres

0 1 Toy Story (1995) Animation|Children's|Comedy

1 2 Jumanji (1995) Adventure|Children's|Fantasy

2 3 Grumpier Old Men (1995) Comedy|Romance

3 4 Waiting to Exhale (1995) Comedy|Drama

4 5 Father of the Bride Part II (1995) Comedy


In [ ]:

ratings

Out[ ]:

user_id movie_id rating timestamp

0 1 1193 5 978300760

1 1 661 3 978302109

2 1 914 3 978301968

3 1 3408 4 978300275

4 1 2355 5 978824291

... ... ... ... ...

1000204 6040 1091 1 956716541

1000205 6040 1094 5 956704887

1000206 6040 562 5 956704746

1000207 6040 1096 4 956715648

1000208 6040 1097 4 956715569

1000209 rows × 4 columns

Note

Ages and occupations are coded as integers indicating groups described in the dataset's README file. Analyzing the data spread across three tables is not a simple task; for example, suppose you wanted to compute mean ratings for a particular movie by sex and age. As you will see, this is much easier to do with all of the data merged together into a single table. Using pandas's merge function, we first merge ratings with users and then merge that result with the movies data. pandas infers which columns to use as the merge (or join) keys based on overlapping names:
In [ ]:

data = pd.merge(pd.merge(ratings, users), movies)

data

Out[ ]:

         user_id  movie_id  rating  timestamp gender  age  occupation    zip                                        title
0              1      1193       5  978300760      F    1          10  48067       One Flew Over the Cuckoo's Nest (1975)
1              2      1193       5  978298413      M   56          16  70072       One Flew Over the Cuckoo's Nest (1975)
2             12      1193       4  978220179      M   25          12  32793       One Flew Over the Cuckoo's Nest (1975)
3             15      1193       4  978199279      M   25           7  22903       One Flew Over the Cuckoo's Nest (1975)
4             17      1193       5  978158471      M   50           1  95350       One Flew Over the Cuckoo's Nest (1975)
...          ...       ...     ...        ...    ...  ...         ...    ...                                          ...
1000204     5949      2198       5  958846401      M   18          17  47901                           Modulations (1998)
1000205     5675      2703       3  976029116      M   35          14  30030                        Broken Vessels (1998)
1000206     5780      2845       1  958153068      M   18          17  92886                            White Boys (1999)
1000207     5851      3607       5  957756608      F   18          20  55410                     One Little Indian (1973)
1000208     5938      2909       4  957273353      M   25           1  35401  Five Wives, Three Secretaries and Me (1998)

1000209 rows × 10 columns


In [ ]:

data.iloc[0]

Out[ ]:

user_id 1

movie_id 1193

rating 5

timestamp 978300760

gender F

age 1

occupation 10

zip 48067

title One Flew Over the Cuckoo's Nest (1975)

genres Drama

Name: 0, dtype: object

In [ ]:

mean_ratings = data.pivot_table('rating', index='title', columns='gender', aggfunc='mean')

In [ ]:

mean_ratings[:5]

Out[ ]:

gender F M

title

$1,000,000 Duck (1971) 3.375000 2.761905

'Night Mother (1986) 3.388889 3.352941

'Til There Was You (1997) 2.675676 2.733333

'burbs, The (1989) 2.793478 2.962085

...And Justice for All (1979) 3.828571 3.689024

In [ ]:

ratings_by_title = data.groupby('title').size()

In [ ]:

ratings_by_title[:10]

Out[ ]:

title

$1,000,000 Duck (1971) 37

'Night Mother (1986) 70

'Til There Was You (1997) 52

'burbs, The (1989) 303

...And Justice for All (1979) 199

1-900 (1994) 2

10 Things I Hate About You (1999) 700

101 Dalmatians (1961) 565

101 Dalmatians (1996) 364

12 Angry Men (1957) 616

dtype: int64

In [ ]:

active_titles = ratings_by_title.index[ratings_by_title >= 250]

In [ ]:

active_titles

Out[ ]:

Index([''burbs, The (1989)', '10 Things I Hate About You (1999)',
       '101 Dalmatians (1961)', '101 Dalmatians (1996)', '12 Angry Men (1957)',
       '13th Warrior, The (1999)', '2 Days in the Valley (1996)',
       '20,000 Leagues Under the Sea (1954)', '2001: A Space Odyssey (1968)',
       '2010 (1984)',
       ...
       'X-Men (2000)', 'Year of Living Dangerously (1982)',
       'Yellow Submarine (1968)', 'You've Got Mail (1998)',
       'Young Frankenstein (1974)', 'Young Guns (1988)',
       'Young Guns II (1990)', 'Young Sherlock Holmes (1985)',
       'Zero Effect (1998)', 'eXistenZ (1999)'],
      dtype='object', name='title', length=1216)

In [ ]:

mean_ratings = mean_ratings.loc[active_titles]

In [ ]:

mean_ratings

Out[ ]:

gender F M

title

'burbs, The (1989) 2.793478 2.962085

10 Things I Hate About You (1999) 3.646552 3.311966

101 Dalmatians (1961) 3.791444 3.500000

101 Dalmatians (1996) 3.240000 2.911215

12 Angry Men (1957) 4.184397 4.328421

... ... ...

Young Guns (1988) 3.371795 3.425620

Young Guns II (1990) 2.934783 2.904025

Young Sherlock Holmes (1985) 3.514706 3.363344

Zero Effect (1998) 3.864407 3.723140

eXistenZ (1999) 3.098592 3.289086

1216 rows × 2 columns

In [ ]:

#To see the top films among female viewers, we can sort by the F column in descending order

top_female_ratings = mean_ratings.sort_values(by='F', ascending=False)

top_female_ratings[:10]

Out[ ]:

gender F M

title

Close Shave, A (1995) 4.644444 4.473795

Wrong Trousers, The (1993) 4.588235 4.478261

Sunset Blvd. (a.k.a. Sunset Boulevard) (1950) 4.572650 4.464589

Wallace & Gromit: The Best of Aardman Animation (1996) 4.563107 4.385075

Schindler's List (1993) 4.562602 4.491415

Shawshank Redemption, The (1994) 4.539075 4.560625

Grand Day Out, A (1992) 4.537879 4.293255

To Kill a Mockingbird (1962) 4.536667 4.372611

Creature Comforts (1990) 4.513889 4.272277

Usual Suspects, The (1995) 4.513317 4.518248


Measuring Rating Disagreement

Suppose we want to find the movies that are most divisive between male and female viewers. One way is to add a column to mean_ratings containing the difference in means, then sort by that:

In [ ]:

mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']

In [ ]:

sorted_by_diff = mean_ratings.sort_values(by='diff')

sorted_by_diff[:10]

Out[ ]:

gender F M diff

title

Dirty Dancing (1987) 3.790378 2.959596 -0.830782

Jumpin' Jack Flash (1986) 3.254717 2.578358 -0.676359

Grease (1978) 3.975265 3.367041 -0.608224

Little Women (1994) 3.870588 3.321739 -0.548849

Steel Magnolias (1989) 3.901734 3.365957 -0.535777

Anastasia (1997) 3.800000 3.281609 -0.518391

Rocky Horror Picture Show, The (1975) 3.673016 3.160131 -0.512885

Color Purple, The (1985) 4.158192 3.659341 -0.498851

Age of Innocence, The (1993) 3.827068 3.339506 -0.487561

Free Willy (1993) 2.921348 2.438776 -0.482573

Suppose instead we want the movies that elicited the most disagreement among viewers, independent of gender identification. Disagreement can be measured by the variance or standard deviation of the ratings:
In [ ]:

# Standard deviation of rating grouped by title

rating_std_by_title = data.groupby('title')['rating'].std()

# Filter down to active_titles

rating_std_by_title = rating_std_by_title.loc[active_titles]

# Order Series by value in descending order

rating_std_by_title.sort_values(ascending=False)[:10]

Out[ ]:

title

Dumb & Dumber (1994) 1.321333

Blair Witch Project, The (1999) 1.316368

Natural Born Killers (1994) 1.307198

Tank Girl (1995) 1.277695

Rocky Horror Picture Show, The (1975) 1.260177

Eyes Wide Shut (1999) 1.259624

Evita (1996) 1.253631

Billy Madison (1995) 1.249970

Fear and Loathing in Las Vegas (1998) 1.246408

Bicentennial Man (1999) 1.245533

Name: rating, dtype: float64

We have noticed that movie genres are given as a pipe-separated (|) string. If we want to do some analysis by genre, more work would be required to transform the genre information into a more usable form.
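As a hedged sketch of what that transformation could look like (assuming the movies DataFrame loaded earlier in this experiment), the pipe-separated string can be split into one genre per row, or into a one-hot indicator matrix:

# Sketch: making the pipe-separated genres usable
all_genres = movies['genres'].str.split('|').explode()
print(all_genres.value_counts()[:5])           # most common genres

# One column per genre, 1 where the movie has that genre
genre_dummies = movies['genres'].str.get_dummies(sep='|')
print(genre_dummies.head())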
*DAP Lab Exp.No.8*

US Baby Names 1880–2021:

The United States Social Security Administration (SSA) has made available data on the frequency of baby
names from 1880 to 2021

1. Use Data Wrangling to load this dataset
2. Find the sum of the births column by sex as the total number of births in that year
3. Assemble all of the data into a single DataFrame and further add a year field
4. Visualize total births by sex and year
5. Analyze Naming Trends

In [ ]:

from google.colab import drive

drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount,


call drive.mount("/content/drive/", force_remount=True).

In [ ]:

!head -n 10 /content/drive/MyDrive/names/yob1880.txt

Mary,F,7065

Anna,F,2604

Emma,F,2003

Elizabeth,F,1939

Minnie,F,1746

Margaret,F,1578

Ida,F,1472

Alice,F,1414

Bertha,F,1320

Sarah,F,1288

In [ ]:

import pandas as pd

In [ ]:

names1880 = pd.read_csv('/content/drive/MyDrive/names/yob1880.txt', names=['name', 'sex', 'births'])

In [ ]:

names1880

Out[ ]:

name sex births

0 Mary F 7065

1 Anna F 2604

2 Emma F 2003

3 Elizabeth F 1939

4 Minnie F 1746

... ... ... ...

1995 Woodie M 5

1996 Worthy M 5

1997 Wright M 5

1998 York M 5

1999 Zachariah M 5

2000 rows × 3 columns

In [ ]:

names1880.groupby('sex').births.sum()

Out[ ]:

sex

F 90994

M 110490

Name: births, dtype: int64

Since the dataset is split into files by year, one of the first things to do is to assemble all of the data into a single DataFrame and add a year field. We can do this using pandas.concat:

In [ ]:

years = range(1880, 2022)

pieces = []

columns = ['name', 'sex', 'births']

for year in years:
    path = '/content/drive/MyDrive/names/yob%d.txt' % year
    frame = pd.read_csv(path, names=columns)
    frame['year'] = year
    pieces.append(frame)

# Concatenate everything into a single DataFrame
names = pd.concat(pieces, ignore_index=True)

In [ ]:

names

Out[ ]:

name sex births year

0 Mary F 7065 1880

1 Anna F 2604 1880

2 Emma F 2003 1880

3 Elizabeth F 1939 1880

4 Minnie F 1746 1880

... ... ... ... ...

2052776 Zyeire M 5 2021

2052777 Zyel M 5 2021

2052778 Zyian M 5 2021

2052779 Zylar M 5 2021

2052780 Zyn M 5 2021

2052781 rows × 4 columns

In [ ]:

total_births = names.pivot_table('births', index='year', columns='sex', aggfunc=sum)

In [ ]:

total_births.tail()

Out[ ]:

sex F M

year

2017 1723043 1847191

2018 1696917 1811738

2019 1673030 1788414

2020 1609171 1718248

2021 1627098 1734277


In [ ]:

total_births.plot(title='Total births by sex and year')

Out[ ]:

<matplotlib.axes._subplots.AxesSubplot at 0x7fc928ce2d90>

Now, let’s insert a column prop with the fraction of babies given each name relative to the total number of
births. A prop value of 0.02 would indicate that 2 out of every 100 babies were given a particular name. Thus,
we group the data by year and sex, then add the new column to each group:

In [ ]:

def add_prop(group):
    group['prop'] = group.births / group.births.sum()
    return group

names = names.groupby(['year', 'sex']).apply(add_prop)

In [ ]:

names

Out[ ]:

name sex births year prop

0 Mary F 7065 1880 0.077642

1 Anna F 2604 1880 0.028617

2 Emma F 2003 1880 0.022012

3 Elizabeth F 1939 1880 0.021309

4 Minnie F 1746 1880 0.019188

... ... ... ... ... ...

2052776 Zyeire M 5 2021 0.000003

2052777 Zyel M 5 2021 0.000003

2052778 Zyian M 5 2021 0.000003

2052779 Zylar M 5 2021 0.000003

2052780 Zyn M 5 2021 0.000003

2052781 rows × 5 columns

In [ ]:

names.groupby(['year', 'sex']).prop.sum()

Out[ ]:

year sex

1880 F 1.0

M 1.0

1881 F 1.0

M 1.0

1882 F 1.0

...

2019 M 1.0

2020 F 1.0

M 1.0

2021 F 1.0

M 1.0

Name: prop, Length: 284, dtype: float64

Now that this is done, We are going to extract a subset of the data to facilitate further analysis: the top 1,000
names for each sex/year combination. This is yet another group operation:
In [ ]:

def get_top1000(group):
    return group.sort_values(by='births', ascending=False)[:1000]

grouped = names.groupby(['year', 'sex'])

top1000 = grouped.apply(get_top1000)

# Drop the group index, not needed
top1000.reset_index(inplace=True, drop=True)

In [ ]:

top1000

Out[ ]:

name sex births year prop

0 Mary F 7065 1880 0.077642

1 Anna F 2604 1880 0.028617

2 Emma F 2003 1880 0.022012

3 Elizabeth F 1939 1880 0.021309

4 Minnie F 1746 1880 0.019188

... ... ... ... ... ...

283871 Zev M 218 2021 0.000126

283872 Harris M 217 2021 0.000125

283873 Ronnie M 217 2021 0.000125

283874 Merrick M 217 2021 0.000125

283875 Mayson M 217 2021 0.000125

283876 rows × 5 columns

Analyzing Naming Trends

In [ ]:

boys = top1000[top1000.sex == 'M']

In [ ]:

girls = top1000[top1000.sex == 'F']

In [ ]:

total_births = top1000.pivot_table('births', index='year', columns='name', aggfunc=sum)

In [ ]:

total_births.info()

<class 'pandas.core.frame.DataFrame'>

Int64Index: 142 entries, 1880 to 2021

Columns: 7276 entries, Aaden to Zyon

dtypes: float64(7276)

memory usage: 7.9 MB

In [ ]:

subset = total_births[['John', 'Harry', 'Mary', 'Marilyn']]

In [ ]:

subset.plot(subplots=True, figsize=(12, 10), grid=False, title="Number of births per year")

Out[ ]:

array([<matplotlib.axes._subplots.AxesSubplot object at 0x7fc920e95b50>,

<matplotlib.axes._subplots.AxesSubplot object at 0x7fc9203e1350>,

<matplotlib.axes._subplots.AxesSubplot object at 0x7fc9203e1910>,

<matplotlib.axes._subplots.AxesSubplot object at 0x7fc92034c990>],

dtype=object)
In [ ]:

all_names = pd.Series(top1000.name.unique())

In [ ]:

lesley_like = all_names[all_names.str.lower().str.contains('lesl')]

In [ ]:

lesley_like

Out[ ]:

632 Leslie

2293 Lesley

4263 Leslee

4731 Lesli

6106 Lesly

dtype: object

In [ ]:

filtered = top1000[top1000.name.isin(lesley_like)]

In [ ]:

filtered.groupby('name').births.sum()

Out[ ]:

name

Leslee 1082

Lesley 35038

Lesli 929

Leslie 379721

Lesly 11433

Name: births, dtype: int64

In [ ]:

table = filtered.pivot_table('births', index='year', columns='sex', aggfunc='sum')

In [ ]:

table = table.div(table.sum(1), axis=0)

In [ ]:

table.tail()

Out[ ]:

sex F M

year

2017 1.0 NaN

2018 1.0 NaN

2019 1.0 NaN

2020 1.0 NaN

2021 1.0 NaN

In [ ]:

table.plot(style={'M': 'k-', 'F': 'k--'})

Out[ ]:

<matplotlib.axes._subplots.AxesSubplot at 0x7fc920f88810>
*DAP Lab Exp.No.9*

USDA Food Database:

The US Department of Agriculture makes available a database of food nutrient information

1. Load the dataset into Python with any JSON library
2. Convert a list of dicts to a DataFrame
3. Plot median values by food group and nutrient type
4. Display the result for the 'Amino Acids' nutrient group

In [ ]:

from google.colab import drive

drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount,


call drive.mount("/content/drive/", force_remount=True).

In [ ]:

import json

import pandas as pd

import numpy as np

import seaborn as sns

import matplotlib.pyplot as plt

sns.set_style("whitegrid")

In [ ]:

path = "/content/drive/MyDrive/usda_food/database.json"

db = json.load(open(path))

In [ ]:

len(db)

Out[ ]:

6636

In [ ]:

type(db), type(db[0])

Out[ ]:

(list, dict)

In [ ]:

db[0].keys()

Out[ ]:

dict_keys(['id', 'description', 'tags', 'manufacturer', 'group', 'portions', 'nutrients'])
In [ ]:

db[0]['nutrients'][0]

Out[ ]:

{'value': 25.18,

'units': 'g',

'description': 'Protein',

'group': 'Composition'}

In [ ]:

nutrients = pd.DataFrame(db[0]['nutrients'])

nutrients = nutrients[["description", "group", "units", "value"]]

# nutrients = nutrients.reindex(columns=["description", "group", "units", "value"])

In [ ]:

nutrients.head(10)

Out[ ]:

description group units value

0 Protein Composition g 25.18

1 Total lipid (fat) Composition g 29.20

2 Carbohydrate, by difference Composition g 3.06

3 Ash Other g 3.28

4 Energy Energy kcal 376.00

5 Water Composition g 39.28

6 Energy Energy kJ 1573.00

7 Fiber, total dietary Composition g 0.00

8 Calcium, Ca Elements mg 673.00

9 Iron, Fe Elements mg 0.64

In [ ]:

info_keys = ['description', 'group', 'id', 'manufacturer']

info = pd.DataFrame(db, columns=info_keys)

info[:5]

Out[ ]:

description group id manufacturer

0 Cheese, caraway Dairy and Egg Products 1008

1 Cheese, cheddar Dairy and Egg Products 1009

2 Cheese, edam Dairy and Egg Products 1018

3 Cheese, feta Dairy and Egg Products 1019

4 Cheese, mozzarella, part skim milk Dairy and Egg Products 1028
In [ ]:

info.info()

<class 'pandas.core.frame.DataFrame'>

RangeIndex: 6636 entries, 0 to 6635

Data columns (total 4 columns):

# Column Non-Null Count Dtype

--- ------ -------------- -----

0 description 6636 non-null object

1 group 6636 non-null object

2 id 6636 non-null int64

3 manufacturer 5195 non-null object

dtypes: int64(1), object(3)

memory usage: 207.5+ KB

In [ ]:

# distribution of food groups

pd.value_counts(info.group)[:10]

Out[ ]:

Vegetables and Vegetable Products 812

Beef Products 618

Baked Products 496

Breakfast Cereals 403

Legumes and Legume Products 365

Fast Foods 365

Lamb, Veal, and Game Products 345

Sweets 341

Fruits and Fruit Juices 328

Pork Products 328

Name: group, dtype: int64

In [ ]:

plt.figure(figsize=(12, 8))

_ = sns.barplot(x=pd.value_counts(info.group)[:10], y=pd.value_counts(info.group)[:10].index)

_ = plt.xlabel("Counts")

In [ ]:

# for analysis of nutrients

nutrients = []

for i in db:
    fnuts = pd.DataFrame(i["nutrients"])
    fnuts['id'] = i["id"]
    nutrients.append(fnuts)

nutrients = pd.concat(nutrients, ignore_index=True)

In [ ]:

nutrients = nutrients[["description", "group", "units", "value", "id"]]

nutrients.head(10)

Out[ ]:

description group units value id

0 Protein Composition g 25.18 1008

1 Total lipid (fat) Composition g 29.20 1008

2 Carbohydrate, by difference Composition g 3.06 1008

3 Ash Other g 3.28 1008

4 Energy Energy kcal 376.00 1008

5 Water Composition g 39.28 1008

6 Energy Energy kJ 1573.00 1008

7 Fiber, total dietary Composition g 0.00 1008

8 Calcium, Ca Elements mg 673.00 1008

9 Iron, Fe Elements mg 0.64 1008

In [ ]:

nutrients.duplicated().sum()

Out[ ]:

14179

In [ ]:

nutrients = nutrients.drop_duplicates()

In [ ]:

col_mapping = {'description' : 'food', 'group' : 'fgroup'}

In [ ]:

info = info.rename(columns=col_mapping, copy=False)


In [ ]:

info.info()

<class 'pandas.core.frame.DataFrame'>

RangeIndex: 6636 entries, 0 to 6635

Data columns (total 4 columns):

# Column Non-Null Count Dtype

--- ------ -------------- -----

0 food 6636 non-null object

1 fgroup 6636 non-null object

2 id 6636 non-null int64

3 manufacturer 5195 non-null object

dtypes: int64(1), object(3)

memory usage: 207.5+ KB

In [ ]:

col_mapping = {'description': 'nutrient', 'group': 'nutgroup'}

In [ ]:

nutrients = nutrients.rename(columns=col_mapping, copy=False)

In [ ]:

nutrients.info()

<class 'pandas.core.frame.DataFrame'>

Int64Index: 375176 entries, 0 to 389354

Data columns (total 5 columns):

# Column Non-Null Count Dtype

--- ------ -------------- -----

0 nutrient 375176 non-null object

1 nutgroup 375176 non-null object

2 units 375176 non-null object

3 value 375176 non-null float64

4 id 375176 non-null int64

dtypes: float64(1), int64(1), object(3)

memory usage: 17.2+ MB

In [ ]:

# merging both dataframe to get a collective data

ndata = pd.merge(nutrients, info, on='id', how='outer')

ndata.info()

<class 'pandas.core.frame.DataFrame'>

Int64Index: 375176 entries, 0 to 375175

Data columns (total 8 columns):

# Column Non-Null Count Dtype

--- ------ -------------- -----

0 nutrient 375176 non-null object

1 nutgroup 375176 non-null object

2 units 375176 non-null object

3 value 375176 non-null float64

4 id 375176 non-null int64

5 food 375176 non-null object

6 fgroup 375176 non-null object

7 manufacturer 293054 non-null object

dtypes: float64(1), int64(1), object(6)

memory usage: 25.8+ MB

In [ ]:

ndata.iloc[30000]

Out[ ]:

nutrient Glycine

nutgroup Amino Acids

units g

value 0.04

id 6158

food Soup, tomato bisque, canned, condensed

fgroup Soups, Sauces, and Gravies

manufacturer

Name: 30000, dtype: object

In [ ]:

by_nutrient = ndata.groupby(['nutgroup', 'nutrient'])

get_max = lambda x: x.loc[x.value.idxmax()]

get_min = lambda x: x.loc[x.value.idxmin()]

max_foods = by_nutrient.apply(get_max)[['value', 'food']]

# make the food a little smaller

max_foods.food = max_foods.food.str[:50]

In [ ]:

max_foods.loc['Amino Acids']['food']

Out[ ]:

nutrient

Alanine Gelatins, dry powder, unsweetened

Arginine Seeds, sesame flour, low-fat

Aspartic acid Soy protein isolate

Cystine Seeds, cottonseed flour, low fat (glandless)

Glutamic acid Soy protein isolate

Glycine Gelatins, dry powder, unsweetened

Histidine Whale, beluga, meat, dried (Alaska Native)

Hydroxyproline KENTUCKY FRIED CHICKEN, Fried Chicken, ORIGINA...

Isoleucine Soy protein isolate, PROTEIN TECHNOLOGIES INTE...

Leucine Soy protein isolate, PROTEIN TECHNOLOGIES INTE...

Lysine Seal, bearded (Oogruk), meat, dried (Alaska Na...

Methionine Fish, cod, Atlantic, dried and salted

Phenylalanine Soy protein isolate, PROTEIN TECHNOLOGIES INTE...

Proline Gelatins, dry powder, unsweetened

Serine Soy protein isolate, PROTEIN TECHNOLOGIES INTE...

Threonine Soy protein isolate, PROTEIN TECHNOLOGIES INTE...

Tryptophan Sea lion, Steller, meat with fat (Alaska Native)

Tyrosine Soy protein isolate, PROTEIN TECHNOLOGIES INTE...

Valine Soy protein isolate, PROTEIN TECHNOLOGIES INTE...

Name: food, dtype: object


*DAP Lab Exp.No.10*

2012 Federal Election Commission Database:

The US Federal Election Commission publishes data on contributions to political campaigns, including contributor names, occupations and employers, addresses, and contribution amounts. An interesting dataset is from the 2012 US presidential election.

1. Load the CSV file and convert it into a data frame
2. Compute an array of political parties from the candidate names
3. Analyze donation statistics by occupation and employer
4. Use pivot_table to aggregate the data by party and occupation
5. Plot total donations by party for top occupations
6. Bucket donation amounts
7. Plot the percentage of total donations received by candidates for each donation size
8. Analyze donation statistics by state

In [ ]:

from google.colab import drive

drive.mount('/content/drive/')

Mounted at /content/drive/

In [ ]:

import numpy as np

import pandas as pd

import matplotlib.pyplot as plt

import seaborn as sns

sns.set_style("darkgrid")

In [ ]:

path = "/content/drive/MyDrive/P00000001-ALL.csv"

fec = pd.read_csv(path, low_memory=False)

fec.info()

<class 'pandas.core.frame.DataFrame'>

RangeIndex: 1001731 entries, 0 to 1001730

Data columns (total 16 columns):

# Column Non-Null Count Dtype

--- ------ -------------- -----

0 cmte_id 1001731 non-null object

1 cand_id 1001731 non-null object

2 cand_nm 1001731 non-null object

3 contbr_nm 1001731 non-null object

4 contbr_city 1001712 non-null object

5 contbr_st 1001727 non-null object

6 contbr_zip 1001620 non-null object

7 contbr_employer 988002 non-null object

8 contbr_occupation 993301 non-null object

9 contb_receipt_amt 1001731 non-null float64

10 contb_receipt_dt 1001731 non-null object

11 receipt_desc 14166 non-null object

12 memo_cd 92482 non-null object

13 memo_text 97770 non-null object

14 form_tp 1001731 non-null object

15 file_num 1001731 non-null int64

dtypes: float64(1), int64(1), object(14)

memory usage: 122.3+ MB

In [ ]:

fec.iloc[123456]

Out[ ]:

cmte_id C00431445

cand_id P80003338

cand_nm Obama, Barack

contbr_nm ELLMAN, IRA

contbr_city TEMPE

contbr_st AZ

contbr_zip 852816719

contbr_employer ARIZONA STATE UNIVERSITY

contbr_occupation PROFESSOR

contb_receipt_amt 50.0

contb_receipt_dt 01-DEC-11

receipt_desc NaN

memo_cd NaN

memo_text NaN

form_tp SA17A

file_num 772372

Name: 123456, dtype: object


In [ ]:

unique_cands = fec.cand_nm.unique()

unique_cands

Out[ ]:

array(['Bachmann, Michelle', 'Romney, Mitt', 'Obama, Barack',

"Roemer, Charles E. 'Buddy' III", 'Pawlenty, Timothy',

'Johnson, Gary Earl', 'Paul, Ron', 'Santorum, Rick',

'Cain, Herman', 'Gingrich, Newt', 'McCotter, Thaddeus G',

'Huntsman, Jon', 'Perry, Rick'], dtype=object)

In [ ]:

parties = {'Bachmann, Michelle': 'Republican',
           'Cain, Herman': 'Republican',
           'Gingrich, Newt': 'Republican',
           'Huntsman, Jon': 'Republican',
           'Johnson, Gary Earl': 'Republican',
           'McCotter, Thaddeus G': 'Republican',
           'Obama, Barack': 'Democrat',
           'Paul, Ron': 'Republican',
           'Pawlenty, Timothy': 'Republican',
           'Perry, Rick': 'Republican',
           "Roemer, Charles E. 'Buddy' III": 'Republican',
           'Romney, Mitt': 'Republican',
           'Santorum, Rick': 'Republican'}

In [ ]:

fec.cand_nm[123456:123461]

Out[ ]:

123456 Obama, Barack

123457 Obama, Barack

123458 Obama, Barack

123459 Obama, Barack

123460 Obama, Barack

Name: cand_nm, dtype: object

In [ ]:

fec.cand_nm[123456:123461].map(parties)

Out[ ]:

123456 Democrat

123457 Democrat

123458 Democrat

123459 Democrat

123460 Democrat

Name: cand_nm, dtype: object


In [ ]:

# adding it as a column

fec['party'] = fec.cand_nm.map(parties)

In [ ]:

fec['party'].value_counts()

Out[ ]:

Democrat 593746

Republican 407985

Name: party, dtype: int64

In [ ]:

# number of contributions and refunds

(fec.contb_receipt_amt > 0).value_counts()

Out[ ]:

True 991475

False 10256

Name: contb_receipt_amt, dtype: int64

In [ ]:

# taking only contributions rows

fec = fec[fec.contb_receipt_amt > 0]

In [ ]:

# making a separate subset of only Barack Obama and Mitt Romney

fec_mrbo = fec[fec.cand_nm.isin(['Obama, Barack', 'Romney, Mitt'])]
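Both of these assignments leave slices of the original frame; later column assignments (such as the occupation and employer cleanups below) can trigger pandas' SettingWithCopyWarning. An optional precaution, depending on your pandas version, is to materialize explicit copies:

In [ ]:

# optional: make the filtered frames independent copies so later
# column assignments are unambiguous
fec = fec.copy()
fec_mrbo = fec_mrbo.copy()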

Donation Statistics by Occupation and Employer


In [ ]:

plt.figure(figsize=(12, 6))

top_10_donor_occ = fec.contbr_occupation.value_counts().head(10)

_ = sns.barplot(x=top_10_donor_occ.values, y=top_10_donor_occ.index)

In [ ]:

redundant_data = fec.contbr_occupation.isin([

'INFORMATION REQUESTED PER BEST EFFORTS',

'INFORMATION REQUESTED',

'INFORMATION REQUESTED (BEST EFFORTS)'

])

_ = sns.countplot(x=redundant_data)

_ = plt.title("Redundant occupation names")

In [ ]:

occ_mapping = {
    'INFORMATION REQUESTED PER BEST EFFORTS': 'NOT PROVIDED',
    'INFORMATION REQUESTED': 'NOT PROVIDED',
    'INFORMATION REQUESTED (BEST EFFORTS)': 'NOT PROVIDED',
    'C.E.O.': 'CEO'
}

# If no mapping is provided, return x unchanged
f = lambda x: occ_mapping.get(x, x)

fec.contbr_occupation = fec.contbr_occupation.map(f)
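An equivalent one-liner uses Series.replace, which (unlike map with a bare dict) leaves unmapped values unchanged, so no lambda fallback is needed:

In [ ]:

# same effect as the map(f) idiom above
fec.contbr_occupation = fec.contbr_occupation.replace(occ_mapping)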

In [ ]:

redundant_data = fec.contbr_employer.isin([

'INFORMATION REQUESTED PER BEST EFFORTS',

'INFORMATION REQUESTED',

'SELF',

'SELF EMPLOYED'

])

_ = sns.countplot(x=redundant_data)

_ = plt.title("Redundant employer names")

In [ ]:

# same thing for employers

emp_mapping = {
    'INFORMATION REQUESTED PER BEST EFFORTS': 'NOT PROVIDED',
    'INFORMATION REQUESTED': 'NOT PROVIDED',
    'SELF': 'SELF-EMPLOYED',
    'SELF EMPLOYED': 'SELF-EMPLOYED'
}

# If no mapping is provided, return x unchanged
f = lambda x: emp_mapping.get(x, x)

fec.contbr_employer = fec.contbr_employer.map(f)

In [ ]:

# aggregate donation totals by party and occupation;
# the next step filters to occupations that gave over $2 million overall
by_occupation = fec.pivot_table(values='contb_receipt_amt',
                                index='contbr_occupation',
                                columns='party',
                                aggfunc='sum')
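The same table can also be built with groupby plus unstack, which makes the aggregation steps explicit; a sketch that should be equivalent:

In [ ]:

by_occupation_alt = (fec.groupby(['contbr_occupation', 'party'])
                        ['contb_receipt_amt'].sum()
                        .unstack('party'))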

In [ ]:

by_occupation

Out[ ]:

party Democrat Republican

contbr_occupation

MIXED-MEDIA ARTIST / STORYTELLER 100.0 NaN

AREA VICE PRESIDENT 250.0 NaN

RESEARCH ASSOCIATE 100.0 NaN

TEACHER 500.0 NaN

THERAPIST 3900.0 NaN

... ... ...

ZOOKEEPER 35.0 NaN

ZOOLOGIST 400.0 NaN

ZOOLOGY EDUCATION 25.0 NaN

\NONE\ NaN 250.0

~ NaN 75.0

45064 rows × 2 columns

In [ ]:

over_2mm = by_occupation[by_occupation.sum(axis=1) > 2000000]

In [ ]:

over_2mm

Out[ ]:

party Democrat Republican

contbr_occupation

ATTORNEY 11141982.97 7477194.43

CEO 2074974.79 4211040.52

CONSULTANT 2459912.71 2544725.45

ENGINEER 951525.55 1818373.70

EXECUTIVE 1355161.05 4138850.09

HOMEMAKER 4248875.80 13634275.78

INVESTOR 884133.00 2431768.92

LAWYER 3160478.87 391224.32

MANAGER 762883.22 1444532.37

NOT PROVIDED 4866973.96 20565473.01

OWNER 1001567.36 2408286.92

PHYSICIAN 3735124.94 3594320.24

PRESIDENT 1878509.95 4720923.76

PROFESSOR 2165071.08 296702.73

REAL ESTATE 528902.09 1625902.25

RETIRED 25305116.38 23561244.49

SELF-EMPLOYED 672393.40 1640252.54


In [ ]:

_ = over_2mm.plot(kind='barh', figsize=(12, 8))

In [ ]:

def get_top_amounts(group, key, n=5):

totals = group.groupby(key)['contb_receipt_amt'].sum()

return totals.nlargest(n)

In [ ]:

# Then aggregate by occupation and employer:

grouped = fec_mrbo.groupby('cand_nm')

grouped.apply(get_top_amounts, 'contbr_occupation', n=7)

Out[ ]:

cand_nm contbr_occupation

Obama, Barack RETIRED 25305116.38


ATTORNEY 11141982.97
INFORMATION REQUESTED 4866973.96
HOMEMAKER 4248875.80
PHYSICIAN 3735124.94
LAWYER 3160478.87
CONSULTANT 2459912.71
Romney, Mitt RETIRED 11508473.59
INFORMATION REQUESTED PER BEST EFFORTS 11396894.84
HOMEMAKER 8147446.22
ATTORNEY 5364718.82
PRESIDENT 2491244.89
EXECUTIVE 2300947.03
C.E.O. 1968386.11
Name: contb_receipt_amt, dtype: float64

In [ ]:

grouped.apply(get_top_amounts, 'contbr_employer', n=10)

Out[ ]:

cand_nm contbr_employer

Obama, Barack RETIRED 22694358.85


SELF-EMPLOYED 17080985.96
NOT EMPLOYED 8586308.70
INFORMATION REQUESTED 5053480.37
HOMEMAKER 2605408.54
SELF 1076531.20
SELF EMPLOYED 469290.00
STUDENT 318831.45
VOLUNTEER 257104.00
MICROSOFT 215585.36
Romney, Mitt INFORMATION REQUESTED PER BEST EFFORTS 12059527.24
RETIRED 11506225.71
HOMEMAKER 8147196.22
SELF-EMPLOYED 7409860.98
STUDENT 496490.94
CREDIT SUISSE 281150.00
MORGAN STANLEY 267266.00
GOLDMAN SACH & CO. 238250.00
BARCLAYS CAPITAL 162750.00
H.I.G. CAPITAL 139500.00
Name: contb_receipt_amt, dtype: float64

Bucketing Donation Amounts


In [ ]:

bins = np.array([0, 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000])

In [ ]:

labels = pd.cut(fec_mrbo.contb_receipt_amt, bins)

labels

Out[ ]:

411 (10, 100]

412 (100, 1000]

413 (100, 1000]

414 (10, 100]

415 (10, 100]

...

701381 (10, 100]

701382 (100, 1000]

701383 (1, 10]

701384 (10, 100]

701385 (100, 1000]

Name: contb_receipt_amt, Length: 694282, dtype: category

Categories (8, interval[int64, right]): [(0, 1] < (1, 10] < (10, 100] < (100, 1000] <
                                         (1000, 10000] < (10000, 100000] <
                                         (100000, 1000000] < (1000000, 10000000]]
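pd.cut also accepts a labels argument when readable bucket names are preferred over interval notation; a sketch (the label strings below are invented):

In [ ]:

# one label per interval: the 9 edges in bins give 8 buckets
names = ['<=1', '1-10', '10-100', '100-1K', '1K-10K', '10K-100K', '100K-1M', '1M-10M']

pd.cut(fec_mrbo.contb_receipt_amt, bins, labels=names)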

In [ ]:

# We can then group the data for Obama and Romney by name and bin label to get a histogram by donation size

grouped = fec_mrbo.groupby(['cand_nm', labels])

In [ ]:

grouped.size().unstack(0)

Out[ ]:

cand_nm Obama, Barack Romney, Mitt

contb_receipt_amt

(0, 1] 493 77

(1, 10] 40070 3681

(10, 100] 372280 31853

(100, 1000] 153991 43357

(1000, 10000] 22284 26186

(10000, 100000] 2 1

(100000, 1000000] 3 0

(1000000, 10000000] 4 0
In [ ]:

# sum the contribution amounts and normalize within buckets to visualize the percentage of total donations of each size by candidate

bucket_sums = grouped.contb_receipt_amt.sum().unstack(0)

bucket_sums

Out[ ]:

cand_nm Obama, Barack Romney, Mitt

contb_receipt_amt

(0, 1] 318.24 77.00

(1, 10] 337267.62 29819.66

(10, 100] 20288981.41 1987783.76

(100, 1000] 54798531.46 22363381.69

(1000, 10000] 51753705.67 63942145.42

(10000, 100000] 59100.00 12700.00

(100000, 1000000] 1490683.08 0.00

(1000000, 10000000] 7148839.76 0.00

In [ ]:

normed_sums = bucket_sums.div(bucket_sums.sum(axis=1), axis=0)

normed_sums

Out[ ]:

cand_nm Obama, Barack Romney, Mitt

contb_receipt_amt

(0, 1] 0.805182 0.194818

(1, 10] 0.918767 0.081233

(10, 100] 0.910769 0.089231

(100, 1000] 0.710176 0.289824

(1000, 10000] 0.447326 0.552674

(10000, 100000] 0.823120 0.176880

(100000, 1000000] 1.000000 0.000000

(1000000, 10000000] 1.000000 0.000000
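Because each row was divided by its own total, the two candidates' shares in every bucket should sum to 1; a quick sanity check:

In [ ]:

# every row of normed_sums should come out as 1.0
normed_sums.sum(axis=1)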


In [ ]:

# exclude the two largest buckets, which only Obama received, to keep the scale readable
normed_sums[:-2].plot(kind='barh', figsize=(12, 8))

Out[ ]:

<matplotlib.axes._subplots.AxesSubplot at 0x7f1e3c983b10>

Donation Statistics by State

In [ ]:

grouped = fec_mrbo.groupby(['cand_nm', 'contbr_st'])

totals = grouped.contb_receipt_amt.sum().unstack(0).fillna(0)

totals = totals[totals.sum(axis=1) > 100000]
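Before plotting, it can help to see which states dominate the combined totals; a small sketch:

In [ ]:

# ten states with the largest combined Obama + Romney totals
totals.sum(axis=1).nlargest(10)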

In [ ]:

totals.head(10)

Out[ ]:

cand_nm Obama, Barack Romney, Mitt

contbr_st

AK 281840.15 86204.24

AL 543123.48 527303.51

AR 359247.28 105556.00

AZ 1506476.98 1888436.23

CA 23824984.24 11237636.60

CO 2132429.49 1506714.12

CT 2068291.26 3499475.45

DC 4373538.80 1025137.50

DE 336669.14 82712.00

FL 7318178.58 8338458.81

In [ ]:

percent = totals.div(totals.sum(axis=1), axis=0)

In [ ]:

data = percent.reset_index().melt(id_vars='contbr_st')

data.columns

Out[ ]:

Index(['contbr_st', 'cand_nm', 'value'], dtype='object')


In [ ]:

plt.figure(figsize=(22, 9))

_ = sns.barplot(x='contbr_st', y='value', hue='cand_nm', data=data)

_ = plt.title("$Donation$ $Statistics$ $by$ $State$", fontsize=13)

_ = plt.xlabel("$States$", fontsize=13)

_ = plt.ylabel("$Donation$ $percentage$", fontsize=13)
