Professional Documents
Culture Documents
19BBTCS069
B.Tech CSE
VII A
DAP 4BCS702
KRISHNA KANT PANDEY
19BBTCS069 VII
B.tech in CSE
4BCS702
*DAP Lab Exp.No.1*
Python Data Structures: Develop an application that uses different python data structures
1. Dictionaries
In [ ]:
# Creating a Dictionary
Roles = {}
print(Roles)
In [ ]:
#Displaying Keys
Roles.keys()
Out[ ]:
In [ ]:
#Displaying Values
Roles.values()
Out[ ]:
dict_values([1, 2, 3, 4])
In [ ]:
Roles.items()
Out[ ]:
In [ ]:
Roles['Moderator'] = 5
print(Roles)
In [ ]:
Roles.update({'Moderator':4})
print(Roles)
In [ ]:
print(Roles)
In [ ]:
Roles.clear()
print(Roles)
{}
2. Lists
In [ ]:
#Creating Lists
# empty list
my_list = []
print(my_list)
# list of integers
my_list = [1, 2, 3]
print(my_list)
print(my_list)
# nested list
print(my_list)
[]
[1, 2, 3]
In [ ]:
# first item
print(my_list[0])
# third item
print(my_list[2])
# fifth item
print(my_list[4])
# Nested List
# Nested indexing
print(n_list[0][1])
print(n_list[1][3])
In [ ]:
my_list = ['p','r','o','b','e']
# last item
print(my_list[-1])
print(my_list[-5])
In [ ]:
odd = [2, 4, 6, 8]
odd[0] = 1
print(odd)
odd[1:4] = [3, 5, 7]
print(odd)
[1, 4, 6, 8]
[1, 3, 5, 7]
In [ ]:
odd = [1, 3, 5]
print(["re"] * 3)
[1, 3, 5, 9, 7, 5]
In [ ]:
odd = [1, 9]
odd.insert(1,3)
print(odd)
odd[2:2] = [5, 7]
print(odd)
[1, 3, 9]
[1, 3, 5, 7, 9]
In [ ]:
del my_list[2]
print(my_list)
del my_list[1:5]
print(my_list)
del my_list
['p', 'm']
In [ ]:
my_list = ['p','r','o','b','l','e','m']
my_list.remove('p')
print(my_list)
# Output: 'o'
print(my_list.pop(1))
print(my_list)
# Output: 'm'
print(my_list.pop())
print(my_list)
my_list.clear()
# Output: []
print(my_list)
[]
3. Sets
In [ ]:
# Creating Sets
# set of integers
my_set = {1, 2, 3}
print(my_set)
print(my_set)
{1, 2, 3}
In [ ]:
my_set = {1, 2, 3, 4, 3, 2}
print(my_set)
print(my_set)
{1, 2, 3, 4}
{1, 2, 3}
In [ ]:
# initialize a with {}
a = {}
print(type(a))
a = set()
print(type(a))
<class 'dict'>
<class 'set'>
In [ ]:
# initialize my_set
my_set = {1, 3}
print(my_set)
# add an element
my_set.add(2)
print(my_set)
my_set.update([2, 3, 4])
print(my_set)
print(my_set)
{1, 3}
{1, 2, 3}
{1, 2, 3, 4}
{1, 2, 3, 4, 5, 6, 8}
In [ ]:
# initialize my_set
my_set = {1, 3, 4, 5, 6}
print(my_set)
# discard an element
# Output: {1, 3, 5, 6}
my_set.discard(4)
print(my_set)
# remove an element
# Output: {1, 3, 5}
my_set.remove(6)
print(my_set)
# discard an element
# Output: {1, 3, 5}
my_set.discard(2)
print(my_set)
{1, 3, 4, 5, 6}
{1, 3, 5, 6}
{1, 3, 5}
{1, 3, 5}
In [ ]:
# initialize my_set
my_set = set("HelloWorld")
print(my_set)
# pop an element
print(my_set.pop())
my_set.pop()
print(my_set)
# clear my_set
# Output: set()
my_set.clear()
print(my_set)
set()
In [ ]:
# initialize A and B
A = {1, 2, 3, 4, 5}
B = {4, 5, 6, 7, 8}
# use | operator
# Output: {1, 2, 3, 4, 5, 6, 7, 8}
print(A | B)
{1, 2, 3, 4, 5, 6, 7, 8}
In [ ]:
# Intersection of sets
# initialize A and B
A = {1, 2, 3, 4, 5}
B = {4, 5, 6, 7, 8}
# Output: {4, 5}
print(A & B)
{4, 5}
In [ ]:
# initialize A and B
A = {1, 2, 3, 4, 5}
B = {4, 5, 6, 7, 8}
# use - operator on A
# Output: {1, 2, 3}
print(A - B)
{1, 2, 3}
In [ ]:
# initialize A and B
A = {1, 2, 3, 4, 5}
B = {4, 5, 6, 7, 8}
# use ^ operator
# Output: {1, 2, 3, 6, 7, 8}
print(A ^ B)
{1, 2, 3, 6, 7, 8}
In [ ]:
# in keyword in a set
# initialize my_set
my_set = set("apple")
# Output: True
print('a' in my_set)
# Output: False
True
False
In [ ]:
4. Tuples
In [ ]:
# Creating Tuples
# Empty tuple
my_tuple = ()
print(my_tuple)
my_tuple1 = (1, 2, 3)
print(my_tuple1)
print(my_tuple2)
# nested tuple
print(my_tuple3)
()
(1, 2, 3)
In [ ]:
print(my_tuple4)
#Tuple Unpacking
a, b, c = my_tuple4
print("")
print(a)
print(b)
print(c)
4.6
dog
In [ ]:
my_tuple = ("hello")
print(type(my_tuple))
my_tuple = ("hello",)
print(type(my_tuple))
# Parentheses is optional
my_tuple = "hello",
print(type(my_tuple))
<class 'str'>
<class 'tuple'>
<class 'tuple'>
In [ ]:
my_tuple = ('p','e','r','m','i','t')
print(my_tuple[0])
print(my_tuple[5])
# nested tuple
# nested index
print(n_tuple[0][3])
print(n_tuple[1][1])
In [ ]:
print(my_tuple[-1])
print(my_tuple[-6])
In [ ]:
#Deleting Tuple
del my_tuple
print(my_tuple)
--------------------------------------------------------------------------
-
<ipython-input-33-197bc7ed8198> in <module>
1 #Deleting Tuple
2 del my_tuple
----> 3 print(my_tuple)
In [ ]:
print(my_tuple.count('p'))
print(my_tuple.index('l'))
In [ ]:
# In operation
print('a' in my_tuple)
print('b' in my_tuple)
# Not in operation
True
False
True
In [ ]:
my_string = 'Hello'
print(my_string)
my_string = "Hello"
print(my_string)
my_string = '''Hello'''
print(my_string)
CMR University"""
print(my_string)
Hello
Hello
Hello
Hello, Welcome to
CMR University
In [ ]:
# capitalize() function
print(string.capitalize())
In [ ]:
# lower() function
print(string.lower())
In [ ]:
# casefold() function
print(string.casefold())
In [ ]:
# upper() function
print(string.upper())
In [ ]:
# title() function
print(string.title())
In [ ]:
# count() function
print(string.count('i'))
In [ ]:
# find() function
print(string.find('U'))
In [ ]:
# replace() function
print(string.replace("Cmr", "CMR"))
In [1]:
# swapcase() function
print(string.swapcase())
In [2]:
# join() function
print(string.join(string_tuple))
In [1]:
#Regular expressions provide a flexible way to search or match (often more complex) str
ing patterns in text.
import re
In [16]:
pattern = '^k.....a$'
string = 'krishna'
if result:
print("Search Successful")
else:
print("Search Unsuccessful")
Search Successful
In [3]:
re.split('\s+', text)
Out[3]:
In [4]:
regex = re.compile('\s+')
In [5]:
regex.split(text)
Out[5]:
In [6]:
regex.findall(text)
Out[6]:
Parameswaran parameswaran.t@cmr.edu.in
Sathiyaraj sathiyaraj.r@cmr.edu.in
Mariyappan mariyappan.k@cmr.edu.in
"""
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.([A-Z0-9.-]+)+\.[A-Z]{2,4}'
In [8]:
regex.findall(text)
Out[8]:
In [9]:
m = regex.search(text)
Out[9]:
In [10]:
print(regex.match(text))
None
In [11]:
print(regex.sub('CMRU', text))
Prabhakar CMRU
Parameswaran CMRU
Sathiyaraj CMRU
Mariyappan CMRU
In [12]:
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z0-9.-]+)\.([A-Z]{2,4})'
In [13]:
m = regex.match('krishna.19bcs@cmr.edu.in')
m.groups()
Out[13]:
regex.findall(text)
Out[14]:
In [15]:
In [ ]:
import numpy as np
Creating ndarrays
In [ ]:
data = np.random.randn(2, 3)
data
Out[ ]:
In [ ]:
In [ ]:
arr1 = np.array(data1)
arr1
Out[ ]:
array([6. , 7.5, 8. , 0. , 1. ])
In [ ]:
data2 = [[1,2,3,4],[5,6,7,8]]
In [ ]:
arr2 = np.array(data2)
arr2
Out[ ]:
array([[1, 2, 3, 4],
[5, 6, 7, 8]])
In [ ]:
arr2.ndim
Out[ ]:
2
In [ ]:
arr2.shape
Out[ ]:
(2, 4)
In [ ]:
np.zeros(10)
Out[ ]:
array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
In [ ]:
np.zeros((3,6))
Out[ ]:
In [ ]:
np.empty((2,3,2))
Out[ ]:
array([[[2.05050840e-316, 2.57353024e-316],
[3.95252517e-323, 2.57353024e-316],
[3.95252517e-323, 2.57353024e-316]],
[[4.44659081e-323, 0.00000000e+000],
[2.57355554e-316, 2.57355554e-316],
[2.57355554e-316, 0.00000000e+000]]])
In [ ]:
np.arange(15)
Out[ ]:
In [ ]:
In [ ]:
arr1.dtype
Out[ ]:
dtype('float64')
In [ ]:
arr2.dtype
Out[ ]:
dtype('int32')
In [ ]:
In [ ]:
arr
Out[ ]:
In [ ]:
arr.astype(np.int32)
Out[ ]:
In [ ]:
In [ ]:
arr
Out[ ]:
In [ ]:
arr * arr
Out[ ]:
arr - arr
Out[ ]:
In [ ]:
1 / arr
Out[ ]:
In [ ]:
arr ** 0.5
Out[ ]:
In [ ]:
In [ ]:
arr2
Out[ ]:
In [ ]:
Out[ ]:
In [ ]:
arr = np.arange(10)
In [ ]:
arr
Out[ ]:
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
In [ ]:
arr[5]
Out[ ]:
In [ ]:
arr[5:8]
Out[ ]:
array([5, 6, 7])
In [ ]:
arr[5:8] = 50
In [ ]:
arr
Out[ ]:
In [ ]:
arr_slice = arr[5:8]
In [ ]:
arr_slice
Out[ ]:
In [ ]:
arr_slice[1] = 12345
In [ ]:
arr
Out[ ]:
9])
In [ ]:
arr_slice[:] = 64
In [ ]:
arr
Out[ ]:
In [ ]:
In [ ]:
arr2d[2]
Out[ ]:
array([7, 8, 9])
In [ ]:
arr2d[0][2]
Out[ ]:
In [ ]:
arr2d[0,2]
Out[ ]:
In [ ]:
arr3d = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]])
In [ ]:
arr3d
Out[ ]:
array([[[ 1, 2, 3],
[ 4, 5, 6]],
[[ 7, 8, 9],
arr3d[0]
Out[ ]:
array([[1, 2, 3],
[4, 5, 6]])
In [ ]:
old_values = arr3d[0].copy()
In [ ]:
arr3d[0] = 42
In [ ]:
arr3d
Out[ ]:
[[ 7, 8, 9],
In [ ]:
arr3d[0] = old_values
In [ ]:
arr3d
Out[ ]:
array([[[ 1, 2, 3],
[ 4, 5, 6]],
[[ 7, 8, 9],
In [ ]:
arr3d[1, 0]
Out[ ]:
array([7, 8, 9])
In [ ]:
x = arr3d[1]
In [ ]:
Out[ ]:
array([[ 7, 8, 9],
In [ ]:
x[0]
Out[ ]:
array([7, 8, 9])
In [ ]:
arr
Out[ ]:
In [ ]:
arr[1:6]
Out[ ]:
array([ 1, 2, 3, 4, 64])
In [ ]:
arr2d
Out[ ]:
array([[1, 2, 3],
[4, 5, 6],
[7, 8, 9]])
In [ ]:
arr2d[:2]
Out[ ]:
array([[1, 2, 3],
[4, 5, 6]])
In [ ]:
arr2d[:2, 1:]
Out[ ]:
array([[2, 3],
[5, 6]])
In [ ]:
arr2d[1, :2]
Out[ ]:
array([4, 5])
In [ ]:
arr2d[:2, 2]
Out[ ]:
array([3, 6])
In [ ]:
arr2d[:, :1]
Out[ ]:
array([[1],
[4],
[7]])
In [ ]:
arr2d[:2, 1:] = 0
In [ ]:
arr2d
Out[ ]:
array([[1, 0, 0],
[4, 0, 0],
[7, 8, 9]])
Boolean Indexing
In [ ]:
In [ ]:
data = np.random.randn(7, 4)
In [ ]:
names
Out[ ]:
data
Out[ ]:
In [ ]:
names == 'Bob'
Out[ ]:
In [ ]:
data[names == 'Bob']
Out[ ]:
In [ ]:
Out[ ]:
array([[0.26087237, 0.51739584],
[1.51469514, 0.74176921]])
Fancy Indexing
In [ ]:
In [ ]:
for i in range(8):
arr[i] = i
In [ ]:
arr
Out[ ]:
In [ ]:
arr[[4, 3, 0, 6]]
Out[ ]:
In [ ]:
Out[ ]:
In [ ]:
In [ ]:
arr
Out[ ]:
array([[ 0, 1, 2, 3],
[ 4, 5, 6, 7],
[ 8, 9, 10, 11],
Out[ ]:
In [ ]:
Out[ ]:
array([[ 4, 7, 5, 6],
[ 8, 11, 9, 10]])
In [ ]:
In [ ]:
arr
Out[ ]:
array([[ 0, 1, 2, 3, 4],
[ 5, 6, 7, 8, 9],
In [ ]:
arr.T
Out[ ]:
array([[ 0, 5, 10],
[ 1, 6, 11],
[ 2, 7, 12],
[ 3, 8, 13],
[ 4, 9, 14]])
In [ ]:
arr = np.random.randn(6, 3)
In [ ]:
arr
Out[ ]:
In [ ]:
np.dot(arr.T, arr)
Out[ ]:
In [ ]:
In [ ]:
arr
Out[ ]:
array([[[ 0, 1, 2, 3],
[ 4, 5, 6, 7]],
[[ 8, 9, 10, 11],
In [ ]:
arr.transpose((1, 0, 2))
Out[ ]:
array([[[ 0, 1, 2, 3],
[ 8, 9, 10, 11]],
[[ 4, 5, 6, 7],
In [ ]:
arr
Out[ ]:
array([[[ 0, 1, 2, 3],
[ 4, 5, 6, 7]],
[[ 8, 9, 10, 11],
arr.swapaxes(1, 2)
Out[ ]:
array([[[ 0, 4],
[ 1, 5],
[ 2, 6],
[ 3, 7]],
[[ 8, 12],
[ 9, 13],
[10, 14],
[11, 15]]])
*DAP Lab Exp.No.5*
In [ ]:
import pandas as pd
import numpy as np
In [ ]:
In [ ]:
obj
Out[ ]:
0 4
1 7
2 -5
3 3
dtype: int64
In [ ]:
frame = pd.DataFrame(data)
frame
Out[ ]:
In [ ]:
index = obj.index
index
Out[ ]:
index[1:]
Out[ ]:
In [ ]:
labels = pd.Index(np.arange(3))
labels
Out[ ]:
In [ ]:
obj2
Out[ ]:
0 1.5
1 -2.5
2 0.0
dtype: float64
In [ ]:
obj2.index is labels
Out[ ]:
True
In [ ]:
frame
Out[ ]:
In [ ]:
frame.columns
Out[ ]:
'Assam' in frame.columns
Out[ ]:
False
In [ ]:
2003 in frame.index
Out[ ]:
False
In [ ]:
In [ ]:
dup_labels
Out[ ]:
In [ ]:
In [ ]:
obj
Out[ ]:
d 4.5
b 7.2
a -5.3
c 3.6
dtype: float64
In [ ]:
In [ ]:
obj2
Out[ ]:
a -5.3
b 7.2
c 3.6
d 4.5
e NaN
dtype: float64
In [ ]:
obj3
Out[ ]:
0 blue
2 purple
4 yellow
dtype: object
In [ ]:
obj3.reindex(range(6), method='ffill')
Out[ ]:
0 blue
1 blue
2 purple
3 purple
4 yellow
5 yellow
dtype: object
In [ ]:
In [ ]:
frame
Out[ ]:
a 0 1 2
c 3 4 5
d 6 7 8
In [ ]:
frame2
Out[ ]:
In [ ]:
frame.reindex(columns=states)
Out[ ]:
a 1 NaN 2
c 4 NaN 5
d 7 NaN 8
In [ ]:
In [ ]:
obj
Out[ ]:
a 0.0
b 1.0
c 2.0
d 3.0
e 4.0
dtype: float64
In [ ]:
new_obj = obj.drop('c')
In [ ]:
new_obj
Out[ ]:
a 0.0
b 1.0
d 3.0
e 4.0
dtype: float64
In [ ]:
obj.drop(['d', 'c'])
Out[ ]:
a 0.0
b 1.0
e 4.0
dtype: float64
In [ ]:
In [ ]:
data
Out[ ]:
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
In [ ]:
data.drop(['Colorado', 'Ohio'])
Out[ ]:
Utah 8 9 10 11
New York 12 13 14 15
In [ ]:
data.drop('two', axis=1)
Out[ ]:
Ohio 0 2 3
Colorado 4 6 7
Utah 8 10 11
New York 12 14 15
In [ ]:
Out[ ]:
one three
Ohio 0 2
Colorado 4 6
Utah 8 10
New York 12 14
In [ ]:
obj.drop('c', inplace=True)
In [ ]:
obj
Out[ ]:
a 0.0
b 1.0
d 3.0
e 4.0
dtype: float64
In [ ]:
obj
Out[ ]:
a 0.0
b 1.0
c 2.0
d 3.0
dtype: float64
In [ ]:
obj['b']
Out[ ]:
1.0
In [ ]:
obj[1]
Out[ ]:
1.0
In [ ]:
obj[2:4]
Out[ ]:
c 2.0
d 3.0
dtype: float64
In [ ]:
Out[ ]:
b 1.0
a 0.0
d 3.0
dtype: float64
In [ ]:
obj[[1, 3]]
Out[ ]:
b 1.0
d 3.0
dtype: float64
In [ ]:
obj[obj < 2]
Out[ ]:
a 0.0
b 1.0
dtype: float64
In [ ]:
obj['b':'c']
Out[ ]:
b 1.0
c 2.0
dtype: float64
In [ ]:
obj['b':'c'] = 5
In [ ]:
obj
Out[ ]:
a 0.0
b 5.0
c 5.0
d 3.0
dtype: float64
In [ ]:
In [ ]:
data
Out[ ]:
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
In [ ]:
data['two']
Out[ ]:
Ohio 1
Colorado 5
Utah 9
New York 13
In [ ]:
data[['three', 'one']]
Out[ ]:
three one
Ohio 2 0
Colorado 6 4
Utah 10 8
New York 14 12
In [ ]:
data[:2]
Out[ ]:
Ohio 0 1 2 3
Colorado 4 5 6 7
In [ ]:
data[data['three'] > 5]
Out[ ]:
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
In [ ]:
data < 5
Out[ ]:
In [ ]:
data[data < 5] = 0
data
Out[ ]:
Ohio 0 0 0 0
Colorado 0 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
In [ ]:
Out[ ]:
two 5
three 6
In [ ]:
Out[ ]:
four 11
one 8
two 9
In [ ]:
Out[ ]:
Colorado 0 5 6
Utah 8 9 10
New York 12 13 14
In [ ]:
data.loc[:'Utah', 'two']
Out[ ]:
Ohio 0
Colorado 5
Utah 9
In [ ]:
data.iloc[2]
Out[ ]:
one 8
two 9
three 10
four 11
Out[ ]:
Colorado 7 0 5
Utah 11 8 9
Integer Indexes
In [ ]:
ser = pd.Series(np.arange(3.))
In [ ]:
ser
Out[ ]:
0 0.0
1 1.0
2 2.0
dtype: float64
In [ ]:
In [ ]:
ser2[-1]
Out[ ]:
2.0
In [ ]:
ser[:1]
Out[ ]:
0 0.0
dtype: float64
In [ ]:
ser.loc[:1]
Out[ ]:
0 0.0
1 1.0
dtype: float64
In [ ]:
ser.iloc[:1]
Out[ ]:
0 0.0
dtype: float64
*DAP Lab Exp.No.6*
In 2011, URL shortening service Bitly partnered with the US government website USA.gov to provide a feed
of anonymous data gathered from users who shorten links ending with .gov or .mil. In 2011, a live feed as
well as hourly snapshots were available as downloadable text files.
In [ ]:
drive.mount('/content/drive/')
In [ ]:
import numpy as np
np.random.seed(123)
import os
import pandas as pd
np.set_printoptions(precision=4)
pd.options.display.max_rows = 20
In [ ]:
path = "/content/drive/MyDrive/example.txt"
open(path).readline()
Out[ ]:
In [ ]:
import json
In [ ]:
records[0]
Out[ ]:
'c': 'US',
'nk': 1,
'tz': 'America/New_York',
'gr': 'MA',
'g': 'A6qOVH',
'h': 'wfLQtf',
'l': 'orofrog',
'al': 'en-US,en;q=0.8',
'hh': '1.usa.gov',
'r': 'http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/wfLQtf',
'u': 'http://www.ncbi.nlm.nih.gov/pubmed/22415991',
't': 1331923247,
'hc': 1331822918,
'cy': 'Danvers',
In [ ]:
time_zones[:10]
Out[ ]:
['America/New_York',
'America/Denver',
'America/New_York',
'America/Sao_Paulo',
'America/New_York',
'America/New_York',
'Europe/Warsaw',
'',
'',
'']
In [ ]:
def get_counts(sequence):
    """Count occurrences of each distinct item in *sequence*.

    Parameters
    ----------
    sequence : iterable of hashable items (here, time-zone strings).

    Returns
    -------
    dict
        Mapping of each distinct item to the number of times it occurs.
    """
    counts = {}
    for x in sequence:
        # dict.get with a default replaces the explicit `if x in counts`
        # membership test — one lookup instead of two.
        counts[x] = counts.get(x, 0) + 1
    return counts
In [ ]:
def get_counts2(sequence):
    """Count occurrences of each distinct item in *sequence*.

    Variant of ``get_counts`` using ``collections.defaultdict`` so the
    increment needs no membership test.

    Returns
    -------
    collections.defaultdict
        dict subclass mapping each distinct item to its count (compares
        equal to a plain dict with the same contents).
    """
    from collections import defaultdict  # local import keeps the fix self-contained

    # Bug fix: the original body referenced `counts` without ever
    # initializing it, raising NameError on the first iteration.
    counts = defaultdict(int)
    for x in sequence:
        counts[x] += 1
    return counts
In [ ]:
counts = get_counts(time_zones)
counts['America/New_York']
len(time_zones)
Out[ ]:
3440
In [ ]:
value_key_pairs.sort()
return value_key_pairs[-n:]
In [ ]:
top_counts(counts)
Out[ ]:
[(33, 'America/Sao_Paulo'),
(35, 'Europe/Madrid'),
(36, 'Pacific/Honolulu'),
(37, 'Asia/Tokyo'),
(74, 'Europe/London'),
(191, 'America/Denver'),
(382, 'America/Los_Angeles'),
(400, 'America/Chicago'),
(521, ''),
(1251, 'America/New_York')]
In [ ]:
counts = Counter(time_zones)
counts.most_common(10)
Out[ ]:
[('America/New_York', 1251),
('', 521),
('America/Chicago', 400),
('America/Los_Angeles', 382),
('America/Denver', 191),
('Europe/London', 74),
('Asia/Tokyo', 37),
('Pacific/Honolulu', 36),
('Europe/Madrid', 35),
('America/Sao_Paulo', 33)]
frame = pd.DataFrame(records)
frame.info()
frame['tz'][:10]
<class 'pandas.core.frame.DataFrame'>
17 kw 93 non-null object
Out[ ]:
0 America/New_York
1 America/Denver
2 America/New_York
3 America/Sao_Paulo
4 America/New_York
5 America/New_York
6 Europe/Warsaw
tz_counts = frame['tz'].value_counts()
tz_counts[:10]
Out[ ]:
America/New_York 1251
521
America/Chicago 400
America/Los_Angeles 382
America/Denver 191
Europe/London 74
Asia/Tokyo 37
Pacific/Honolulu 36
Europe/Madrid 35
America/Sao_Paulo 33
In [ ]:
clean_tz = frame['tz'].fillna('Missing')
tz_counts = clean_tz.value_counts()
tz_counts[:10]
Out[ ]:
America/New_York 1251
Unknown 521
America/Chicago 400
America/Los_Angeles 382
America/Denver 191
Missing 120
Europe/London 74
Asia/Tokyo 37
Pacific/Honolulu 36
Europe/Madrid 35
In [ ]:
plt.figure(figsize=(10, 4))
Out[ ]:
subset = tz_counts[:10]
sns.barplot(y=subset.index, x=subset.values)
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f9bcc9e56d0>
In [ ]:
frame['a'][1]
frame['a'][50]
Out[ ]:
In [ ]:
results[:5]
results.value_counts()[:8]
Out[ ]:
Mozilla/5.0 2594
Mozilla/4.0 601
GoogleMaps/RochesterNY 121
Opera/9.80 34
TEST_INTERNET_AGENT 24
GoogleProducer 21
Mozilla/6.0 5
BlackBerry8520/5.0.0.681 4
dtype: int64
In [ ]:
cframe = frame[frame.a.notnull()]
In [ ]:
cframe = cframe.copy()
In [ ]:
cframe['os'] = np.where(cframe['a'].str.contains('Windows'),
cframe['os'][:5]
Out[ ]:
0 Windows
1 Not Windows
2 Windows
3 Not Windows
4 Windows
In [ ]:
In [ ]:
agg_counts = by_tz_os.size().unstack().fillna(0)
agg_counts[:10]
Out[ ]:
tz
245.0 276.0
indexer = agg_counts.sum(1).argsort()
indexer[:10]
Out[ ]:
tz
24
Africa/Cairo 20
Africa/Casablanca 21
Africa/Ceuta 92
Africa/Johannesburg 87
Africa/Lusaka 53
America/Anchorage 54
America/Argentina/Buenos_Aires 57
America/Argentina/Cordoba 26
America/Argentina/Mendoza 55
dtype: int64
In [ ]:
count_subset = agg_counts.take(indexer[-10:])
count_subset
Out[ ]:
tz
245.0 276.0
agg_counts.sum(1).nlargest(10)
Out[ ]:
tz
America/New_York 1251.0
521.0
America/Chicago 400.0
America/Los_Angeles 382.0
America/Denver 191.0
Europe/London 74.0
Asia/Tokyo 37.0
Pacific/Honolulu 36.0
Europe/Madrid 35.0
America/Sao_Paulo 33.0
dtype: float64
In [ ]:
plt.figure()
Out[ ]:
In [ ]:
count_subset = count_subset.stack()
count_subset.name = 'total'
count_subset = count_subset.reset_index()
count_subset[:10]
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f9bcc464b90>
In [ ]:
def norm_total(group):
    # NOTE(review): as visible this is the identity function, which makes the
    # groupby-apply below a no-op. The body was likely truncated by the PDF
    # extraction — the original analysis (McKinney, "Python for Data
    # Analysis") normalizes each group's totals, e.g.
    # group['normed_total'] = group.total / group.total.sum().
    # Confirm against the source notebook before relying on the output.
    return group
results = count_subset.groupby('tz').apply(norm_total)
In [ ]:
plt.figure()
Out[ ]:
In [ ]:
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f9bd8dae650>
In [ ]:
g = count_subset.groupby('tz')
MovieLens 1M Dataset:
GroupLens Research provides a number of collections of movie ratings data collected from users of
MovieLens in the late 1990s and early 2000s. The data provide movie ratings, movie metadata (genres and
year), and demographic data about the users (age, zip code, gender identification, and occupation). Such
data is often of interest in the development of recommendation systems based on machine learning
algorithms. The MovieLens 1M dataset contains 1 million ratings collected from 6,000 users on 4,000
movies. It’s spread across three tables: ratings, user information, and movie information.
1. Load each table into a pandas data frame object using pandas.read_table
2. Merge ratings with users and then merge that result with the movies data
3. Calculate mean movie ratings for each film grouped by gender
4. Find top films among female viewers
5. Find the movies that are most divisive between male and female viewers
In [ ]:
drive.mount('/content/drive/')
In [ ]:
import pandas as pd
pd.options.display.max_rows = 10
In [ ]:
In [ ]:
In [ ]:
In [ ]:
users[:5]
Out[ ]:
0 1 F 1 10 48067
1 2 M 56 16 70072
2 3 M 25 15 55117
3 4 M 45 7 02460
4 5 M 25 20 55455
In [ ]:
ratings[:5]
Out[ ]:
0 1 1193 5 978300760
1 1 661 3 978302109
2 1 914 3 978301968
3 1 3408 4 978300275
4 1 2355 5 978824291
In [ ]:
movies[:5]
Out[ ]:
ratings
Out[ ]:
0 1 1193 5 978300760
1 1 661 3 978302109
2 1 914 3 978301968
3 1 3408 4 978300275
4 1 2355 5 978824291
Note
data
Out[ ]:
One Flew
Over the
0 1 1193 5 978300760 F 1 10 48067
Cuckoo's
Nest (1975)
One Flew
Over the
1 2 1193 5 978298413 M 56 16 70072
Cuckoo's
Nest (1975)
One Flew
Over the
2 12 1193 4 978220179 M 25 12 32793
Cuckoo's
Nest (1975)
One Flew
Over the
3 15 1193 4 978199279 M 25 7 22903
Cuckoo's
Nest (1975)
One Flew
Over the
4 17 1193 5 978158471 M 50 1 95350
Cuckoo's
Nest (1975)
... ... ... ... ... ... ... ... ... ...
Modulations
1000204 5949 2198 5 958846401 M 18 17 47901
(1998)
Broken
1000205 5675 2703 3 976029116 M 35 14 30030 Vessels
(1998)
White Boys
1000206 5780 2845 1 958153068 M 18 17 92886
(1999)
One Little
1000207 5851 3607 5 957756608 F 18 20 55410 Indian
(1973)
Five Wives,
Three
1000208 5938 2909 4 957273353 M 25 1 35401 Secretaries
and Me
(1998)
data.iloc[0]
Out[ ]:
user_id 1
movie_id 1193
rating 5
timestamp 978300760
gender F
age 1
occupation 10
zip 48067
genres Drama
In [ ]:
In [ ]:
mean_ratings[:5]
Out[ ]:
gender F M
title
In [ ]:
ratings_by_title = data.groupby('title').size()
In [ ]:
ratings_by_title[:10]
Out[ ]:
title
1-900 (1994) 2
dtype: int64
In [ ]:
In [ ]:
active_titles
Out[ ]:
'101 Dalmatians (1961)', '101 Dalmatians (1996)', '12 Angry Men (19
57)',
'20,000 Leagues Under the Sea (1954)', '2001: A Space Odyssey (196
8)',
'2010 (1984)',
...
In [ ]:
mean_ratings = mean_ratings.loc[active_titles]
In [ ]:
mean_ratings
Out[ ]:
gender F M
title
In [ ]:
#To see the top films among female viewers, we can sort by the F column in descending
top_female_ratings[:10]
Out[ ]:
gender F M
title
Wallace & Gromit: The Best of Aardman Animation (1996) 4.563107 4.385075
Suppose we want to find the movies that are most divisive between male and
female viewers. One way is to add a column to mean_ratings containing the
difference in means, then sort by that:
In [ ]:
In [ ]:
sorted_by_diff = mean_ratings.sort_values(by='diff')
sorted_by_diff[:10]
Out[ ]:
gender F M diff
title
Suppose instead we want the movies that elicited the most disagreement among
viewers, independent of gender identification. Disagreement can be measured by the
variance or standard deviation of the ratings:
In [ ]:
rating_std_by_title = data.groupby('title')['rating'].std()
rating_std_by_title = rating_std_by_title.loc[active_titles]
rating_std_by_title.sort_values(ascending=False)[:10]
Out[ ]:
title
We have noticed that movie genres are given as a pipe-separated (|) string. If
we want to do some analysis
by genre, more work would be required to transform the genre information into a more usable form.
*DAP Lab Exp.No.8*
The United States Social Security Administration (SSA) has made available data on the frequency of baby
names from 1880 to 2021
In [ ]:
drive.mount('/content/drive/')
In [ ]:
!head -n 10 /content/drive/MyDrive/names/yob1880.txt
Mary,F,7065
Anna,F,2604
Emma,F,2003
Elizabeth,F,1939
Minnie,F,1746
Margaret,F,1578
Ida,F,1472
Alice,F,1414
Bertha,F,1320
Sarah,F,1288
In [ ]:
import pandas as pd
In [ ]:
In [ ]:
names1880
Out[ ]:
0 Mary F 7065
1 Anna F 2604
2 Emma F 2003
3 Elizabeth F 1939
4 Minnie F 1746
1995 Woodie M 5
1996 Worthy M 5
1997 Wright M 5
1998 York M 5
1999 Zachariah M 5
In [ ]:
names1880.groupby('sex').births.sum()
Out[ ]:
sex
F 90994
M 110490
Since the dataset is split into files by year, one of the first things to do is to assemble all of the data into a
single DataFrame and further to add a year field. We can do this using pandas.concat:
In [ ]:
pieces = []
frame['year'] = year
pieces.append(frame)
In [ ]:
names
Out[ ]:
In [ ]:
In [ ]:
total_births.tail()
Out[ ]:
sex F M
year
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fc928ce2d90>
Now, let’s insert a column prop with the fraction of babies given each name relative to the total number of
births. A prop value of 0.02 would indicate that 2 out of every 100 babies were given a particular name. Thus,
we group the data by year and sex, then add the new column to each group:
In [ ]:
def add_prop(group):
    # NOTE(review): as visible this returns the group unchanged, yet the
    # surrounding text says it adds a `prop` column. The assignment line —
    # presumably group['prop'] = group.births / group.births.sum() —
    # appears to have been dropped by the text extraction. Confirm against
    # the source notebook.
    return group
In [ ]:
names
Out[ ]:
In [ ]:
names.groupby(['year', 'sex']).prop.sum()
Out[ ]:
year sex
1880 F 1.0
M 1.0
1881 F 1.0
M 1.0
1882 F 1.0
...
2019 M 1.0
2020 F 1.0
M 1.0
2021 F 1.0
M 1.0
Now that this is done, We are going to extract a subset of the data to facilitate further analysis: the top 1,000
names for each sex/year combination. This is yet another group operation:
In [ ]:
def get_top1000(group):
top1000 = grouped.apply(get_top1000)
top1000.reset_index(inplace=True, drop=True)
In [ ]:
top1000
Out[ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
total_births.info()
<class 'pandas.core.frame.DataFrame'>
dtypes: float64(7276)
In [ ]:
In [ ]:
Out[ ]:
dtype=object)
In [ ]:
all_names = pd.Series(top1000.name.unique())
In [ ]:
lesley_like = all_names[all_names.str.lower().str.contains('lesl')]
In [ ]:
lesley_like
Out[ ]:
632 Leslie
2293 Lesley
4263 Leslee
4731 Lesli
6106 Lesly
dtype: object
In [ ]:
filtered = top1000[top1000.name.isin(lesley_like)]
In [ ]:
filtered.groupby('name').births.sum()
Out[ ]:
name
Leslee 1082
Lesley 35038
Lesli 929
Leslie 379721
Lesly 11433
In [ ]:
In [ ]:
In [ ]:
table.tail()
Out[ ]:
sex F M
year
In [ ]:
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fc920f88810>
*DAP Lab Exp.No.9*
In [ ]:
drive.mount('/content/drive/')
In [ ]:
import json
import pandas as pd
import numpy as np
sns.set_style("whitegrid")
In [ ]:
path = "/content/drive/MyDrive/usda_food/database.json"
db = json.load(open(path))
In [ ]:
len(db)
Out[ ]:
6636
In [ ]:
type(db), type(db[0])
Out[ ]:
(list, dict)
In [ ]:
db[0].keys()
Out[ ]:
db[0]['nutrients'][0]
Out[ ]:
{'value': 25.18,
'units': 'g',
'description': 'Protein',
'group': 'Composition'}
In [ ]:
nutrients = pd.DataFrame(db[0]['nutrients'])
In [ ]:
nutrients.head(10)
Out[ ]:
In [ ]:
info[:5]
Out[ ]:
4 Cheese, mozzarella, part skim milk Dairy and Egg Products 1028
In [ ]:
info.info()
<class 'pandas.core.frame.DataFrame'>
In [ ]:
pd.value_counts(info.group)[:10]
Out[ ]:
Sweets 341
In [ ]:
plt.figure(figsize=(12, 8))
_ = sns.barplot(x=pd.value_counts(info.group)[:10], y=pd.value_counts(info.group)[:10].
index)
_ = plt.xlabel("Counts")
In [ ]:
nutrients = []
for i in db:
fnuts = pd.DataFrame(i["nutrients"])
fnuts['id'] = i["id"]
nutrients.append(fnuts)
In [ ]:
nutrients.head(10)
Out[ ]:
In [ ]:
nutrients.duplicated().sum()
Out[ ]:
14179
In [ ]:
nutrients = nutrients.drop_duplicates()
In [ ]:
In [ ]:
info.info()
<class 'pandas.core.frame.DataFrame'>
In [ ]:
In [ ]:
In [ ]:
nutrients.info()
<class 'pandas.core.frame.DataFrame'>
In [ ]:
ndata.info()
<class 'pandas.core.frame.DataFrame'>
In [ ]:
ndata.iloc[30000]
Out[ ]:
nutrient Glycine
units g
value 0.04
id 6158
manufacturer
In [ ]:
max_foods.food = max_foods.food.str[:50]
In [ ]:
max_foods.loc['Amino Acids']['food']
Out[ ]:
nutrient
The US Federal Election Commission publishes data on contributions to political campaigns. This includes
contributor names, occupation and employer, address, and contribution amount. An interesting dataset is
from the 2012 US presidential election
In [ ]:
drive.mount('/content/drive/')
Mounted at /content/drive/
In [ ]:
import numpy as np
import pandas as pd
sns.set_style("darkgrid")
In [ ]:
path = "/content/drive/MyDrive/P00000001-ALL.csv"
fec.info()
<class 'pandas.core.frame.DataFrame'>
In [ ]:
fec.iloc[123456]
Out[ ]:
cmte_id C00431445
cand_id P80003338
contbr_city TEMPE
contbr_st AZ
contbr_zip 852816719
contbr_occupation PROFESSOR
contb_receipt_amt 50.0
contb_receipt_dt 01-DEC-11
receipt_desc NaN
memo_cd NaN
memo_text NaN
form_tp SA17A
file_num 772372
unique_cands = fec.cand_nm.unique()
unique_cands
Out[ ]:
In [ ]:
In [ ]:
fec.cand_nm[123456:123461]
Out[ ]:
In [ ]:
fec.cand_nm[123456:123461].map(parties)
Out[ ]:
123456 Democrat
123457 Democrat
123458 Democrat
123459 Democrat
123460 Democrat
# adding it as a column
fec['party'] = fec.cand_nm.map(parties)
In [ ]:
fec['party'].value_counts()
Out[ ]:
Democrat 593746
Republican 407985
In [ ]:
Out[ ]:
True 991475
False 10256
In [ ]:
In [ ]:
plt.figure(figsize=(12, 6))
top_10_donor_occ = fec.contbr_occupation.value_counts().head(10)
_ = sns.barplot(top_10_donor_occ.values, top_10_donor_occ.index)
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWa
rning: Pass the following variables as keyword args: x, y. From version 0.
12, the only valid positional argument will be `data`, and passing other a
rguments without an explicit keyword will result in an error or misinterpr
etation.
FutureWarning
In [ ]:
redundant_data = fec.contbr_occupation.isin([
'INFORMATION REQUESTED',
])
_ = sns.countplot(redundant_data)
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWa
rning: Pass the following variable as a keyword arg: x. From version 0.12,
the only valid positional argument will be `data`, and passing other argum
ents without an explicit keyword will result in an error or misinterpretat
ion.
FutureWarning
In [ ]:
occ_mapping = {
'INFORMATION REQUESTED PER BEST EFFORTS' : 'NOT PROVIDED',
'C.E.O.': 'CEO'
f = lambda x: occ_mapping.get(x, x)
fec.contbr_occupation = fec.contbr_occupation.map(f)
In [ ]:
redundant_data = fec.contbr_employer.isin([
'INFORMATION REQUESTED',
'SELF',
'SELF EMPLOYED'
])
_ = sns.countplot(redundant_data)
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWa
rning: Pass the following variable as a keyword arg: x. From version 0.12,
the only valid positional argument will be `data`, and passing other argum
ents without an explicit keyword will result in an error or misinterpretat
ion.
FutureWarning
In [ ]:
emp_mapping = {
'INFORMATION REQUESTED PER BEST EFFORTS' : 'NOT PROVIDED',
'SELF' : 'SELF-EMPLOYED',
f = lambda x: emp_mapping.get(x, x)
fec.contbr_employer = fec.contbr_employer.map(f)
In [ ]:
# aggregate data by party and occupation and donation over $2 million overall
by_occupation = fec.pivot_table(values='contb_receipt_amt',
index='contbr_occupation',
columns='party',
aggfunc='sum')
In [ ]:
by_occupation
Out[ ]:
contbr_occupation
~ NaN 75.0
In [ ]:
In [ ]:
over_2mm
Out[ ]:
contbr_occupation
In [ ]:
totals = group.groupby(key)['contb_receipt_amt'].sum()
return totals.nlargest(n)
In [ ]:
grouped = fec_mrbo.groupby('cand_nm')
Out[ ]:
cand_nm contbr_occupation
In [ ]:
Out[ ]:
cand_nm contbr_employer
In [ ]:
labels
Out[ ]:
...
Categories (8, interval[int64, right]): [(0, 1] < (1, 10] < (10, 100] < (1
00, 1000] <
(1000000, 10000000]]
In [ ]:
# We can then group the data for Obama and Romney by name and bin label to get a histogram by donation size
In [ ]:
grouped.size().unstack(0)
Out[ ]:
contb_receipt_amt
(0, 1] 493 77
(10000, 100000] 2 1
(100000, 1000000] 3 0
(1000000, 10000000] 4 0
In [ ]:
# sum the contribution amounts and normalize within buckets to visualize percentage of total donations of each size by candidate
bucket_sums = grouped.contb_receipt_amt.sum().unstack(0)
bucket_sums
Out[ ]:
contb_receipt_amt
In [ ]:
normed_sums
Out[ ]:
contb_receipt_amt
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f1e3c983b10>
In [ ]:
totals = grouped.contb_receipt_amt.sum().unstack(0).fillna(0)
In [ ]:
totals.head(10)
Out[ ]:
contbr_st
AK 281840.15 86204.24
AL 543123.48 527303.51
AR 359247.28 105556.00
AZ 1506476.98 1888436.23
CA 23824984.24 11237636.60
CO 2132429.49 1506714.12
CT 2068291.26 3499475.45
DC 4373538.80 1025137.50
DE 336669.14 82712.00
FL 7318178.58 8338458.81
In [ ]:
In [ ]:
data = percent.reset_index().melt(id_vars='contbr_st')
data.columns
Out[ ]:
plt.figure(figsize=(22, 9))
_ = plt.xlabel("$States$", fontsize=13)