You are on page 1 of 33

Big Data

Section 7

By : Yosra Attaher
Agenda


NumPy

Pandas

Talk about the project
What is NumPy?

NumPy is a Python library used for working with arrays.

It also has functions for working in the domains of linear
algebra, Fourier transforms, and matrices.

NumPy was created in 2005 by Travis Oliphant. It is an
open source project and you can use it freely.

NumPy stands for Numerical Python.
Why Use NumPy?

In Python we have lists that serve the purpose of arrays,
but they are slow to process.

NumPy aims to provide an array object that is up to 50x
faster than traditional Python lists.

The array object in NumPy is called ndarray; it provides
a lot of supporting functions that make working with
ndarray very easy.

Arrays are very frequently used in data science, where
speed and resources are very important.
NumPy, Arrays
import numpy as np
# create a NumPy ndarray object by using the array() function.
a = np.array(45)
b = np.array([1, 2, 3])
c = np.array((1, 2, 3)) #Use a tuple to create a NumPy array
d = np.array([[1, 2, 3], [3, 4, 5],[2, 3, 4]])
e = np.array((1, 2, 3), ndmin=5)
print(a.ndim, a)
print(a)
Access Arrays
print(b.ndim, b) # 1 [1 2 3]
print(b[0], ', ', b[2]) #1, 3
print(c.ndim, c) # 1 [1 2 3]
print(c[0], ', ', c[1], ', ', c[-1]) #1, 2, 3
print(d.ndim, d) # 2 [[1 2 3] [3 4 5] [2 3 4]]
print(d[0,0], ', ', d[1,2]) #1, 5
print(e.ndim, e) # 5 [[[[[1 2 3]]]]]
print(e[0,0,0,0,2]) #3
Access Arrays
x = np.array([1,2,3,4,5,6,7,8,9,10])
print(x[1:5]) # [2 3 4 5]
print(x[4:]) # [ 5 6 7 8 9 10]
print(x[:4]) # [1 2 3 4]
print(x[-3:-1]) # [8 9]
print(x[1:5:2]) # [2 4]
print(x[::2]) # [1 3 5 7 9]
Slice
x = np.array([[1, 2, 3, 5], [3, 4, 5, 9],[2, 3, 4, 12],[4, 2, 3, 1]])

RUN:
print(x[1, 1:4])
[4 5 9]
print ("--------------------------") --------------------------

print(x[0:2, 2:4]) [[3 5]


[5 9]]
print ("--------------------------")
--------------------------
print(x[1:3, 1:3]) [[4 5]
[3 4]]
Conversion
x = np.array([1.2, 2.4, 3.5], 'f8') RUN...
print(x.dtype); print(x) float64

y = x.astype('i4') [1.2 2.4 3.5]

print(y.dtype); print(y) int32

x = np.array([1.2, 2.4, 3.5], 'i4') [1 2 3]

print(x.dtype); print(x) int32

x = np.array([1.2, 2.4, 3.5], 'i1') [1 2 3]

print(x.dtype); print(x) int8


[1 2 3]
x = np.array(["1.2", "2.4", "3.5"], 'f8')
float64
print(x.dtype); print(x)
[1.2 2.4 3.5]
Copy and View
x = np.array([1,2,3,4,5,6,7,8,9,10]) RUN….
y = x.copy()
[ 1 2 3 4 5 6 7 8 9 10]
x[1] = 12
[ 1 12 3 4 5 6 7 8 9 10]
print(y)
---------------
y = x.view()
x[1] = 12 [ 1 15 3 4 5 6 7 8 9 10]
print(y) None [ 1 15 3 4 5 6 7 8 9 10]
y[1] = 15
print ("---------------")
print(x)
print(x.base, y.base)
Reshape
x = np.array([[1, 2, 3, 5], [3, 4, 5, 9],[2, 3, 4, 12],
[4, 2, 3, 1]]) RUN…...
--------------------
print("--------------------")
(4, 4)
print(x.shape)
[[ 1 2 3 5 3 4 5 9]
y = x.reshape(2, 8) [ 2 3 4 12 4 2 3 1]]
print(y) --------------------

print("--------------------") [ 1 2 3 5 3 4 5 9 2 3 4 12 4 2 3 1]
--------------------
y = x.reshape(-1)
[[ 1 2 3 5]
print(y) [ 3 4 5 9]
print("--------------------") [ 2 3 4 12]

print(y.base) [ 4 2 3 1]]
Join
x = np.array([[1, 1, 1], [2, 2, 2]]) Run…

(4, 3) [[1 1 1]
y = np.array([[3, 3, 3], [4, 4, 4]])
[2 2 2]
z = np.concatenate((x,y)) [3 3 3]

print(z.shape, z) [4 4 4]]

--------------
print("--------------")
(4, 3) [[1 1 1]
z = np.concatenate((x,y), axis=0) [2 2 2]

print(z.shape, z) [3 3 3]

[4 4 4]]
print("--------------")
--------------
z = np.concatenate((x,y), axis=1)
(2, 6) [[1 1 1 3 3 3]
print(z.shape, z) [2 2 2 4 4 4]]
Search, Sort, Filter
#search
x = np.array([11, 31, 87, 19, 23, 43])
y = np.where(x==19); print(y) RUN…
#sort
(array([3]),)
x = np.array([11, 31, 87, 19, 23, 43])
[11 19 23 31 43 87]
y = np.sort(x); print(y)
#filter [11 23 43]

x = np.array([11, 31, 87, 19, 23, 43])


s = [True, False, False, False, True, True]
y = x[s]; print(y)
NUMPY, RANDOM
import numpy as np RUN…

0.7537900893332695

from numpy import random 86

#basics [30 79 10 14 94]

[[ 7 91 46]

x = random.rand(); print(x) [ 0 65 56]

x = random.randint(100); print(x) [62 64 28]

[91 72 18]

x = random.randint(100, size=5); [16 37 24]]


print(x) [0.89308242 0.11235977 0.57879863 0.63562923 0.68296079]

x = random.randint(100, size=(5, 3)); [[0.28630843 0.87333319 0.07027453]

print(x) [0.82643457 0.81043574 0.47318528]

[0.38990336 0.267552 0.23475348]


x = random.rand(5); print(x) [0.28870442 0.82799002 0.85453119]

x = random.rand(5,3); print(x) [0.55594484 0.29363382 0.97318952]]


Random Choice
x = random.choice([5,3,7,8]); RUN…
print(x)
5
x = random.choice([5,3,7,8],
size=(10)); print(x) [7 3 3 5 5 7 5 5 8 5]

x = random.choice([5,3,7,8], [[8 8 3]
size=(2,3)); print(x) [7 8 3]]
x = random.choice([5,3,7,8], [7 7 7 7 3 3 7 7 7 7]
p=[0.1, 0.3, 0.6, 0.0],
size=(10));
print(x)
Shuffle
x = np.array([1,2,3,4,5,6,7,8]) RUN…
o = x.copy() [1 2 3 4 5 6 7 8]
random.shuffle(x) [7 6 3 5 1 4 8 2]
print('\n', o, '\n', x)
x = np.array([1,2,3,4,5,6,7,8]) [1 2 3 4 5 6 7 8]
y = random.permutation(x) [2 4 3 6 5 1 7 8]
print('\n', x, '\n', y)
Random Distribution
import numpy as np
from numpy import random
import matplotlib.pyplot as plt

# We can plot Normal Distribution, Binomial Distribution,
# Poisson Distribution, Uniform Distribution, Logarithmic
# Distribution, Multinomial Distribution, Exponential
# Distribution, Chi-Square Distribution
What is Pandas?

Pandas is a Python library used for working with
data sets.

It has functions for analyzing, cleaning, exploring,
and manipulating data.

The name "Pandas" refers to both
"Panel Data" and "Python Data Analysis". The library
was created by Wes McKinney in 2008.
Why Use Pandas?

Pandas allows us to analyze big data and draw
conclusions based on statistical theories.

Pandas can clean messy data sets, and make
them readable and relevant.

Relevant data is very important in data science.
Series, Creation
import pandas as pd
import numpy as np
#creating series
s = pd.Series([22, 32, 31, 42, 51]); print(s)
data = np.array(['a', 'b', 'c', 'd'])
s = pd.Series(data); print(s)
s = pd.Series(data,
index=[100,101,102,103]); print(s)
Series, Creation
data = {'a':100, 'b':120, 'c':99}
s = pd.Series(data); print(s)
data = {'c':99, 'a':100, 'b':120}
s = pd.Series(data, index=['a', 'b', 'c', 'd']);
print(s)
s = pd.Series(5, index=['a', 'b', 'c', 'd']);
print(s)
Series, Accessing
s = pd.Series([1,2,3,4,5],index =
['a','b','c','d','e'])
print(s[0])
print(s[1:3])
print(s[:3])
print(s[1:])
print(s[:])
print(s[-1])
print(s[-3:-1])
print(s['a'])
print(s[['a', 'c', 'e']])
print(s[[2, 4]])
Series, Basic Functions
calories = {'day1': 200, 'day2': 380,
'day3': 480, 'day4': 290}
s = pd.Series(calories); print(s.axes)
print(s.empty)
print(s.ndim)
print(s.size)
print(s.values)
print(s.head(2))
print(s.tail(2))
DataFrame, Creation
data = [12, 12, 13, 14, 15]
df = pd.DataFrame(data); print(df)
df = pd.DataFrame(data, columns =
['Temperature']);
print(df)
df = pd.DataFrame(data, columns =
['Temperature'],
dtype=float); print(df)
data = [['Alex',10],['Bob',12],['Clarke',13]]
df = pd.DataFrame(data, columns=['Name', 'Age']);
print(df)
DataFrame, Creation
data = {
"calories": [200, 380, 480, 290],
"duration": [50, 40, 45, 30]
}
df = pd.DataFrame(data); print(df)
df = pd.DataFrame(data, index=['sat', 'sun',
'mon','tus']); print(df)
df = pd.DataFrame([{'math':88,
'physics':90},{'history':75, 'math':94}]); print(df)
DataFrame, Creation
data = {
'calories': pd.Series([200, 380, 480, 290],
index=['sat', 'sun', 'mon','tus']),
'duration': pd.Series([50, 40, 45, 30], index=['sat',
'sun', 'mon','tus'])
}
df = pd.DataFrame(data); print(df)
df = pd.DataFrame([{'math':88, 'physics':90},{'art':65,
'math':94}], index=['midterm', 'final'],
columns=['physics', 'math', 'art']); print(df)
DataFrame, Basic Functions
data = {
'Name':pd.Series(['Tom','James','Steve','Smith',
'Jack']),
'Age':pd.Series([25,26,25,23,30,29,23]),
'Rating':pd.Series([4.23,3.24,3.98,2.56,3.20,4.6,
3.8])};
df = pd.DataFrame(data)
print(df)
print(df.T)
print(df.axes)
print(df.dtypes)
print(df.empty)
print(df.ndim)
print(df.shape)
print(df.size)
print(df.values)
DataFrame, Files
df = pd.read_csv('data/data.csv');
print(df)
df = pd.read_json('data/data.json');
print(df)
print(df.head())
print(df.head(10))
print(df.tail())
print(df.tail(6))
print(df.info())
DataFrame, Cleaning
df = pd.read_csv('data/wdata.csv');
print(df)
print(df.loc[[22, 26,7, 11, 12,
18, 28]])
print(df.info())
DataFrame, Cleaning
dfcopy = df.dropna();
print(dfcopy.info())
df.dropna(inplace = True);
print(df.info())
df = pd.read_csv('data/wdata.csv')
print(df.loc[[22, 26, 7, 11, 12, 18, 28]])
df.fillna(130, inplace = True);
print(df.info())
print(df.loc[[22, 26, 7, 11, 12, 18, 28]])
DataFrame, Cleaning
df = pd.read_csv('data/wdata.csv')
df.dropna(subset=['Date'], inplace = True)
print(df.info())
df = pd.read_csv('data/wdata.csv')
print(df.duplicated())
df.drop_duplicates(inplace=True)
print(df.duplicated())
DataFrame, Files
df = pd.read_csv('data/data.csv');
print(df)
df = pd.read_json('data/data.json');
print(df)
print(df.head())
print(df.head(10))
print(df.tail())
print(df.tail(6))
print(df.info())
Thanks

You might also like