Professional Documents
Culture Documents
TXT file :
CSV file :
1. Creating
2. Renaming
3. Combining levels
Creating data map & data plan : (see the sketch after this list)
Dealing with date & time values :
TF-IDF :
N-grams :
Why Python :
Anaconda Prompt :
Scikit learn :
Network X :
Numpy :
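A data map describes what every column of a dataset should contain, and a data plan records the checks and fixes you intend to apply before analysis. Neither has a fixed format, so here is a minimal sketch, assuming the map is a plain dict keyed by column name; the columns ('Age', 'Fare') and their rules are hypothetical, not from any particular dataset:

import pandas as pd

# Hypothetical data map: expected dtype and allowed value range per column.
data_map = {'Age':  {'dtype': 'float64', 'min': 0, 'max': 120},
            'Fare': {'dtype': 'float64', 'min': 0, 'max': None}}

df = pd.DataFrame({'Age': [22.0, 38.0, -1.0],
                   'Fare': [7.25, 71.28, 8.05]})

# Data plan: walk the map and report rows that violate each column's rules.
for col, rule in data_map.items():
    bad = pd.Series(False, index=df.index)
    if rule['min'] is not None:
        bad |= df[col] < rule['min']
    if rule['max'] is not None:
        bad |= df[col] > rule['max']
    print(col, 'violations at rows:', df.index[bad].tolist())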
Understanding Tools :
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(1000)
mu, sigma = 100, 15
x = mu + sigma * np.random.randn(10000)

n, bins, patches = plt.hist(x, 50, density=True, facecolor='g', alpha=0.75)

plt.xlabel('Smarts')
plt.ylabel('Probability')
plt.title('Histogram of IQ')
plt.text(60, .025, r'$\mu=100,\ \sigma=15$')
plt.axis([40, 160, 0, 0.03])
plt.grid(True)
plt.show()
from IPython.display import Image

Embed = Image(
    'http://blog.johnmuellerbooks.com/' +
    'wp-content/uploads/2015/04/Layer-Hens.jpg')

Embed
# Uploading :
with open("Colors.txt", 'r') as open_file:
    print('Colors.txt content:\n' + open_file.read())

print('\n')

# Streaming :
with open("Colors.txt", 'r') as open_file:
    for observation in open_file:
        print('Reading Data: ' + observation)
Colors.txt content:
Color Value
Red 1
Orange 2
Yellow 3
Green 4
Blue 5
Purple 6
Black 7
White 8
import matplotlib.pyplot as plt
import matplotlib.image as img
%matplotlib inline

image = img.imread("Colorblk.jpg")
print(image.shape)
print(image.size)
plt.imshow(image)
plt.show()
(100, 100, 3)
30000
# Sampling :
n = 2
with open("Colors.txt", 'r') as open_file:
    for j, observation in enumerate(open_file):
        if j % n == 0:
            print('Reading Line: ' + str(j) +
                  ' Content: ' + observation)

print('\n')

from random import random
sample_size = 0.25
with open("Colors.txt", 'r') as open_file:
    for j, observation in enumerate(open_file):
        if random() <= sample_size:
            print('Reading Line: ' + str(j) +
                  ' Content: ' + observation)
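Because the second loop draws a fresh, unseeded random() for every line, the 25-percent sample changes from run to run; seed the random module first if you need a repeatable sample.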
# Txt file :
import pandas as pd
color_table = pd.io.parsers.read_table("Colors.txt")
print(color_table)
print('\n')

# CSV file :
import pandas as pd
titanic = pd.io.parsers.read_csv("titanic_1.csv")
X = titanic[['Age']]
print(X)
print('\n')

# X = titanic[['Age']].values
# print(X)
# print('\n')

# Excel & MO files :
import pandas as pd
xls = pd.ExcelFile("Values.xls")
trig_values = xls.parse('Sheet1', index_col=None,
                        na_values=['NA'])
print(trig_values)
Color Value
Red 1
Orange 2
Yellow 3
Green 4
Blue 5
Purple 6
Black 7
White 8
Age
0 22.0
1 38.0
2 26.0
3 35.0
4 35.0
.. ...
886 27.0
887 19.0
888 NaN
889 26.0
890 32.0
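Note that titanic[['Age']] (double brackets) returns a one-column DataFrame, while the commented-out .values variant would return a bare NumPy array instead.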
## Sending data in unstructured file form :

from skimage.io import imread
from skimage.transform import resize
from matplotlib import pyplot as plt
import matplotlib.cm as cm

example_file = ("http://upload.wikimedia.org/" +
                "wikipedia/commons/7/7d/Dog_face.png")
image = imread(example_file)
plt.imshow(image, cmap=cm.gray)
plt.show()

print('\n')
print("data type: %s, shape: %s" %
      (type(image), image.shape))

print('\n')
image2 = image[5:70, 0:70]
plt.imshow(image2, cmap=cm.gray)
plt.show()

print('\n')
image3 = resize(image2, (30, 30), mode='symmetric')
plt.imshow(image3, cmap=cm.gray)
print("data type: %s, shape: %s" %
      (type(image3), image3.shape))

print('\n')
image_row = image3.flatten()
print("data type: %s, shape: %s" %
      (type(image_row), image_row.shape))
data type: <class 'numpy.ndarray'>, shape: (90, 90, 3)
data type: <class 'numpy.ndarray'>, shape: (2700,)

Conditioning Data :
Formatting date and time values
Slicing rows
Slicing columns
Dicing
from lxml import objectify
import pandas as pd

xml = objectify.parse(open('XMLData2.xml'))
root = xml.getroot()
df = pd.DataFrame(columns=('Number', 'String', 'Boolean'))

for i in range(0, 4):
    obj = root.getchildren()[i].getchildren()
    row = dict(zip(['Number', 'String', 'Boolean'],
                   [obj[0].text, obj[1].text,
                    obj[2].text]))
    row_s = pd.Series(row)
    row_s.name = i
    df = df.append(row_s)

search = pd.DataFrame.duplicated(df)
print(df)
print()
print(search[search == True])
  Number  String Boolean
0      1   First    True
1      2  Second   False
2      3   Third    True
3      3   Third    True

3    True
dtype: bool
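duplicated() flags only the second and later occurrences of a row, which is why only row 3 (the repeat of row 2) comes back True.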
from lxml import objectify
import pandas as pd

xml = objectify.parse(open('XMLData2.xml'))
root = xml.getroot()
df = pd.DataFrame(columns=('Number', 'String', 'Boolean'))
for i in range(0, 4):
    obj = root.getchildren()[i].getchildren()
    row = dict(zip(['Number', 'String', 'Boolean'],
                   [obj[0].text, obj[1].text,
                    obj[2].text]))
    row_s = pd.Series(row)
    row_s.name = i
    df = df.append(row_s)

print(df.drop_duplicates())
  Number  String Boolean
0      1   First    True
1      2  Second   False
2      3   Third    True
import pandas as pd
pd.set_option('display.width', 55)

df = pd.DataFrame({'A': [0,0,0,0,0,1,1],
                   'B': [1,2,3,5,4,2,5],
                   'C': [5,3,4,1,1,2,3]})

a_group_desc = df.groupby('A').describe()
print(a_group_desc)

print('\n')
stacked = a_group_desc.stack()
print(stacked)

print('\n')
print(a_group_desc.loc[:, (slice(None), ['count', 'mean']), ])
      B                                ...         C
  count mean       std  min   25%  50% ...       std  min   25%  50%   75%  max
A                                      ...
0   5.0  3.0  1.581139  1.0  2.00  3.0 ...  1.788854  1.0  1.00  3.0  4.00  5.0
1   2.0  3.5  2.121320  2.0  2.75  3.5 ...  0.707107  2.0  2.25  2.5  2.75  3.0

[2 rows x 16 columns]

(stacked output and the count/mean slice are truncated; both show only the B and C columns)
## Categorical variables :

import pandas as pd
print(pd.__version__)

car_colors = pd.Series(['Blue', 'Red', 'Green'],
                       dtype='category')

car_data = pd.Series(
    pd.Categorical(
        ['Yellow', 'Green', 'Red', 'Blue', 'Purple'],
        categories=car_colors, ordered=False))

find_entries = pd.isnull(car_data)

print(car_colors)
print()
print(car_data)
print()
print(find_entries[find_entries == True])
1.1.5
0 Blue
1 Red
2 Green
dtype: category
0 NaN
1 Green
2 Red
3 Blue
4 NaN
dtype: category
0 True
4 True
dtype: bool
car_colors = pd.Series(['Blue', 'Red', 'Green'],
                       dtype='category')
car_data = pd.Series(
    pd.Categorical(
        ['Blue', 'Green', 'Red', 'Blue', 'Red'],
        categories=car_colors, ordered=False))

car_colors.cat.categories = ["Purple", "Yellow", "Mauve"]
car_data.cat.categories = car_colors

print(car_data)
0 Purple
1 Yellow
2 Mauve
3 Purple
4 Mauve
dtype: category
import pandas as pd

car_colors = pd.Series(['Blue', 'Red', 'Green'],
                       dtype='category')
car_data = pd.Series(
    pd.Categorical(
        ['Blue', 'Green', 'Red', 'Green', 'Red', 'Green'],
        categories=car_colors, ordered=False))

car_data = car_data.cat.set_categories(
    ["Blue", "Red", "Green", "Blue_Red"])
print(car_data.loc[car_data.isin(['Red'])])
car_data.loc[car_data.isin(['Red'])] = 'Blue_Red'
car_data.loc[car_data.isin(['Blue'])] = 'Blue_Red'

car_data = car_data.cat.set_categories(
    ["Green", "Blue_Red"])

print()
print(car_data)
2 Red
4 Red
dtype: category
0 Blue_Red
1 Green
2 Blue_Red
3 Green
4 Blue_Red
5 Green
dtype: category
import datetime as dt

now = dt.datetime.now()

print(str(now))
print(now.strftime('%a, %d %B %Y'))
print('\n')

## Right time
now = dt.datetime.now()
timevalue = now + dt.timedelta(hours=2)

print(now.strftime('%H:%M:%S'))
print(timevalue.strftime('%H:%M:%S'))
print(timevalue - now)
2021-12-13 00:46:04.063371
00:46:04
02:46:04
2:00:00
## Missing Data :

import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

s = pd.Series([1, 2, 3, np.NaN, 5, 6, None])

print(s.isnull())

print()
print(s[s.isnull()])
print('\n')

print(s.fillna(int(s.mean())))
print()
print(s.dropna())
print('\n')

s = [[1, 2, 3, np.NaN, 5, 6, None]]
imp = SimpleImputer(missing_values=np.NaN,
                    strategy='mean')

imp.fit([[1, 2, 3, 4, 5, 6, 7]])

x = pd.Series(imp.transform(s).tolist()[0])

print(x)
0 False
1 False
2 False
3 True
4 False
5 False
6 True
dtype: bool
3 NaN
6 NaN
dtype: float64
0 1.0
1 2.0
2 3.0
3 3.0
4 5.0
5 6.0
6 3.0
dtype: float64
0 1.0
1 2.0
2 3.0
4 5.0
5 6.0
dtype: float64
0 1.0
1 2.0
2 3.0
3 4.0
4 5.0
5 6.0
6 7.0
dtype: float64
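Because the imputer was fitted on the single row [1, 2, 3, 4, 5, 6, 7], each column's mean is simply that row's value, so the missing entries at positions 3 and 6 get filled with 4.0 and 7.0.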
x = np.array([[[1, 2, 3], [4, 5, 6], [7, 8, 9],],
              [[11,12,13], [14,15,16], [17,18,19],],
              [[21,22,23], [24,25,26], [27,28,29]]])

print(x[1])
print()

print(x[:,1])
print()

print(x[1,1])
print()

print(x[:,1,1])
print()

print(x[1,:,1])
print()

print(x[1:2, 1:2])
[[11 12 13]
[14 15 16]
[17 18 19]]
[[ 4 5 6]
[14 15 16]
[24 25 26]]
[14 15 16]
[ 5 15 25]
[12 15 18]
[[[14 15 16]]]
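Integer indexing removes dimensions (x[1, 1] returns a 1-D row), while slicing preserves them, which is why x[1:2, 1:2] comes back as a 3-D array holding a single row.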
df = pd.DataFrame({'A': [2,3,1],
                   'B': [1,2,3],
                   'C': [5,3,4]})

df1 = pd.DataFrame({'A': [4],
                    'B': [4],
                    'C': [4]})

df = df.append(df1)
df = df.reset_index(drop=True)
print(df)

df.loc[df.last_valid_index() + 1] = [5, 5, 5]
print()
print(df)

df2 = pd.DataFrame({'D': [1, 2, 3, 4, 5]})

df = pd.DataFrame.join(df, df2)
print()
print(df)
A B C
0 2 1 5
1 3 2 3
2 1 3 4
3 4 4 4
A B C
0 2 1 5
1 3 2 3
2 1 3 4
3 4 4 4
4 5 5 5
A B C D
0 2 1 5 1
1 3 2 3 2
2 1 3 4 3
3 4 4 4 4
4 5 5 5 5
df = pd.DataFrame({'A': [2,3,1],
                   'B': [1,2,3],
                   'C': [5,3,4]})

df = df.drop(df.index[[1]])
print(df)

df = df.drop('B', axis=1)
print()
print(df)
A B C
0 2 1 5
2 1 3 4
A C
0 2 5
2 1 4
df = pd.DataFrame({'A': [2,1,2,3,3,5,4],
                   'B': [1,2,3,5,4,2,5],
                   'C': [5,3,4,1,1,2,3]})

df = df.sort_values(by=['A', 'B'], ascending=[True, True])
df = df.reset_index(drop=True)
print(df)

index = df.index.tolist()
np.random.shuffle(index)
df = df.loc[df.index[index]]
df = df.reset_index(drop=True)
print()
print(df)
A B C
0 1 2 3
1 2 1 5
2 2 3 4
3 3 4 1
4 3 5 1
5 4 5 3
6 5 2 2
A B C
0 2 3 4
1 3 4 1
2 2 1 5
3 3 5 1
4 5 2 2
5 4 5 3
6 1 2 3
df = pd.DataFrame({'Map': [0,0,0,1,1,2,2],
                   'Values': [1,2,3,5,4,2,5]})

df['S'] = df.groupby('Map')['Values'].transform(np.sum)
df['M'] = df.groupby('Map')['Values'].transform(np.mean)
df['V'] = df.groupby('Map')['Values'].transform(np.var)

print(df)
Map Values S M V
0 0 1 6 2.0 1.0
1 0 2 6 2.0 1.0
2 0 3 6 2.0 1.0
3 1 5 9 4.5 0.5
4 1 4 9 4.5 0.5
5 2 2 7 3.5 4.5
6 2 5 7 3.5 4.5
Shaping Data :
## Parsing XML & HTML
from lxml import objectify
import pandas as pd
from distutils import util

xml = objectify.parse(open('XMLData.xml'))
root = xml.getroot()
df = pd.DataFrame(columns=('Number', 'Boolean'))

for i in range(0, 4):
    obj = root.getchildren()[i].getchildren()
    row = dict(zip(['Number', 'Boolean'],
                   [obj[0].pyval,
                    bool(util.strtobool(obj[2].text))]))
    row_s = pd.Series(row)
    row_s.name = obj[1].text
    df = df.append(row_s)

print(type(df.loc['First']['Number']))
print(type(df.loc['First']['Boolean']))
<class 'int'>
<class 'bool'>
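The .pyval attribute is what keeps Number a native Python int here: unlike .text, objectify converts the element to the matching Python type.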
## Using XPath for data extraction

from lxml import objectify
import pandas as pd
from distutils import util

xml = objectify.parse(open('XMLData.xml'))
root = xml.getroot()

map_number = map(int, root.xpath('Record/Number'))
map_bool = map(str, root.xpath('Record/Boolean'))
map_bool = map(util.strtobool, map_bool)
map_bool = map(bool, map_bool)
map_string = map(str, root.xpath('Record/String'))

data = list(zip(map_number, map_bool))

df = pd.DataFrame(data,
                  columns=('Number', 'Boolean'),
                  index=list(map_string))

print(df)
print(type(df.loc['First']['Number']))
print(type(df.loc['First']['Boolean']))
Number Boolean
First 1 True
Second 2 False
Third 3 True
Fourth 4 False
<class 'numpy.int64'>
<class 'numpy.bool_'>
## Stemming & Removing stop words

from sklearn.feature_extraction.text import *
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
# import nltk
# nltk.download('punkt')

stemmer = PorterStemmer()

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    tokens = word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems

vocab = ['Sam loves swimming so he swims all the time']
vect = CountVectorizer(tokenizer=tokenize,
                       stop_words='english')
vec = vect.fit(vocab)

sentence1 = vec.transform(['George loves swimming too!'])

print(vec.get_feature_names())
print(sentence1.toarray())
[[1 0 1 0]]
import re

data1 = 'My phone number is: 800-555-1212.'
data2 = '800-555-1234 is my phone number.'

pattern = re.compile(r'(\d{3})-(\d{3})-(\d{4})')

dmatch1 = pattern.search(data1).groups()
dmatch2 = pattern.search(data2).groups()

print(dmatch1)
print(dmatch2)
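Both searches should print the three captured groups: ('800', '555', '1212') and ('800', '555', '1234').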
## Bags of words

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import *

categories = ['comp.graphics', 'misc.forsale',
              'rec.autos', 'sci.space']
twenty_train = fetch_20newsgroups(subset='train',
                                  categories=categories,
                                  shuffle=True,
                                  random_state=42)

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(
    twenty_train.data)

print("BOW shape:", X_train_counts.shape)
caltech_idx = count_vect.vocabulary_['caltech']
print('"Caltech": %i' % X_train_counts[0, caltech_idx])
"Caltech": 3
## N-grams :

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import *

categories = ['sci.space']

twenty_train = fetch_20newsgroups(subset='train',
                                  categories=categories,
                                  remove=('headers',
                                          'footers',
                                          'quotes'),
                                  shuffle=True,
                                  random_state=42)

count_chars = CountVectorizer(analyzer='char_wb',
                              ngram_range=(3,3),
                              max_features=10)

count_chars.fit(twenty_train['data'])

count_words = CountVectorizer(analyzer='word',
                              ngram_range=(2,2),
                              max_features=10,
                              stop_words='english')

count_words.fit(twenty_train['data'])

X = count_chars.transform(twenty_train.data)

print(count_words.get_feature_names())
print(X[1].todense())
print(count_words.get_feature_names())
['anonymous ftp', 'commercial space', 'gamma ray', 'nasa gov', 'national space', 'remote sensing', 'sci space', 'space shuttl
[[0 0 2 5 1 4 2 2 0 5]]
['anonymous ftp', 'commercial space', 'gamma ray', 'nasa gov', 'national space', 'remote sensing', 'sci space', 'space shuttl
## TF-IDF :

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import *

categories = ['comp.graphics', 'misc.forsale',
              'rec.autos', 'sci.space']
twenty_train = fetch_20newsgroups(subset='train',
                                  categories=categories,
                                  shuffle=True,
                                  random_state=42)

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(
    twenty_train.data)

tfidf = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf.transform(X_train_counts)

caltech_idx = count_vect.vocabulary_['caltech']
print('"Caltech" scored in a BOW:')
print('count: %0.3f' % X_train_counts[0, caltech_idx])
print('TF-IDF: %0.3f' % X_train_tfidf[0, caltech_idx])

print('\n')

import numpy as np
count = np.mean(X_train_counts[X_train_counts > 0])
tfidf_mean = np.mean(X_train_tfidf[X_train_tfidf > 0])
print('mean count: %0.3f' % np.mean(count))
print('mean TF-IDF: %0.3f' % np.mean(tfidf_mean))
count: 3.000
TF-IDF: 0.123
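By default, TfidfTransformer multiplies each raw count by the smoothed inverse document frequency idf(t) = ln((1 + n) / (1 + df(t))) + 1 and then L2-normalizes every document vector, which is why the TF-IDF score for "Caltech" ends up far below its raw count of 3.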
import networkx as nx

G = nx.cycle_graph(10)
A = nx.adjacency_matrix(G)
print(A.todense())

print('\n')
import matplotlib.pyplot as plt
%matplotlib inline
nx.draw_networkx(G)
plt.show()

print('\n')
G.add_edge(1,5)
nx.draw_networkx(G)
plt.show()
[[0 1 0 0 0 0 0 0 0 1]
[1 0 1 0 0 0 0 0 0 0]
[0 1 0 1 0 0 0 0 0 0]
[0 0 1 0 1 0 0 0 0 0]
[0 0 0 1 0 1 0 0 0 0]
[0 0 0 0 1 0 1 0 0 0]
[0 0 0 0 0 1 0 1 0 0]
[0 0 0 0 0 0 1 0 1 0]
[0 0 0 0 0 0 0 1 0 1]
[1 0 0 0 0 0 0 0 1 0]]
Defining plot
Drawing multiple lines & plots
Saving work to disk
Getting axes
Formatting axes
Adding grids
Adding labels
Annotating the chart
Creating legend
import matplotlib.pyplot as plt
%matplotlib inline

values = [1, 5, 8, 9, 2, 0, 3, 10, 4, 7]
values2 = [3, 8, 9, 2, 1, 2, 4, 7, 6, 6]

ax = plt.axes()
ax.set_xlim([0, 11])
ax.set_ylim([-1, 11])
ax.set_xticks([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
ax.set_yticks([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
ax.grid()

plt.xlabel('Entries')
plt.ylabel('Labels')
plt.annotate('First Entry', xy=[1, 1])
# plt.plot(range(1,11), values, linestyle='--', color='r', marker='o')
# plt.plot(range(1,11), values2, linestyle=':', color='b', marker='^')
plt.plot(range(1,11), values, '--or')
plt.plot(range(1,11), values2, ':b^')
plt.legend(['First', 'Second'], loc=4)
plt.show()

plt.ioff()
# Note: calling savefig() after show() writes a new, empty figure;
# call it before plt.show() to capture the plot.
plt.savefig('MySamplePlot.png', format='png')
Depicting groups
Showing correlations
Time on axes
Plotting trends over time
Visualizing graphs :
values = [5, 8, 9, 10, 4, 7]
colors = ['b', 'g', 'r', 'c', 'm', 'y']
labels = ['A', 'B', 'C', 'D', 'E', 'F']
explode = (0, 0.2, 0, 0, 0, 0)

plt.pie(values, colors=colors, labels=labels,
        explode=explode, autopct='%.2f',
        counterclock=False, shadow=True)

plt.title('Values')
plt.show()
values = [5, 8, 9, 10, 4, 7]
widths = [0.7, 0.8, 0.7, 0.7, 0.7, 0.7]
colors = ['b', 'r', 'b', 'b', 'b', 'b']

plt.bar(range(0, 6), values, width=widths,
        color=colors, align='center')

plt.show()
import numpy as np

x = 20 * np.random.randn(10000)

plt.hist(x, bins=25, range=(-50, 50), histtype='stepfilled',
         align='mid', color='g', label='Test Data')

plt.legend()
plt.title('Step Filled Histogram')
plt.show()
spread = 100 * np.random.rand(100)
center = np.ones(50) * 50
flier_high = 100 * np.random.rand(10) + 100
flier_low = -100 * np.random.rand(10)
data = np.concatenate((spread, center,
                       flier_high, flier_low))

plt.boxplot(data, sym='gx', widths=.75, notch=True)
plt.show()
x1 = 5 * np.random.rand(50)
x2 = 5 * np.random.rand(50) + 25
x3 = 30 * np.random.rand(30)
x = np.concatenate((x1, x2, x3))

y1 = 5 * np.random.rand(50)
y2 = 5 * np.random.rand(50) + 25
y3 = 30 * np.random.rand(30)
y = np.concatenate((y1, y2, y3))

plt.scatter(x, y, s=[100], marker='^', c='m')
plt.show()
x1 = 5 * np.random.rand(50)
x2 = 5 * np.random.rand(50) + 25
x3 = 30 * np.random.rand(30)
x = np.concatenate((x1, x2, x3))

y1 = 5 * np.random.rand(50)
y2 = 5 * np.random.rand(50) + 25
y3 = 30 * np.random.rand(30)
y = np.concatenate((y1, y2, y3))

color_array = ['b'] * 50 + ['g'] * 50 + ['r'] * 30
plt.scatter(x, y, s=[130], marker='o', c=color_array)

z = np.polyfit(x, y, 1)  ## Correlation
p = np.poly1d(z)         ## Correlation
plt.plot(x, p(x), 'm-')  ## Correlation

plt.show()
import datetime as dt
import pandas as pd

start_date = dt.datetime(2018, 7, 29)
end_date = dt.datetime(2018, 8, 7)
daterange = pd.date_range(start_date, end_date)
sales = (np.random.rand(len(daterange)) * 50).astype(int)
df = pd.DataFrame(sales, index=daterange,
                  columns=['Sales'])

## Trends from here :

lr_coef = np.polyfit(range(0, len(df)), df['Sales'], 1)
lr_func = np.poly1d(lr_coef)
trend = lr_func(range(0, len(df)))
df['trend'] = trend

## Trends till here ..

df.loc['Jul 30 2018':'Aug 05 2018'].plot()
plt.ylim(0, 50)
plt.xlabel('Sales Date')
plt.ylabel('Sale Value')
plt.title('Plotting Time')
plt.legend(['Sales', 'Trend'])
plt.show()
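np.polyfit with degree 1 fits a straight line by least squares, so the trend column is simply a linear regression of sales on the day number.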
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline

G = nx.Graph()
H = nx.Graph()
G.add_node(1)
G.add_nodes_from([2, 3])
G.add_nodes_from(range(4, 7))
H.add_node(7)
G.add_nodes_from(H)

G.add_edge(1, 2)
G.add_edge(1, 1)
G.add_edges_from([(2,3), (3,6), (4,6), (5,6)])
H.add_edges_from([(4,7), (5,7), (6,7)])
G.add_edges_from(H.edges())

nx.draw_networkx(G)
plt.show()
G = nx.DiGraph()

G.add_node(1)
G.add_nodes_from([2, 3])
G.add_nodes_from(range(4, 6))
nx.add_path(G, [6, 7, 8])

G.add_edge(1, 2)
G.add_edges_from([(1,4), (4,5), (2,3), (3,6), (5,6)])

colors = ['r', 'g', 'g', 'g', 'g', 'm', 'm', 'r']
labels = {1:'Start', 2:'2', 3:'3', 4:'4',
          5:'5', 6:'6', 7:'7', 8:'End'}
sizes = [800, 300, 300, 300, 300, 600, 300, 800]

nx.draw_networkx(G, node_color=colors, node_shape='D',
                 with_labels=True, labels=labels,
                 node_size=sizes)
plt.show()
Hashing trick :
Running in Parallel :
Multicore parallelism
Multiprocessing demo
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler

boston = load_boston()
X, y = boston.data, boston.target
print(f'Shapes : {X.shape}, {y.shape}')

hypothesis = LinearRegression(normalize=True)
hypothesis.fit(X, y)
print(f'Hypothesis coefficients : \n{hypothesis.coef_}')

new_observation = np.array([1, 0, 1, 0, 0.5, 7, 59,
                            6, 3, 200, 20, 350, 4],
                           dtype=float).reshape(1, -1)

print(f'Prediction : {hypothesis.predict(new_observation)}')
print(f'Score : {hypothesis.score(X, y)}')
Hypothesis coefficients :
-5.24758378e-01]
Prediction : [25.90156732]
Score : 0.7406426641094094
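score() reports R-squared, the share of variance in the target explained by the model, so this fit accounts for roughly 74 percent of the variance in the Boston house prices.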
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(X)
print(scaler.transform(new_observation))
0.06263797]]
print(hash('Python'))
print(abs(hash('Python')) % 1000)

from sklearn.feature_extraction.text import *
oh_encoder = CountVectorizer()
oh_encoded = oh_encoder.fit_transform([
    'Python for data science', 'Python for machine learning'])

print(f'Vocabulary : {oh_encoder.vocabulary_}')

string_1 = 'Python for data science'
string_2 = 'Python for machine learning'

def hashing_trick(input_string, vector_size=20):
    feature_vector = [0] * vector_size
    for word in input_string.split(' '):
        index = abs(hash(word)) % vector_size
        feature_vector[index] = 1
    return feature_vector

print(hashing_trick(
    input_string='Python for data science',
    vector_size=20))

print(hashing_trick(
    input_string='Python for machine learning',
    vector_size=20))
-6421110064280408918
918
[0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]
[0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0]
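Keep in mind that Python salts str hashes per process (see PYTHONHASHSEED), so hash('Python') and the hand-rolled vectors above will differ between runs; the HashingVectorizer used next avoids this by relying on a stable hash function.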
from scipy.sparse import csc_matrix
print(csc_matrix([1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
                  0, 0, 0, 0, 0, 0, 1, 0, 1, 0]))

import sklearn.feature_extraction.text as txt
htrick = txt.HashingVectorizer(n_features=20,
                               binary=True, norm=None)
hashed_text = htrick.transform(['Python for data science',
                                'Python for machine learning'])
print('\n')
print(hashed_text)

print(oh_encoder.transform(['New text has arrived']).todense())
print(htrick.transform(['New text has arrived']).todense())
(0, 0) 1
(0, 5) 1
(0, 16) 1
(0, 18) 1
(0, 3) 1.0
(0, 5) 1.0
(1, 2) 1.0
(1, 3) 1.0
(1, 4) 1.0
(1, 5) 1.0
[[0 0 0 0 0 0]]
[[1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1.]]
%timeit -n 20 -r 5 l = [k for k in range(10**6)]

texts = ['Python for data science',
         'Python for machine learning']

%timeit oh_encoded = oh_encoder.fit_transform(texts)
%timeit hashing = htrick.transform(texts)

import timeit
cumulative_time = timeit.timeit(
    "hashing = htrick.transform(texts)",
    "from __main__ import htrick, texts",
    number=10000)
print(cumulative_time / 10000.0)
0.00021400607870000385
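timeit.timeit returns the total seconds for all 10,000 calls, so dividing by the call count yields the average time per transform.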
%%timeit
## Entire cell is timed because of the cell magic function
l = list()
for k in range(10**6):
    l.append(k)
import sys
!{sys.executable} -m pip install memory_profiler

%load_ext memory_profiler

hashing = htrick.transform(texts)
%memit dense_hashing = hashing.toarray()
## last line is output of the above two lines
Collecting memory_profiler
%%writefile example_code.py
def comparison_test(text):
    import sklearn.feature_extraction.text as txt
    htrick = txt.HashingVectorizer(n_features=20,
                                   binary=True,
                                   norm=None)
    oh_encoder = txt.CountVectorizer()
    oh_encoded = oh_encoder.fit_transform(text)
    hashing = htrick.transform(text)
    return oh_encoded, hashing

## Run the following in a separate cell, once the file is written :
from example_code import comparison_test
text = ['Python for data science',
        'Python for machine learning']
%mprun -f comparison_test comparison_test(text)
Writing example_code.py
from sklearn.datasets import load_digits
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

digits = load_digits()
X, y = digits.data, digits.target

%timeit single_core = cross_val_score(SVC(), X, y, \
                                      cv=20, n_jobs=1)

%timeit multi_core = cross_val_score(SVC(), X, y, \
                                     cv=20, n_jobs=-1)
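With n_jobs=-1, cross_val_score spreads the 20 folds across every available core, so on a multicore machine the second timing should come out noticeably lower than the single-core one.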
EDA :
Frequencies
Contingency tables
Inspecting boxplots
t-tests after boxplots
Observing parallel coordinates
Graphing distributions
Plotting scatter plots
Understanding Correlation :
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris

print('Your pandas version is: %s' % pd.__version__)
print('Your NumPy version is %s' % np.__version__)

iris = load_iris()
iris_nparray = iris.data

iris_dataframe = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_dataframe['group'] = pd.Series([iris.target_names[k] for k in iris.target], dtype="category")
print(iris_dataframe.mean(numeric_only=True))
print('\n')

print(iris_dataframe.median(numeric_only=True))
print('\n')

print(iris_dataframe.std())
print('\n')

print(iris_dataframe.max(numeric_only=True)
      - iris_dataframe.min(numeric_only=True))
print('\n')

print(iris_dataframe.quantile([0, .25, .50, .75, 1]))
print('\n')
(outputs truncated: each statistic prints one value per feature and ends with "dtype: float64"; the quantile table covers sepal length (cm), sepal width (cm), petal length (cm), petal width (cm))
from scipy.stats import kurtosis, kurtosistest

variable = iris_dataframe['petal length (cm)']
k = kurtosis(variable)
zscore, pvalue = kurtosistest(variable)
print('Kurtosis %0.3f z-score %0.3f p-value %0.3f'
      % (k, zscore, pvalue))

from scipy.stats import skew, skewtest

variable = iris_dataframe['petal length (cm)']
s = skew(variable)
zscore, pvalue = skewtest(variable)
print('Skewness %0.3f z-score %0.3f p-value %0.3f'
      % (s, zscore, pvalue))
pcts = [0, .25, .5, .75, 1]
iris_binned = pd.concat(
    [pd.qcut(iris_dataframe.iloc[:,0], pcts, precision=1),
     pd.qcut(iris_dataframe.iloc[:,1], pcts, precision=1),
     pd.qcut(iris_dataframe.iloc[:,2], pcts, precision=1),
     pd.qcut(iris_dataframe.iloc[:,3], pcts, precision=1)],
    join='outer', axis=1)

print(iris_dataframe['group'].value_counts())
print('\n')

print(iris_binned['petal length (cm)'].value_counts())
print('\n')

print(pd.crosstab(iris_dataframe['group'],
                  iris_binned['petal length (cm)']))
print('\n')
virginica 50
versicolor 50
setosa 50
(0.9, 1.6] 44
(4.4, 5.1] 41
(5.1, 6.9] 34
(1.6, 4.4] 31
petal length (cm) (0.9, 1.6] (1.6, 4.4] (4.4, 5.1] (5.1, 6.9]
group
setosa 44 6 0 0
versicolor 0 25 25 0
virginica 0 0 16 34
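The crosstab shows how cleanly petal length separates the species: setosa falls almost entirely into the lowest bin, while virginica dominates the highest one.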
boxplots = iris_dataframe.boxplot(column='petal length (cm)',
                                  by='group', fontsize=10)
plt.suptitle("")
plt.show()
from scipy.stats import ttest_ind

group0 = iris_dataframe['group'] == 'setosa'
group1 = iris_dataframe['group'] == 'versicolor'
group2 = iris_dataframe['group'] == 'virginica'
variable = iris_dataframe['petal length (cm)']

print('var1 %0.3f var2 %0.3f' % (variable[group1].var(),
                                 variable[group2].var()))

variable = iris_dataframe['sepal width (cm)']
t, pvalue = ttest_ind(variable[group1], variable[group2],
                      axis=0, equal_var=False)
print('t statistic %0.3f p-value %0.3f' % (t, pvalue))

from scipy.stats import f_oneway
variable = iris_dataframe['sepal width (cm)']
f, pvalue = f_oneway(variable[group0],
                     variable[group1],
                     variable[group2])
print('One-way ANOVA F-value %0.3f p-value %0.3f'
      % (f, pvalue))
from pandas.plotting import parallel_coordinates

iris_dataframe['group'] = iris.target
iris_dataframe['labels'] = [iris.target_names[k]
                            for k in iris_dataframe['group']]
pll = parallel_coordinates(iris_dataframe, 'labels')
cols = iris_dataframe.columns[:4]
densityplot = iris_dataframe[cols].plot(kind='density')
variable = iris_dataframe['petal length (cm)']
single_distribution = variable.plot(kind='hist')
palette = {0: 'red', 1: 'yellow', 2: 'blue'}
colors = [palette[c] for c in iris_dataframe['group']]
simple_scatterplot = iris_dataframe.plot(
    kind='scatter', x='petal length (cm)',
    y='petal width (cm)', c=colors)
from pandas.plotting import scatter_matrix

palette = {0: "red", 1: "yellow", 2: "blue"}
colors = [palette[c] for c in iris_dataframe['group']]
matrix_of_scatterplots = scatter_matrix(
    iris_dataframe, figsize=(6, 6),
    color=colors, diagonal='kde')
print(iris_dataframe.cov())
print('\n')

print(iris_dataframe.corr())
print('\n')

covariance_matrix = np.cov(iris_nparray, rowvar=0)
correlation_matrix = np.corrcoef(iris_nparray, rowvar=0)
(covariance and correlation tables truncated; each prints as [5 rows x 5 columns])
from scipy.stats import spearmanr
from scipy.stats.stats import pearsonr

a = iris_dataframe['sepal length (cm)']
b = iris_dataframe['sepal width (cm)']
rho_coef, rho_p = spearmanr(a, b)
r_coef, r_p = pearsonr(a, b)
print('Pearson r %0.3f | Spearman rho %0.3f'
      % (r_coef, rho_coef))

from scipy.stats import chi2_contingency
table = pd.crosstab(iris_dataframe['group'],
                    iris_binned['petal length (cm)'])
chi2, p, dof, expected = chi2_contingency(table.values)
print('Chi-square %0.2f p-value %0.3f' % (chi2, p))
from sklearn.preprocessing import scale

variable = iris_dataframe['sepal width (cm)']
stand_sepal_width = scale(variable)

from scipy.stats.stats import pearsonr
transformations = {'x': lambda x: x,
                   '1/x': lambda x: 1/x,
                   'x**2': lambda x: x**2,
                   'x**3': lambda x: x**3,
                   'log(x)': lambda x: np.log(x)}
a = iris_dataframe['sepal length (cm)']
b = iris_dataframe['sepal width (cm)']
for transformation in transformations:
    b_transformed = transformations[transformation](b)
    pearsonr_coef, pearsonr_p = pearsonr(a, b_transformed)
    print('Transformation: %s \t Pearson\'s r: %0.3f'
          % (transformation, pearsonr_coef))