Overview Data
In [1]: import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
In [2]: import pandas as pd

data = pd.read_csv('C://Users/ditama/Downloads/Unicauca-dataset-April-June-2019-Networ
data head()
flow_key src_ip_numeric src_ip src_port dst_ip dst_port

0 3acee4f4ea001cd5e6d9584d4036b53d 3232266497 67

1 974ec5991b439c9a7176b88be0c90df0 3232266497 67

2 3acee4f4ea001cd5e6d9584d4036b53d 3232266497 67

3 974ec5991b439c9a7176b88be0c90df0 3232266497 67

4 cfa7c2740072befaa89c202499729e08 3232266497 0

5 rows × 50 columns

In [3]: data shape

Out[3]: (2704839, 50)

In [4]: data columns

Out[4]: Index(['flow_key', 'src_ip_numeric', 'src_ip', 'src_port', 'dst_ip',

'dst_port', 'proto', 'pktTotalCount', 'octetTotalCount', 'min_ps',
'max_ps', 'avg_ps', 'std_dev_ps', 'flowStart', 'flowEnd',
'flowDuration', 'min_piat', 'max_piat', 'avg_piat', 'std_dev_piat',
'f_pktTotalCount', 'f_octetTotalCount', 'f_min_ps', 'f_max_ps',
'f_avg_ps', 'f_std_dev_ps', 'f_flowStart', 'f_flowEnd',
'f_flowDuration', 'f_min_piat', 'f_max_piat', 'f_avg_piat',
'f_std_dev_piat', 'b_pktTotalCount', 'b_octetTotalCount', 'b_min_ps',
'b_max_ps', 'b_avg_ps', 'b_std_dev_ps', 'b_flowStart', 'b_flowEnd',
'b_flowDuration', 'b_min_piat', 'b_max_piat', 'b_avg_piat',
'b_std_dev_piat', 'flowEndReason', 'category', 'application_protocol',

In [5]: data info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2704839 entries, 0 to 2704838
Data columns (total 50 columns):
# Column Dtype
--- ------ -----
0 flow_key object
1 src_ip_numeric int64
2 src_ip object
3 src_port int64
4 dst_ip object
5 dst_port int64
6 proto int64
7 pktTotalCount int64
8 octetTotalCount int64
9 min_ps int64
10 max_ps int64
11 avg_ps float64
12 std_dev_ps float64
13 flowStart float64
14 flowEnd float64
15 flowDuration float64
16 min_piat float64
17 max_piat float64
18 avg_piat float64
19 std_dev_piat float64
20 f_pktTotalCount int64
21 f_octetTotalCount int64
22 f_min_ps int64
23 f_max_ps int64
24 f_avg_ps float64
25 f_std_dev_ps float64
26 f_flowStart float64
27 f_flowEnd float64
28 f_flowDuration float64
29 f_min_piat float64
30 f_max_piat float64
31 f_avg_piat float64
32 f_std_dev_piat float64
33 b_pktTotalCount int64
34 b_octetTotalCount int64
35 b_min_ps int64
36 b_max_ps int64
37 b_avg_ps float64
38 b_std_dev_ps float64
39 b_flowStart float64
40 b_flowEnd float64
41 b_flowDuration float64
42 b_min_piat float64
43 b_max_piat float64
44 b_avg_piat float64
45 b_std_dev_piat float64
46 flowEndReason int64
47 category object
48 application_protocol object

49 web_service object
dtypes: float64(27), int64(17), object(6)
memory usage: 1.0+ GB

Lets take a look at all the non-numeric columns

In [6]: non_num_cols = [col for col in data.columns if data[col].dtype == 'O']

non_num_data = data[non_num_cols]
flow_key src_ip dst_ip category application_protoc

0 3acee4f4ea001cd5e6d9584d4036b53d Network

1 974ec5991b439c9a7176b88be0c90df0 Network

2 3acee4f4ea001cd5e6d9584d4036b53d Network

3 974ec5991b439c9a7176b88be0c90df0 Network

4 cfa7c2740072befaa89c202499729e08 Network

... ... ... ... ...

2704834 695ea899a18c6d2f90c8b2f6c9b70bdf System

2704835 f8188e4364129e635fe032a3bda206ea System

2704836 4deda0130e2054781655cb4bd4cb580d System

2704837 8c07a45c0c48648ff56341d7a065b855 Web

2704838 a61c7ab8213996e502ac7f54fc97fb34 Web

2704839 rows × 6 columns

No. of unique values and their counts in non_numeric columns

In [7]: [(col non_num_data[col].nunique()) for col in non_num_cols]

Out[7]: [('flow_key', 2344534),

('src_ip', 716),
('dst_ip', 104463),
('category', 24),
('application_protocol', 23),
('web_service', 141)]

In [8]: def summarize_cat(col_name):

sorted_values = sorted(non_num_data[col_name].value_counts().iteritems(), key
remaining_per = 100
for (value, count) in sorted_values:
per = count / len(non_num_data) * 100
if per >= 1:
print(f'{value} : {per:.2f}%')
else :
print(f'Others : {remaining_per:.2f}%')
remaining_per = remaining_per - per

In [9]: for col in non_num_cols:

print(f"Summary of {col} column : ")
Summary of flow_key column :
Others : 100.00%

Summary of src_ip column : : 5.59% : 1.75% : 1.58% : 1.30% : 1.26% : 1.14%
Others : 87.38%

Summary of dst_ip column : : 27.43% : 5.46% : 5.01%
Others : 62.10%

Summary of category column :

Web : 52.36%
Network : 16.39%
Unspecified : 9.21%
SocialNetwork : 5.58%
Chat : 2.79%
Download-FileTransfer-FileSharing : 2.62%
Media : 2.36%
Cloud : 1.87%
VoIP : 1.74%
Collaborative : 1.44%
System : 1.37%
Others : 2.27%

Summary of application_protocol column :

Unknown : 48.37%
TLS : 25.58%
DNS : 18.10%
HTTP : 4.75%
QUIC : 2.62%
Others : 0.59%

Summary of web_service column :

Google : 21.07%
DNS : 15.52%
TLS : 9.60%
Unknown : 9.21%
Microsoft : 6.37%

HTTP : 5.65%
Facebook : 4.47%
Amazon : 3.24%
GoogleServices : 3.23%
BitTorrent : 2.62%
YouTube : 2.06%
Messenger : 1.67%
HTTP_Proxy : 1.25%
Others : 14.04%

Exploratory Analysis for numeric columns

In [10]: num_cols = list(set(data.columns) - set(non_num_cols))

Out[10]: ['b_avg_ps',

In [11]: data[num_cols].describe()

b_avg_ps f_pktTotalCount f_max_piat f_std_dev_piat b_avg_piat flowEnd

count 2.704839e+06 2.704839e+06 2.704839e+06 2.704839e+06 2.704839e+06 2.704839e+06

mean 2.880519e+02 3.729565e+01 2.210437e+01 5.803303e+00 4.132842e+00 1.557242e+09

std 5.933458e+02 1.960941e+03 8.534161e+01 2.440810e+01 3.715773e+01 1.610686e+06

min 0.000000e+00 1.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 1.555954e+09

25% 4.600000e+01 1.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 1.556055e+09

50% 1.140000e+02 2.000000e+00 5.225205e-02 0.000000e+00 0.000000e+00 1.556291e+09

75% 2.800000e+02 9.000000e+00 6.875163e+00 1.501036e+00 3.276894e-01 1.559659e+09

max 1.583600e+04 2.156205e+06 1.780822e+03 8.624964e+02 1.780821e+03 1.559771e+09

8 rows × 44 columns

In [12]: [col for col in num_cols if data[col].isnull().any()]

Out[12]: []

In [13]: print("range and no. of unique values in numeric columns")

for col in num_cols:
print(f'{col}\tRange : {max(data[col]) - min(data[col])}, No. of unique values :

range and no. of unique values in numeric columns

b_avg_ps Range : 15836.0, No. of unique values : 309031
f_pktTotalCount Range : 2156204, No. of unique values : 5623
f_max_piat Range : 1780.82152986526, No. of unique values : 1174066
f_std_dev_piat Range : 862.496393918991, No. of unique values : 1301011
b_avg_piat Range : 1780.82148790359, No. of unique values : 1179890
flowEnd Range : 3817819.12740016, No. of unique values : 2621434
b_std_dev_ps Range : 11680.0, No. of unique values : 655659
b_std_dev_piat Range : 839.900081515312, No. of unique values : 1148891
proto Range : 16, No. of unique values : 3
f_avg_ps Range : 11596.0, No. of unique values : 210778
min_ps Range : 11596, No. of unique values : 711
dst_port Range : 65535, No. of unique values : 33753
flowDuration Range : 1800.20165610313, No. of unique values : 1430642
b_max_piat Range : 1780.82148790359, No. of unique values : 967839
f_std_dev_ps Range : 7908.10729941073, No. of unique values : 634260
b_max_ps Range : 26320, No. of unique values : 14233
b_flowDuration Range : 1558211564721.97, No. of unique values : 1189909
avg_ps Range : 11596.0, No. of unique values : 410222
max_ps Range : 26292, No. of unique values : 14548
f_min_piat Range : 1780.82152986526, No. of unique values : 243202
min_piat Range : 1763.94893193245, No. of unique values : 310582
f_flowStart Range : 3817819.2345199585, No. of unique values : 2645081
f_avg_piat Range : 1780.82152986526, No. of unique values : 1344234
b_octetTotalCount Range : 2971893160, No. of unique values : 141155
src_ip_numeric Range : 2044, No. of unique values : 716
std_dev_ps Range : 9370.13341149918, No. of unique values : 833730
f_flowEnd Range : 1558215380814.8389, No. of unique values : 2397265
f_flowDuration Range : 1800.20165610313, No. of unique values : 1288078
flowEndReason Range : 3, No. of unique values : 4
b_pktTotalCount Range : 1017780, No. of unique values : 7015
b_min_ps Range : 15836, No. of unique values : 841
b_flowStart Range : 1559771334.33015, No. of unique values : 2288361
flowStart Range : 3817819.2345199585, No. of unique values : 2645081
octetTotalCount Range : 2981111667, No. of unique values : 154581
avg_piat Range : 1763.94893193245, No. of unique values : 1554611
pktTotalCount Range : 2292424, No. of unique values : 8984
f_octetTotalCount Range : 2955382240, No. of unique values : 56947
src_port Range : 65535, No. of unique values : 61314
std_dev_piat Range : 865.191153526306, No. of unique values : 1394435
f_min_ps Range : 11596, No. of unique values : 1034
f_max_ps Range : 26292, No. of unique values : 6980
b_min_piat Range : 1780.82148790359, No. of unique values : 155794
max_piat Range : 1780.82109594345, No. of unique values : 1268402
b_flowEnd Range : 1559771334.33116, No. of unique values : 2282692

For the columns having <=50 unique values, we plot histograms, for
others we just list distribution of most frequent values as in case of
category columns

In [14]: cols_for_hist = [col for col in num_cols if data[col].nunique() <= 50]

cols_for_hist len(cols_for_hist)
Out[14]: (['proto', 'flowEndReason'], 2)

In [15]: cols_for_desc = [col for col in num_cols if data[col].nunique() > 50]

Out[15]: ['b_avg_ps',

In [16]: data[cols_for_hist].hist(layout = (7,3), figsize = (12, 20))

plt tight_layout()

Correlation Matrix

In [17]: corr data[num_cols].corr()

In [18]: f = plt.figure(figsize = (25,25))

plt.matshow(corr, fignum=f.number)
plt.title('Correlation Matrix of Numeric columns in the dataset', fontsize = 20
plt.xticks(range(len(num_cols)), num_cols, fontsize = 14, rotation = 90)
plt.yticks(range(len(num_cols)), num_cols, fontsize = 14)
cb = plt.colorbar(fraction = 0.0466, pad = 0.02)
plt show()


In [19]: #check null

data isnull().sum()
Out[19]: flow_key 0
src_ip_numeric 0
src_ip 0
src_port 0
dst_ip 0
dst_port 0
proto 0
pktTotalCount 0
octetTotalCount 0
min_ps 0
max_ps 0
avg_ps 0
std_dev_ps 0
flowStart 0
flowEnd 0
flowDuration 0
min_piat 0
max_piat 0
avg_piat 0
std_dev_piat 0
f_pktTotalCount 0
f_octetTotalCount 0
f_min_ps 0
f_max_ps 0
f_avg_ps 0
f_std_dev_ps 0
f_flowStart 0
f_flowEnd 0
f_flowDuration 0
f_min_piat 0
f_max_piat 0
f_avg_piat 0
f_std_dev_piat 0
b_pktTotalCount 0
b_octetTotalCount 0
b_min_ps 0
b_max_ps 0
b_avg_ps 0
b_std_dev_ps 0
b_flowStart 0
b_flowEnd 0
b_flowDuration 0
b_min_piat 0
b_max_piat 0
b_avg_piat 0
b_std_dev_piat 0
flowEndReason 0
category 0
application_protocol 0
web_service 0
dtype: int64

In [20]: #check duplicate

dups = data.duplicated()
print('Number of duplicate rows = %d' % (dups sum()))
Number of duplicate rows = 10

In [21]: #remove duplicate

print('Number of rows before discarding duplicates = %d' % (data.shape[0]))
data = data.drop_duplicates()
print('Number of rows after discarding duplicates = %d' % (data shape[0]))
Number of rows before discarding duplicates = 2704839
Number of rows after discarding duplicates = 2704829

Feature Selection

Based on the unique column removed

In [22]: ipdata data copy()

In [23]: ipdata drop(['flow_key' 'src_ip_numeric' 'src_ip' 'dst_ip'], axis = 1 inplace

In [24]: single_unique_cols = [col for col in ipdata.columns if ipdata[col].nunique() ==

Out[24]: []

Based on Corr colum numeric removed

In [25]: ipdata_num = data[num_cols].copy()

In [26]: ipdata_num.drop(['f_flowStart','flowEnd','octetTotalCount','b_octetTotalCount',

In [27]: corr = ipdata_num corr()

In [28]: num_cols_after_drop = list(set(ipdata_num.columns))

Out[28]: ['f_std_dev_ps',

In [29]: f = plt.figure(figsize = (25,25))

plt.matshow(corr, fignum=f.number)
plt.title('Correlation Matrix of Numeric columns in the dataset', fontsize = 20
plt.xticks(range(len(num_cols_after_drop)), num_cols_after_drop, fontsize = 14,
plt.yticks(range(len(num_cols_after_drop)), num_cols_after_drop, fontsize = 14)
cb = plt.colorbar(fraction = 0.0466, pad = 0.02)
plt show()

Final Feature

In [30]: df ipdata_num copy()

In [31]: df head()

b_std_dev_piat proto min_ps dst_port flowDuration b_max_piat f_std_dev_ps min_piat src_ip_num

0 84.916348 17 328 67 1701.385427 198.657965 9.140200 0.000313

1 0.000000 17 328 67 1701.385515 0.000000 9.140200 0.010356

2 124.270745 17 328 67 1450.967340 340.268454 9.718024 0.000239

3 0.000000 17 328 67 1450.967130 0.000000 10.057833 0.015330

4 0.000000 1 56 0 0.000000 0.000000 0.000000 0.000000

In [32]: df shape

Out[32]: (2704829, 17)

Classification Label Web Service DT, Naive

In [33]: #data train, dan test
X = ipdata_num
Y data['web_service']
In [34]: #splitting
from sklearn.model_selection import train_test_split
X_train X_test y_train y_test train_test_split(X Y test_size 0.5 random_state
In [35]: X_train shape

Out[35]: (1352414, 17)

In [36]: X_test shape

Out[36]: (1352415, 17)

In [37]: #normalisasi
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler fit_transform(X_test)

In [38]: X_train_scaled

Out[38]: array([[0.00000000e+00, 3.12500000e-01, 4.13507926e-03, ...,

0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
[7.98713134e-04, 3.12500000e-01, 2.06753963e-03, ...,
1.63910959e-09, 1.13994231e-03, 9.98002621e-01],
[1.05979673e-04, 3.12500000e-01, 4.13507926e-03, ...,
6.20653215e-04, 7.13559201e-04, 9.99942314e-01],
[0.00000000e+00, 1.00000000e+00, 5.34114404e-03, ...,
0.00000000e+00, 3.62225073e-07, 9.99996212e-01],
[0.00000000e+00, 1.00000000e+00, 1.65403170e-02, ...,
0.00000000e+00, 5.70184165e-05, 0.00000000e+00],
[1.03205274e-05, 3.12500000e-01, 4.13507926e-03, ...,
1.77570205e-09, 2.03033912e-05, 9.97559794e-01]])

In [39]: X_test_scaled

Out[39]: array([[3.88515652e-03, 3.12500000e-01, 1.03483960e-03, ...,

1.35220047e-08, 6.01140397e-03, 9.97782795e-01],
[1.38839226e-04, 3.12500000e-01, 2.06967920e-03, ...,
2.67762469e-09, 1.57405541e-04, 9.99942577e-01],
[1.62385749e-03, 3.12500000e-01, 1.03483960e-03, ...,
1.74045605e-09, 2.78186510e-03, 9.99934633e-01],
[0.00000000e+00, 1.00000000e+00, 5.77785443e-03, ...,
0.00000000e+00, 2.29740249e-07, 9.99996128e-01],
[2.44122729e-03, 3.12500000e-01, 1.03483960e-03, ...,
3.83126590e-05, 1.23374111e-02, 9.97766900e-01],
[0.00000000e+00, 1.00000000e+00, 8.62366333e-05, ...,
0.00000000e+00, 5.39459826e-05, 9.97564948e-01]])

In [40]: from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
Y_train_encode = label_encoder.fit_transform(y_train)

In [41]: label_encoder2 = LabelEncoder()

Y_test_encode label_encoder2 fit_transform(y_test)
In [42]: Y_train_encode

Out[42]: array([ 25, 1, 3, ..., 130, 90, 122])

In [43]: Y_test_encode

Out[43]: array([105, 31, 105, ..., 33, 39, 45])

In [44]: label_encoder classes_

Out[44]: array(['AJP', 'Amazon', 'AmazonVideo', 'Apple', 'ApplePush', 'AppleStore',

'AppleiCloud', 'AppleiTunes', 'BJNP', 'BitTorrent', 'CNN',
'CiscoSkinny', 'CiscoVPN', 'Citrix', 'Cloudflare', 'DHCP', 'DNP3',
'DNS', 'DNSoverHTTPS', 'DataSaver', 'Deezer',
'Direct_Download_Link', 'Dropbox', 'FTP_CONTROL', 'FTP_DATA',
'Facebook', 'GMail', 'Github', 'Google', 'GoogleDocs',
'GoogleDrive', 'GoogleHangoutDuo', 'GoogleMaps', 'GooglePlus',
'GoogleServices', 'H323', 'HTTP', 'HTTP_Proxy', 'HotspotShield',
'IAX', 'ICMP', 'IMAPS', 'IMO', 'IPsec', 'IRC', 'Instagram', 'LDAP',
'LinkedIn', 'LotusNotes', 'MQTT', 'MSN', 'MS_OneDrive',
'Messenger', 'Microsoft', 'Mining', 'MsSQL-TDS', 'NFS', 'NTP',
'NestLogSink', 'NetBIOS', 'NetFlix', 'Office365', 'Ookla',
'OpenDNS', 'OpenVPN', 'Oracle', 'POP3', 'PS_VUE',
'Pando_Media_Booster', 'PlayStore', 'Playstation', 'PostgreSQL',
'QQ', 'QUIC', 'RDP', 'RTMP', 'RTP', 'RTSP', 'RX', 'Radius', 'SAP',
'SIP', 'SMBv1', 'SMBv23', 'SMTP', 'SNMP', 'SOCKS', 'SOMEIP',
'SSDP', 'SSH', 'STUN', 'Signal', 'Sina(Weibo)', 'Skype',
'SkypeCall', 'Slack', 'Snapchat', 'SoundCloud', 'Spotify',
'Starcraft', 'Steam', 'Syslog', 'TLS', 'Targus Dataspeed',
'TeamViewer', 'Telegram', 'Teredo', 'TikTok', 'Tor', 'Tuenti',
'Twitch', 'Twitter', 'UBNTAC2', 'UPnP', 'UbuntuONE',
'Unencrypted_Jabber', 'Unknown', 'VNC', 'Viber', 'Waze', 'WeChat',
'Webex', 'WhatsApp', 'WhatsAppCall', 'WhatsAppFiles', 'Whois-DAS',
'Wikipedia', 'WindowsUpdate', 'Xbox', 'Yahoo', 'YouTube', 'Zoom',
'eBay', 'eDonkey', 'sFlow'], dtype=object)

label_encoder2 classes_

Out[45]: array(['104', 'AJP', 'Amazon', 'AmazonVideo', 'Apple', 'ApplePush',

'AppleStore', 'AppleiCloud', 'AppleiTunes', 'BGP', 'BJNP',
'BitTorrent', 'CNN', 'CiscoSkinny', 'CiscoVPN', 'Citrix',
'Cloudflare', 'DHCP', 'DNP3', 'DNS', 'DNSoverHTTPS', 'DataSaver',
'Deezer', 'Direct_Download_Link', 'Dropbox', 'FTP_CONTROL',
'FTP_DATA', 'Facebook', 'GMail', 'GTP', 'Github', 'Google',
'GoogleDocs', 'GoogleDrive', 'GoogleHangoutDuo', 'GoogleMaps',
'GooglePlus', 'GoogleServices', 'H323', 'HTTP', 'HTTP_Proxy',
'HotspotShield', 'IAX', 'ICMP', 'IMAPS', 'IMO', 'IPsec', 'IRC',
'Instagram', 'LDAP', 'LinkedIn', 'LotusNotes', 'MDNS', 'MQTT',
'MSN', 'MS_OneDrive', 'Messenger', 'Microsoft', 'Mining',
'MsSQL-TDS', 'MySQL', 'NFS', 'NTP', 'NestLogSink', 'NetBIOS',
'NetFlix', 'Office365', 'Ookla', 'OpenDNS', 'OpenVPN', 'Oracle',
'PS_VUE', 'Pando_Media_Booster', 'PlayStore', 'Playstation',
'PostgreSQL', 'QQ', 'QUIC', 'RDP', 'RTMP', 'RTP', 'RTSP', 'RX',
'Radius', 'SIP', 'SMBv1', 'SMBv23', 'SMTP', 'SMTPS', 'SNMP',
'SOCKS', 'SSDP', 'SSH', 'STUN', 'Signal', 'Sina(Weibo)', 'Skype',
'SkypeCall', 'Slack', 'Snapchat', 'SoundCloud', 'Spotify',
'Starcraft', 'Steam', 'Syslog', 'TLS', 'Targus Dataspeed',
'TeamViewer', 'Telegram', 'Teredo', 'TikTok', 'Tor', 'Tuenti',
'Twitch', 'Twitter', 'UBNTAC2', 'UbuntuONE', 'Unencrypted_Jabber',
'Unknown', 'VNC', 'Viber', 'Waze', 'WeChat', 'Webex', 'WhatsApp',
'WhatsAppCall', 'WhatsAppFiles', 'Whois-DAS', 'Wikipedia',
'WindowsUpdate', 'Xbox', 'Yahoo', 'YouTube', 'eBay', 'sFlow'],

Desicion Tree Model

In [46]: from sklearn import tree

clf_gini = tree.DecisionTreeClassifier(criterion='entropy', max_depth=20, random_state

# fit the model, Y_train_encode)
clf_gini score(X_train_scaled Y_train_encode)
Out[46]: 0.8926711790916095

In [47]: y_pred_gini clf_gini predict(X_test_scaled )

In [48]: from sklearn.metrics import accuracy_score

tree_train_accuracy = clf_gini.score(X_train_scaled,Y_train_encode)
tree_accuracy = clf_gini.score(X_test_scaled,Y_test_encode)

print("Training score: {:.3f}".format(clf_gini.score(X_train_scaled, Y_train_encode

print("Test score: {:.3f}" format(clf_gini score(X_test_scaled Y_test_encode)))
Training score: 0.893
Test score: 0.003

In [49]: # Lets split the data into 5 folds.

# We will use this 'kf'(StratiFiedKFold splitting stratergy) object as input to cross_
# The folds are made by preserving the percentage of samples for each class.
from sklearn.model_selection import StratifiedKFold
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cnt = 1
# split() method generate indices to split data into training and test set.
for train_index, test_index in kf.split(X_train_scaled, Y_train_encode):
print(f'Fold:{cnt}, Train set: {len(train_index)}, Test set:{len(test_index

# Note that:
# cross_val_score() parameter 'cv' will by default use StratifiedKFold spliting starte
# So you can bypass above step and just specify cv= 5 in cross_val_score() function

y:684: UserWarning: The least populated class in y has only 1 members, which
is less than n_splits=5.

Fold:1, Train set: 1081931, Test set:270483

Fold:2, Train set: 1081931, Test set:270483
Fold:3, Train set: 1081931, Test set:270483
Fold:4, Train set: 1081931, Test set:270483
Fold:5, Train set: 1081932, Test set:270482

In [50]: from sklearn.model_selection import cross_val_score

score = cross_val_score(tree.DecisionTreeClassifier(criterion='entropy', max_depth
print(f'Scores for each fold are: {score}')
print(f'Average score: {"{:.2f}".format(score.mean())}')

y:684: UserWarning: The least populated class in y has only 1 members, which
is less than n_splits=5.

Scores for each fold are: [0.80910815 0.80830958 0.80759604 0.80804709 0.8090
Average score: 0.81

In [51]: # Lets split the data into 5 folds.

# We will use this 'kf'(StratiFiedKFold splitting stratergy) object as input to cross_
# The folds are made by preserving the percentage of samples for each class.
from sklearn.model_selection import StratifiedKFold
kf2 = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cnt = 1
# split() method generate indices to split data into training and test set.
for train_index, test_index in kf2.split(X_test_scaled, Y_test_encode):
print(f'Fold:{cnt}, Train set: {len(train_index)}, Test set:{len(test_index

# Note that:
# cross_val_score() parameter 'cv' will by default use StratifiedKFold spliting starte
# So you can bypass above step and just specify cv= 5 in cross_val_score() function

y:684: UserWarning: The least populated class in y has only 1 members, which
is less than n_splits=5.

Fold:1, Train set: 1081932, Test set:270483

Fold:2, Train set: 1081932, Test set:270483
Fold:3, Train set: 1081932, Test set:270483
Fold:4, Train set: 1081932, Test set:270483
Fold:5, Train set: 1081932, Test set:270483

In [52]: from sklearn.model_selection import cross_val_score

score = cross_val_score(tree.DecisionTreeClassifier(criterion='entropy', max_depth
print(f'Scores for each fold are: {score}')
print(f'Average score: {"{:.2f}".format(score.mean())}')

y:684: UserWarning: The least populated class in y has only 1 members, which
is less than n_splits=5.

Scores for each fold are: [0.80932628 0.80940392 0.80885305 0.80842789 0.8093
Average score: 0.81

Naive Bayes

In [53]: # train a Gaussian Naive Bayes classifier on the training set

from sklearn.naive_bayes import GaussianNB
# instantiate the model
gnb = GaussianNB()
# fit the model
gnb fit(X_train_scaled Y_train_encode)
Out[53]: ▾ GaussianNB


In [54]: print("Training accuracy = ",gnb.score(X_train_scaled,Y_train_encode))

#Print Test Accuracy
gnb_accuracy = gnb.score(X_test_scaled,Y_test_encode)
print("Testing accuracy = " gnb score(X_test_scaled Y_test_encode))
Training accuracy = 0.09942961252989099
Testing accuracy = 0.0038819445214671533

KNN Model

In [55]: #Model Classification KNN using n_neighbors = 3

from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
neigh = KNeighborsClassifier(n_neighbors=3), Y_train_encode)
# store the predicted response values
Out[55]: ▾ KNeighborsClassifier

In [56]: print("Training score: {:.3f}".format(neigh.score(X_train_scaled, Y_train_encode

print("Test score: {:.3f}" format(neigh score(X_test_scaled Y_test_encode)))
Training score: 0.840
Test score: 0.002

Multi Layer Perceptron

In [57]: from sklearn neural_network import MLPClassifier

In [58]: mlp = MLPClassifier(hidden_layer_sizes=(3,2),activation='relu')

Out[58]: ▾ MLPClassifier
MLPClassifier(hidden_layer_sizes=(3, 2))

In [59]: mlp fit(X_train_scaled Y_train_encode)

Out[59]: ▾ MLPClassifier
MLPClassifier(hidden_layer_sizes=(3, 2))

In [60]: print("Training accuracy = ",mlp.score(X_train_scaled,Y_train_encode))

#Print Test Accuracy
print("Testing accuracy = " mlp score(X_test_scaled Y_test_encode))
Training accuracy = 0.4314418513857443
Testing accuracy = 0.004001730238129568

Random Forest

In [61]: from sklearn.ensemble import RandomForestClassifier

#Menggunakan ensamble algorithm Random Forest Classifier dengan libSklearn
modelRF = RandomForestClassifier(n_estimators=1)
In [62]: modelRF fit(X_train_scaled Y_train_encode)

Out[62]: ▾ RandomForestClassifier

In [63]: print("Training accuracy = ",modelRF.score(X_train_scaled,Y_train_encode))

#Print Test Accuracy
print("Testing accuracy = " modelRF score(X_test_scaled Y_test_encode))
Training accuracy = 0.8809173818076418
Testing accuracy = 0.004890510679044524

Evaluation With DT

In [64]: y_pred = clf_gini.predict(X_train_scaled )

from sklearn import metrics

tree_cm = metrics.confusion_matrix(Y_train_encode, y_pred)

sns.heatmap(tree_cm, annot=True, fmt=".0f", linewidths=.5, square = True, cmap
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'Confusion Matrix - score:'+str(metrics.accuracy_score(Y_train_enco
plt.title(all_sample_title, size = 15);
print(metrics classification_report(Y_train_encode y_pred))

In [65]: y = label_encoder.inverse_transform([28,17,102,116])
Out[65]: array(['Google', 'DNS', 'TLS', 'Unknown'], dtype=object)

24 dari 24 29/11/2022 12.17

