

Data Overview
In [1]: import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
In [2]: import pandas as pd

data = pd.read_csv('C://Users/ditama/Downloads/Unicauca-dataset-April-June-2019-Networ
data.head()
Out[2]:
flow_key src_ip_numeric src_ip src_port dst_ip dst_port

0 3acee4f4ea001cd5e6d9584d4036b53d 3232266497 192.168.121.1 67 172.16.255.185

1 974ec5991b439c9a7176b88be0c90df0 3232266497 192.168.121.1 67 172.16.255.186

2 3acee4f4ea001cd5e6d9584d4036b53d 3232266497 192.168.121.1 67 172.16.255.185

3 974ec5991b439c9a7176b88be0c90df0 3232266497 192.168.121.1 67 172.16.255.186

4 cfa7c2740072befaa89c202499729e08 3232266497 192.168.121.1 0 10.130.1.166

5 rows × 50 columns

In [3]: data.shape

Out[3]: (2704839, 50)

In [4]: data.columns

Out[4]: Index(['flow_key', 'src_ip_numeric', 'src_ip', 'src_port', 'dst_ip',


'dst_port', 'proto', 'pktTotalCount', 'octetTotalCount', 'min_ps',
'max_ps', 'avg_ps', 'std_dev_ps', 'flowStart', 'flowEnd',
'flowDuration', 'min_piat', 'max_piat', 'avg_piat', 'std_dev_piat',
'f_pktTotalCount', 'f_octetTotalCount', 'f_min_ps', 'f_max_ps',
'f_avg_ps', 'f_std_dev_ps', 'f_flowStart', 'f_flowEnd',
'f_flowDuration', 'f_min_piat', 'f_max_piat', 'f_avg_piat',
'f_std_dev_piat', 'b_pktTotalCount', 'b_octetTotalCount', 'b_min_ps',
'b_max_ps', 'b_avg_ps', 'b_std_dev_ps', 'b_flowStart', 'b_flowEnd',
'b_flowDuration', 'b_min_piat', 'b_max_piat', 'b_avg_piat',
'b_std_dev_piat', 'flowEndReason', 'category', 'application_protocol',
'web_service'],
dtype='object')


In [5]: data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2704839 entries, 0 to 2704838
Data columns (total 50 columns):
# Column Dtype
--- ------ -----
0 flow_key object
1 src_ip_numeric int64
2 src_ip object
3 src_port int64
4 dst_ip object
5 dst_port int64
6 proto int64
7 pktTotalCount int64
8 octetTotalCount int64
9 min_ps int64
10 max_ps int64
11 avg_ps float64
12 std_dev_ps float64
13 flowStart float64
14 flowEnd float64
15 flowDuration float64
16 min_piat float64
17 max_piat float64
18 avg_piat float64
19 std_dev_piat float64
20 f_pktTotalCount int64
21 f_octetTotalCount int64
22 f_min_ps int64
23 f_max_ps int64
24 f_avg_ps float64
25 f_std_dev_ps float64
26 f_flowStart float64
27 f_flowEnd float64
28 f_flowDuration float64
29 f_min_piat float64
30 f_max_piat float64
31 f_avg_piat float64
32 f_std_dev_piat float64
33 b_pktTotalCount int64
34 b_octetTotalCount int64
35 b_min_ps int64
36 b_max_ps int64
37 b_avg_ps float64
38 b_std_dev_ps float64
39 b_flowStart float64
40 b_flowEnd float64
41 b_flowDuration float64
42 b_min_piat float64
43 b_max_piat float64
44 b_avg_piat float64
45 b_std_dev_piat float64
46 flowEndReason int64
47 category object
48 application_protocol object


49 web_service object
dtypes: float64(27), int64(17), object(6)
memory usage: 1.0+ GB

Let's take a look at all the non-numeric columns.

In [6]: non_num_cols = [col for col in data.columns if data[col].dtype == 'O']


non_num_data = data[non_num_cols]
non_num_data
Out[6]:
flow_key src_ip dst_ip category application_protoc

0 3acee4f4ea001cd5e6d9584d4036b53d 192.168.121.1 172.16.255.185 Network

1 974ec5991b439c9a7176b88be0c90df0 192.168.121.1 172.16.255.186 Network

2 3acee4f4ea001cd5e6d9584d4036b53d 192.168.121.1 172.16.255.185 Network

3 974ec5991b439c9a7176b88be0c90df0 192.168.121.1 172.16.255.186 Network

4 cfa7c2740072befaa89c202499729e08 192.168.121.1 10.130.1.166 Network

... ... ... ... ...

2704834 695ea899a18c6d2f90c8b2f6c9b70bdf 192.168.128.252 172.16.255.186 System

2704835 f8188e4364129e635fe032a3bda206ea 192.168.128.252 172.16.255.185 System

2704836 4deda0130e2054781655cb4bd4cb580d 192.168.128.252 172.16.255.186 System

2704837 8c07a45c0c48648ff56341d7a065b855 192.168.128.252 108.177.11.188 Web

2704838 a61c7ab8213996e502ac7f54fc97fb34 192.168.128.252 172.217.15.196 Web

2704839 rows × 6 columns

Number of unique values in each non-numeric column

In [7]: [(col, non_num_data[col].nunique()) for col in non_num_cols]

Out[7]: [('flow_key', 2344534),


('src_ip', 716),
('dst_ip', 104463),
('category', 24),
('application_protocol', 23),
('web_service', 141)]


In [8]: def summarize_cat(col_name):
    # sort by frequency, descending (assumed sort key; consistent with the summaries printed below)
    sorted_values = sorted(non_num_data[col_name].value_counts().iteritems(),
                           key=lambda kv: kv[1], reverse=True)
    remaining_per = 100
    for (value, count) in sorted_values:
        per = count / len(non_num_data) * 100
        if per >= 1:
            print(f'{value} : {per:.2f}%')
        else:
            # lump every value below 1% into a single "Others" bucket
            print(f'Others : {remaining_per:.2f}%')
            break
        remaining_per = remaining_per - per


In [9]: for col in non_num_cols:
    print(f"Summary of {col} column : ")
    summarize_cat(col)
    print('\n')
Summary of flow_key column :
Others : 100.00%

Summary of src_ip column :


192.168.128.3 : 5.59%
192.168.122.52 : 1.75%
192.168.125.17 : 1.58%
192.168.121.62 : 1.30%
192.168.127.13 : 1.26%
192.168.128.87 : 1.14%
Others : 87.38%

Summary of dst_ip column :


172.16.255.200 : 27.43%
172.16.255.183 : 5.46%
172.16.141.250 : 5.01%
Others : 62.10%

Summary of category column :


Web : 52.36%
Network : 16.39%
Unspecified : 9.21%
SocialNetwork : 5.58%
Chat : 2.79%
Download-FileTransfer-FileSharing : 2.62%
Media : 2.36%
Cloud : 1.87%
VoIP : 1.74%
Collaborative : 1.44%
System : 1.37%
Others : 2.27%

Summary of application_protocol column :


Unknown : 48.37%
TLS : 25.58%
DNS : 18.10%
HTTP : 4.75%
QUIC : 2.62%
Others : 0.59%

Summary of web_service column :


Google : 21.07%
DNS : 15.52%
TLS : 9.60%
Unknown : 9.21%
Microsoft : 6.37%


HTTP : 5.65%
Facebook : 4.47%
Amazon : 3.24%
GoogleServices : 3.23%
BitTorrent : 2.62%
YouTube : 2.06%
Messenger : 1.67%
HTTP_Proxy : 1.25%
Others : 14.04%

Exploratory Analysis for numeric columns


In [10]: num_cols = list(set(data.columns) - set(non_num_cols))


num_cols
Out[10]: ['b_avg_ps',
'f_pktTotalCount',
'f_max_piat',
'f_std_dev_piat',
'b_avg_piat',
'flowEnd',
'b_std_dev_ps',
'b_std_dev_piat',
'proto',
'f_avg_ps',
'min_ps',
'dst_port',
'flowDuration',
'b_max_piat',
'f_std_dev_ps',
'b_max_ps',
'b_flowDuration',
'avg_ps',
'max_ps',
'f_min_piat',
'min_piat',
'f_flowStart',
'f_avg_piat',
'b_octetTotalCount',
'src_ip_numeric',
'std_dev_ps',
'f_flowEnd',
'f_flowDuration',
'flowEndReason',
'b_pktTotalCount',
'b_min_ps',
'b_flowStart',
'flowStart',
'octetTotalCount',
'avg_piat',
'pktTotalCount',
'f_octetTotalCount',
'src_port',
'std_dev_piat',
'f_min_ps',
'f_max_ps',
'b_min_piat',
'max_piat',
'b_flowEnd']


In [11]: data[num_cols].describe()

Out[11]:
b_avg_ps f_pktTotalCount f_max_piat f_std_dev_piat b_avg_piat flowEnd

count 2.704839e+06 2.704839e+06 2.704839e+06 2.704839e+06 2.704839e+06 2.704839e+06

mean 2.880519e+02 3.729565e+01 2.210437e+01 5.803303e+00 4.132842e+00 1.557242e+09

std 5.933458e+02 1.960941e+03 8.534161e+01 2.440810e+01 3.715773e+01 1.610686e+06

min 0.000000e+00 1.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 1.555954e+09

25% 4.600000e+01 1.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 1.556055e+09

50% 1.140000e+02 2.000000e+00 5.225205e-02 0.000000e+00 0.000000e+00 1.556291e+09

75% 2.800000e+02 9.000000e+00 6.875163e+00 1.501036e+00 3.276894e-01 1.559659e+09

max 1.583600e+04 2.156205e+06 1.780822e+03 8.624964e+02 1.780821e+03 1.559771e+09

8 rows × 44 columns

In [12]: [col for col in num_cols if data[col].isnull().any()]

Out[12]: []


In [13]: print("range and no. of unique values in numeric columns")


for col in num_cols:
print(f'{col}\tRange : {max(data[col]) - min(data[col])}, No. of unique values :

range and no. of unique values in numeric columns


b_avg_ps Range : 15836.0, No. of unique values : 309031
f_pktTotalCount Range : 2156204, No. of unique values : 5623
f_max_piat Range : 1780.82152986526, No. of unique values : 1174066
f_std_dev_piat Range : 862.496393918991, No. of unique values : 1301011
b_avg_piat Range : 1780.82148790359, No. of unique values : 1179890
flowEnd Range : 3817819.12740016, No. of unique values : 2621434
b_std_dev_ps Range : 11680.0, No. of unique values : 655659
b_std_dev_piat Range : 839.900081515312, No. of unique values : 1148891
proto Range : 16, No. of unique values : 3
f_avg_ps Range : 11596.0, No. of unique values : 210778
min_ps Range : 11596, No. of unique values : 711
dst_port Range : 65535, No. of unique values : 33753
flowDuration Range : 1800.20165610313, No. of unique values : 1430642
b_max_piat Range : 1780.82148790359, No. of unique values : 967839
f_std_dev_ps Range : 7908.10729941073, No. of unique values : 634260
b_max_ps Range : 26320, No. of unique values : 14233
b_flowDuration Range : 1558211564721.97, No. of unique values : 1189909
avg_ps Range : 11596.0, No. of unique values : 410222
max_ps Range : 26292, No. of unique values : 14548
f_min_piat Range : 1780.82152986526, No. of unique values : 243202
min_piat Range : 1763.94893193245, No. of unique values : 310582
f_flowStart Range : 3817819.2345199585, No. of unique values : 2645081
f_avg_piat Range : 1780.82152986526, No. of unique values : 1344234
b_octetTotalCount Range : 2971893160, No. of unique values : 141155
src_ip_numeric Range : 2044, No. of unique values : 716
std_dev_ps Range : 9370.13341149918, No. of unique values : 833730
f_flowEnd Range : 1558215380814.8389, No. of unique values : 2397265
f_flowDuration Range : 1800.20165610313, No. of unique values : 1288078
flowEndReason Range : 3, No. of unique values : 4
b_pktTotalCount Range : 1017780, No. of unique values : 7015
b_min_ps Range : 15836, No. of unique values : 841
b_flowStart Range : 1559771334.33015, No. of unique values : 2288361
flowStart Range : 3817819.2345199585, No. of unique values : 2645081
octetTotalCount Range : 2981111667, No. of unique values : 154581
avg_piat Range : 1763.94893193245, No. of unique values : 1554611
pktTotalCount Range : 2292424, No. of unique values : 8984
f_octetTotalCount Range : 2955382240, No. of unique values : 56947
src_port Range : 65535, No. of unique values : 61314
std_dev_piat Range : 865.191153526306, No. of unique values : 1394435
f_min_ps Range : 11596, No. of unique values : 1034
f_max_ps Range : 26292, No. of unique values : 6980
b_min_piat Range : 1780.82148790359, No. of unique values : 155794
max_piat Range : 1780.82109594345, No. of unique values : 1268402
b_flowEnd Range : 1559771334.33116, No. of unique values : 2282692

For columns with <= 50 unique values we plot histograms; for the others we just list the
distribution of the most frequent values, as we did for the categorical columns.


In [14]: cols_for_hist = [col for col in num_cols if data[col].nunique() <= 50]


cols_for_hist, len(cols_for_hist)
Out[14]: (['proto', 'flowEndReason'], 2)

In [15]: cols_for_desc = [col for col in num_cols if data[col].nunique() > 50]


cols_for_desc
Out[15]: ['b_avg_ps',
'f_pktTotalCount',
'f_max_piat',
'f_std_dev_piat',
'b_avg_piat',
'flowEnd',
'b_std_dev_ps',
'b_std_dev_piat',
'f_avg_ps',
'min_ps',
'dst_port',
'flowDuration',
'b_max_piat',
'f_std_dev_ps',
'b_max_ps',
'b_flowDuration',
'avg_ps',
'max_ps',
'f_min_piat',
'min_piat',
'f_flowStart',
'f_avg_piat',
'b_octetTotalCount',
'src_ip_numeric',
'std_dev_ps',
'f_flowEnd',
'f_flowDuration',
'b_pktTotalCount',
'b_min_ps',
'b_flowStart',
'flowStart',
'octetTotalCount',
'avg_piat',
'pktTotalCount',
'f_octetTotalCount',
'src_port',
'std_dev_piat',
'f_min_ps',
'f_max_ps',
'b_min_piat',
'max_piat',
'b_flowEnd']


In [16]: data[cols_for_hist].hist(layout = (7,3), figsize = (12, 20))


plt.tight_layout()

Correlation Matrix

In [17]: corr = data[num_cols].corr()


In [18]: f = plt.figure(figsize = (25,25))

plt.matshow(corr, fignum=f.number)
plt.title('Correlation Matrix of Numeric columns in the dataset', fontsize = 20)
plt.xticks(range(len(num_cols)), num_cols, fontsize = 14, rotation = 90)
plt.yticks(range(len(num_cols)), num_cols, fontsize = 14)
plt.gca().xaxis.set_ticks_position('bottom')
cb = plt.colorbar(fraction = 0.0466, pad = 0.02)
cb.ax.tick_params(labelsize=10)
plt.show()

Preprocessing


In [19]: #check null


data.isnull().sum()
Out[19]: flow_key 0
src_ip_numeric 0
src_ip 0
src_port 0
dst_ip 0
dst_port 0
proto 0
pktTotalCount 0
octetTotalCount 0
min_ps 0
max_ps 0
avg_ps 0
std_dev_ps 0
flowStart 0
flowEnd 0
flowDuration 0
min_piat 0
max_piat 0
avg_piat 0
std_dev_piat 0
f_pktTotalCount 0
f_octetTotalCount 0
f_min_ps 0
f_max_ps 0
f_avg_ps 0
f_std_dev_ps 0
f_flowStart 0
f_flowEnd 0
f_flowDuration 0
f_min_piat 0
f_max_piat 0
f_avg_piat 0
f_std_dev_piat 0
b_pktTotalCount 0
b_octetTotalCount 0
b_min_ps 0
b_max_ps 0
b_avg_ps 0
b_std_dev_ps 0
b_flowStart 0
b_flowEnd 0
b_flowDuration 0
b_min_piat 0
b_max_piat 0
b_avg_piat 0
b_std_dev_piat 0
flowEndReason 0
category 0
application_protocol 0
web_service 0
dtype: int64


In [20]: #check duplicate


dups = data.duplicated()
print('Number of duplicate rows = %d' % (dups.sum()))
Number of duplicate rows = 10

In [21]: #remove duplicate


print('Number of rows before discarding duplicates = %d' % (data.shape[0]))
data = data.drop_duplicates()
print('Number of rows after discarding duplicates = %d' % (data.shape[0]))
Number of rows before discarding duplicates = 2704839
Number of rows after discarding duplicates = 2704829

Feature Selection

Identifier columns are dropped, and columns with only a single unique value are checked

In [22]: ipdata = data.copy()

In [23]: ipdata.drop(['flow_key', 'src_ip_numeric', 'src_ip', 'dst_ip'], axis = 1, inplace = True)

In [24]: single_unique_cols = [col for col in ipdata.columns if ipdata[col].nunique() == 1]


single_unique_cols
Out[24]: []

Numeric columns that are highly correlated with others are removed, based on the correlation matrix (see the sketch below for how such columns could be picked programmatically)

In [25]: ipdata_num = data[num_cols].copy()

In [26]: ipdata_num.drop(['f_flowStart','flowEnd','octetTotalCount','b_octetTotalCount',

In [27]: corr = ipdata_num.corr()
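
The drop list in In [26] was chosen by inspecting the correlation matrix by hand. As a minimal sketch, one way such a selection could be automated is shown below; the 0.9 threshold is an assumption for illustration, not the cut-off used in this notebook.

In [ ]: # Sketch: flag one column from each highly correlated pair (|r| > 0.9 is an assumed threshold)
import numpy as np

abs_corr = data[num_cols].corr().abs()
# keep only the upper triangle so every pair is inspected once
upper = abs_corr.where(np.triu(np.ones(abs_corr.shape), k=1).astype(bool))
candidates_to_drop = [col for col in upper.columns if (upper[col] > 0.9).any()]
candidates_to_drop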


In [28]: num_cols_after_drop = list(set(ipdata_num.columns))


num_cols_after_drop
Out[28]: ['f_std_dev_ps',
'pktTotalCount',
'src_ip_numeric',
'b_std_dev_piat',
'std_dev_ps',
'f_flowEnd',
'flowEndReason',
'proto',
'b_min_piat',
'min_piat',
'min_ps',
'max_piat',
'dst_port',
'flowDuration',
'b_flowEnd',
'b_max_piat',
'flowStart']


In [29]: f = plt.figure(figsize = (25,25))

plt.matshow(corr, fignum=f.number)
plt.title('Correlation Matrix of Numeric columns in the dataset', fontsize = 20)
plt.xticks(range(len(num_cols_after_drop)), num_cols_after_drop, fontsize = 14, rotation = 90)
plt.yticks(range(len(num_cols_after_drop)), num_cols_after_drop, fontsize = 14)
plt.gca().xaxis.set_ticks_position('bottom')
cb = plt.colorbar(fraction = 0.0466, pad = 0.02)
cb.ax.tick_params(labelsize=10)
plt.show()

Final Features

In [30]: df = ipdata_num.copy()


In [31]: df.head()

Out[31]:
b_std_dev_piat proto min_ps dst_port flowDuration b_max_piat f_std_dev_ps min_piat src_ip_num

0 84.916348 17 328 67 1701.385427 198.657965 9.140200 0.000313

1 0.000000 17 328 67 1701.385515 0.000000 9.140200 0.010356

2 124.270745 17 328 67 1450.967340 340.268454 9.718024 0.000239

3 0.000000 17 328 67 1450.967130 0.000000 10.057833 0.015330

4 0.000000 1 56 0 0.000000 0.000000 0.000000 0.000000

In [32]: df.shape

Out[32]: (2704829, 17)

Classification of the web_service label with DT, Naive Bayes, KNN, MLP, and RF
In [33]: # train and test data
X = ipdata_num
Y = data['web_service']
In [34]: #splitting
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.5, random_state
In [35]: X_train.shape

Out[35]: (1352414, 17)

In [36]: X_test.shape

Out[36]: (1352415, 17)

In [37]: # normalization
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)  # note: this re-fits the scaler on the test set
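
The cell above calls fit_transform on the test set as well, so the scaler is re-fitted on the test data. A minimal sketch of the more common pattern, reusing the scaler fitted on the training data, is shown here as an alternative, not as what was executed above.

In [ ]: # Sketch (alternative to the cell above): fit the scaler on the training data only,
# then reuse the learned min/max for the test data to avoid information leakage.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)  # learns per-feature min/max from the training split
X_test_scaled = scaler.transform(X_test)        # applies the training min/max to the test split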


In [38]: X_train_scaled

Out[38]: array([[0.00000000e+00, 3.12500000e-01, 4.13507926e-03, ...,


0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
[7.98713134e-04, 3.12500000e-01, 2.06753963e-03, ...,
1.63910959e-09, 1.13994231e-03, 9.98002621e-01],
[1.05979673e-04, 3.12500000e-01, 4.13507926e-03, ...,
6.20653215e-04, 7.13559201e-04, 9.99942314e-01],
...,
[0.00000000e+00, 1.00000000e+00, 5.34114404e-03, ...,
0.00000000e+00, 3.62225073e-07, 9.99996212e-01],
[0.00000000e+00, 1.00000000e+00, 1.65403170e-02, ...,
0.00000000e+00, 5.70184165e-05, 0.00000000e+00],
[1.03205274e-05, 3.12500000e-01, 4.13507926e-03, ...,
1.77570205e-09, 2.03033912e-05, 9.97559794e-01]])

In [39]: X_test_scaled

Out[39]: array([[3.88515652e-03, 3.12500000e-01, 1.03483960e-03, ...,


1.35220047e-08, 6.01140397e-03, 9.97782795e-01],
[1.38839226e-04, 3.12500000e-01, 2.06967920e-03, ...,
2.67762469e-09, 1.57405541e-04, 9.99942577e-01],
[1.62385749e-03, 3.12500000e-01, 1.03483960e-03, ...,
1.74045605e-09, 2.78186510e-03, 9.99934633e-01],
...,
[0.00000000e+00, 1.00000000e+00, 5.77785443e-03, ...,
0.00000000e+00, 2.29740249e-07, 9.99996128e-01],
[2.44122729e-03, 3.12500000e-01, 1.03483960e-03, ...,
3.83126590e-05, 1.23374111e-02, 9.97766900e-01],
[0.00000000e+00, 1.00000000e+00, 8.62366333e-05, ...,
0.00000000e+00, 5.39459826e-05, 9.97564948e-01]])

In [40]: from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
Y_train_encode = label_encoder.fit_transform(y_train)

In [41]: # a second encoder, fitted separately on the test labels
label_encoder2 = LabelEncoder()
Y_test_encode = label_encoder2.fit_transform(y_test)
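
Because a second LabelEncoder is fitted on y_test, the test labels get a different integer mapping than the training labels (compare Out[44] and Out[45]: 'DNS' is 17 in one and 19 in the other), which is why the test scores of the models below collapse to near zero. A minimal sketch of one consistent alternative, encoding the label column once before splitting, is shown here as an assumption about how it could be done, not as what was run in this notebook.

In [ ]: # Sketch (alternative, not what was executed above): encode the label column once,
# so the training and test splits share the same integer codes.
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

label_encoder = LabelEncoder()
Y_encoded = label_encoder.fit_transform(data['web_service'])  # one mapping for all rows
# random_state=42 is an arbitrary choice here, not necessarily the seed used earlier
X_train, X_test, Y_train_encode, Y_test_encode = train_test_split(
    X, Y_encoded, test_size=0.5, random_state=42)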
In [42]: Y_train_encode

Out[42]: array([ 25, 1, 3, ..., 130, 90, 122])

In [43]: Y_test_encode

Out[43]: array([105, 31, 105, ..., 33, 39, 45])


In [44]: label_encoder.classes_

Out[44]: array(['AJP', 'Amazon', 'AmazonVideo', 'Apple', 'ApplePush', 'AppleStore',


'AppleiCloud', 'AppleiTunes', 'BJNP', 'BitTorrent', 'CNN',
'CiscoSkinny', 'CiscoVPN', 'Citrix', 'Cloudflare', 'DHCP', 'DNP3',
'DNS', 'DNSoverHTTPS', 'DataSaver', 'Deezer',
'Direct_Download_Link', 'Dropbox', 'FTP_CONTROL', 'FTP_DATA',
'Facebook', 'GMail', 'Github', 'Google', 'GoogleDocs',
'GoogleDrive', 'GoogleHangoutDuo', 'GoogleMaps', 'GooglePlus',
'GoogleServices', 'H323', 'HTTP', 'HTTP_Proxy', 'HotspotShield',
'IAX', 'ICMP', 'IMAPS', 'IMO', 'IPsec', 'IRC', 'Instagram', 'LDAP',
'LinkedIn', 'LotusNotes', 'MQTT', 'MSN', 'MS_OneDrive',
'Messenger', 'Microsoft', 'Mining', 'MsSQL-TDS', 'NFS', 'NTP',
'NestLogSink', 'NetBIOS', 'NetFlix', 'Office365', 'Ookla',
'OpenDNS', 'OpenVPN', 'Oracle', 'POP3', 'PS_VUE',
'Pando_Media_Booster', 'PlayStore', 'Playstation', 'PostgreSQL',
'QQ', 'QUIC', 'RDP', 'RTMP', 'RTP', 'RTSP', 'RX', 'Radius', 'SAP',
'SIP', 'SMBv1', 'SMBv23', 'SMTP', 'SNMP', 'SOCKS', 'SOMEIP',
'SSDP', 'SSH', 'STUN', 'Signal', 'Sina(Weibo)', 'Skype',
'SkypeCall', 'Slack', 'Snapchat', 'SoundCloud', 'Spotify',
'Starcraft', 'Steam', 'Syslog', 'TLS', 'Targus Dataspeed',
'TeamViewer', 'Telegram', 'Teredo', 'TikTok', 'Tor', 'Tuenti',
'Twitch', 'Twitter', 'UBNTAC2', 'UPnP', 'UbuntuONE',
'Unencrypted_Jabber', 'Unknown', 'VNC', 'Viber', 'Waze', 'WeChat',
'Webex', 'WhatsApp', 'WhatsAppCall', 'WhatsAppFiles', 'Whois-DAS',
'Wikipedia', 'WindowsUpdate', 'Xbox', 'Yahoo', 'YouTube', 'Zoom',
'eBay', 'eDonkey', 'sFlow'], dtype=object)

In [45]: label_encoder2.classes_

Out[45]: array(['104', 'AJP', 'Amazon', 'AmazonVideo', 'Apple', 'ApplePush',


'AppleStore', 'AppleiCloud', 'AppleiTunes', 'BGP', 'BJNP',
'BitTorrent', 'CNN', 'CiscoSkinny', 'CiscoVPN', 'Citrix',
'Cloudflare', 'DHCP', 'DNP3', 'DNS', 'DNSoverHTTPS', 'DataSaver',
'Deezer', 'Direct_Download_Link', 'Dropbox', 'FTP_CONTROL',
'FTP_DATA', 'Facebook', 'GMail', 'GTP', 'Github', 'Google',
'GoogleDocs', 'GoogleDrive', 'GoogleHangoutDuo', 'GoogleMaps',
'GooglePlus', 'GoogleServices', 'H323', 'HTTP', 'HTTP_Proxy',
'HotspotShield', 'IAX', 'ICMP', 'IMAPS', 'IMO', 'IPsec', 'IRC',
'Instagram', 'LDAP', 'LinkedIn', 'LotusNotes', 'MDNS', 'MQTT',
'MSN', 'MS_OneDrive', 'Messenger', 'Microsoft', 'Mining',
'MsSQL-TDS', 'MySQL', 'NFS', 'NTP', 'NestLogSink', 'NetBIOS',
'NetFlix', 'Office365', 'Ookla', 'OpenDNS', 'OpenVPN', 'Oracle',
'PS_VUE', 'Pando_Media_Booster', 'PlayStore', 'Playstation',
'PostgreSQL', 'QQ', 'QUIC', 'RDP', 'RTMP', 'RTP', 'RTSP', 'RX',
'Radius', 'SIP', 'SMBv1', 'SMBv23', 'SMTP', 'SMTPS', 'SNMP',
'SOCKS', 'SSDP', 'SSH', 'STUN', 'Signal', 'Sina(Weibo)', 'Skype',
'SkypeCall', 'Slack', 'Snapchat', 'SoundCloud', 'Spotify',
'Starcraft', 'Steam', 'Syslog', 'TLS', 'Targus Dataspeed',
'TeamViewer', 'Telegram', 'Teredo', 'TikTok', 'Tor', 'Tuenti',
'Twitch', 'Twitter', 'UBNTAC2', 'UbuntuONE', 'Unencrypted_Jabber',
'Unknown', 'VNC', 'Viber', 'Waze', 'WeChat', 'Webex', 'WhatsApp',
'WhatsAppCall', 'WhatsAppFiles', 'Whois-DAS', 'Wikipedia',
'WindowsUpdate', 'Xbox', 'Yahoo', 'YouTube', 'eBay', 'sFlow'],
dtype=object)


Decision Tree Model

In [46]: from sklearn import tree


clf_gini = tree.DecisionTreeClassifier(criterion='entropy', max_depth=20, random_state

# fit the model


clf_gini.fit(X_train_scaled, Y_train_encode)
clf_gini.score(X_train_scaled, Y_train_encode)
Out[46]: 0.8926711790916095

In [47]: y_pred_gini = clf_gini.predict(X_test_scaled)

In [48]: from sklearn.metrics import accuracy_score

tree_train_accuracy = clf_gini.score(X_train_scaled, Y_train_encode)
tree_accuracy = clf_gini.score(X_test_scaled, Y_test_encode)

print("Training score: {:.3f}".format(clf_gini.score(X_train_scaled, Y_train_encode)))
print("Test score: {:.3f}".format(clf_gini.score(X_test_scaled, Y_test_encode)))
Training score: 0.893
Test score: 0.003

In [49]: # Let's split the training data into 5 folds.
# We will use this 'kf' (StratifiedKFold splitting strategy) object as input to cross_val_score().
# The folds are made by preserving the percentage of samples for each class.
from sklearn.model_selection import StratifiedKFold
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cnt = 1
# split() generates the indices that divide the data into training and test sets.
for train_index, test_index in kf.split(X_train_scaled, Y_train_encode):
    print(f'Fold:{cnt}, Train set: {len(train_index)}, Test set:{len(test_index)}')
    cnt += 1

# Note that:
# cross_val_score()'s 'cv' parameter uses a StratifiedKFold splitting strategy by default,
# so you can bypass the step above and just specify cv=5 in cross_val_score().

C:\Users\ditama\anaconda3\lib\site-packages\sklearn\model_selection\_split.p
y:684: UserWarning: The least populated class in y has only 1 members, which
is less than n_splits=5.
warnings.warn(

Fold:1, Train set: 1081931, Test set:270483


Fold:2, Train set: 1081931, Test set:270483
Fold:3, Train set: 1081931, Test set:270483
Fold:4, Train set: 1081931, Test set:270483
Fold:5, Train set: 1081932, Test set:270482


In [50]: from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the decision tree over the stratified folds of the training split
score = cross_val_score(tree.DecisionTreeClassifier(criterion='entropy', max_depth=20),
                        X_train_scaled, Y_train_encode, cv=kf)
print(f'Scores for each fold are: {score}')
print(f'Average score: {"{:.2f}".format(score.mean())}')

C:\Users\ditama\anaconda3\lib\site-packages\sklearn\model_selection\_split.p
y:684: UserWarning: The least populated class in y has only 1 members, which
is less than n_splits=5.
warnings.warn(

Scores for each fold are: [0.80910815 0.80830958 0.80759604 0.80804709 0.8090
2241]
Average score: 0.81

In [51]: # Let's split the test data into 5 folds as well.
# We will use this 'kf2' (StratifiedKFold splitting strategy) object as input to cross_val_score().
# The folds are made by preserving the percentage of samples for each class.
from sklearn.model_selection import StratifiedKFold
kf2 = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cnt = 1
# split() generates the indices that divide the data into training and test sets.
for train_index, test_index in kf2.split(X_test_scaled, Y_test_encode):
    print(f'Fold:{cnt}, Train set: {len(train_index)}, Test set:{len(test_index)}')
    cnt += 1

# Note that:
# cross_val_score()'s 'cv' parameter uses a StratifiedKFold splitting strategy by default,
# so you can bypass the step above and just specify cv=5 in cross_val_score().

C:\Users\ditama\anaconda3\lib\site-packages\sklearn\model_selection\_split.p
y:684: UserWarning: The least populated class in y has only 1 members, which
is less than n_splits=5.
warnings.warn(

Fold:1, Train set: 1081932, Test set:270483


Fold:2, Train set: 1081932, Test set:270483
Fold:3, Train set: 1081932, Test set:270483
Fold:4, Train set: 1081932, Test set:270483
Fold:5, Train set: 1081932, Test set:270483

In [52]: from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the decision tree over the stratified folds of the test split
score = cross_val_score(tree.DecisionTreeClassifier(criterion='entropy', max_depth=20),
                        X_test_scaled, Y_test_encode, cv=kf2)
print(f'Scores for each fold are: {score}')
print(f'Average score: {"{:.2f}".format(score.mean())}')

C:\Users\ditama\anaconda3\lib\site-packages\sklearn\model_selection\_split.p
y:684: UserWarning: The least populated class in y has only 1 members, which
is less than n_splits=5.
warnings.warn(

Scores for each fold are: [0.80932628 0.80940392 0.80885305 0.80842789 0.8093
6695]
Average score: 0.81


Naive Bayes

In [53]: # train a Gaussian Naive Bayes classifier on the training set


from sklearn.naive_bayes import GaussianNB
# instantiate the model
gnb = GaussianNB()
# fit the model
gnb.fit(X_train_scaled, Y_train_encode)
Out[53]: ▾ GaussianNB

GaussianNB()

In [54]: print("Training accuracy = ",gnb.score(X_train_scaled,Y_train_encode))


#Print Test Accuracy
gnb_accuracy = gnb.score(X_test_scaled,Y_test_encode)
print("Testing accuracy = " gnb score(X_test_scaled Y_test_encode))
Training accuracy = 0.09942961252989099
Testing accuracy = 0.0038819445214671533

KNN Model

In [55]: #Model Classification KNN using n_neighbors = 3


from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X_train_scaled, Y_train_encode)
# store the predicted response values
Out[55]: ▾ KNeighborsClassifier
KNeighborsClassifier(n_neighbors=3)

In [56]: print("Training score: {:.3f}".format(neigh.score(X_train_scaled, Y_train_encode


print("Test score: {:.3f}" format(neigh score(X_test_scaled Y_test_encode)))
Training score: 0.840
Test score: 0.002

Multi-Layer Perceptron

In [57]: from sklearn.neural_network import MLPClassifier

In [58]: mlp = MLPClassifier(hidden_layer_sizes=(3,2),activation='relu')


mlp
Out[58]: ▾ MLPClassifier
MLPClassifier(hidden_layer_sizes=(3, 2))


In [59]: mlp.fit(X_train_scaled, Y_train_encode)

Out[59]: ▾ MLPClassifier
MLPClassifier(hidden_layer_sizes=(3, 2))

In [60]: print("Training accuracy = ",mlp.score(X_train_scaled,Y_train_encode))


#Print Test Accuracy
print("Testing accuracy = " mlp score(X_test_scaled Y_test_encode))
Training accuracy = 0.4314418513857443
Testing accuracy = 0.004001730238129568

Random Forest

In [61]: from sklearn.ensemble import RandomForestClassifier


# Using the Random Forest Classifier ensemble algorithm from scikit-learn
modelRF = RandomForestClassifier(n_estimators=1)
In [62]: modelRF.fit(X_train_scaled, Y_train_encode)

Out[62]: ▾ RandomForestClassifier
RandomForestClassifier(n_estimators=1)

In [63]: print("Training accuracy = ",modelRF.score(X_train_scaled,Y_train_encode))


#Print Test Accuracy
print("Testing accuracy = " modelRF score(X_test_scaled Y_test_encode))
Training accuracy = 0.8809173818076418
Testing accuracy = 0.004890510679044524

Evaluation With DT


In [64]: y_pred = clf_gini.predict(X_train_scaled)  # predictions on the training split

from sklearn import metrics

tree_cm = metrics.confusion_matrix(Y_train_encode, y_pred)

plt.figure(figsize=(10,10))
sns.heatmap(tree_cm, annot=True, fmt=".0f", linewidths=.5, square = True, cmap
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'Confusion Matrix - score:' + str(metrics.accuracy_score(Y_train_encode, y_pred))
plt.title(all_sample_title, size = 15);
plt.show()
print(metrics.classification_report(Y_train_encode, y_pred))

In [65]: y = label_encoder.inverse_transform([28,17,102,116])
y
Out[65]: array(['Google', 'DNS', 'TLS', 'Unknown'], dtype=object)

