
# Load Libraries - Make sure to run this cell!

import pandas as pd
import numpy as np
import re, os
from string import printable
from sklearn import model_selection
#import gensim
import tensorflow as tf
from keras.models import Sequential, Model, model_from_json
from keras import regularizers
from keras.layers import Dense, Dropout, Activation, Lambda, Flatten
from keras.layers import Input, ELU, LSTM, Embedding, Convolution2D, MaxPooling2D, \
    BatchNormalization, Convolution1D, MaxPooling1D, concatenate
from keras.preprocessing import sequence
from keras.optimizers import SGD, Adam, RMSprop
from keras import backend as K
from pathlib import Path
import json
import warnings

warnings.filterwarnings("ignore")

DATA_HOME = 'data/'
df = pd.read_csv(DATA_HOME + 'url_data_mega_deep_learning.csv')
df.sample(n=25).head(25)
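# A quick sanity check on the loaded data is useful here. This is a small sketch
# (not in the original notebook); it assumes the CSV has the two columns used
# below, 'url' and 'isMalicious':
print(df.columns.tolist())
print(df.isMalicious.value_counts())  # class balance: benign (0) vs. malicious (1)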

# Convert each URL to a sequence of integer tokens: every printable character maps
# to its index in string.printable, shifted by 1 so that 0 is reserved for padding
url_int_tokens = [[printable.index(x) + 1 for x in url if x in printable] for url in df.url]

# Pad (or truncate) every sequence to a fixed length of 75 tokens
max_len = 75
X = sequence.pad_sequences(url_int_tokens, maxlen=max_len)

target = np.array(df.isMalicious)

print('Matrix dimensions of X: ', X.shape, 'Vector dimension of target: ', target.shape)
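# A tiny worked example of the encoding (illustrative only, not part of the
# original notebook): string.printable starts with the digits, then the lowercase
# letters, so '0' sits at index 0 (token 1) and 'a' at index 10 (token 11)
demo_tokens = [[printable.index(x) + 1 for x in 'a0.com']]
print(demo_tokens)  # [[11, 1, 76, 13, 25, 23]]
print(sequence.pad_sequences(demo_tokens, maxlen=10))  # zero-padded on the left to length 10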

X_train, X_test, target_train, target_test = model_selection.train_test_split(
    X, target, test_size=0.25, random_state=33)
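# Optional variant (an assumption on my part, not in the original notebook): with
# imbalanced classes it can help to stratify the split so train and test keep the
# same malicious/benign ratio:
#
# X_train, X_test, target_train, target_test = model_selection.train_test_split(
#     X, target, test_size=0.25, random_state=33, stratify=target)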

def print_layers_dims(model):
    l_layers = model.layers
    # Note None is ALWAYS batch_size
    for i in range(len(l_layers)):
        print(l_layers[i])
        print('Input Shape: ', l_layers[i].input_shape, 'Output Shape: ', l_layers[i].output_shape)

def save_model(model, fileModelJSON, fileWeights):
    #print("Saving model to disk: ", fileModelJSON, "and", fileWeights)
    # Requires h5py for the weights file; overwrite any existing files
    if Path(fileModelJSON).is_file():
        os.remove(fileModelJSON)
    json_string = model.to_json()
    with open(fileModelJSON, 'w') as f:
        json.dump(json_string, f)
    if Path(fileWeights).is_file():
        os.remove(fileWeights)
    model.save_weights(fileWeights)

def load_model(fileModelJSON, fileWeights):
    #print("Loading model from disk: ", fileModelJSON, "and", fileWeights)
    # This helper replaces keras.models.load_model for this notebook: it rebuilds
    # the architecture from JSON and then loads the weights from HDF5
    with open(fileModelJSON, 'r') as f:
        model_json = json.load(f)
    model = model_from_json(model_json)
    model.load_weights(fileWeights)
    return model
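# As an aside, newer Keras versions can do this round trip in one call per
# direction; a minimal sketch, assuming a Keras 2.x or later install:
#
#   model.save(filepath)                        # architecture + weights + optimizer state
#   model = keras.models.load_model(filepath)   # restore everything in one step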

def conv_fully(max_len=75, emb_dim=32, max_vocab_len=101, W_reg=regularizers.l2(1e-4)):
    # max_vocab_len is 101: tokens 1..100 cover the 100 characters of
    # string.printable, and 0 is the padding token

    # Input
    main_input = Input(shape=(max_len,), dtype='int32', name='main_input')

    # Embedding layer
    emb = Embedding(input_dim=max_vocab_len, output_dim=emb_dim, input_length=max_len,
                    embeddings_regularizer=W_reg)(main_input)
    emb = Dropout(0.25)(emb)

    def sum_1d(X):
        # Sum-pool over the sequence axis: (batch, time, filters) -> (batch, filters)
        return K.sum(X, axis=1)

    def get_conv_layer(emb, kernel_size=5, filters=256):
        conv = Convolution1D(kernel_size=kernel_size, filters=filters,
                             padding='same')(emb)
        conv = ELU()(conv)
        conv = Lambda(sum_1d, output_shape=(filters,))(conv)
        #conv = BatchNormalization()(conv)
        conv = Dropout(0.5)(conv)
        return conv

    # Four parallel convolutional branches over character n-grams of width 2-5
    conv1 = get_conv_layer(emb, kernel_size=2, filters=256)
    conv2 = get_conv_layer(emb, kernel_size=3, filters=256)
    conv3 = get_conv_layer(emb, kernel_size=4, filters=256)
    conv4 = get_conv_layer(emb, kernel_size=5, filters=256)
    merged = concatenate([conv1, conv2, conv3, conv4], axis=1)

    # Fully connected layers
    hidden1 = Dense(1024)(merged)
    hidden1 = ELU()(hidden1)
    hidden1 = BatchNormalization()(hidden1)
    hidden1 = Dropout(0.5)(hidden1)

    hidden2 = Dense(1024)(hidden1)
    hidden2 = ELU()(hidden2)
    hidden2 = BatchNormalization()(hidden2)
    hidden2 = Dropout(0.5)(hidden2)

    # Output layer: sigmoid probability that the URL is malicious
    output = Dense(1, activation='sigmoid', name='output')(hidden2)

    model = Model(inputs=[main_input], outputs=[output])
    adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

    return model
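# The Lambda(sum_1d) step in get_conv_layer above is sum-pooling. A tiny numpy
# illustration of the semantics (not from the original notebook): summing over
# axis 1 collapses the time dimension, leaving one value per filter.
demo_act = np.ones((1, 75, 4))     # (batch, time steps, filters), as after Conv1D
print(demo_act.sum(axis=1).shape)  # (1, 4): one pooled value per filter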
epochs = 5
batch_size = 32

model = conv_fully()
model.fit(X_train, target_train, epochs=epochs, batch_size=batch_size)
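# During development it can also help to monitor a held-out slice of the training
# data each epoch; a sketch using Keras's standard validation_split argument (this
# call is not in the original notebook):
#
# model.fit(X_train, target_train, epochs=epochs, batch_size=batch_size,
#           validation_split=0.1)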

loss, accuracy = model.evaluate(X_test, target_test, verbose=1)
print('\nFinal held-out test accuracy:', accuracy, '\n')
print_layers_dims(model)

# Predicted probability of being malicious for the first ten test URLs
target_proba = model.predict(X_test, batch_size=1)
target_proba[0:10]
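# To turn these probabilities into hard labels, threshold them. 0.5 is a common
# default, but the choice of threshold here is an assumption, not from the original:
target_pred = (target_proba > 0.5).astype(int)
target_pred[0:10]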

model_name = "deeplearning_1DConv"
save_model(model, DATA_HOME + model_name + ".json", DATA_HOME + model_name + ".h5")
model = load_model(DATA_HOME + model_name + ".json", DATA_HOME + model_name + ".h5")

l_layers = model.layers
weights = l_layers[1].get_weights()  # layer 1 is the Embedding layer
weights[0].shape  # (max_vocab_len, emb_dim) = (101, 32)
test_url_mal = "naureen.net/etisalat.ae/index2.php"
test_url_benign = "sixt.com/php/reservation?language=en_US"

# Encode a single URL exactly as the training data was encoded
url = test_url_benign
url_int_tokens = [[printable.index(x) + 1 for x in url if x in printable]]
max_len = 75
X = sequence.pad_sequences(url_int_tokens, maxlen=max_len)
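# Scoring the encoded URL follows the same pattern as the test-set prediction
# above; a minimal sketch, assuming the trained `model` is still in scope:
proba = model.predict(X)
print(url, '-> probability of being malicious:', proba[0][0])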
