You are on page 1 of 2

# Trim surrounding whitespace from every header before matching it.
df.columns = df.columns.str.strip()

# Lower-cased header spellings (including the 'descrption' typo) mapped to
# the canonical column names used below.
columns = {
    'unnamed: 0': 'Date',
    'date': 'Date',
    'srcip': 'Source IP(s)',
    'source ip': 'Source IP(s)',
    'source_ip': 'Source IP(s)',
    'dstip': 'Destination IP(s)',
    'destination ip': 'Destination IP(s)',
    'destination_ip': 'Destination IP(s)',
    'alertname': 'Alert Name',
    'origin': 'Origin Country',
    'origincountry': 'Origin Country',
    'attacktype': 'Attack Type',
    'description': 'Description',
    'descrption': 'Description',
}

# Headers with a known alias get the canonical name; all others are kept.
df.columns = [columns.get(header.lower(), header) for header in df.columns]

# Discard any column whose name still starts with 'Unnamed'.
unnamed_mask = df.columns.str.contains('^Unnamed')
df = df.loc[:, ~unnamed_mask]

# Every column except 'Date' takes part in the emptiness check below.
# list.remove raises ValueError when there is no 'Date' column.
columns_to_check = list(df.columns)
columns_to_check.remove('Date')

# Drop rows that are missing in all of the checked columns.
df = df.dropna(subset=columns_to_check, how='all')

# Date pipeline: convert_serial_to_date, forward-fill the gaps, then
# parse_date_only. Series.ffill() is used instead of fillna(method='ffill')
# because the `method` argument of fillna is deprecated in pandas 2.1+.
df.loc[:, 'Date'] = (
    df['Date']
    .apply(convert_serial_to_date)
    .ffill()
    .apply(parse_date_only)
)
# Convert Ethiopian dates to Gregorian dates using ethiopian_date package
df.loc[:, 'Date'] = df['Date'].apply(ethiopian_to_gregorian)

# Put None wherever pd.notnull reports a missing value.
df = df.where(pd.notnull(df), None)

def _clean_ip_cell(value):
    """Return check_for_ip_domain's result for a str or list cell, else ''.

    A list is joined with ',' into one string before being checked.
    """
    if isinstance(value, list):
        value = ','.join(value)
    if isinstance(value, str):
        return check_for_ip_domain(value)
    return ''


def _collapse_whitespace(value):
    """Collapse whitespace runs in a str to single spaces; non-str gives ''.

    NOTE(review): the separator literal was split across two lines in the
    copy this was recovered from; a single space is assumed - confirm.
    """
    if not isinstance(value, str):
        return ''
    # split() with no argument already discards leading and trailing
    # whitespace, so no extra strip() is needed after the join.
    return ' '.join(value.split())


# Address columns go through check_for_ip_domain.
for _ip_column in ('Source IP(s)', 'Destination IP(s)'):
    df[_ip_column] = df[_ip_column].apply(_clean_ip_cell)

# Free-text columns only get their whitespace normalised.
for _text_column in ('Alert Name', 'Origin Country', 'Attack Type', 'Description'):
    df[_text_column] = df[_text_column].apply(_collapse_whitespace)

# Work on a copy so the frame written to CSV keeps its display headers.
df_json = df.copy()

# Turn headers into JSON keys: lower-case, spaces to underscores, and
# parentheses removed (e.g. 'Source IP(s)' becomes 'source_ips').
json_names = df_json.columns.str.lower()
json_names = json_names.str.replace(' ', '_')
json_names = json_names.str.replace(r'[()]+', '', regex=True)
df_json.columns = json_names

df_json.rename(
    columns={"origin_country": "attack_origin", "alert_name": "alert_message"},
    inplace=True,
)

# Each address cell is a comma-separated string; split it into a list.
for ip_field in ('source_ips', 'destination_ips'):
    df_json[ip_field] = df_json[ip_field].apply(lambda cell: cell.split(','))

# Title-case the origin, then pass each value through replace_with_match.
df_json['attack_origin'] = df_json['attack_origin'].str.title().apply(replace_with_match)
df_json['attack_type'] = df_json['attack_type'].str.title()

# Serialise the JSON-shaped frame as a list of record objects.
json_data = df_json.to_json(orient='records')
if filename:
    # The pandas JSON string is parsed and re-written through json.dump.
    # The encoding is given explicitly so it does not depend on the
    # platform default, matching the CSV file below.
    with open(filename + "_cleaned.json", 'w', encoding='utf-8') as json_file:
        json.dump(json.loads(json_data), json_file)

    # The handle already sets the encoding, so to_csv is not given one.
    with open(filename + "_cleaned.csv", 'w', encoding='utf-8', newline='') as csv_file:
        df.to_csv(csv_file, index=False)

    # Reload the frame from the CSV that was just written.
    # NOTE(review): indentation was lost in the copy this was recovered
    # from; the reload is assumed to belong inside `if filename:` because
    # it builds its path from `filename` - confirm.
    df = pd.read_csv(filename + "_cleaned.csv", encoding='utf-8')

You might also like