Professional Documents
Culture Documents
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Unit-2\n"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data Loading"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Loading the dataset\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import pandas as pd"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Importing the mail data\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"X-GM-THRID\n",
"X-Gmail-Labels\n",
"MIME-Version\n",
"Date\n",
"Message-ID\n",
"Subject\n",
"From\n",
"To\n",
"Content-Type\n"
]
},
{
"data": {
"text/plain": [
"<mailbox.mbox at 0x12a9cacd490>"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import mailbox\n",
"\n",
"# Open the exported mail archive as an mbox mailbox.\n",
"mboxfile = \"./All mail Including Spam and Trash.mbox\"\n",
"mbox = mailbox.mbox(mboxfile)\n",
"\n",
"# Print the header names of the first message, one per line, to see\n",
"# which fields are available for extraction.\n",
"for key in mbox[0].keys():\n",
"    print(key)\n",
"\n",
"# The bare name makes the mailbox object the cell's displayed result.\n",
"mbox"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Data transformation"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Data cleansing"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
"\n",
"# Export the headers of interest from every message into a flat CSV file.\n",
"# newline='' is required by the csv module: without it the writer's own\n",
"# CRLF row terminator is translated a second time on Windows, leaving a\n",
"# stray blank line after every row. The explicit UTF-8 encoding keeps\n",
"# non-ASCII subjects and sender names from failing under a\n",
"# platform-specific default codec, and matches the default encoding\n",
"# pandas.read_csv uses when the file is loaded back.\n",
"with open('testdata.csv', 'w', newline='', encoding='utf-8') as outputfile:\n",
"    writing = csv.writer(outputfile)\n",
"    writing.writerow(['subject', 'from', 'date', 'to', 'label', 'Thread'])\n",
"\n",
"    for message in mbox:\n",
"        # A header that is absent from a message is returned as None,\n",
"        # which csv.writer emits as an empty field.\n",
"        writing.writerow([\n",
"            message['subject'],\n",
"            message['from'],\n",
"            message['date'],\n",
"            message['to'],\n",
"            message['X-Gmail-Labels'],\n",
"            message['X-GM-THRID']\n",
"        ])\n"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Loading the CSV file\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>subject</th>\n",
" <th>from</th>\n",
" <th>date</th>\n",
" <th>to</th>\n",
" <th>label</th>\n",
" <th>Thread</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Conceptual Physics book file</td>\n",
" <td>Kadarla Sai Nithin <sainithink.be25@uceou.edu></td>\n",
" <td>Wed, 11 Jan 2023 10:00:04 +0530</td>\n",
" <td>Suresh Kumar Lokhande <suresh.l@uceou.edu></td>\n",
" <td>Archived,Sent</td>\n",
" <td>1754524927945493361</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Thank you for signing up for ThingSpeak!</td>\n",
" <td>ThingSpeak Support <support@mail.thingspeak.com></td>\n",
" <td>Sat, 22 Apr 2023 09:56:31 +0000</td>\n",
" <td>sainithink.be25@uceou.edu</td>\n",
" <td>Inbox,Category Updates,Unread</td>\n",
" <td>1763869869345248847</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Get the official Gmail app</td>\n",
" <td>Gmail Team <mail-noreply@google.com></td>\n",
" <td>Wed, 30 Nov 2022 00:05:59 -0800</td>\n",
" <td>Kadarla Sai Nithin <sainithink.be25@uceou.edu></td>\n",
" <td>Inbox,Unread</td>\n",
" <td>1750907548775931784</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Reset Your Password</td>\n",
" <td>Xmind Notifications <notifications@mail.xmind....</td>\n",
" <td>Tue, 16 May 2023 15:37:20 +0000</td>\n",
" <td>sainithink.be25@uceou.edu</td>\n",
" <td>Inbox,Important,Opened,Category Updates</td>\n",
" <td>1766065641328425275</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Security alert</td>\n",
" <td>Google <no-reply@accounts.google.com></td>\n",
" <td>Sat, 25 Feb 2023 04:51:44 GMT</td>\n",
" <td>sainithink.be25@uceou.edu</td>\n",
" <td>Inbox,Category Updates,Unread</td>\n",
" <td>1758777265723926184</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>61</th>\n",
" <td>Verify Email Address</td>\n",
" <td>service@account.mathworks.com</td>\n",
" <td>Sat, 22 Apr 2023 09:55:12 +0000</td>\n",
" <td>sainithink.be25@uceou.edu</td>\n",
" <td>Inbox,Important,Opened,Category Updates</td>\n",
" <td>1763869787228555223</td>\n",
" </tr>\n",
" <tr>\n",
" <th>62</th>\n",
" <td>uceou.edu account email verification code</td>\n",
" <td>msonlineservicesteam@microsoftonline.com</td>\n",
" <td>Sun, 19 Mar 2023 23:55:52 -0700</td>\n",
" <td>Sainithink.be25@uceou.edu</td>\n",
" <td>Inbox,Important,Opened,Category Updates</td>\n",
" <td>1760868809297254026</td>\n",
" </tr>\n",
" <tr>\n",
" <th>63</th>\n",
" <td>uceou.edu account email verification code</td>\n",
" <td>msonlineservicesteam@microsoftonline.com</td>\n",
" <td>Mon, 13 Mar 2023 23:40:25 -0700</td>\n",
" <td>Sainithink.be25@uceou.edu</td>\n",
" <td>Inbox,Important,Opened,Category Updates</td>\n",
" <td>1760324258667829753</td>\n",
" </tr>\n",
" <tr>\n",
" <th>64</th>\n",
" <td>Security alert</td>\n",
" <td>Google <no-reply@accounts.google.com></td>\n",
" <td>Fri, 19 May 2023 10:03:42 GMT</td>\n",
" <td>sainithink.be25@uceou.edu</td>\n",
" <td>Inbox,Opened,Category Updates</td>\n",
" <td>1766316440594475561</td>\n",
" </tr>\n",
" <tr>\n",
" <th>65</th>\n",
" <td>[Request received]</td>\n",
" <td>\"Tech (Support)\" <agents@xmind.zendesk.com></td>\n",
" <td>Wed, 08 Feb 2023 14:48:12 +0000</td>\n",
" <td>\"sainithink.be25\" <sainithink.be25@uceou.edu></td>\n",
" <td>Inbox,Category Updates,Unread</td>\n",
" <td>1757274643381080149</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>66 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" subject \\\n",
"0 Conceptual Physics book file \n",
"1 Thank you for signing up for ThingSpeak! \n",
"2 Get the official Gmail app \n",
"3 Reset Your Password \n",
"4 Security alert \n",
".. ... \n",
"61 Verify Email Address \n",
"62 uceou.edu account email verification code \n",
"63 uceou.edu account email verification code \n",
"64 Security alert \n",
"65 [Request received] \n",
"\n",
" from \\\n",
"0 Kadarla Sai Nithin <sainithink.be25@uceou.edu> \n",
"1 ThingSpeak Support <support@mail.thingspeak.com> \n",
"2 Gmail Team <mail-noreply@google.com> \n",
"3 Xmind Notifications <notifications@mail.xmind.... \n",
"4 Google <no-reply@accounts.google.com> \n",
".. ... \n",
"61 service@account.mathworks.com \n",
"62 msonlineservicesteam@microsoftonline.com \n",
"63 msonlineservicesteam@microsoftonline.com \n",
"64 Google <no-reply@accounts.google.com> \n",
"65 \"Tech (Support)\" <agents@xmind.zendesk.com> \n",
"\n",
" date \\\n",
"0 Wed, 11 Jan 2023 10:00:04 +0530 \n",
"1 Sat, 22 Apr 2023 09:56:31 +0000 \n",
"2 Wed, 30 Nov 2022 00:05:59 -0800 \n",
"3 Tue, 16 May 2023 15:37:20 +0000 \n",
"4 Sat, 25 Feb 2023 04:51:44 GMT \n",
".. ... \n",
"61 Sat, 22 Apr 2023 09:55:12 +0000 \n",
"62 Sun, 19 Mar 2023 23:55:52 -0700 \n",
"63 Mon, 13 Mar 2023 23:40:25 -0700 \n",
"64 Fri, 19 May 2023 10:03:42 GMT \n",
"65 Wed, 08 Feb 2023 14:48:12 +0000 \n",
"\n",
" to \\\n",
"0 Suresh Kumar Lokhande <suresh.l@uceou.edu> \n",
"1 sainithink.be25@uceou.edu \n",
"2 Kadarla Sai Nithin <sainithink.be25@uceou.edu> \n",
"3 sainithink.be25@uceou.edu \n",
"4 sainithink.be25@uceou.edu \n",
".. ... \n",
"61 sainithink.be25@uceou.edu \n",
"62 Sainithink.be25@uceou.edu \n",
"63 Sainithink.be25@uceou.edu \n",
"64 sainithink.be25@uceou.edu \n",
"65 \"sainithink.be25\" <sainithink.be25@uceou.edu> \n",
"\n",
" label Thread \n",
"0 Archived,Sent 1754524927945493361 \n",
"1 Inbox,Category Updates,Unread 1763869869345248847 \n",
"2 Inbox,Unread 1750907548775931784 \n",
"3 Inbox,Important,Opened,Category Updates 1766065641328425275 \n",
"4 Inbox,Category Updates,Unread 1758777265723926184 \n",
".. ... ... \n",
"61 Inbox,Important,Opened,Category Updates 1763869787228555223 \n",
"62 Inbox,Important,Opened,Category Updates 1760868809297254026 \n",
"63 Inbox,Important,Opened,Category Updates 1760324258667829753 \n",
"64 Inbox,Opened,Category Updates 1766316440594475561 \n",
"65 Inbox,Category Updates,Unread 1757274643381080149 \n",
"\n",
"[66 rows x 6 columns]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read the exported header table back into a DataFrame and display it.\n",
"dfs = pd.read_csv(filepath_or_buffer=\"testdata.csv\")\n",
"dfs"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Converting the data"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"subject object\n",
"from object\n",
"date object\n",
"to object\n",
"label object\n",
"Thread int64\n",
"dtype: object\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>subject</th>\n",
" <th>from</th>\n",
" <th>date</th>\n",
" <th>to</th>\n",
" <th>label</th>\n",
" <th>Thread</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Conceptual Physics book file</td>\n",
" <td>Kadarla Sai Nithin <sainithink.be25@uceou.edu></td>\n",
" <td>2023-01-11 04:30:04+00:00</td>\n",
" <td>Suresh Kumar Lokhande <suresh.l@uceou.edu></td>\n",
" <td>Archived,Sent</td>\n",
" <td>1754524927945493361</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Thank you for signing up for ThingSpeak!</td>\n",
" <td>ThingSpeak Support <support@mail.thingspeak.com></td>\n",
" <td>2023-04-22 09:56:31+00:00</td>\n",
" <td>sainithink.be25@uceou.edu</td>\n",
" <td>Inbox,Category Updates,Unread</td>\n",
" <td>1763869869345248847</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Get the official Gmail app</td>\n",
" <td>Gmail Team <mail-noreply@google.com></td>\n",
" <td>2022-11-30 08:05:59+00:00</td>\n",
" <td>Kadarla Sai Nithin <sainithink.be25@uceou.edu></td>\n",
" <td>Inbox,Unread</td>\n",
" <td>1750907548775931784</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Reset Your Password</td>\n",
" <td>Xmind Notifications <notifications@mail.xmind....</td>\n",
" <td>2023-05-16 15:37:20+00:00</td>\n",
" <td>sainithink.be25@uceou.edu</td>\n",
" <td>Inbox,Important,Opened,Category Updates</td>\n",
" <td>1766065641328425275</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Security alert</td>\n",
" <td>Google <no-reply@accounts.google.com></td>\n",
" <td>2023-02-25 04:51:44+00:00</td>\n",
" <td>sainithink.be25@uceou.edu</td>\n",
" <td>Inbox,Category Updates,Unread</td>\n",
" <td>1758777265723926184</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>61</th>\n",
" <td>Verify Email Address</td>\n",
" <td>service@account.mathworks.com</td>\n",
" <td>2023-04-22 09:55:12+00:00</td>\n",
" <td>sainithink.be25@uceou.edu</td>\n",
" <td>Inbox,Important,Opened,Category Updates</td>\n",
" <td>1763869787228555223</td>\n",
" </tr>\n",
" <tr>\n",
" <th>62</th>\n",
" <td>uceou.edu account email verification code</td>\n",
" <td>msonlineservicesteam@microsoftonline.com</td>\n",
" <td>2023-03-20 06:55:52+00:00</td>\n",
" <td>Sainithink.be25@uceou.edu</td>\n",
" <td>Inbox,Important,Opened,Category Updates</td>\n",
" <td>1760868809297254026</td>\n",
" </tr>\n",
" <tr>\n",
" <th>63</th>\n",
" <td>uceou.edu account email verification code</td>\n",
" <td>msonlineservicesteam@microsoftonline.com</td>\n",
" <td>2023-03-14 06:40:25+00:00</td>\n",
" <td>Sainithink.be25@uceou.edu</td>\n",
" <td>Inbox,Important,Opened,Category Updates</td>\n",
" <td>1760324258667829753</td>\n",
" </tr>\n",
" <tr>\n",
" <th>64</th>\n",
" <td>Security alert</td>\n",
" <td>Google <no-reply@accounts.google.com></td>\n",
" <td>2023-05-19 10:03:42+00:00</td>\n",
" <td>sainithink.be25@uceou.edu</td>\n",
" <td>Inbox,Opened,Category Updates</td>\n",
" <td>1766316440594475561</td>\n",
" </tr>\n",
" <tr>\n",
" <th>65</th>\n",
" <td>[Request received]</td>\n",
" <td>\"Tech (Support)\" <agents@xmind.zendesk.com></td>\n",
" <td>2023-02-08 14:48:12+00:00</td>\n",
" <td>\"sainithink.be25\" <sainithink.be25@uceou.edu></td>\n",
" <td>Inbox,Category Updates,Unread</td>\n",
" <td>1757274643381080149</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>66 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" subject \\\n",
"0 Conceptual Physics book file \n",
"1 Thank you for signing up for ThingSpeak! \n",
"2 Get the official Gmail app \n",
"3 Reset Your Password \n",
"4 Security alert \n",
".. ... \n",
"61 Verify Email Address \n",
"62 uceou.edu account email verification code \n",
"63 uceou.edu account email verification code \n",
"64 Security alert \n",
"65 [Request received] \n",
"\n",
" from \\\n",
"0 Kadarla Sai Nithin <sainithink.be25@uceou.edu> \n",
"1 ThingSpeak Support <support@mail.thingspeak.com> \n",
"2 Gmail Team <mail-noreply@google.com> \n",
"3 Xmind Notifications <notifications@mail.xmind.... \n",
"4 Google <no-reply@accounts.google.com> \n",
".. ... \n",
"61 service@account.mathworks.com \n",
"62 msonlineservicesteam@microsoftonline.com \n",
"63 msonlineservicesteam@microsoftonline.com \n",
"64 Google <no-reply@accounts.google.com> \n",
"65 \"Tech (Support)\" <agents@xmind.zendesk.com> \n",
"\n",
" date
to \\\n",
"0 2023-01-11 04:30:04+00:00 Suresh Kumar Lokhande
<suresh.l@uceou.edu> \n",
"1 2023-04-22 09:56:31+00:00
sainithink.be25@uceou.edu \n",
"2 2022-11-30 08:05:59+00:00 Kadarla Sai Nithin
<sainithink.be25@uceou.edu> \n",
"3 2023-05-16 15:37:20+00:00
sainithink.be25@uceou.edu \n",
"4 2023-02-25 04:51:44+00:00
sainithink.be25@uceou.edu \n",