
Name :- Patel Vedant Vikrambhai

Reg. No. :- 20BCE1779
Subject :- Web Mining
Code :- CSE3024
Lab :- 5

Problem-1
Write a Naïve Bayes Classifier in Python without using any package for the following dataset.
Steps:
• Calculate the prior probability of each class
• Calculate the posterior probability of each class for the test document
• Predict the class with the highest posterior probability
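In symbols, for a test document with feature values v1, ..., vn, the predicted class is the c that maximizes P(c) · P(f1 = v1 | c) · ... · P(fn = vn | c), where the prior P(c) and each likelihood P(fi = vi | c) are estimated as relative frequencies from the training table in the code below.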

Code :-

import pandas as pd

# Term-frequency table: 7 training documents, 8 term features, 1 class label
df = pd.DataFrame({
    'Document': ['D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7'],
    'TDP': [4, 0, 0, 4, 0, 0, 5],
    'Nifty': [0, 5, 0, 1, 0, 4, 0],
    'Sidhu': [3, 0, 6, 0, 0, 0, 0],
    'BJP': [5, 2, 1, 1, 0, 2, 3],
    'Sensex': [1, 6, 0, 1, 0, 6, 0],
    'Sixer': [0, 0, 4, 0, 5, 0, 0],
    'Congress': [6, 1, 1, 6, 0, 0, 5],
    'Century': [0, 0, 2, 0, 6, 1, 0],
    'Category': ['Politics', 'Business', 'Sports', 'Politics',
                 'Sports', 'Business', 'Politics']
})

# Prior probability of each class: fraction of training documents in the class
prior_probs = {}
for category in df['Category'].unique():
    prior_probs[category] = len(df[df['Category'] == category]) / len(df)

# Likelihood of each observed feature value given each class,
# estimated as a relative frequency within the class
likelihood_probs = {}
for feature in df.columns[1:-1]:
    likelihood_probs[feature] = {}
    for value in df[feature].unique():
        feature_probs = {}
        for category in df['Category'].unique():
            feature_probs[category] = (len(df[(df[feature] == value) &
                                              (df['Category'] == category)]) /
                                       len(df[df['Category'] == category]))
        likelihood_probs[feature][value] = feature_probs

new_doc = [0, 3, 0, 2, 6, 0, 2]  # test document

# Posterior for each class: prior times the product of the likelihoods;
# a feature value never seen in training zeroes out the class
posterior_probs = {}
for category in df['Category'].unique():
    posterior_prob = prior_probs[category]
    for i, feature in enumerate(df.columns[1:-1]):
        if new_doc[i] in likelihood_probs[feature]:
            posterior_prob *= likelihood_probs[feature][new_doc[i]][category]
        else:
            posterior_prob = 0
            break
    posterior_probs[category] = posterior_prob

predicted_category = max(posterior_probs, key=posterior_probs.get)

print(f"Predicted category for the new document {new_doc}: {predicted_category}")
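Note that the else branch zeroes out a class as soon as the test document contains a feature value never seen with that class in training, so several classes can tie at zero and the argmax becomes arbitrary. A common refinement, not asked for in the problem, is Laplace (add-one) smoothing of the likelihoods; a minimal sketch of the changed estimate, reusing the variables above:

count = len(df[(df[feature] == value) & (df['Category'] == category)])
feature_probs[category] = (count + 1) / (len(df[df['Category'] == category]) +
                                         df[feature].nunique())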

Output :-

Challenging Exercise-1 :-
Take any set of web pages, do the necessary preprocessing, build a Naïve Bayes Classifier, and draw inferences.

Code :-

# Step 1: Install necessary packages
!pip install pandas numpy scikit-learn nltk

# Step 2: Load the dataset
from sklearn.datasets import fetch_20newsgroups

categories = ['alt.atheism', 'comp.graphics', 'sci.med',
              'soc.religion.christian']
train_dataset = fetch_20newsgroups(subset='train', categories=categories,
                                   shuffle=True, random_state=42)
test_dataset = fetch_20newsgroups(subset='test', categories=categories,
                                  shuffle=True, random_state=42)

# Step 3: Preprocessing the data
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

nltk.download('stopwords')

stemmer = SnowballStemmer('english')
stop_words = stopwords.words('english')

def preprocess_text(text):
    # lower-case the text, drop stopwords, and stem each remaining word
    words = text.lower().split()
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    return ' '.join(words)
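
# Quick sanity check (an added illustration, not part of the original lab):
# preprocess_text("The doctors are studying new medicines") should return
# approximately 'doctor studi new medicin', i.e. stopwords dropped and the
# remaining words stemmed.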

X_train = [preprocess_text(text) for text in train_dataset.data]
y_train = train_dataset.target

X_test = [preprocess_text(text) for text in test_dataset.data]
y_test = test_dataset.target

# Step 4: Building the Naive Bayes Classifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

text_clf = Pipeline([('vect', CountVectorizer()), ('clf', MultinomialNB())])
text_clf.fit(X_train, y_train)

# Step 5: Evaluating the Classifier
from sklearn.metrics import accuracy_score

predicted = text_clf.predict(X_test)
accuracy = accuracy_score(y_test, predicted)
print(f"Accuracy: {accuracy}")
Output :-
Challenging Exercise-2 :-

Code :-
# Step 1: Install necessary packages
!pip install pandas numpy scikit-learn nltk beautifulsoup4 requests

# Step 2: Crawl and scrape the web pages
import requests
from bs4 import BeautifulSoup

urls = ['https://www.bbc.com/news/world',
        'https://edition.cnn.com/world',
        'https://www.aljazeera.com/news']

articles = []
for url in urls:
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    headlines = soup.find_all('h3')
    for headline in headlines:
        articles.append(headline.text.strip())
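
# Note (added, not in the original lab): live news sites sometimes block bare
# scripted requests or change their markup, so the h3 selector may return few
# or no headlines. A slightly more robust fetch, using standard requests options:
# response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)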
# Step 3: Preprocessing the data
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

nltk.download('stopwords')
stemmer = SnowballStemmer('english')
stop_words = stopwords.words('english')

def preprocess_text(text):
    # same preprocessing as in Exercise 1: lower-case, remove stopwords, stem
    words = text.lower().split()
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    return ' '.join(words)

X = [preprocess_text(text) for text in articles]
y = [0] * len(articles)  # setting all articles to class 0
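
# Caveat (added note, not in the original): with a single class, MultinomialNB
# trivially predicts 0 for every input, so the prediction in Step 5 carries no
# information. A sketch of a more informative labelling, assuming each headline
# keeps the index of its source site:
# y = []
# for i, url in enumerate(urls):
#     soup = BeautifulSoup(requests.get(url).content, 'html.parser')
#     for headline in soup.find_all('h3'):
#         articles.append(headline.text.strip())
#         y.append(i)  # label = source-site index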

# Step 4: Building the Naive Bayes Classifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

text_clf = Pipeline([('vect', CountVectorizer()), ('clf', MultinomialNB())])
text_clf.fit(X, y)

# Step 5: Evaluating the Classifier
test_article = "New Zealand records zero community cases of Covid-19"
preprocessed_test_article = preprocess_text(test_article)
predicted_class = text_clf.predict([preprocessed_test_article])[0]
print(f"Predicted class: {predicted_class}")

Output :-
