Code :-
import pandas as pd
import numpy as np
import collections
import re

# Remove punctuation from the raw reviews and split them into the word lists
# l_doc1, l_doc2 and l_doc3 (one tokenised list per document)
# Build the vocabulary as the union of all words across the three documents
wordset12 = np.union1d(l_doc1, l_doc2)
wordset = np.union1d(wordset12, l_doc3)
print(wordset)

# Count how often each vocabulary word appears in a single document
def calculateBOW(wordset, l_doc):
    tf_diz = dict.fromkeys(wordset, 0)
    for word in l_doc:
        tf_diz[word] = l_doc.count(word)
    return tf_diz
We can finally obtain the bag-of-words representation of the reviews. In the end we obtain a DataFrame where each row corresponds to the extracted features of one document:
bow1 = calculateBOW(wordset,l_doc1)
bow2 = calculateBOW(wordset,l_doc2)
bow3 = calculateBOW(wordset,l_doc3)
df_bow = pd.DataFrame([bow1,bow2,bow3])
df_bow.head()
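Since collections is already imported, the same per-document counts could equally be computed with collections.Counter. A small optional sketch (calculateBOW_counter is a name introduced here, not part of the original):

# equivalent word counting with collections.Counter, one pass per document
def calculateBOW_counter(wordset, l_doc):
    counts = collections.Counter(l_doc)
    return {word: counts.get(word, 0) for word in wordset}

# usage: same result as calculateBOW above
bow1_alt = calculateBOW_counter(wordset, l_doc1)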
# The same bag-of-words features with scikit-learn's CountVectorizer,
# applied to the raw review strings doc1, doc2 and doc3
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
x = vectorizer.fit_transform([doc1, doc2, doc3])
print(vectorizer.get_feature_names_out())
df_bow_sklearn = pd.DataFrame(x.toarray(), columns=vectorizer.get_feature_names_out())
df_bow_sklearn.head()
import pandas as pd
dataset = pd.read_csv(r"C:\Users\HP\data.csv", encoding="ISO-8859-1")
dataset.head()
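Before preprocessing it can help to confirm the shape of the data and the label balance. A small sketch, assuming the first column holds the ham/spam label and the second the message text (which is what the iloc indexing below suggests):

# assumed layout: column 0 = label, column 1 = message text
print(dataset.shape)
print(dataset.iloc[:, 0].value_counts())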
# Text preprocessing: cleaning, tokenisation, stop-word removal and stemming
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize as wt

stemmer = PorterStemmer()
data = []
for i in range(dataset.shape[0]):
    sms = dataset.iloc[i, 1]
    # remove non-alphabetic characters
    sms = re.sub('[^A-Za-z]', ' ', sms)
    # tokenising
    tokenized_sms = wt(sms)
    sms_processed = []
    for word in tokenized_sms:
        if word not in set(stopwords.words('english')):
            sms_processed.append(stemmer.stem(word))
    sms_text = " ".join(sms_processed)
    data.append(sms_text)
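The Naive Bayes step below expects numeric feature matrices x_train/x_test and labels y_train/y_test, which this excerpt does not build. A minimal sketch of one way to get them from the cleaned data list, assuming the labels sit in the first column of the dataset:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# bag-of-words features from the cleaned messages (max_features chosen arbitrarily)
cv = CountVectorizer(max_features=2500)
x = cv.fit_transform(data).toarray()
y = dataset.iloc[:, 0].values  # assumption: labels live in the first column

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)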
# Naive Bayes classifier trained on the bag-of-words features
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

# Confusion matrix, classification report and accuracy on the test set
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
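To inspect the evaluation results, the objects computed above can simply be printed:

print(cm)
print(cr)
print("Accuracy:", accuracy)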
# TF-IDF representation of three example documents d0, d1 and d2 (plain strings)
from sklearn.feature_extraction.text import TfidfVectorizer
string = [d0, d1, d2]
tfidf = TfidfVectorizer()
result = tfidf.fit_transform(string)
result
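Here d0, d1 and d2 are assumed to be plain strings defined earlier. The fitted result is a sparse matrix; a small usage sketch for viewing it as a DataFrame, with one row per document and one column per vocabulary term:

# dense view of the TF-IDF scores
df_tfidf = pd.DataFrame(result.toarray(), columns=tfidf.get_feature_names_out())
print(df_tfidf)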
vectorizer = CountVectorizer()
# For our text, we are going to take some text from our previous blog post
# about count vectorization (sample_text is a list containing that text)
# To actually create the vectorizer, we simply need to call fit on the text
# data that we wish to fit
vectorizer.fit(sample_text)
# We could also do the whole thing at once with the fit_transform method:
print('One swoop:')
new_text = ["Today is the day that I do the thing today, today"]
new_vectorizer = CountVectorizer()
print(new_vectorizer.fit_transform(new_text).todense())
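Once a vectorizer has been fitted, its learned vocabulary can be inspected and reused on further text without refitting; a short usage sketch based on new_vectorizer above:

# the learned vocabulary maps each distinct word to a column index
print(new_vectorizer.vocabulary_)

# transform additional text with the already-fitted vectorizer
more_counts = new_vectorizer.transform(["the thing I do today is the thing"])
print(more_counts.toarray())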
df = pd.read_csv("C:\\Users\\Admin\\Downloads\\BBC_News_Train.csv")
df.head()
df.shape
df['Category'].value_counts()
import nltk
from nltk.corpus import stopwords
import string
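The apply call that follows relies on a text_cleaning helper that is not shown in this excerpt. A minimal sketch of such a function, using the modules imported above; the exact cleaning steps are an assumption (here it drops punctuation, lower-cases, removes stop words, and returns a token list):

stop_words = set(stopwords.words('english'))

def text_cleaning(text):
    # assumed steps: strip punctuation, lower-case, keep non-stop-word tokens
    text = ''.join(ch for ch in text if ch not in string.punctuation)
    return [w for w in text.lower().split() if w not in stop_words]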
print(df.iloc[:,1].apply(text_cleaning))
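The TF-IDF step below also assumes a bag-of-words matrix title_bow, a fitted tfidf_transformer, and a trained model, none of which appear in this excerpt. One plausible construction, sketched under the assumption that the article text is column 1 and the label is the Category column:

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# bag of words over the cleaned article text (text_cleaning returns token lists)
bow_transformer = CountVectorizer(analyzer=text_cleaning).fit(df.iloc[:, 1])
title_bow = bow_transformer.transform(df.iloc[:, 1])

# TF-IDF weighting and a multinomial Naive Bayes model on the weighted features
tfidf_transformer = TfidfTransformer().fit(title_bow)
model = MultinomialNB().fit(tfidf_transformer.transform(title_bow), df['Category'])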
# TF-IDF features for the articles, produced by the fitted transformer
title_tfidf = tfidf_transformer.transform(title_bow)
print(title_tfidf)
print(title_tfidf.shape)

# predict a category for every article with the trained model
all_predictions = model.predict(title_tfidf)
print(all_predictions)
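Since these predictions are made on the same articles the model was fitted on, comparing them with the known categories is only a sanity check, not a real evaluation; a short sketch:

from sklearn.metrics import accuracy_score
print(accuracy_score(df['Category'], all_predictions))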
import random
import nltk
from nltk.corpus import movie_reviews
# (word list, category) pairs for every review in the corpus
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)
all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())
all_words = nltk.FreqDist(all_words)
# take 3000 words from the frequency distribution as the feature vocabulary
word_features = list(all_words.keys())[:3000]

def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features

print(find_features(movie_reviews.words('neg/cv000_29416.txt')))
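A natural continuation, not shown in this excerpt, is to build feature sets for every labelled document and train NLTK's own Naive Bayes classifier; a sketch under that assumption:

# (features, label) pairs for every review, then a simple train/test split
featuresets = [(find_features(rev), category) for (rev, category) in documents]
training_set = featuresets[:1900]
testing_set = featuresets[1900:]

classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Accuracy:", nltk.classify.accuracy(classifier, testing_set))
classifier.show_most_informative_features(15)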