
Name – Harsh

Roll Number – 36
Registration Number – 201800469
Semester – 7th
Section – C

Question: Implement an LSTM network for sentiment analysis of movie reviews on the IMDB dataset.

Solution:

1. Upload the dataset to Google Drive.

Code:

from google.colab import drive

drive.mount('/content/drive')
OUTPUT:
Mounted at /content/drive

#Import all the libraries needed.

import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model
import re

#Preview dataset.

data = pd.read_csv('/content/drive/MyDrive/IMDB/IMDB Dataset.csv')

print(data)
OUTPUT:
review sentiment
0 One of the other reviewers has mentioned that ... positive
1 A wonderful little production. <br /><br />The... positive
2 I thought this was a wonderful way to spend ti... positive
3 Basically there's a family where a little boy ... negative
4 Petter Mattei's "Love in the Time of Money" is... positive
... ... ...
49995 I thought this movie did a down right good job... positive
49996 Bad plot, bad dialogue, bad acting, idiotic di... negative
49997 I am a Catholic taught in parochial elementary... negative
49998 I'm going to have to disagree with the previou... negative
49999 No one expects the Star Trek movies to be high... negative

[50000 rows x 2 columns]
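
Before any preprocessing, it is worth confirming that the two sentiment classes are balanced. A minimal check with pandas (a sketch; the IMDB dataset is expected to contain 25,000 positive and 25,000 negative reviews):

#Check class balance (hedged sketch, not part of the assignment output).
print(data['sentiment'].value_counts())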

#Declare the English stop words.

import nltk
nltk.download('stopwords')
OUTPUT:
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data] Unzipping corpora/stopwords.zip.
True

english_stops = set(stopwords.words('english'))

#Clean the reviews and encode the sentiments.

def load_dataset():
    df = pd.read_csv('/content/drive/MyDrive/IMDB/IMDB Dataset.csv')
    x_data = df['review']
    y_data = df['sentiment']

    x_data = x_data.replace({'<.*?>': ''}, regex=True)        # strip HTML tags
    x_data = x_data.replace({'[^A-Za-z]': ' '}, regex=True)   # keep letters only
    x_data = x_data.apply(lambda review: [w for w in review.split() if w not in english_stops])
    x_data = x_data.apply(lambda review: [w.lower() for w in review])

    y_data = y_data.replace('positive', 1)
    y_data = y_data.replace('negative', 0)
    return x_data, y_data

x_data, y_data = load_dataset()

print('Reviews')
print(x_data, '\n')
print('Sentiment')
print(y_data)
OUTPUT:
Reviews
0 [one, reviewers, mentioned, watching, oz, epis...
1 [a, wonderful, little, production, the, filmin...
2 [i, thought, wonderful, way, spend, time, hot,...
3 [basically, family, little, boy, jake, thinks,...
4 [petter, mattei, love, time, money, visually, ...
...
49995 [i, thought, movie, right, good, job, it, crea...
49996 [bad, plot, bad, dialogue, bad, acting, idioti...
49997 [i, catholic, taught, parochial, elementary, s...
49998 [i, going, disagree, previous, comment, side, ...
49999 [no, one, expects, star, trek, movies, high, a...
Name: review, Length: 50000, dtype: object

Sentiment
0 1
1 1
2 1
3 0
4 1
..
49995 1
49996 0
49997 0
49998 0
49999 0
Name: sentiment, Length: 50000, dtype: int64
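
Note that capitalized stop words such as 'a', 'i', and 'the' survive the filter above, because the reviews are lowercased only after being matched against NLTK's lowercase stop-word list. Inside load_dataset(), swapping the two apply calls would remove them as well (an alternative sketch, not what the assignment code does):

#Alternative for load_dataset(): lowercase first, then filter, so that
#capitalized stop words ('The', 'I', 'A') are also removed.
x_data = x_data.apply(lambda review: [w.lower() for w in review.split()])
x_data = x_data.apply(lambda review: [w for w in review if w not in english_stops])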

#Split dataset.

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2)

print('Reviews')
print(x_train, '\n')
print(x_test, '\n')
print('Sentiments')
print(y_train, '\n')
print(y_test)
OUTPUT:
Reviews
45547 [this, best, films, far, christopher, guest, c...
46963 [bonanza, great, cast, wonderful, actors, lorn...
887 [a, number, contributors, mentioned, age, diff...
44061 [i, usually, spend, almost, much, time, writin...
26092 [bette, midler, best, thing, movie, it, poor, ...
...
7223 [you, know, film, serious, trouble, best, acti...
7226 [john, leguizemo, wonderful, comic, actor, new...
24396 [did, movie, makers, even, preview, released, ...
21914 [i, read, online, reviews, praising, obscure, ...
43988 [i, watched, movie, last, week, sometime, bigg...
Name: review, Length: 40000, dtype: object

19959 [as, environmentally, aware, films, soylent, g...


19643 [meltdown, opens, scene, scientists, preparing...
7476 [this, documentary, attempts, comedy, never, q...
17554 [once, get, nic, cage, playing, italian, soldi...
49620 [this, one, highlights, st, tng, semi, forgett...
...
31596 [i, like, good, novelty, song, no, i, take, ba...
4482 [overrated, show, television, believe, people,...
46476 [i, live, film, i, it, international, awards, ...
12621 [wow, one, greatest, movies, i, ever, ever, se...
22543 [i, saw, movie, i, young, it, time, favorite, ...
Name: review, Length: 10000, dtype: object

Sentiments
45547 1
46963 1
887 1
44061 0
26092 0
..
7223 0
7226 0
24396 0
21914 0
43988 0
Name: sentiment, Length: 40000, dtype: int64

19959 1
19643 0
7476 0
17554 1
49620 1
..
31596 0
4482 0
46476 0
12621 1
22543 1
Name: sentiment, Length: 10000, dtype: int64
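
Since train_test_split shuffles randomly, the indices shown above will differ between runs. Passing a fixed seed makes the 80/20 split reproducible (an optional tweak, not used in the run shown):

#Optional: fix the seed so the split is reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=42)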

#Function to get the maximum review length, computed as the ceiling of the mean length of the training reviews.

def get_max_length():
    review_length = []
    for review in x_train:
        review_length.append(len(review))
    return int(np.ceil(np.mean(review_length)))
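
Using the mean length as the cutoff means roughly half of the reviews get truncated. A percentile-based cutoff is a common alternative if that is a concern (a sketch, not part of the assignment code):

#Alternative: pad/truncate to the 90th-percentile length, so only the longest ~10% of reviews are cut.
def get_percentile_length(percentile=90):
    lengths = [len(review) for review in x_train]
    return int(np.percentile(lengths, percentile))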

#Encode the reviews.

token = Tokenizer(lower=False)
token.fit_on_texts(x_train)
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

max_length = get_max_length()

x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1

print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum review length: ', max_length)
OUTPUT:
Encoded X Train
[[ 8 46 35 ... 2081 6774 160]
[10947 20 86 ... 115 1602 749]
[ 39 534 20136 ... 0 0 0]
...
[ 1443 3 1111 ... 0 0 0]
[ 1 241 4819 ... 167 95 52]
[ 1 194 3 ... 0 0 0]]

Encoded X Test
[[ 108 37158 1807 ... 5550 1713 1360]
[17635 1989 57 ... 670 26087 1573]
[ 8 532 939 ... 0 0 0]
...
[ 1 323 4 ... 523 18 8]
[ 1225 5 709 ... 0 0 0]
[ 1 119 3 ... 0 0 0]]

Maximum review length: 130
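
A quick sanity check is to decode one encoded row back into words using the tokenizer's index_word mapping (a verification sketch; index 0 is padding and maps to no word):

#Decode the first training review to verify the encoding round-trips.
decoded = [token.index_word[idx] for idx in x_train[0] if idx != 0]
print(decoded[:10])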

#Architecture.

EMBED_DIM = 32
LSTM_OUT = 64

model = Sequential()
model.add(Embedding(total_words, EMBED_DIM, input_length = max_length))
model.add(LSTM(LSTM_OUT))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

print(model.summary())
OUTPUT:
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding (Embedding) (None, 130, 32) 2951200

lstm (LSTM) (None, 64) 24832

dense (Dense) (None, 1) 65

=================================================================
Total params: 2,976,097
Trainable params: 2,976,097
Non-trainable params: 0
_________________________________________________________________
None
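
The parameter counts in the summary can be checked by hand. The vocabulary size works out to 92,225 words here (2,951,200 / 32), and an LSTM has four gates, each with input, recurrent, and bias weights:

#Verify the summary's parameter counts (uses variables defined above).
embedding_params = total_words * EMBED_DIM                  # 92,225 * 32 = 2,951,200
lstm_params = 4 * ((EMBED_DIM + LSTM_OUT + 1) * LSTM_OUT)   # 4 * 97 * 64 = 24,832
dense_params = LSTM_OUT + 1                                 # 64 weights + 1 bias = 65
print(embedding_params + lstm_params + dense_params)        # 2,976,097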

#Training.

checkpoint = ModelCheckpoint(
    'models/LSTM.h5',
    monitor='accuracy',
    save_best_only=True,
    verbose=1
)

model.fit(x_train, y_train, batch_size=128, epochs=5, callbacks=[checkpoint])
OUTPUT:
Epoch 1/5
313/313 [==============================] - ETA: 0s - loss: 0.4913 - accuracy: 0.7289
Epoch 00001: accuracy improved from -inf to 0.72887, saving model to models/LSTM.h5
313/313 [==============================] - 59s 182ms/step - loss: 0.4913 - accuracy: 0.7289
Epoch 2/5
313/313 [==============================] - ETA: 0s - loss: 0.2207 - accuracy: 0.9206
Epoch 00002: accuracy improved from 0.72887 to 0.92060, saving model to models/LSTM.h5
313/313 [==============================] - 57s 182ms/step - loss: 0.2207 - accuracy: 0.9206
Epoch 3/5
313/313 [==============================] - ETA: 0s - loss: 0.1245 - accuracy: 0.9599
Epoch 00003: accuracy improved from 0.92060 to 0.95985, saving model to models/LSTM.h5
313/313 [==============================] - 57s 181ms/step - loss: 0.1245 - accuracy: 0.9599
Epoch 4/5
313/313 [==============================] - ETA: 0s - loss: 0.0753 - accuracy: 0.9782
Epoch 00004: accuracy improved from 0.95985 to 0.97817, saving model to models/LSTM.h5
313/313 [==============================] - 56s 180ms/step - loss: 0.0753 - accuracy: 0.9782
Epoch 5/5
313/313 [==============================] - ETA: 0s - loss: 0.0604 - accuracy: 0.9840
Epoch 00005: accuracy improved from 0.97817 to 0.98397, saving model to models/LSTM.h5
313/313 [==============================] - 56s 180ms/step - loss: 0.0604 - accuracy: 0.9840
<keras.callbacks.History at 0x7f84533a4890>
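
One caveat: the checkpoint monitors training accuracy, which keeps improving even once the model starts overfitting (note the jump from 0.73 to 0.98 in five epochs). Holding out a validation slice and checkpointing on it instead is a common variant (a sketch, not the run shown above):

#Variant: checkpoint on held-out validation accuracy instead of training accuracy.
checkpoint = ModelCheckpoint('models/LSTM.h5', monitor='val_accuracy', save_best_only=True, verbose=1)
model.fit(x_train, y_train, batch_size=128, epochs=5, validation_split=0.1, callbacks=[checkpoint])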
#Testing.

y_pred = model.predict(x_test, batch_size=128)
y_pred = np.round(y_pred).astype(int)

true = 0
for i, y in enumerate(y_test):
    if y == y_pred[i]:
        true += 1

print('Correct Prediction: {}'.format(true))
print('Wrong Prediction: {}'.format(len(y_pred) - true))
print('Accuracy: {}'.format(true/len(y_pred)*100))
OUTPUT:
Correct Prediction: 8609
Wrong Prediction: 1391
Accuracy: 86.09
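
The manual loop reproduces plain accuracy; Keras can report loss and accuracy directly, and scikit-learn adds per-class precision and recall (a sketch using the y_pred computed above):

#Fuller evaluation: Keras metrics plus a per-class breakdown.
from sklearn.metrics import classification_report
loss, acc = model.evaluate(x_test, y_test, batch_size=128)
print(classification_report(y_test, y_pred.flatten()))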

#Load saved model.

loaded_model = load_model('models/LSTM.h5')

review = str(input('Movie Review: '))
OUTPUT:
Movie Review: Nothing was typical about this. Everything was beautifully done in this movie, the story,
the flow, the scenario, everything. I highly recommend it for mystery lovers, for anyone who wants to
watch a good movie!

regex = re.compile(r'[^a-zA-Z\s]')
review = regex.sub('', review)
print('Cleaned: ', review)

words = review.split(' ')
filtered = [w for w in words if w not in english_stops]
filtered = ' '.join(filtered)
filtered = [filtered.lower()]

print('Filtered: ', filtered)
OUTPUT:
Cleaned: Nothing was typical about this Everything was beautifully
done in this movie the story the flow the scenario everything I highly
recommend it for mystery lovers for anyone who wants to watch a good
movie
Filtered: ['nothing typical everything beautifully done movie story
flow scenario everything i highly recommend mystery lovers anyone wants
watch good movie']

tokenize_words = token.texts_to_sequences(filtered)
tokenize_words = pad_sequences(tokenize_words, maxlen=max_length, padding='post', truncating='post')
print(tokenize_words)
OUTPUT:
[[ 76 671 172 1236 128 3 14 2668 2606 172 1 447 279 685
1764 152 395 34 9 3 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0]]

result = loaded_model.predict(tokenize_words)
print(result)
OUTPUT:
[[0.9950993]]

if result >= 0.7:
    print('positive')
else:
    print('negative')
OUTPUT:
positive
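
The inference steps above (clean, filter, tokenize, pad, predict) can be bundled into a single hypothetical helper for scoring new reviews (a sketch reusing the token, max_length, english_stops, and loaded_model objects from earlier; the 0.7 threshold matches the code above):

#Hypothetical helper: score a raw review string end to end.
def predict_sentiment(text, threshold=0.7):
    cleaned = re.sub(r'[^a-zA-Z\s]', '', text)
    filtered = ' '.join(w for w in cleaned.split() if w not in english_stops)
    seq = token.texts_to_sequences([filtered.lower()])
    seq = pad_sequences(seq, maxlen=max_length, padding='post', truncating='post')
    score = loaded_model.predict(seq)[0][0]
    return 'positive' if score >= threshold else 'negative'

print(predict_sentiment('A beautifully made, gripping film.'))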
