0% found this document useful (0 votes)
18 views · 3 pages

Movie Rating Prediction with NLP

Uploaded by

Rishubh Gandhi
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
18 views · 3 pages

Movie Rating Prediction with NLP

Uploaded by

Rishubh Gandhi
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd

Experiment 7

Code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the ratings sample and take a first look at it.
data = pd.read_csv('ratings_sample.csv')
data.head()
# '+' is a regex quantifier, so request a literal replacement explicitly
# instead of relying on the pandas-version-dependent default for `regex`.
data['movie_id'] = data['movie_id'].str.replace('+', ' ', regex=False)
data.describe()
data.info()
# Count missing values per column, drop every row that has any, then
# re-check the counts and the frame summary.
data.isnull().sum()
data = data.dropna()
data.isnull().sum()
data.info()
# Assign unique integer IDs to each distinct movie, production company
# and production country (pd.factorize returns (codes, uniques); only the
# integer codes are kept).
data['movie_id'] = pd.factorize(data['movie_id'])[0]
data['production_companies'] = pd.factorize(data['production_companies'])[0]
data['production_countries'] = pd.factorize(data['production_countries'])[0]
import nltk
# Fetch every NLTK resource the preprocessing below relies on.
# 'punkt' / 'punkt_tab' back word_tokenize (which name is required depends
# on the installed NLTK release); without them tokenization raises
# LookupError on a fresh environment.
for nltk_resource in ('stopwords', 'wordnet', 'omw-1.4', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Initialize WordNet lemmatizer and the English stop-word set
# (a set gives O(1) membership tests while filtering tokens).
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
# Function to preprocess text
def preprocess_text(text):
    """Normalize a raw text string into space-joined lemmas.

    Processing steps, in order:
      1. remove HTML-like tags (``<...>``),
      2. replace every non-letter character with a space,
      3. lowercase,
      4. tokenize with ``word_tokenize``,
      5. drop English stop words and lemmatize the remaining tokens.

    Uses the module-level ``lemmatizer`` and ``stop_words`` objects.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text as a single space-separated string. Non-string
        input (for example a missing value such as ``None`` or NaN) yields
        an empty string instead of raising inside ``re.sub``.
    """
    if not isinstance(text, str):
        return ''
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = text.lower()
    words = word_tokenize(text)
    # Stop words are filtered before lemmatizing, i.e. on the surface form.
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    processed_text = ' '.join(words)
    return processed_text
# Apply preprocessing to the 'overview' column
# Overwrite each overview with its cleaned, lemmatized form.
data['overview'] = data['overview'].apply(preprocess_text)
from sklearn.feature_extraction.text import TfidfVectorizer
# Learn a TF-IDF vocabulary (capped via max_features=1000) from the cleaned
# overviews, then encode those same overviews with it.
tfidf_vectorizer = TfidfVectorizer(max_features=1000).fit(data['overview'])
overview_features = tfidf_vectorizer.transform(data['overview'])
# Dense array copy of the TF-IDF matrix.
overview_features_array = overview_features.toarray()
# One-hot encode the genres: one 0/1 column per distinct genre token.
# split() without an explicit separator collapses runs of whitespace, so
# leading/trailing/double spaces never produce an empty-string "genre".
genres_list = data['genres'].str.split()
# Get unique genres
unique_genres = set(genre for sublist in genres_list for genre in sublist)
# Iterate in sorted order so the new columns are added deterministically.
for genre in sorted(unique_genres):
    # Exact token membership: unlike str.contains, this neither interprets
    # the genre name as a regex nor matches it inside a longer genre name.
    data[genre] = genres_list.apply(lambda tokens, genre=genre: int(genre in tokens))
# The raw genres string is no longer needed once the flags exist.
data.drop('genres', axis=1, inplace=True)
data.info()

from sklearn.model_selection import train_test_split


from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# Split the cleaned text *before* vectorizing so that the TF-IDF vocabulary
# and IDF weights are learned from the training rows only; fitting on the
# whole column would leak information from the test rows into the features.
overview_train, overview_test, y_train, y_test = train_test_split(
    data['overview'], data['rating'], test_size=0.2, random_state=42
)
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train = tfidf_vectorizer.fit_transform(overview_train)
# transform (not fit_transform): reuse the vocabulary learned on the train split.
X_test = tfidf_vectorizer.transform(overview_test)
# Fit an ordinary least-squares model on the TF-IDF features and report the
# mean squared error on the held-out 20 %.
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
y_pred = lr_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)
Output:

You might also like