Lab - 3

In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
In [3]:
# movies.csv only carries movieId/title/genres, so movieId is the one numeric
# column to down-cast there; ratings.csv carries the userId/movieId/rating triple.
movies_df = pd.read_csv("movies.csv", dtype={'movieId': 'int32'})
rating_df = pd.read_csv(
    "ratings.csv",
    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'},
)
In [4]:
# Peek at the first five rows of the movies table.
movies_df.head(n=5)
Out[4]:
movieId title genres
0 1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy
1 2 Jumanji (1995) Adventure|Children|Fantasy
2 3 Grumpier Old Men (1995) Comedy|Romance
3 4 Waiting to Exhale (1995) Comedy|Drama|Romance
Father of the Bride Part II

4 5 Comedy
(1995)
In [5]:
# Peek at the first five rows of the ratings table.
rating_df.head(n=5)
Out[5]:
userId movieId rating timestamp
0 1 31 2.5 1260759144
1 1 1029 3.0 1260759179
2 1 1061 3.0 1260759182
3 1 1129 2.0 1260759185
4 1 1172 4.0 1260759205
In [6]:
# Attach title/genres to every rating row by joining on the shared movieId key.
df = rating_df.merge(movies_df, on='movieId')
df.head()
Out[6]:
userId movieId rating timestamp title genres
0 1 31 2.5 1260759144 Dangerous Minds (1995) Drama
In [7]:
# Drop rating rows that have no title, then count the ratings each title received.
combine_movie_rating = df.dropna(axis=0, subset=['title'])
ratings_per_title = combine_movie_rating.groupby('title')['rating'].count()
movie_ratingCount = ratings_per_title.rename('totalRatingCount').reset_index()
movie_ratingCount.head()
Out[7]:
title totalRatingCount
0 "Great Performances" Cats (1998) 2
1 $9.99 (2008) 3
2 'Hellboy': The Seeds of Creation (2004) 1
3 'Neath the Arizona Skies (1934) 1
4 'Round Midnight (1986) 2
In [8]:
# Left-join the per-title count back onto every rating row.
rating_with_totalRatingCount = pd.merge(
    combine_movie_rating,
    movie_ratingCount,
    how='left',
    on='title',
)
rating_with_totalRatingCount.head()
Out[8]:
userId movieId rating timestamp title genres totalRatingCount
0 1 31 2.5 1260759144 Dangerous Minds (1995) Drama 42
In [9]:
# Render floats with three decimals in pandas output from here on.
pd.set_option('display.float_format', lambda value: '%.3f' % value)

# Summary statistics of how many ratings each title has.
rating_count_summary = movie_ratingCount['totalRatingCount'].describe()
print(rating_count_summary)
count 9064.000
mean 11.033
std 24.053
min 1.000
25% 1.000
50% 3.000
75% 9.000
max 341.000
Name: totalRatingCount, dtype: float64
In [10]:
popularity_threshold = 50

# Keep only the rating rows whose title has at least `popularity_threshold` ratings.
is_popular = rating_with_totalRatingCount['totalRatingCount'] >= popularity_threshold
rating_popular_movie = rating_with_totalRatingCount.loc[is_popular]
rating_popular_movie.head()
Out[10]:
Dracula (Bram Stoker's Dracula)

351 1 1339 3.500 1260759125 Fantasy|Horror|Romance|Thriller 52
(1992)

(1992)

(1992)
(1992)

(1992)
In [11]:
# (rows, columns) of the popularity-filtered ratings table.
rating_popular_movie.shape
Out[11]:
(43083, 7)
In [12]:
# Build the title x userId rating matrix; a (title, user) pair with no rating becomes 0.
movie_features_df = (
    rating_popular_movie
    .pivot_table(values='rating', index='title', columns='userId')
    .fillna(0)
)
movie_features_df.head()
Out[12]:
userId 1 2 3 4 5 6 7 8 9 10 ... 662 663 664 665 666 667 668 669
title
10
Things I
Hate
0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 ... 0.000 0.000 0.000 3.000 0.000 0.000 0.000 0.000
About
You
(1999)
12
Angry
0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 ... 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000
Men
(1957)
2001: A
Space
0.000 0.000 0.000 0.000 0.000 0.000 4.000 0.000 0.000 0.000 ... 0.000 0.000 4.000 0.000 0.000 0.000 0.000 0.000
Odyssey
(1968)
28 Days
Later 0.000 0.000 0.000 0.000 4.000 0.000 0.000 0.000 0.000 0.000 ... 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000
(2002)
300
0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 ... 0.000 0.000 4.000 0.000 0.000 0.000 0.000 0.000
(2007)
5 rows × 670 columns
In [13]:
# Wrap the title x userId matrix in CSR form before handing it to the neighbour search.
movie_features_df_matrix = csr_matrix(movie_features_df.values)

# Brute-force neighbour search using cosine distance between title rating vectors.
model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
model_knn.fit(movie_features_df_matrix)
Out[13]:
▾ NearestNeighbors
NearestNeighbors(algorithm='brute', metric='cosine')
In [14]:
# (rows, columns) of the pivot matrix: one row per title, one column per userId.
movie_features_df.shape
Out[14]:
(453, 670)
In [15]:
# Row position in movie_features_df of the title to get recommendations for.
# Pinned rather than drawn at random so that a re-run reproduces the same
# recommendations and the printed value is the index actually queried.
query_index = 2
print(query_index)
In [16]:
# Ask for 6 neighbours of the query row, passed as a single-row 2-D array.
query_vector = movie_features_df.iloc[[query_index]].to_numpy()
distances, indices = model_knn.kneighbors(query_vector, n_neighbors=6)
In [17]:
# Re-display the first five rows of the pivot matrix for reference.
movie_features_df.head(n=5)
Out[17]:
userId 1 2 3 4 5 6 7 8 9 10 ... 662 663 664 665 666 667 668 669
title
10
Things I
Hate
0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 ... 0.000 0.000 0.000 3.000 0.000 0.000 0.000 0.000
About
You
(1999)
12
Angry
0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 ... 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000
Men
(1957)
2001: A
Space
0.000 0.000 0.000 0.000 0.000 0.000 4.000 0.000 0.000 0.000 ... 0.000 0.000 4.000 0.000 0.000 0.000 0.000 0.000
Odyssey
(1968)
28 Days
Later 0.000 0.000 0.000 0.000 4.000 0.000 0.000 0.000 0.000 0.000 ... 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000
(2002)
300
0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 ... 0.000 0.000 4.000 0.000 0.000 0.000 0.000 0.000
(2007)
5 rows × 670 columns
In [18]:
# Flatten the (1, k) kneighbors results once instead of on every iteration.
neighbor_distances = distances.flatten()
neighbor_positions = indices.flatten()

# Header names the queried title; it is only printed when there is at least one result.
if len(neighbor_distances) > 0:
    print('Recommendations for {0}:\n '.format(movie_features_df.index[query_index]))

# Entry 0 is skipped (it is reported via the header above); the rest are numbered from 1.
for rank, (position, distance) in enumerate(
    zip(neighbor_positions[1:], neighbor_distances[1:]), start=1
):
    print('{0}: {1}, with distance of {2}:'.format(rank, movie_features_df.index[position], distance))
Recommendations for 2001: A Space Odyssey (1968):
1: Clockwork Orange, A (1971), with distance of 0.4421120882034302:

2: Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964), with dist
ance of 0.44667208194732666:
3: Blade Runner (1982), with distance of 0.45668184757232666:
4: Alien (1979), with distance of 0.4607759714126587:
5: One Flew Over the Cuckoo's Nest (1975), with distance of 0.49276071786880493:
In [ ]:
In [ ]:

Lab - 3

Uploaded by

Document Information

Original Description:

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

Lab - 3

Uploaded by

Copyright:

Available Formats

In [1]:

movieId title genres

0 1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy

1 2 Jumanji (1995) Adventure|Children|Fantasy

2 3 Grumpier Old Men (1995) Comedy|Romance

3 4 Waiting to Exhale (1995) Comedy|Drama|Romance

Father of the Bride Part II

userId movieId rating timestamp

1 1 1029 3.0 1260759179

2 1 1061 3.0 1260759182

3 1 1129 2.0 1260759185

4 1 1172 4.0 1260759205

userId movieId rating timestamp title genres

0 1 31 2.5 1260759144 Dangerous Minds (1995) Drama

1 7 31 3.0 851868750 Dangerous Minds (1995) Drama

2 31 31 4.0 1273541953 Dangerous Minds (1995) Drama

3 32 31 4.0 834828440 Dangerous Minds (1995) Drama

4 36 31 3.0 847057202 Dangerous Minds (1995) Drama

0 "Great Performances" Cats (1998) 2

2 'Hellboy': The Seeds of Creation (2004) 1

3 'Neath the Arizona Skies (1934) 1

4 'Round Midnight (1986) 2

userId movieId rating timestamp title genres totalRatingCount

0 1 31 2.5 1260759144 Dangerous Minds (1995) Drama 42

1 7 31 3.0 851868750 Dangerous Minds (1995) Drama 42

2 31 31 4.0 1273541953 Dangerous Minds (1995) Drama 42

3 32 31 4.0 834828440 Dangerous Minds (1995) Drama 42

4 36 31 3.0 847057202 Dangerous Minds (1995) Drama 42

userId movieId rating timestamp title genres totalRatingCount

Dracula (Bram Stoker's Dracula)

Dracula (Bram Stoker's Dracula)

Dracula (Bram Stoker's Dracula)

Dracula (Bram Stoker's Dracula)

5 rows × 670 columns

from sklearn.neighbors import NearestNeighbors

model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')

5 rows × 670 columns

Recommendations for 2001: A Space Odyssey (1968):

1: Clockwork Orange, A (1971), with distance of 0.4421120882034302:

You might also like