You are on page 1 of 6

SIMREN DUBEY 2019450014

Submission content should include

1. Perspective of the problem statement by the student. Unique perspective

Recommender system for online shopping

How does ‘Recommended for you’ work?

2. Input

Excel data: Online Retail.xlsx

3. Output

Recommendation of products

4. Logic to get output from the input

def similar_products(v, n=6):
    """Return the products most similar to the embedding vector ``v``.

    Args:
        v: Query vector in the trained Word2Vec embedding space.
        n: Number of similar products to return.

    Returns:
        A list of ``(description, similarity)`` tuples, at most ``n`` long.
    """
    # Ask for one extra neighbour and drop the first hit: when ``v`` is a
    # product's own vector, the nearest neighbour is presumably that product.
    # NOTE(review): for an averaged vector this also drops the best match --
    # confirm that is intended.
    # The lookup goes through ``model.wv``; the same-named wrapper on the
    # model object itself is deprecated in gensim 3.x and removed in 4.0.
    ms = model.wv.similar_by_vector(v, topn=n + 1)[1:]

    # products_dict maps a stock code to a list of descriptions; report the
    # first description together with the similarity score.
    return [(products_dict[code][0], score) for code, score in ms]

5. Code

import pandas as pd

import numpy as np

import random

from tqdm import tqdm

from gensim.models import Word2Vec

import matplotlib.pyplot as plt


SIMREN DUBEY 2019450014

import warnings

warnings.filterwarnings('ignore')

# Load the Online Retail transactions workbook.
df = pd.read_excel('C://Users//simra//Desktop//Practicals//DL//Online Retail.xlsx')

df.head()

df.shape

# check for missing values
df.isnull().sum()

# remove missing values
df.dropna(inplace=True)

# Stock codes are used as the tokens fed to Word2Vec, so force them to str.
df['StockCode'] = df['StockCode'].astype(str)

customers = df["CustomerID"].unique().tolist()

len(customers)

# shuffle customer IDs (unseeded, so the split differs from run to run)
random.shuffle(customers)

# keep the first 90% of the shuffled customer IDs for training
customers_train = customers[:round(0.9 * len(customers))]


SIMREN DUBEY 2019450014

# split data into train and validation set (membership mask computed once)
in_train = df['CustomerID'].isin(customers_train)

train_df = df[in_train]

validation_df = df[~in_train]

# Group the stock codes per customer in a single pass instead of filtering
# the whole frame once per customer; row order within a customer is kept.
train_history = train_df.groupby('CustomerID')['StockCode'].apply(list).to_dict()

# purchase history of the training customers, in customers_train order
purchases_train = [train_history[i] for i in tqdm(customers_train)]

val_history = validation_df.groupby('CustomerID')['StockCode'].apply(list).to_dict()

# purchase history of the validation customers, in order of first appearance
purchases_val = [val_history[i] for i in tqdm(validation_df['CustomerID'].unique())]

# train word2vec model on the purchase histories (one "sentence" per customer)
# sg=1 selects skip-gram; hs=0 with negative=10 trains with negative sampling.
model = Word2Vec(window=10, sg=1, hs=0,
                 negative=10,  # for negative sampling
                 alpha=0.03, min_alpha=0.0007,
                 seed=14)

model.build_vocab(purchases_train, progress_per=200)

model.train(purchases_train, total_examples=model.corpus_count,
            epochs=10, report_delay=1)

# Normalise the vectors in place; per the gensim docs the model should not be
# trained further after this call.
model.init_sims(replace=True)

print(model)

# extract all vectors
# Read the embedding matrix from model.wv: indexing the model object directly
# is deprecated in gensim 3.x and removed in 4.0. Rows follow the model's
# internal vocabulary index order.
X = model.wv.vectors

X.shape

import umap

# Reduce the product embeddings to two dimensions for plotting.
reducer = umap.UMAP(n_neighbors=30, min_dist=0.0, n_components=2, random_state=42)
cluster_embedding = reducer.fit_transform(X)

plt.figure(figsize=(10, 9))

# One small dot per product at its 2-D coordinates.
xs = cluster_embedding[:, 0]
ys = cluster_embedding[:, 1]
plt.scatter(xs, ys, s=3, cmap='Spectral')

# Build a de-duplicated product table as a new frame (keeping the last row per
# stock code) rather than dropping rows in place on a slice of train_df.
products = train_df[["StockCode", "Description"]].drop_duplicates(
    subset='StockCode', keep="last"
)

# create product-ID and product-description dictionary
# (each stock code maps to a list of descriptions)
products_dict = products.groupby('StockCode')['Description'].apply(list).to_dict()

def similar_products(v, n=6):
    """Return the products most similar to the embedding vector ``v``.

    Args:
        v: Query vector in the trained Word2Vec embedding space.
        n: Number of similar products to return.

    Returns:
        A list of ``(description, similarity)`` tuples, at most ``n`` long.
    """
    # Ask for one extra neighbour and drop the first hit: when ``v`` is a
    # product's own vector, the nearest neighbour is presumably that product.
    # NOTE(review): for an averaged vector this also drops the best match --
    # confirm that is intended.
    # The lookup goes through ``model.wv``; the same-named wrapper on the
    # model object itself is deprecated in gensim 3.x and removed in 4.0.
    ms = model.wv.similar_by_vector(v, topn=n + 1)[1:]

    # products_dict maps a stock code to a list of descriptions; report the
    # first description together with the similarity score.
    return [(products_dict[code][0], score) for code, score in ms]

# Recommendations for a single product, looked up by stock code through
# model.wv (indexing the model object directly is removed in gensim 4.0).
similar_products(model.wv['90019A'])

def aggregate_vectors(products):
    """Return the mean embedding of the given products.

    Args:
        products: Iterable of stock codes (e.g. one customer's purchases).

    Returns:
        The element-wise mean of the vectors of those products that are in
        the trained model's vocabulary.

    Raises:
        ValueError: If none of the products is in the vocabulary, since a
            mean over nothing would otherwise silently come out as NaN.
    """
    product_vec = []
    for i in products:
        try:
            # Look the vector up via model.wv; indexing the model object
            # directly is deprecated in gensim 3.x and removed in 4.0.
            product_vec.append(model.wv[i])
        except KeyError:
            # Product not in the model vocabulary: leave it out of the mean.
            continue

    if not product_vec:
        raise ValueError("none of the given products are in the model vocabulary")

    return np.mean(product_vec, axis=0)

# Work with the purchase history of the first validation customer.
first_history = purchases_val[0]

len(first_history)

aggregate_vectors(first_history).shape

# Recommendations based on the whole purchase history.
similar_products(aggregate_vectors(first_history))

# Recommendations based on only the last ten entries of that history.
similar_products(aggregate_vectors(first_history[-10:]))

print("end")
SIMREN DUBEY 2019450014

6. Observation

The softmax function transforms each element of a collection by computing the exponential of that element divided by the sum of the exponentials of all the elements. That is, if x is a one-dimensional numpy array:

softmax(x) = np.exp(x)/sum(np.exp(x))

You might also like