
In [1]: import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
import warnings
warnings.filterwarnings('ignore')

Bag of words
In [2]: text = ['Hi, how are you you?',
                'I am Fine. You?']
print("Suppose this is our text data:", text)

Suppose this is our text data: ['Hi, how are you you?', 'I am Fine. You?']

In [3]: bow_converter = CountVectorizer()
bow_converter.fit(text)
words = bow_converter.get_feature_names()
print("Bag-of-words vocabulary:", words)

Bag-of-words vocabulary: ['am', 'are', 'fine', 'hi', 'how', 'you']

In [5]: features = bow_converter.transform(text).toarray()
print(features)

[[0 1 0 1 1 2]
[1 0 1 0 0 1]]
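The columns of this array follow the sorted vocabulary above. As a quick sanity check (a sketch, not part of the original notebook), the fitted vectorizer's vocabulary_ attribute maps each token to its column index; note that in scikit-learn >= 1.2, get_feature_names() has been removed in favor of get_feature_names_out():

for word, index in sorted(bow_converter.vocabulary_.items(), key=lambda kv: kv[1]):
    print(index, word)   # 0 am, 1 are, 2 fine, 3 hi, 4 how, 5 you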

In [7]: text

Out[7]: ['Hi, how are you you?', 'I am Fine. You?']

In [6]: frequency_matrix = pd.DataFrame(features, index=text,
                                        columns=bow_converter.get_feature_names())
frequency_matrix

Out[6]:                       am  are  fine  hi  how  you
        Hi, how are you you?   0    1     0   1    1    2
        I am Fine. You?        1    0     1   0    0    1
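The same counts can be reproduced by hand by mimicking CountVectorizer's defaults: lowercase the text and keep tokens of two or more word characters (which is why single-character tokens like 'I' never enter the vocabulary). A minimal sketch, assuming the default token pattern:

import re
from collections import Counter

token_pattern = re.compile(r'\b\w\w+\b')   # CountVectorizer's default pattern keeps 2+ character tokens
for doc in text:
    print(doc, '->', Counter(token_pattern.findall(doc.lower())))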

Bag of N-grams
In [6]: text = ['Hi, how are you you?',
                'I am Fine. You?']
print("Suppose this is our text data:", text)

Suppose this is our text data: ['Hi, how are you you?', 'I am Fine. You?']

In [9]: # help(CountVectorizer)

In [10]: bigram_converter = CountVectorizer(ngram_range=(1, 2))
bigram_converter.fit(text)
bigrams = bigram_converter.get_feature_names()
print("Unigrams and bigrams:", bigrams)

Unigrams and bigrams: ['am', 'am fine', 'are', 'are you', 'fine', 'fine you', 'hi', 'hi how', 'how', 'how are', 'you', 'you you']

In [11]: features = bigram_converter.transform(text).toarray()

# present in a dataframe
frequency_matrix = pd.DataFrame(features, index=text, columns=bigrams)
frequency_matrix

Out[11]:                       am  am fine  are  are you  fine  fine you  hi  hi how  how  how are  you  you you
         Hi, how are you you?   0        0    1        1     0         0   1       1    1        1    2        1
         I am Fine. You?        1        1    0        0     1         1   0       0    0        0    1        0
In [12]: print(f'{words}\nunigram count: {len(words)} \n{bigrams} \nbigram count: {len(bigrams)}')

['am', 'are', 'fine', 'hi', 'how', 'you']
unigram count: 6
['am', 'am fine', 'are', 'are you', 'fine', 'fine you', 'hi', 'hi how', 'how', 'how are', 'you', 'you you']
bigram count: 12
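The vocabulary keeps growing with the order of n. A small sketch extending the same idea to trigrams only (ngram_range=(3, 3) excludes shorter n-grams):

trigram_converter = CountVectorizer(ngram_range=(3, 3))
trigram_converter.fit(text)
print(trigram_converter.get_feature_names())   # e.g. 'hi how are', 'how are you', 'are you you', 'am fine you'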

Bag of words vs. Bag of N-grams


In [13]: import json

# Load the first 10,000 Yelp reviews
file = open('data/yelp_academic_dataset_review.json')
json_file = []

for i in range(10000):
    json_file.append(json.loads(file.readline()))
file.close()

review_df = pd.DataFrame(json_file)
review_df.head()

Out[13]:                                   votes                 user_id               review_id  stars        date                                               text    type             business_id
         0  {'funny': 0, 'useful': 5, 'cool': 2}  rLtl8ZkDX5vH5nAx9C3q5Q  fWKvX83p0-ka4JS3dc6E5A      5  2011-01-26  My wife took me here on my birthday for breakf...  review  9yKzy9PApeiPPOUJEtnvkg
         1  {'funny': 0, 'useful': 0, 'cool': 0}  0a2KyEL0d3Yb1V6aivbIuQ  IjZ33sJrzXqU-0X6U8NwyA      5  2011-07-27  I have no idea why some people give bad review...  review  ZRJwVLyzEJq1VAihDhYiow
         2  {'funny': 0, 'useful': 1, 'cool': 0}  0hT2KtfLiobPvh6cDC8JQg  IESLBzqUCLdSzSqm0eCSxQ      4  2012-06-14  love the gyro plate. Rice is so good and I als...  review  6oRAC4uyJCsJl1X0WZpVSA
         3  {'funny': 0, 'useful': 2, 'cool': 1}  uZetl9T0NcROGOyFfughhg  G-WvGaISbqqaMHlNnByodA      5  2010-05-27  Rosie, Dakota, and I LOVE Chaparral Dog Park!!...  review  _1QQZuf4zZOyFCvXc0o6Vg
         4  {'funny': 0, 'useful': 0, 'cool': 0}  1uJFq2r5QfJG_6ExMRCaGw  vYmM4KTsC8ZfQBg-j5MWkw      5  2012-01-05  General Manager Scott Petello is a good egg!!!...  review  6ozycU1RpktNG2-1BroVtw

In [14]: bow_converter = CountVectorizer()
bigram_converter = CountVectorizer(ngram_range=(2,2))

bow_converter.fit(review_df['text'])
words = bow_converter.get_feature_names()

bigram_converter.fit(review_df['text'])
bigrams = bigram_converter.get_feature_names()

In [15]: # vocabulary size
print(f'Unigram size : {len(words)} \nBigram size : {len(bigrams)} \
\n{int(len(bigrams)/len(words))} times bigger vocabulary')

Unigram size : 29185
Bigram size : 385638
13 times bigger vocabulary
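Because most bigrams occur in only a handful of reviews, the larger vocabulary is also much sparser. A sketch (assuming the converters fitted above) comparing the density of the two feature matrices:

X_uni = bow_converter.transform(review_df['text'])
X_bi = bigram_converter.transform(review_df['text'])
for name, X in [('unigram', X_uni), ('bigram', X_bi)]:
    density = X.nnz / (X.shape[0] * X.shape[1])   # fraction of non-zero entries
    print(f'{name}: shape={X.shape}, density={density:.5f}')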

Tf-Idf (Term frequency–Inverse document frequency)

https://stackoverflow.com/questions/42440621/how-term-frequency-is-calculated-in-tfidfvectorizer
In [16]: from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
text = ['it is a puppy',
        'it is a cat',
        'it is a kitten',
        'that is a dog and this is a pen']
tfidf = TfidfVectorizer()
tfidf.fit(text)

terms = tfidf.get_feature_names()
terms

Out[16]: ['and', 'cat', 'dog', 'is', 'it', 'kitten', 'pen', 'puppy', 'that', 'this']

In [17]: features = tfidf.transform(text).toarray()
print(features)

frequency_matrix = pd.DataFrame(features, index=text, columns=terms)
frequency_matrix

[[0.         0.         0.         0.40264194 0.49248889 0.
  0.         0.77157901 0.         0.        ]
 [0.         0.77157901 0.         0.40264194 0.49248889 0.
  0.         0.         0.         0.        ]
 [0.         0.         0.         0.40264194 0.49248889 0.77157901
  0.         0.         0.         0.        ]
 [0.4052446  0.         0.4052446  0.42294689 0.         0.
  0.4052446  0.         0.4052446  0.4052446 ]]

Out[17]:                                       and       cat       dog        is        it    kitten       pen     puppy      that      this
         it is a puppy                    0.000000  0.000000  0.000000  0.402642  0.492489  0.000000  0.000000  0.771579  0.000000  0.000000
         it is a cat                      0.000000  0.771579  0.000000  0.402642  0.492489  0.000000  0.000000  0.000000  0.000000  0.000000
         it is a kitten                   0.000000  0.000000  0.000000  0.402642  0.492489  0.771579  0.000000  0.000000  0.000000  0.000000
         that is a dog and this is a pen  0.405245  0.000000  0.405245  0.422947  0.000000  0.000000  0.405245  0.000000  0.405245  0.405245

In [18]: import math

# Classic tf-idf for the word 'puppy': it occurs once among the 4 tokens of
# its document, and appears in 1 of the 4 documents.
tf = 1/4
idf = math.log(4/1)

print(f'Tf-Idf value for word/term puppy: {tf*idf}. But the real-life implementation differs.')

Tf-Idf value for word/term puppy: 0.34657359027997264. But the real-life implementation differs.
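The difference comes from scikit-learn's defaults: TfidfVectorizer uses raw counts as tf, a smoothed idf of ln((1 + n) / (1 + df)) + 1, and L2-normalizes each document row (smooth_idf=True, norm='l2'). A sketch reproducing the 0.771579 entry for 'puppy' in the first document:

import numpy as np

n_docs = 4
def smoothed_idf(df):
    # scikit-learn's default (smooth_idf=True) idf formula
    return np.log((1 + n_docs) / (1 + df)) + 1

# 'it is a puppy' keeps three tokens ('a' is dropped by the default
# 2-character token pattern): 'is' (df=4), 'it' (df=3), 'puppy' (df=1).
# Raw counts are all 1, so the row is just the idf values, L2-normalized.
row = np.array([smoothed_idf(4), smoothed_idf(3), smoothed_idf(1)])
row = row / np.linalg.norm(row)
print(row)   # -> [0.40264194 0.49248889 0.77157901], matching Out[17]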

Simple comparison (for self study)


In [32]: import json
import pandas as pd

Prepare Data

In [21]: # data.rar
# Load Yelp business data
biz_f = open('data/yelp_academic_dataset_business.json')
biz_df = pd.DataFrame([json.loads(x) for x in biz_f.readlines()])
biz_f.close()
biz_df.head()

Out[21]:              business_id                                       full_address  open                                          categories         city  review_count                          name neighborhoods   longitude state  stars   latitude  type
         0  rncjoVoEFUJGCUoC1JgnUA         8466 W Peoria Ave\nSte 6\nPeoria, AZ 85345  True  [Accountants, Professional Services, Tax Servi...       Peoria             3     Peoria Income Tax Service            []  -112.241596    AZ    5.0  33.581867  bu...
         1  0FNFSzCFP_rGUoJx8W7tJg                  2149 W Wood Dr\nPhoenix, AZ 85029  True                   [Sporting Goods, Bikes, Shopping]      Phoenix             5                   Bike Doctor            []  -112.105933    AZ    5.0  33.604054  bu...
         2  3f_lyB6vFK48ukH6ScvLHg              1134 N Central Ave\nPhoenix, AZ 85004  True                                                  []      Phoenix             4  Valley Permaculture Alliance            []  -112.073933    AZ    5.0  33.460526  bu...
         3  usAsSV36QmUej8--yvN-dg              845 W Southern Ave\nPhoenix, AZ 85041  True                                     [Food, Grocery]      Phoenix             5                     Food City            []  -112.085377    AZ    3.5  33.392210  bu...
         4  PzOqRohWw7F7YEPBz6AubA  6520 W Happy Valley Rd\nSte 101\nGlendale Az, ...  True                  [Food, Bagels, Delis, Restaurants]  Glendale Az            14             Hot Bagels & Deli            []  -112.200264    AZ    3.5  33.712797  bu...

In [34]: # Load Yelp reviews data
review_file = open('data/yelp_academic_dataset_review.json')
review_df = pd.DataFrame([json.loads(x) for x in review_file.readlines()])
review_file.close()
review_df.head()

Out[34]:                                   votes                 user_id               review_id  stars        date                                               text    type             business_id
         0  {'funny': 0, 'useful': 5, 'cool': 2}  rLtl8ZkDX5vH5nAx9C3q5Q  fWKvX83p0-ka4JS3dc6E5A      5  2011-01-26  My wife took me here on my birthday for breakf...  review  9yKzy9PApeiPPOUJEtnvkg
         1  {'funny': 0, 'useful': 0, 'cool': 0}  0a2KyEL0d3Yb1V6aivbIuQ  IjZ33sJrzXqU-0X6U8NwyA      5  2011-07-27  I have no idea why some people give bad review...  review  ZRJwVLyzEJq1VAihDhYiow
         2  {'funny': 0, 'useful': 1, 'cool': 0}  0hT2KtfLiobPvh6cDC8JQg  IESLBzqUCLdSzSqm0eCSxQ      4  2012-06-14  love the gyro plate. Rice is so good and I als...  review  6oRAC4uyJCsJl1X0WZpVSA
         3  {'funny': 0, 'useful': 2, 'cool': 1}  uZetl9T0NcROGOyFfughhg  G-WvGaISbqqaMHlNnByodA      5  2010-05-27  Rosie, Dakota, and I LOVE Chaparral Dog Park!!...  review  _1QQZuf4zZOyFCvXc0o6Vg
         4  {'funny': 0, 'useful': 0, 'cool': 0}  1uJFq2r5QfJG_6ExMRCaGw  vYmM4KTsC8ZfQBg-j5MWkw      5  2012-01-05  General Manager Scott Petello is a good egg!!!...  review  6ozycU1RpktNG2-1BroVtw

In [20]: review_df['text']

Out[20]: 0       My wife took me here on my birthday for breakf...
         1       I have no idea why some people give bad review...
         2       love the gyro plate. Rice is so good and I als...
         3       Rosie, Dakota, and I LOVE Chaparral Dog Park!!...
         4       General Manager Scott Petello is a good egg!!!...
                                       ...
         9995    First visit...Had lunch here today - used my G...
         9996    Should be called house of deliciousness!\n\nI ...
         9997    I recently visited Olive and Ivy for business ...
         9998    My nephew just moved to Scottsdale recently so...
         9999    4-5 locations.. all 4.5 star average.. I think...
         Name: text, Length: 10000, dtype: object

In [22]: biz_df.columns, review_df.columns

Out[22]: (Index(['business_id', 'full_address', 'open', 'categories', 'city',
                 'review_count', 'name', 'neighborhoods', 'longitude', 'state', 'stars',
                 'latitude', 'type'],
                dtype='object'),
          Index(['votes', 'user_id', 'review_id', 'stars', 'date', 'text', 'type',
                 'business_id'],
                dtype='object'))
In [23]: biz_df['categories']

Out[23]: 0        [Accountants, Professional Services, Tax Servi...
         1        [Sporting Goods, Bikes, Shopping]
         2        []
         3        [Food, Grocery]
         4        [Food, Bagels, Delis, Restaurants]
                  ...
         11532    [Mexican, Restaurants]
         11533    [Mexican, Restaurants]
         11534    [Food, Grocery]
         11535    [Greek, Mediterranean, Restaurants]
         11536    [Print Media, Mass Media]
         Name: categories, Length: 11537, dtype: object

In [24]: # Pull out only Nightlife and Restaurants businesses
two_biz = biz_df[biz_df.apply(lambda x: 'Nightlife' in x['categories'] or
                                        'Restaurants' in x['categories'], axis=1)]

# Join with the reviews to get all reviews on the two types of business
two_biz_reviews = two_biz.merge(review_df, on='business_id', how='inner')

# Trim away the features we won't use
two_biz_reviews = two_biz_reviews[['business_id', 'text', 'categories']]

# Create the target column: True for Nightlife businesses, False otherwise
two_biz_reviews['target'] = two_biz_reviews.apply(lambda x: 'Nightlife' in x['categories'], axis=1)
two_biz_reviews

Out[24]: business_id text categories target

0 JxVGJ9Nly2FFIs_WpJvkug They built a Sauce in Minneapolis & it only la... [Pizza, Restaurants] False

1 JxVGJ9Nly2FFIs_WpJvkug I was pleasantly surprised by Sauce. We went h... [Pizza, Restaurants] False

2 JxVGJ9Nly2FFIs_WpJvkug I was very disappointed my last experience at ... [Pizza, Restaurants] False

3 JxVGJ9Nly2FFIs_WpJvkug Fun, Fast, Easy, Yummy. Nice Flatbread style ... [Pizza, Restaurants] False

4 Jj7bcQ6NDfKoz4TXwvYfMg Pros... Quick, good, cooked right, self serve ... [Burgers, Restaurants] False

... ... ... ... ...

7207 QzXFdjIbFRGhzL83goPPLA ordered the steak sandwich (medium rare). Cam... [Asian Fusion, Restaurants] False

7208 QzXFdjIbFRGhzL83goPPLA Good food when it's slow. Not so good when sup... [Asian Fusion, Restaurants] False

7209 QzXFdjIbFRGhzL83goPPLA Good service. The waitress was friendly. but t... [Asian Fusion, Restaurants] False

7210 GZ8KctCJxGzYZ7aAdapprg Not recommended if you're not white. For me, i... [Active Life, Amusement Parks, Nightlife, Bowl... True

7211 F3tqTcfKnljJcSyyqN0bbw Great food, clean a bit old but nice [Mexican, Restaurants] False

7212 rows × 4 columns

In [25]: print(two_biz_reviews.target.value_counts())
# two_biz_reviews.head()

False 5899
True 1313
Name: target, dtype: int64

In [26]: nightlife = two_biz_reviews[two_biz_reviews.apply(lambda x: 'Nightlife' in x['categories'], axis=1)]
restaurants = two_biz_reviews[two_biz_reviews.apply(lambda x: 'Restaurants' in x['categories'], axis=1)]
print(f'nightlife : {nightlife.shape} \nrestaurants : {restaurants.shape}')

nightlife : (1313, 4)
restaurants : (6911, 4)

In [28]: # Fraction of restaurant reviews needed to roughly match 90% of the nightlife reviews
(1313*.9)/6911

Out[28]: 0.17098827955433368

In [29]: nightlife_subset = nightlife.sample(frac=.9, random_state=123)
restaurant_subset = restaurants.sample(frac=0.17, random_state=123)

print(nightlife_subset.shape, restaurant_subset.shape)
combined = pd.concat([nightlife_subset, restaurant_subset])
combined.shape

(1182, 4) (1175, 4)

Out[29]: (2357, 4)
Data Split

In [30]: from sklearn.model_selection import train_test_split

# Split into training and test datasets
training_data, test_data = train_test_split(combined, train_size=0.7, random_state=123)
training_data.shape, test_data.shape

Out[30]: ((1649, 4), (708, 4))
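Note that train_test_split shuffles but does not stratify by default, so the True/False ratio can drift a little between the two splits. A minimal sketch of a stratified variant, using the same arguments as above:

training_data, test_data = train_test_split(
    combined, train_size=0.7, random_state=123,
    stratify=combined['target'])   # keep the class ratio identical in both splits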

Data Representation

In [31]: # Represent the review text as a bag-of-words
bow_transform = CountVectorizer()
X_tr_bow = bow_transform.fit_transform(training_data['text'])
X_te_bow = bow_transform.transform(test_data['text'])

print(len(bow_transform.vocabulary_))

# target data
y_tr = training_data['target']
y_te = test_data['target']

# Create the tf-idf representation using the bag-of-words matrix
tfidf_trfm = TfidfTransformer(norm=None)
X_tr_tfidf = tfidf_trfm.fit_transform(X_tr_bow)
X_te_tfidf = tfidf_trfm.transform(X_te_bow)

11478
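The two-step CountVectorizer + TfidfTransformer pipeline above is equivalent to a single TfidfVectorizer with the same options. A minimal sketch, assuming the same training_data and test_data:

tfidf_vectorizer = TfidfVectorizer(norm=None)
X_tr_tfidf_direct = tfidf_vectorizer.fit_transform(training_data['text'])
X_te_tfidf_direct = tfidf_vectorizer.transform(test_data['text'])
# X_tr_tfidf_direct should match X_tr_tfidf up to floating-point noise.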

Applying Machine Learning

In [32]: from sklearn.linear_model import LogisticRegression

def simple_logistic_classify(X_tr, y_tr, X_test, y_test, description):
    ### Helper function to train a logistic classifier and score on test data
    m = LogisticRegression().fit(X_tr, y_tr)
    s = m.score(X_test, y_test)
    print('Test score with', description, 'features:', s)
    return m

m1 = simple_logistic_classify(X_tr_bow, y_tr, X_te_bow, y_te, 'bow')
m3 = simple_logistic_classify(X_tr_tfidf, y_tr, X_te_tfidf, y_te, 'tfidf')

Test score with bow features: 0.7358757062146892
Test score with tfidf features: 0.730225988700565

Fine-Tuning

In [33]: import sklearn.model_selection as model

# Specify a search grid, then do a 5-fold grid search for each of the feature sets
# (scientific notation: 1e-5 = 0.00001, 1e2 = 100)
params = [1e-5, 1e-4, 1e-3, 1e-1, 1e2]
param_grid_ = {'C': params}

# Tune classifier for bag-of-words representation
print('using BoW:')
bow_search = model.GridSearchCV(LogisticRegression(), cv=5, param_grid=param_grid_, verbose=2, n_jobs=3)
bow_search.fit(X_tr_bow, y_tr)

# Tune classifier for tf-idf
print('using tf-idf:')
tfidf_search = model.GridSearchCV(LogisticRegression(), cv=5, param_grid=param_grid_, verbose=2, n_jobs=3)
tfidf_search.fit(X_tr_tfidf, y_tr)

using BoW:
Fitting 5 folds for each of 5 candidates, totalling 25 fits
using tf-idf:
Fitting 5 folds for each of 5 candidates, totalling 25 fits

Out[33]: GridSearchCV(cv=5, estimator=LogisticRegression(), n_jobs=3,
                      param_grid={'C': [1e-05, 0.0001, 0.001, 0.1, 100.0]}, verbose=2)
In [34]: search_results = pd.DataFrame.from_dict({'bow': bow_search.cv_results_['mean_test_score'],
                                                  'tfidf': tfidf_search.cv_results_['mean_test_score'],
                                                  'Inverse of regularization strength (C)': params})
search_results

Out[34]: bow tfidf Inverse of regularization strength (C)

0 0.579140 0.579140 0.00001

1 0.579140 0.605828 0.00010

2 0.600372 0.763500 0.00100

3 0.752578 0.767754 0.10000

4 0.728319 0.738637 100.00000
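The winning setting and a final held-out evaluation can be read off the fitted search objects. A sketch; note GridSearchCV refits the best model on the full training set by default (refit=True):

print('BoW best C:', bow_search.best_params_['C'])
print('tf-idf best C:', tfidf_search.best_params_['C'])

# best_estimator_ is the refit model, so it can be scored on the test set directly
print('BoW test score:', bow_search.best_estimator_.score(X_te_bow, y_te))
print('tf-idf test score:', tfidf_search.best_estimator_.score(X_te_tfidf, y_te))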

In [ ]:
