
In [1]: import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
import warnings
warnings.filterwarnings('ignore')

Bag of words
In [2]: text = ['Hi, how are you you?',
                'I am Fine. You?']
print("Suppose this is our text data:", text)

Suppose this is our text data: ['Hi, how are you you?', 'I am Fine. You?']

In [3]: bow_converter = CountVectorizer()
bow_converter.fit(text)
words = bow_converter.get_feature_names()
print("Bag-of-words vocabulary:", words)

Bag-of-words vocabulary: ['am', 'are', 'fine', 'hi', 'how', 'you']

In [5]: features = bow_converter.transform(text).toarray()
print(features)

[[0 1 0 1 1 2]
[1 0 1 0 0 1]]
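The columns of this array follow the sorted vocabulary above. As a quick sanity check (a sketch, not part of the original notebook), the fitted vectorizer's vocabulary_ attribute maps each token to its column index; note that in scikit-learn >= 1.2, get_feature_names() has been removed in favor of get_feature_names_out():

for word, index in sorted(bow_converter.vocabulary_.items(), key=lambda kv: kv[1]):
    print(index, word)   # 0 am, 1 are, 2 fine, 3 hi, 4 how, 5 you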

In [7]: text

Out[7]: ['Hi, how are you you?', 'I am Fine. You?']

In [6]: frequency_matrix = pd.DataFrame(features, index=text,
                                        columns=bow_converter.get_feature_names())
frequency_matrix

Out[6]:                       am  are  fine  hi  how  you
        Hi, how are you you?   0    1     0   1    1    2
        I am Fine. You?        1    0     1   0    0    1
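The same counts can be reproduced by hand by mimicking CountVectorizer's defaults: lowercase the text and keep tokens of two or more word characters (which is why single-character tokens like 'I' never enter the vocabulary). A minimal sketch, assuming the default token pattern:

import re
from collections import Counter

token_pattern = re.compile(r'\b\w\w+\b')   # CountVectorizer's default pattern keeps 2+ character tokens
for doc in text:
    print(doc, '->', Counter(token_pattern.findall(doc.lower())))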

Bag of N-grams
In [6]: text = ['Hi, how are you you?',
                'I am Fine. You?']
print("Suppose this is our text data:", text)

Suppose this is our text data: ['Hi, how are you you?', 'I am Fine. You?']

In [9]: # help(CountVectorizer)

In [10]: bigram_converter = CountVectorizer(ngram_range=(1, 2))
bigram_converter.fit(text)
bigrams = bigram_converter.get_feature_names()
print("Unigrams and bigrams:", bigrams)

Unigrams and bigrams: ['am', 'am fine', 'are', 'are you', 'fine', 'fine you', 'hi', 'hi how', 'how', 'how are', 'you', 'you you']

In [11]: features = bigram_converter.transform(text).toarray()

# present in a dataframe
frequency_matrix = pd.DataFrame(features, index=text, columns=bigrams)
frequency_matrix

Out[11]:                       am  am fine  are  are you  fine  fine you  hi  hi how  how  how are  you  you you
         Hi, how are you you?   0        0    1        1     0         0   1       1    1        1    2        1
         I am Fine. You?        1        1    0        0     1         1   0       0    0        0    1        0
In [12]: print(f'{words}\nunigram count: {len(words)} \n{bigrams} \nbigram count: {len(bigrams)}')

['am', 'are', 'fine', 'hi', 'how', 'you']
unigram count: 6
['am', 'am fine', 'are', 'are you', 'fine', 'fine you', 'hi', 'hi how', 'how', 'how are', 'you', 'you you']
bigram count: 12
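The vocabulary keeps growing with the order of n. A small sketch extending the same idea to trigrams only (ngram_range=(3, 3) excludes shorter n-grams):

trigram_converter = CountVectorizer(ngram_range=(3, 3))
trigram_converter.fit(text)
print(trigram_converter.get_feature_names())   # e.g. 'hi how are', 'how are you', 'are you you', 'am fine you'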

Bag of words vs. Bag of N-grams


In [13]: import json

# Load the first 10,000 Yelp reviews
file = open('data/yelp_academic_dataset_review.json')
json_file = []

for i in range(10000):
    json_file.append(json.loads(file.readline()))
file.close()

review_df = pd.DataFrame(json_file)
review_df.head()

Out[13]:                                   votes                 user_id               review_id  stars        date                                               text    type             business_id
         0  {'funny': 0, 'useful': 5, 'cool': 2}  rLtl8ZkDX5vH5nAx9C3q5Q  fWKvX83p0-ka4JS3dc6E5A      5  2011-01-26  My wife took me here on my birthday for breakf...  review  9yKzy9PApeiPPOUJEtnvkg
         1  {'funny': 0, 'useful': 0, 'cool': 0}  0a2KyEL0d3Yb1V6aivbIuQ  IjZ33sJrzXqU-0X6U8NwyA      5  2011-07-27  I have no idea why some people give bad review...  review  ZRJwVLyzEJq1VAihDhYiow
         2  {'funny': 0, 'useful': 1, 'cool': 0}  0hT2KtfLiobPvh6cDC8JQg  IESLBzqUCLdSzSqm0eCSxQ      4  2012-06-14  love the gyro plate. Rice is so good and I als...  review  6oRAC4uyJCsJl1X0WZpVSA
         3  {'funny': 0, 'useful': 2, 'cool': 1}  uZetl9T0NcROGOyFfughhg  G-WvGaISbqqaMHlNnByodA      5  2010-05-27  Rosie, Dakota, and I LOVE Chaparral Dog Park!!...  review  _1QQZuf4zZOyFCvXc0o6Vg
         4  {'funny': 0, 'useful': 0, 'cool': 0}  1uJFq2r5QfJG_6ExMRCaGw  vYmM4KTsC8ZfQBg-j5MWkw      5  2012-01-05  General Manager Scott Petello is a good egg!!!...  review  6ozycU1RpktNG2-1BroVtw

In [14]: bow_converter = CountVectorizer()
bigram_converter = CountVectorizer(ngram_range=(2,2))

bow_converter.fit(review_df['text'])
words = bow_converter.get_feature_names()

bigram_converter.fit(review_df['text'])
bigrams = bigram_converter.get_feature_names()

In [15]: # vocabulary size
print(f'Unigram size : {len(words)} \nBigram size : {len(bigrams)} \
\n{int(len(bigrams)/len(words))} times bigger vocabulary')

Unigram size : 29185
Bigram size : 385638
13 times bigger vocabulary
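Because most bigrams occur in only a handful of reviews, the larger vocabulary is also much sparser. A sketch (assuming the converters fitted above) comparing the density of the two feature matrices:

X_uni = bow_converter.transform(review_df['text'])
X_bi = bigram_converter.transform(review_df['text'])
for name, X in [('unigram', X_uni), ('bigram', X_bi)]:
    density = X.nnz / (X.shape[0] * X.shape[1])   # fraction of non-zero entries
    print(f'{name}: shape={X.shape}, density={density:.5f}')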

Tf-Idf (Term frequency–Inverse document frequency)

https://stackoverflow.com/questions/42440621/how-term-frequency-is-calculated-in-tfidfvectorizer
In [16]: from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
text = ['it is a puppy',
        'it is a cat',
        'it is a kitten',
        'that is a dog and this is a pen']
tfidf = TfidfVectorizer()
tfidf.fit(text)

terms = tfidf.get_feature_names()
terms

Out[16]: ['and', 'cat', 'dog', 'is', 'it', 'kitten', 'pen', 'puppy', 'that', 'this']

In [17]: features = tfidf.transform(text).toarray()
print(features)

frequency_matrix = pd.DataFrame(features, index=text, columns=terms)
frequency_matrix

[[0.         0.         0.         0.40264194 0.49248889 0.
  0.         0.77157901 0.         0.        ]
 [0.         0.77157901 0.         0.40264194 0.49248889 0.
  0.         0.         0.         0.        ]
 [0.         0.         0.         0.40264194 0.49248889 0.77157901
  0.         0.         0.         0.        ]
 [0.4052446  0.         0.4052446  0.42294689 0.         0.
  0.4052446  0.         0.4052446  0.4052446 ]]

Out[17]:                                       and       cat       dog        is        it    kitten       pen     puppy      that      this
         it is a puppy                    0.000000  0.000000  0.000000  0.402642  0.492489  0.000000  0.000000  0.771579  0.000000  0.000000
         it is a cat                      0.000000  0.771579  0.000000  0.402642  0.492489  0.000000  0.000000  0.000000  0.000000  0.000000
         it is a kitten                   0.000000  0.000000  0.000000  0.402642  0.492489  0.771579  0.000000  0.000000  0.000000  0.000000
         that is a dog and this is a pen  0.405245  0.000000  0.405245  0.422947  0.000000  0.000000  0.405245  0.000000  0.405245  0.405245

In [18]: import math

# Classic tf-idf for the word 'puppy': it occurs once among the 4 tokens of
# its document, and appears in 1 of the 4 documents.
tf = 1/4
idf = math.log(4/1)

print(f'Tf-Idf value for word/term puppy: {tf*idf}. But the real-life implementation differs.')

Tf-Idf value for word/term puppy: 0.34657359027997264. But the real-life implementation differs.
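The difference comes from scikit-learn's defaults: TfidfVectorizer uses raw counts as tf, a smoothed idf of ln((1 + n) / (1 + df)) + 1, and L2-normalizes each document row (smooth_idf=True, norm='l2'). A sketch reproducing the 0.771579 entry for 'puppy' in the first document:

import numpy as np

n_docs = 4
def smoothed_idf(df):
    # scikit-learn's default (smooth_idf=True) idf formula
    return np.log((1 + n_docs) / (1 + df)) + 1

# 'it is a puppy' keeps three tokens ('a' is dropped by the default
# 2-character token pattern): 'is' (df=4), 'it' (df=3), 'puppy' (df=1).
# Raw counts are all 1, so the row is just the idf values, L2-normalized.
row = np.array([smoothed_idf(4), smoothed_idf(3), smoothed_idf(1)])
row = row / np.linalg.norm(row)
print(row)   # -> [0.40264194 0.49248889 0.77157901], matching Out[17]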

Simple comparison (for self study)


In [32]: import json
import pandas as pd

Prepare Data

In [21]: # data.rar
# Load Yelp business data
biz_f = open('data/yelp_academic_dataset_business.json')
biz_df = pd.DataFrame([json.loads(x) for x in biz_f.readlines()])
biz_f.close()
biz_df.head()

Out[21]:              business_id                                       full_address  open                                          categories         city  review_count                          name neighborhoods   longitude state  stars   latitude  type
         0  rncjoVoEFUJGCUoC1JgnUA         8466 W Peoria Ave\nSte 6\nPeoria, AZ 85345  True  [Accountants, Professional Services, Tax Servi...       Peoria             3     Peoria Income Tax Service            []  -112.241596    AZ    5.0  33.581867  bu...
         1  0FNFSzCFP_rGUoJx8W7tJg                  2149 W Wood Dr\nPhoenix, AZ 85029  True                   [Sporting Goods, Bikes, Shopping]      Phoenix             5                   Bike Doctor            []  -112.105933    AZ    5.0  33.604054  bu...
         2  3f_lyB6vFK48ukH6ScvLHg              1134 N Central Ave\nPhoenix, AZ 85004  True                                                  []      Phoenix             4  Valley Permaculture Alliance            []  -112.073933    AZ    5.0  33.460526  bu...
         3  usAsSV36QmUej8--yvN-dg              845 W Southern Ave\nPhoenix, AZ 85041  True                                     [Food, Grocery]      Phoenix             5                     Food City            []  -112.085377    AZ    3.5  33.392210  bu...
         4  PzOqRohWw7F7YEPBz6AubA  6520 W Happy Valley Rd\nSte 101\nGlendale Az, ...  True                  [Food, Bagels, Delis, Restaurants]  Glendale Az            14             Hot Bagels & Deli            []  -112.200264    AZ    3.5  33.712797  bu...

In [34]: # Load Yelp reviews data
review_file = open('data/yelp_academic_dataset_review.json')
review_df = pd.DataFrame([json.loads(x) for x in review_file.readlines()])
review_file.close()
review_df.head()

Out[34]:                                   votes                 user_id               review_id  stars        date                                               text    type             business_id
         0  {'funny': 0, 'useful': 5, 'cool': 2}  rLtl8ZkDX5vH5nAx9C3q5Q  fWKvX83p0-ka4JS3dc6E5A      5  2011-01-26  My wife took me here on my birthday for breakf...  review  9yKzy9PApeiPPOUJEtnvkg
         1  {'funny': 0, 'useful': 0, 'cool': 0}  0a2KyEL0d3Yb1V6aivbIuQ  IjZ33sJrzXqU-0X6U8NwyA      5  2011-07-27  I have no idea why some people give bad review...  review  ZRJwVLyzEJq1VAihDhYiow
         2  {'funny': 0, 'useful': 1, 'cool': 0}  0hT2KtfLiobPvh6cDC8JQg  IESLBzqUCLdSzSqm0eCSxQ      4  2012-06-14  love the gyro plate. Rice is so good and I als...  review  6oRAC4uyJCsJl1X0WZpVSA
         3  {'funny': 0, 'useful': 2, 'cool': 1}  uZetl9T0NcROGOyFfughhg  G-WvGaISbqqaMHlNnByodA      5  2010-05-27  Rosie, Dakota, and I LOVE Chaparral Dog Park!!...  review  _1QQZuf4zZOyFCvXc0o6Vg
         4  {'funny': 0, 'useful': 0, 'cool': 0}  1uJFq2r5QfJG_6ExMRCaGw  vYmM4KTsC8ZfQBg-j5MWkw      5  2012-01-05  General Manager Scott Petello is a good egg!!!...  review  6ozycU1RpktNG2-1BroVtw

In [20]: review_df['text']

Out[20]: 0       My wife took me here on my birthday for breakf...
         1       I have no idea why some people give bad review...
         2       love the gyro plate. Rice is so good and I als...
         3       Rosie, Dakota, and I LOVE Chaparral Dog Park!!...
         4       General Manager Scott Petello is a good egg!!!...
                                       ...
         9995    First visit...Had lunch here today - used my G...
         9996    Should be called house of deliciousness!\n\nI ...
         9997    I recently visited Olive and Ivy for business ...
         9998    My nephew just moved to Scottsdale recently so...
         9999    4-5 locations.. all 4.5 star average.. I think...
         Name: text, Length: 10000, dtype: object

In [22]: biz_df.columns, review_df.columns

Out[22]: (Index(['business_id', 'full_address', 'open', 'categories', 'city',
                 'review_count', 'name', 'neighborhoods', 'longitude', 'state', 'stars',
                 'latitude', 'type'],
                dtype='object'),
          Index(['votes', 'user_id', 'review_id', 'stars', 'date', 'text', 'type',
                 'business_id'],
                dtype='object'))
In [23]: biz_df['categories']

Out[23]: 0        [Accountants, Professional Services, Tax Servi...
         1        [Sporting Goods, Bikes, Shopping]
         2        []
         3        [Food, Grocery]
         4        [Food, Bagels, Delis, Restaurants]
                  ...
         11532    [Mexican, Restaurants]
         11533    [Mexican, Restaurants]
         11534    [Food, Grocery]
         11535    [Greek, Mediterranean, Restaurants]
         11536    [Print Media, Mass Media]
         Name: categories, Length: 11537, dtype: object

In [24]: # Pull out only Nightlife and Restaurants businesses
two_biz = biz_df[biz_df.apply(lambda x: 'Nightlife' in x['categories'] or
                                        'Restaurants' in x['categories'], axis=1)]

# Join with the reviews to get all reviews on the two types of business
two_biz_reviews = two_biz.merge(review_df, on='business_id', how='inner')

# Trim away the features we won't use
two_biz_reviews = two_biz_reviews[['business_id', 'text', 'categories']]

# Create the target column: True for Nightlife businesses, False otherwise
two_biz_reviews['target'] = two_biz_reviews.apply(lambda x: 'Nightlife' in x['categories'], axis=1)
two_biz_reviews

Out[24]: business_id text categories target

0 JxVGJ9Nly2FFIs_WpJvkug They built a Sauce in Minneapolis & it only la... [Pizza, Restaurants] False

1 JxVGJ9Nly2FFIs_WpJvkug I was pleasantly surprised by Sauce. We went h... [Pizza, Restaurants] False

2 JxVGJ9Nly2FFIs_WpJvkug I was very disappointed my last experience at ... [Pizza, Restaurants] False

3 JxVGJ9Nly2FFIs_WpJvkug Fun, Fast, Easy, Yummy. Nice Flatbread style ... [Pizza, Restaurants] False

4 Jj7bcQ6NDfKoz4TXwvYfMg Pros... Quick, good, cooked right, self serve ... [Burgers, Restaurants] False

... ... ... ... ...

7207 QzXFdjIbFRGhzL83goPPLA ordered the steak sandwich (medium rare). Cam... [Asian Fusion, Restaurants] False

7208 QzXFdjIbFRGhzL83goPPLA Good food when it's slow. Not so good when sup... [Asian Fusion, Restaurants] False

7209 QzXFdjIbFRGhzL83goPPLA Good service. The waitress was friendly. but t... [Asian Fusion, Restaurants] False

7210 GZ8KctCJxGzYZ7aAdapprg Not recommended if you're not white. For me, i... [Active Life, Amusement Parks, Nightlife, Bowl... True

7211 F3tqTcfKnljJcSyyqN0bbw Great food, clean a bit old but nice [Mexican, Restaurants] False

7212 rows × 4 columns

In [25]: print(two_biz_reviews.target.value_counts())
# two_biz_reviews.head()

False 5899
True 1313
Name: target, dtype: int64

In [26]: nightlife = two_biz_reviews[two_biz_reviews.apply(lambda x: 'Nightlife' in x['categories'], axis=1)]
restaurants = two_biz_reviews[two_biz_reviews.apply(lambda x: 'Restaurants' in x['categories'], axis=1)]
print(f'nightlife : {nightlife.shape} \nrestaurants : {restaurants.shape}')

nightlife : (1313, 4)
restaurants : (6911, 4)

In [28]: # Fraction of restaurant reviews needed to roughly match 90% of the nightlife reviews
(1313*.9)/6911

Out[28]: 0.17098827955433368

In [29]: nightlife_subset = nightlife.sample(frac=.9, random_state=123)
restaurant_subset = restaurants.sample(frac=0.17, random_state=123)

print(nightlife_subset.shape, restaurant_subset.shape)
combined = pd.concat([nightlife_subset, restaurant_subset])
combined.shape

(1182, 4) (1175, 4)

Out[29]: (2357, 4)
Data Split

In [30]: from sklearn.model_selection import train_test_split

# Split into training and test datasets
training_data, test_data = train_test_split(combined, train_size=0.7, random_state=123)
training_data.shape, test_data.shape

Out[30]: ((1649, 4), (708, 4))
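Note that train_test_split shuffles but does not stratify by default, so the True/False ratio can drift a little between the two splits. A minimal sketch of a stratified variant, using the same arguments as above:

training_data, test_data = train_test_split(
    combined, train_size=0.7, random_state=123,
    stratify=combined['target'])   # keep the class ratio identical in both splits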

Data Representation

In [31]: # Represent the review text as a bag-of-words
bow_transform = CountVectorizer()
X_tr_bow = bow_transform.fit_transform(training_data['text'])
X_te_bow = bow_transform.transform(test_data['text'])

print(len(bow_transform.vocabulary_))

# target data
y_tr = training_data['target']
y_te = test_data['target']

# Create the tf-idf representation using the bag-of-words matrix
tfidf_trfm = TfidfTransformer(norm=None)
X_tr_tfidf = tfidf_trfm.fit_transform(X_tr_bow)
X_te_tfidf = tfidf_trfm.transform(X_te_bow)

11478
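The two-step CountVectorizer + TfidfTransformer pipeline above is equivalent to a single TfidfVectorizer with the same options. A minimal sketch, assuming the same training_data and test_data:

tfidf_vectorizer = TfidfVectorizer(norm=None)
X_tr_tfidf_direct = tfidf_vectorizer.fit_transform(training_data['text'])
X_te_tfidf_direct = tfidf_vectorizer.transform(test_data['text'])
# X_tr_tfidf_direct should match X_tr_tfidf up to floating-point noise.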

Applying Machine Learning

In [32]: from sklearn.linear_model import LogisticRegression

def simple_logistic_classify(X_tr, y_tr, X_test, y_test, description):
    ### Helper function to train a logistic classifier and score on test data
    m = LogisticRegression().fit(X_tr, y_tr)
    s = m.score(X_test, y_test)
    print('Test score with', description, 'features:', s)
    return m

m1 = simple_logistic_classify(X_tr_bow, y_tr, X_te_bow, y_te, 'bow')
m3 = simple_logistic_classify(X_tr_tfidf, y_tr, X_te_tfidf, y_te, 'tfidf')

Test score with bow features: 0.7358757062146892
Test score with tfidf features: 0.730225988700565

Fine-Tuning

In [33]: import sklearn.model_selection as model

# Specify a search grid, then do a 5-fold grid search for each of the feature sets
# (scientific notation: 1e-5 = 0.00001, 1e2 = 100)
params = [1e-5, 1e-4, 1e-3, 1e-1, 1e2]
param_grid_ = {'C': params}

# Tune classifier for bag-of-words representation
print('using BoW:')
bow_search = model.GridSearchCV(LogisticRegression(), cv=5, param_grid=param_grid_, verbose=2, n_jobs=3)
bow_search.fit(X_tr_bow, y_tr)

# Tune classifier for tf-idf
print('using tf-idf:')
tfidf_search = model.GridSearchCV(LogisticRegression(), cv=5, param_grid=param_grid_, verbose=2, n_jobs=3)
tfidf_search.fit(X_tr_tfidf, y_tr)

using BoW:
Fitting 5 folds for each of 5 candidates, totalling 25 fits
using tf-idf:
Fitting 5 folds for each of 5 candidates, totalling 25 fits

Out[33]: GridSearchCV(cv=5, estimator=LogisticRegression(), n_jobs=3,
                      param_grid={'C': [1e-05, 0.0001, 0.001, 0.1, 100.0]}, verbose=2)
In [34]: search_results = pd.DataFrame.from_dict({'bow': bow_search.cv_results_['mean_test_score'],
                                                  'tfidf': tfidf_search.cv_results_['mean_test_score'],
                                                  'Inverse of regularization strength (C)': params})
search_results

Out[34]: bow tfidf Inverse of regularization strength (C)

0 0.579140 0.579140 0.00001

1 0.579140 0.605828 0.00010

2 0.600372 0.763500 0.00100

3 0.752578 0.767754 0.10000

4 0.728319 0.738637 100.00000
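The winning setting and a final held-out evaluation can be read off the fitted search objects. A sketch; note GridSearchCV refits the best model on the full training set by default (refit=True):

print('BoW best C:', bow_search.best_params_['C'])
print('tf-idf best C:', tfidf_search.best_params_['C'])

# best_estimator_ is the refit model, so it can be scored on the test set directly
print('BoW test score:', bow_search.best_estimator_.score(X_te_bow, y_te))
print('tf-idf test score:', tfidf_search.best_estimator_.score(X_te_tfidf, y_te))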

In [ ]:
