Text Mining Basics

text-mining-basics
December 31, 2022
1 Text Mining
[1]: import pandas
import nltk
from nltk.corpus import stopwords
[4]: print(stopwords.words('english'))
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're",
"you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he',
'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's",
'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what',
'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is',
'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having',
'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or',
'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about',
'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above',
'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some',
'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now',
'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn',
"couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn',
"hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't",
'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn',
"shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn',
"wouldn't"]
# Text Extraction from Resources
[5]: path= r"C:\Users\Asus\Desktop\the idiot.txt"
[6]: with open(path, 'r') as myfile:

text=myfile.read()
[7]: import requests as rq
# NOTE(review): requests only accepts absolute URLs (scheme + host); the bare
# file name below raises requests.exceptions.MissingSchema. Replace it with the
# real address of the synopsis, or read a local copy with open() instead.
response = rq.get("the_matrix_synopsis.txt", timeout=30)
response.raise_for_status()  # fail loudly on 4xx/5xx instead of mining an error page
text = response.text
[8]: import urllib.request as ur

# Open the page as a context manager so the connection is always closed, and
# decode the raw bytes so `news` is a str that the NLTK tokenizers can consume.
with ur.urlopen('https://faculty.elgin.edu/jputz/SampleTextPage.html') as scraped_web:
    # Prefer the charset announced by the server; fall back to UTF-8.
    charset = scraped_web.headers.get_content_charset() or 'utf-8'
    news = scraped_web.read().decode(charset, errors='replace')
[9]: import pandas as pd

# Load the Excel workbook into a DataFrame; the next cell previews its first two rows.
comments = pd.read_excel("Financial Sample.xlsx")
[10]: print(comments.head(2))
Segment Country Product Discount Band Units Sold \
0 Government Canada Carretera None 1618.5
1 Government Germany Carretera None 1321.0
Manufacturing Price Sale Price Gross Sales Discounts Sales COGS \
0 3 20 32370.0 0.0 32370.0 16185.0
1 3 20 26420.0 0.0 26420.0 13210.0
Profit Date Month Number Month Name Year
0 16185.0 2014-01-01 1 January 2014
1 13210.0 2014-01-01 1 January 2014

# Tokenization and Dropping Stopwords
[1]: from nltk.corpus import stopwords

from nltk.tokenize import word_tokenize, sent_tokenize
[2]: path= "the idiot.txt"

# Read the whole book into memory. The encoding is passed explicitly (the eBook
# header declares UTF-8) so the result does not depend on the platform default.
with open(path, 'r', encoding='utf-8') as myfile:
    text = myfile.read()
[8]: en_stops = stopwords.words('english')

# Show the stop-word list, a 127-character rule, then the number of entries,
# each on its own line.
separator = '=' * 127
print(en_stops, separator, len(en_stops), sep='\n')
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're",
"you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he',
2
'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's",
'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what',
'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is',
'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having',
'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or',
'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about',
'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above',
'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some',
'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now',
'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn',
"couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn',
"hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't",
'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn',
"shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn',
"wouldn't"]
================================================================================
===============================================
179
[14]: data_tokenize = word_tokenize(text)

# print(data_tokenize)
[10]: filted_words = []
# The NLTK stop-word list is all lower-case, so compare the lower-cased token;
# otherwise capitalised stop words such as 'The', 'This' or 'He' slip through.
# A set makes each membership test O(1) instead of scanning the 179-entry list.
stop_set = set(en_stops)
for word in data_tokenize:
    if word.lower() not in stop_set:
        filted_words.append(word)  # keep the token's original casing
print(filted_words)
['The', 'Project', 'Gutenberg', 'eBook', 'The', 'Idiot', ',', 'Fyodor',

'Dostoyevsky', 'This', 'eBook', 'use', 'anyone', 'anywhere', 'United', 'States',
'parts', 'world', 'cost', 'almost', 'restrictions', 'whatsoever', '.', 'You',
'may', 'copy', ',', 'give', 'away', 're-use', 'terms', 'Project', 'Gutenberg',
'License', 'included', 'eBook', 'online', 'www.gutenberg.org', '.', 'If',
'located', 'United', 'States', ',', 'check', 'laws', 'country', 'located',
'using', 'eBook', '.', 'Title', ':', 'The', 'Idiot', 'Author', ':', 'Fyodor',
'Dostoyevsky', 'Translator', ':', 'Eva', 'Martin', 'Release', 'Date', ':',
'May', ',', '2001', '[', 'eBook', '#', '2638', ']', '[', 'Most', 'recently',
'updated', ':', 'June', '21', ',', '2021', ']', 'Language', ':', 'English',
'Character', 'set', 'encoding', ':', 'UTF-8', 'Produced', ':', 'Martin',
'Adamson', ',', 'David', 'Widger', ',', 'corrections', 'Andrew', 'Sly', '*',
'*', '*', 'START', 'OF', 'THE', 'PROJECT', 'GUTENBERG', 'EBOOK', 'THE', 'IDIOT',
'*', '*', '*', 'The', 'Idiot', 'Towards', 'end', 'November', ',', 'thaw', ',',
'nine', '’', 'clock', 'one', 'morning', ',', 'train', 'Warsaw', 'Petersburg',
3
'railway', 'approaching', 'latter', 'city', 'full', 'speed', '.', 'The',
'morning', 'damp', 'misty', 'great', 'difficulty', 'day', 'succeeded',
'breaking', ';', 'impossible', 'distinguish', 'anything', 'yards', 'away',
'carriage', 'windows', '.', 'The', 'wearer', 'cloak', 'young', 'fellow', ',',
'also', 'twenty-six', 'twenty-seven', 'years', 'age', ',', 'slightly', 'middle',
'height', ',', 'fair', ',', 'thin', ',', 'pointed', 'light', 'coloured',
'beard', ';', 'eyes', 'large', 'blue', ',', 'intent', 'look', ',', 'yet',
'heavy', 'expression', 'people', 'affirm', 'peculiarity', 'well', 'evidence',
',', 'epileptic', 'subject', '.', 'His', 'face', 'decidedly', 'pleasant', 'one',
';', 'refined', ',', 'quite', 'colourless', ',', 'except', 'circumstance',
'moment', 'blue', 'cold', '.', 'He', 'held', 'bundle', 'made', 'old', 'faded',
'silk', 'handkerchief', 'apparently', 'contained', 'travelling', 'wardrobe',
',', 'wore', 'thick', 'shoes', 'gaiters', ',', 'whole', 'appearance', 'un-
Russian', '.', '“', 'Wheugh', '!', 'goodness', '!', '”', 'The', 'black-haired',
'young', 'fellow', 'whistled', ',', 'laughed', '.', 'Replying', ',', 'made',
'known', 'inquirer', 'certainly', 'long', 'absent', 'Russia', ',', 'four',
'years', ';', 'sent', 'abroad', 'health', ';', 'suffered', 'strange', 'nervous',
'malady—a', 'kind', 'epilepsy', ',', 'convulsive', 'spasms', '.', 'His',
'interlocutor', 'burst', 'laughing', 'several', 'times', 'answers', ';', 'ever',
',', 'question', ',', '“', 'whether', 'cured', '?', '”', 'patient', 'replied',
':', '*', '*', '*', 'END', 'OF', 'THE', 'PROJECT', 'GUTENBERG', 'EBOOK', 'THE',
'IDIOT', '*', '*', '*', 'He', 'seemed', 'suspicion', 'impertinence',
'inappropriateness', 'fact', 'questions', 'put', '.', 'Replying', ',', 'made',
'known', 'inquirer', 'certainly', 'long', 'absent', 'Russia', ',', 'mo']
[15]: filted_words = " "

# Append every non-stop-word followed by a single space. The comparison uses the
# lower-cased token because the stop-word list is lower-case, and str.join builds
# the result in one pass instead of re-copying the string on every iteration.
stop_set = set(en_stops)
filted_words += "".join(
    word + " " for word in data_tokenize if word.lower() not in stop_set
)
# print(filted_words)
[12]: print(len(filted_words))
1978
[16]: # print(sent_tokenize(text))
# Corpus & WordNet
[19]: from nltk.corpus import gutenberg as gt

print(gt.fileids())
['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-

kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt',
'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-
thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-
paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-
macbeth.txt', 'whitman-leaves.txt']
4
[20]: print(gt.words('austen-emma.txt'))
['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', …]

# Stemming & Lemmatization
[21]: from nltk.stem import PorterStemmer

from nltk.tokenize import word_tokenize
stemmer = PorterStemmer()
text = "programmers program with a programming algorithm languages"
text1 = "there are several books"
[22]: text1 = word_tokenize(text1)

for word in text1:
print(stemmer.stem(word))
there
are
sever
book
[23]: # Lemmatization
text = "programmers program with a programming algorithm languages"

text1 = "there are several books"
from nltk.stem import WordNetLemmatizer

lemmi = WordNetLemmatizer()
[24]: text1 = word_tokenize(text1)

for word in text1:
print(lemmi.lemmatize(word))
there
are
several
book
# POS Tagging
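• POS tagging prior to lemmatization increases accuracy: WordNetLemmatizer treats every word as a noun unless a part of speech is passed, which is why 'are' in the output above is left unchanged instead of becoming 'be'.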
5
[26]: import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
# NOTE(review): absolute, machine-specific location; earlier cells open the same
# book through the relative path "the idiot.txt".
path= r"C:\Users\Asus\Desktop\Data Science\Other\2- Text Mining-Rabti\the idiot.txt"
with open(path, 'r', encoding='utf-8') as myfile:
    # Explicit UTF-8 (declared in the eBook header) avoids platform-dependent decoding.
    raw_text = myfile.read()
# From here on `text` holds the list of word tokens that the next cell passes to pos_tag.
text = word_tokenize(raw_text)
[27]: print(nltk.pos_tag(text))
[('The', 'DT'), ('Project', 'NNP'), ('Gutenberg', 'NNP'), ('eBook', 'NN'),

('of', 'IN'), ('The', 'DT'), ('Idiot', 'NNP'), (',', ','), ('by', 'IN'),
('Fyodor', 'NNP'), ('Dostoyevsky', 'NNP'), ('This', 'DT'), ('eBook', 'NN'),
('is', 'VBZ'), ('for', 'IN'), ('the', 'DT'), ('use', 'NN'), ('of', 'IN'),
('anyone', 'NN'), ('anywhere', 'RB'), ('in', 'IN'), ('the', 'DT'), ('United',
'NNP'), ('States', 'NNPS'), ('and', 'CC'), ('most', 'JJS'), ('other', 'JJ'),
('parts', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('world', 'NN'), ('at', 'IN'),
('no', 'DT'), ('cost', 'NN'), ('and', 'CC'), ('with', 'IN'), ('almost', 'RB'),
('no', 'DT'), ('restrictions', 'NNS'), ('whatsoever', 'RB'), ('.', '.'), ('You',
'PRP'), ('may', 'MD'), ('copy', 'VB'), ('it', 'PRP'), (',', ','), ('give',
'VB'), ('it', 'PRP'), ('away', 'RB'), ('or', 'CC'), ('re-use', 'VB'), ('it',
'PRP'), ('under', 'IN'), ('the', 'DT'), ('terms', 'NNS'), ('of', 'IN'), ('the',
'DT'), ('Project', 'NNP'), ('Gutenberg', 'NNP'), ('License', 'NNP'),
('included', 'VBD'), ('with', 'IN'), ('this', 'DT'), ('eBook', 'NN'), ('or',
'CC'), ('online', 'NN'), ('at', 'IN'), ('www.gutenberg.org', 'NN'), ('.', '.'),
('If', 'IN'), ('you', 'PRP'), ('are', 'VBP'), ('not', 'RB'), ('located', 'VBN'),
('in', 'IN'), ('the', 'DT'), ('United', 'NNP'), ('States', 'NNPS'), (',', ','),
('you', 'PRP'), ('will', 'MD'), ('have', 'VB'), ('to', 'TO'), ('check', 'VB'),
('the', 'DT'), ('laws', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('country', 'NN'),
('where', 'WRB'), ('you', 'PRP'), ('are', 'VBP'), ('located', 'VBN'), ('before',
'IN'), ('using', 'VBG'), ('this', 'DT'), ('eBook', 'NN'), ('.', '.'), ('Title',
'NN'), (':', ':'), ('The', 'DT'), ('Idiot', 'NNP'), ('Author', 'NNP'), (':',
':'), ('Fyodor', 'NNP'), ('Dostoyevsky', 'NNP'), ('Translator', 'NNP'), (':',
':'), ('Eva', 'NNP'), ('Martin', 'NNP'), ('Release', 'NNP'), ('Date', 'NNP'),
(':', ':'), ('May', 'NNP'), (',', ','), ('2001', 'CD'), ('[', 'NNP'), ('eBook',
'NN'), ('#', '#'), ('2638', 'CD'), (']', 'NNP'), ('[', 'NNP'), ('Most', 'NNP'),
('recently', 'RB'), ('updated', 'VBD'), (':', ':'), ('June', 'NNP'), ('21',
'CD'), (',', ','), ('2021', 'CD'), (']', 'JJ'), ('Language', 'NNP'), (':', ':'),
('English', 'JJ'), ('Character', 'NNP'), ('set', 'VBD'), ('encoding', 'VBG'),
(':', ':'), ('UTF-8', 'NN'), ('Produced', 'VBN'), ('by', 'IN'), (':', ':'),
('Martin', 'NNP'), ('Adamson', 'NNP'), (',', ','), ('David', 'NNP'), ('Widger',
'NNP'), (',', ','), ('with', 'IN'), ('corrections', 'NNS'), ('by', 'IN'),
('Andrew', 'NNP'), ('Sly', 'NNP'), ('*', 'NNP'), ('*', 'NNP'), ('*', 'NNP'),
('START', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('PROJECT', 'NNP'),
('GUTENBERG', 'NNP'), ('EBOOK', 'NNP'), ('THE', 'NNP'), ('IDIOT', 'NNP'), ('*',
'NNP'), ('*', 'NNP'), ('*', 'VBD'), ('The', 'DT'), ('Idiot', 'NNP'), ('Towards',
'NNP'), ('the', 'DT'), ('end', 'NN'), ('of', 'IN'), ('November', 'NNP'), (',',
6
','), ('during', 'IN'), ('a', 'DT'), ('thaw', 'NN'), (',', ','), ('at', 'IN'),
('nine', 'CD'), ('o', 'JJ'), ('’', 'FW'), ('clock', 'NN'), ('one', 'CD'),
('morning', 'NN'), (',', ','), ('a', 'DT'), ('train', 'NN'), ('on', 'IN'),
('the', 'DT'), ('Warsaw', 'NNP'), ('and', 'CC'), ('Petersburg', 'NNP'),
('railway', 'NN'), ('was', 'VBD'), ('approaching', 'VBG'), ('the', 'DT'),
('latter', 'JJ'), ('city', 'NN'), ('at', 'IN'), ('full', 'JJ'), ('speed', 'NN'),
('.', '.'), ('The', 'DT'), ('morning', 'NN'), ('was', 'VBD'), ('so', 'RB'),
('damp', 'JJ'), ('and', 'CC'), ('misty', 'VBZ'), ('that', 'IN'), ('it', 'PRP'),
('was', 'VBD'), ('only', 'RB'), ('with', 'IN'), ('great', 'JJ'), ('difficulty',
'NN'), ('that', 'IN'), ('the', 'DT'), ('day', 'NN'), ('succeeded', 'VBD'),
('in', 'IN'), ('breaking', 'NN'), (';', ':'), ('and', 'CC'), ('it', 'PRP'),
('was', 'VBD'), ('impossible', 'JJ'), ('to', 'TO'), ('distinguish', 'VB'),
('anything', 'NN'), ('more', 'JJR'), ('than', 'IN'), ('a', 'DT'), ('few', 'JJ'),
('yards', 'NNS'), ('away', 'RB'), ('from', 'IN'), ('the', 'DT'), ('carriage',
'NN'), ('windows', 'VBZ'), ('.', '.'), ('The', 'DT'), ('wearer', 'NN'), ('of',
'IN'), ('this', 'DT'), ('cloak', 'NN'), ('was', 'VBD'), ('a', 'DT'), ('young',
'JJ'), ('fellow', 'NN'), (',', ','), ('also', 'RB'), ('of', 'IN'), ('about',
'IN'), ('twenty-six', 'JJ'), ('or', 'CC'), ('twenty-seven', 'JJ'), ('years',
'NNS'), ('of', 'IN'), ('age', 'NN'), (',', ','), ('slightly', 'RB'), ('above',
'IN'), ('the', 'DT'), ('middle', 'JJ'), ('height', 'NN'), (',', ','), ('very',
'RB'), ('fair', 'JJ'), (',', ','), ('with', 'IN'), ('a', 'DT'), ('thin', 'JJ'),
(',', ','), ('pointed', 'JJ'), ('and', 'CC'), ('very', 'RB'), ('light', 'JJ'),
('coloured', 'VBN'), ('beard', 'NN'), (';', ':'), ('his', 'PRP$'), ('eyes',
'NNS'), ('were', 'VBD'), ('large', 'JJ'), ('and', 'CC'), ('blue', 'JJ'), (',',
','), ('and', 'CC'), ('had', 'VBD'), ('an', 'DT'), ('intent', 'JJ'), ('look',
'NN'), ('about', 'IN'), ('them', 'PRP'), (',', ','), ('yet', 'RB'), ('that',
'DT'), ('heavy', 'JJ'), ('expression', 'NN'), ('which', 'WDT'), ('some', 'DT'),
('people', 'NNS'), ('affirm', 'VBP'), ('to', 'TO'), ('be', 'VB'), ('a', 'DT'),
('peculiarity', 'NN'), ('as', 'RB'), ('well', 'RB'), ('as', 'IN'), ('evidence',
'NN'), (',', ','), ('of', 'IN'), ('an', 'DT'), ('epileptic', 'JJ'), ('subject',
'NN'), ('.', '.'), ('His', 'PRP$'), ('face', 'NN'), ('was', 'VBD'),
('decidedly', 'RB'), ('a', 'DT'), ('pleasant', 'JJ'), ('one', 'NN'), ('for',
'IN'), ('all', 'DT'), ('that', 'DT'), (';', ':'), ('refined', 'VBN'), (',',
','), ('but', 'CC'), ('quite', 'RB'), ('colourless', 'JJ'), (',', ','),
('except', 'IN'), ('for', 'IN'), ('the', 'DT'), ('circumstance', 'NN'), ('that',
'WDT'), ('at', 'IN'), ('this', 'DT'), ('moment', 'NN'), ('it', 'PRP'), ('was',
'VBD'), ('blue', 'JJ'), ('with', 'IN'), ('cold', 'JJ'), ('.', '.'), ('He',
'PRP'), ('held', 'VBD'), ('a', 'DT'), ('bundle', 'NN'), ('made', 'VBD'), ('up',
'RB'), ('of', 'IN'), ('an', 'DT'), ('old', 'JJ'), ('faded', 'JJ'), ('silk',
'NN'), ('handkerchief', 'NN'), ('that', 'WDT'), ('apparently', 'RB'),
('contained', 'VBD'), ('all', 'DT'), ('his', 'PRP$'), ('travelling', 'NN'),
('wardrobe', 'NN'), (',', ','), ('and', 'CC'), ('wore', 'VBD'), ('thick', 'NN'),
('shoes', 'NNS'), ('and', 'CC'), ('gaiters', 'NNS'), (',', ','), ('his',
'PRP$'), ('whole', 'JJ'), ('appearance', 'NN'), ('being', 'VBG'), ('very',
'RB'), ('un-Russian', 'JJ'), ('.', '.'), ('“', 'CC'), ('Wheugh', 'IN'), ('!',
'.'), ('my', 'PRP$'), ('goodness', 'NN'), ('!', '.'), ('”', 'VB'), ('The',
'DT'), ('black-haired', 'JJ'), ('young', 'JJ'), ('fellow', 'NN'), ('whistled',
'VBD'), (',', ','), ('and', 'CC'), ('then', 'RB'), ('laughed', 'VBD'), ('.',
7
'.'), ('Replying', 'VBG'), ('to', 'TO'), ('them', 'PRP'), (',', ','), ('he',
'PRP'), ('made', 'VBD'), ('known', 'VBN'), ('to', 'TO'), ('the', 'DT'),
('inquirer', 'NN'), ('that', 'IN'), ('he', 'PRP'), ('certainly', 'RB'), ('had',
'VBD'), ('been', 'VBN'), ('long', 'RB'), ('absent', 'JJ'), ('from', 'IN'),
('Russia', 'NNP'), (',', ','), ('more', 'JJR'), ('than', 'IN'), ('four', 'CD'),
('years', 'NNS'), (';', ':'), ('that', 'IN'), ('he', 'PRP'), ('had', 'VBD'),
('been', 'VBN'), ('sent', 'VBN'), ('abroad', 'RB'), ('for', 'IN'), ('his',
'PRP$'), ('health', 'NN'), (';', ':'), ('that', 'IN'), ('he', 'PRP'), ('had',
'VBD'), ('suffered', 'VBN'), ('from', 'IN'), ('some', 'DT'), ('strange', 'JJ'),
('nervous', 'JJ'), ('malady—a', 'NN'), ('kind', 'NN'), ('of', 'IN'),
('epilepsy', 'NN'), (',', ','), ('with', 'IN'), ('convulsive', 'JJ'), ('spasms',
'NNS'), ('.', '.'), ('His', 'PRP$'), ('interlocutor', 'NN'), ('burst', 'NN'),
('out', 'IN'), ('laughing', 'VBG'), ('several', 'JJ'), ('times', 'NNS'), ('at',
'IN'), ('his', 'PRP$'), ('answers', 'NNS'), (';', ':'), ('and', 'CC'), ('more',
'RBR'), ('than', 'IN'), ('ever', 'RB'), (',', ','), ('when', 'WRB'), ('to',
'TO'), ('the', 'DT'), ('question', 'NN'), (',', ','), ('“', 'VBZ'), ('whether',
'IN'), ('he', 'PRP'), ('had', 'VBD'), ('been', 'VBN'), ('cured', 'VBN'), ('?',
'.'), ('”', 'VB'), ('the', 'DT'), ('patient', 'NN'), ('replied', 'VBD'), (':',
':'), ('*', 'NN'), ('*', 'VBZ'), ('*', 'JJ'), ('END', 'NN'), ('OF', 'IN'),
('THE', 'NNP'), ('PROJECT', 'NNP'), ('GUTENBERG', 'NNP'), ('EBOOK', 'NNP'),
('THE', 'NNP'), ('IDIOT', 'NNP'), ('*', 'NNP'), ('*', 'NNP'), ('*', 'NNP'),
('He', 'PRP'), ('seemed', 'VBD'), ('to', 'TO'), ('have', 'VB'), ('no', 'DT'),
('suspicion', 'NN'), ('of', 'IN'), ('any', 'DT'), ('impertinence', 'NN'), ('or',
'CC'), ('inappropriateness', 'NN'), ('in', 'IN'), ('the', 'DT'), ('fact', 'NN'),
('of', 'IN'), ('such', 'JJ'), ('questions', 'NNS'), ('being', 'VBG'), ('put',
'VBN'), ('to', 'TO'), ('him', 'PRP'), ('.', '.'), ('Replying', 'VBG'), ('to',
'TO'), ('them', 'PRP'), (',', ','), ('he', 'PRP'), ('made', 'VBD'), ('known',
'VBN'), ('to', 'TO'), ('the', 'DT'), ('inquirer', 'NN'), ('that', 'IN'), ('he',
'PRP'), ('certainly', 'RB'), ('had', 'VBD'), ('been', 'VBN'), ('long', 'RB'),
('absent', 'JJ'), ('from', 'IN'), ('Russia', 'NNP'), (',', ','), ('mo', 'NN')]
# POS Tagging (Named Entity Recognition & Chunking)
[28]: import nltk

from nltk.tag import pos_tag
news = "European Authorities find Google a record $5.2 billion on Wedensday"
def preprocess(sent):
    """Tokenize `sent` into words and return the (token, POS tag) pairs from pos_tag."""
    tokens = word_tokenize(sent)
    return pos_tag(tokens)
sent =preprocess(news)
print(sent)
[('European', 'JJ'), ('Authorities', 'NNP'), ('find', 'VBP'), ('Google', 'NNP'),

('a', 'DT'), ('record', 'NN'), ('$', '$'), ('5.2', 'CD'), ('billion', 'CD'),
('on', 'IN'), ('Wedensday', 'NN')]
8
[29]: pattern = 'NP: {<DT>?<JJ>*<NN> | <DT>?<JJ>*<NNS>}'
# The grammar above defines an NP chunk as an optional determiner (DT), any number
# of adjectives (JJ), then a singular (NN) or plural (NNS) noun.
cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)  # `sent` is the POS-tagged token list built in the previous cell
print(cs)
(S
European/JJ
Authorities/NNP
find/VBP
Google/NNP
(NP a/DT record/NN)
$/$
5.2/CD
billion/CD
on/IN
(NP Wedensday/NN))
[30]: # cs.draw()
[31]: from nltk.corpus import conll2000

print(conll2000.chunked_sents('train.txt') [99])
print(conll2000.chunked_sents('train.txt'))
(S
(PP Over/IN)
(NP a/DT cup/NN)
(PP of/IN)
(NP coffee/NN)
,/,
(NP Mr./NNP Stone/NNP)
(VP told/VBD)
9
(NP his/PRP$ story/NN)
./.)
[Tree('S', [Tree('NP', [('Confidence', 'NN')]), Tree('PP', [('in', 'IN')]),

Tree('NP', [('the', 'DT'), ('pound', 'NN')]), Tree('VP', [('is', 'VBZ'),
('widely', 'RB'), ('expected', 'VBN'), ('to', 'TO'), ('take', 'VB')]),
Tree('NP', [('another', 'DT'), ('sharp', 'JJ'), ('dive', 'NN')]), ('if', 'IN'),
Tree('NP', [('trade', 'NN'), ('figures', 'NNS')]), Tree('PP', [('for', 'IN')]),
Tree('NP', [('September', 'NNP')]), (',', ','), ('due', 'JJ'), Tree('PP',
[('for', 'IN')]), Tree('NP', [('release', 'NN')]), Tree('NP', [('tomorrow',
'NN')]), (',', ','), Tree('VP', [('fail', 'VB'), ('to', 'TO'), ('show', 'VB')]),
Tree('NP', [('a', 'DT'), ('substantial', 'JJ'), ('improvement', 'NN')]),
Tree('PP', [('from', 'IN')]), Tree('NP', [('July', 'NNP'), ('and', 'CC'),
('August', 'NNP')]), Tree('NP', [("'s", 'POS'), ('near-record', 'JJ'),
('deficits', 'NNS')]), ('.', '.')]), Tree('S', [('Chancellor', 'NNP'),
Tree('PP', [('of', 'IN')]), Tree('NP', [('the', 'DT'), ('Exchequer', 'NNP')]),
Tree('NP', [('Nigel', 'NNP'), ('Lawson', 'NNP')]), Tree('NP', [("'s", 'POS'),
('restated', 'VBN'), ('commitment', 'NN')]), Tree('PP', [('to', 'TO')]),
Tree('NP', [('a', 'DT'), ('firm', 'NN'), ('monetary', 'JJ'), ('policy', 'NN')]),
Tree('VP', [('has', 'VBZ'), ('helped', 'VBN'), ('to', 'TO'), ('prevent',
'VB')]), Tree('NP', [('a', 'DT'), ('freefall', 'NN')]), Tree('PP', [('in',
'IN')]), Tree('NP', [('sterling', 'NN')]), Tree('PP', [('over', 'IN')]),
Tree('NP', [('the', 'DT'), ('past', 'JJ'), ('week', 'NN')]), ('.', '.')]), …]
# Ngram
[22]: import nltk

from nltk import ngrams
# Sample passage for the n-gram demo. The adjacent literals are concatenated by
# Python into one single-line string; each piece ends with the space that joins
# it to the next.
text = (
    "A wonderful serenity has taken possession of my entire soul, like these "
    "sweet mornings of spring which I enjoy with my whole heart. I am alone, and "
    "feel the charm of existence in this spot, which was created for the bliss of "
    "souls like mine. I am so happy, my dear friend, so absorbed in the exquisite "
    "sense of mere tranquil existence, that I neglect my talents. I should be "
    "incapable of drawing a single stroke at the present moment; and yet I feel "
    "that I never was a greater artist than now. When, while the lovely valley "
    "teems with vapour around me, and the meridian sun strikes the upper surface "
    "of the impenetrable foliage of my trees, and but a few stray gleams steal "
    "into the inner sanctuary, I throw myself down among the tall grass by the "
    "trickling stream; and, as I lie close to the earth, a thousand unknown "
    "plants are noticed by me: when I hear the buzz of the little world among the "
    "stalks, and grow familiar with the countless indescribable forms of the "
    "insects and flies, then I feel the presence of the Almighty, who formed us "
    "in his own image, and the breath"
)
10
[23]: from nltk.corpus import stopwords
en_stops = stopwords.words('english')
[24]: def preprocess(sent):

text = word_tokenize(sent)
filtered_words = []
for word in text:
if word.lower() not in en_stops:
filtered_words.append(word)
return filtered_words
# Keep the raw passage in `text` and store the filtered tokens under a new name,
# so re-running this cell does not feed a token list back into preprocess().
filtered_tokens = preprocess(text)
n = 3  # n-gram size: 3 yields trigrams
threegrams = ngrams(filtered_tokens, n)
for grams in threegrams:
    print(grams)
('wonderful', 'serenity', 'taken')
('serenity', 'taken', 'possession')
('taken', 'possession', 'entire')
('possession', 'entire', 'soul')
('entire', 'soul', ',')
('soul', ',', 'like')
(',', 'like', 'sweet')
('like', 'sweet', 'mornings')
('sweet', 'mornings', 'spring')
('mornings', 'spring', 'enjoy')
('spring', 'enjoy', 'whole')
('enjoy', 'whole', 'heart')
('whole', 'heart', '.')
('heart', '.', 'alone')
('.', 'alone', ',')
11
('alone', ',', 'feel')
(',', 'feel', 'charm')
('feel', 'charm', 'existence')
('charm', 'existence', 'spot')
('existence', 'spot', ',')
('spot', ',', 'created')
(',', 'created', 'bliss')
('created', 'bliss', 'souls')
('bliss', 'souls', 'like')
('souls', 'like', 'mine')
('like', 'mine', '.')
('mine', '.', 'happy')
('.', 'happy', ',')
('happy', ',', 'dear')
(',', 'dear', 'friend')
('dear', 'friend', ',')
('friend', ',', 'absorbed')
(',', 'absorbed', 'exquisite')
('absorbed', 'exquisite', 'sense')
('exquisite', 'sense', 'mere')
('sense', 'mere', 'tranquil')
('mere', 'tranquil', 'existence')
('tranquil', 'existence', ',')
('existence', ',', 'neglect')
12
(',', 'neglect', 'talents')
('neglect', 'talents', '.')
('talents', '.', 'incapable')
('.', 'incapable', 'drawing')
('incapable', 'drawing', 'single')
('drawing', 'single', 'stroke')
('single', 'stroke', 'present')
('stroke', 'present', 'moment')
('present', 'moment', ';')
('moment', ';', 'yet')
(';', 'yet', 'feel')
('yet', 'feel', 'never')
('feel', 'never', 'greater')
('never', 'greater', 'artist')
('greater', 'artist', '.')
('artist', '.', ',')
('.', ',', 'lovely')
(',', 'lovely', 'valley')
('lovely', 'valley', 'teems')
('valley', 'teems', 'vapour')
('teems', 'vapour', 'around')
('vapour', 'around', ',')
('around', ',', 'meridian')
(',', 'meridian', 'sun')
13
('meridian', 'sun', 'strikes')
('sun', 'strikes', 'upper')
('strikes', 'upper', 'surface')
('upper', 'surface', 'impenetrable')
('surface', 'impenetrable', 'foliage')
('impenetrable', 'foliage', 'trees')
('foliage', 'trees', ',')
('trees', ',', 'stray')
(',', 'stray', 'gleams')
('stray', 'gleams', 'steal')
('gleams', 'steal', 'inner')
('steal', 'inner', 'sanctuary')
('inner', 'sanctuary', ',')
('sanctuary', ',', 'throw')
(',', 'throw', 'among')
('throw', 'among', 'tall')
('among', 'tall', 'grass')
('tall', 'grass', 'trickling')
('grass', 'trickling', 'stream')
('trickling', 'stream', ';')
('stream', ';', ',')
(';', ',', 'lie')
(',', 'lie', 'close')
('lie', 'close', 'earth')
14
('close', 'earth', ',')
('earth', ',', 'thousand')
(',', 'thousand', 'unknown')
('thousand', 'unknown', 'plants')
('unknown', 'plants', 'noticed')
('plants', 'noticed', ':')
('noticed', ':', 'hear')
(':', 'hear', 'buzz')
('hear', 'buzz', 'little')
('buzz', 'little', 'world')
('little', 'world', 'among')
('world', 'among', 'stalks')
('among', 'stalks', ',')
('stalks', ',', 'grow')
(',', 'grow', 'familiar')
('grow', 'familiar', 'countless')
('familiar', 'countless', 'indescribable')
('countless', 'indescribable', 'forms')
('indescribable', 'forms', 'insects')
('forms', 'insects', 'flies')
('insects', 'flies', ',')
('flies', ',', 'feel')
(',', 'feel', 'presence')
('feel', 'presence', 'Almighty')
15
('presence', 'Almighty', ',')
('Almighty', ',', 'formed')
(',', 'formed', 'us')
('formed', 'us', 'image')
('us', 'image', ',')
('image', ',', 'breath')
16

Text Mining Basics

Uploaded by

Document Information

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

Text Mining Basics

Uploaded by

Copyright:

Available Formats

text-mining-basics

December 31, 2022

[5]: path= r"C:\Users\Asus\Desktop\the idiot.txt"

[6]: with open(path, 'r') as myfile:

[7]: import requests as rq

[8]: import urllib.request as ur

[9]: import pandas as pd

Segment Country Product Discount Band Units Sold \

0 Government Canada Carretera None 1618.5

1 Government Germany Carretera None 1321.0

Manufacturing Price Sale Price Gross Sales Discounts Sales COGS \

0 3 20 32370.0 0.0 32370.0 16185.0

1 3 20 26420.0 0.0 26420.0 13210.0

Profit Date Month Number Month Name Year

0 16185.0 2014-01-01 1 January 2014

1 13210.0 2014-01-01 1 January 2014

[1]: from nltk.corpus import stopwords

[2]: path= "the idiot.txt"

[8]: en_stops = stopwords.words('english')

[14]: data_tokenize = word_tokenize(text)

['The', 'Project', 'Gutenberg', 'eBook', 'The', 'Idiot', ',', 'Fyodor',

[15]: filted_words = " "

# Corpus & Word Net

[19]: from nltk.corpus import gutenberg as gt

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-

['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', …]

[21]: from nltk.stem import PorterStemmer

[22]: text1 = word_tokenize(text1)

text = "programmers program with a programming algorithm languages"

from nltk.stem import WordNetLemmatizer

[24]: text1 = word_tokenize(text1)

path= r"C:\Users\Asus\Desktop\Data Science\Other\2- Text Mining-Rabti\the idiot.

with open(path, 'r') as myfile:

[('The', 'DT'), ('Project', 'NNP'), ('Gutenberg', 'NNP'), ('eBook', 'NN'),

[28]: import nltk

news = "European Authorities find Google a record $5.2 billion on Wedensday"

[('European', 'JJ'), ('Authorities', 'NNP'), ('find', 'VBP'), ('Google', 'NNP'),

(NP a/DT record/NN)

[31]: from nltk.corpus import conll2000

(NP a/DT cup/NN)

(NP Mr./NNP Stone/NNP)

[Tree('S', [Tree('NP', [('Confidence', 'NN')]), Tree('PP', [('in', 'IN')]),

[22]: import nltk

,→souls like mine. I am so happy, my dear friend, so absorbed in the exquisite␣

,→sense of mere tranquil existence, that I neglect my talents. I should be␣

,→trickling stream; and, as I lie close to the earth, a thousand unknown␣

,→in his own image, and the breath"

[24]: def preprocess(sent):

for grams in threegrams:

('wonderful', 'serenity', 'taken')

('serenity', 'taken', 'possession')

('taken', 'possession', 'entire')

('possession', 'entire', 'soul')

('entire', 'soul', ',')

('soul', ',', 'like')

(',', 'like', 'sweet')

('like', 'sweet', 'mornings')

('sweet', 'mornings', 'spring')

('mornings', 'spring', 'enjoy')

('spring', 'enjoy', 'whole')

('enjoy', 'whole', 'heart')

('whole', 'heart', '.')

('heart', '.', 'alone')