You are on page 1 of 16

text-mining-basics

December 31, 2022

1 Text Mining
[1]: import pandas
import nltk
from nltk.corpus import stopwords

# Inspect NLTK's built-in English stopword list
# (requires the "stopwords" corpus: nltk.download('stopwords')).
english_stopwords = stopwords.words('english')
print(english_stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're",
"you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he',
'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's",
'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what',
'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is',
'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having',
'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or',
'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about',
'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above',
'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some',
'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now',
'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn',
"couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn',
"hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't",
'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn',
"shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn',
"wouldn't"]
# Text Extraction from Resources

# Read the raw novel text from a local file.
# Project Gutenberg texts are UTF-8; pass the encoding explicitly so the
# read does not depend on the platform default (cp1252 on Windows would
# raise UnicodeDecodeError on this file).
path = r"C:\Users\Asus\Desktop\the idiot.txt"

with open(path, 'r', encoding='utf-8') as myfile:
    text = myfile.read()

import requests as rq

# BUG FIX: requests.get() requires an absolute URL with a scheme; the
# original passed a bare filename ("the_matrix_synopsis.txt"), which
# raises requests.exceptions.MissingSchema.  Point it at the full URL
# that hosts the file (adjust the host below to wherever it lives).
response = rq.get(
    "https://example.com/the_matrix_synopsis.txt",  # TODO: real host
    timeout=10,  # never hang forever on a dead server
)
response.raise_for_status()  # fail loudly instead of parsing an error page
text = response.text

import urllib.request as ur

# Fetch a sample HTML page.  Using the response as a context manager
# guarantees the underlying socket is closed (the original left the
# connection open after .read()).
with ur.urlopen('https://faculty.elgin.edu/jputz/SampleTextPage.html') as page:
    news = page.read()  # raw bytes of the HTML document

import pandas as pd

# Load the sample workbook into a DataFrame and peek at the first two rows.
comments = pd.read_excel("Financial Sample.xlsx")

print(comments.head(2))

Segment Country Product Discount Band Units Sold \

0 Government Canada Carretera None 1618.5

1 Government Germany Carretera None 1321.0

Manufacturing Price Sale Price Gross Sales Discounts Sales COGS \

0 3 20 32370.0 0.0 32370.0 16185.0

1 3 20 26420.0 0.0 26420.0 13210.0

Profit Date Month Number Month Name Year

0 16185.0 2014-01-01 1 January 2014

1 13210.0 2014-01-01 1 January 2014


# Tokenization and Dropping Stopwords

[1]: from nltk.corpus import stopwords


from nltk.tokenize import word_tokenize, sent_tokenize

path = "the idiot.txt"

# Gutenberg texts are UTF-8; specify the encoding explicitly so the read
# works regardless of the platform's default codec (cp1252 on Windows).
with open(path, 'r', encoding='utf-8') as myfile:
    text = myfile.read()

# English stopword list used for filtering below.
en_stops = stopwords.words('english')

print(en_stops)
print('=' * 127)
print(len(en_stops))  # 179 entries in NLTK's English list

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're",
"you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he',

2
'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's",
'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what',
'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is',
'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having',
'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or',
'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about',
'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above',
'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some',
'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now',
'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn',
"couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn',
"hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't",
'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn',
"shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn',
"wouldn't"]

================================================================================
===============================================

179

data_tokenize = word_tokenize(text)
# print(data_tokenize)

# Drop stopwords.  Two fixes over the original loop:
#   * membership is tested against a set (O(1)) instead of a list (O(n));
#   * tokens are lowercased for the comparison -- the stopword list is
#     all lowercase, so the original kept capitalized stopwords such as
#     'The' and 'You' (visible in its output).
stopword_set = set(en_stops)
filted_words = [word for word in data_tokenize
                if word.lower() not in stopword_set]
print(filted_words)

['The', 'Project', 'Gutenberg', 'eBook', 'The', 'Idiot', ',', 'Fyodor',


'Dostoyevsky', 'This', 'eBook', 'use', 'anyone', 'anywhere', 'United', 'States',
'parts', 'world', 'cost', 'almost', 'restrictions', 'whatsoever', '.', 'You',
'may', 'copy', ',', 'give', 'away', 're-use', 'terms', 'Project', 'Gutenberg',
'License', 'included', 'eBook', 'online', 'www.gutenberg.org', '.', 'If',
'located', 'United', 'States', ',', 'check', 'laws', 'country', 'located',
'using', 'eBook', '.', 'Title', ':', 'The', 'Idiot', 'Author', ':', 'Fyodor',
'Dostoyevsky', 'Translator', ':', 'Eva', 'Martin', 'Release', 'Date', ':',
'May', ',', '2001', '[', 'eBook', '#', '2638', ']', '[', 'Most', 'recently',
'updated', ':', 'June', '21', ',', '2021', ']', 'Language', ':', 'English',
'Character', 'set', 'encoding', ':', 'UTF-8', 'Produced', ':', 'Martin',
'Adamson', ',', 'David', 'Widger', ',', 'corrections', 'Andrew', 'Sly', '*',
'*', '*', 'START', 'OF', 'THE', 'PROJECT', 'GUTENBERG', 'EBOOK', 'THE', 'IDIOT',
'*', '*', '*', 'The', 'Idiot', 'Towards', 'end', 'November', ',', 'thaw', ',',
'nine', '’', 'clock', 'one', 'morning', ',', 'train', 'Warsaw', 'Petersburg',

3
'railway', 'approaching', 'latter', 'city', 'full', 'speed', '.', 'The',
'morning', 'damp', 'misty', 'great', 'difficulty', 'day', 'succeeded',
'breaking', ';', 'impossible', 'distinguish', 'anything', 'yards', 'away',
'carriage', 'windows', '.', 'The', 'wearer', 'cloak', 'young', 'fellow', ',',
'also', 'twenty-six', 'twenty-seven', 'years', 'age', ',', 'slightly', 'middle',
'height', ',', 'fair', ',', 'thin', ',', 'pointed', 'light', 'coloured',
'beard', ';', 'eyes', 'large', 'blue', ',', 'intent', 'look', ',', 'yet',
'heavy', 'expression', 'people', 'affirm', 'peculiarity', 'well', 'evidence',
',', 'epileptic', 'subject', '.', 'His', 'face', 'decidedly', 'pleasant', 'one',
';', 'refined', ',', 'quite', 'colourless', ',', 'except', 'circumstance',
'moment', 'blue', 'cold', '.', 'He', 'held', 'bundle', 'made', 'old', 'faded',
'silk', 'handkerchief', 'apparently', 'contained', 'travelling', 'wardrobe',
',', 'wore', 'thick', 'shoes', 'gaiters', ',', 'whole', 'appearance', 'un-
Russian', '.', '“', 'Wheugh', '!', 'goodness', '!', '”', 'The', 'black-haired',
'young', 'fellow', 'whistled', ',', 'laughed', '.', 'Replying', ',', 'made',
'known', 'inquirer', 'certainly', 'long', 'absent', 'Russia', ',', 'four',
'years', ';', 'sent', 'abroad', 'health', ';', 'suffered', 'strange', 'nervous',
'malady—a', 'kind', 'epilepsy', ',', 'convulsive', 'spasms', '.', 'His',
'interlocutor', 'burst', 'laughing', 'several', 'times', 'answers', ';', 'ever',
',', 'question', ',', '“', 'whether', 'cured', '?', '”', 'patient', 'replied',
':', '*', '*', '*', 'END', 'OF', 'THE', 'PROJECT', 'GUTENBERG', 'EBOOK', 'THE',
'IDIOT', '*', '*', '*', 'He', 'seemed', 'suspicion', 'impertinence',
'inappropriateness', 'fact', 'questions', 'put', '.', 'Replying', ',', 'made',
'known', 'inquirer', 'certainly', 'long', 'absent', 'Russia', ',', 'mo']

# Rebuild the stopword-free text as a single string.  str.join runs in
# linear time, whereas the original `filted_words = filted_words + word + " "`
# loop is quadratic and also left stray leading/trailing spaces.
filted_words = " ".join(
    word for word in data_tokenize if word not in en_stops
)
# print(filted_words)

print(len(filted_words))

1978

[16]: # print(sent_tokenize(text))

# Corpus & Word Net

from nltk.corpus import gutenberg as gt

# List the plain-text files bundled with NLTK's Gutenberg corpus sample.
print(gt.fileids())

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-


kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt',
'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-
thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-
paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-
macbeth.txt', 'whitman-leaves.txt']

4
# Lazily-loaded word sequence for Austen's "Emma".
print(gt.words('austen-emma.txt'))

['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', …]


# Stemming & Lemmatization

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Stemming chops each word down to a crude root form
# (e.g. "several" -> "sever", "books" -> "book").
stemmer = PorterStemmer()
text = "programmers program with a programming algorithm languages"
text1 = "there are several books"

text1 = word_tokenize(text1)
for token in text1:
    print(stemmer.stem(token))

there

are

sever

book

# Lemmatization maps words to dictionary entries ("books" -> "book").
# Without a POS hint, WordNetLemmatizer treats every token as a noun,
# which is why "are" passes through unchanged.
text = "programmers program with a programming algorithm languages"
text1 = "there are several books"

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

lemmi = WordNetLemmatizer()

text1 = word_tokenize(text1)
for token in text1:
    print(lemmi.lemmatize(token))

there

are

several

book
# Pos Tagging
• tagging prior to lemmatization increases accuracy

5
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

# NOTE: the PDF export broke this path literal across wrapped lines
# (",→txt" continuation artifact); reassembled here as one raw string.
path = r"C:\Users\Asus\Desktop\Data Science\Other\2- Text Mining-Rabti\the idiot.txt"

# utf-8: Gutenberg texts fail to decode under Windows' default cp1252.
with open(path, 'r', encoding='utf-8') as myfile:
    text = myfile.read()
text = word_tokenize(text)

print(nltk.pos_tag(text))

[('The', 'DT'), ('Project', 'NNP'), ('Gutenberg', 'NNP'), ('eBook', 'NN'),


('of', 'IN'), ('The', 'DT'), ('Idiot', 'NNP'), (',', ','), ('by', 'IN'),
('Fyodor', 'NNP'), ('Dostoyevsky', 'NNP'), ('This', 'DT'), ('eBook', 'NN'),
('is', 'VBZ'), ('for', 'IN'), ('the', 'DT'), ('use', 'NN'), ('of', 'IN'),
('anyone', 'NN'), ('anywhere', 'RB'), ('in', 'IN'), ('the', 'DT'), ('United',
'NNP'), ('States', 'NNPS'), ('and', 'CC'), ('most', 'JJS'), ('other', 'JJ'),
('parts', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('world', 'NN'), ('at', 'IN'),
('no', 'DT'), ('cost', 'NN'), ('and', 'CC'), ('with', 'IN'), ('almost', 'RB'),
('no', 'DT'), ('restrictions', 'NNS'), ('whatsoever', 'RB'), ('.', '.'), ('You',
'PRP'), ('may', 'MD'), ('copy', 'VB'), ('it', 'PRP'), (',', ','), ('give',
'VB'), ('it', 'PRP'), ('away', 'RB'), ('or', 'CC'), ('re-use', 'VB'), ('it',
'PRP'), ('under', 'IN'), ('the', 'DT'), ('terms', 'NNS'), ('of', 'IN'), ('the',
'DT'), ('Project', 'NNP'), ('Gutenberg', 'NNP'), ('License', 'NNP'),
('included', 'VBD'), ('with', 'IN'), ('this', 'DT'), ('eBook', 'NN'), ('or',
'CC'), ('online', 'NN'), ('at', 'IN'), ('www.gutenberg.org', 'NN'), ('.', '.'),
('If', 'IN'), ('you', 'PRP'), ('are', 'VBP'), ('not', 'RB'), ('located', 'VBN'),
('in', 'IN'), ('the', 'DT'), ('United', 'NNP'), ('States', 'NNPS'), (',', ','),
('you', 'PRP'), ('will', 'MD'), ('have', 'VB'), ('to', 'TO'), ('check', 'VB'),
('the', 'DT'), ('laws', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('country', 'NN'),
('where', 'WRB'), ('you', 'PRP'), ('are', 'VBP'), ('located', 'VBN'), ('before',
'IN'), ('using', 'VBG'), ('this', 'DT'), ('eBook', 'NN'), ('.', '.'), ('Title',
'NN'), (':', ':'), ('The', 'DT'), ('Idiot', 'NNP'), ('Author', 'NNP'), (':',
':'), ('Fyodor', 'NNP'), ('Dostoyevsky', 'NNP'), ('Translator', 'NNP'), (':',
':'), ('Eva', 'NNP'), ('Martin', 'NNP'), ('Release', 'NNP'), ('Date', 'NNP'),
(':', ':'), ('May', 'NNP'), (',', ','), ('2001', 'CD'), ('[', 'NNP'), ('eBook',
'NN'), ('#', '#'), ('2638', 'CD'), (']', 'NNP'), ('[', 'NNP'), ('Most', 'NNP'),
('recently', 'RB'), ('updated', 'VBD'), (':', ':'), ('June', 'NNP'), ('21',
'CD'), (',', ','), ('2021', 'CD'), (']', 'JJ'), ('Language', 'NNP'), (':', ':'),
('English', 'JJ'), ('Character', 'NNP'), ('set', 'VBD'), ('encoding', 'VBG'),
(':', ':'), ('UTF-8', 'NN'), ('Produced', 'VBN'), ('by', 'IN'), (':', ':'),
('Martin', 'NNP'), ('Adamson', 'NNP'), (',', ','), ('David', 'NNP'), ('Widger',
'NNP'), (',', ','), ('with', 'IN'), ('corrections', 'NNS'), ('by', 'IN'),
('Andrew', 'NNP'), ('Sly', 'NNP'), ('*', 'NNP'), ('*', 'NNP'), ('*', 'NNP'),
('START', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('PROJECT', 'NNP'),
('GUTENBERG', 'NNP'), ('EBOOK', 'NNP'), ('THE', 'NNP'), ('IDIOT', 'NNP'), ('*',
'NNP'), ('*', 'NNP'), ('*', 'VBD'), ('The', 'DT'), ('Idiot', 'NNP'), ('Towards',
'NNP'), ('the', 'DT'), ('end', 'NN'), ('of', 'IN'), ('November', 'NNP'), (',',

6
','), ('during', 'IN'), ('a', 'DT'), ('thaw', 'NN'), (',', ','), ('at', 'IN'),
('nine', 'CD'), ('o', 'JJ'), ('’', 'FW'), ('clock', 'NN'), ('one', 'CD'),
('morning', 'NN'), (',', ','), ('a', 'DT'), ('train', 'NN'), ('on', 'IN'),
('the', 'DT'), ('Warsaw', 'NNP'), ('and', 'CC'), ('Petersburg', 'NNP'),
('railway', 'NN'), ('was', 'VBD'), ('approaching', 'VBG'), ('the', 'DT'),
('latter', 'JJ'), ('city', 'NN'), ('at', 'IN'), ('full', 'JJ'), ('speed', 'NN'),
('.', '.'), ('The', 'DT'), ('morning', 'NN'), ('was', 'VBD'), ('so', 'RB'),
('damp', 'JJ'), ('and', 'CC'), ('misty', 'VBZ'), ('that', 'IN'), ('it', 'PRP'),
('was', 'VBD'), ('only', 'RB'), ('with', 'IN'), ('great', 'JJ'), ('difficulty',
'NN'), ('that', 'IN'), ('the', 'DT'), ('day', 'NN'), ('succeeded', 'VBD'),
('in', 'IN'), ('breaking', 'NN'), (';', ':'), ('and', 'CC'), ('it', 'PRP'),
('was', 'VBD'), ('impossible', 'JJ'), ('to', 'TO'), ('distinguish', 'VB'),
('anything', 'NN'), ('more', 'JJR'), ('than', 'IN'), ('a', 'DT'), ('few', 'JJ'),
('yards', 'NNS'), ('away', 'RB'), ('from', 'IN'), ('the', 'DT'), ('carriage',
'NN'), ('windows', 'VBZ'), ('.', '.'), ('The', 'DT'), ('wearer', 'NN'), ('of',
'IN'), ('this', 'DT'), ('cloak', 'NN'), ('was', 'VBD'), ('a', 'DT'), ('young',
'JJ'), ('fellow', 'NN'), (',', ','), ('also', 'RB'), ('of', 'IN'), ('about',
'IN'), ('twenty-six', 'JJ'), ('or', 'CC'), ('twenty-seven', 'JJ'), ('years',
'NNS'), ('of', 'IN'), ('age', 'NN'), (',', ','), ('slightly', 'RB'), ('above',
'IN'), ('the', 'DT'), ('middle', 'JJ'), ('height', 'NN'), (',', ','), ('very',
'RB'), ('fair', 'JJ'), (',', ','), ('with', 'IN'), ('a', 'DT'), ('thin', 'JJ'),
(',', ','), ('pointed', 'JJ'), ('and', 'CC'), ('very', 'RB'), ('light', 'JJ'),
('coloured', 'VBN'), ('beard', 'NN'), (';', ':'), ('his', 'PRP$'), ('eyes',
'NNS'), ('were', 'VBD'), ('large', 'JJ'), ('and', 'CC'), ('blue', 'JJ'), (',',
','), ('and', 'CC'), ('had', 'VBD'), ('an', 'DT'), ('intent', 'JJ'), ('look',
'NN'), ('about', 'IN'), ('them', 'PRP'), (',', ','), ('yet', 'RB'), ('that',
'DT'), ('heavy', 'JJ'), ('expression', 'NN'), ('which', 'WDT'), ('some', 'DT'),
('people', 'NNS'), ('affirm', 'VBP'), ('to', 'TO'), ('be', 'VB'), ('a', 'DT'),
('peculiarity', 'NN'), ('as', 'RB'), ('well', 'RB'), ('as', 'IN'), ('evidence',
'NN'), (',', ','), ('of', 'IN'), ('an', 'DT'), ('epileptic', 'JJ'), ('subject',
'NN'), ('.', '.'), ('His', 'PRP$'), ('face', 'NN'), ('was', 'VBD'),
('decidedly', 'RB'), ('a', 'DT'), ('pleasant', 'JJ'), ('one', 'NN'), ('for',
'IN'), ('all', 'DT'), ('that', 'DT'), (';', ':'), ('refined', 'VBN'), (',',
','), ('but', 'CC'), ('quite', 'RB'), ('colourless', 'JJ'), (',', ','),
('except', 'IN'), ('for', 'IN'), ('the', 'DT'), ('circumstance', 'NN'), ('that',
'WDT'), ('at', 'IN'), ('this', 'DT'), ('moment', 'NN'), ('it', 'PRP'), ('was',
'VBD'), ('blue', 'JJ'), ('with', 'IN'), ('cold', 'JJ'), ('.', '.'), ('He',
'PRP'), ('held', 'VBD'), ('a', 'DT'), ('bundle', 'NN'), ('made', 'VBD'), ('up',
'RB'), ('of', 'IN'), ('an', 'DT'), ('old', 'JJ'), ('faded', 'JJ'), ('silk',
'NN'), ('handkerchief', 'NN'), ('that', 'WDT'), ('apparently', 'RB'),
('contained', 'VBD'), ('all', 'DT'), ('his', 'PRP$'), ('travelling', 'NN'),
('wardrobe', 'NN'), (',', ','), ('and', 'CC'), ('wore', 'VBD'), ('thick', 'NN'),
('shoes', 'NNS'), ('and', 'CC'), ('gaiters', 'NNS'), (',', ','), ('his',
'PRP$'), ('whole', 'JJ'), ('appearance', 'NN'), ('being', 'VBG'), ('very',
'RB'), ('un-Russian', 'JJ'), ('.', '.'), ('“', 'CC'), ('Wheugh', 'IN'), ('!',
'.'), ('my', 'PRP$'), ('goodness', 'NN'), ('!', '.'), ('”', 'VB'), ('The',
'DT'), ('black-haired', 'JJ'), ('young', 'JJ'), ('fellow', 'NN'), ('whistled',
'VBD'), (',', ','), ('and', 'CC'), ('then', 'RB'), ('laughed', 'VBD'), ('.',

7
'.'), ('Replying', 'VBG'), ('to', 'TO'), ('them', 'PRP'), (',', ','), ('he',
'PRP'), ('made', 'VBD'), ('known', 'VBN'), ('to', 'TO'), ('the', 'DT'),
('inquirer', 'NN'), ('that', 'IN'), ('he', 'PRP'), ('certainly', 'RB'), ('had',
'VBD'), ('been', 'VBN'), ('long', 'RB'), ('absent', 'JJ'), ('from', 'IN'),
('Russia', 'NNP'), (',', ','), ('more', 'JJR'), ('than', 'IN'), ('four', 'CD'),
('years', 'NNS'), (';', ':'), ('that', 'IN'), ('he', 'PRP'), ('had', 'VBD'),
('been', 'VBN'), ('sent', 'VBN'), ('abroad', 'RB'), ('for', 'IN'), ('his',
'PRP$'), ('health', 'NN'), (';', ':'), ('that', 'IN'), ('he', 'PRP'), ('had',
'VBD'), ('suffered', 'VBN'), ('from', 'IN'), ('some', 'DT'), ('strange', 'JJ'),
('nervous', 'JJ'), ('malady—a', 'NN'), ('kind', 'NN'), ('of', 'IN'),
('epilepsy', 'NN'), (',', ','), ('with', 'IN'), ('convulsive', 'JJ'), ('spasms',
'NNS'), ('.', '.'), ('His', 'PRP$'), ('interlocutor', 'NN'), ('burst', 'NN'),
('out', 'IN'), ('laughing', 'VBG'), ('several', 'JJ'), ('times', 'NNS'), ('at',
'IN'), ('his', 'PRP$'), ('answers', 'NNS'), (';', ':'), ('and', 'CC'), ('more',
'RBR'), ('than', 'IN'), ('ever', 'RB'), (',', ','), ('when', 'WRB'), ('to',
'TO'), ('the', 'DT'), ('question', 'NN'), (',', ','), ('“', 'VBZ'), ('whether',
'IN'), ('he', 'PRP'), ('had', 'VBD'), ('been', 'VBN'), ('cured', 'VBN'), ('?',
'.'), ('”', 'VB'), ('the', 'DT'), ('patient', 'NN'), ('replied', 'VBD'), (':',
':'), ('*', 'NN'), ('*', 'VBZ'), ('*', 'JJ'), ('END', 'NN'), ('OF', 'IN'),
('THE', 'NNP'), ('PROJECT', 'NNP'), ('GUTENBERG', 'NNP'), ('EBOOK', 'NNP'),
('THE', 'NNP'), ('IDIOT', 'NNP'), ('*', 'NNP'), ('*', 'NNP'), ('*', 'NNP'),
('He', 'PRP'), ('seemed', 'VBD'), ('to', 'TO'), ('have', 'VB'), ('no', 'DT'),
('suspicion', 'NN'), ('of', 'IN'), ('any', 'DT'), ('impertinence', 'NN'), ('or',
'CC'), ('inappropriateness', 'NN'), ('in', 'IN'), ('the', 'DT'), ('fact', 'NN'),
('of', 'IN'), ('such', 'JJ'), ('questions', 'NNS'), ('being', 'VBG'), ('put',
'VBN'), ('to', 'TO'), ('him', 'PRP'), ('.', '.'), ('Replying', 'VBG'), ('to',
'TO'), ('them', 'PRP'), (',', ','), ('he', 'PRP'), ('made', 'VBD'), ('known',
'VBN'), ('to', 'TO'), ('the', 'DT'), ('inquirer', 'NN'), ('that', 'IN'), ('he',
'PRP'), ('certainly', 'RB'), ('had', 'VBD'), ('been', 'VBN'), ('long', 'RB'),
('absent', 'JJ'), ('from', 'IN'), ('Russia', 'NNP'), (',', ','), ('mo', 'NN')]
# Pos Tagging (Name Entity Recognition & Chunking)

import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

news = "European Authorities find Google a record $5.2 billion on Wedensday"

def preprocess(sent):
    """Tokenize a sentence and attach a part-of-speech tag to each token."""
    return pos_tag(word_tokenize(sent))

sent = preprocess(news)
print(sent)

[('European', 'JJ'), ('Authorities', 'NNP'), ('find', 'VBP'), ('Google', 'NNP'),


('a', 'DT'), ('record', 'NN'), ('$', '$'), ('5.2', 'CD'), ('billion', 'CD'),
('on', 'IN'), ('Wedensday', 'NN')]

8
# Chunk grammar: a noun phrase (NP) is an optional determiner, any number
# of adjectives, then a singular or a plural noun.
pattern = 'NP: {<DT>?<JJ>*<NN> | <DT>?<JJ>*<NNS>}'
chunk_parser = nltk.RegexpParser(pattern)
cs = chunk_parser.parse(sent)
print(cs)

(S

European/JJ

Authorities/NNP

find/VBP

Google/NNP

(NP a/DT record/NN)

$/$

5.2/CD

billion/CD

on/IN

(NP Wedensday/NN))

[30]: # cs.draw()

from nltk.corpus import conll2000

# CoNLL-2000 ships gold-standard chunked sentences; print one sentence
# and the lazily-loaded sequence itself.
train_sents = conll2000.chunked_sents('train.txt')
print(train_sents[99])
print(train_sents)

(S

(PP Over/IN)

(NP a/DT cup/NN)

(PP of/IN)

(NP coffee/NN)

,/,

(NP Mr./NNP Stone/NNP)

(VP told/VBD)

9
(NP his/PRP$ story/NN)

./.)

[Tree('S', [Tree('NP', [('Confidence', 'NN')]), Tree('PP', [('in', 'IN')]),


Tree('NP', [('the', 'DT'), ('pound', 'NN')]), Tree('VP', [('is', 'VBZ'),
('widely', 'RB'), ('expected', 'VBN'), ('to', 'TO'), ('take', 'VB')]),
Tree('NP', [('another', 'DT'), ('sharp', 'JJ'), ('dive', 'NN')]), ('if', 'IN'),
Tree('NP', [('trade', 'NN'), ('figures', 'NNS')]), Tree('PP', [('for', 'IN')]),
Tree('NP', [('September', 'NNP')]), (',', ','), ('due', 'JJ'), Tree('PP',
[('for', 'IN')]), Tree('NP', [('release', 'NN')]), Tree('NP', [('tomorrow',
'NN')]), (',', ','), Tree('VP', [('fail', 'VB'), ('to', 'TO'), ('show', 'VB')]),
Tree('NP', [('a', 'DT'), ('substantial', 'JJ'), ('improvement', 'NN')]),
Tree('PP', [('from', 'IN')]), Tree('NP', [('July', 'NNP'), ('and', 'CC'),
('August', 'NNP')]), Tree('NP', [("'s", 'POS'), ('near-record', 'JJ'),
('deficits', 'NNS')]), ('.', '.')]), Tree('S', [('Chancellor', 'NNP'),
Tree('PP', [('of', 'IN')]), Tree('NP', [('the', 'DT'), ('Exchequer', 'NNP')]),
Tree('NP', [('Nigel', 'NNP'), ('Lawson', 'NNP')]), Tree('NP', [("'s", 'POS'),
('restated', 'VBN'), ('commitment', 'NN')]), Tree('PP', [('to', 'TO')]),
Tree('NP', [('a', 'DT'), ('firm', 'NN'), ('monetary', 'JJ'), ('policy', 'NN')]),
Tree('VP', [('has', 'VBZ'), ('helped', 'VBN'), ('to', 'TO'), ('prevent',
'VB')]), Tree('NP', [('a', 'DT'), ('freefall', 'NN')]), Tree('PP', [('in',
'IN')]), Tree('NP', [('sterling', 'NN')]), Tree('PP', [('over', 'IN')]),
Tree('NP', [('the', 'DT'), ('past', 'JJ'), ('week', 'NN')]), ('.', '.')]), …]
# Ngram

import nltk
from nltk.tokenize import word_tokenize
from nltk import ngrams

# NOTE: the PDF export shattered this string literal across wrapped
# lines (",→" continuation markers).  Reassembled here using implicit
# adjacent-string concatenation; the value is the single original text.
text = (
    "A wonderful serenity has taken possession of my entire soul, like these "
    "sweet mornings of spring which I enjoy with my whole heart. I am alone, and "
    "feel the charm of existence in this spot, which was created for the bliss of "
    "souls like mine. I am so happy, my dear friend, so absorbed in the exquisite "
    "sense of mere tranquil existence, that I neglect my talents. I should be "
    "incapable of drawing a single stroke at the present moment; and yet I feel "
    "that I never was a greater artist than now. When, while the lovely valley "
    "teems with vapour around me, and the meridian sun strikes the upper surface "
    "of the impenetrable foliage of my trees, and but a few stray gleams steal "
    "into the inner sanctuary, I throw myself down among the tall grass by the "
    "trickling stream; and, as I lie close to the earth, a thousand unknown "
    "plants are noticed by me: when I hear the buzz of the little world among the "
    "stalks, and grow familiar with the countless indescribable forms of the "
    "insects and flies, then I feel the presence of the Almighty, who formed us "
    "in his own image, and the breath"
)

10
from nltk.corpus import stopwords

# English stopword list used to filter tokens before building n-grams.
en_stops = stopwords.words('english')

def preprocess(sent):
    """Tokenize *sent* and drop English stopwords (case-insensitive).

    Returns the surviving tokens in their original order and casing.
    """
    # Hoist the stopword list into a set once: set membership is O(1),
    # while the original tested each token against the list (O(n) each).
    stopword_set = set(en_stops)
    return [word for word in word_tokenize(sent)
            if word.lower() not in stopword_set]

text = preprocess(text)

# Slide a window of three tokens across the filtered text.
n = 3
threegrams = ngrams(text, n)

for grams in threegrams:
    print(grams)

('wonderful', 'serenity', 'taken')

('serenity', 'taken', 'possession')

('taken', 'possession', 'entire')

('possession', 'entire', 'soul')

('entire', 'soul', ',')

('soul', ',', 'like')

(',', 'like', 'sweet')

('like', 'sweet', 'mornings')

('sweet', 'mornings', 'spring')

('mornings', 'spring', 'enjoy')

('spring', 'enjoy', 'whole')

('enjoy', 'whole', 'heart')

('whole', 'heart', '.')

('heart', '.', 'alone')

('.', 'alone', ',')

11
('alone', ',', 'feel')

(',', 'feel', 'charm')

('feel', 'charm', 'existence')

('charm', 'existence', 'spot')

('existence', 'spot', ',')

('spot', ',', 'created')

(',', 'created', 'bliss')

('created', 'bliss', 'souls')

('bliss', 'souls', 'like')

('souls', 'like', 'mine')

('like', 'mine', '.')

('mine', '.', 'happy')

('.', 'happy', ',')

('happy', ',', 'dear')

(',', 'dear', 'friend')

('dear', 'friend', ',')

('friend', ',', 'absorbed')

(',', 'absorbed', 'exquisite')

('absorbed', 'exquisite', 'sense')

('exquisite', 'sense', 'mere')

('sense', 'mere', 'tranquil')

('mere', 'tranquil', 'existence')

('tranquil', 'existence', ',')

('existence', ',', 'neglect')

12
(',', 'neglect', 'talents')

('neglect', 'talents', '.')

('talents', '.', 'incapable')

('.', 'incapable', 'drawing')

('incapable', 'drawing', 'single')

('drawing', 'single', 'stroke')

('single', 'stroke', 'present')

('stroke', 'present', 'moment')

('present', 'moment', ';')

('moment', ';', 'yet')

(';', 'yet', 'feel')

('yet', 'feel', 'never')

('feel', 'never', 'greater')

('never', 'greater', 'artist')

('greater', 'artist', '.')

('artist', '.', ',')

('.', ',', 'lovely')

(',', 'lovely', 'valley')

('lovely', 'valley', 'teems')

('valley', 'teems', 'vapour')

('teems', 'vapour', 'around')

('vapour', 'around', ',')

('around', ',', 'meridian')

(',', 'meridian', 'sun')

13
('meridian', 'sun', 'strikes')

('sun', 'strikes', 'upper')

('strikes', 'upper', 'surface')

('upper', 'surface', 'impenetrable')

('surface', 'impenetrable', 'foliage')

('impenetrable', 'foliage', 'trees')

('foliage', 'trees', ',')

('trees', ',', 'stray')

(',', 'stray', 'gleams')

('stray', 'gleams', 'steal')

('gleams', 'steal', 'inner')

('steal', 'inner', 'sanctuary')

('inner', 'sanctuary', ',')

('sanctuary', ',', 'throw')

(',', 'throw', 'among')

('throw', 'among', 'tall')

('among', 'tall', 'grass')

('tall', 'grass', 'trickling')

('grass', 'trickling', 'stream')

('trickling', 'stream', ';')

('stream', ';', ',')

(';', ',', 'lie')

(',', 'lie', 'close')

('lie', 'close', 'earth')

14
('close', 'earth', ',')

('earth', ',', 'thousand')

(',', 'thousand', 'unknown')

('thousand', 'unknown', 'plants')

('unknown', 'plants', 'noticed')

('plants', 'noticed', ':')

('noticed', ':', 'hear')

(':', 'hear', 'buzz')

('hear', 'buzz', 'little')

('buzz', 'little', 'world')

('little', 'world', 'among')

('world', 'among', 'stalks')

('among', 'stalks', ',')

('stalks', ',', 'grow')

(',', 'grow', 'familiar')

('grow', 'familiar', 'countless')

('familiar', 'countless', 'indescribable')

('countless', 'indescribable', 'forms')

('indescribable', 'forms', 'insects')

('forms', 'insects', 'flies')

('insects', 'flies', ',')

('flies', ',', 'feel')

(',', 'feel', 'presence')

('feel', 'presence', 'Almighty')

15
('presence', 'Almighty', ',')

('Almighty', ',', 'formed')

(',', 'formed', 'us')

('formed', 'us', 'image')

('us', 'image', ',')

('image', ',', 'breath')

16

You might also like