Professional Documents
Culture Documents
Text Mining Basics
Text Mining Basics
1 Text Mining
[1]: import pandas
import nltk
from nltk.corpus import stopwords
[4]: print(stopwords.words('english'))
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're",
"you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he',
'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's",
'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what',
'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is',
'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having',
'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or',
'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about',
'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above',
'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some',
'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now',
'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn',
"couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn',
"hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't",
'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn',
"shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn',
"wouldn't"]
# Text Extraction from Resources
1
text = rq.get("the_matrix_synopsis.txt").text
[10]: print(comments.head(2))
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're",
"you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he',
2
'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's",
'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what',
'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is',
'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having',
'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or',
'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about',
'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above',
'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some',
'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now',
'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn',
"couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn',
"hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't",
'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn',
"shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn',
"wouldn't"]
================================================================================
===============================================
179
[10]: filted_words = []
# Keep only the tokens that are not English stop words.
# The comparison is case-sensitive: the stop-word list is all lower case,
# so capitalised forms such as 'The' or 'His' pass through the filter.
stop_set = set(en_stops)  # build once; set lookup avoids rescanning the list per token
filted_words.extend(word for word in data_tokenize if word not in stop_set)
print(filted_words)
3
'railway', 'approaching', 'latter', 'city', 'full', 'speed', '.', 'The',
'morning', 'damp', 'misty', 'great', 'difficulty', 'day', 'succeeded',
'breaking', ';', 'impossible', 'distinguish', 'anything', 'yards', 'away',
'carriage', 'windows', '.', 'The', 'wearer', 'cloak', 'young', 'fellow', ',',
'also', 'twenty-six', 'twenty-seven', 'years', 'age', ',', 'slightly', 'middle',
'height', ',', 'fair', ',', 'thin', ',', 'pointed', 'light', 'coloured',
'beard', ';', 'eyes', 'large', 'blue', ',', 'intent', 'look', ',', 'yet',
'heavy', 'expression', 'people', 'affirm', 'peculiarity', 'well', 'evidence',
',', 'epileptic', 'subject', '.', 'His', 'face', 'decidedly', 'pleasant', 'one',
';', 'refined', ',', 'quite', 'colourless', ',', 'except', 'circumstance',
'moment', 'blue', 'cold', '.', 'He', 'held', 'bundle', 'made', 'old', 'faded',
'silk', 'handkerchief', 'apparently', 'contained', 'travelling', 'wardrobe',
',', 'wore', 'thick', 'shoes', 'gaiters', ',', 'whole', 'appearance', 'un-
Russian', '.', '“', 'Wheugh', '!', 'goodness', '!', '”', 'The', 'black-haired',
'young', 'fellow', 'whistled', ',', 'laughed', '.', 'Replying', ',', 'made',
'known', 'inquirer', 'certainly', 'long', 'absent', 'Russia', ',', 'four',
'years', ';', 'sent', 'abroad', 'health', ';', 'suffered', 'strange', 'nervous',
'malady—a', 'kind', 'epilepsy', ',', 'convulsive', 'spasms', '.', 'His',
'interlocutor', 'burst', 'laughing', 'several', 'times', 'answers', ';', 'ever',
',', 'question', ',', '“', 'whether', 'cured', '?', '”', 'patient', 'replied',
':', '*', '*', '*', 'END', 'OF', 'THE', 'PROJECT', 'GUTENBERG', 'EBOOK', 'THE',
'IDIOT', '*', '*', '*', 'He', 'seemed', 'suspicion', 'impertinence',
'inappropriateness', 'fact', 'questions', 'put', '.', 'Replying', ',', 'made',
'known', 'inquirer', 'certainly', 'long', 'absent', 'Russia', ',', 'mo']
[12]: print(len(filted_words))
1978
[16]: # print(sent_tokenize(text))
4
[20]: print(gt.words('austen-emma.txt'))
stemmer = PorterStemmer()
text = "programmers program with a programming algorithm languages"
text1 = "there are several books"
there
are
sever
book
[23]: # Lemmatization
lemmi = WordNetLemmatizer()
there
are
several
book
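# POS Tagging
• Tagging each word with its part of speech before lemmatization increases accuracy, because the lemmatizer can then choose the lemma for the correct word class (without a tag, "are" in the output above is left unchanged instead of being reduced to "be").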
5
[26]: import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
[27]: print(nltk.pos_tag(text))
6
','), ('during', 'IN'), ('a', 'DT'), ('thaw', 'NN'), (',', ','), ('at', 'IN'),
('nine', 'CD'), ('o', 'JJ'), ('’', 'FW'), ('clock', 'NN'), ('one', 'CD'),
('morning', 'NN'), (',', ','), ('a', 'DT'), ('train', 'NN'), ('on', 'IN'),
('the', 'DT'), ('Warsaw', 'NNP'), ('and', 'CC'), ('Petersburg', 'NNP'),
('railway', 'NN'), ('was', 'VBD'), ('approaching', 'VBG'), ('the', 'DT'),
('latter', 'JJ'), ('city', 'NN'), ('at', 'IN'), ('full', 'JJ'), ('speed', 'NN'),
('.', '.'), ('The', 'DT'), ('morning', 'NN'), ('was', 'VBD'), ('so', 'RB'),
('damp', 'JJ'), ('and', 'CC'), ('misty', 'VBZ'), ('that', 'IN'), ('it', 'PRP'),
('was', 'VBD'), ('only', 'RB'), ('with', 'IN'), ('great', 'JJ'), ('difficulty',
'NN'), ('that', 'IN'), ('the', 'DT'), ('day', 'NN'), ('succeeded', 'VBD'),
('in', 'IN'), ('breaking', 'NN'), (';', ':'), ('and', 'CC'), ('it', 'PRP'),
('was', 'VBD'), ('impossible', 'JJ'), ('to', 'TO'), ('distinguish', 'VB'),
('anything', 'NN'), ('more', 'JJR'), ('than', 'IN'), ('a', 'DT'), ('few', 'JJ'),
('yards', 'NNS'), ('away', 'RB'), ('from', 'IN'), ('the', 'DT'), ('carriage',
'NN'), ('windows', 'VBZ'), ('.', '.'), ('The', 'DT'), ('wearer', 'NN'), ('of',
'IN'), ('this', 'DT'), ('cloak', 'NN'), ('was', 'VBD'), ('a', 'DT'), ('young',
'JJ'), ('fellow', 'NN'), (',', ','), ('also', 'RB'), ('of', 'IN'), ('about',
'IN'), ('twenty-six', 'JJ'), ('or', 'CC'), ('twenty-seven', 'JJ'), ('years',
'NNS'), ('of', 'IN'), ('age', 'NN'), (',', ','), ('slightly', 'RB'), ('above',
'IN'), ('the', 'DT'), ('middle', 'JJ'), ('height', 'NN'), (',', ','), ('very',
'RB'), ('fair', 'JJ'), (',', ','), ('with', 'IN'), ('a', 'DT'), ('thin', 'JJ'),
(',', ','), ('pointed', 'JJ'), ('and', 'CC'), ('very', 'RB'), ('light', 'JJ'),
('coloured', 'VBN'), ('beard', 'NN'), (';', ':'), ('his', 'PRP$'), ('eyes',
'NNS'), ('were', 'VBD'), ('large', 'JJ'), ('and', 'CC'), ('blue', 'JJ'), (',',
','), ('and', 'CC'), ('had', 'VBD'), ('an', 'DT'), ('intent', 'JJ'), ('look',
'NN'), ('about', 'IN'), ('them', 'PRP'), (',', ','), ('yet', 'RB'), ('that',
'DT'), ('heavy', 'JJ'), ('expression', 'NN'), ('which', 'WDT'), ('some', 'DT'),
('people', 'NNS'), ('affirm', 'VBP'), ('to', 'TO'), ('be', 'VB'), ('a', 'DT'),
('peculiarity', 'NN'), ('as', 'RB'), ('well', 'RB'), ('as', 'IN'), ('evidence',
'NN'), (',', ','), ('of', 'IN'), ('an', 'DT'), ('epileptic', 'JJ'), ('subject',
'NN'), ('.', '.'), ('His', 'PRP$'), ('face', 'NN'), ('was', 'VBD'),
('decidedly', 'RB'), ('a', 'DT'), ('pleasant', 'JJ'), ('one', 'NN'), ('for',
'IN'), ('all', 'DT'), ('that', 'DT'), (';', ':'), ('refined', 'VBN'), (',',
','), ('but', 'CC'), ('quite', 'RB'), ('colourless', 'JJ'), (',', ','),
('except', 'IN'), ('for', 'IN'), ('the', 'DT'), ('circumstance', 'NN'), ('that',
'WDT'), ('at', 'IN'), ('this', 'DT'), ('moment', 'NN'), ('it', 'PRP'), ('was',
'VBD'), ('blue', 'JJ'), ('with', 'IN'), ('cold', 'JJ'), ('.', '.'), ('He',
'PRP'), ('held', 'VBD'), ('a', 'DT'), ('bundle', 'NN'), ('made', 'VBD'), ('up',
'RB'), ('of', 'IN'), ('an', 'DT'), ('old', 'JJ'), ('faded', 'JJ'), ('silk',
'NN'), ('handkerchief', 'NN'), ('that', 'WDT'), ('apparently', 'RB'),
('contained', 'VBD'), ('all', 'DT'), ('his', 'PRP$'), ('travelling', 'NN'),
('wardrobe', 'NN'), (',', ','), ('and', 'CC'), ('wore', 'VBD'), ('thick', 'NN'),
('shoes', 'NNS'), ('and', 'CC'), ('gaiters', 'NNS'), (',', ','), ('his',
'PRP$'), ('whole', 'JJ'), ('appearance', 'NN'), ('being', 'VBG'), ('very',
'RB'), ('un-Russian', 'JJ'), ('.', '.'), ('“', 'CC'), ('Wheugh', 'IN'), ('!',
'.'), ('my', 'PRP$'), ('goodness', 'NN'), ('!', '.'), ('”', 'VB'), ('The',
'DT'), ('black-haired', 'JJ'), ('young', 'JJ'), ('fellow', 'NN'), ('whistled',
'VBD'), (',', ','), ('and', 'CC'), ('then', 'RB'), ('laughed', 'VBD'), ('.',
7
'.'), ('Replying', 'VBG'), ('to', 'TO'), ('them', 'PRP'), (',', ','), ('he',
'PRP'), ('made', 'VBD'), ('known', 'VBN'), ('to', 'TO'), ('the', 'DT'),
('inquirer', 'NN'), ('that', 'IN'), ('he', 'PRP'), ('certainly', 'RB'), ('had',
'VBD'), ('been', 'VBN'), ('long', 'RB'), ('absent', 'JJ'), ('from', 'IN'),
('Russia', 'NNP'), (',', ','), ('more', 'JJR'), ('than', 'IN'), ('four', 'CD'),
('years', 'NNS'), (';', ':'), ('that', 'IN'), ('he', 'PRP'), ('had', 'VBD'),
('been', 'VBN'), ('sent', 'VBN'), ('abroad', 'RB'), ('for', 'IN'), ('his',
'PRP$'), ('health', 'NN'), (';', ':'), ('that', 'IN'), ('he', 'PRP'), ('had',
'VBD'), ('suffered', 'VBN'), ('from', 'IN'), ('some', 'DT'), ('strange', 'JJ'),
('nervous', 'JJ'), ('malady—a', 'NN'), ('kind', 'NN'), ('of', 'IN'),
('epilepsy', 'NN'), (',', ','), ('with', 'IN'), ('convulsive', 'JJ'), ('spasms',
'NNS'), ('.', '.'), ('His', 'PRP$'), ('interlocutor', 'NN'), ('burst', 'NN'),
('out', 'IN'), ('laughing', 'VBG'), ('several', 'JJ'), ('times', 'NNS'), ('at',
'IN'), ('his', 'PRP$'), ('answers', 'NNS'), (';', ':'), ('and', 'CC'), ('more',
'RBR'), ('than', 'IN'), ('ever', 'RB'), (',', ','), ('when', 'WRB'), ('to',
'TO'), ('the', 'DT'), ('question', 'NN'), (',', ','), ('“', 'VBZ'), ('whether',
'IN'), ('he', 'PRP'), ('had', 'VBD'), ('been', 'VBN'), ('cured', 'VBN'), ('?',
'.'), ('”', 'VB'), ('the', 'DT'), ('patient', 'NN'), ('replied', 'VBD'), (':',
':'), ('*', 'NN'), ('*', 'VBZ'), ('*', 'JJ'), ('END', 'NN'), ('OF', 'IN'),
('THE', 'NNP'), ('PROJECT', 'NNP'), ('GUTENBERG', 'NNP'), ('EBOOK', 'NNP'),
('THE', 'NNP'), ('IDIOT', 'NNP'), ('*', 'NNP'), ('*', 'NNP'), ('*', 'NNP'),
('He', 'PRP'), ('seemed', 'VBD'), ('to', 'TO'), ('have', 'VB'), ('no', 'DT'),
('suspicion', 'NN'), ('of', 'IN'), ('any', 'DT'), ('impertinence', 'NN'), ('or',
'CC'), ('inappropriateness', 'NN'), ('in', 'IN'), ('the', 'DT'), ('fact', 'NN'),
('of', 'IN'), ('such', 'JJ'), ('questions', 'NNS'), ('being', 'VBG'), ('put',
'VBN'), ('to', 'TO'), ('him', 'PRP'), ('.', '.'), ('Replying', 'VBG'), ('to',
'TO'), ('them', 'PRP'), (',', ','), ('he', 'PRP'), ('made', 'VBD'), ('known',
'VBN'), ('to', 'TO'), ('the', 'DT'), ('inquirer', 'NN'), ('that', 'IN'), ('he',
'PRP'), ('certainly', 'RB'), ('had', 'VBD'), ('been', 'VBN'), ('long', 'RB'),
('absent', 'JJ'), ('from', 'IN'), ('Russia', 'NNP'), (',', ','), ('mo', 'NN')]
# POS Tagging (Named Entity Recognition & Chunking)
def preprocess(sent):
    """Tokenize a raw string and tag every token with its part of speech.

    Args:
        sent: Text to process; it is passed directly to ``word_tokenize``.

    Returns:
        The result of ``nltk.pos_tag`` on the token list, i.e. a sequence
        of ``(token, tag)`` pairs.
    """
    tokens = word_tokenize(sent)
    # Use the qualified name, as the earlier tagging cell does; no bare
    # ``pos_tag`` import is visible in this notebook.
    return nltk.pos_tag(tokens)
sent =preprocess(news)
print(sent)
8
[29]: pattern = 'NP: {<DT>?<JJ>*<NN> | <DT>?<JJ>*<NNS>}'
cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)
print(cs)
(S
European/JJ
Authorities/NNP
find/VBP
Google/NNP
$/$
5.2/CD
billion/CD
on/IN
(NP Wedensday/NN))
[30]: # cs.draw()
(S
(PP Over/IN)
(PP of/IN)
(NP coffee/NN)
,/,
(VP told/VBD)
9
(NP his/PRP$ story/NN)
./.)
,→feel the charm of existence in this spot, which was created for the bliss of␣
,→incapable of drawing a single stroke at the present moment; and yet I feel␣
,→that I never was a greater artist than now. When, while the lovely valley␣
,→teems with vapour around me, and the meridian sun strikes the upper surface␣
,→of the impenetrable foliage of my trees, and but a few stray gleams steal␣
,→into the inner sanctuary, I throw myself down among the tall grass by the␣
,→plants are noticed by me: when I hear the buzz of the little world among the␣
,→stalks, and grow familiar with the countless indescribable forms of the␣
,→insects and flies, then I feel the presence of the Almighty, who formed us␣
10
[23]: from nltk.corpus import stopwords
en_stops = stopwords.words('english')
text = preprocess(text)
n = 3
threegrams = ngrams(text, n)
11
('alone', ',', 'feel')
12
(',', 'neglect', 'talents')
13
('meridian', 'sun', 'strikes')
14
('close', 'earth', ',')
15
('presence', 'Almighty', ',')
16