
4_POSTagging

February 22, 2022

0.0.1 CSC498B-688J_Natural Language Processing

Pauline Maouad, PhD - SAS | CSM, LAU

SPRING, 2022
[ ]: import nltk
# from nltk.book import *
from nltk.stem.porter import *
from nltk.stem import *

from nltk import sent_tokenize, word_tokenize


from nltk.corpus import stopwords
from collections import Counter

from nltk.stem import WordNetLemmatizer

from nltk.tokenize import RegexpTokenizer #regular expression tokenizer


import re # regular expression module

# from nltk import FreqDist

from nltk.corpus import gutenberg as g


from nltk.corpus import brown as b
from nltk.corpus import treebank
from nltk.corpus import nps_chat
from nltk.corpus.reader import TaggedCorpusReader as tcr

[ ]: # The nltk.tag module defines functions and classes for manipulating tagged
# tokens, which combine a basic token value with a tag. Tags are case-sensitive
# strings that identify some property of a token, such as its part of speech.
# Tagged tokens are encoded as tuples (token, tag).

import nltk.tag
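
As a quick illustration (a hand-made example, not drawn from a corpus), a tagged token can be written directly as such a tuple:

[ ]: # A tagged token is simply a (token, tag) pair
tagged_tok = ('fly', 'NN')
print(tagged_tok)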

0.0.2 Various Tagsets

[ ]: treebank.tagged_words()[1:20]

[ ]: treebank.tagged_words(tagset ='universal')[1:20]

0.0.3 Tagset Documentation

[ ]: nltk.help.upenn_tagset('VB.*') # regular expression

[ ]: nltk.help.brown_tagset('VB.*')

0.0.4 Tagging with pos_tag

[ ]: text = "The bitterest tears shed over graves are for words left unsaid and deeds␣
,→left undone - Harriet Beecher Stowe"

tok_text = word_tokenize(text)
nltk.pos_tag(tok_text)
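
pos_tag can also map its output onto the coarser universal tagset; a minimal follow-up sketch on the same tokens:

[ ]: # Same tokens, with the Penn Treebank tags mapped to the universal tagset
nltk.pos_tag(tok_text, tagset='universal')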

String Representation for Tagged Tokens


[ ]: tagged_token = nltk.tag.str2tuple('Learn/VB')

print(tagged_token)
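
The reverse mapping lives in the same module; a short sketch turning the tuple back into a 'token/TAG' string with nltk.tag.tuple2str:

[ ]: # Inverse of str2tuple: (token, tag) tuple back to a 'token/TAG' string
print(nltk.tag.tuple2str(tagged_token))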

0.0.5 Corpus reader functions are named based on the type of information they return.

tagged_words() method –> returns tokenized words and their tags


[ ]: # Reading a tagged corpus
# b is an alias for the brown corpus (see the imports above)

# Read tagged words from a given category, keeping the original Brown tagset

print(b.tagged_words(categories='belles_lettres', tagset='brown')[1:30])
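
To see the naming convention side by side, a small comparison sketch: the plain words() reader returns bare tokens, while tagged_words() returns (token, tag) tuples for the same category:

[ ]: print(b.words(categories='belles_lettres')[1:10])
print(b.tagged_words(categories='belles_lettres')[1:10])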

tagged_sents() method –> returns tokenized sentences and their tags


[ ]: print(b.tagged_sents(b.fileids()))

# NP-TL --> proper noun appearing in a title
# NN-HL --> noun appearing in a headline
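
Printing every file at once is heavy; a lighter sketch (using the 'news' category as an assumed example) that pulls just the first tagged sentence:

[ ]: print(b.tagged_sents(categories='news')[0])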

Find most common tags
[ ]: brown_tagged = b.tagged_words(categories='science_fiction', tagset='universal')

tag_fd = nltk.FreqDist(tag for (word, tag) in brown_tagged)

print("\n Most common tags: \n", tag_fd.most_common())
