You are on page 1 of 2

Tokenization

Sentence tokenization:

# Sentence tokenization: split raw text into sentences using NLTK's
# Punkt tokenizer (requires the 'punkt' model: nltk.download('punkt')).
import nltk
from nltk.tokenize import sent_tokenize

s = '''Good muffins cost $3.88 in New York. Please buy me two of them. contact me
at alpha.berry@gmail.com.'''

# Print the result so the script produces output when run as a file
# (a bare expression is only displayed in an interactive REPL).
print(sent_tokenize(s))

Word tokenization:

# Word tokenization: split raw text into word/punctuation tokens.
# Fixes: 'import ntk' typo -> 'import nltk', and the curly triple
# quotes (’’’ … ’’’) -> plain ''' so the string literal parses.
import nltk
from nltk.tokenize import word_tokenize

s = '''Good muffins cost $3.88 in New York. Please buy me two of them. contact me
at alpha.berry@gmail.com.'''

print(word_tokenize(s))

Stemming
# Stemming: reduce each word to its Porter stem.
import nltk
from nltk.stem import PorterStemmer

ps = PorterStemmer()

words = ["program", "programs", "programmer", "programming", "programmers"]

# Fix: the print call must be indented inside the loop body
# (the original flush-left print raised an IndentationError).
for w in words:
    print(w, " : ", ps.stem(w))

Lemmatization:
# Lemmatization: map each word to its WordNet lemma (default noun POS;
# requires the 'wordnet' corpus: nltk.download('wordnet')).
import nltk
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()
source_words = ["running", "jumps", "better"]

# Build the lemma list with an explicit loop rather than a comprehension.
lemmatized_words = []
for token in source_words:
    lemmatized_words.append(wnl.lemmatize(token))

print(lemmatized_words)

Stop Word Removal:

# Stop-word removal: drop common English function words from a sentence
# (requires the 'stopwords' corpus: nltk.download('stopwords')).
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

text = "How to remove stop words with NLTK library in Python"

tokens = word_tokenize(text.lower())

# Use a set for O(1) membership tests, and do NOT rebind the name
# 'stopwords' — the original shadowed the imported module, and the
# variable misleadingly held the *non*-stopword tokens.
english_stopwords = set(stopwords.words('english'))
filtered_tokens = [t for t in tokens if t not in english_stopwords]

print(filtered_tokens)

POS Tagging:
# POS tagging: label each token with its part-of-speech tag
# (requires the 'averaged_perceptron_tagger' model).
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag

sentence = "POS tagging helps identify the parts of speech in a sentence."

# Tokenize then tag in one pipeline expression.
print(pos_tag(word_tokenize(sentence)))

Reading a corpus using WordNet:

# WordNet lookup: fetch the first synset for 'hello' and show its gloss.
import nltk
from nltk.corpus import wordnet

first_synset = wordnet.synsets('hello')[0]
meaning = first_synset.definition()
print("\nSynset meaning : ", meaning)

Regular Expression Removal:

# Strip digits and the special characters ! @ # $ from the sample text.
import re

text = "Remove numbers like 123 and special characters (!@#$) from this text."

# Pre-compile the pattern: one or more of the listed characters in a row.
unwanted = re.compile(r'[0-9!@#$]+')
cleaned_text = unwanted.sub('', text)

print(cleaned_text)

You might also like