NLP Text Preprocessing with NLTK — Code Examples
Sentence tokenization:
# Sentence tokenization: split raw text into sentences using NLTK's
# pre-trained Punkt tokenizer (requires a prior nltk.download('punkt')).
import nltk
from nltk.tokenize import sent_tokenize

s = '''Good muffins cost $3.88 in New York. Please buy me two of them. contact me
at alpha.berry@gmail.com.'''
# Print the result so output is visible when run as a script
# (the original bare expression only displayed in a REPL).
print(sent_tokenize(s))
Word tokenization:
# Word tokenization: split text into word and punctuation tokens.
import nltk  # fixed typo: original read 'import ntk'
from nltk.tokenize import word_tokenize

# Fixed: the original used curly "smart" quotes for the triple-quoted
# string, which is a SyntaxError in Python.
s = '''Good muffins cost $3.88 in New York. Please buy me two of them. contact me
at alpha.berry@gmail.com.'''
print(word_tokenize(s))
Stemming:
# Stemming: reduce words to their root form with the Porter algorithm.
import nltk
from nltk.stem import PorterStemmer

ps = PorterStemmer()
words = ["running", "jumps", "better"]  # fixed: 'words' was undefined
for w in words:
    print(w, " : ", ps.stem(w))  # fixed: body was not indented under the loop
Lemmatization:
# Lemmatization: map words to their dictionary base form via WordNet
# (requires a prior nltk.download('wordnet')).
import nltk
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
words = ["running", "jumps", "better"]
# Fixed: 'lemmatized_words' was printed but never computed.
lemmatized_words = [lemmatizer.lemmatize(w) for w in words]
print(lemmatized_words)
# Stop-word removal: filter common English words out of a token list
# (requires a prior nltk.download('stopwords')).
import nltk
from nltk.corpus import stopwords  # fixed: this import was missing
from nltk.tokenize import word_tokenize

english_stopwords = stopwords.words('english')
# Fixed: 'tokens' was undefined — tokenize a concrete example sentence.
tokens = word_tokenize("This is a simple example showing stop word removal")
# Renamed the result: the original rebound 'stopwords', shadowing the
# imported corpus module it had just used.
filtered_tokens = [t for t in tokens if t not in english_stopwords]
print(filtered_tokens)
POS Tagging:
# POS tagging: label each token with its part of speech
# (requires a prior nltk.download('averaged_perceptron_tagger')).
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag

sentence = "Good muffins cost $3.88 in New York."  # fixed: 'sentence' was undefined
words = word_tokenize(sentence)
pos_tags = pos_tag(words)
print(pos_tags)
# WordNet lookup: fetch the first synset (word sense) for a word
# (requires a prior nltk.download('wordnet')).
import nltk
from nltk.corpus import wordnet

syn = wordnet.synsets('hello')[0]
# Fixed: the original bound 'syn' but produced no output — show the
# sense name and its gloss.
print(syn.name(), "-", syn.definition())
# Regex cleanup: strip digits and the listed special characters from text.
import re

text = "Remove numbers like 123 and special characters (!@#$) from this text."
pattern = r'[0-9!@#$]+'
# Fixed: 'cleaned_text' was printed but never computed.
cleaned_text = re.sub(pattern, '', text)
print(cleaned_text)