TF-IDF
In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
vect = CountVectorizer()
vect
Out[1]:
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
lowercase=True, max_df=1.0, max_features=None, min_df=1,
ngram_range=(1, 1), preprocessor=None, stop_words=None,
strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
tokenizer=None, vocabulary=None)
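Before swapping in a custom tokenizer, it helps to see what the default configuration produces. The cell below is not from the original notebook; it is a minimal sketch on a hypothetical toy corpus (get_feature_names_out is the modern accessor; older scikit-learn releases expose get_feature_names instead).
In [ ]:
# Illustrative only: fit on a hypothetical toy corpus and inspect the result
corpus = [
    "the cat sat on the mat",
    "the dog sat on the log",
]
X = vect.fit_transform(corpus)          # sparse document-term count matrix
print(vect.get_feature_names_out())     # learned vocabulary, alphabetical
print(X.toarray())                      # raw term counts, one row per document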
In [2]:
# use TreebankWordTokenizer instead of the default regex tokenizer
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
vect.set_params(tokenizer=tokenizer.tokenize)
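To make the effect of the swap concrete, here is a hedged example (not in the original notebook, sentence is hypothetical): the Treebank tokenizer keeps punctuation as tokens and splits contractions, so the vocabulary differs from what the default token_pattern would give.
In [ ]:
# With the Treebank tokenizer, "can't" is split into "ca" + "n't"
# and the comma survives as its own token
vect.fit(["can't stop, won't stop"])
print(vect.get_feature_names_out())   # [',', 'ca', "n't", 'stop', 'wo']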
TF-IDF Vectorizer
TF-IDF stands for term frequency-inverse document frequency.
The TF-IDF weight is a statistical measure of how important a
word is to a document within a collection or corpus: the
importance increases proportionally with the number of times the
word appears in the document, but is offset by how frequently the
word appears across the corpus.
Thus, tf-idf(t, d) = tf(t, d) x idf(t), where tf(t, d) is the number
of times term t occurs in document d and, with scikit-learn's default
smooth_idf=True, idf(t) = ln((1 + n) / (1 + df(t))) + 1 for a corpus
of n documents of which df(t) contain the term t.
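As a sanity check, the weight can be computed by hand. The numbers below are hypothetical, and the IDF form matches scikit-learn's smooth_idf=True default shown above.
In [ ]:
import math

# Hypothetical counts: "cat" occurs 3 times in one document,
# and appears in 1 of the corpus's 2 documents
n_docs, df, tf = 2, 1, 3

idf = math.log((1 + n_docs) / (1 + df)) + 1   # smoothed IDF, natural log
print(tf * idf)                               # raw weight, before L2 normalization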
In [3]:
# import and instantiate TfidfVectorizer (with the default parameters)
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer()
vect
Out[3]:
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
lowercase=True, max_df=1.0, max_features=None, min_df=1,
ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
stop_words=None, strip_accents=None, sublinear_tf=False,
token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
vocabulary=None)
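Again as an illustration (not part of the original notebook), fitting the default TfidfVectorizer on a hypothetical toy corpus shows the weighting at work: terms that appear in every document get the minimum smoothed IDF of 1.0, terms unique to one document get a larger IDF, and each output row is scaled to unit L2 norm.
In [ ]:
# Minimal sketch on a hypothetical corpus
corpus = [
    "the cat sat on the mat",
    "the dog chased the cat",
]
X = vect.fit_transform(corpus)
print(vect.get_feature_names_out())
print(vect.idf_.round(3))       # terms in both docs: 1.0; unique terms: ~1.405
print(X.toarray().round(2))     # tf-idf weights, each row L2-normalized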
In [4]:
# use TreebankWordTokenizer instead of the default regex tokenizer
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
vect.set_params(tokenizer=tokenizer.tokenize)
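As with the count version, a quick hedged check on a hypothetical document confirms the Treebank tokenizer is now in effect: contractions split and the trailing period becomes its own token.
In [ ]:
# "don't" splits into "do" + "n't" and the final period becomes
# a separate token under the Treebank conventions
vect.fit(["don't judge a book by its cover."])
print(vect.get_feature_names_out())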