def get_vectorizer(self, ngram_range=(1, 3), min_df=2, max_df=1.0):
    """
    Build the text vectorizer selected by ``self.is_weight``.

    ``'FP'`` (feature presence) selects ``CountVectorizer`` and ``'TF-IDF'``
    selects ``TfidfVectorizer``. Both are constructed with the same keyword
    arguments: the given n-gram range and document-frequency bounds,
    ``self.tokenize`` as tokenizer, ``binary=True`` and
    ``stop_words='english'``.

    :param ngram_range: n-grams are created for all numbers within this range
    :param min_df: min document frequency of features
    :param max_df: max document frequency of features
    :return: the newly constructed vectorizer instance
    :raises ValueError: if ``self.is_weight`` is neither ``'FP'`` nor ``'TF-IDF'``
    """
    weighting = self.is_weight
    if weighting == 'FP':  # Feature Presence
        vectorizer_cls = CountVectorizer
    elif weighting == 'TF-IDF':  # TF-IDF weighting
        vectorizer_cls = TfidfVectorizer
    else:
        # Fail loudly instead of falling through to an unbound local.
        raise ValueError(
            "Unsupported is_weight value: {!r} (expected 'FP' or 'TF-IDF')".format(weighting)
        )

    # Both vectorizer types take the identical configuration.
    return vectorizer_cls(
        ngram_range=ngram_range,
        tokenizer=self.tokenize,
        min_df=min_df,
        max_df=max_df,
        binary=True,
        stop_words='english',
    )
# (web page residue: "Comment list" / "Article table of contents")