def new(n_feature=128):
    """Create the three unfitted stages of the Taghasher model.

    The stages are meant to be chained in this order:
    count vectoriser => NMF smoother => binariser.

    Args:
        n_feature: Used both as the vocabulary cap (``max_features``) of
            the count vectoriser and as the number of NMF components.

    Returns:
        A list ``[vectoriser, smoother, binariser]`` of freshly
        constructed (not yet fitted) objects.
    """
    # Stage 1: unigram-only vectoriser with the binary flag set,
    # limited to at most `n_feature` vocabulary entries.
    tag_counter = CountVectorizer(
        encoding='utf-8',
        ngram_range=(1, 1),
        max_features=n_feature,
        binary=True,
    )

    # Stage 2: fill the gaps left by missing expected tags.
    # Working hypothesis: some tags are related to one another, so matrix
    # factorisation is used to smooth over the missing values.
    gap_filler = NMF(n_components=n_feature)

    # Stage 3: binarise each individual value of the smoothed vector
    # (no threshold is passed, so the Binarizer default applies).
    thresholder = Binarizer(copy=True)

    stages = [tag_counter, gap_filler, thresholder]
    print(colored('Taghasher model created', 'yellow'))
    return stages
# (web page residue: "Comment list")
# (web page residue: "Article table of contents")