def _vectorize_documents(self, method='tfidf', max_features=100):
    """Turn ``self.documents`` into a dense document-term matrix.

    Stop words are gathered from every lexicon whose id is listed under
    ``self.params['cluster_lexicons']``; when that key is absent no stop
    words are applied.

    Args:
        method: ``'tfidf'`` to use ``TfidfVectorizer`` or ``'count'`` to use
            ``CountVectorizer``.
        max_features: Forwarded to the vectorizer as ``max_features``.

    Returns:
        A ``(document_vectors, feature_names)`` tuple: the result of
        ``fit_transform`` converted with ``toarray()``, and the list of
        feature names reported by the fitted vectorizer.

    Raises:
        ValueError: If ``method`` is neither ``'count'`` nor ``'tfidf'``.
    """
    # Validate the method first so a bad argument fails fast with a clear
    # message instead of an UnboundLocalError after the database work.
    if method == 'count':
        vectorizer_class = CountVectorizer
    elif method == 'tfidf':
        vectorizer_class = TfidfVectorizer
    else:
        raise ValueError(
            "Unknown vectorization method %r; expected 'count' or 'tfidf'."
            % (method,)
        )

    # The lexicon list is optional: a missing key (or missing params) simply
    # means "no stop words".
    try:
        lexicon_ids = self.params['cluster_lexicons']
    except (KeyError, TypeError):
        lexicon_ids = []

    stop_words = []
    for lexicon_id in lexicon_ids or []:
        # Best effort: skip ids that are malformed or no longer exist rather
        # than discarding the stop words of every remaining lexicon.
        try:
            lexicon = Lexicon.objects.get(id=int(lexicon_id))
        except (Lexicon.DoesNotExist, TypeError, ValueError):
            continue
        stop_words.extend(
            word.wrd for word in Word.objects.filter(lexicon=lexicon)
        )

    vectorizer = vectorizer_class(
        analyzer='word',
        max_features=max_features,
        stop_words=stop_words,
    )
    document_vectors = vectorizer.fit_transform(self.documents).toarray()

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement when available but keep returning a plain list.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out().tolist()
    else:
        feature_names = vectorizer.get_feature_names()
    return document_vectors, feature_names
评论列表
文章目录