cluster_manager.py 文件源码-python代码片段

cluster_manager.py 文件源码

python

阅读 68 收藏 0 点赞 0 评论 0

def _vectorize_documents(self, method='tfidf', max_features=100):
        """Turn ``self.documents`` into a dense document-term matrix.

        Words belonging to the lexicons whose ids are listed under
        ``self.params['cluster_lexicons']`` are handed to the vectorizer as
        stop words. When that key is absent no stop words are used.

        Args:
            method: ``'count'`` to build a ``CountVectorizer`` or ``'tfidf'``
                to build a ``TfidfVectorizer``.
            max_features: Forwarded to the vectorizer's ``max_features``
                argument.

        Returns:
            A ``(document_vectors, feature_names)`` tuple: the dense array
            produced by ``fit_transform(...).toarray()`` and the list of
            feature names reported by the fitted vectorizer.

        Raises:
            ValueError: If ``method`` is neither ``'count'`` nor ``'tfidf'``.
        """
        # Validate first so a bad argument fails fast with a clear message,
        # before any database query is issued (previously this surfaced later
        # as an UnboundLocalError on ``vectorizer``).
        if method not in ('count', 'tfidf'):
            raise ValueError(
                "Unknown vectorization method %r; expected 'count' or 'tfidf'"
                % (method,)
            )

        stop_words = []
        try:
            for lexicon_id in self.params['cluster_lexicons']:
                lexicon = Lexicon.objects.get(id=int(lexicon_id))
                words = Word.objects.filter(lexicon=lexicon)
                stop_words.extend(word.wrd for word in words)
        except KeyError:
            # No 'cluster_lexicons' entry in the parameters: proceed without
            # lexicon-based stop words.
            pass
        except (Lexicon.DoesNotExist, TypeError, ValueError):
            # Best effort, as before: an unknown or malformed lexicon id stops
            # the collection but keeps the stop words gathered so far.
            # Anything else (e.g. database errors) is no longer swallowed.
            pass

        if method == 'count':
            vectorizer = CountVectorizer(analyzer='word', max_features=max_features, stop_words=stop_words)
        else:
            vectorizer = TfidfVectorizer(analyzer='word', max_features=max_features, stop_words=stop_words)

        document_vectors = vectorizer.fit_transform(self.documents).toarray()

        # ``get_feature_names`` was removed in scikit-learn 1.2; prefer its
        # replacement when available and convert the returned array to a plain
        # list so callers keep receiving the same type as before.
        if hasattr(vectorizer, 'get_feature_names_out'):
            feature_names = vectorizer.get_feature_names_out().tolist()
        else:
            feature_names = vectorizer.get_feature_names()

        return document_vectors, feature_names