def build_lda_model(self, data, docs, n_topics=5):
    """Train and persist an LDA topic model over the given documents.

    Each raw document is lowercased, tokenized on word characters, and
    stripped of stopwords; the resulting token lists feed a gensim
    dictionary / bag-of-words corpus used to train the LDA model. The
    model, corpus, dictionary, and similarity index are saved via the
    instance's save_* helpers as a side effect.

    Args:
        data: iterable of raw document strings to train on.
        docs: document identifiers, forwarded to save_similarities
            alongside the similarity index.
        n_topics: number of latent topics to learn (default 5).

    Returns:
        Tuple of (dictionary, texts, lda_model), where texts is the
        list of per-document token lists used for training.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    # Lowercase -> tokenize -> drop stopwords, one token list per document.
    # NOTE(review): stemming (PorterStemmer) was deliberately disabled in
    # the original; tokens are used unstemmed.
    texts = [self.remove_stopwords(tokenizer.tokenize(document.lower()))
             for document in data]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    lda_model = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary,
                                         num_topics=n_topics)
    # Dense similarity index over the bag-of-words corpus for later queries.
    index = similarities.MatrixSimilarity(corpus)
    self.save_lda_model(lda_model, corpus, dictionary, index)
    self.save_similarities(index, docs)
    return dictionary, texts, lda_model
lda_model_calculator.py 文件源码
python
阅读 19
收藏 0
点赞 0
评论 0
评论列表
文章目录