def trainModel(self):
'''
Train a LDA model, inclusive of 4 steps:
1. Parse the whole corpora into unigram token collections and document mapping (for later use)
2. Filter tokens which are not common (no_below_this_number), and too common (no_above_fraction_of_doc)
3. Indexing the token collections and do TF-IDF transformation
4. Call gensim.models.LdaModel and generate topic distributions of the corpora
'''
print 'Start preparing unigram tokens....'
## Start of preparing list of documents and tokens [[words_in_1st_doc],[words_in_2nd_doc]....], which comprise Bag-Of-Words (BOW)
# Get document_count, tokens, and document-index mapping from the corpora
doc_count,train_set,doc_mapping,link_mapping = self.__tokenizeWholeCorpora(path_corpora)
# Put the training data into gensim.corpora for later use
dic = corpora.Dictionary(train_set)
denominator = len(dic)
# Filtering infrequent words & common stopwords, thus reducing the dimension of terms (which prevents curse of dimensionality)
dic.filter_extremes(no_below=self.no_below_this_number, no_above=self.no_above_fraction_of_doc)
nominator = len(dic)
corpus = [dic.doc2bow(text) for text in train_set] # transform every token into BOW
print 'There are %i documents in the pool' % (doc_count)
print "In the corpus there are ", denominator, " raw tokens"
print "After filtering, in the corpus there are", nominator, "unique tokens, reduced ", (1-(nominator/denominator)),"%"
print 'Finished preparing unigram tokens....'
##END
print 'Start training LDA model....'
## Implementing TF-IDF as a vector for each document, and train LDA model on top of that
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
lda = models.LdaModel(corpus_tfidf, id2word = dic, num_topics = self.num_topics,iterations=self.num_of_iterations,passes = self.passes)
corpus_lda = lda[corpus_tfidf]
# Once done training, print all the topics and related words
print 'Finished training LDA model.......Here is the list of all topics & their most frequent words'
for i in range(self.num_topics):
print 'Topic %s : ' % (str(i)) + lda.print_topic(i)
# Exhibit perplexity of current model under specific topic hyperparameter : k. The lower the better
print '==============================='
print 'Model perplexity : ',lda.bound(corpus_lda),' when topic k =', str(self.num_topics)
print '==============================='
return lda,doc_mapping,link_mapping,corpus
# NOTE(review): removed web-page navigation residue that was pasted into the
# source ("评论列表" = comment list, "文章目录" = article table of contents);
# it was not code and made the file a syntax error.