def trainModel(self):
'''
Train a LDA model, inclusive of 4 steps:
1. Parse the whole corpora into unigram token collections and document mapping (for later use)
2. Filter tokens which are not common (no_below_this_number), and too common (no_above_fraction_of_doc)
3. Indexing the token collections and do TF-IDF transformation
4. Call gensim.models.LdaModel and generate topic distributions of the corpora
'''
print 'Start preparing unigram tokens....'
## Start of preparing list of documents and tokens [[words_in_1st_doc],[words_in_2nd_doc]....], which comprise Bag-Of-Words (BOW)
# Get document_count, tokens, and document-index mapping from the corpora
doc_count,train_set,doc_mapping,link_mapping = self.__tokenizeWholeCorpora(path_corpora)
# Put the training data into gensim.corpora for later use
dic = corpora.Dictionary(train_set)
denominator = len(dic)
# Filtering infrequent words & common stopwords, thus reducing the dimension of terms (which prevents curse of dimensionality)
dic.filter_extremes(no_below=self.no_below_this_number, no_above=self.no_above_fraction_of_doc)
nominator = len(dic)
corpus = [dic.doc2bow(text) for text in train_set] # transform every token into BOW
print 'There are %i documents in the pool' % (doc_count)
print "In the corpus there are ", denominator, " raw tokens"
print "After filtering, in the corpus there are", nominator, "unique tokens, reduced ", (1-(nominator/denominator)),"%"
print 'Finished preparing unigram tokens....'
##END
print 'Start training LDA model....'
## Implementing TF-IDF as a vector for each document, and train LDA model on top of that
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
lda = models.LdaModel(corpus_tfidf, id2word = dic, num_topics = self.num_topics,iterations=self.num_of_iterations,passes = self.passes)
corpus_lda = lda[corpus_tfidf]
# Once done training, print all the topics and related words
print 'Finished training LDA model.......Here is the list of all topics & their most frequent words'
for i in range(self.num_topics):
print 'Topic %s : ' % (str(i)) + lda.print_topic(i)
# Exhibit perplexity of current model under specific topic hyperparameter : k. The lower the better
print '==============================='
print 'Model perplexity : ',lda.bound(corpus_lda),' when topic k =', str(self.num_topics)
print '==============================='
return lda,doc_mapping,link_mapping,corpus
# NOTE(review): removed web-page navigation residue that was pasted into the
# source ("评论列表" = comment list, "文章目录" = article table of contents);
# it was not code and made the file a syntax error.