tfidf.py 文件源码

python
阅读 24 收藏 0 点赞 0 评论 0

项目:DeepNews 作者: kabrapratik28 项目源码 文件源码
def train_tfidf_model(self, file_path='../../temp_results/corpus.txt'):
    """Train a TF-IDF model from a text corpus and persist it to disk.

    Every line of ``file_path`` is read as UTF-8 and stripped. A
    ``corpora.Dictionary`` is built from the whitespace-separated tokens of
    the second ``#|#``-delimited field of each line, then written to
    ``../../temp_results/tfidf_dictionary.txt`` (text form) and
    ``../../temp_results/tfidf_dictionary`` (native form). Bag-of-words
    vectors are then produced lazily, one per line, and used to train a
    ``models.TfidfModel`` that is saved to ``../../temp_results/tfidf_model``.

    Args:
        file_path: Path of the corpus file; each line must contain the
            ``#|#`` delimiter at least once.

    Raises:
        IndexError: If a line (including a blank one) has no ``#|#``
            delimiter, since the field after the delimiter is indexed
            directly.
    """
    # Read everything inside a context manager so the file handle is closed
    # even if decoding fails part-way through the file.
    with codecs.open(file_path, "r", "utf-8") as textfile:
        print("Reading and Processing Text File")
        first_lines = [line.strip() for line in textfile]

    print ("--------Building Corpora Dictionary---------------" )
    # Only the field after the first '#|#' contributes to the vocabulary.
    dictionary = corpora.Dictionary(
        line.split('#|#')[1].split() for line in first_lines
    )

    # Remove gaps in the id sequence before persisting the dictionary.
    dictionary.compactify()
    dictionary.save_as_text('../../temp_results/tfidf_dictionary.txt',sort_by_word=False)
    dictionary.save('../../temp_results/tfidf_dictionary')
    print("Dictionary Saved")

    print ("--Now Transforming to Bag of Words Vectors on the Fly--")

    class MyCorpus(object):
        """Re-iterable corpus yielding one bag-of-words vector per line."""

        def __iter__(self):
            for line in first_lines:
                # NOTE(review): the whole line is tokenized here, whereas the
                # dictionary above was built from the second '#|#' field
                # only. Presumably tokens missing from the dictionary are
                # simply ignored by doc2bow, but confirm that this mismatch
                # is intentional.
                yield dictionary.doc2bow(line.split())

    news_corpus = MyCorpus()
    print("Corpus Built...Now Starting Model Training")
    tfidf_model = models.TfidfModel(news_corpus)
    tfidf_model.save('../../temp_results/tfidf_model')
    print("Model Trained & Saved")
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号