svm_tfidf.py 文件源码

python
阅读 22 收藏 0 点赞 0 评论 0

项目:SinaWeiboSpider 作者: SuperSaiyanSSS 项目源码 文件源码
def reduce_tfidf(dictionary, weibo_test):
    """Build per-category TF-IDF vectors if needed, then run the LSI stage.

    When the directory ``path_tmp_tfidf`` does not exist yet, it is created
    and every entry yielded by ``os_path.LoadFiles(path_doc_root)`` is
    tokenised, converted to a bag-of-words with ``dictionary``, weighted by
    a ``TfidfModel`` and grouped by category.  Each category's vectors are
    then serialised to ``<path_tmp_tfidf>/<category>.mm``.

    When the directory already exists nothing is computed here and
    ``corpus_tfidf`` is passed on as ``None``.

    Args:
        dictionary: Token dictionary used for ``doc2bow`` and the TF-IDF
            model.  If falsy and TF-IDF has to be computed, it is loaded
            from ``path_dictionary``.
        weibo_test: Forwarded unchanged to ``svm_lsi.reduce_lsi``.

    Returns:
        None.  The result of ``svm_lsi.reduce_lsi`` is not returned.
    """
    corpus_tfidf = None
    if not os.path.exists(path_tmp_tfidf):
        print('=== ?????tfidf??????????tfidf?? ===')
        # Fall back to the dictionary stored on disk when none was supplied.
        if not dictionary:
            dictionary = corpora.Dictionary.load(path_dictionary)
        os.makedirs(path_tmp_tfidf)
        files = os_path.LoadFiles(path_doc_root)
        tfidf_model = models.TfidfModel(dictionary=dictionary)
        corpus_tfidf = {}
        for msg in files:
            # Each entry carries the category at index 0 and the document
            # at index 1.
            catg = msg[0]
            doc = msg[1]
            word_list = convert_doc_to_wordlist(doc, cut_all=False)
            file_bow = dictionary.doc2bow(word_list)
            file_tfidf = tfidf_model[file_bow]
            # Group the vectors by category, creating the list on first use.
            corpus_tfidf.setdefault(catg, []).append(file_tfidf)
        # Persist one Matrix Market file per category.
        for catg, vectors in corpus_tfidf.items():
            corpora.MmCorpus.serialize(
                '{f}{s}{c}.mm'.format(f=path_tmp_tfidf, s=os.sep, c=catg),
                vectors,
                id2word=dictionary
            )
            print('catg {c} has been transformed into tfidf vector'.format(c=catg))
        print('=== tfidf?????? ===')
    else:
        print('=== ???tfidf???????????? ===')

    svm_lsi.reduce_lsi(dictionary, corpus_tfidf, weibo_test)
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号