topics.py 文件源码

python
阅读 23 收藏 0 点赞 0 评论 0

项目:eea.corpus 作者: eea 项目源码 文件源码
def wordcloud_visualization(corpus, topics, num_docs=None, min_df=0.1,
                            ngrams=1, weighting='tf', max_df=0.7, mds='pcoa',
                            *args, **kwargs):
    """Render one word-cloud image per topic category of a topic model.

    A model is built with ``build_model`` and handed to ``prepare``; the
    resulting ``topic_info`` table is then split on its ``Category`` column.
    For each category the 20 rows with the highest ``Total`` are turned into
    a ``{Term: Total}`` mapping and drawn as a 600x300 word cloud on a white
    background using the bundled ZillaSlab font.

    ``topic_info`` looks like this (sample)::

             Category         Freq            Term        Total  loglift  logprob
        term
        478   Default   738.000000          specie   738.000000   1.0000   1.0000
        ...       ...          ...             ...          ...      ...      ...
        191   Topic10    25.344278           space   145.983738   1.8935  -5.0376
        190   Topic10    32.076070           green   193.201661   1.8488  -4.8020
        319   Topic10    12.129367          aspect    73.063725   1.8488  -5.7745

    :param corpus: forwarded unchanged to ``build_model``.
    :param topics: forwarded unchanged to ``build_model`` (presumably the
        number of topics -- confirm against ``build_model``).
    :param num_docs: forwarded unchanged to ``build_model``.
    :param min_df: forwarded unchanged to ``build_model``.
    :param ngrams: forwarded unchanged to ``build_model``.
    :param weighting: forwarded unchanged to ``build_model``.
    :param max_df: forwarded unchanged to ``build_model``.
    :param mds: forwarded to ``prepare`` as its ``mds`` keyword.
    :param args: accepted for call compatibility; ignored.
    :param kwargs: accepted for call compatibility; ignored.
    :returns: a list of ``(category_label, image_data)`` tuples, where
        ``image_data`` is the raw output of ``plt.savefig`` in its default
        format.
    """
    # Function-scope imports keep this block self-contained; the file's own
    # import section is not visible from here.
    import io
    import logging

    font = pkg_resources.resource_filename(__name__,
                                           "fonts/ZillaSlab-Medium.ttf")
    # Was a bare print(); keep the diagnostic but route it through logging
    # so it no longer pollutes stdout.
    logging.getLogger(__name__).debug("word cloud font: %s", font)

    model, doc_term_matrix, vectorizer = build_model(
        corpus, topics, num_docs, ngrams, weighting, min_df, max_df
    )
    prep_data = prepare(model.model, doc_term_matrix, vectorizer, mds=mds)
    ti = prep_data.topic_info

    # Group by the scalar column name so the labels are plain values that
    # can be compared against the column below (a one-element list may
    # produce tuple keys on newer pandas versions).
    topic_labels = ti.groupby('Category').groups.keys()

    # Named `images` rather than `topics` to avoid shadowing the parameter.
    images = []
    for label in topic_labels:
        top_terms = ti[ti.Category == label].sort_values(
            by='Total', ascending=False)[:20]
        frequencies = dict(zip(top_terms['Term'], top_terms['Total']))

        wc = wordcloud.WordCloud(font_path=font, width=600, height=300,
                                 background_color='white')
        wc.fit_words(frequencies)

        # Start from an empty figure for every topic; otherwise each image
        # is drawn on top of all the previous ones.
        plt.clf()
        plt.imshow(wc)
        plt.axis('off')

        # savefig emits binary image data, so it needs a bytes buffer.
        out = io.BytesIO()
        plt.savefig(out)
        images.append((label, out.getvalue()))

    return images
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号