word_cluster.py 文件源码-python代码片段

word_cluster.py 文件源码

python

阅读 38 收藏 0 点赞 0 评论 0

项目：PolBotCheck 作者: codeforfrankfurt 项目源码文件源码

def get_corpus_of_most_active_users(n_users=5):
    """Build one concatenated text document per most-active user.

    Reads ``DATASET_PATH`` line by line, decoding each non-blank line as a
    JSON object and using its ``['user']['screen_name']`` and ``['text']``
    entries. Users are ranked by how many lines carry their screen name.

    Args:
        n_users: Number of top users to keep; passed straight to
            ``FreqDist.most_common``.

    Returns:
        A ``(corpus, user_names)`` tuple of two parallel lists:
        ``corpus[i]`` is every text of ``user_names[i]`` joined with single
        spaces in file order, and ``user_names`` is ordered from most to
        least active.

    Raises:
        ValueError: If a non-blank line is not valid JSON (raised by
            ``json.loads``).
        KeyError: If a decoded object lacks ``user``, ``screen_name`` or
            ``text``.
    """
    screen_names = []
    texts_by_user = {}
    with open(DATASET_PATH) as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of crashing inside json.loads.
            if not line.strip():
                continue
            # Decode each line exactly once; decoding dominates the loop cost.
            tweet = json.loads(line)
            screen_name = tweet['user']['screen_name']
            screen_names.append(screen_name)
            # Collect pieces in a list and join later, so building a user's
            # document stays linear in the number of their tweets.
            texts_by_user.setdefault(screen_name, []).append(tweet['text'])

    top_users = nltk.FreqDist(screen_names).most_common(n_users)

    # Only the selected users' texts are ever joined.
    user_names = [name for name, _ in top_users]
    corpus = [" ".join(texts_by_user[name]) for name in user_names]
    return corpus, user_names