def get_corpus_of_most_active_users(n_users=5):
    """Build one text document per user for the most frequently seen users.

    Reads the file at DATASET_PATH one line at a time, parsing each
    non-blank line as a JSON object and reading its ``['user']['screen_name']``
    and ``['text']`` fields. Users are ranked by how many lines carry their
    screen name, and the texts of each of the top ``n_users`` users are
    concatenated (in file order, separated by a single space).

    Args:
        n_users: Maximum number of users to return.

    Returns:
        A ``(corpus, user_names)`` tuple of two parallel lists ordered from
        most to least frequent user: ``corpus[i]`` is the concatenated text
        of ``user_names[i]``.

    Raises:
        OSError: If the dataset file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks the ``user``/``screen_name``/``text`` keys.
    """
    screen_names = []
    texts_by_user = {}
    # JSON text is UTF-8 by specification; do not depend on the locale default.
    with open(DATASET_PATH, encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            # Parse each record once instead of once per field.
            record = json.loads(line)
            name = record['user']['screen_name']
            screen_names.append(name)
            # Collect pieces in a list and join once below; repeatedly
            # re-joining the growing string is quadratic per user.
            texts_by_user.setdefault(name, []).append(record['text'])

    top_users = nltk.FreqDist(screen_names).most_common(n_users)
    user_names = [name for name, _ in top_users]
    corpus = [" ".join(texts_by_user[name]) for name in user_names]
    return corpus, user_names
评论列表
文章目录