def get_words(tweets, *, min_word_len=7, top_n=100, min_count=5):
    """Return the most frequently-used long words across a set of tweets.

    Retweets (``t.is_rt``) are excluded. Each remaining tweet's text is run
    through ``handle_strip`` and tokenized with ``nltk.word_tokenize``; only
    tokens of at least *min_word_len* characters are kept, lowercased, and
    counted.

    Args:
        tweets: iterable of tweet objects exposing ``is_rt`` and
            ``tweet_text`` attributes (project type -- TODO confirm exact API).
        min_word_len: minimum token length to count (default 7, matching the
            original ``len(x) > 6`` filter).
        top_n: how many of the most common words to consider (default 100).
        min_count: minimum occurrence count to include a word (default 5,
            matching the original ``x[1] > 4`` filter).

    Returns:
        A list of ``[word, weight]`` pairs where ``weight = 6 + count // 3``
        (presumably a font-size scale for a tag cloud -- TODO confirm with
        the caller). Returned as a concrete list, not a lazy iterator: the
        original returned a Python 3 ``map`` object, which a caller could
        consume only once and could not ``len()`` or index.
    """
    originals = (t for t in tweets if not t.is_rt)
    # Flatten per-tweet token lists into one lowercased word stream.
    words = [
        token.lower()
        for t in originals
        for token in nltk.word_tokenize(handle_strip(t.tweet_text))
        if len(token) >= min_word_len
    ]
    fdist = nltk.FreqDist(words)
    return [
        [word, 6 + count // 3]
        for word, count in fdist.most_common(top_n)
        if count >= min_count
    ]
# NOTE(review): removed page-scrape artifacts ("comment list" / "article TOC"
# navigation text) that were not Python and broke the file's syntax.