from string import punctuation

from nltk import FreqDist
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# GetTweets, volumize, doc_code and cos are defined earlier in this post.

def test():
    gt = GetTweets()
    documents = gt.get_hashtag('ferguson', count=20)
    documents += gt.get_hashtag('police', count=21)
    # The last tweet fetched serves as the query document.
    print('Query:', documents[-1])

    tokenizer = RegexpTokenizer(r'\w+')
    vols = []
    for doc in documents:
        # Lowercase each token and drop stop words and punctuation.
        samples = []
        for token in tokenizer.tokenize(doc):
            word = token.lower()
            if word not in ENGLISH_STOP_WORDS and word not in punctuation:
                samples.append(word)
        # Turn the document's word frequencies into a fixed-length vector.
        vols.append(volumize(FreqDist(samples)))

    # Encode every document except the query, then the query itself.
    vectors = [doc_code(v) for v in vols[:-1]]
    query_vec = doc_code(vols[-1])

    # Rank documents by cosine similarity to the query and print the best match.
    sims = [cos(v, query_vec) for v in vectors]
    m = max(sims)
    print(m, documents[sims.index(m)])
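The snippet relies on helpers defined elsewhere in this post: GetTweets fetches tweets, volumize projects a FreqDist onto a fixed vocabulary vector, doc_code runs the learned encoder, and cos is cosine similarity. For readers following along in isolation, here is a minimal sketch of the two generic ones; the VOCAB list is an illustrative stand-in, not the post's actual vocabulary, and doc_code is not sketched because it depends on the trained model.

import numpy as np

# Hypothetical stand-ins; the real volumize/cos live elsewhere in this post.
VOCAB = ['police', 'ferguson', 'protest']  # assumed fixed vocabulary

def volumize(fdist):
    # Map a FreqDist onto a fixed-length frequency vector over VOCAB.
    return np.array([fdist[w] for w in VOCAB], dtype=float)

def cos(a, b):
    # Cosine similarity between two vectors; returns 0.0 for zero vectors.
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    return float(np.dot(a, b) / denom) if denom else 0.0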