def rank_by_inverted_words(raw_query, filehashes=None):
    """Rank file hashes by how well their stored words match a query.

    The query is split into tokens with NLTK's ``TweetTokenizer``. For each
    file hash, the ``KeyValueStore`` scoped as ``'paperwords:' + filehash``
    is queried once per token, and the returned values are summed into that
    file's score (a token with no entry contributes ``0.``). A token that
    occurs several times in the query is counted once per occurrence.

    Args:
        raw_query: The query text to tokenize.
        filehashes: Iterable of file hashes to score. When it is ``None`` or
            empty, every scope matching ``'paper:*'`` is scored instead.

    Returns:
        A list of the scored file hashes, ordered by descending score.

    Side effects:
        Prints the filehash -> score mapping to stdout.
    """
    from nltk.tokenize import TweetTokenizer

    keywords = TweetTokenizer().tokenize(raw_query)

    if not filehashes:
        # Retrieve all from db. Complexity warning: this scores every
        # 'paper:*' scope the store reports.
        # NOTE(review): assumes each returned scope name starts with
        # 'paper:' so that slicing off the prefix leaves the bare file
        # hash -- confirm against KeyValueStore.scopes.
        prefix_len = len('paper:')
        filehashes = [
            scope[prefix_len:] for scope in KeyValueStore.scopes('paper:*')
        ]

    score_by_filehash = {}
    for filehash in filehashes:
        # Presumably a per-paper word -> weight mapping; verify against
        # whatever populates the 'paperwords:' scopes.
        word_dict = KeyValueStore('paperwords:' + filehash)
        score = 0.
        for word in keywords:
            score += word_dict.get(word, default=0.)
        score_by_filehash[filehash] = score

    # Call form with a single argument prints the same text on Python 2 and
    # Python 3; the bare print statement is a syntax error on Python 3.
    print(score_by_filehash)
    return sorted(score_by_filehash, key=score_by_filehash.get, reverse=True)
评论列表
文章目录