def inverse_indexing_once():
    # One pass over all paper metadata: build, per paper, a word -> weight
    # dictionary and store it under 'paperwords:<filehash>'.
    kv_paperwords = lambda filehash: KeyValueStore('paperwords:' + filehash)
    scopes = KeyValueStore.scopes('paper:*')

    from nltk.tokenize import TweetTokenizer
    tokenizer = TweetTokenizer()
    def make_dict(text, weight=1., prefix_weight=0.):
        """Tokenize text; weight each token and each of its proper prefixes."""
        if not text:
            return {}
        words = tokenizer.tokenize(text.lower().strip())
        result = {}
        for word in words:
            # Every proper prefix earns prefix_weight, so partial
            # (as-you-type) queries can still match.
            for i in range(1, len(word)):
                prefix = word[:i]
                if prefix not in result:
                    result[prefix] = 0.
                result[prefix] += prefix_weight
            # The full token earns the full weight.
            if word not in result:
                result[word] = 0.
            result[word] += weight
        return result
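    # Illustration (assuming the tokenizer splits on whitespace here):
    # make_dict("Deep Learning", weight=6., prefix_weight=0.06) yields
    # {'d': 0.06, 'de': 0.06, 'dee': 0.06, 'deep': 6.0,
    #  'l': 0.06, ..., 'learnin': 0.06, 'learning': 6.0}.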
    def merge_dict(dict1, dict2):
        """Union of two word-weight dicts; weights of shared words add up."""
        new_dict = {}
        for word in set(dict1.keys()).union(dict2.keys()):
            new_dict[word] = dict1.get(word, 0.) + dict2.get(word, 0.)
        return new_dict
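    # Illustration: merge_dict({'deep': 6.0}, {'deep': 2.0, 'net': 2.0})
    # returns {'deep': 8.0, 'net': 2.0}.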
    for scope in scopes:
        filehash = scope[len('paper:'):]
        meta = KeyValueStore(scope_name=scope)
        title = meta['title']
        abstract = meta.get('abstract', default='')
        # Title words weigh three times as much as abstract words; the
        # small prefix weights follow the same 3:1 ratio.
        dict_title = make_dict(title, weight=6., prefix_weight=0.06)
        dict_abstract = make_dict(abstract, weight=2., prefix_weight=0.02)
        final_dict = merge_dict(dict_title, dict_abstract)
        # Author names are indexed at the default weight of 1, without prefixes.
        authors = meta['authors']
        if authors:
            for author in authors:
                dict_author = make_dict(author['first_name'] + ' ' + author['last_name'])
                final_dict = merge_dict(dict_author, final_dict)
        kv_paperwords(filehash).update(final_dict)
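
The per-paper word dictionaries written above can then be scored at query time. Below is a minimal sketch of such a lookup, assuming the same KeyValueStore interface (scopes(), get() with a keyword default) used by inverse_indexing_once(); the function search_papers and its signature are illustrative, not part of the original code.

def search_papers(query, limit=10):
    # Hypothetical sketch: tokenize the query exactly as the indexer does,
    # then sum each paper's stored weights for the query tokens.
    from nltk.tokenize import TweetTokenizer
    tokens = TweetTokenizer().tokenize(query.lower().strip())
    scores = {}
    for scope in KeyValueStore.scopes('paperwords:*'):
        filehash = scope[len('paperwords:'):]
        words = KeyValueStore(scope_name=scope)
        # get(..., default=...) mirrors the keyword form used above; this
        # assumes KeyValueStore supports it (it is not a plain dict).
        score = sum(words.get(token, default=0.) for token in tokens)
        if score > 0.:
            scores[filehash] = score
    # Highest-scoring papers first.
    return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:limit]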