def inverse_indexing_once():
    # One pass over all paper metadata: build, per paper, a word -> weight
    # dictionary and store it under 'paperwords:<filehash>'.
    kv_paperwords = lambda filehash: KeyValueStore('paperwords:' + filehash)
    scopes = KeyValueStore.scopes('paper:*')

    from nltk.tokenize import TweetTokenizer
    tokenizer = TweetTokenizer()
    def make_dict(text, weight=1., prefix_weight=0.):
        """Tokenize text; weight each token and each of its proper prefixes."""
        if not text:
            return {}
        words = tokenizer.tokenize(text.lower().strip())
        result = {}
        for word in words:
            # Every proper prefix earns prefix_weight, so partial
            # (as-you-type) queries can still match.
            for i in range(1, len(word)):
                prefix = word[:i]
                if prefix not in result:
                    result[prefix] = 0.
                result[prefix] += prefix_weight
            # The full token earns the full weight.
            if word not in result:
                result[word] = 0.
            result[word] += weight
        return result
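    # Illustration (assuming the tokenizer splits on whitespace here):
    # make_dict("Deep Learning", weight=6., prefix_weight=0.06) yields
    # {'d': 0.06, 'de': 0.06, 'dee': 0.06, 'deep': 6.0,
    #  'l': 0.06, ..., 'learnin': 0.06, 'learning': 6.0}.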
    def merge_dict(dict1, dict2):
        """Union of two word-weight dicts; weights of shared words add up."""
        new_dict = {}
        for word in set(dict1.keys()).union(dict2.keys()):
            new_dict[word] = dict1.get(word, 0.) + dict2.get(word, 0.)
        return new_dict
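    # Illustration: merge_dict({'deep': 6.0}, {'deep': 2.0, 'net': 2.0})
    # returns {'deep': 8.0, 'net': 2.0}.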
    for scope in scopes:
        filehash = scope[len('paper:'):]
        meta = KeyValueStore(scope_name=scope)
        title = meta['title']
        abstract = meta.get('abstract', default='')
        # Title words weigh three times as much as abstract words; the
        # small prefix weights follow the same 3:1 ratio.
        dict_title = make_dict(title, weight=6., prefix_weight=0.06)
        dict_abstract = make_dict(abstract, weight=2., prefix_weight=0.02)
        final_dict = merge_dict(dict_title, dict_abstract)
        # Author names are indexed at the default weight of 1, without prefixes.
        authors = meta['authors']
        if authors:
            for author in authors:
                dict_author = make_dict(author['first_name'] + ' ' + author['last_name'])
                final_dict = merge_dict(dict_author, final_dict)
        kv_paperwords(filehash).update(final_dict)
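
The per-paper word dictionaries written above can then be scored at query time. Below is a minimal sketch of such a lookup, assuming the same KeyValueStore interface (scopes(), get() with a keyword default) used by inverse_indexing_once(); the function search_papers and its signature are illustrative, not part of the original code.

def search_papers(query, limit=10):
    # Hypothetical sketch: tokenize the query exactly as the indexer does,
    # then sum each paper's stored weights for the query tokens.
    from nltk.tokenize import TweetTokenizer
    tokens = TweetTokenizer().tokenize(query.lower().strip())
    scores = {}
    for scope in KeyValueStore.scopes('paperwords:*'):
        filehash = scope[len('paperwords:'):]
        words = KeyValueStore(scope_name=scope)
        # get(..., default=...) mirrors the keyword form used above; this
        # assumes KeyValueStore supports it (it is not a plain dict).
        score = sum(words.get(token, default=0.) for token in tokens)
        if score > 0.:
            scores[filehash] = score
    # Highest-scoring papers first.
    return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:limit]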