from collections import Counter


def build_token_counts(characterizer, texts):
    # Tokenizer is assumed to be defined/imported elsewhere in this project.
    tokenizer = Tokenizer(characterizer=characterizer)
    tokenizer.train([t['text'] for t in texts])

    token_counts = Counter()
    for t in texts:
        t['tokens'] = tokenizer.tokenize(t['text'])
        if not t['tokens']:
            continue
        # Count shortened URLs by their human-readable display form.
        if t['entities'].get('urls'):
            # TODO: replace those urls instead of adding them
            for url in t['entities']['urls']:
                t['tokens'].append(url['display_url'])
        # For retweets, also count the retweeted user's @-mention.
        if t['__is_rt__']:
            t['tokens'].append('@{0}'.format(t['user']['screen_name']).lower())
        token_counts.update(t['tokens'])
    return token_counts
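
# --- A minimal usage sketch (not from the original post) ---
# The real Tokenizer and characterizer come from the surrounding project;
# the stub below is a hypothetical stand-in that only mimics the two
# methods build_token_counts calls (train and tokenize).
class Tokenizer:
    def __init__(self, characterizer=None):
        self.characterizer = characterizer

    def train(self, corpus):
        pass  # the real implementation would fit the tokenizer on the corpus

    def tokenize(self, text):
        # naive whitespace split, standing in for the project's tokenizer
        return text.lower().split()


sample_texts = [
    {'text': 'check this out',
     'entities': {'urls': [{'display_url': 'example.com/post'}]},
     '__is_rt__': False, 'user': {'screen_name': 'alice'}},
    {'text': 'RT great thread',
     'entities': {},
     '__is_rt__': True, 'user': {'screen_name': 'bob'}},
]
token_counts = build_token_counts(characterizer=None, texts=sample_texts)
print(token_counts.most_common(5))
# Expected: the URL's display form ('example.com/post') and the retweeted
# user's mention ('@bob') appear alongside the ordinary word tokens.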