from collections import Counter


def build_token_counts(characterizer, texts):
    # Tokenizer is assumed to be defined/imported elsewhere in this project.
    tokenizer = Tokenizer(characterizer=characterizer)
    tokenizer.train([t['text'] for t in texts])

    token_counts = Counter()
    for t in texts:
        t['tokens'] = tokenizer.tokenize(t['text'])
        if not t['tokens']:
            continue
        # Count shortened URLs by their human-readable display form.
        if t['entities'].get('urls'):
            # TODO: replace those urls instead of adding them
            for url in t['entities']['urls']:
                t['tokens'].append(url['display_url'])
        # For retweets, also count the retweeted user's @-mention.
        if t['__is_rt__']:
            t['tokens'].append('@{0}'.format(t['user']['screen_name']).lower())
        token_counts.update(t['tokens'])
    return token_counts
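
# --- A minimal usage sketch (not from the original post) ---
# The real Tokenizer and characterizer come from the surrounding project;
# the stub below is a hypothetical stand-in that only mimics the two
# methods build_token_counts calls (train and tokenize).
class Tokenizer:
    def __init__(self, characterizer=None):
        self.characterizer = characterizer

    def train(self, corpus):
        pass  # the real implementation would fit the tokenizer on the corpus

    def tokenize(self, text):
        # naive whitespace split, standing in for the project's tokenizer
        return text.lower().split()


sample_texts = [
    {'text': 'check this out',
     'entities': {'urls': [{'display_url': 'example.com/post'}]},
     '__is_rt__': False, 'user': {'screen_name': 'alice'}},
    {'text': 'RT great thread',
     'entities': {},
     '__is_rt__': True, 'user': {'screen_name': 'bob'}},
]
token_counts = build_token_counts(characterizer=None, texts=sample_texts)
print(token_counts.most_common(5))
# Expected: the URL's display form ('example.com/post') and the retweeted
# user's mention ('@bob') appear alongside the ordinary word tokens.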