def tokenize_texts(texts, words):
    """Convert raw texts into lists of vocabulary word IDs.

    Each text is lower-cased, lightly normalised (newlines/tabs to spaces;
    the contractions ``'s``/``'ll`` and the characters ``-``/``.`` padded
    with spaces so they become separate tokens), tokenized with NLTK's
    ``TweetTokenizer``, and each token mapped through ``words``.

    :param texts: iterable of strings to tokenize
    :param words: dict mapping token -> integer ID
    :return: list of lists of integer IDs, one inner list per input text
    """
    # The tokenizer's configuration is loop-invariant; building a new
    # instance per text (as before) was wasted work — construct it once.
    tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True)
    results = []
    for text in texts:
        t = text.lower().strip()
        t = t.replace('\n', ' ').replace('\t', ' ')
        # Pad contractions and punctuation so they tokenize separately,
        # matching how the vocabulary was presumably built — TODO confirm.
        t = t.replace("'s", " 's ")
        t = t.replace("'ll", " 'll ")
        t = t.replace('-', ' - ')
        t = t.replace('.', ' . ')
        tokens = tokenizer.tokenize(t)
        # Unknown tokens fall back to ID 0 (presumably the <UNK>/<PAD>
        # slot of the vocabulary — verify against how `words` is built).
        results.append([words.get(w, 0) for w in tokens])
    return results
# NOTE(review): web-scrape artifacts removed from code path (original text:
# "评论列表" [comment list], "文章目录" [article table of contents]) — as bare
# lines they were invalid Python.