import re
from nltk.tokenize import sent_tokenize, word_tokenize

def offset_tokenize(text):
    """Tokenize text and return each token with its (start, end) character offsets."""
    tail = text
    accum = 0
    tokens = [word for sent in sent_tokenize(text) for word in word_tokenize(sent)]
    info_tokens = []
    for tok in tokens:
        # escape the token so regex metacharacters in it are matched literally;
        # this assumes the token appears verbatim in the text (word_tokenize
        # rewrites some tokens, e.g. double quotes, which would break the search)
        escaped_tok = re.escape(tok)
        m = re.search(escaped_tok, tail)
        start, end = m.span()
        # convert the match position into global offsets in the original text
        gs = accum + start
        ge = accum + end
        accum += end
        # keep searching in the remainder of the string
        tail = tail[end:]
        info_tokens.append((tok, (gs, ge)))
    return info_tokens
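
A quick sanity check (assuming NLTK and its punkt tokenizer data are installed):

print(offset_tokenize("Hello world. Bye!"))
# [('Hello', (0, 5)), ('world', (6, 11)), ('.', (11, 12)), ('Bye', (13, 16)), ('!', (16, 17))]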