def map_chars_to_tokens(doc):
    """
    Build a lookup array mapping each character offset to a token index.

    For instance, given the text "Nuclear theory ..." the result starts as

        text:    N u c l e a r _ t h e o r y _ . . .
        result:  0 0 0 0 0 0 0 1 1 1 1 1 1 1 2 . . .

    i.e. the first 7 chars map to the first token ("Nuclear"), the next
    7 chars (including the whitespace that precedes the token) map to the
    second token ("theory"), and so on.

    Characters that follow the last token (trailing whitespace) map to the
    last token. The array has one extra final element, a sentinel holding
    "last token index + 1", so that an exclusive end offset equal to the
    text length can also be looked up. For an input with no tokens the
    sentinel is 0.

    Args:
        doc: an iterable of tokens that also exposes ``text_with_ws``
            (presumably a spaCy ``Doc`` -- confirm against callers). Each
            token must provide ``idx`` (start character offset), ``i``
            (token index) and ``len()`` (length in characters).

    Returns:
        A 1-D integer numpy array of length ``len(doc.text_with_ws) + 1``.
    """
    n_chars = len(doc.text_with_ws)
    char2token = np.zeros(n_chars + 1, 'int')
    start_char = 0
    last_token_index = -1
    for token in doc:
        end_char = token.idx + len(token)
        # The slice begins at the previous token's end, so any whitespace
        # in front of this token is attributed to this token.
        char2token[start_char:end_char] = token.i
        start_char = end_char
        last_token_index = token.i
    if last_token_index >= 0:
        # Trailing whitespace has no following token; attribute it to the
        # last one instead of leaving the zero-initialised default.
        char2token[start_char:n_chars] = last_token_index
    # Sentinel derived from the tracked index rather than char2token[-2],
    # which does not exist when the text is empty.
    char2token[-1] = last_token_index + 1
    return char2token
评论列表
文章目录