def tokenizeVal(sent):
    """Tokenize a sentence and locate each token inside the original string.

    The sentence is split with ``nltk.word_tokenize``. The tokenizer's
    Treebank-style quote tokens (two backticks or two apostrophes) are
    normalised to a plain double-quote character in the returned tokens.

    Args:
        sent: The sentence to tokenize.

    Returns:
        A ``(tokens, offsets)`` tuple. ``tokens`` is the list of token
        strings (punctuation included). ``offsets`` is a list of the same
        length whose i-th entry is the character index in ``sent`` where
        token i starts, or ``None`` when the token could not be located.

    >>> tokenizeVal('Bob dropped the apple. Where is the apple?')
    (['Bob', 'dropped', 'the', 'apple', '.', 'Where', 'is', 'the', 'apple', '?'], [0, 4, 12, 16, 21, 23, 29, 32, 36, 41])
    """
    tokenizedSent = [
        token.replace("``", '"').replace("''", '"')
        for token in nltk.word_tokenize(sent)
    ]
    tokenIdx2CharIdx = [None] * len(tokenizedSent)

    # A normalised quote token may have been written in any of these forms
    # in the original text, so all of them have to be accepted when aligning.
    quote_forms = ('"', "``", "''")

    idx = 0  # search cursor: first character not yet consumed by a token
    for token_idx, word in enumerate(tokenizedSent):
        candidates = quote_forms if word == '"' else (word,)

        # Pick the earliest occurrence of any candidate at or after the cursor.
        best_pos, best_len = -1, 0
        for form in candidates:
            pos = sent.find(form, idx)
            if pos != -1 and (best_pos == -1 or pos < best_pos):
                best_pos, best_len = pos, len(form)

        if best_pos == -1:
            # Token was altered by the tokenizer and cannot be found; leave
            # its offset as None and keep the cursor where it is so the
            # following tokens can still be aligned.
            continue

        tokenIdx2CharIdx[token_idx] = best_pos
        # Advance by the length actually matched in ``sent`` (a quote may
        # occupy two characters there even though the token is one).
        idx = best_pos + best_len

    return tokenizedSent, tokenIdx2CharIdx
QnARecurAtteLatest3Atten.py 文件源码
python
阅读 16
收藏 0
点赞 0
评论 0
评论列表
文章目录