def get_token_list(self, index):
    """Return the corpus entry at ``index`` as a list of (word, tag) pairs.

    The entry's content is split on ``self.word_delimiter`` into tokens, and
    each token is split on ``self.tag_delimiter`` into fields. The first
    field becomes the word and the second-to-last field becomes the tag;
    escaped delimiters inside both are then restored. Empty tokens and tokens
    equal to ``constant.SPACEBAR`` become the padding pair
    ``(constant.SPACEBAR, constant.PAD_TAG_INDEX)``.

    Args:
        index: Index used to look up the entry in the internal corpus
            container.

    Returns:
        A new list of ``(word, tag)`` tuples, one per token, or an empty
        list when either delimiter is unset or the entry has no content.

    Raises:
        IndexError: If a token has fewer than two tag-delimited fields, so
            no tag can be extracted from it. Any exception raised while
            looking up ``index`` in the corpus propagates unchanged.
    """
    word_delimiter = self.word_delimiter
    tag_delimiter = self.tag_delimiter

    # Without both delimiters the content cannot be tokenized at all.
    if not word_delimiter or not tag_delimiter:
        return []

    content = self.__corpus[index].content

    # Empty file
    if not content:
        return []

    pairs = []
    for position, token in enumerate(content.split(word_delimiter)):
        # Empty or spacebar tokens are emitted as the padding pair, as-is.
        if token == "" or token == constant.SPACEBAR:
            pairs.append((constant.SPACEBAR, constant.PAD_TAG_INDEX))
            continue

        fields = token.split(tag_delimiter)
        if len(fields) < 2:
            # Same exception type the bare index lookup would raise, but
            # with enough context to locate the malformed token.
            raise IndexError(
                "token {!r} at position {} of corpus entry {!r} has fewer "
                "than two fields separated by {!r}".format(
                    token, position, index, tag_delimiter
                )
            )

        # NOTE(review): the tag is the second-to-last field, so for a
        # two-field token it is the same field as the word. Presumably
        # tokens carry an extra trailing field - confirm against the
        # corpus format.
        word = fields[0]
        tag = fields[-2]

        # Replace escape sequences with the delimiter they stand for.
        # NOTE(review): done only for parsed tokens because the padding tag
        # may not be a string - confirm.
        word = word.replace(constant.ESCAPE_WORD_DELIMITER, word_delimiter)
        tag = tag.replace(constant.ESCAPE_TAG_DELIMITER, tag_delimiter)

        pairs.append((word, tag))

    return pairs
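# Comment list (web page residue, not part of the code)
# Article table of contents (web page residue, not part of the code)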