utils.py 文件源码

python
阅读 30 收藏 0 点赞 0 评论 0

项目:SynThai 作者: KenjiroAI 项目源码 文件源码
def get_token_list(self, index):
        """Get list of (word, tag) pair"""

        if not self.word_delimiter or not self.tag_delimiter:
            return list()

        # Get content by index
        content = self.__corpus[index].content

        # Empty file
        if not content:
            return list()

        # Split each word by word delimiter
        token_list = content.split(self.word_delimiter)

        for idx, token in enumerate(token_list):
            # Empty or Spacebar
            if token == "" or token == constant.SPACEBAR:
                word = constant.SPACEBAR
                tag = constant.PAD_TAG_INDEX

            # Word
            else:
                # Split word and tag by tag delimiter
                datum = token.split(self.tag_delimiter)
                word = datum[0]
                tag = datum[-2]

                # Replace escape character to proper character
                word = word.replace(constant.ESCAPE_WORD_DELIMITER, self.word_delimiter)
                tag = tag.replace(constant.ESCAPE_TAG_DELIMITER, self.tag_delimiter)

            # Replace token with word and tag pair
            token_list[idx] = (word, tag)

        return token_list
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号