treetagger.py 文件源码-python代码片段

treetagger.py 文件源码

python

阅读 29 收藏 0 点赞 0 评论 0

项目：europarl 作者: chozelinek 项目源码文件源码

def escape(self, tags):
        """HTML-escape the items of *tags* that are not recognised as markup.

        Each string in *tags* is classified with regular expressions and
        handled as follows:

        * Matches ``<.+ >$`` (tag-like, with a space before the final ``>``):
          kept unchanged if ``etree.fromstring`` parses it. Otherwise the
          angle brackets are moved onto their own lines, the text is passed
          through ``self.tagger.tag_text(..., tagonly=True)`` and every
          returned item is HTML-escaped and added to the result.
        * Does not match ``<.+>$``: HTML-escaped.
        * Matches ``<rep... text="..."``: rebuilt as
          ``<rep... text="..."/>`` with only the ``text`` value HTML-escaped.
        * Matches ``[<>]`` followed by a tab: HTML-escaped.
        * Anything else: appended unchanged.

        Args:
            tags: Iterable of strings to process.

        Returns:
            A new list of strings. It can hold more items than *tags*,
            because the re-tagging fallback may yield several items for one
            input string.
        """
        output = []
        for tag in tags:
            if re.match(r'<.+ >$', tag):
                try:
                    etree.fromstring(tag)
                except Exception:
                    # The import that binds ``etree`` is not visible here, so
                    # the concrete parse-error class cannot be named.
                    # ``Exception`` still lets KeyboardInterrupt and
                    # SystemExit propagate, unlike a bare ``except``.
                    separated = re.sub(r'(<)(.+) (>)', r'\1\n\2\n\3', tag)
                    tokens = self.tagger.tag_text(
                        separated,
                        notagdns=True,
                        notagip=True,
                        notagurl=True,
                        notagemail=True,
                        tagonly=True,
                    )
                    output.extend(html.escape(token) for token in tokens)
                else:
                    # Parsed cleanly: keep the markup as it is.
                    output.append(tag)
                continue

            if not re.match(r'<.+>$', tag):
                output.append(html.escape(tag))
                continue

            rep = re.match(r'<rep(.+?) text="(.+)"', tag)
            if rep is not None:
                # Only the attribute value is escaped; the other attributes
                # captured in group 1 are copied through verbatim.
                output.append(
                    '<rep{} text="{}"/>'.format(
                        rep.group(1), html.escape(rep.group(2))
                    )
                )
            elif re.match(r'[<>]\t', tag):
                # An angle bracket directly followed by a tab is escaped
                # rather than kept as markup.
                output.append(html.escape(tag))
            else:
                output.append(tag)
        return output