proceedings_xml.py 文件源码

python
阅读 23 收藏 0 点赞 0 评论 0

项目:europarl 作者: chozelinek 项目源码 文件源码
def get_role(self, intervention):
        """Extract the speaker-role annotation from an intervention element.

        Searches ``intervention`` for ``<span class="italic">`` descendants
        that have a text node consisting only of one of the role names in
        ``self.loc['roles']``, optionally followed by a two-capital-letter
        code in parentheses, surrounded by whitespace / dash / dot noise.
        Every matching span is removed from the tree with ``drop_tree()``.

        Args:
            intervention: element exposing ``xpath(expr, namespaces=...)``;
                the matched spans must provide ``text`` and ``drop_tree()``
                (presumably ``lxml.html`` elements -- confirm with caller).

        Returns:
            A ``(role, lang)`` tuple. ``role`` is the cleaned role text, or
            ``None`` when no span matched or none of the matched spans had
            leading text. ``lang`` is the entry of ``self.langs`` found in
            the role text (the last one found when several spans carry
            one), or ``None``.

        NOTE(review): the clean-up patterns use ``\\p{Lu}`` / ``\\p{Ll}``,
        which the standard-library ``re`` rejects; this presumably relies on
        the third-party ``regex`` module being imported as ``re`` -- confirm.
        NOTE(review): ``self.ns`` must bind the ``re:`` prefix used in the
        XPath (presumably the EXSLT regular-expressions namespace).
        """
        # Backslashes are doubled so the literal has no invalid escape
        # sequences; the resulting string is identical to the previous one
        # (``\xad`` is still a real soft-hyphen character).
        role_xpath = (
            './/span[@class="italic"][text()[re:test(.,'
            '"^[\\s\xad\\-–?—\\.]*(?:{})[\\s\xad\\-–?\\.]*'
            '(?:\\([A-Z][A-Z]\\))?[\\s\xad\\-–?—\\.]*$", "m")]]'
        ).format('|'.join(self.loc['roles']))
        roles = intervention.xpath(role_xpath, namespaces=self.ns)
        if not roles:
            return None, None

        lang_pattern = r'.*({}).*'.format('|'.join(self.langs))
        texts = []
        i_lang = None
        for role in roles:
            is_plain_string = isinstance(role, str)
            text = role if is_plain_string else role.text
            if text is not None:
                texts.append(text)
                lang = re.match(lang_pattern, text)
                # Keep a language that was already found: a later role span
                # without a code must not erase it.
                if lang is not None:
                    i_lang = lang.group(1)
            # Plain string results have no tree node to remove.
            if not is_plain_string:
                role.drop_tree()

        if not texts:
            return None, i_lang

        output = " ".join(texts)
        # Collapse line breaks and runs of spaces.
        output = re.sub(r'\n', r' ', output)
        output = re.sub(r' +', r' ', output)
        # Drop parenthesised upper-case codes.
        output = re.sub(r'\([\p{Lu}\&/\-–]+\)', r'', output)
        # Strip punctuation noise after a trailing lower-case letter, then
        # at the start and at the end of the string.
        output = re.sub(r'(\p{Ll})[\s\.\xad–\-?—,\)]+\Z', r'\1', output)
        output = re.sub(r'\A[\xad\s\.—–\-?,\)\(]+', r'', output)
        output = re.sub(r'[\xad\s\.—–\-?,\)]+\Z', r'', output)
        return output, i_lang
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号