def get_role(self, intervention):
    """Extract the speaker's role text and language tag from an intervention.

    Searches *intervention* for ``<span class="italic">`` descendants that
    have a text node consisting of one of the role names in
    ``self.loc['roles']``, optionally followed by a two-capital-letter code
    in parentheses, with only whitespace / soft-hyphen / dash / ``?`` /
    period characters around it. Every matching element is removed from the
    tree with ``drop_tree()``.

    Args:
        intervention: Object exposing ``xpath(query, namespaces=...)``
            (presumably an lxml HTML element -- confirm against callers).

    Returns:
        A ``(role, language)`` tuple. Both are ``None`` when no span
        matches. Otherwise ``role`` is the space-joined, cleaned-up text of
        the matches and ``language`` is the ``self.langs`` entry found in
        the text of the *last* match (``None`` if that match has none).
    """
    # Doubled backslashes keep the runtime string identical to the historic
    # literal while avoiding invalid escape sequences; ``\xad`` is on purpose
    # a real escape (soft hyphen).
    query = (
        './/span[@class="italic"][text()[re:test(.,'
        '"^[\\s\xad\\-–?—\\.]*'
        '(?:{})'
        '[\\s\xad\\-–?\\.]*'
        '(?:\\([A-Z][A-Z]\\))?'
        '[\\s\xad\\-–?—\\.]*$"'
        ', "m")]]'
    ).format('|'.join(self.loc['roles']))
    roles = intervention.xpath(query, namespaces=self.ns)
    if not roles:
        return None, None

    lang_re = re.compile(r'.*({}).*'.format('|'.join(self.langs)))
    parts = []
    i_lang = None
    for role in roles:
        # XPath may hand back plain/smart strings instead of elements; those
        # have neither ``.text`` nor ``drop_tree()``.
        is_string = isinstance(role, str)
        text = role if is_string else role.text
        if text is not None:
            parts.append(text)
            lang = lang_re.match(text)
        else:
            # Element without leading text: nothing to collect or inspect.
            lang = None
        # NOTE(review): each match overwrites the previous result, so the
        # last span decides the language -- confirm this is intended.
        i_lang = lang.group(1) if lang is not None else None
        if not is_string:
            role.drop_tree()

    # The \p{Lu} / \p{Ll} classes below are not understood by the stdlib
    # ``re`` module; presumably ``re`` is bound to the third-party ``regex``
    # package in this file -- TODO confirm.
    cleanups = (
        (r'\n', r' '),                                   # newlines -> spaces
        (r' +', r' '),                                   # collapse space runs
        (r'\([\p{Lu}\&/\-–]+\)', r''),                   # drop "(XX)"-style codes
        (r'(\p{Ll})[\s\.\xad–\-?—,\)]+\Z', r'\1'),       # trailing noise after a lowercase letter
        (r'\A[\xad\s\.—–\-?,\)\(]+', r''),               # leading noise
        (r'[\xad\s\.—–\-?,\)]+\Z', r''),                 # remaining trailing noise
    )
    output = " ".join(parts)
    for pattern, replacement in cleanups:
        output = re.sub(pattern, replacement, output)
    return output, i_lang
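# (web-page residue, not part of the program) Comment list
# (web-page residue, not part of the program) Article table of contents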