def get_language(self, s_intervention, p, i_lang, new_paragraphs):
    """Work out the language of paragraph ``p`` and strip its language marker.

    The language is resolved in this order:

    1. An italic ``<span>`` inside ``p`` whose text starts with one of the
       codes in ``self.langs`` (skipped when that text matches
       ``self.explanations_of_vote``). All matching spans are removed
       from ``p`` in place.
    2. A parenthesised code such as ``(DE)`` anywhere in the serialised
       paragraph. Every such marker, plus trailing spaces, is removed.
    3. The ``'language'`` of the last entry of ``new_paragraphs``.
    4. When ``new_paragraphs`` is empty: ``'unknown'`` if
       ``s_intervention['role']`` fully matches one of the
       ``self.loc['president']`` patterns; otherwise ``i_lang`` or, when
       that is ``None``, ``self.language`` upper-cased.

    Args:
        s_intervention: Mapping describing the intervention; only the
            optional ``'role'`` key is read here.
        p: Paragraph element (presumably an ``lxml.html`` element; it must
            support ``xpath`` and be serialisable by ``html.tostring``).
        i_lang: Language already known for the intervention, or ``None``.
        new_paragraphs: Previously processed paragraphs, each a mapping
            with a ``'language'`` key.

    Returns:
        A ``(language, paragraph)`` tuple. In case 1 the paragraph is the
        same (mutated) element; otherwise it is a new element re-parsed
        from the serialised markup.
    """
    langs_alt = '|'.join(self.langs)

    # Backslashes meant for the regex engine are doubled so the runtime
    # string is unchanged while avoiding invalid Python escape sequences;
    # "\xad" is deliberately left as a real soft-hyphen character.
    italic_xpath = (
        './/span[@class="italic"][text()[re:test(.,'
        '"^[\xad\\s\\.—–\\-?,\\(]*({})[\xad\\s\\.—–\\-?,\\)]*")]]'
    ).format(langs_alt)
    spans = p.xpath(italic_xpath, namespaces=self.ns)

    if spans:
        # ``text`` is None when the span starts with a child element.
        span_text = spans[0].text or ''
        if not self.explanations_of_vote.match(span_text):
            # DOTALL: the XPath filter tolerates leading whitespace,
            # including line breaks, which a plain "." would not cross.
            marker = re.match(
                r'.*({}).*'.format(langs_alt), span_text, re.DOTALL)
            if marker is not None:
                for span in spans:
                    span.drop_tree()
                return marker.group(1), p
            # No code in the first text node: fall through to the
            # textual lookup below instead of crashing.

    markup = html.tostring(p, with_tail=True, encoding='utf-8').decode('utf-8')
    inline = re.search(r'\(({})\)'.format(langs_alt), markup)
    if inline is not None:
        output = inline.group(1)
        markup = re.sub(r'\(({})\) *'.format(langs_alt), r'', markup)
    elif new_paragraphs:
        # No marker of its own: inherit from the preceding paragraph.
        output = new_paragraphs[-1]['language']
    else:
        is_president = False
        if 'role' in s_intervention:
            # Group the alternatives so that \Z anchors every title, not
            # just the last one in the list.
            president_pattern = r'(?:{})\Z'.format(
                '|'.join(self.loc['president']))
            is_president = re.match(
                president_pattern, s_intervention['role']) is not None
        if is_president:
            output = 'unknown'
        elif i_lang is None:
            output = self.language.upper()
        else:
            output = i_lang

    return output, html.fromstring(markup)
评论列表
文章目录