def parse_translation_table(self, table):
    """
    Parse the table to get translations and the languages.

    Hopefully this function will work for most editions. Override this
    method if needed.

    Each ``<li>`` found under ``table`` is expected to look like
    ``language name: translation <sup>(lang_code)</sup> (transliteration), ...``
    where the language code and the transliteration may be missing.
    ``<li>`` items without a colon are skipped.

    :param table: a Tag object. Not necessarily a table; can be a div.
    :return: a generator yielding (translation, language_name, language_code)
        tuples; language_code is "" when the ``<li>`` contains no ``<sup>``.
    """
    for li in table.find_all('li'):
        if not isinstance(li, Tag):
            continue
        text = li.get_text().split(':')
        # TBD: the table is not a translation table
        # OR the table is a translation table but there are some <li>
        # without colon
        if len(text) < 2:
            continue
        # language name is before ":"
        lang_name = text[0].strip()
        # language code is in superscript
        sup = li.find("sup")
        if sup:
            lang_code = remove_all_punctuation(sup.text).strip()
        else:
            lang_code = ""
        # NOTE(review): remove_parenthesis is defined elsewhere; presumably it
        # drops parenthesised parts such as "(lang_code)" -- confirm.
        t = remove_parenthesis(text[1])
        trans_list = re.split(COMMA_OR_SEMICOLON, t)
        # each "trans" is: translation <sup>(lang_code)</sup> (transliteration)
        # lang_code and transliteration may not exist
        for trans in trans_list:
            # Keep only what precedes the first opening parenthesis, ASCII "("
            # or full-width U+FF08. The full-width one is written as an escape
            # so the pattern survives encoding round-trips: it had previously
            # been corrupted into "??", which cut translations at every "?".
            translation = re.split(r'[(\uff08]', trans)[0].strip()
            # Throw out tuples if they have '[['
            if "[[" in translation:
                continue
            yield (translation, lang_name, lang_code)
general.py 文件源码
python
阅读 31
收藏 0
点赞 0
评论 0
评论列表
文章目录