general.py 文件源码

python
阅读 31 收藏 0 点赞 0 评论 0

项目:wiktionary-translations-parser 作者: elfxiong 项目源码 文件源码
def parse_translation_table(self, table):
        """
        Parse the table and yield one tuple per translation found in it.

        Every ``<li>`` under ``table`` is read as ``language name: translations``.
        Hopefully this function will work for most editions. Override this
        method if needed.

        :param table: a Tag object. Not necessarily a table; can be a div.
        :return: generator of (translation, language_name, language_code)
            tuples. ``language_code`` is "" when the ``<li>`` contains no
            ``<sup>`` element. Entries whose translation is empty after
            cleaning are not yielded.
        """
        for li in table.find_all('li'):
            if not isinstance(li, Tag):
                continue
            text = li.get_text().split(':')

            # TBD: the table is not a translation table
            #  OR the table is a translation table but there are some <li> without colon
            if len(text) < 2:
                continue

            # language name is before ":"
            lang_name = text[0].strip()

            # language code is in superscript; the first <sup> inside the <li>
            # is applied to every translation of that <li>
            sup = li.find("sup")
            lang_code = remove_all_punctuation(sup.text).strip() if sup else ""

            # only the text between the first and the second colon is used
            t = remove_parenthesis(text[1])

            # each "trans" is: translation <sup>(lang_code)</sup> (transliteration)
            # lang_code and transliteration may not exist
            for trans in re.split(COMMA_OR_SEMICOLON, t):
                # Keep only what precedes the first delimiter character.
                # NOTE(review): the "??" in this character class looks like a
                # non-ASCII character lost in transcoding (possibly a
                # full-width parenthesis) -- confirm against upstream. As
                # written, the text is cut at "(" or "?".
                translation = re.split(r'[(??]', trans)[0].strip()

                # An empty segment (trailing separator, nothing after the
                # colon, ...) carries no translation; do not emit it.
                if not translation:
                    continue

                # Throw out tuples if they have '[['
                if "[[" in translation:
                    continue

                yield (translation, lang_name, lang_code)
评论列表


问题


面经


文章

微信
公众号

扫码关注公众号