html_pbp.py 文件源码-python代码片段

html_pbp.py 文件源码

python

阅读 26 收藏 0 点赞 0 评论 0

项目：Hockey-Scraper 作者: HarryShomer 项目源码文件源码

def get_soup(game_html):
    """
    Parse the html document with Beautiful Soup and pull out the matching td cells.

    Some parsers work for some pages but don't work for others, so each parser is tried in
    order ("lxml", "html.parser", "html5lib") until one yields at least one matching cell.

    :param game_html: object whose ``.text`` attribute holds the html doc as a string

    :return: list of ``td`` tags whose class list contains both ``+`` and ``bborder``
             (empty if no parser finds any)
    """
    strainer = SoupStrainer('td', attrs={'class': re.compile(r'bborder')})
    wanted_classes = {'+', 'bborder'}

    def is_wanted_cell(tag):
        # Same test the legacy ``select('td.+.bborder')`` performed. ``+`` is a CSS combinator,
        # so that string is not a valid selector and newer Beautiful Soup releases (which hand
        # selectors to Soup Sieve) reject it. A plain predicate works on every version.
        return tag.name == 'td' and wanted_classes.issubset(tag.get('class', []))

    # The first two parsers only build the td cells picked out by the strainer. No strainer is
    # passed for html5lib, so that fallback parses the whole document.
    attempts = (
        ("lxml", {"parse_only": strainer}),
        ("html.parser", {"parse_only": strainer}),
        ("html5lib", {}),
    )

    cells = []
    for parser, parser_kwargs in attempts:
        parsed = BeautifulSoup(game_html.text, parser, **parser_kwargs)
        cells = list(parsed.find_all(is_wanted_cell))
        if cells:
            break

    return cells