gd_scrape.py 文件源码-python代码片段

gd_scrape.py 文件源码

python

阅读 24 收藏 0 点赞 0 评论 0

项目：pitchfx-data-scraper 作者: whazell 项目源码文件源码

def get_links(url):
    """
    Collect the game identifiers linked from a day-index page.

    The page is expected to look like:
    gd2.mlb.com/components/game/mlb/year/month/day/

    and to link to games whose names have the following format:

        gid_year_mm_dd_team1mlb_team2mlb

    url -- address passed to get_page() to fetch the index page.

    Returns False when get_page() returns False. Otherwise returns a list
    holding, for every quoted "gid_..." value found in an anchor tag, the
    text that follows the "gid_" prefix (the prefix itself is NOT included
    in the returned strings). The list is empty when nothing matches.
    """
    page = get_page(url)
    # NOTE(review): get_page presumably returns False on failure -- confirm
    # against its definition. An identity test is used so that only the
    # False singleton (not some other value that merely equals it, such as
    # 0) short-circuits here.
    if page is False:
        return False

    # Compiled once, outside the loop. Written as a raw string so the
    # double quotes need no escaping; the match must start right after an
    # opening quote and the lazy group stops at the next closing quote.
    game_id = re.compile(r'"gid_(.*?)"', re.IGNORECASE)

    # Restrict parsing to <a> tags that carry an href attribute.
    anchors = BeautifulSoup(
        page, "lxml", parse_only=SoupStrainer('a', href=True)
    )

    links = []
    for anchor in anchors:
        # The regex runs on the tag's serialized markup; findall returns
        # an empty list when there is no match, so extend is then a no-op.
        links.extend(game_id.findall(str(anchor)))

    return links