scraper.py 文件源码-python代码片段

def load_overview_pages(players):
  """
  Hit the overview page and load gamelog_url_list for each of the players in the player dict.
  Maybe this should be in the webio submodule? I am leaving it here since it controls scraping program flow.
  :param players: player dict
  :return dict: player dict
  """
  # Helper function to guess which position a player plays from the overview table of stats.
  # Just grab the position from the most recent year in which it was defined, and return that.
  def quick_position_guess(overview_table):
    return overview_table.dropna(subset=['Pos'])['Pos'].iloc[-1]
  pbar = progressbar.ProgressBar(widgets=[progressbar.Percentage(), ' ', progressbar.Bar(), ' ', progressbar.ETA()])
  print 'Accessing and parsing overview pages...'
  for i, (bref_id, player_dict) in pbar(list(enumerate(players.items()))):
    overview_soup = getSoupFromURL(players[bref_id]['overview_url'])
    players[bref_id]['overview_url_content'] = overview_soup.text
    # the links to each year's game logs are in <li> tags, and the text contains 'Game Logs'
    # so we can use those to pull out our urls.
    for li in overview_soup.find_all('li'):
      if 'Game Logs' in li.getText():
        game_log_links = li.findAll('a')
        for game_log_link in game_log_links:
          players[bref_id]['gamelog_url_list'].append('http://www.basketball-reference.com' + game_log_link.get('href'))
    player_name = overview_soup.find('h1').text
    players[bref_id]['info']['name'] = player_name
    # Read (guess?) player's position
    overview_table = dfFromOverviewPage(overview_soup)
    if len(overview_table.dropna(subset=['Pos'])) > 0:
      players[bref_id]['info']['pos'] = quick_position_guess(overview_table)
    else:
      players[bref_id]['info']['pos'] = '?'  # this will only happen for chumps but by defining a value we should block exceptions
  return players