def load_overview_pages(players):
    """
    Hit the overview page of every player and record what is scraped from it.

    For each entry of the player dict this stores the page text under
    'overview_url_content', appends one absolute URL per game-log link to
    'gamelog_url_list', and writes the player's name and position into 'info'.
    Maybe this should be in the webio submodule? It stays here since it controls
    scraping program flow.

    :param players: player dict keyed by player id; every value must already hold
        an 'overview_url', a 'gamelog_url_list' list and an 'info' dict
    :return dict: the same player dict, updated in place
    """
    pbar = progressbar.ProgressBar(widgets=[
        progressbar.Percentage(), ' ', progressbar.Bar(), ' ', progressbar.ETA(),
    ])
    # Parenthesised single-argument form prints the same under Python 2 and 3.
    print('Accessing and parsing overview pages...')
    # list() gives the progress bar a sized sequence to measure progress against.
    for bref_id, player in pbar(list(players.items())):
        overview_soup = getSoupFromURL(player['overview_url'])
        player['overview_url_content'] = overview_soup.text

        # The links to each year's game logs sit in <li> tags whose text contains
        # 'Game Logs', so those items are where the urls are pulled from.
        for li in overview_soup.find_all('li'):
            if 'Game Logs' not in li.getText():
                continue
            for game_log_link in li.find_all('a'):
                player['gamelog_url_list'].append(
                    'http://www.basketball-reference.com' + game_log_link.get('href'))

        player['info']['name'] = overview_soup.find('h1').text

        # Guess the position from the overview stats table: take 'Pos' from the last
        # row in which it is defined (assumes rows are in chronological order, so
        # that is the most recent year -- TODO confirm against dfFromOverviewPage).
        overview_table = dfFromOverviewPage(overview_soup)
        rows_with_pos = overview_table.dropna(subset=['Pos'])
        if len(rows_with_pos) > 0:
            player['info']['pos'] = rows_with_pos['Pos'].iloc[-1]
        else:
            # No row has a position; a placeholder keeps later lookups from failing.
            player['info']['pos'] = '?'
    return players
# Comment list
# Table of contents