def dump_nba_data(outfile, start_date=None, end_date=None, max_count=None, use_random=False):
"""
Dump NBA statistical data to a file.
:param str outfile: name of file to become pickled pandas datafile
:param str start_date: don't include games from before this date when dumping data
:param str end_date: don't include games from after this date when dumping data
:param int max_count: maximum # of rows to dump
:param bool use_random: whether to select rows at random (if False, choose most recent)
:return:
"""
if start_date:
start_date = parser.parse(start_date)
else:
start_date = datetime.datetime(2010, 10, 1)
if end_date:
end_date = parser.parse(end_date)
else:
end_date = datetime.datetime.today()
print 'Dump NBA data for %s to %s' % (start_date, end_date)
print 'loading data...'
all_game_rows = load_all_game_data()
# Filter by date
if start_date is not None:
all_game_rows = all_game_rows[all_game_rows['date'] > start_date]
if end_date is not None:
all_game_rows = all_game_rows[all_game_rows['date'] < end_date]
# Sample filtered data
if max_count and max_count < len(all_game_rows):
print 'sampling %d rows...' % max_count
if use_random:
# We seed to 0 when we call this from CLI to make sure that random splits are replicable.
random.seed(0)
kept_indices = random.sample(all_game_rows.index, max_count)
selected = all_game_rows.loc[kept_indices]
else:
all_game_rows.sort("date")
selected = all_game_rows.tail(max_count)
else:
selected = all_game_rows
print 'saving...'
pandas.to_pickle(selected, outfile)
print 'Done!'
return selected
评论列表
文章目录