def split_data(infile, train, test, attrfile, na_strategy, trainpct, split_randomly):
    """Split a pickled dataset into a train pickle and a test pickle.

    The object unpickled from ``infile`` is first passed through
    ``strip_and_process_na`` together with ``attrfile`` and ``na_strategy``
    (that helper is defined elsewhere; it is presumably NA handling on a
    DataFrame -- confirm against its definition). The result is then divided
    by index label: the selected labels form the train set and every
    remaining row forms the test set.

    Args:
        infile: Path of the pickled input data.
        train: Destination path for the pickled train subset.
        test: Destination path for the pickled test subset.
        attrfile: Forwarded unchanged to ``strip_and_process_na``.
        na_strategy: Forwarded unchanged to ``strip_and_process_na``.
        trainpct: Percentage (0-100) of rows to place in the train set.
            The resulting row count is truncated to an int and clamped to
            the number of available rows.
        split_randomly: If truthy, train rows are drawn at random without
            replacement; otherwise the rows are ordered by the "Date"
            column and the first ``trainpct`` percent are used for training.
    """
    # NOTE: unpickling can execute arbitrary code; only pass trusted files.
    expanded_data = strip_and_process_na(pd.read_pickle(infile), attrfile, na_strategy)

    total_rows = len(expanded_data.index)
    train_example_count = int(total_rows * trainpct / 100.0)
    # Clamp so that sampling without replacement can never be asked for more
    # rows than exist (or for a negative number of rows).
    train_example_count = max(0, min(train_example_count, total_rows))

    if split_randomly:
        # replace=False: the default (with replacement) would put duplicate
        # rows in the train set and leave the test set larger than intended.
        train_indices = np.random.choice(
            expanded_data.index, size=train_example_count, replace=False
        )
    else:
        # DataFrame.sort() was removed from pandas; sort_values() replaces it.
        train_indices = expanded_data.sort_values("Date").index[:train_example_count]

    # .ix was removed from pandas; the selection here is label-based.
    train_data = expanded_data.loc[train_indices]
    test_data = expanded_data.drop(train_indices)

    pd.to_pickle(train_data, train)
    pd.to_pickle(test_data, test)
评论列表
文章目录