split.py 文件源码-python代码片段

split.py 文件源码

python

阅读 38 收藏 0 点赞 0 评论 0

项目：sportsball 作者: jgershen 项目源码文件源码

def split_data(infile, train, test, attrfile, na_strategy, trainpct, split_randomly):
  """Split a pickled DataFrame into train and test sets and pickle both.

  The frame is read from ``infile``, passed through ``strip_and_process_na``
  together with ``attrfile`` and ``na_strategy``, and then partitioned by
  index label: ``trainpct`` percent of the rows go to the training set and
  the remaining rows to the test set.

  Args:
    infile: Path of the pickled input DataFrame.
    train: Destination path for the pickled training set.
    test: Destination path for the pickled test set.
    attrfile: Forwarded unchanged to ``strip_and_process_na``.
    na_strategy: Forwarded unchanged to ``strip_and_process_na``.
    trainpct: Percentage (0-100) of rows assigned to the training set.
      Values outside that range are clamped so the row count stays valid.
    split_randomly: If true, training rows are drawn uniformly at random
      without replacement; otherwise the rows with the smallest "Date"
      values are used, which requires a "Date" column in the processed data.

  Returns:
    None. The two splits are written to ``train`` and ``test``.
  """
  # NOTE(review): read_pickle can execute arbitrary code while unpickling;
  # confirm that infile always comes from a trusted source.
  expanded_data = strip_and_process_na(pd.read_pickle(infile), attrfile, na_strategy)

  total_rows = len(expanded_data.index)
  # Clamp so that an out-of-range trainpct can neither request more rows than
  # exist (impossible when sampling without replacement) nor a negative count.
  train_example_count = max(0, min(total_rows, int(total_rows * trainpct / 100.0)))

  if split_randomly:
    # replace=False: each row may be picked at most once. The default
    # (replace=True) duplicated training rows and shrank the training set's
    # distinct coverage, leaving the test set larger than intended.
    train_indices = np.random.choice(
        expanded_data.index, size=train_example_count, replace=False)
  else:
    # sort_values replaces DataFrame.sort, which was removed from pandas.
    train_indices = expanded_data.sort_values("Date").index[:train_example_count]

  # .loc is the label-based replacement for the removed .ix indexer.
  train_data = expanded_data.loc[train_indices]
  # Everything not selected for training becomes the test set.
  test_data = expanded_data.drop(train_indices)

  pd.to_pickle(train_data, train)
  pd.to_pickle(test_data, test)