split.py 文件源码-python代码片段

def split_cli(argv=None):
  """Command-line entry point: run the strip step, then the train/test split step.

  Parses the command line, passes the unpickled ``expanded`` input to
  ``strip_and_process_to_files`` and then calls ``split_to_files`` on the
  stripped output.

  Args:
    argv: Optional list of argument strings to parse. ``None`` (the default)
      makes argparse read ``sys.argv[1:]``, which is the original behaviour.

  Raises:
    SystemExit: raised by argparse on invalid arguments, including a
      ``--trainpct`` value outside 0-100.
  """
  p = ArgumentParser()
  # A positional only honours ``default`` when it is optional (nargs="?");
  # without it argparse requires every positional and the defaults are dead.
  # NOTE: pass option flags before or after the positionals; older argparse
  # versions can mis-assign optional positionals when flags are interleaved.
  p.add_argument("expanded", nargs="?", default="expanded.pickle", help="Expanded pickle file targets.")
  # The default used to be "test.pickle", which collides with the ``test``
  # output below and would let the test split overwrite the stripped data.
  p.add_argument("stripped", nargs="?", default="stripped.pickle", help="stripped data filename")
  p.add_argument("train", nargs="?", default="train.pickle", help="training filename")
  p.add_argument("test", nargs="?", default="test.pickle", help="test filename")
  p.add_argument("attrfile", nargs="?", default="attrs.txt", help="attrs to care about for NA purposes")
  p.add_argument("--na-strategy", default="drop", help="what to do with NA rows (default is drop them)")
  p.add_argument("--trainpct", default=70, type=int, help="percentage of data to put into training set")
  p.add_argument("--random", action='store_true', help="split train/test sets randomly (default is by time)")
  cfg = p.parse_args(argv)

  # A percentage outside 0-100 cannot describe a split; fail with a usage
  # message instead of handing a nonsensical value downstream.
  if not 0 <= cfg.trainpct <= 100:
    p.error("--trainpct must be between 0 and 100")

  # SECURITY: unpickling can execute arbitrary code; only point this at
  # trusted files.
  # NOTE(review): ``expanded_file`` receives the loaded object rather than a
  # path - confirm that is what strip_and_process_to_files expects.
  strip_and_process_to_files(expanded_file=pd.read_pickle(cfg.expanded),
                             stripped_file=cfg.stripped,
                             attrfile=cfg.attrfile,
                             na_strategy=cfg.na_strategy)
  split_to_files(trainfile=cfg.train,
                 testfile=cfg.test,
                 stripped=cfg.stripped,
                 trainpct=cfg.trainpct,
                 split_randomly=cfg.random)