def split_cli():
    """Command-line entry point: strip/process the expanded data, then split it.

    Parses the command line, loads the expanded pickle with ``pd.read_pickle``,
    hands it to ``strip_and_process_to_files`` and then calls
    ``split_to_files`` on the resulting stripped file.

    Positional arguments (all optional, in this order):
        expanded  -- pickle to load (default ``expanded.pickle``)
        stripped  -- intermediate stripped file (default ``stripped.pickle``)
        train     -- training output file (default ``train.pickle``)
        test      -- test output file (default ``test.pickle``)
        attrfile  -- attribute list file (default ``attrs.txt``)

    Options:
        --na-strategy  -- passed through as ``na_strategy`` (default ``drop``)
        --trainpct     -- integer passed through as ``trainpct`` (default 70)
        --random       -- flag passed through as ``split_randomly``
    """
    p = ArgumentParser()
    # argparse ignores ``default`` on a plain positional (it is simply
    # required); ``nargs="?"`` is what makes the declared defaults take effect.
    p.add_argument("expanded", nargs="?", default="expanded.pickle",
                   help="Expanded pickle file targets.")
    # The default used to be "test.pickle", identical to the default of the
    # ``test`` positional below, so both would have pointed at the same path.
    p.add_argument("stripped", nargs="?", default="stripped.pickle",
                   help="stripped data filename")
    p.add_argument("train", nargs="?", default="train.pickle",
                   help="training filename")
    p.add_argument("test", nargs="?", default="test.pickle",
                   help="test filename")
    p.add_argument("attrfile", nargs="?", default="attrs.txt",
                   help="attrs to care about for NA purposes")
    p.add_argument("--na-strategy", default="drop",
                   help="what to do with NA rows (default is drop them)")
    p.add_argument("--trainpct", default=70, type=int,
                   help="percentage of data to put into training set")
    p.add_argument("--random", action='store_true',
                   help="split train/test sets randomly (default is by time)")
    cfg = p.parse_args()

    # SECURITY NOTE: pd.read_pickle unpickles the file, which can execute
    # arbitrary code; only point this at pickles from a trusted source.
    # NOTE(review): the keyword is named ``expanded_file`` but receives the
    # loaded object rather than a path -- confirm against the callee.
    strip_and_process_to_files(expanded_file=pd.read_pickle(cfg.expanded),
                               stripped_file=cfg.stripped,
                               attrfile=cfg.attrfile,
                               na_strategy=cfg.na_strategy)
    split_to_files(trainfile=cfg.train,
                   testfile=cfg.test,
                   stripped=cfg.stripped,
                   trainpct=cfg.trainpct,
                   split_randomly=cfg.random)
# (web-page residue from the original listing: "Comment list" / "Article table of contents")