def main(conf):
    """Load the train/test datasets, compute features, and dump them as CSV.

    Reads two entries from ``conf``:

    * ``'fuzzy.dump.dir'`` -- directory that receives ``train.csv`` and
      ``test.csv`` (passed to ``makedirs`` before anything is written).
    * ``'fuzzy.dataset'`` -- handed to both ``load_train_df`` and
      ``load_test_df``.

    Only the id column(s) and the ``Fields.*`` feature columns are written;
    the train file additionally carries ``FieldsTrain.is_duplicate``.
    """
    output_dir = conf['fuzzy.dump.dir']
    makedirs(output_dir)

    logging.info('Loading train dataset')
    train_frame = load_train_df(conf['fuzzy.dataset'])

    logging.info('Loading test dataset')
    test_frame = load_test_df(conf['fuzzy.dataset'])

    # The return value is not used; presumably this adds the feature columns
    # to both frames in place -- TODO confirm against compute_features.
    compute_features(train_frame, test_frame)

    logging.info('Writing train dataset to disk')
    # Feature columns shared by both output files, in output order.
    feature_columns = [
        Fields.qratio,
        Fields.wratio,
        Fields.partial_ratio,
        Fields.partial_token_set_ratio,
        Fields.partial_token_sort_ratio,
        Fields.token_set_ratio,
        Fields.token_sort_ratio,
    ]
    train_columns = [FieldsTrain.id, FieldsTrain.is_duplicate] + feature_columns
    train_target = join_path(output_dir, 'train.csv')
    train_frame[train_columns].to_csv(train_target, index=False)

    logging.info('Writing test dataset to disk')
    test_columns = [FieldsTest.test_id] + feature_columns
    test_target = join_path(output_dir, 'test.csv')
    test_frame[test_columns].to_csv(test_target, index=False)
评论列表
文章目录