def dataset_path_iterator(file_path: str) -> Iterator[str]:
    """
    Recursively walk ``file_path`` and lazily yield the full path of
    every file whose name ends with ``gold_conll`` (the CONLL-formatted
    data files).
    """
    logger.info("Reading CONLL sentences from dataset files at: %s", file_path)
    # The directory walk is collected into a list before being handed to
    # tqdm, so the progress bar advances once per visited directory.
    walked = list(os.walk(file_path))
    for directory, _, filenames in tqdm.tqdm(walked):
        for name in filenames:
            # Dataset pre-processing leaves every file duplicated: one
            # copy named filename.gold_skel and one generated copy named
            # filename.gold_conll. Only the latter is yielded.
            if name.endswith("gold_conll"):
                yield os.path.join(directory, name)
评论列表
文章目录