def ingest_librispeech(input_directory, manifest_file=None, absolute_paths=True):
    """Write an Aeon compatible manifest file for a librispeech directory.

    Collects the transcript files returned by
    ``get_files(input_directory, pattern="*.txt")``. Every non-blank line of a
    transcript file is expected to look like ``<utterance id> <transcript>``;
    for each one a tab-separated record ``<audio path>  <transcript>`` is
    written, where the audio path is ``<utterance id>.flac`` located in the
    same directory as the transcript file.

    Arguments:
        input_directory (str): Path to librispeech directory
        manifest_file (str): Path to manifest file to output. When None, the
            manifest is written to ``manifest.tsv`` inside ``input_directory``.
        absolute_paths (bool): Whether audio file paths should be absolute or
            relative to input_directory.

    Raises:
        IOError: If ``input_directory`` is not a directory or no ``.txt``
            files were found in it.
        ValueError: If a non-blank transcript line has no space separating the
            utterance id from the transcript.
    """
    if not os.path.isdir(input_directory):
        raise IOError("Data directory does not exist! {}".format(input_directory))

    if manifest_file is None:
        # Joining with None raised TypeError, so the default never worked.
        # The default name deliberately does not end in ".txt" so a later run
        # cannot mistake the manifest for a transcript file.
        manifest_file = os.path.join(input_directory, "manifest.tsv")

    transcript_files = get_files(input_directory, pattern="*.txt")
    if len(transcript_files) == 0:
        raise IOError("No .txt files were found in {}".format(input_directory))

    tqdm.write("Preparing manifest file...")
    with open(manifest_file, "w") as manifest:
        manifest.write("@FILE\tSTRING\n")
        for tfile in tqdm(transcript_files, unit=" Files", mininterval=.001):
            directory = os.path.dirname(tfile)
            if absolute_paths:
                # Honour the flag even when input_directory was given as a
                # relative path.
                directory = os.path.abspath(directory)
            else:
                directory = os.path.relpath(directory, input_directory)

            with open(tfile, "r") as fid:
                for line in fid:
                    # Drop the trailing newline; otherwise every record is
                    # followed by an empty line in the manifest.
                    line = line.strip()
                    if not line:
                        continue
                    id_, separator, transcript = line.partition(" ")
                    if not separator:
                        raise ValueError(
                            "Malformed transcript line in {}: {!r}".format(tfile, line))
                    afile = "{}.flac".format(os.path.join(directory, id_))
                    manifest.write("{}\t{}\n".format(afile, transcript.strip()))
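# NOTE: web-page navigation residue left over from scraping, not part of the
# program: "评论列表" (comment list), "文章目录" (table of contents).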