def create_csv(data_dir):
    """Generate the CSV file required by DeepSpeech_RHL.py in ``data_dir``.

    Every ``*.wav`` file directly inside ``data_dir`` is paired with the
    ``.txt`` transcript that shares its base name (``clip.wav`` ->
    ``clip.txt``). The result is written to ``<data_dir>/data.csv`` with the
    columns ``wav_filename``, ``wav_filesize`` (bytes) and ``transcript``.

    Args:
        data_dir: Directory where all .wav files and their associated
            transcripts are stored. A trailing path separator is optional.

    Raises:
        ValueError: If one or more .wav files have no matching .txt
            transcript, or a transcript is not valid UTF-8
            (``UnicodeDecodeError`` is a ``ValueError`` subclass).
    """
    # os.path.join works whether or not data_dir ends with a separator;
    # plain string concatenation silently globbed "<dir>*.wav" otherwise.
    audio_file_paths = sorted(glob.glob(os.path.join(data_dir, "*.wav")))

    rows = []
    missing_transcripts = []
    for audio_path in audio_file_paths:
        # Pair by base name rather than by position in two sorted lists, so
        # a stray or missing .txt file can never shift transcripts onto the
        # wrong audio clip.
        transcript_path = os.path.splitext(audio_path)[0] + ".txt"
        if not os.path.isfile(transcript_path):
            missing_transcripts.append(transcript_path)
            continue

        # Read raw bytes and decode explicitly: the CSV must contain text,
        # not the repr of a bytes object, and the content is kept verbatim
        # (no newline translation).
        with open(transcript_path, "rb") as f:
            transcript = f.read().decode("utf-8")

        rows.append((audio_path, os.stat(audio_path).st_size, transcript))

    if missing_transcripts:
        raise ValueError(
            "Missing transcript file(s) for audio: "
            + ", ".join(missing_transcripts)
        )

    df = pandas.DataFrame(
        rows, columns=["wav_filename", "wav_filesize", "transcript"]
    )
    df.to_csv(os.path.join(data_dir, "data.csv"), sep=",", index=False)
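# Source file: data_preprocessing.py (Python)
# (Web-page residue from the snippet site followed here: views 33,
# favorites 0, likes 0, comments 0, comment list, table of contents.)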