def _process_dataset(anno, sample_rate, n_samples, n_threads):
    """Processes and saves the MagnaTagATune dataset, optionally with threads.

    One work item is queued per distinct (split, shard) pair found in `anno`.
    Each item is the tuple
    `(assigned_anno, sample_rate, n_samples, split, shard, n_shards)`.
    The shared queue is then handed to `_process_audio_files`, either from
    `n_threads` worker threads or directly in the calling thread.

    Args:
        anno: Annotation DataFrame contains tags, mp3_path, split, and shard.
        sample_rate: Sampling rate of the audios. If the sampling rate is
            different with an audio's original sampling rate, then it
            re-samples the audio.
        n_samples: Number of samples one segment contains.
        n_threads: Number of threads to process the dataset. Values of 1 or
            less run the work in the calling thread.
    """
    args_queue = Queue()

    # Distinct (split, shard) pairs in order of first appearance.
    # drop_duplicates() avoids calling pd.unique on a plain list of tuples,
    # an input type that newer pandas releases deprecate.
    split_and_shard_pairs = anno[['split', 'shard']].drop_duplicates()

    # The shard count only depends on the split, so compute it once per split
    # instead of once per (split, shard) pair.
    n_shards_by_split = {}
    for split, shard in split_and_shard_pairs.itertuples(index=False, name=None):
        in_split = anno['split'] == split
        if split not in n_shards_by_split:
            n_shards_by_split[split] = anno.loc[in_split, 'shard'].nunique()
        assigned_anno = anno[in_split & (anno['shard'] == shard)]
        args_queue.put((assigned_anno, sample_rate, n_samples, split, shard,
                        n_shards_by_split[split]))

    # Honor the caller-supplied thread count rather than a global flag.
    if n_threads > 1:
        threads = [
            Thread(target=_process_audio_files, args=(args_queue,))
            for _ in range(n_threads)
        ]
        for thread in threads:
            thread.start()
        # Block until every worker has returned.
        for thread in threads:
            thread.join()
    else:
        _process_audio_files(args_queue)
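# NOTE(review): the two lines originally here ("comment list", "article
# contents") appear to be residue from the hosting web page, not program code.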