def getdata(self):
# Containers for the per-song spectrograms and their genre labels
song_data = []
genre_data = []
# Read files from the folders
for x,_ in self.genres.items():
for root, subdirs, files in os.walk(self.file_path + x):
for file in files:
# Read the audio file
file_name = self.file_path + x + "/" + file
print(file_name)
signal, sr = librosa.load(file_name)
# Calculate the mel spectrogram of the first song_samples samples
melspec = librosa.feature.melspectrogram(signal[:self.song_samples],
sr = sr, n_fft = self.n_fft, hop_length = self.hop_length).T[:1280,]
# Append the result to the data structure
song_data.append(melspec)
genre_data.append(self.genres[x])
return np.array(song_data), keras.utils.to_categorical(genre_data, len(self.genres))
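A quick sanity check on the [:1280,] truncation above (a sketch assuming 30-second GTZAN-style clips at librosa's default 22050 Hz and a hop length of 512; the real values come from self.song_samples and self.hop_length):
import numpy as np

# Illustrative arithmetic only; assumes 30 s clips, 22050 Hz, hop_length=512.
sr, duration, hop_length = 22050, 30, 512
n_frames = 1 + (sr * duration) // hop_length  # librosa pads frames by default (center=True)
print(n_frames)  # 1292, so keeping the first 1280 frames gives a fixed-size array per song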
def __init__(self,
audio_file: Path,
id: Optional[str] = None,
sample_rate_to_convert_to: int = 16000,
label: Optional[str] = "nolabel",
fourier_window_length: int = 512,
hop_length: int = 128,
mel_frequency_count: int = 128,
label_with_tags: Optional[str] = None,
positional_label: Optional[PositionalLabel] = None):
# The default values for hop_length and fourier_window_length are powers of 2 near the values specified in the wave2letter paper.
if id is None:
id = name_without_extension(audio_file)
self.audio_file = audio_file
super().__init__(
id=id, get_raw_audio=lambda: librosa.load(str(self.audio_file), sr=self.sample_rate)[0],
label=label, sample_rate=sample_rate_to_convert_to,
fourier_window_length=fourier_window_length, hop_length=hop_length, mel_frequency_count=mel_frequency_count,
label_with_tags=label_with_tags, positional_label=positional_label)
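For orientation, a small illustrative sketch (not part of the class) of the frame geometry those defaults imply at the 16 kHz default sample rate:
# Illustrative arithmetic for the default parameters above.
sample_rate = 16000
fourier_window_length = 512   # 512 / 16000 s = 32 ms analysis window
hop_length = 128              # 128 / 16000 s = 8 ms between frames
print(sample_rate / hop_length)   # 125 spectrogram frames per second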
def __init__(self, images, labels, fake_data=False, one_hot=False, load=False):
"""Construct a DataSet. one_hot arg is used only if fake_data is true."""
if fake_data:
self._num_examples = 10000
self.one_hot = one_hot
else:
num = len(images)
assert num == len(labels), ('images.shape: %s labels.shape: %s' % (images.shape, labels.shape))
print("len(images) %d" % num)
self._num_examples = num
self.cache={}
self._image_names = numpy.array(images)
self._labels = labels
self._epochs_completed = 0
self._index_in_epoch = 0
self._images=[]
if load: # Otherwise loaded on demand
self._images=self.load(self._image_names)
def read_data_sets(train_dir,source_data=Source.NUMBER_IMAGES, fake_data=False, one_hot=True):
class DataSets(object):
pass
data_sets = DataSets()
if fake_data:
data_sets.train = DataSet([], [], fake_data=True, one_hot=one_hot)
data_sets.validation = DataSet([], [], fake_data=True, one_hot=one_hot)
data_sets.test = DataSet([], [], fake_data=True, one_hot=one_hot)
return data_sets
VALIDATION_SIZE = 2000
local_file = maybe_download(source_data, train_dir)
train_images = extract_images(TRAIN_INDEX,train=True)
train_labels = extract_labels(TRAIN_INDEX,train=True, one_hot=one_hot)
test_images = extract_images(TEST_INDEX,train=False)
test_labels = extract_labels(TEST_INDEX,train=False, one_hot=one_hot)
# train_images = train_images[:VALIDATION_SIZE]
# train_labels = train_labels[:VALIDATION_SIZE:]
# test_images = test_images[VALIDATION_SIZE:]
# test_labels = test_labels[VALIDATION_SIZE:]
data_sets.train = DataSet(train_images, train_labels , load=False)
data_sets.test = DataSet(test_images, test_labels, load=True)
# data_sets.validation = DataSet(validation_images, validation_labels, load=True)
return data_sets
def load_audio(audio_filename, sample_rate):
"""Loads an audio file.
Args:
audio_filename: File path to load.
sample_rate: The number of samples per second at which the audio will be
returned. Resampling will be performed if necessary.
Returns:
A numpy array of audio samples, single-channel (mono) and sampled at the
specified rate, in float32 format.
Raises:
AudioIOReadException: If librosa is unable to load the audio data.
"""
try:
y, unused_sr = librosa.load(audio_filename, sr=sample_rate, mono=True)
except Exception as e: # pylint: disable=broad-except
raise AudioIOReadException(e)
return y
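A minimal usage sketch for load_audio; the file name is a placeholder, and AudioIOReadException is assumed to be defined in the surrounding module, as the docstring indicates.
try:
    samples = load_audio("example.wav", sample_rate=16000)  # "example.wav" is illustrative
    print("loaded %d samples (%.2f s)" % (len(samples), len(samples) / 16000.0))
except AudioIOReadException as err:
    print("could not read audio: %s" % err)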
def get_seeds(self, audio_filepath):
"""Get the seeds file to pass to the HLL tracker.
Parameters
----------
audio_filepath : str
Path to audio file.
Returns
-------
seeds_fpath : str
Path to the seeds output file.
"""
y, sr = librosa.load(audio_filepath, sr=44100)
y_harmonic = librosa.effects.harmonic(y)
cqt, samples, freqs = self._compute_cqt(y_harmonic, sr)
seeds = self._pick_seeds_cqt(cqt, freqs, samples)
seeds_fpath = tmp.mktemp('.csv')
with open(seeds_fpath, 'w') as fhandle:
writer = csv.writer(fhandle, delimiter=',')
writer.writerows(seeds)
return seeds_fpath
def compute_spectrograms(filename):
out_rate = 12000
N_FFT = 512
HOP_LEN = 256
frames, rate = librosa.load(filename, sr=out_rate, mono=True)
if len(frames) < out_rate*3:
# if shorter than 3 seconds - can't process
raise Exception("Audio duration is too short")
logam = librosa.logamplitude
melgram = librosa.feature.melspectrogram
x = logam(melgram(y=frames, sr=out_rate, hop_length=HOP_LEN,
n_fft=N_FFT, n_mels=N_MEL_BANDS) ** 2,
ref_power=1.0)
# now going through spectrogram with the stride of the segment duration
for start_idx in range(0, x.shape[1] - SEGMENT_DUR + 1, SEGMENT_DUR):
yield x[:, start_idx:start_idx + SEGMENT_DUR]
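A hedged usage sketch for the generator above; N_MEL_BANDS and SEGMENT_DUR are module-level constants in the original file, so the values below are placeholders, and an older librosa that still provides logamplitude is assumed.
import numpy as np

N_MEL_BANDS = 96    # placeholder; defined at module level in the original file
SEGMENT_DUR = 140   # placeholder; number of spectrogram frames per segment

segments = list(compute_spectrograms("song.mp3"))  # "song.mp3" is illustrative
if segments:
    batch = np.stack(segments)  # shape: (n_segments, N_MEL_BANDS, SEGMENT_DUR)
    print(batch.shape)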
def predict_on_long_clips():
"""Load the saved model and perform inference/prediction on features obtained from inputs.
Splits the audio into 10-second chunks and predicts on those chunks."""
with open(FILENAMES,"r") as fh:
filecontents=fh.read()
filenames=filecontents.splitlines()
random.shuffle(filenames)
filenames=filenames[:5] #[:5] is for quickly verifying if things work
filenames = [DATASET_LOCATION+f for f in filenames]
session = tf.Session()
saver = tf.train.import_meta_graph(IMPORT_META_GRAPH)
# Initialize first, then restore so the checkpoint values are not overwritten.
tf.global_variables_initializer().run(session=session)
saver.restore(session, tf.train.latest_checkpoint(IMPORT_LATEST_CHECKPOINT))
test_x = {}
for f in filenames:
s, sr = librosa.load(f)
total_chunks = s.shape[0] // max_audio_length
waveforms = [s[max_audio_length*i:max_audio_length*(i+1)] for i in range(total_chunks)]
test_x[f] = extract_features_from_waveforms(waveforms)
print "FILENAME: ", f
predictions = session.run(tf.argmax(pred, 1), feed_dict={X: test_x[f]})
print [possible_categories[p] for p in predictions]
Source file: predict_convnet_laughterornot_10sec_model.py (project: laughter, author: ganesh-srinivas)
def loadFile(self, fname):
'''
fname: filename of the sound file we want to load
'''
if self.verbose: print('Loading %s' % fname)
if self.cached:
if not os.path.exists(fname + '-mfcc.npy'):
y, sr = librosa.load(fname)
data = mfcc(y=y, sr=sr).T
np.save(fname + '-mfcc.npy', data)
else:
data = np.load(fname + '-mfcc.npy')
else:
y, sr = librosa.load(fname)
# TODO: Add ability to filter by seconds/duration
# seconds = y.size/sr
data = mfcc(y=y, sr=sr).T
return data
def get_mfccs_and_deltas(wav_pathname, n_mfcc=13, n_fft=2048, freq_min=100, freq_max=16000):
sample_array, sample_rate = librosa.load(wav_pathname, sr=44100)
if len(sample_array) == 0:
return []
else:
mfcc = librosa.feature.mfcc(sample_array, sample_rate, n_fft=n_fft, hop_length=n_fft, n_mfcc=n_mfcc, fmin=freq_min, fmax=freq_max)
delta = librosa.feature.delta(mfcc)
delta2 = librosa.feature.delta(mfcc, order=2)
mfcc = mfcc.T ### Transposing tables
delta = delta.T ## (We can instead set the axis above to do this without the extra step)
delta2 = delta2.T
mfcc_sans_0th = [frame_values[1:] for frame_values in mfcc]
all_features = []
for i in range(len(mfcc)):
all_features.append(list(mfcc_sans_0th[i]) + list(delta[i]) + list(delta2[i]))
return all_features
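A short sketch of stacking the per-frame feature rows returned above into a matrix; the input path is a placeholder.
import numpy as np

features = get_mfccs_and_deltas("speech.wav")  # hypothetical WAV file
if features:
    feature_matrix = np.array(features)
    # 12 MFCCs (0th dropped) + 13 deltas + 13 delta-deltas = 38 values per frame
    print(feature_matrix.shape)  # (n_frames, 38)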
def recognise_mfcc(filePath,outputDir,outputName,debug):
print("start decompose harmonic/percussive and extract mfcc {0}".format(filePath))
y,sr = librosa.load(filePath)
mfcc = librosa.feature.mfcc(y=y,sr=sr)
mfcc = np.transpose(mfcc)
basePath = outputDir + outputName
np.savetxt(basePath+"_normal_mfcc.csv",mfcc,delimiter=",")
harmonic_sep = 3.0
percussive_sep = 3.0
h,p = librosa.effects.hpss(y,margin=(harmonic_sep,percussive_sep))
hmfcc = librosa.feature.mfcc(y=h,sr=sr)
hmfcc = np.transpose(hmfcc)
np.savetxt(basePath+"_harmonic_mfcc.csv",hmfcc,delimiter=",")
pmfcc = librosa.feature.mfcc(y=p,sr=sr)
pmfcc = np.transpose(pmfcc)
np.savetxt(basePath+"_percussive_mfcc.csv",pmfcc,delimiter=",")
# extract rhythm pattern with rp_extract
def load_generic_audio(directory, sample_rate):
'''Generator that yields audio waveforms from the directory.'''
files = find_files(directory)
id_reg_exp = re.compile(FILE_PATTERN)
print("files length: {}".format(len(files)))
randomized_files = randomize_files(files)
for filename in randomized_files:
ids = id_reg_exp.findall(filename)
if not ids:
# The file name does not match the pattern containing ids, so
# there is no id.
category_id = None
else:
# The file name matches the pattern for containing ids.
category_id = int(ids[0][0])
audio, _ = librosa.load(filename, sr=sample_rate, mono=True)
audio = audio.reshape(-1, 1)
yield audio, filename, category_id
def get_audio_analysis(song_url):
if(song_url is None):
return None, None, None, None, None
urlretrieve(song_url, "current.mp3")
y, sr = librosa.load("./current.mp3")
# Tempo = beats/minute
tempo, beats = librosa.beat.beat_track(y=y, sr=sr)
# pitch = Frequency
pitches, magnitudes = librosa.piptrack(y=y, sr=sr,
fmax=1000, hop_length=1000)
pitches, magnitudes = extract_max(pitches, magnitudes, pitches.shape)
y[abs(y) < 10**-2] = 0
y = np.trim_zeros(y)
json = {
'sound_wave': np.array(y[:len(pitches)]).tolist(),
'pitch': pitches
}
y_harm, y_per = librosa.effects.hpss(y)
harm, perc = audio_fingerprint(y_harm), audio_fingerprint(y_per)
pitch_ave = np.average(pitches)
return float(tempo), float(pitch_ave), float(harm), float(perc), json
def main():
outdir = 'mix'
if not os.path.exists(outdir):
os.makedirs(outdir)
audio_total1, sr = librosa.load('./cao.wav', sr=sample_rate, mono=True)
audio_total2, sr = librosa.load('./huang.wav', sr=sample_rate, mono=True)
seglen = int(sav_n_secs * sr)
len1 = audio_total1.shape[0] - seglen
len2 = audio_total2.shape[0] - seglen
for i in range(train_data_num):
if i % 100 == 0:
print(i)
idx1=random.randint(0, len1)
idx2=random.randint(0, len2)
mix(audio_total1[idx1:idx1+seglen], audio_total2[idx2:idx2+seglen], sample_rate, sav_n_secs,outdir,i)
def save_cache(src_path, des_path, get_feature_func):
des_path = osp.splitext(des_path)[0] + '.npy'
try:
X, sr = librosa.load(src_path)
sr = int(sr)  # ensure the sample rate is a plain int
feature = get_feature_func(X, sr)
print('[INFO] Saving Cache in {} ...'.format(des_path))
des_par = osp.abspath(osp.join(des_path, osp.pardir))
if not osp.exists(des_par):
os.makedirs(des_par)
except Exception as e:
print("[ERROR] Unknown error happened when dealing with {}".format(src_path))
#print(e)
return -1
np.save(des_path, feature)
return 0
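A brief usage sketch for save_cache; the paths are placeholders and the lambda simply computes MFCCs to show the (X, sr) callback contract.
import librosa

# Cache MFCC features for one clip; paths are illustrative.
save_cache("audio/clip.wav", "cache/clip.npy",
           lambda X, sr: librosa.feature.mfcc(y=X, sr=sr))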
def adjust_volume(in_fp):
def adjust(volume):
audio_p = audio + volume
fn_p = fn + "_" + str(volume) +"db" + ".wav"
fd = audio_p.export(path.join(out_dir, str(volume) + 'db', path.split(in_dir)[-1], fn_p), format=format)
in_dir, fn = path.split(in_fp)
fn, file_ext = path.splitext(fn)
file_ext = file_ext.lower()
format = file_ext.replace('.', '')
# audio = None
y, sr = librosa.load(in_fp, sr=44100)
tmp_in_fp = "tmp/" + fn + "_tmp.wav"
librosa.output.write_wav(tmp_in_fp, y, sr, norm=False)
format = "wav"
audio = aseg.from_file(tmp_in_fp, format)
os.remove(tmp_in_fp)
if audio is not None:
for v in volume_list:
adjust(v)
Source file: data_preprocess.py (project: crnn-music-genre-classification, author: meetshah1995)
def log_scale_melspectrogram(path, plot=False):
signal, sr = lb.load(path, sr=Fs)
n_sample = signal.shape[0]
n_sample_fit = int(DURA*Fs)
if n_sample < n_sample_fit:
signal = np.hstack((signal, np.zeros((int(DURA*Fs) - n_sample,))))
elif n_sample > n_sample_fit:
signal = signal[(n_sample - n_sample_fit) // 2:(n_sample + n_sample_fit) // 2]
melspect = lb.logamplitude(lb.feature.melspectrogram(y=signal, sr=Fs, hop_length=N_OVERLAP, n_fft=N_FFT, n_mels=N_MELS)**2, ref_power=1.0)
if plot:
melspect = melspect[np.newaxis, :]
misc.imshow(melspect.reshape((melspect.shape[1],melspect.shape[2])))
print(melspect.shape)
return melspect
def read_file_pair(filename_pair, mono=True):
"""
given a pair of file names, read in both waveforms and upsample (through
librosa's default interpolation) the downsampled waveform
assumes the file name pair is of the form ("original", "downsampled")
mono selects whether to read in mono or stereo formatted waveforms
returns a pair of numpy arrays representing the original and upsampled
waveform
"""
channel = 1 if mono else 2
true_waveform, true_br = librosa.load(filename_pair[0], sr=None,
mono=mono)
ds_waveform, _ = librosa.load(filename_pair[1], sr=true_br, mono=mono)
# truth, example
return true_waveform.reshape((-1, channel)), \
ds_waveform.reshape((-1, channel))
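A small usage sketch for read_file_pair; the file names are placeholders, and the pair order is ("original", "downsampled") as the docstring states.
pair = ("original_44k.wav", "downsampled_11k.wav")  # illustrative paths
truth, upsampled = read_file_pair(pair, mono=True)
print(truth.shape, upsampled.shape)  # each is (n_samples, 1) for mono input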
def extract(filename, fft_size=FFT_SIZE, dtype=np.float32):
''' Basic (WORLD) feature extraction '''
x, _ = librosa.load(filename, sr=args.fs, mono=True, dtype=np.float64)
features = wav2pw(x, args.fs, fft_size=fft_size)
ap = features['ap']
f0 = features['f0'].reshape([-1, 1])
sp = features['sp']
en = np.sum(sp + EPSILON, axis=1, keepdims=True)
sp = np.log10(sp / en)
return np.concatenate([sp, ap, f0, en], axis=1).astype(dtype)
Source file: trainModel.py (project: Sound-classification-on-Raspberry-Pi-with-Tensorflow, author: GianlucaPaolocci)
def extract_features(file_name):
X, sample_rate = librosa.load(file_name)
stft = np.abs(librosa.stft(X))
mfccs = np.array(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=8).T)
chroma = np.array(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T)
mel = np.array(librosa.feature.melspectrogram(X, sr=sample_rate).T)
contrast = np.array(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T)
tonnetz = np.array(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T)
return mfccs,chroma,mel,contrast,tonnetz
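A sketch of one common way to reduce the five feature blocks returned above to a single fixed-length clip vector; the time-averaging step is an assumption, not taken from the original project.
import numpy as np

mfccs, chroma, mel, contrast, tonnetz = extract_features("sound.wav")  # illustrative path
# Each block is (n_frames, n_features); averaging over time gives one vector per clip.
clip_vector = np.hstack([block.mean(axis=0)
                         for block in (mfccs, chroma, mel, contrast, tonnetz)])
print(clip_vector.shape)  # (161,) with the feature sizes used above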
def _load_from_cache(self):
try:
return numpy.load(str(self.spectrogram_cache_file))
except ValueError:
log("Recalculating cached file {} because loading failed.".format(self.spectrogram_cache_file))
return self._calculate_and_save_spectrogram()
def SIGNAL():
y, sr = librosa.load(librosa.util.example_audio_file(),
sr=None)
return y, sr
def create_marked_audio_file(mark_locations: Union[List[float], np.ndarray], output_path: Opt[str] = None, *,
audio_file: Opt[str] = None, duration: float = None):
if audio_file:
y, sr = librosa.load(audio_file)
marked_audio = librosa.core.clicks(times=mark_locations, sr=sr, length=len(y))
marked_audio = y + marked_audio
elif duration:
sr = 22050
marked_audio = librosa.core.clicks(times=mark_locations, sr=sr, length=int(sr * duration))
else:
raise ParameterError("Must provide either audio file or duration.")
librosa.output.write_wav(path=output_path, y=marked_audio, sr=sr)
return output_path
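A usage sketch for the click-marking helper above, placing clicks at beat times detected with librosa; the paths are placeholders, and an older librosa that still has librosa.output.write_wav is assumed, as in the function itself.
import librosa

y, sr = librosa.load("track.wav")  # illustrative input
tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)
beat_times = librosa.frames_to_time(beat_frames, sr=sr)
create_marked_audio_file(beat_times, "track_with_clicks.wav", audio_file="track.wav")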
def __init__(self, file: str, *, sample_rate: int = 44100):
"""
Parameters
----------
file
Audio file to load
sample_rate
Sample rate to resample the audio to when loading
"""
self.file = file
self.samples, self.sample_rate = librosa.load(file, sr=sample_rate)
self.duration = librosa.get_duration(y=self.samples, sr=self.sample_rate)
def clip_audio(specs, raw_audio, output):
# Load the spec data. In clipping audio, we hold the specs fixed.
spec_filenames = next(os.walk(specs))[2]
if len(spec_filenames) == 0:
print("No specs found.")
return
for spec_filename in spec_filenames:
with open(os.path.join(specs, spec_filename)) as f:
spec = json.load(f)
youtube_id = spec['audio_source']['youtube_id']
start_time = spec['audio_source']['start_time']
end_time = spec['audio_source']['end_time']
raw_audio_filenames = glob.glob(os.path.join(raw_audio, youtube_id + '.*'))
if len(raw_audio_filenames) == 0:
# No audio file found, skip.
continue
raw_audio_filename = raw_audio_filenames[0]
raw_audio_extension = os.path.splitext(raw_audio_filename)[1]
clip_filename = os.path.join(
output, CLIP_NAME_PATTERN.format(youtube_id, start_time, end_time) +
raw_audio_extension)
# Call ffmpeg to output the trimmed clip.
os.makedirs(os.path.dirname(clip_filename), exist_ok=True)
call1 = ['ffmpeg', '-loglevel', 'error', '-n',
'-ss', str(start_time), '-t', str(end_time - start_time),
'-i', raw_audio_filename]
if raw_audio_extension == '.ogg':  # splitext keeps the leading dot
call2 = ['-codec:a', 'libvorbis', '-strict', 'experimental']
else:
call2 = []
call3 = [clip_filename]
process = subprocess.run(call1 + call2 + call3)
if process.returncode != 0:
print("Error: {} encountered by {}".format(
process.returncode, clip_filename))
else:
print(clip_filename)
def test_dtw_aligner():
x, fs = librosa.load(example_audio_file(), sr=None)
assert fs == 16000
x_fast = librosa.effects.time_stretch(x, 2.0)
X = _get_mcep(x, fs)
Y = _get_mcep(x_fast, fs)
D = X.shape[-1]
# Create padded pair
X, Y = adjast_frame_lengths(X, Y, divisible_by=2)
# Add utterance axis
X = X.reshape(1, -1, D)
Y = Y.reshape(1, -1, D)
X_aligned, Y_aligned = DTWAligner().transform((X, Y))
assert X_aligned.shape == Y_aligned.shape
assert np.linalg.norm(X_aligned - Y_aligned) < np.linalg.norm(X - Y)
X_aligned, Y_aligned = IterativeDTWAligner(
n_iter=2, max_iter_gmm=10, n_components_gmm=2).transform((X, Y))
assert X_aligned.shape == Y_aligned.shape
assert np.linalg.norm(X_aligned - Y_aligned) < np.linalg.norm(X - Y)
# Custom dist function
from nnmnkwii.metrics import melcd
X_aligned, Y_aligned = DTWAligner(dist=melcd).transform((X, Y))
assert np.linalg.norm(X_aligned - Y_aligned) < np.linalg.norm(X - Y)
def mfcc_batch_generator(batch_size=10, source=Source.DIGIT_WAVES, target=Target.digits):
maybe_download(source, DATA_DIR)
if target == Target.speaker: speakers = get_speakers()
batch_features = []
labels = []
files = os.listdir(path)
while True:
print("loaded batch of %d files" % len(files))
shuffle(files)
for file in files:
if not file.endswith(".wav"): continue
wave, sr = librosa.load(path+file, mono=True)
mfcc = librosa.feature.mfcc(wave, sr)
if target==Target.speaker: label=one_hot_from_item(speaker(file), speakers)
elif target==Target.digits: label=dense_to_one_hot(int(file[0]),10)
elif target==Target.first_letter: label=dense_to_one_hot((ord(file[0]) - 48) % 32,32)
elif target == Target.hotword: label = one_hot_word(file, pad_to=max_word_length) #
elif target == Target.word: label=string_to_int_word(file, pad_to=max_word_length)
# label = file # sparse_labels(file, pad_to=20) # max_output_length
else: raise Exception("todo : labels for Target!")
labels.append(label)
# print(np.array(mfcc).shape)
mfcc=np.pad(mfcc,((0,0),(0,80-len(mfcc[0]))), mode='constant', constant_values=0)
batch_features.append(np.array(mfcc))
if len(batch_features) >= batch_size:
# if target == Target.word: labels = sparse_labels(labels)
# labels=np.array(labels)
# print(np.array(batch_features).shape)
# yield np.array(batch_features), labels
# print(np.array(labels).shape) # why (64,) instead of (64, 15, 32)? OK IFF dim_1==const (20)
yield batch_features, labels # basic_rnn_seq2seq inputs must be a sequence
batch_features = [] # Reset for next batch
labels = []
# If you set dynamic_pad=True when calling tf.train.batch the returned batch will be automatically padded with 0s. Handy! A lower-level option is to use tf.PaddingFIFOQueue.
# only apply to a subset of all images at one time
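A hedged sketch of consuming the batch generator above; Source, Target and the helpers it relies on come from the surrounding project and are assumed to be importable.
import numpy as np

batch_gen = mfcc_batch_generator(batch_size=64)
X_batch, y_batch = next(batch_gen)
X_batch, y_batch = np.array(X_batch), np.array(y_batch)
print(X_batch.shape, y_batch.shape)  # roughly (64, 20, 80) MFCCs and (64, 10) one-hot labels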