speech_utils.py 文件源码-python代码片段

def synthesis_speech(noisy_speech, ideal_mask, win_type, win_len, shift_len, syn_method='A&R'):
    samples = noisy_speech.shape[0]
    frames = (samples - win_len) // shift_len

    if win_type == 'hanning':
        window = np.hanning(win_len)
    elif win_type == 'hamming':
        window = np.hamming(win_len)
    elif win_type == 'rectangle':
        window = np.ones(win_len)
    to_ifft = np.zeros(win_len, dtype=np.complex64)
    clean_speech = np.zeros((frames-1)*shift_len+win_len, dtype=np.float32)
    window_sum = np.zeros((frames-1)*shift_len+win_len, dtype=np.float32)
    for i in range(frames):
        one_frame = noisy_speech[i * shift_len: i * shift_len + win_len]
        windowed_frame = np.multiply(one_frame, window)
        stft = np.fft.fft(windowed_frame, win_len)
        masked_abs = np.abs(stft[:win_len//2+1]) * ideal_mask[:, i]
        to_ifft[:win_len//2+1] = masked_abs * np.exp(1j * np.angle(stft[:win_len//2+1]))
        to_ifft[win_len//2+1:] = np.conj(to_ifft[win_len//2-1:0:-1])
        speech_seg = np.real(np.fft.ifft(to_ifft, win_len))

        if syn_method == 'A&R' or syn_method == 'ALLEN & RABINER':
            clean_speech[i*shift_len:i*shift_len+win_len] += speech_seg
            window_sum[i*shift_len:i*shift_len+win_len] += window

        elif syn_method == 'G&L' or syn_method == 'GRIFFIN & LIM':
            speech_seg = np.multiply(speech_seg, window)
            clean_speech[i * shift_len:i * shift_len + win_len] += speech_seg
            window_sum[i * shift_len:i * shift_len + win_len] += np.power(window, 2.)
        # if i > 0:
        #     clean_speech[i*shift_len: (i-1)*shift_len+win_len] *= 0.5
    window_sum = np.where(window_sum < 1e-2, 1e-2, window_sum)
    return clean_speech / window_sum