def synthesis_speech(noisy_speech, ideal_mask, win_type, win_len, shift_len, syn_method='A&R'):
samples = noisy_speech.shape[0]
frames = (samples - win_len) // shift_len
if win_type == 'hanning':
window = np.hanning(win_len)
elif win_type == 'hamming':
window = np.hamming(win_len)
elif win_type == 'rectangle':
window = np.ones(win_len)
to_ifft = np.zeros(win_len, dtype=np.complex64)
clean_speech = np.zeros((frames-1)*shift_len+win_len, dtype=np.float32)
window_sum = np.zeros((frames-1)*shift_len+win_len, dtype=np.float32)
for i in range(frames):
one_frame = noisy_speech[i * shift_len: i * shift_len + win_len]
windowed_frame = np.multiply(one_frame, window)
stft = np.fft.fft(windowed_frame, win_len)
masked_abs = np.abs(stft[:win_len//2+1]) * ideal_mask[:, i]
to_ifft[:win_len//2+1] = masked_abs * np.exp(1j * np.angle(stft[:win_len//2+1]))
to_ifft[win_len//2+1:] = np.conj(to_ifft[win_len//2-1:0:-1])
speech_seg = np.real(np.fft.ifft(to_ifft, win_len))
if syn_method == 'A&R' or syn_method == 'ALLEN & RABINER':
clean_speech[i*shift_len:i*shift_len+win_len] += speech_seg
window_sum[i*shift_len:i*shift_len+win_len] += window
elif syn_method == 'G&L' or syn_method == 'GRIFFIN & LIM':
speech_seg = np.multiply(speech_seg, window)
clean_speech[i * shift_len:i * shift_len + win_len] += speech_seg
window_sum[i * shift_len:i * shift_len + win_len] += np.power(window, 2.)
# if i > 0:
# clean_speech[i*shift_len: (i-1)*shift_len+win_len] *= 0.5
window_sum = np.where(window_sum < 1e-2, 1e-2, window_sum)
return clean_speech / window_sum
评论列表
文章目录