def mfcc_features(filename):
    """Compute standardized MFCC + delta features for one audio file.

    Preprocessing per the CTC paper. (These are not the simpler linear
    spectrogram features alone as in Deep Speech.)

    Properties:
    - 10ms frames with 5ms overlap (i.e. a 5ms hop between frames)
    - 12 MFCCs with 26 filter banks
    - replace first MFCC with energy (TODO: log-energy)
    - add first-order derivatives for all of the above
    - total: 26 coefficients

    Args:
        filename: Path of the audio file, passed to ``librosa.load``.

    Returns:
        Array with 26 rows (13 static coefficients followed by their 13
        first-order deltas), each row scaled along the frame axis by
        ``sklearn.preprocessing.scale``.
    """
    d, sr = librosa.load(filename)

    frame_length_seconds = 0.010
    frame_overlap_seconds = 0.005

    # The analysis window is the full 10ms frame; the hop is whatever part
    # of the frame is not shared with the next one (frame - overlap).
    # Previously the 5ms overlap was also used as the window length, which
    # left frame_length_seconds unused and halved the documented frame size.
    window_samples = int(frame_length_seconds * sr)
    hop_samples = window_samples - int(frame_overlap_seconds * sr)

    # 1 + 12 coefficients: row 0 is a placeholder that is overwritten with
    # the frame energy below. n_mels=26 gives the documented 26 filter banks.
    mfccs = librosa.feature.mfcc(
        y=d,
        sr=sr,
        n_mfcc=1 + 12,
        n_fft=window_samples,
        hop_length=hop_samples,
        n_mels=26,
    )

    # energy (TODO: log?)
    # NOTE(review): `rmse` and its `n_fft` keyword come from older librosa
    # releases -- confirm against the installed version before upgrading.
    energy = librosa.feature.rmse(
        y=d, n_fft=window_samples, hop_length=hop_samples
    )
    mfccs[0] = energy  # replace first MFCC with energy, per convention

    deltas = librosa.feature.delta(mfccs, order=1)
    mfccs_plus_deltas = np.vstack([mfccs, deltas])

    # Standardize each coefficient row independently along the frame axis.
    coeffs = sklearn.preprocessing.scale(mfccs_plus_deltas, axis=1)
    return coeffs
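# Comment list  (web-page navigation label; not part of the code)
# Table of contents  (web-page navigation label; not part of the code)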