def wav_to_spec(wav_audio, hparams):
    """Transforms the contents of a wav file into a series of spectrograms."""
    if hparams.spec_type == 'raw':
        spec = _wav_to_framed_samples(wav_audio, hparams)
    else:
        if hparams.spec_type == 'cqt':
            spec = _wav_to_cqt(wav_audio, hparams)
        elif hparams.spec_type == 'mel':
            spec = _wav_to_mel(wav_audio, hparams)
        else:
            raise ValueError('Invalid spec_type: {}'.format(hparams.spec_type))
        if hparams.spec_log_amplitude:
            spec = librosa.logamplitude(spec)
    return spec
Python usage examples of the logamplitude() function
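All of the examples below call librosa.logamplitude, which was deprecated and later removed in librosa 0.6; librosa.power_to_db is the modern replacement. A minimal sketch of the equivalent call on current librosa:

import librosa
import numpy as np

# power_to_db is the librosa >= 0.6 replacement for logamplitude:
# it converts a power spectrogram to decibels relative to `ref`.
y = librosa.tone(440, duration=1.0)
S = librosa.feature.melspectrogram(y=y, sr=22050)  # already a power spectrogram (power=2.0)
log_S = librosa.power_to_db(S, ref=np.max)         # `ref` replaces the old `ref_power` argument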
def compute_spectrograms(filename):
    out_rate = 12000
    N_FFT = 512
    HOP_LEN = 256
    frames, rate = librosa.load(filename, sr=out_rate, mono=True)
    if len(frames) < out_rate * 3:
        # if less than 3 seconds - can't process
        raise Exception("Audio duration is too short")
    logam = librosa.logamplitude
    melgram = librosa.feature.melspectrogram
    x = logam(melgram(y=frames, sr=out_rate, hop_length=HOP_LEN,
                      n_fft=N_FFT, n_mels=N_MEL_BANDS) ** 2,
              ref_power=1.0)
    # slide over the spectrogram with a stride of one segment duration
    for start_idx in range(0, x.shape[1] - SEGMENT_DUR + 1, SEGMENT_DUR):
        yield x[:, start_idx:start_idx + SEGMENT_DUR]
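A minimal way to consume this generator; N_MEL_BANDS and SEGMENT_DUR are module-level constants not shown in the excerpt, so the values below are assumptions:

N_MEL_BANDS = 96   # assumed value, defined at module level in the original
SEGMENT_DUR = 128  # assumed segment length in frames

for segment in compute_spectrograms("track.mp3"):
    print(segment.shape)  # one (N_MEL_BANDS, SEGMENT_DUR) log-power mel segment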
def get_feature_aqibsaeed_conv(X, sr, au_path=None):
    """
    http://aqibsaeed.github.io/2016-09-24-urban-sound-classification-part-2/
    """
    import librosa
    import numpy as np

    def windows(data, window_size):
        start = 0
        while start < len(data):
            yield start, start + window_size
            start += (window_size // 2)  # hop by half a window; integer division keeps indices valid

    bands = 60
    frames = 41
    window_size = 512 * (frames - 1)
    log_specgrams = []  # accumulator missing from the original snippet
    for (start, end) in windows(X, window_size):
        if len(X[start:end]) == window_size:
            signal = X[start:end]
            melspec = librosa.feature.melspectrogram(signal, n_mels=bands)
            logspec = librosa.logamplitude(melspec)
            logspec = logspec.T.flatten()[:, np.newaxis].T
            log_specgrams.append(logspec)
    return log_specgrams
def prepare_testset(dataset_name):
    spec_folder = common.SPECTRO_PATH + SPECTRO_FOLDER + "/"
    test_folder = common.DATA_DIR + '/spectro_%s_testset/' % dataset_name
    if not os.path.exists(test_folder):
        os.makedirs(test_folder)
    items = open(common.DATASETS_DIR + '/items_index_test_%s.tsv' % dataset_name).read().splitlines()
    testset = []
    testset_index = []
    for t, track_id in enumerate(items):
        if MSD:
            msd_folder = track_id[2] + "/" + track_id[3] + "/" + track_id[4] + "/"
        else:
            msd_folder = ""
        file = spec_folder + msd_folder + track_id + ".pk"
        try:
            spec = pickle.load(open(file, 'rb'))  # binary mode for pickle
            spec = librosa.logamplitude(np.abs(spec) ** 2, ref_power=np.max).T
            pickle.dump(spec, open(test_folder + track_id + ".pk", "wb"))
            testset.append(track_id)
            testset_index.append(t)
            if t % 1000 == 0:
                print t
        except Exception:
            print "does not exist:", file
def calc_power_spectrogram(audio_data, samplerate, n_mels=128, n_fft=512, hop_length=160):
    """
    Calculate a power spectrogram from the given raw audio data.

    Args:
        audio_data: numpy array of the raw audio wave
        samplerate: the sample rate of `audio_data`
        n_mels: the number of mel bands to generate
        n_fft: the window size of the FFT
        hop_length: the hop length for the window
    Returns: the spectrogram in the form [time, n_mels]
    """
    spectrogram = librosa.feature.melspectrogram(audio_data, sr=samplerate,
                                                 n_mels=n_mels, n_fft=n_fft,
                                                 hop_length=hop_length)
    # convert to log scale (dB)
    log_spectrogram = librosa.logamplitude(spectrogram, ref_power=np.max)
    # normalize
    normalized_spectrogram = normalize(log_spectrogram)
    return normalized_spectrogram.T
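The normalize call above refers to a helper not included in the excerpt; a sketch of a plausible stand-in plus a usage example (the zero-mean, unit-variance choice is an assumption):

import numpy as np
import librosa

def normalize(spec):
    # hypothetical stand-in for the missing helper:
    # scale the log spectrogram to zero mean and unit variance
    return (spec - spec.mean()) / (spec.std() + 1e-8)

audio, sr = librosa.load('utterance.wav', sr=16000)
features = calc_power_spectrogram(audio, sr)  # shape: [time, 128]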
def _generate_spectrograms(self):
    for row in tqdm(self.meta.itertuples(), total=len(self.meta)):
        specfile = self.work_dir + row.filename + '.mel.spec.npy'
        if os.path.exists(specfile):
            continue
        audio = load_audio(self.data_dir + 'audio/' + row.filename, 44100)
        # audio *= 1.0 / np.max(np.abs(audio))
        spec = librosa.feature.melspectrogram(audio, sr=44100, n_fft=self.FFT, fmax=self.FMAX,
                                              hop_length=self.HOP, n_mels=self.BANDS)
        # spec = librosa.logamplitude(spec)
        freqs = librosa.core.mel_frequencies(n_mels=self.BANDS, fmax=self.FMAX)
        spec = librosa.core.perceptual_weighting(spec, freqs, ref_power=np.max)
        reduced_spec = skim.measure.block_reduce(spec, block_size=(3, 2), func=np.mean)
        np.save(specfile, spec.astype('float16'), allow_pickle=False)
        np.save(specfile[:-4] + '.ds.npy', reduced_spec.astype('float16'), allow_pickle=False)
Source file: data_preprocess.py, from project crnn-music-genre-classification by meetshah1995
def log_scale_melspectrogram(path, plot=False):
    signal, sr = lb.load(path, sr=Fs)
    n_sample = signal.shape[0]
    n_sample_fit = int(DURA * Fs)
    if n_sample < n_sample_fit:
        # pad short signals with trailing zeros
        signal = np.hstack((signal, np.zeros((int(DURA * Fs) - n_sample,))))
    elif n_sample > n_sample_fit:
        # crop long signals at the center; integer division keeps slice indices valid
        signal = signal[(n_sample - n_sample_fit) // 2:(n_sample + n_sample_fit) // 2]
    melspect = lb.logamplitude(lb.feature.melspectrogram(y=signal, sr=Fs, hop_length=N_OVERLAP,
                                                         n_fft=N_FFT, n_mels=N_MELS) ** 2,
                               ref_power=1.0)
    if plot:
        melspect = melspect[np.newaxis, :]
        misc.imshow(melspect.reshape((melspect.shape[1], melspect.shape[2])))
        print(melspect.shape)
    return melspect
Source file: neural_network_audioset.py, from project TensorFlow_AudioSet_Example by DantesLegacy
def plot_log_power_specgram(sound_names, raw_sounds):
    i = 1
    fig = plt.figure(figsize=(25, 60), dpi=900)
    for n, f in zip(sound_names, raw_sounds):
        plt.subplot(10, 1, i)
        D = librosa.logamplitude(np.abs(librosa.stft(f)) ** 2, ref_power=np.max)
        librosa.display.specshow(D, x_axis='time', y_axis='log')
        plt.title(n.title())
        i += 1
    plt.suptitle('Figure 3: Log power spectrogram', x=0.5, y=0.915, fontsize=18)
    plt.show()
def compute_spectrograms(filename):
    out_rate = 22050
    frames, rate = librosa.load(filename, sr=out_rate, mono=True)
    if len(frames) < out_rate:
        # if less than 1 second - can't process
        raise Exception("Audio duration is too short")
    normalized_audio = _normalize(frames)
    melspectr = librosa.feature.melspectrogram(y=normalized_audio, sr=out_rate,
                                               n_mels=N_MEL_BANDS, fmax=out_rate / 2)
    logmelspectr = librosa.logamplitude(melspectr ** 2, ref_power=1.0)
    # slide over the spectrogram with a stride of one segment duration
    for start_idx in range(0, logmelspectr.shape[1] - SEGMENT_DUR + 1, SEGMENT_DUR):
        yield logmelspectr[:, start_idx:start_idx + SEGMENT_DUR]
def extract_features(filename):
    y, sr = librosa.load(filename)
    y = shape_sound_clip(y)
    S = librosa.feature.melspectrogram(y, sr=sr, n_mels=128)
    log_S = librosa.logamplitude(S, ref_power=np.max)
    mfcc = librosa.feature.mfcc(S=log_S, n_mfcc=13)
    return mfcc.flatten()
def make_melgram(mono_sig, sr):
    melgram = librosa.logamplitude(librosa.feature.melspectrogram(mono_sig,
                                   sr=sr, n_mels=96), ref_power=1.0)[np.newaxis, np.newaxis, :, :]
    return melgram

# turn multichannel audio into multiple melgram layers
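The comment above announces a multichannel wrapper whose body is not part of this excerpt; a minimal sketch of what such a wrapper could look like (the function name and the channel layout are assumptions):

def make_melgram_multichannel(sig, sr):
    # hypothetical wrapper: one melgram layer per audio channel
    if sig.ndim == 1:
        return make_melgram(sig, sr)
    layers = [make_melgram(sig[ch], sr) for ch in range(sig.shape[0])]
    return np.concatenate(layers, axis=1)  # stack along the layer axis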
def prepossessingAudio(audioPath, ppFilePath):
    print 'Preprocessing ' + audioPath
    featuresArray = []
    for i in range(0, SOUND_SAMPLE_LENGTH, HAMMING_STRIDE):
        if i + HAMMING_SIZE <= SOUND_SAMPLE_LENGTH - 1:
            y, sr = librosa.load(audioPath, offset=i / 1000.0, duration=HAMMING_SIZE / 1000.0)
            # make a mel-scaled power (energy-squared) spectrogram
            S = librosa.feature.melspectrogram(y, sr=sr, n_mels=128)
            # convert to log scale (dB), using the peak power as reference
            log_S = librosa.logamplitude(S, ref_power=np.max)
            mfcc = librosa.feature.mfcc(S=log_S, sr=sr, n_mfcc=13)
            featuresArray.append(mfcc)
            # featuresArray.append(S)
            if len(featuresArray) == 599:
                break
    print 'storing pp file: ' + ppFilePath
    f = open(ppFilePath, 'wb')  # binary mode for pickle
    f.write(pickle.dumps(featuresArray))
    f.close()
def prepossessingAudio(audioPath, ppFilePath):
    print 'Preprocessing ' + audioPath
    featuresArray = []
    for i in range(0, SOUND_SAMPLE_LENGTH, HAMMING_STRIDE):
        if i + HAMMING_SIZE <= SOUND_SAMPLE_LENGTH - 1:
            y, sr = librosa.load(audioPath, offset=i / 1000.0, duration=HAMMING_SIZE / 1000.0)
            # make a mel-scaled power (energy-squared) spectrogram
            S = librosa.feature.melspectrogram(y, sr=sr, n_mels=128)
            # convert to log scale (dB), using the peak power as reference
            log_S = librosa.logamplitude(S, ref_power=np.max)
            mfcc = librosa.feature.mfcc(S=log_S, sr=sr, n_mfcc=13)
            # featuresArray.append(mfcc)
            featuresArray.append(S)
            if len(featuresArray) == 599:
                break
    print 'storing pp file: ' + ppFilePath
    f = open(ppFilePath, 'wb')  # binary mode for pickle
    f.write(pickle.dumps(featuresArray))
    f.close()
def _generate_spectrograms(self):
    for row in tqdm(self.meta.itertuples(), total=len(self.meta)):
        specfile = self.work_dir + row.filename + '.orig.spec.npy'
        if os.path.exists(specfile):
            continue
        audio = load_audio(self.data_dir + 'audio/' + row.filename, 22050)
        audio *= 1.0 / np.max(np.abs(audio))
        spec = librosa.feature.melspectrogram(audio, sr=22050, n_fft=1024,
                                              hop_length=512, n_mels=self.bands)
        spec = librosa.logamplitude(spec)
        np.save(specfile, spec, allow_pickle=False)
def feature_extract(songfile_name):
    '''
    takes: filename
    outputs: audio feature representation from that file (currently CQT)
    **assumes working directory contains raw song files**
    returns a tuple containing songfile name and numpy array of song features
    '''
    song_loc = os.path.abspath(songfile_name)
    y, sr = librosa.load(song_loc)
    desire_spect_len = 2580
    C = librosa.cqt(y=y, sr=sr, hop_length=512, fmin=None,
                    n_bins=84, bins_per_octave=12, tuning=None,
                    filter_scale=1, norm=1, sparsity=0.01, real=False)
    # get log-power spectrogram with a noise floor of -80 dB
    # (take the magnitude first: C is complex when real=False)
    C = librosa.logamplitude(np.abs(C) ** 2, ref_power=np.max)
    # scale the log-power spectrogram to positive integers for a smaller footprint
    noise_floor_db = 80
    scaling_factor = (2**16 - 1) / noise_floor_db
    C += noise_floor_db
    C *= scaling_factor
    C = C.astype('uint16')
    # if the spectral representation is too long, crop it; otherwise zero-pad
    if C.shape[1] >= desire_spect_len:
        C = C[:, 0:desire_spect_len]
    else:
        C = np.pad(C, ((0, 0), (0, desire_spect_len - C.shape[1])), 'constant')
    return songfile_name, C
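The scaling above maps the [-80 dB, 0 dB] range onto [0, 65535]; a small worked sketch of the quantization round trip (the sample values are made up, purely to illustrate):

import numpy as np

noise_floor_db = 80
scaling_factor = (2**16 - 1) / noise_floor_db  # ~819.19 counts per dB

db = np.array([-80.0, -40.0, 0.0])             # example log-power values
quantized = ((db + noise_floor_db) * scaling_factor).astype('uint16')
print(quantized)   # [    0 32767 65535]
recovered = quantized / scaling_factor - noise_floor_db
print(recovered)   # approximately [-80. -40.   0.]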
def create_feature_matrix_spark(song_files):
    # CQT wrapper
    def log_cqt(y, sr):
        C = librosa.cqt(y=y, sr=sr, hop_length=512, fmin=None,
                        n_bins=84, bins_per_octave=12, tuning=None,
                        filter_scale=1, norm=1, sparsity=0.01, real=True)
        # get log-power spectrogram with a noise floor of -80 dB
        C = librosa.logamplitude(C**2, ref_power=np.max)
        # scale the log-power spectrogram to positive integers for a smaller footprint
        noise_floor_db = 80
        scaling_factor = (2**16 - 1) / noise_floor_db
        C += noise_floor_db
        C *= scaling_factor
        C = C.astype('uint16')
        return C

    # padding wrapper
    def padding(C, desired_spect_len):
        if C.shape[1] >= desired_spect_len:
            C = C[:, 0:desired_spect_len]
        else:
            C = np.pad(C, ((0, 0), (0, desired_spect_len - C.shape[1])), 'constant')
        return C

    # load try-catch wrapper
    def try_load(filename):
        try:
            sys.stdout.write('Processing: %s \r' % os.path.basename(filename))
            sys.stdout.flush()
            return librosa.load(filename)
        except Exception:
            pass

    # transformations (sc is the SparkContext)
    filesRDD = sc.parallelize(song_files)
    rawAudioRDD = filesRDD.map(lambda x: (os.path.basename(x), try_load(x))).filter(lambda x: x[1] is not None)
    rawCQT = rawAudioRDD.map(lambda x: (x[0], log_cqt(x[1][0], x[1][1])))
    paddedCQT = rawCQT.map(lambda x: (x[0], padding(x[1], 2580)))
    return paddedCQT.collect()
Source file: preprocess_data.py, from project audio-classifier-keras-cnn by drscotthawley
def preprocess_dataset(inpath="Samples/", outpath="Preproc/"):
if not os.path.exists(outpath):
os.mkdir( outpath, 0755 ); # make a new directory for preproc'd files
class_names = get_class_names(path=inpath) # get the names of the subdirectories
nb_classes = len(class_names)
print("class_names = ",class_names)
for idx, classname in enumerate(class_names): # go through the subdirs
if not os.path.exists(outpath+classname):
os.mkdir( outpath+classname, 0755 ); # make a new subdirectory for preproc class
class_files = os.listdir(inpath+classname)
n_files = len(class_files)
n_load = n_files
print(' class name = {:14s} - {:3d}'.format(classname,idx),
", ",n_files," files in this class",sep="")
printevery = 20
for idx2, infilename in enumerate(class_files):
audio_path = inpath + classname + '/' + infilename
if (0 == idx2 % printevery):
print('\r Loading class: {:14s} ({:2d} of {:2d} classes)'.format(classname,idx+1,nb_classes),
", file ",idx2+1," of ",n_load,": ",audio_path,sep="")
#start = timer()
aud, sr = librosa.load(audio_path, sr=None)
melgram = librosa.logamplitude(librosa.feature.melspectrogram(aud, sr=sr, n_mels=96),ref_power=1.0)[np.newaxis,np.newaxis,:,:]
outfile = outpath + classname + '/' + infilename+'.npy'
np.save(outfile,melgram)
def __call__(self, S):
    return librosa.logamplitude(S, **self.__dict__)
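This __call__ belongs to a small parameter-holding transform class whose definition is not part of the excerpt; a plausible minimal reconstruction (the class name and constructor arguments are assumptions):

import librosa

class LogAmplitude(object):
    """Hypothetical wrapper: stores logamplitude keyword arguments on the
    instance, then forwards them via **self.__dict__ when applied to a
    spectrogram."""
    def __init__(self, ref_power=1.0, top_db=80.0):
        self.ref_power = ref_power
        self.top_db = top_db

    def __call__(self, S):
        return librosa.logamplitude(S, **self.__dict__)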
def preprocess_input(audio_path, dim_ordering='default'):
    '''Reads an audio file and outputs a Mel-spectrogram.
    '''
    if dim_ordering == 'default':
        dim_ordering = K.image_dim_ordering()
    assert dim_ordering in {'tf', 'th'}
    if librosa_exists():
        import librosa
    else:
        raise RuntimeError('Librosa is required to process audio files.\n' +
                           'Install it via `pip install librosa` \nor visit ' +
                           'http://librosa.github.io/librosa/ for details.')
    # mel-spectrogram parameters
    SR = 12000
    N_FFT = 512
    N_MELS = 96
    HOP_LEN = 256
    DURA = 29.12
    src, sr = librosa.load(audio_path, sr=SR)
    n_sample = src.shape[0]
    n_sample_wanted = int(DURA * SR)
    # trim the signal at the center (integer division keeps slice indices valid)
    if n_sample < n_sample_wanted:  # if too short
        src = np.hstack((src, np.zeros((int(DURA * SR) - n_sample,))))
    elif n_sample > n_sample_wanted:  # if too long
        src = src[(n_sample - n_sample_wanted) // 2:
                  (n_sample + n_sample_wanted) // 2]
    logam = librosa.logamplitude
    melgram = librosa.feature.melspectrogram
    x = logam(melgram(y=src, sr=SR, hop_length=HOP_LEN,
                      n_fft=N_FFT, n_mels=N_MELS) ** 2,
              ref_power=1.0)
    if dim_ordering == 'th':
        x = np.expand_dims(x, axis=0)
    elif dim_ordering == 'tf':
        x = np.expand_dims(x, axis=3)
    return x
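With these parameters the function produces the fixed-size input of the Keras music tagger models: 29.12 s at 12000 Hz is 349440 samples, and 349440 / 256 hop + 1 = 1366 frames, i.e. a (96, 1366) mel-spectrogram. A quick shape check (assumes a local audio file and the module's librosa_exists helper):

x = preprocess_input('song.mp3', dim_ordering='th')
print(x.shape)  # (1, 96, 1366): 96 mel bands by 1366 frames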
def ispecgram(spec,
              n_fft=512,
              hop_length=None,
              mask=True,
              log_mag=True,
              re_im=False,
              dphase=True,
              mag_only=True,
              num_iters=1000):
    """Inverse Spectrogram using librosa.

    Args:
        spec: 3-D specgram array [freqs, time, (mag_db, dphase)].
        n_fft: Size of the FFT.
        hop_length: Stride of FFT. Defaults to n_fft/2.
        mask: Reverse the mask of the phase derivative by the magnitude.
        log_mag: Use the logamplitude.
        re_im: Output Real and Imag. instead of logMag and dPhase.
        dphase: Use derivative of phase instead of phase.
        mag_only: Specgram contains no phase.
        num_iters: Number of griffin-lim iterations for mag_only.

    Returns:
        audio: 1-D array of sound samples. Peak normalized to 1.
    """
    if not hop_length:
        hop_length = n_fft // 2
    ifft_config = dict(win_length=n_fft, hop_length=hop_length, center=True)
    if mag_only:
        mag = spec[:, :, 0]
        phase_angle = np.pi * np.random.rand(*mag.shape)
    elif re_im:
        spec_real = spec[:, :, 0] + 1.j * spec[:, :, 1]
    else:
        mag, p = spec[:, :, 0], spec[:, :, 1]
        if mask and log_mag:
            p /= (mag + 1e-13 * np.random.randn(*mag.shape))
        if dphase:
            # roll up phase
            phase_angle = np.cumsum(p * np.pi, axis=1)
        else:
            phase_angle = p * np.pi
    # magnitudes
    if log_mag:
        mag = (mag - 1.0) * 120.0
        mag = 10**(mag / 20.0)
    phase = np.cos(phase_angle) + 1.j * np.sin(phase_angle)
    spec_real = mag * phase
    if mag_only:
        audio = griffin_lim(
            mag, phase_angle, n_fft, hop_length, num_iters=num_iters)
    else:
        audio = librosa.core.istft(spec_real, **ifft_config)
    return np.squeeze(audio / audio.max())
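A sketch of a magnitude-only round trip; it assumes the companion specgram and griffin_lim helpers from the same module are importable, and the argument layout is an assumption based on that module:

import numpy as np

audio = np.sin(2 * np.pi * 440 * np.arange(16000) / 16000.0)  # placeholder signal
spec = specgram(audio, n_fft=512, mag_only=True)              # companion forward transform
recovered = ispecgram(spec, n_fft=512, mag_only=True, num_iters=100)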
def dataset(modalities=0, forcetempTime=4, contactmicTime=0.2, leaveObjectOut=False, verbose=False):
    materials = ['plastic', 'glass', 'fabric', 'metal', 'wood', 'ceramic']
    X = []
    y = []
    objects = dict()
    for m, material in enumerate(materials):
        if verbose:
            print 'Processing', material
            sys.stdout.flush()
        with open('data_processed/processed_0.1sbefore_%s_times_%.2f_%.2f.pkl' % (material, forcetempTime, contactmicTime), 'rb') as f:
            allData = pickle.load(f)
        for j, (objName, objData) in enumerate(allData.iteritems()):
            if leaveObjectOut:
                objects[objName] = {'x': [], 'y': []}
                X = objects[objName]['x']
                y = objects[objName]['y']
            for i in xrange(len(objData['temperature'])):
                y.append(m)
                if modalities > 2:
                    # mel-scaled power (energy-squared) spectrogram
                    sr = 48000
                    S = librosa.feature.melspectrogram(np.array(objData['contact'][i]), sr=sr, n_mels=128)
                    # convert to log scale (dB)
                    log_S = librosa.logamplitude(S, ref_power=np.max)
                if modalities == 0:
                    X.append(objData['force0'][i] + objData['force1'][i])
                elif modalities == 1:
                    X.append(objData['temperature'][i])
                elif modalities == 2:
                    X.append(objData['temperature'][i] + objData['force0'][i] + objData['force1'][i])
                elif modalities == 3:
                    X.append(log_S.flatten())
                elif modalities == 4:
                    X.append(objData['temperature'][i] + log_S.flatten().tolist())
                elif modalities == 5:
                    X.append(objData['temperature'][i] + objData['force0'][i] + objData['force1'][i] + log_S.flatten().tolist())
                elif modalities == 6:
                    X.append(objData['force0'][i] + objData['force1'][i] + log_S.flatten().tolist())
    if leaveObjectOut:
        return objects
    else:
        X = np.array(X)
        y = np.array(y)
        if verbose:
            print 'X:', np.shape(X), 'y:', np.shape(y)
        return X, y
def dataset(modalities=0, forcetempTime=4, contactmicTime=0.2, leaveObjectOut=False, verbose=False, deriv=False):
    materials = ['plastic', 'glass', 'fabric', 'metal', 'wood', 'ceramic']
    X = []
    y = []
    objects = dict()
    for m, material in enumerate(materials):
        if verbose:
            print 'Processing', material
            sys.stdout.flush()
        with open('data_processed/processed_0.1sbefore_%s_times_%.2f_%.2f.pkl' % (material, forcetempTime, contactmicTime), 'rb') as f:
            allData = pickle.load(f)
        for j, (objName, objData) in enumerate(allData.iteritems()):
            if leaveObjectOut:
                objects[objName] = {'x': [], 'y': []}
                X = objects[objName]['x']
                y = objects[objName]['y']
            for i in xrange(len(objData['temperature'])):
                y.append(m)
                if deriv:
                    objData['force0'][i] = firstDeriv(objData['force0'][i], objData['forceTime'][i])
                    objData['force1'][i] = firstDeriv(objData['force1'][i], objData['forceTime'][i])
                    objData['temperature'][i] = firstDeriv(objData['temperature'][i], objData['temperatureTime'][i])
                if modalities > 2:
                    # mel-scaled power (energy-squared) spectrogram
                    sr = 48000
                    S = librosa.feature.melspectrogram(np.array(objData['contact'][i]), sr=sr, n_mels=128)
                    # convert to log scale (dB)
                    log_S = librosa.logamplitude(S, ref_power=np.max)
                if modalities == 0:
                    X.append(objData['force0'][i] + objData['force1'][i])
                elif modalities == 1:
                    X.append(objData['temperature'][i])
                elif modalities == 2:
                    X.append(objData['temperature'][i] + objData['force0'][i] + objData['force1'][i])
                elif modalities == 3:
                    X.append(log_S.flatten())
                elif modalities == 4:
                    X.append(objData['temperature'][i] + log_S.flatten().tolist())
                elif modalities == 5:
                    X.append(objData['temperature'][i] + objData['force0'][i] + objData['force1'][i] + log_S.flatten().tolist())
                elif modalities == 6:
                    X.append(objData['force0'][i] + objData['force1'][i] + log_S.flatten().tolist())
    if leaveObjectOut:
        return objects
    else:
        X = np.array(X)
        y = np.array(y)
        if verbose:
            print 'X:', np.shape(X), 'y:', np.shape(y)
        return X, y
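firstDeriv is a project helper that is not shown in this listing; a plausible stand-in using numpy.gradient (the name and list-based signature are inferred from the calls above):

import numpy as np

def firstDeriv(values, times):
    # hypothetical stand-in: numerical first derivative of `values`
    # sampled at the (possibly non-uniform) time stamps in `times`
    return list(np.gradient(np.asarray(values, dtype=float),
                            np.asarray(times, dtype=float)))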
Source file: audio_conv_utils.py, from project deep-learning-keras-projects by jasmeetsb
def preprocess_input(audio_path, dim_ordering='default'):
    """Reads an audio file and outputs a Mel-spectrogram.

    # Arguments
        audio_path: path to the target audio file.
        dim_ordering: data format for the output spectrogram image.

    # Returns
        3D Numpy tensor encoding the Mel-spectrogram.

    # Raises
        ImportError: if librosa is not available.
    """
    if dim_ordering == 'default':
        dim_ordering = K.image_dim_ordering()
    assert dim_ordering in {'tf', 'th'}
    if librosa is None:
        raise ImportError('Librosa is required to process audio files. '
                          'Install it via `pip install librosa` or visit '
                          'http://librosa.github.io/librosa/ for details.')
    # mel-spectrogram parameters
    sr = 12000
    n_fft = 512
    n_mels = 96
    hop_length = 256
    duration = 29.12
    src, sr = librosa.load(audio_path, sr=sr)
    n_sample = src.shape[0]
    n_sample_wanted = int(duration * sr)
    # trim the signal at the center
    if n_sample < n_sample_wanted:  # if too short
        src = np.hstack((src, np.zeros((int(duration * sr) - n_sample,))))
    elif n_sample > n_sample_wanted:  # if too long
        src = src[(n_sample - n_sample_wanted) // 2:
                  (n_sample + n_sample_wanted) // 2]
    logam = librosa.logamplitude
    melgram = librosa.feature.melspectrogram
    x = logam(melgram(y=src, sr=sr, hop_length=hop_length,
                      n_fft=n_fft, n_mels=n_mels) ** 2,
              ref_power=1.0)
    if dim_ordering == 'th':
        x = np.expand_dims(x, axis=0)
    elif dim_ordering == 'tf':
        x = np.expand_dims(x, axis=3)
    return x