def getdata(self):
# Containers for the per-song spectrograms and their genre labels
song_data = []
genre_data = []
# Read files from the folders
for x,_ in self.genres.items():
for root, subdirs, files in os.walk(self.file_path + x):
for file in files:
# Read the audio file
file_name = self.file_path + x + "/" + file
print(file_name)
signal, sr = librosa.load(file_name)
# Calculate the mel spectrogram of the first song_samples samples
melspec = librosa.feature.melspectrogram(signal[:self.song_samples],
sr = sr, n_fft = self.n_fft, hop_length = self.hop_length).T[:1280,]
# Append the result to the data structure
song_data.append(melspec)
genre_data.append(self.genres[x])
return np.array(song_data), keras.utils.to_categorical(genre_data, len(self.genres))
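A quick sanity check on the [:1280,] truncation above (a sketch assuming 30-second GTZAN-style clips at librosa's default 22050 Hz and a hop length of 512; the real values come from self.song_samples and self.hop_length):
import numpy as np

# Illustrative arithmetic only; assumes 30 s clips, 22050 Hz, hop_length=512.
sr, duration, hop_length = 22050, 30, 512
n_frames = 1 + (sr * duration) // hop_length  # librosa pads frames by default (center=True)
print(n_frames)  # 1292, so keeping the first 1280 frames gives a fixed-size array per song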
def __init__(self,
audio_file: Path,
id: Optional[str] = None,
sample_rate_to_convert_to: int = 16000,
label: Optional[str] = "nolabel",
fourier_window_length: int = 512,
hop_length: int = 128,
mel_frequency_count: int = 128,
label_with_tags: Optional[str] = None,
positional_label: Optional[PositionalLabel] = None):
# The default values for hop_length and fourier_window_length are powers of 2 near the values specified in the wave2letter paper.
if id is None:
id = name_without_extension(audio_file)
self.audio_file = audio_file
super().__init__(
id=id, get_raw_audio=lambda: librosa.load(str(self.audio_file), sr=self.sample_rate)[0],
label=label, sample_rate=sample_rate_to_convert_to,
fourier_window_length=fourier_window_length, hop_length=hop_length, mel_frequency_count=mel_frequency_count,
label_with_tags=label_with_tags, positional_label=positional_label)
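For orientation, a small illustrative sketch (not part of the class) of the frame geometry those defaults imply at the 16 kHz default sample rate:
# Illustrative arithmetic for the default parameters above.
sample_rate = 16000
fourier_window_length = 512   # 512 / 16000 s = 32 ms analysis window
hop_length = 128              # 128 / 16000 s = 8 ms between frames
print(sample_rate / hop_length)   # 125 spectrogram frames per second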
def __init__(self, images, labels, fake_data=False, one_hot=False, load=False):
"""Construct a DataSet. one_hot arg is used only if fake_data is true."""
if fake_data:
self._num_examples = 10000
self.one_hot = one_hot
else:
num = len(images)
assert num == len(labels), ('images.shape: %s labels.shape: %s' % (images.shape, labels.shape))
print("len(images) %d" % num)
self._num_examples = num
self.cache={}
self._image_names = numpy.array(images)
self._labels = labels
self._epochs_completed = 0
self._index_in_epoch = 0
self._images=[]
if load: # Otherwise loaded on demand
self._images=self.load(self._image_names)
def read_data_sets(train_dir,source_data=Source.NUMBER_IMAGES, fake_data=False, one_hot=True):
class DataSets(object):
pass
data_sets = DataSets()
if fake_data:
data_sets.train = DataSet([], [], fake_data=True, one_hot=one_hot)
data_sets.validation = DataSet([], [], fake_data=True, one_hot=one_hot)
data_sets.test = DataSet([], [], fake_data=True, one_hot=one_hot)
return data_sets
VALIDATION_SIZE = 2000
local_file = maybe_download(source_data, train_dir)
train_images = extract_images(TRAIN_INDEX,train=True)
train_labels = extract_labels(TRAIN_INDEX,train=True, one_hot=one_hot)
test_images = extract_images(TEST_INDEX,train=False)
test_labels = extract_labels(TEST_INDEX,train=False, one_hot=one_hot)
# train_images = train_images[:VALIDATION_SIZE]
# train_labels = train_labels[:VALIDATION_SIZE:]
# test_images = test_images[VALIDATION_SIZE:]
# test_labels = test_labels[VALIDATION_SIZE:]
data_sets.train = DataSet(train_images, train_labels , load=False)
data_sets.test = DataSet(test_images, test_labels, load=True)
# data_sets.validation = DataSet(validation_images, validation_labels, load=True)
return data_sets
def load_audio(audio_filename, sample_rate):
"""Loads an audio file.
Args:
audio_filename: File path to load.
sample_rate: The number of samples per second at which the audio will be
returned. Resampling will be performed if necessary.
Returns:
A numpy array of audio samples, single-channel (mono) and sampled at the
specified rate, in float32 format.
Raises:
AudioIOReadException: If librosa is unable to load the audio data.
"""
try:
y, unused_sr = librosa.load(audio_filename, sr=sample_rate, mono=True)
except Exception as e: # pylint: disable=broad-except
raise AudioIOReadException(e)
return y
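A minimal usage sketch for load_audio; the file name is a placeholder, and AudioIOReadException is assumed to be defined in the surrounding module, as the docstring indicates.
try:
    samples = load_audio("example.wav", sample_rate=16000)  # "example.wav" is illustrative
    print("loaded %d samples (%.2f s)" % (len(samples), len(samples) / 16000.0))
except AudioIOReadException as err:
    print("could not read audio: %s" % err)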
def get_seeds(self, audio_filepath):
"""Get the seeds file to pass to the HLL tracker.
Parameters
----------
audio_filepath : str
Path to audio file.
Returns
-------
seeds_fpath : str
Path to the seeds output file.
"""
y, sr = librosa.load(audio_filepath, sr=44100)
y_harmonic = librosa.effects.harmonic(y)
cqt, samples, freqs = self._compute_cqt(y_harmonic, sr)
seeds = self._pick_seeds_cqt(cqt, freqs, samples)
seeds_fpath = tmp.mktemp('.csv')
with open(seeds_fpath, 'w') as fhandle:
writer = csv.writer(fhandle, delimiter=',')
writer.writerows(seeds)
return seeds_fpath
def compute_spectrograms(filename):
out_rate = 12000
N_FFT = 512
HOP_LEN = 256
frames, rate = librosa.load(filename, sr=out_rate, mono=True)
if len(frames) < out_rate*3:
# if shorter than 3 seconds - can't process
raise Exception("Audio duration is too short")
logam = librosa.logamplitude
melgram = librosa.feature.melspectrogram
x = logam(melgram(y=frames, sr=out_rate, hop_length=HOP_LEN,
n_fft=N_FFT, n_mels=N_MEL_BANDS) ** 2,
ref_power=1.0)
# now going through spectrogram with the stride of the segment duration
for start_idx in range(0, x.shape[1] - SEGMENT_DUR + 1, SEGMENT_DUR):
yield x[:, start_idx:start_idx + SEGMENT_DUR]
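A hedged usage sketch for the generator above; N_MEL_BANDS and SEGMENT_DUR are module-level constants in the original file, so the values below are placeholders, and an older librosa that still provides logamplitude is assumed.
import numpy as np

N_MEL_BANDS = 96    # placeholder; defined at module level in the original file
SEGMENT_DUR = 140   # placeholder; number of spectrogram frames per segment

segments = list(compute_spectrograms("song.mp3"))  # "song.mp3" is illustrative
if segments:
    batch = np.stack(segments)  # shape: (n_segments, N_MEL_BANDS, SEGMENT_DUR)
    print(batch.shape)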
def predict_on_long_clips():
"""Load the saved model and perform inference/prediction on features obtained from inputs.
Splits the audio into 10-second chunks and predicts on those chunks."""
with open(FILENAMES,"r") as fh:
filecontents=fh.read()
filenames=filecontents.splitlines()
random.shuffle(filenames)
filenames=filenames[:5] #[:5] is for quickly verifying if things work
filenames = [DATASET_LOCATION+f for f in filenames]
session = tf.Session()
saver = tf.train.import_meta_graph(IMPORT_META_GRAPH)
# Initialize first, then restore so the checkpoint values are not overwritten.
tf.global_variables_initializer().run(session=session)
saver.restore(session, tf.train.latest_checkpoint(IMPORT_LATEST_CHECKPOINT))
test_x = {}
for f in filenames:
s, sr = librosa.load(f)
total_chunks = s.shape[0] // max_audio_length
waveforms = [s[max_audio_length*i:max_audio_length*(i+1)] for i in range(total_chunks)]
test_x[f] = extract_features_from_waveforms(waveforms)
print "FILENAME: ", f
predictions = session.run(tf.argmax(pred, 1), feed_dict={X: test_x[f]})
print [possible_categories[p] for p in predictions]
Source file: predict_convnet_laughterornot_10sec_model.py (project: laughter, author: ganesh-srinivas)
def loadFile(self, fname):
'''
fname: filename of the sound file we want to load
'''
if self.verbose: print('Loading %s' % fname)
if self.cached:
if not os.path.exists(fname + '-mfcc.npy'):
y, sr = librosa.load(fname)
data = mfcc(y=y, sr=sr).T
np.save(fname + '-mfcc.npy', data)
else:
data = np.load(fname + '-mfcc.npy')
else:
y, sr = librosa.load(fname)
# TODO: Add ability to filter by seconds/duration
# seconds = y.size/sr
data = mfcc(y=y, sr=sr).T
return data
def get_mfccs_and_deltas(wav_pathname, n_mfcc=13, n_fft=2048, freq_min=100, freq_max=16000):
sample_array, sample_rate = librosa.load(wav_pathname, sr=44100)
if len(sample_array) == 0:
return []
else:
mfcc = librosa.feature.mfcc(sample_array, sample_rate, n_fft=n_fft, hop_length=n_fft, n_mfcc=n_mfcc, fmin=freq_min, fmax=freq_max)
delta = librosa.feature.delta(mfcc)
delta2 = librosa.feature.delta(mfcc, order=2)
mfcc = mfcc.T ### Transposing tables
delta = delta.T ## (We can instead set the axis above to do this without the extra step)
delta2 = delta2.T
mfcc_sans_0th = [frame_values[1:] for frame_values in mfcc]
all_features = []
for i in range(len(mfcc)):
all_features.append(list(mfcc_sans_0th[i]) + list(delta[i]) + list(delta2[i]))
return all_features
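A short sketch of stacking the per-frame feature rows returned above into a matrix; the input path is a placeholder.
import numpy as np

features = get_mfccs_and_deltas("speech.wav")  # hypothetical WAV file
if features:
    feature_matrix = np.array(features)
    # 12 MFCCs (0th dropped) + 13 deltas + 13 delta-deltas = 38 values per frame
    print(feature_matrix.shape)  # (n_frames, 38)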
def recognise_mfcc(filePath,outputDir,outputName,debug):
print("start decompose harmonic/percussive and extract mfcc {0}".format(filePath))
y,sr = librosa.load(filePath)
mfcc = librosa.feature.mfcc(y=y,sr=sr)
mfcc = np.transpose(mfcc)
basePath = outputDir + outputName
np.savetxt(basePath+"_normal_mfcc.csv",mfcc,delimiter=",")
harmonic_sep = 3.0
percussive_sep = 3.0
h,p = librosa.effects.hpss(y,margin=(harmonic_sep,percussive_sep))
hmfcc = librosa.feature.mfcc(y=h,sr=sr)
hmfcc = np.transpose(hmfcc)
np.savetxt(basePath+"_harmonic_mfcc.csv",hmfcc,delimiter=",")
pmfcc = librosa.feature.mfcc(y=p,sr=sr)
pmfcc = np.transpose(pmfcc)
np.savetxt(basePath+"_percussive_mfcc.csv",pmfcc,delimiter=",")
# extract rhythm pattern with rp_extract
def load_generic_audio(directory, sample_rate):
'''Generator that yields audio waveforms from the directory.'''
files = find_files(directory)
id_reg_exp = re.compile(FILE_PATTERN)
print("files length: {}".format(len(files)))
randomized_files = randomize_files(files)
for filename in randomized_files:
ids = id_reg_exp.findall(filename)
if not ids:
# The file name does not match the pattern containing ids, so
# there is no id.
category_id = None
else:
# The file name matches the pattern for containing ids.
category_id = int(ids[0][0])
audio, _ = librosa.load(filename, sr=sample_rate, mono=True)
audio = audio.reshape(-1, 1)
yield audio, filename, category_id
def get_audio_analysis(song_url):
if(song_url is None):
return None, None, None, None, None
urlretrieve(song_url, "current.mp3")
y, sr = librosa.load("./current.mp3")
# Tempo = beats/minute
tempo, beats = librosa.beat.beat_track(y=y, sr=sr)
# pitch = Frequency
pitches, magnitudes = librosa.piptrack(y=y, sr=sr,
fmax=1000, hop_length=1000)
pitches, magnitudes = extract_max(pitches, magnitudes, pitches.shape)
y[abs(y) < 10**-2] = 0
y = np.trim_zeros(y)
json = {
'sound_wave': np.array(y[:len(pitches)]).tolist(),
'pitch': pitches
}
y_harm, y_per = librosa.effects.hpss(y)
harm, perc = audio_fingerprint(y_harm), audio_fingerprint(y_per)
pitch_ave = np.average(pitches)
return float(tempo), float(pitch_ave), float(harm), float(perc), json
def main():
outdir = 'mix'
if not os.path.exists(outdir):
os.makedirs(outdir)
audio_total1, sr = librosa.load('./cao.wav', sr=sample_rate, mono=True)
audio_total2, sr = librosa.load('./huang.wav', sr=sample_rate, mono=True)
seglen = int(sav_n_secs * sr)
len1 = audio_total1.shape[0] - seglen
len2 = audio_total2.shape[0] - seglen
for i in range(train_data_num):
if i % 100 == 0:
print(i)
idx1=random.randint(0, len1)
idx2=random.randint(0, len2)
mix(audio_total1[idx1:idx1+seglen], audio_total2[idx2:idx2+seglen], sample_rate, sav_n_secs,outdir,i)
def save_cache(src_path, des_path, get_feature_func):
des_path = osp.splitext(des_path)[0] + '.npy'
try:
X, sr = librosa.load(src_path)
sr = int(sr)  # ensure the sample rate is a plain int
feature = get_feature_func(X, sr)
print('[INFO] Saving Cache in {} ...'.format(des_path))
des_par = osp.abspath(osp.join(des_path, osp.pardir))
if not osp.exists(des_par):
os.makedirs(des_par)
except Exception as e:
print("[ERROR] Unknown error happened when dealing with {}".format(src_path))
#print(e)
return -1
np.save(des_path, feature)
return 0
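A brief usage sketch for save_cache; the paths are placeholders and the lambda simply computes MFCCs to show the (X, sr) callback contract.
import librosa

# Cache MFCC features for one clip; paths are illustrative.
save_cache("audio/clip.wav", "cache/clip.npy",
           lambda X, sr: librosa.feature.mfcc(y=X, sr=sr))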
def adjust_volume(in_fp):
def adjust(volume):
audio_p = audio + volume
fn_p = fn + "_" + str(volume) +"db" + ".wav"
fd = audio_p.export(path.join(out_dir, str(volume) + 'db', path.split(in_dir)[-1], fn_p), format=format)
in_dir, fn = path.split(in_fp)
fn, file_ext = path.splitext(fn)
file_ext = file_ext.lower()
format = file_ext.replace('.', '')
# audio = None
y, sr = librosa.load(in_fp, sr=44100)
tmp_in_fp = "tmp/" + fn + "_tmp.wav"
librosa.output.write_wav(tmp_in_fp, y, sr, norm=False)
format = "wav"
audio = aseg.from_file(tmp_in_fp, format)
os.remove(tmp_in_fp)
if audio is not None:
for v in volume_list:
adjust(v)
Source file: data_preprocess.py (project: crnn-music-genre-classification, author: meetshah1995)
def log_scale_melspectrogram(path, plot=False):
signal, sr = lb.load(path, sr=Fs)
n_sample = signal.shape[0]
n_sample_fit = int(DURA*Fs)
if n_sample < n_sample_fit:
signal = np.hstack((signal, np.zeros((int(DURA*Fs) - n_sample,))))
elif n_sample > n_sample_fit:
signal = signal[(n_sample - n_sample_fit) // 2:(n_sample + n_sample_fit) // 2]
melspect = lb.logamplitude(lb.feature.melspectrogram(y=signal, sr=Fs, hop_length=N_OVERLAP, n_fft=N_FFT, n_mels=N_MELS)**2, ref_power=1.0)
if plot:
melspect = melspect[np.newaxis, :]
misc.imshow(melspect.reshape((melspect.shape[1],melspect.shape[2])))
print(melspect.shape)
return melspect
def read_file_pair(filename_pair, mono=True):
"""
given a pair of file names, read in both waveforms and upsample (through
librosa's default interpolation) the downsampled waveform
assumes the file name pair is of the form ("original", "downsampled")
mono selects whether to read in mono or stereo formatted waveforms
returns a pair of numpy arrays representing the original and upsampled
waveform
"""
channel = 1 if mono else 2
true_waveform, true_br = librosa.load(filename_pair[0], sr=None,
mono=mono)
ds_waveform, _ = librosa.load(filename_pair[1], sr=true_br, mono=mono)
# truth, example
return true_waveform.reshape((-1, channel)), \
ds_waveform.reshape((-1, channel))
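A small usage sketch for read_file_pair; the file names are placeholders, and the pair order is ("original", "downsampled") as the docstring states.
pair = ("original_44k.wav", "downsampled_11k.wav")  # illustrative paths
truth, upsampled = read_file_pair(pair, mono=True)
print(truth.shape, upsampled.shape)  # each is (n_samples, 1) for mono input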
def extract(filename, fft_size=FFT_SIZE, dtype=np.float32):
''' Basic (WORLD) feature extraction '''
x, _ = librosa.load(filename, sr=args.fs, mono=True, dtype=np.float64)
features = wav2pw(x, args.fs, fft_size=fft_size)
ap = features['ap']
f0 = features['f0'].reshape([-1, 1])
sp = features['sp']
en = np.sum(sp + EPSILON, axis=1, keepdims=True)
sp = np.log10(sp / en)
return np.concatenate([sp, ap, f0, en], axis=1).astype(dtype)
Source file: trainModel.py (project: Sound-classification-on-Raspberry-Pi-with-Tensorflow, author: GianlucaPaolocci)
def extract_features(file_name):
X, sample_rate = librosa.load(file_name)
stft = np.abs(librosa.stft(X))
mfccs = np.array(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=8).T)
chroma = np.array(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T)
mel = np.array(librosa.feature.melspectrogram(X, sr=sample_rate).T)
contrast = np.array(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T)
tonnetz = np.array(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T)
return mfccs,chroma,mel,contrast,tonnetz
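A sketch of one common way to reduce the five feature blocks returned above to a single fixed-length clip vector; the time-averaging step is an assumption, not taken from the original project.
import numpy as np

mfccs, chroma, mel, contrast, tonnetz = extract_features("sound.wav")  # illustrative path
# Each block is (n_frames, n_features); averaging over time gives one vector per clip.
clip_vector = np.hstack([block.mean(axis=0)
                         for block in (mfccs, chroma, mel, contrast, tonnetz)])
print(clip_vector.shape)  # (161,) with the feature sizes used above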
def _load_from_cache(self):
try:
return numpy.load(str(self.spectrogram_cache_file))
except ValueError:
log("Recalculating cached file {} because loading failed.".format(self.spectrogram_cache_file))
return self._calculate_and_save_spectrogram()
def SIGNAL():
y, sr = librosa.load(librosa.util.example_audio_file(),
sr=None)
return y, sr
def create_marked_audio_file(mark_locations: Union[List[float], np.ndarray], output_path: Opt[str] = None, *,
audio_file: Opt[str] = None, duration: float = None):
if audio_file:
y, sr = librosa.load(audio_file)
marked_audio = librosa.core.clicks(times=mark_locations, sr=sr, length=len(y))
marked_audio = y + marked_audio
elif duration:
sr = 22050
marked_audio = librosa.core.clicks(times=mark_locations, sr=sr, length=int(sr * duration))
else:
raise ParameterError("Must provide either audio file or duration.")
librosa.output.write_wav(path=output_path, y=marked_audio, sr=sr)
return output_path
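A usage sketch for the click-marking helper above, placing clicks at beat times detected with librosa; the paths are placeholders, and an older librosa that still has librosa.output.write_wav is assumed, as in the function itself.
import librosa

y, sr = librosa.load("track.wav")  # illustrative input
tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)
beat_times = librosa.frames_to_time(beat_frames, sr=sr)
create_marked_audio_file(beat_times, "track_with_clicks.wav", audio_file="track.wav")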
def __init__(self, file: str, *, sample_rate: int = 44100):
"""
Parameters
----------
file
Audio file to load
sample_rate
Sample rate to resample the audio to when loading
"""
self.file = file
self.samples, self.sample_rate = librosa.load(file, sr=sample_rate)
self.duration = librosa.get_duration(y=self.samples, sr=self.sample_rate)
def clip_audio(specs, raw_audio, output):
# Load the spec data. In clipping audio, we hold the specs fixed.
spec_filenames = next(os.walk(specs))[2]
if len(spec_filenames) == 0:
print("No specs found.")
return
for spec_filename in spec_filenames:
with open(os.path.join(specs, spec_filename)) as f:
spec = json.load(f)
youtube_id = spec['audio_source']['youtube_id']
start_time = spec['audio_source']['start_time']
end_time = spec['audio_source']['end_time']
raw_audio_filenames = glob.glob(os.path.join(raw_audio, youtube_id + '.*'))
if len(raw_audio_filenames) == 0:
# No audio file found, skip.
continue
raw_audio_filename = raw_audio_filenames[0]
raw_audio_extension = os.path.splitext(raw_audio_filename)[1]
clip_filename = os.path.join(
output, CLIP_NAME_PATTERN.format(youtube_id, start_time, end_time) +
raw_audio_extension)
# Call ffmpeg to output the trimmed clip.
os.makedirs(os.path.dirname(clip_filename), exist_ok=True)
call1 = ['ffmpeg', '-loglevel', 'error', '-n',
'-ss', str(start_time), '-t', str(end_time - start_time),
'-i', raw_audio_filename]
if raw_audio_extension == '.ogg':  # splitext keeps the leading dot
call2 = ['-codec:a', 'libvorbis', '-strict', 'experimental']
else:
call2 = []
call3 = [clip_filename]
process = subprocess.run(call1 + call2 + call3)
if process.returncode != 0:
print("Error: {} encountered by {}".format(
process.returncode, clip_filename))
else:
print(clip_filename)
def test_dtw_aligner():
x, fs = librosa.load(example_audio_file(), sr=None)
assert fs == 16000
x_fast = librosa.effects.time_stretch(x, 2.0)
X = _get_mcep(x, fs)
Y = _get_mcep(x_fast, fs)
D = X.shape[-1]
# Create padded pair
X, Y = adjast_frame_lengths(X, Y, divisible_by=2)
# Add utterance axis
X = X.reshape(1, -1, D)
Y = Y.reshape(1, -1, D)
X_aligned, Y_aligned = DTWAligner().transform((X, Y))
assert X_aligned.shape == Y_aligned.shape
assert np.linalg.norm(X_aligned - Y_aligned) < np.linalg.norm(X - Y)
X_aligned, Y_aligned = IterativeDTWAligner(
n_iter=2, max_iter_gmm=10, n_components_gmm=2).transform((X, Y))
assert X_aligned.shape == Y_aligned.shape
assert np.linalg.norm(X_aligned - Y_aligned) < np.linalg.norm(X - Y)
# Custom dist function
from nnmnkwii.metrics import melcd
X_aligned, Y_aligned = DTWAligner(dist=melcd).transform((X, Y))
assert np.linalg.norm(X_aligned - Y_aligned) < np.linalg.norm(X - Y)
def mfcc_batch_generator(batch_size=10, source=Source.DIGIT_WAVES, target=Target.digits):
maybe_download(source, DATA_DIR)
if target == Target.speaker: speakers = get_speakers()
batch_features = []
labels = []
files = os.listdir(path)
while True:
print("loaded batch of %d files" % len(files))
shuffle(files)
for file in files:
if not file.endswith(".wav"): continue
wave, sr = librosa.load(path+file, mono=True)
mfcc = librosa.feature.mfcc(wave, sr)
if target==Target.speaker: label=one_hot_from_item(speaker(file), speakers)
elif target==Target.digits: label=dense_to_one_hot(int(file[0]),10)
elif target==Target.first_letter: label=dense_to_one_hot((ord(file[0]) - 48) % 32,32)
elif target == Target.hotword: label = one_hot_word(file, pad_to=max_word_length) #
elif target == Target.word: label=string_to_int_word(file, pad_to=max_word_length)
# label = file # sparse_labels(file, pad_to=20) # max_output_length
else: raise Exception("todo : labels for Target!")
labels.append(label)
# print(np.array(mfcc).shape)
mfcc=np.pad(mfcc,((0,0),(0,80-len(mfcc[0]))), mode='constant', constant_values=0)
batch_features.append(np.array(mfcc))
if len(batch_features) >= batch_size:
# if target == Target.word: labels = sparse_labels(labels)
# labels=np.array(labels)
# print(np.array(batch_features).shape)
# yield np.array(batch_features), labels
# print(np.array(labels).shape) # why (64,) instead of (64, 15, 32)? OK IFF dim_1==const (20)
yield batch_features, labels # basic_rnn_seq2seq inputs must be a sequence
batch_features = [] # Reset for next batch
labels = []
# If you set dynamic_pad=True when calling tf.train.batch the returned batch will be automatically padded with 0s. Handy! A lower-level option is to use tf.PaddingFIFOQueue.
# only apply to a subset of all images at one time
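A hedged sketch of consuming the batch generator above; Source, Target and the helpers it relies on come from the surrounding project and are assumed to be importable.
import numpy as np

batch_gen = mfcc_batch_generator(batch_size=64)
X_batch, y_batch = next(batch_gen)
X_batch, y_batch = np.array(X_batch), np.array(y_batch)
print(X_batch.shape, y_batch.shape)  # roughly (64, 20, 80) MFCCs and (64, 10) one-hot labels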