import numpy as np
import scipy.signal
import scipy.io.wavfile
import librosa  # used by most of the snippets below
import tensorflow as tf  # this first snippet targets the TF 1.x graph API (placeholder/Session)

def stft(wav, n_fft=1024, overlap=4, dt=tf.int32, absp=False):
    assert wav.shape[0] > n_fft
    X = tf.placeholder(dtype=dt, shape=wav.shape)
    X = tf.cast(X, tf.float32)
    hop = n_fft // overlap  # integer hop; plain `/` would break range() under Python 3
    ## prepare the constant analysis window (scipy.hanning was removed; use get_window)
    W = tf.constant(scipy.signal.get_window('hann', n_fft), dtype=tf.float32)
    # one FFT per windowed frame, stacked into the STFT matrix
    # (tf.pack and tf.complex_abs were renamed tf.stack and tf.abs in TF 1.0)
    S = tf.stack([tf.fft(tf.cast(tf.multiply(W, X[i:i + n_fft]), tf.complex64))
                  for i in range(1, wav.shape[0] - n_fft, hop)])
    abs_S = tf.abs(S)
    sess = tf.Session()
    if absp:
        return sess.run(abs_S, feed_dict={X: wav})
    else:
        return sess.run(S, feed_dict={X: wav})
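A minimal usage sketch for the graph-mode stft() above; the file name and the 16-bit mono WAV assumption are placeholders:

sr, samples = scipy.io.wavfile.read('example.wav')           # hypothetical mono file
mag = stft(samples.astype(np.int32), n_fft=1024, absp=True)  # feed dtype must match dt
print(mag.shape)                                             # (n_frames, n_fft) magnitudes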
def griffin_lim(mag, phase_angle, n_fft, hop, num_iters):
    """Iterative algorithm for phase retrieval from a magnitude spectrogram.

    Args:
        mag: Magnitude spectrogram.
        phase_angle: Initial condition for the phase.
        n_fft: Size of the FFT.
        hop: Stride of the FFT, typically n_fft // 2.
        num_iters: Number of Griffin-Lim iterations to perform.

    Returns:
        audio: 1-D array of float32 sound samples.
    """
    fft_config = dict(n_fft=n_fft, win_length=n_fft, hop_length=hop, center=True)
    ifft_config = dict(win_length=n_fft, hop_length=hop, center=True)
    complex_specgram = inv_magphase(mag, phase_angle)
    for i in range(num_iters):
        audio = librosa.istft(complex_specgram, **ifft_config)
        if i != num_iters - 1:
            complex_specgram = librosa.stft(audio, **fft_config)
            _, phase = librosa.magphase(complex_specgram)
            phase_angle = np.angle(phase)
            complex_specgram = inv_magphase(mag, phase_angle)
    return audio
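inv_magphase is referenced but not defined in this snippet; a minimal version consistent with its use above:

def inv_magphase(mag, phase_angle):
    # recombine a magnitude spectrogram and a phase angle into a complex STFT
    return mag * np.exp(1j * phase_angle)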
def make_spectrum(self, filename, use_normalize):
    sr, y = wav.read(filename)
    if sr != 16000:
        raise ValueError('Sampling rate is expected to be 16kHz!')
    if y.dtype != 'float32':
        y = np.float32(y / 32767.)  # scale int16 PCM into [-1, 1]
    # 512-point Hamming STFT with 50% overlap -> 257 frequency bins
    D = librosa.stft(y, n_fft=512, hop_length=256, win_length=512,
                     window=scipy.signal.windows.hamming)
    Sxx = np.log10(abs(D) ** 2)
    if use_normalize:
        mean = np.mean(Sxx, axis=1).reshape((257, 1))
        std = np.std(Sxx, axis=1).reshape((257, 1)) + 1e-12
        Sxx = (Sxx - mean) / std
    slices = []
    for i in range(0, Sxx.shape[1] - self.FRAMELENGTH, self.OVERLAP):
        slices.append(Sxx[:, i:i + self.FRAMELENGTH])
    return np.array(slices)
def test_stft_istft(self):
    try:
        import librosa
        ds = F.load_digit_wav()
        name = list(ds.keys())[0]  # dict views are not indexable in Python 3
        path = ds[name]
        y, _ = speech.read(path, pcm=True)
        hop_length = int(0.01 * 8000)
        stft = signal.stft(y, n_fft=256, hop_length=hop_length, window='hann')
        stft_ = librosa.stft(y, n_fft=256, hop_length=hop_length, window='hann')
        self.assertTrue(np.allclose(stft, stft_.T))
        y1 = signal.istft(stft, hop_length=hop_length, window='hann')
        y2 = librosa.istft(stft_, hop_length=hop_length, window='hann')
        self.assertTrue(np.allclose(y1, y2))
    except ImportError:
        print("test_stft_istft requires librosa.")
def griffinlim(spectrogram, n_iter=50, window='hann', n_fft=2048, win_length=2048, hop_length=-1, verbose=False):
    if hop_length == -1:
        hop_length = n_fft // 4
    # start from uniformly random phase
    angles = np.exp(2j * np.pi * np.random.rand(*spectrogram.shape))
    t = tqdm(range(n_iter), ncols=100, mininterval=2.0, disable=not verbose)
    for i in t:
        full = np.abs(spectrogram).astype(np.complex128) * angles  # the np.complex alias was removed in NumPy 1.24
        inverse = librosa.istft(full, hop_length=hop_length, win_length=win_length, window=window)
        rebuilt = librosa.stft(inverse, n_fft=n_fft, hop_length=hop_length, win_length=win_length, window=window)
        angles = np.exp(1j * np.angle(rebuilt))
        if verbose:
            diff = np.abs(spectrogram) - np.abs(rebuilt)
            t.set_postfix(loss=np.linalg.norm(diff, 'fro'))
    full = np.abs(spectrogram).astype(np.complex128) * angles
    inverse = librosa.istft(full, hop_length=hop_length, win_length=win_length, window=window)
    return inverse
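A minimal usage sketch for griffinlim(); the input file is a placeholder:

y, sr = librosa.load('example.wav', sr=None)      # hypothetical input
mag = np.abs(librosa.stft(y, n_fft=2048))
y_hat = griffinlim(mag, n_iter=50, verbose=True)  # audio re-estimated from magnitude alone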
def __call__(self, y):
    """Short-time Fourier transform (STFT).

    Returns a complex-valued matrix D such that
    `np.abs(D[f, t])` is the magnitude of frequency bin `f` at frame `t`, and
    `np.angle(D[f, t])` is the phase of frequency bin `f` at frame `t`.

    Parameters
    ----------
    y : np.ndarray [shape=(n,)], real-valued
        the input signal (audio time series)

    Returns
    -------
    D : np.ndarray [shape=(1 + n_fft/2, t), dtype=dtype]
        STFT matrix
    """
    return librosa.stft(y, **self.__dict__)
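The call above forwards every instance attribute to librosa.stft via **self.__dict__; a minimal constructor consistent with that trick (class name and defaults are assumptions):

class STFT(object):
    def __init__(self, n_fft=2048, hop_length=512, win_length=None, window='hann'):
        # every attribute set here becomes a keyword argument of librosa.stft
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.win_length = win_length
        self.window = window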
trainModel.py (project: Sound-classification-on-Raspberry-Pi-with-Tensorflow, author: GianlucaPaolocci)
def extract_features(file_name):
    X, sample_rate = librosa.load(file_name)
    stft = np.abs(librosa.stft(X))
    mfccs = np.array(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=8).T)
    chroma = np.array(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T)
    mel = np.array(librosa.feature.melspectrogram(y=X, sr=sample_rate).T)  # keyword y=; positional audio was dropped in librosa 0.10
    contrast = np.array(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T)
    tonnetz = np.array(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T)
    return mfccs, chroma, mel, contrast, tonnetz
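A hedged usage sketch that stacks the per-frame features, mirroring what classiPi.py below does with np.hstack (the file name is a placeholder; the shared default hop length keeps all five matrices on the same frame count):

mfccs, chroma, mel, contrast, tonnetz = extract_features('example.wav')
frame_features = np.hstack([mfccs, chroma, mel, contrast, tonnetz])  # (n_frames, n_features)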
classiPi.py (project: Sound-classification-on-Raspberry-Pi-with-Tensorflow, author: GianlucaPaolocci)
def extract_features():
    # `duration` and `sample_rate` are module-level settings in the original project
    X = sounddevice.rec(int(duration * sample_rate), samplerate=sample_rate, channels=1)
    sounddevice.wait()
    X = np.squeeze(X)
    stft = np.abs(librosa.stft(X))
    mfccs = np.array(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=8).T)
    chroma = np.array(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T)
    mel = np.array(librosa.feature.melspectrogram(y=X, sr=sample_rate).T)
    contrast = np.array(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T)
    tonnetz = np.array(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T)
    # stack all features for this recording into one matrix
    ext_features = np.hstack([mfccs, chroma, mel, contrast, tonnetz])
    return ext_features
def _complex_spectrogram(self) -> ndarray:
    return librosa.stft(y=self.get_raw_audio(), n_fft=self.fourier_window_length, hop_length=self.hop_length)
def _griffin_lim(S, n_fft, win_length, hop_length, num_iters):
    # random initial phase
    angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
    S_complex = np.abs(S).astype(np.complex128)
    for i in range(num_iters):
        if i > 0:
            angles = np.exp(1j * np.angle(librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length)))
        y = librosa.istft(S_complex * angles, hop_length=hop_length, win_length=win_length)
    return y
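A hedged usage sketch for _griffin_lim; the parameter values are assumptions:

y0, _ = librosa.load('example.wav', sr=None)  # hypothetical input
mag = np.abs(librosa.stft(y0, n_fft=1024, hop_length=256, win_length=1024))
y_hat = _griffin_lim(mag, n_fft=1024, win_length=1024, hop_length=256, num_iters=60)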
def create_spectrogram_from_audio(data):
    global setting
    spectrogram = librosa.stft(data, n_fft=Config.n_fft, hop_length=Config.hop_length).transpose()
    # divide the real and imaginary components of each element, then
    # concatenate the matrix with the real components and the matrix with the imaginary components
    # (DataCorruptionError when saving complex numbers in TFRecords)
    # concatenated = np.concatenate([np.real(spectrogram), np.imag(spectrogram)], axis=1)
    return spectrogram  # [num_time_frames, num_freq_bins]
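If the commented-out real/imag split is used for TFRecords, inverting it is straightforward; a minimal sketch (the axis=1 halves follow from the concatenation above):

concatenated = np.concatenate([np.real(spectrogram), np.imag(spectrogram)], axis=1)
half = concatenated.shape[1] // 2
restored = concatenated[:, :half] + 1j * concatenated[:, half:]  # back to complex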
neural_network_audioset.py (project: TensorFlow_AudioSet_Example, author: DantesLegacy)
def plot_log_power_specgram(sound_names, raw_sounds):
    i = 1
    fig = plt.figure(figsize=(25, 60), dpi=900)
    for n, f in zip(sound_names, raw_sounds):
        plt.subplot(10, 1, i)
        # librosa.logamplitude was removed; power_to_db(ref=np.max) is the current equivalent
        D = librosa.power_to_db(np.abs(librosa.stft(f))**2, ref=np.max)
        librosa.display.specshow(D, x_axis='time', y_axis='log')
        plt.title(n.title())
        i += 1
    plt.suptitle('Figure 3: Log power spectrogram', x=0.5, y=0.915, fontsize=18)
    plt.show()
neural_network_audioset.py (same project as above)
def extract_feature(file_name):
    X, sample_rate = librosa.load(file_name)
    stft = np.abs(librosa.stft(X))
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
    mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T, axis=0)
    return mfccs, chroma, mel, contrast, tonnetz
def get_spectrograms(sound_file):
    '''Extracts melspectrogram and log magnitude from given `sound_file`.

    Args:
        sound_file: A string. Full path of a sound file.

    Returns:
        Transposed S: A 2d array. A transposed melspectrogram with shape of (T, n_mels).
        Transposed magnitude: A 2d array. Has shape of (T, 1 + hp.n_fft//2).
    '''
    # Load the sound file at its native rate (or set sr to hp.sr)
    y, sr = librosa.load(sound_file, sr=None)
    # STFT. D: (1 + n_fft//2, T)
    D = librosa.stft(y=y,
                     n_fft=hp.n_fft,
                     hop_length=hp.hop_length,
                     win_length=hp.win_length)
    # magnitude spectrogram
    magnitude = np.abs(D)  # (1 + n_fft//2, T)
    # power spectrogram
    power = magnitude ** 2  # (1 + n_fft//2, T)
    # mel spectrogram
    S = librosa.feature.melspectrogram(S=power, n_mels=hp.n_mels)  # (n_mels, T)
    return np.transpose(S.astype(np.float32)), np.transpose(magnitude.astype(np.float32))  # (T, n_mels), (T, 1 + n_fft//2)
def spectrogram2wav(spectrogram):
    '''
    spectrogram: [t, f], i.e. [t, n_fft // 2 + 1]
    '''
    spectrogram = spectrogram.T  # [f, t]
    X_best = copy.deepcopy(spectrogram)  # [f, t]
    for i in range(hp.n_iter):
        X_t = invert_spectrogram(X_best)
        # re-estimate phase from the inverted signal (keyword args; they became keyword-only in librosa 0.10)
        est = librosa.stft(X_t, n_fft=hp.n_fft, hop_length=hp.hop_length, win_length=hp.win_length)  # [f, t]
        phase = est / np.maximum(1e-8, np.abs(est))  # [f, t]
        X_best = spectrogram * phase  # [f, t]
    X_t = invert_spectrogram(X_best)
    return np.real(X_t)
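invert_spectrogram is not shown in this snippet; a minimal version consistent with its use, plus a hypothetical round trip through get_spectrograms above:

def invert_spectrogram(spectrogram):
    # spectrogram: [f, t] complex values; assumes the same hop and window as the forward STFT
    return librosa.istft(spectrogram, hop_length=hp.hop_length, win_length=hp.win_length, window='hann')

mel, mag = get_spectrograms('example.wav')  # hypothetical input file
audio = spectrogram2wav(mag)                # Griffin-Lim style reconstruction from magnitude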
def make_spectrum_phase(y, FRAMESIZE, OVERLAP, FFTSIZE):
    D = librosa.stft(y, n_fft=FRAMESIZE, hop_length=OVERLAP, win_length=FFTSIZE,
                     window=scipy.signal.windows.hamming)
    Sxx = np.log10(abs(D)**2)
    phase = np.exp(1j * np.angle(D))
    # the hard-coded 257 bins assume FRAMESIZE == 512
    mean = np.mean(Sxx, axis=1).reshape((257, 1))
    std = np.std(Sxx, axis=1).reshape((257, 1)) + 1e-12
    Sxx = (Sxx - mean) / std
    return Sxx, phase, mean, std
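A matching reconstruction sketch for the normalized log-power spectrum above; the function name is an assumption:

def recons_spectrum_phase(Sxx, phase, mean, std, OVERLAP, FFTSIZE):
    Sxx = Sxx * std + mean    # undo the per-bin normalization
    mag = np.sqrt(10 ** Sxx)  # log10 power back to linear magnitude
    return librosa.istft(mag * phase, hop_length=OVERLAP, win_length=FFTSIZE,
                         window=scipy.signal.windows.hamming)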
def get_feature_aqibsaeed_1(X, sr, au_path=None):
    """
    http://aqibsaeed.github.io/2016-09-03-urban-sound-classification-part-1/
    """
    import librosa
    if au_path is not None:
        X, sr = librosa.load(au_path)
    stft = np.abs(librosa.stft(X))
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sr, n_mfcc=40).T, axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sr).T, axis=0)
    mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sr).T, axis=0)
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sr).T, axis=0)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sr).T, axis=0)
    feature = np.hstack([mfccs, chroma, mel, contrast, tonnetz])
    return feature
def transform_audio(self, y):
    '''Compute the STFT magnitude and phase.

    Parameters
    ----------
    y : np.ndarray
        The audio buffer

    Returns
    -------
    data : dict
        data['mag'] : np.ndarray, shape=(n_frames, 1 + n_fft//2)
            STFT magnitude
        data['phase'] : np.ndarray, shape=(n_frames, 1 + n_fft//2)
            STFT phase
    '''
    n_frames = self.n_frames(get_duration(y=y, sr=self.sr))
    D = stft(y, hop_length=self.hop_length, n_fft=self.n_fft)
    D = fix_length(D, size=n_frames)  # size= is keyword-only in librosa >= 0.10
    mag, phase = magphase(D)
    if self.log:
        mag = amplitude_to_db(mag, ref=np.max)
    return {'mag': mag.T[self.idx].astype(np.float32),
            'phase': np.angle(phase.T)[self.idx].astype(np.float32)}
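A hedged usage sketch, assuming the transformer object exposes sr, hop_length and n_fft (the variable name op is hypothetical):

y, _ = librosa.load('example.wav', sr=op.sr)  # hypothetical input
out = op.transform_audio(y)
print(out['mag'].shape, out['phase'].shape)   # (n_frames, 1 + n_fft//2) each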
def compute_spec(audio_file, spectro_file):
    # Get the actual audio
    audio, sr = librosa.load(audio_file, sr=config['resample_sr'])
    # Compute the spectrogram
    if config['spectrogram_type'] == 'cqt':
        # note: the real= flag only exists in older librosa versions
        spec = librosa.cqt(audio, sr=sr, hop_length=config['hop'], n_bins=config['cqt_bins'], real=False)
    elif config['spectrogram_type'] == 'mel':
        spec = librosa.feature.melspectrogram(y=audio, sr=sr, hop_length=config['hop'], n_fft=config['n_fft'], n_mels=config['n_mels'])
    elif config['spectrogram_type'] == 'stft':
        spec = librosa.stft(y=audio, n_fft=config['n_fft'])
    # Write the result (binary mode is required for pickle)
    with open(spectro_file, "wb") as f:
        pickle.dump(spec, f, protocol=-1)  # spec shape: MxN
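The matching read-back, for completeness (a minimal sketch):

with open(spectro_file, "rb") as f:
    spec = pickle.load(f)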
def expand(self, audio):
    ori_len = audio.shape[0]
    # downsample by 2x, then resample back to the original length
    tmp = resample(audio, r=0.5, type='sinc_best')
    down_len = tmp.shape[0]
    tmp = resample(tmp, r=(ori_len + 1) / float(down_len), type='sinc_best')
    tmp = librosa.stft(audio, n_fft=1024)
    phase = np.divide(tmp, np.abs(tmp))  # unit-magnitude phase of the original signal
    # n_input and n_len are module-level model dimensions in the original project
    spec_input = np.abs(librosa.stft(audio, n_fft=1024))[0:n_input, ::]
    spec_input = spec_input[::, 0:spec_input.shape[1] // n_len * n_len]
    spec_input = np.split(spec_input, spec_input.shape[1] // n_len, axis=1)
    spec_input = np.asarray(spec_input)
    spec_input = np.expand_dims(spec_input, axis=-1)
    feed_dict = {self.input_op: np.log1p(spec_input) / 12.0}
    debug = self.sess.run(self.debug_op, feed_dict=feed_dict)
    np.save('debug.npy', debug)
    S = self.sess.run(self.eva_op, feed_dict=feed_dict)
    # clip the network output into a sane magnitude range
    S[S >= 5e3] = 5e3
    S[S <= 0] = 0
    print('mean', np.mean(S))
    print(np.sum(np.isinf(S)))
    S = np.squeeze(np.concatenate(np.split(S, S.shape[0]), axis=2), axis=(0, -1))
    phase = phase[..., :S.shape[1]]
    print(phase.shape)
    print(S.shape)
    print(np.sum(np.isinf(np.multiply(S, phase))))
    X = librosa.istft(np.multiply(S, phase))
    return X
def get_spectrograms(sound_file):
    '''Extracts melspectrogram and log magnitude from given `sound_file`.

    Args:
        sound_file: A string. Full path of a sound file.

    Returns:
        Transposed S: A 2d array. A transposed melspectrogram with shape of (T, n_mels).
        Transposed magnitude: A 2d array. Has shape of (T, 1 + hp.n_fft//2).
    '''
    # This variant resamples to the project rate hp.sr on load
    y, sr = librosa.load(sound_file, sr=hp.sr)
    # STFT. D: (1 + n_fft//2, T)
    D = librosa.stft(y=y,
                     n_fft=hp.n_fft,
                     hop_length=hp.hop_length,
                     win_length=hp.win_length)
    # magnitude spectrogram
    magnitude = np.abs(D)  # (1 + n_fft//2, T)
    # power spectrogram
    power = magnitude ** 2  # (1 + n_fft//2, T)
    # mel spectrogram
    S = librosa.feature.melspectrogram(S=power, n_mels=hp.n_mels)  # (n_mels, T)
    return np.transpose(S.astype(np.float32)), np.transpose(magnitude.astype(np.float32))  # (T, n_mels), (T, 1 + n_fft//2)
def source_separation(self, x):
    if not Duration()(x) > 10:
        stftx = librosa.stft(x)
        real = stftx.real
        imag = stftx.imag
        ssp = find_sparse_source_points(real, imag)  # find sparsity in the signal
        cos_dist = cosine_distance(ssp)  # cosine distance from sparse data
        sources = find_number_of_sources(cos_dist)  # find the likely number of sources
        if (sources == 0) or (sources == 1):  # x is an instrumental track with no more than one source
            print("There's only one visible source")
            return x
        else:
            print("Separating sources")
            xs = NMF(stftx, sources)
            return xs[0]  # take the bass part  # TODO: correct NMF to return a noiseless reconstruction
    else:
        stftx = librosa.stft(x[:441000])  # take 10 seconds of signal data (at 44.1 kHz) to find sources
        print("It can take some time to find any source in this signal")
        real = stftx.real
        imag = stftx.imag
        ssp = find_sparse_source_points(real, imag)  # find sparsity in the signal
        cos_dist = cosine_distance(ssp)  # cosine distance from sparse data
        sources = find_number_of_sources(cos_dist)  # find the likely number of sources
        if (sources == 0) or (sources == 1):  # x is an instrumental track with no more than one source
            print("There's only one visible source")
            return x
        else:
            print("Separating sources")
            xs = NMF(librosa.stft(x), sources)
            return xs[0]  # take the bass part  # TODO: correct NMF to return a noiseless reconstruction
def sad_music_remix(self, neg_arous_dir, files, decisions, harmonic=None):
    for subdirs, dirs, sounds in os.walk(neg_arous_dir):
        fx = random.choice(sounds[::-1])
        fy = random.choice(sounds[:])
    x = MonoLoader(filename=neg_arous_dir + '/' + fx)()
    y = MonoLoader(filename=neg_arous_dir + '/' + fy)()
    fx = fx.split('.')[0]
    fy = fy.split('.')[0]
    fx = np.where(files == fx)[0][0]
    fy = np.where(files == fy)[0][0]
    if harmonic is False or harmonic is None:  # treat both False and None as the non-harmonic case
        dec_x = get_coordinate(fx, 1, decisions)
        dec_y = get_coordinate(fy, 1, decisions)
    else:
        dec_x = get_coordinate(fx, 2, decisions)
        dec_y = get_coordinate(fy, 2, decisions)
    x = self.source_separation(x)
    x = scratch_music(x, dec_x)
    x = x[np.nonzero(x)]
    y = scratch_music(y, dec_y)
    y = y[np.nonzero(y)]
    x, y = same_time(x, y)
    negative_arousal_samples = [i / i.max() for i in (x, y)]
    negative_arousal_x = np.array(negative_arousal_samples).sum(axis=0)
    negative_arousal_x = 0.5 * negative_arousal_x / negative_arousal_x.max()
    if harmonic is True:
        return librosa.decompose.hpss(librosa.stft(negative_arousal_x), margin=(1.0, 5.0))[0]
    if harmonic is False or harmonic is None:
        onsets = hfc_onsets(np.float32(negative_arousal_x))
        interv = seconds_to_indices(onsets)
        steps = overlapped_intervals(interv)
        output = librosa.effects.remix(negative_arousal_x, steps[::-1], align_zeros=False)
        output = librosa.effects.pitch_shift(output, sr=44100, n_steps=3)
        remix_filename = 'data/emotions/remixes/sad/' + str(time.strftime("%Y%m%d-%H:%M:%S")) + 'multitag_remix.ogg'
        MonoWriter(filename=remix_filename, format='ogg', sampleRate=44100)(np.float32(output))
        subprocess.call(["ffplay", "-nodisp", "-autoexit", remix_filename])
def happy_music_remix(self, pos_arous_dir, files, decisions, harmonic=None):
    for subdirs, dirs, sounds in os.walk(pos_arous_dir):
        fx = random.choice(sounds[::-1])
        fy = random.choice(sounds[:])
    x = MonoLoader(filename=pos_arous_dir + '/' + fx)()
    y = MonoLoader(filename=pos_arous_dir + '/' + fy)()
    fx = fx.split('.')[0]
    fy = fy.split('.')[0]
    fx = np.where(files == fx)[0][0]
    fy = np.where(files == fy)[0][0]
    if harmonic is False or harmonic is None:  # treat both False and None as the non-harmonic case
        dec_x = get_coordinate(fx, 3, decisions)
        dec_y = get_coordinate(fy, 3, decisions)
    else:
        dec_x = get_coordinate(fx, 0, decisions)
        dec_y = get_coordinate(fy, 0, decisions)
    x = self.source_separation(x)
    x = scratch_music(x, dec_x)
    y = scratch_music(y, dec_y)
    x = x[np.nonzero(x)]
    y = y[np.nonzero(y)]
    x, y = same_time(x, y)
    positive_arousal_samples = [i / i.max() for i in (x, y)]
    positive_arousal_x = np.float32(positive_arousal_samples).sum(axis=0)
    positive_arousal_x = 0.5 * positive_arousal_x / positive_arousal_x.max()
    if harmonic is True:
        return librosa.decompose.hpss(librosa.stft(positive_arousal_x), margin=(1.0, 5.0))[0]
    if harmonic is False or harmonic is None:
        interv = RhythmExtractor2013()(positive_arousal_x)[1] * 44100
        steps = overlapped_intervals(interv)
        output = librosa.effects.remix(positive_arousal_x, steps, align_zeros=False)
        output = librosa.effects.pitch_shift(output, sr=44100, n_steps=4)
        remix_filename = 'data/emotions/remixes/happy/' + str(time.strftime("%Y%m%d-%H:%M:%S")) + 'multitag_remix.ogg'
        MonoWriter(filename=remix_filename, format='ogg', sampleRate=44100)(np.float32(output))
        subprocess.call(["ffplay", "-nodisp", "-autoexit", remix_filename])
def not_angry_music_remix(self, neg_arous_dir, files, decisions):
    sounds = []
    for i in range(len(neg_arous_dir)):
        for subdirs, dirs, s in os.walk(neg_arous_dir[i]):
            sounds.append(subdirs + '/' + random.choice(s))
    fx = random.choice(sounds[::-1])
    fy = random.choice(sounds[:])
    x = MonoLoader(filename=fx)()
    y = MonoLoader(filename=fy)()
    fx = fx.split('/')[1].split('.')[0]
    fy = fy.split('/')[1].split('.')[0]
    fx = np.where(files == fx)[0]
    fy = np.where(files == fy)[0]
    dec_x = get_coordinate(fx, choice(range(1, 3)), decisions)
    dec_y = get_coordinate(fy, choice(range(1, 3)), decisions)
    x = self.source_separation(x)
    x = scratch_music(x, dec_x)
    y = scratch_music(y, dec_y)
    x = x[np.nonzero(x)]
    y = y[np.nonzero(y)]
    x, y = same_time(x, y)
    morph = stft.morph(x1=x, x2=y, fs=44100, w1=np.hanning(1025), N1=2048, w2=np.hanning(1025), N2=2048, H1=512, smoothf=0.1, balancef=0.7)
    onsets = hfc_onsets(np.float32(morph))
    interv = seconds_to_indices(onsets)
    steps = overlapped_intervals(interv)
    output = librosa.effects.remix(morph, steps[::-1], align_zeros=False)
    output = librosa.effects.pitch_shift(output, sr=44100, n_steps=4)
    remix_filename = 'data/emotions/remixes/not angry/' + str(time.strftime("%Y%m%d-%H:%M:%S")) + 'multitag_remix.ogg'
    MonoWriter(filename=remix_filename, sampleRate=44100, format='ogg')(np.float32(output))
    subprocess.call(["ffplay", "-nodisp", "-autoexit", remix_filename])
def not_relaxed_music_remix(self, pos_arous_dir, files, decisions):
    sounds = []
    for i in range(len(pos_arous_dir)):
        for subdirs, dirs, s in os.walk(pos_arous_dir[i]):
            sounds.append(subdirs + '/' + random.choice(s))
    fx = random.choice(sounds[::-1])
    fy = random.choice(sounds[:])
    x = MonoLoader(filename=fx)()
    y = MonoLoader(filename=fy)()
    fx = fx.split('/')[1].split('.')[0]
    fy = fy.split('/')[1].split('.')[0]
    fx = np.where(files == fx)[0]
    fy = np.where(files == fy)[0]
    dec_x = get_coordinate(fx, choice([0, 1, 3]), decisions)
    dec_y = get_coordinate(fy, choice([0, 1, 3]), decisions)
    x = self.source_separation(x)
    x = scratch_music(x, dec_x)
    y = scratch_music(y, dec_y)
    x = x[np.nonzero(x)]
    y = y[np.nonzero(y)]
    x, y = same_time(x, y)
    morph = stft.morph(x1=x, x2=y, fs=44100, w1=np.hanning(1025), N1=2048, w2=np.hanning(1025), N2=2048, H1=512, smoothf=0.01, balancef=0.7)
    interv = RhythmExtractor2013()(np.float32(morph))[1] * 44100
    steps = overlapped_intervals(interv)
    output = librosa.effects.remix(morph, steps[::-1], align_zeros=False)
    output = librosa.effects.pitch_shift(output, sr=44100, n_steps=3)
    remix_filename = 'data/emotions/remixes/not relaxed/' + str(time.strftime("%Y%m%d-%H:%M:%S")) + 'multitag_remix.ogg'
    MonoWriter(filename=remix_filename, sampleRate=44100, format='ogg')(np.float32(output))
    subprocess.call(["ffplay", "-nodisp", "-autoexit", remix_filename])
def parse_audio(self, audio_path):
    if self.augment:
        y = load_randomly_augmented_audio(audio_path, self.sample_rate)
    else:
        y = load_audio(audio_path)
    if self.noiseInjector:
        add_noise = np.random.binomial(1, self.noise_prob)
        if add_noise:
            y = self.noiseInjector.inject_noise(y)
    n_fft = int(self.sample_rate * self.window_size)
    win_length = n_fft
    hop_length = int(self.sample_rate * self.window_stride)
    # STFT
    D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length,
                     win_length=win_length, window=self.window)
    spect, phase = librosa.magphase(D)
    # S = log(S + 1)
    spect = np.log1p(spect)
    spect = torch.FloatTensor(spect)
    if self.normalize:
        mean = spect.mean()
        std = spect.std()
        spect.add_(-mean)
        spect.div_(std)
    return spect
def __init__(self, tex_wnd, fft_len=512, sr=22050):
    self.tex_wnd = tex_wnd
    self.an_wnd_len = fft_len
    self.sr = sr
    # magnitude spectrum of the texture window; hop_length == fft_len gives non-overlapping frames
    self.fft_tex_wnds = np.abs(
        librosa.stft(
            y=tex_wnd,
            n_fft=fft_len,
            hop_length=fft_len,
        )
    )