def transform_audio(self, y):
    '''Compute the tempogram

    Parameters
    ----------
    y : np.ndarray
        Audio buffer

    Returns
    -------
    data : dict
        data['tempogram'] : np.ndarray, shape=(n_frames, win_length)
            The tempogram
    '''
    n_frames = self.n_frames(get_duration(y=y, sr=self.sr))

    tgram = tempogram(y=y, sr=self.sr,
                      hop_length=self.hop_length,
                      win_length=self.win_length).astype(np.float32)

    tgram = fix_length(tgram, n_frames)
    return {'tempogram': tgram.T[self.idx]}
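
# A minimal standalone sketch of the same pattern using plain librosa calls
# (sr, hop_length, and win_length below are illustrative values, not the
# extractor's actual defaults; 'audio.wav' is a placeholder path): derive the
# target frame count from get_duration(), then pad/trim the tempogram to it.
import librosa
import numpy as np

y, sr = librosa.load('audio.wav', sr=22050)
hop_length, win_length = 512, 384
n_frames = int(librosa.time_to_frames(librosa.get_duration(y=y, sr=sr),
                                      sr=sr, hop_length=hop_length))
tgram = librosa.feature.tempogram(y=y, sr=sr, hop_length=hop_length,
                                  win_length=win_length).astype(np.float32)
tgram = librosa.util.fix_length(tgram, size=n_frames)  # shape (win_length, n_frames)
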
def _crossFadeRegion(self):  # Compute the cross-fade region for the mixed song
    Na = self.beats['in'].shape[0] - 1
    scores = [self._score(i, Na) for i in range(2, int(Na / 4))]
    noBeats = np.argmax(scores) + 2
    inDuration = librosa.get_duration(y=self.Yin, sr=self.sr)
    fadeInStart = librosa.frames_to_time(self.beats['in'], sr=self.sr)[-int(noBeats / 2)]
    fadeIn = inDuration - fadeInStart
    fadeOut = librosa.frames_to_time(self.beats['out'], sr=self.sr)[int(noBeats / 2)]
    print("Best power correlation score =", np.max(scores))
    print("Number of beats in cross-fade region =", noBeats)
    print("fadeInStart =", fadeInStart)
    print("fadeOutEnd =", fadeOut)
    print("Cross-fade time =", fadeIn + fadeOut)
    self.crossFade = [fadeInStart * 1000, fadeOut * 1000]  # in milliseconds
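
# A hedged sketch of the timing arithmetic above, outside the class: given
# beat frames for the outgoing and incoming tracks, the fade-in starts a few
# beats before the end of the outgoing track and the fade-out ends a few
# beats into the incoming one. File paths and n_beats are placeholders.
import librosa

y_in, sr = librosa.load('outgoing.wav')
_, beats_in = librosa.beat.beat_track(y=y_in, sr=sr)
y_out, _ = librosa.load('incoming.wav', sr=sr)
_, beats_out = librosa.beat.beat_track(y=y_out, sr=sr)

n_beats = 8  # length of the cross-fade region, in beats
in_duration = librosa.get_duration(y=y_in, sr=sr)
fade_in_start = librosa.frames_to_time(beats_in, sr=sr)[-(n_beats // 2)]
fade_out_end = librosa.frames_to_time(beats_out, sr=sr)[n_beats // 2]
cross_fade_time = (in_duration - fade_in_start) + fade_out_end
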
def duration_in_s(self) -> float:
    try:
        return librosa.get_duration(filename=str(self.audio_file))
    except Exception as e:
        log("Failed to get duration of {}: {}".format(self.audio_file, e))
        return 0
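
# Usage sketch of the same defensive pattern with a file on disk
# ('some_clip.wav' is a placeholder). Note that newer librosa releases rename
# the `filename` keyword of get_duration() to `path`.
import librosa

try:
    duration = librosa.get_duration(filename='some_clip.wav')
except Exception as e:
    print('Failed to get duration:', e)
    duration = 0.0
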
def predict(self, filename=None, y=None, sr=None, outputs=None):
    '''Predict annotations

    Parameters
    ----------
    filename : str (optional)
        Path to audio file

    y, sr : (optional)
        Audio buffer and sample rate

    outputs : (optional)
        Pre-computed model outputs as produced by `CremaModel.outputs`.
        If provided, then predictions are derived from these instead of
        `filename` or `(y, sr)`.

    .. note:: At least one of `filename`, `y, sr` must be provided.

    Returns
    -------
    jams.Annotation
        The predicted annotation
    '''
    # Pump the input features
    output_key = self.model.output_names[0]

    if outputs is None:
        outputs = self.outputs(filename=filename, y=y, sr=sr)

    # Invert the prediction. This is always the first output layer.
    ann = self.pump[output_key].inverse(outputs[output_key])

    # Populate the metadata
    ann.annotation_metadata.version = self.version
    ann.annotation_metadata.annotation_tools = 'CREMA {}'.format(version)
    ann.annotation_metadata.data_source = 'program'
    ann.duration = librosa.get_duration(y=y, sr=sr, filename=filename)

    return ann
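
# Hedged usage sketch for the method above, assuming `model` is an
# already-loaded CremaModel subclass instance (e.g. a chord model) and
# 'song.wav' is a placeholder path.
ann = model.predict(filename='song.wav')
print(ann.annotation_metadata.annotation_tools)   # e.g. 'CREMA <version>'
print(ann.to_dataframe().head())                  # time / duration / value / confidence
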
def __init__(self, file: str, *, sample_rate: int = 44100):
    """
    Parameters
    ----------
    file
        Audio file to load
    """
    self.file = file
    self.samples, self.sample_rate = librosa.load(file, sr=sample_rate)
    self.duration = librosa.get_duration(y=self.samples, sr=self.sample_rate)
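
# Minimal usage sketch, assuming the class above is called AudioClip (the
# name is hypothetical) and 'song.wav' exists on disk.
clip = AudioClip('song.wav', sample_rate=22050)
print(clip.duration, 'seconds at', clip.sample_rate, 'Hz')
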
def transform_audio(self, y):
    '''Compute the STFT magnitude and phase.

    Parameters
    ----------
    y : np.ndarray
        The audio buffer

    Returns
    -------
    data : dict
        data['mag'] : np.ndarray, shape=(n_frames, 1 + n_fft//2)
            STFT magnitude
        data['phase'] : np.ndarray, shape=(n_frames, 1 + n_fft//2)
            STFT phase
    '''
    n_frames = self.n_frames(get_duration(y=y, sr=self.sr))

    D = stft(y, hop_length=self.hop_length,
             n_fft=self.n_fft)

    D = fix_length(D, n_frames)

    mag, phase = magphase(D)
    if self.log:
        mag = amplitude_to_db(mag, ref=np.max)

    return {'mag': mag.T[self.idx].astype(np.float32),
            'phase': np.angle(phase.T)[self.idx].astype(np.float32)}
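
# Standalone sketch of the same magnitude/phase split with plain librosa
# (n_fft and hop_length are illustrative; 'audio.wav' is a placeholder):
import librosa
import numpy as np

y, sr = librosa.load('audio.wav', sr=22050)
n_fft, hop_length = 2048, 512
n_frames = int(librosa.time_to_frames(librosa.get_duration(y=y, sr=sr),
                                      sr=sr, hop_length=hop_length))
D = librosa.util.fix_length(librosa.stft(y, n_fft=n_fft, hop_length=hop_length),
                            size=n_frames)
mag, phase = librosa.magphase(D)
log_mag = librosa.amplitude_to_db(mag, ref=np.max)   # optional log scaling
phase_angle = np.angle(phase)                        # radians, same shape as mag
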
def transform_audio(self, y):
    '''Compute the time position encoding

    Parameters
    ----------
    y : np.ndarray
        Audio buffer

    Returns
    -------
    data : dict
        data['relative'] : np.ndarray, shape=(n_frames, 2)
        data['absolute'] : np.ndarray, shape=(n_frames, 2)
            Relative and absolute time positional encodings.
    '''
    duration = get_duration(y=y, sr=self.sr)
    n_frames = self.n_frames(duration)

    relative = np.zeros((n_frames, 2), dtype=np.float32)
    relative[:, 0] = np.cos(np.pi * np.linspace(0, 1, num=n_frames))
    relative[:, 1] = np.sin(np.pi * np.linspace(0, 1, num=n_frames))

    absolute = relative * np.sqrt(duration)

    return {'relative': relative[self.idx],
            'absolute': absolute[self.idx]}
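
# Quick numeric sketch of the encoding above with a hypothetical 10-frame,
# 30-second clip: the "relative" channels trace a half cosine/sine over the
# clip, and the "absolute" channels scale them by sqrt(duration) so longer
# clips get larger magnitudes.
import numpy as np

duration, n_frames = 30.0, 10  # illustrative values
t = np.linspace(0, 1, num=n_frames)
relative = np.stack([np.cos(np.pi * t), np.sin(np.pi * t)], axis=1).astype(np.float32)
absolute = relative * np.sqrt(duration)
print(relative.shape, absolute.shape)  # (10, 2) (10, 2)
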
def transform_audio(self, y):
    '''Compute the Mel spectrogram

    Parameters
    ----------
    y : np.ndarray
        The audio buffer

    Returns
    -------
    data : dict
        data['mag'] : np.ndarray, shape=(n_frames, n_mels)
            The Mel spectrogram
    '''
    n_frames = self.n_frames(get_duration(y=y, sr=self.sr))

    mel = np.sqrt(melspectrogram(y=y, sr=self.sr,
                                 n_fft=self.n_fft,
                                 hop_length=self.hop_length,
                                 n_mels=self.n_mels,
                                 fmax=self.fmax)).astype(np.float32)

    mel = fix_length(mel, n_frames)

    if self.log:
        mel = amplitude_to_db(mel, ref=np.max)

    return {'mag': mel.T[self.idx]}
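
# Sketch of the magnitude scaling used above (parameter values are
# illustrative; 'audio.wav' is a placeholder): take the square root of the
# Mel power spectrogram, then optionally convert to dB relative to the peak.
import librosa
import numpy as np

y, sr = librosa.load('audio.wav', sr=22050)
mel = np.sqrt(librosa.feature.melspectrogram(y=y, sr=sr, n_fft=2048,
                                             hop_length=512, n_mels=128,
                                             fmax=8000))
mel_db = librosa.amplitude_to_db(mel, ref=np.max)  # shape (n_mels, n_frames)
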
def transform_audio(self, y):
    '''Compute the CQT

    Parameters
    ----------
    y : np.ndarray
        The audio buffer

    Returns
    -------
    data : dict
        data['mag'] : np.ndarray, shape = (n_frames, n_bins)
            The CQT magnitude
        data['phase'] : np.ndarray, shape = mag.shape
            The CQT phase
    '''
    n_frames = self.n_frames(get_duration(y=y, sr=self.sr))

    C = cqt(y=y, sr=self.sr, hop_length=self.hop_length,
            fmin=self.fmin,
            n_bins=(self.n_octaves * self.over_sample * 12),
            bins_per_octave=(self.over_sample * 12))

    C = fix_length(C, n_frames)

    cqtm, phase = magphase(C)
    if self.log:
        cqtm = amplitude_to_db(cqtm, ref=np.max)

    return {'mag': cqtm.T.astype(np.float32)[self.idx],
            'phase': np.angle(phase).T.astype(np.float32)[self.idx]}
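
# Sketch of the CQT bin arithmetic above (values are illustrative;
# 'audio.wav' is a placeholder): with 12 bins per octave, over_sample-fold
# oversampling multiplies both n_bins and bins_per_octave, keeping the
# octave span fixed.
import librosa
import numpy as np

y, sr = librosa.load('audio.wav', sr=22050)
n_octaves, over_sample = 6, 3
C = librosa.cqt(y=y, sr=sr, hop_length=512,
                fmin=librosa.note_to_hz('C1'),
                n_bins=n_octaves * over_sample * 12,
                bins_per_octave=over_sample * 12)
cqtm, phase = librosa.magphase(C)
cqtm_db = librosa.amplitude_to_db(cqtm, ref=np.max)  # (n_bins, n_frames)
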
def analyze(filename=None, y=None, sr=None):
    '''Analyze a recording for all tasks.

    Parameters
    ----------
    filename : str, optional
        Path to audio file

    y : np.ndarray, optional
    sr : number > 0, optional
        Audio buffer and sampling rate

    .. note:: At least one of `filename` or `y, sr` must be provided.

    Returns
    -------
    jam : jams.JAMS
        a JAMS object containing all estimated annotations

    Examples
    --------
    >>> from crema.analyze import analyze
    >>> import librosa
    >>> jam = analyze(filename=librosa.util.example_audio_file())
    >>> jam
    <JAMS(file_metadata=<FileMetadata(...)>,
          annotations=[1 annotation],
          sandbox=<Sandbox(...)>)>
    >>> # Get the chord estimates
    >>> chords = jam.annotations['chord', 0]
    >>> chords.to_dataframe().head(5)
           time  duration  value  confidence
    0  0.000000  0.092880  E:maj    0.336977
    1  0.092880  0.464399    E:7    0.324255
    2  0.557279  1.021678  E:min    0.448759
    3  1.578957  2.693515  E:maj    0.501462
    4  4.272472  1.486077  E:min    0.287264
    '''
    _load_models()

    jam = jams.JAMS()

    # populate file metadata
    jam.file_metadata.duration = librosa.get_duration(y=y, sr=sr,
                                                      filename=filename)

    for model in __MODELS__:
        jam.annotations.append(model.predict(filename=filename, y=y, sr=sr))

    return jam
def generate_example(spec_filename, audio_filename):
    with open(spec_filename) as f:
        spec = json.load(f)

    spec_duration = (spec['audio_source']['end_time'] -
                     spec['audio_source']['start_time'])
    sample_duration = librosa.get_duration(filename=audio_filename)
    if not math.isclose(spec_duration, sample_duration):
        print("Warning: sample duration is {} but spec says {}".format(
            sample_duration, spec_duration))

    sample, sampling_rate = librosa.load(audio_filename, sr=44100)
    if sampling_rate != 44100:
        print("Warning: sampling rate is {}".format(sampling_rate))

    return tf.train.SequenceExample(
        context=tf.train.Features(feature={
            'data_source': tf.train.Feature(
                bytes_list=tf.train.BytesList(
                    value=[bytes(spec['data_source'], 'utf-8')])),
            'tonic': tf.train.Feature(
                int64_list=tf.train.Int64List(
                    value=[spec['key']['tonic']])),
            'mode': tf.train.Feature(
                int64_list=tf.train.Int64List(
                    value=[spec['key']['mode']])),
            'beats': tf.train.Feature(
                int64_list=tf.train.Int64List(
                    value=[spec['meter']['beats']])),
            'beats_per_measure': tf.train.Feature(
                int64_list=tf.train.Int64List(
                    value=[spec['meter']['beats_per_measure']])),
        }),
        feature_lists=tf.train.FeatureLists(feature_list={
            'audio': tf.train.FeatureList(
                feature=[tf.train.Feature(
                    float_list=tf.train.FloatList(value=sample.tolist()))]),
            'melody': tf.train.FeatureList(
                feature=[tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[]))]),
            'harmony': tf.train.FeatureList(
                feature=[tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[]))]),
        }))
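
# Hedged usage sketch: serialize the SequenceExample returned above to a
# TFRecord file. 'spec.json', 'sample.wav', and 'examples.tfrecord' are
# placeholder paths.
import tensorflow as tf

example = generate_example('spec.json', 'sample.wav')
with tf.io.TFRecordWriter('examples.tfrecord') as writer:
    writer.write(example.SerializeToString())
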