def vad_energy(log_energy, distrib_nb=3, nb_train_it=25):
    """Energy-based voice activity detection using a Gaussian mixture.

    The log-energy is centred and scaled to unit variance, a diagonal
    Gaussian mixture is fitted on it, and the component with the highest
    mean is taken as the speech component. A frame is marked active when
    its normalised energy exceeds ``mean - __current_vad_mode * std`` of
    that component.

    Parameters
    ----------
    log_energy: array-like
        Per-frame log-energy, either 1-D or a 2-D column.
    distrib_nb: int
        Number of mixture components to try first. When the fit fails,
        the number is decreased one by one, down to 2 components.
    nb_train_it: int
        Maximum number of EM iterations for each fit.

    Return
    ------
    vad: boolean array with one entry per frame (True = active)
    threshold: scalar, expressed in the *normalised* energy domain

    When the input is empty, constant or contains non-finite values, or
    when no mixture could be fitted, every frame is marked inactive and
    the threshold is 0.0.
    """
    log_energy = np.asarray(log_energy)
    nb_frames = log_energy.shape[0]

    # Degenerate input cannot be normalised: dividing by a zero or NaN
    # standard deviation would only yield NaNs that make every fit fail.
    # Return the "nothing detected" answer directly instead.
    spread = np.std(log_energy) if log_energy.size else 0.0
    if not np.isfinite(spread) or spread == 0:
        return np.zeros(nb_frames, dtype=bool), 0.0

    # Imported only once a fit is really needed.
    from sklearn.exceptions import ConvergenceWarning
    from sklearn.mixture import GaussianMixture

    # center and normalize the energy
    log_energy = (log_energy - np.mean(log_energy)) / spread
    if log_energy.ndim == 1:
        log_energy = log_energy[:, np.newaxis]

    nb_components = distrib_nb
    while True:
        # Diagonal mixture with uniform weights, unit precisions and
        # means spread evenly over [-2, 2] in the normalised domain.
        world = GaussianMixture(
            n_components=nb_components, covariance_type='diag',
            init_params='kmeans', max_iter=nb_train_it,
            weights_init=np.ones(nb_components) / nb_components,
            means_init=(-2 + 4.0 * np.arange(nb_components)
                        / (nb_components - 1))[:, np.newaxis],
            precisions_init=np.ones((nb_components, 1)),
        )
        try:
            with warnings.catch_warnings():
                # Hitting max_iter without convergence is acceptable here.
                warnings.filterwarnings("ignore", category=ConvergenceWarning)
                world.fit(log_energy)
        except (ValueError, IndexError):  # index error because of float32 cumsum
            # Retry with one component less; give up below 2 components.
            nb_components -= 1
            if nb_components < 2:
                return np.zeros(nb_frames, dtype=bool), 0.0
        else:
            break

    # Compute threshold: highest component mean minus a multiple of that
    # component's standard deviation (std = sqrt(1 / precision)).
    speech = world.means_.argmax()
    threshold = world.means_.max() - \
        __current_vad_mode * np.sqrt(1.0 / world.precisions_[speech, 0])

    # Apply frame selection with the current threshold
    label = log_energy.ravel() > threshold
    return label, threshold
# Comment list
# Table of contents