signal.py 文件源码-python代码片段

def vad_energy(log_energy, distrib_nb=3, nb_train_it=25):
  """ Energy-based voice activity detection.

  The log-energy is standardised (zero mean, unit variance) and a diagonal
  Gaussian mixture model is fitted on it. The component with the highest
  mean is taken as the voice component, and every frame whose standardised
  energy lies above
  ``highest_mean - __current_vad_mode * std_of_that_component``
  is labelled as voice. ``__current_vad_mode`` is a module-level setting
  defined outside this function.

  If the mixture cannot be fitted, the fit is retried with one component
  less, down to 2 components. If that also fails, or if the input is empty,
  constant or contains non-finite values (so it cannot be standardised),
  no frame is labelled as voice and the threshold is 0.

  Parameters
  ----------
  log_energy: array-like
    per-frame log-energy, 1-D (a single-column 2-D array is used as is)
  distrib_nb: int
    number of mixture components to start with; the mean initialisation
    divides by ``distrib_nb - 1``, so use at least 2
  nb_train_it: int
    maximum number of EM iterations for each fit

  Return
  ------
  vad: boolean array with one entry per frame (True = voice)
  threshold: scalar, expressed on the standardised log-energy scale
  """
  log_energy = np.asarray(log_energy)
  nb_frames = log_energy.shape[0]
  # Degenerate input cannot be standardised (division by a zero or NaN
  # standard deviation), so report "no voice" right away instead of feeding
  # NaNs to the mixture model and waiting for it to reject them.
  if log_energy.size == 0 or not np.all(np.isfinite(log_energy)):
    return np.zeros(shape=(nb_frames,), dtype=bool), 0
  energy_std = np.std(log_energy)
  if not np.isfinite(energy_std) or energy_std == 0:
    return np.zeros(shape=(nb_frames,), dtype=bool), 0
  # center and normalize the energy
  log_energy = (log_energy - np.mean(log_energy)) / energy_std
  if log_energy.ndim == 1:
    log_energy = log_energy[:, np.newaxis]
  # imported lazily: the early exits above do not need scikit-learn
  from sklearn.exceptions import ConvergenceWarning
  from sklearn.mixture import GaussianMixture
  nb_components = distrib_nb
  while True:
    # create mixture model: diag, spherical
    # the initial means are spread evenly over [-2, 2]
    world = GaussianMixture(
        n_components=nb_components, covariance_type='diag',
        init_params='kmeans', max_iter=nb_train_it,
        weights_init=np.ones(nb_components) / nb_components,
        means_init=(-2 + 4.0 * np.arange(nb_components) /
                    (nb_components - 1))[:, np.newaxis],
        precisions_init=np.ones((nb_components, 1)),
    )
    try:
      with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=ConvergenceWarning)
        world.fit(log_energy)
    except (ValueError, IndexError): # index error because of float32 cumsum
      # retry with a simpler model; give up below 2 components
      nb_components -= 1
      if nb_components < 2:
        return np.zeros(shape=(nb_frames,), dtype=bool), 0
    else:
      break
  # Compute threshold: a number of standard deviations below the mean of the
  # highest-energy component (for 'diag' covariance, precision = 1 / variance)
  voice_component = world.means_.argmax()
  threshold = world.means_.max() - \
      __current_vad_mode * np.sqrt(1.0 / world.precisions_[voice_component, 0])
  # Apply frame selection with the current threshold
  label = log_energy.ravel() > threshold
  return label, threshold