import tensorflow as tf


def average_gradients(tower_grads):
"""Calculate the mean gradient for each shared variable across all towers.
Note
----
This function provides a synchronization point across all towers.
Parameters
----------
tower_grads: List of lists of (gradient, variable) tuples.
The outer list is over individual gradients. The inner list is
over the gradient calculation for each tower.
Return
------
List of pairs of (gradient, variable) where the gradient has been
averaged across all towers.
"""
    average_grads = []
    for grads_and_vars in zip(*tower_grads):
        # Note that each grads_and_vars looks like the following:
        #   ((grad0_gpu0, var0_gpu0), ..., (grad0_gpuN, var0_gpuN))
        # TODO no need for the loop here
        #   grad.append(mean(grad_gpu[0..N]), var_gpu0)
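        # (A sketch of that TODO, not part of the original code: the inner
        # loop below could be replaced with
        #     grad = tf.reduce_mean(tf.stack([g for g, _ in grads_and_vars]), 0)
        # assuming every gradient is a dense tensor of the same shape.)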
        grads = []
        for g, _ in grads_and_vars:
            # Add a 0th dimension to the gradient to represent the tower.
            expanded_g = tf.expand_dims(g, 0)
            # Append on a 'tower' dimension which we will average over below.
            grads.append(expanded_g)

        # Average over the 'tower' dimension.
        grad = tf.concat(axis=0, values=grads)
        grad = tf.reduce_mean(grad, 0)
        # Keep in mind that the Variables are redundant because they are
        # shared across towers, so we just return the first tower's pointer
        # to the Variable.
        v = grads_and_vars[0][1]
        grad_and_var = (grad, v)
        average_grads.append(grad_and_var)
    return average_grads
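
For context, here is a minimal sketch of how `average_gradients` is typically used in a multi-GPU tower setup. It assumes TensorFlow 1.x graph mode; `NUM_GPUS`, the toy per-tower loss, and the optimizer choice are illustrative placeholders, not part of the original code.

NUM_GPUS = 2  # assumed tower count for this sketch
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1)

tower_grads = []
with tf.variable_scope('model', reuse=tf.AUTO_REUSE):
    for i in range(NUM_GPUS):
        with tf.device('/gpu:%d' % i):
            # Toy per-tower loss over a shared variable; stands in for a real model.
            w = tf.get_variable('w', shape=[3], initializer=tf.zeros_initializer())
            x = tf.random_normal([3])
            loss = tf.reduce_sum(tf.square(w - x))
            tower_grads.append(optimizer.compute_gradients(loss, var_list=[w]))

# Synchronization point: average the per-tower gradients, then apply them once
# to the shared variables.
grads = average_gradients(tower_grads)
train_op = optimizer.apply_gradients(grads)

With soft placement enabled in the session config (tf.ConfigProto(allow_soft_placement=True)), the same sketch also runs on a machine without GPUs.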