import tensorflow as tf


def average_gradients(tower_grads):
"""Calculate the mean gradient for each shared variable across all towers.
Note
----
This function provides a synchronization point across all towers.
Parameters
----------
tower_grads: List of lists of (gradient, variable) tuples.
The outer list is over individual gradients. The inner list is
over the gradient calculation for each tower.
Return
------
List of pairs of (gradient, variable) where the gradient has been
averaged across all towers.
"""
    average_grads = []
    for grads_and_vars in zip(*tower_grads):
        # Note that each grads_and_vars looks like the following:
        #   ((grad0_gpu0, var0_gpu0), ..., (grad0_gpuN, var0_gpuN))
        # TODO no need for the loop here
        #   grad.append(mean(grad_gpu[0..N]), var_gpu0)
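        # (A sketch of that TODO, not part of the original code: the inner
        # loop below could be replaced with
        #     grad = tf.reduce_mean(tf.stack([g for g, _ in grads_and_vars]), 0)
        # assuming every gradient is a dense tensor of the same shape.)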
        grads = []
        for g, _ in grads_and_vars:
            # Add a 0th dimension to the gradient to represent the tower.
            expanded_g = tf.expand_dims(g, 0)
            # Append on a 'tower' dimension which we will average over below.
            grads.append(expanded_g)

        # Average over the 'tower' dimension.
        grad = tf.concat(axis=0, values=grads)
        grad = tf.reduce_mean(grad, 0)
        # Keep in mind that the Variables are redundant because they are
        # shared across towers, so we just return the first tower's pointer
        # to the Variable.
        v = grads_and_vars[0][1]
        grad_and_var = (grad, v)
        average_grads.append(grad_and_var)
    return average_grads
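
For context, here is a minimal sketch of how `average_gradients` is typically used in a multi-GPU tower setup. It assumes TensorFlow 1.x graph mode; `NUM_GPUS`, the toy per-tower loss, and the optimizer choice are illustrative placeholders, not part of the original code.

NUM_GPUS = 2  # assumed tower count for this sketch
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1)

tower_grads = []
with tf.variable_scope('model', reuse=tf.AUTO_REUSE):
    for i in range(NUM_GPUS):
        with tf.device('/gpu:%d' % i):
            # Toy per-tower loss over a shared variable; stands in for a real model.
            w = tf.get_variable('w', shape=[3], initializer=tf.zeros_initializer())
            x = tf.random_normal([3])
            loss = tf.reduce_sum(tf.square(w - x))
            tower_grads.append(optimizer.compute_gradients(loss, var_list=[w]))

# Synchronization point: average the per-tower gradients, then apply them once
# to the shared variables.
grads = average_gradients(tower_grads)
train_op = optimizer.apply_gradients(grads)

With soft placement enabled in the session config (tf.ConfigProto(allow_soft_placement=True)), the same sketch also runs on a machine without GPUs.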