def aggregate_gradients_using_copy_with_variable_colocation(
tower_grads, use_mean, check_inf_nan):
"""Aggregate gradients, colocating computation with the gradient's variable.
Args:
tower_grads: List of lists of (gradient, variable) tuples. The outer list
is over towers. The inner list is over individual gradients. All variables
of the same gradient across towers must be the same (that is,
tower_grads[x][a][1] == tower_grads[y][a][1] for all indices x, y, and a)
use_mean: if True, mean is taken, else sum of gradients is taken.
check_inf_nan: If true, check grads for nans and infs.
Returns:
The tuple ([(average_gradient, variable),], has_nan_or_inf) where the
gradient has been averaged across all towers. The variable is chosen from
the first tower. The has_nan_or_inf indicates the grads has nan or inf.
"""
agg_grads = []
has_nan_or_inf_list = []
for single_grads in zip(*tower_grads):
# Note that each single_grads looks like the following:
# ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
var = single_grads[0][1]
for _, v in single_grads:
assert v == var
with tf.device(var.device):
grad_and_var, has_nan_or_inf = aggregate_single_gradient_using_copy(
single_grads, use_mean, check_inf_nan)
agg_grads.append(grad_and_var)
has_nan_or_inf_list.append(has_nan_or_inf)
if check_inf_nan:
return agg_grads, tf.reduce_any(has_nan_or_inf_list)
else:
return agg_grads, None
评论列表
文章目录