def combine(self, expert_out, multiply_by_gates=True):
"""Sum together the expert output, weighted by the gates.
The slice corresponding to a particular batch element `b` is computed
as the sum over all experts `i` of the expert output, weighted by the
corresponding gate values. If `multiply_by_gates` is set to False, the
gate values are ignored.
Args:
expert_out: a list of `num_experts` `Tensor`s, each with shape
`[expert_batch_size_i, <extra_output_dims>]`.
multiply_by_gates: a boolean
Returns:
a `Tensor` with shape `[batch_size, <extra_output_dims>]`.
"""
# see comments on convert_gradient_to_tensor
stitched = convert_gradient_to_tensor(tf.concat(expert_out, 0))
if multiply_by_gates:
stitched *= tf.expand_dims(self._nonzero_gates, 1)
combined = tf.unsorted_segment_sum(stitched, self._batch_index,
tf.shape(self._gates)[0])
return combined
评论列表
文章目录