def get_classification_loss(logits, targets, softmax_loss_function=None):
  bucket_outputs = logits
  if softmax_loss_function is None:
    assert len(bucket_outputs) == len(targets) == 1
    # We need to make target an int64-tensor and set its shape.
    bucket_target = array_ops.reshape(math_ops.to_int64(targets[0]), [-1])
    crossent = nn_ops.sparse_softmax_cross_entropy_with_logits(
        logits=bucket_outputs[0], labels=bucket_target)
  else:
    assert len(bucket_outputs) == len(targets) == 1
    crossent = softmax_loss_function(bucket_outputs[0], targets[0])
  batch_size = array_ops.shape(targets[0])[0]
  loss = tf.reduce_sum(crossent) / math_ops.cast(batch_size, dtypes.float32)
  return loss
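
# A small self-contained NumPy sketch (not part of the original snippet) of what
# get_classification_loss computes when softmax_loss_function is None: sparse
# softmax cross-entropy, summed over the batch and divided by the batch size.
import numpy as np

def sparse_softmax_xent(logits, labels):
    # logits: [batch, num_classes], labels: [batch] of int class ids
    shifted = logits - logits.max(axis=1, keepdims=True)            # for stability
    log_probs = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
    return -log_probs[np.arange(len(labels)), labels]               # [batch]

logits = np.array([[2.0, 0.5, -1.0], [0.1, 1.2, 0.3]])
labels = np.array([0, 2])
crossent = sparse_softmax_xent(logits, labels)
loss = crossent.sum() / len(labels)                                 # mean loss over batch
print(loss)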
Python reduce_sum() usage examples (source code)
def inference_graph(self, input_data, data_spec=None):
  """Constructs a TF graph for evaluating a random forest.

  Args:
    input_data: A tensor or SparseTensor or placeholder for input data.
    data_spec: A list of tf.dtype values specifying the original types of
      each column.

  Returns:
    The last op in the random forest inference graph.
  """
  data_spec = [constants.DATA_FLOAT] if data_spec is None else data_spec
  probabilities = []
  for i in range(self.params.num_trees):
    with ops.device(self.device_assigner.get_device(i)):
      tree_data = input_data
      if self.params.bagged_features:
        tree_data = self._bag_features(i, input_data)
      probabilities.append(self.trees[i].inference_graph(tree_data, data_spec))
  with ops.device(self.device_assigner.get_device(0)):
    all_predict = array_ops.pack(probabilities)
    return math_ops.div(
        math_ops.reduce_sum(all_predict, 0), self.params.num_trees,
        name='probabilities')
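
# Illustration (assumed, not from the original code): the forest prediction above
# is just the mean of the per-tree class probabilities, i.e. stack the trees and
# divide the reduce_sum over the tree axis by num_trees. NumPy equivalent:
import numpy as np

tree_probs = np.array([[[0.7, 0.3], [0.2, 0.8]],   # tree 0: [batch=2, classes=2]
                       [[0.6, 0.4], [0.4, 0.6]]])  # tree 1
all_predict = tree_probs                            # like array_ops.pack(probabilities)
forest_probs = all_predict.sum(axis=0) / all_predict.shape[0]
print(forest_probs)                                 # [[0.65 0.35], [0.3 0.7]]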
def _gini(self, class_counts):
  """Calculate the Gini impurity.

  If c(i) denotes the i-th class count and c = sum_i c(i) then
    score = 1 - sum_i ( c(i) / c )^2

  Args:
    class_counts: A 2-D tensor of per-class counts, usually a slice or
      gather from variables.node_sums.

  Returns:
    A 1-D tensor of the Gini impurities for each row in the input.
  """
  smoothed = 1.0 + array_ops.slice(class_counts, [0, 1], [-1, -1])
  sums = math_ops.reduce_sum(smoothed, 1)
  sum_squares = math_ops.reduce_sum(math_ops.square(smoothed), 1)
  return 1.0 - sum_squares / (sums * sums)
def _weighted_gini(self, class_counts):
  """Our split score is the Gini impurity times the number of examples.

  If c(i) denotes the i-th class count and c = sum_i c(i) then
    score = c * (1 - sum_i ( c(i) / c )^2 )
          = c - sum_i c(i)^2 / c

  Args:
    class_counts: A 2-D tensor of per-class counts, usually a slice or
      gather from variables.node_sums.

  Returns:
    A 1-D tensor of the weighted Gini impurities for each row in the input.
  """
  smoothed = 1.0 + array_ops.slice(class_counts, [0, 1], [-1, -1])
  sums = math_ops.reduce_sum(smoothed, 1)
  sum_squares = math_ops.reduce_sum(math_ops.square(smoothed), 1)
  return sums - sum_squares / sums
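
# Worked example (assumed, not from the original code) of the two split scores
# above. Column 0 of node_sums holds the total weight, so both functions drop it
# with slice(..., [0, 1], ...) and add 1.0 of Laplace-style smoothing per class.
import numpy as np

class_counts = np.array([[10.0, 6.0, 4.0],    # row: [total, count_class1, count_class2]
                         [10.0, 9.0, 1.0]])
smoothed = 1.0 + class_counts[:, 1:]          # drop column 0, smooth the counts
sums = smoothed.sum(axis=1)                   # c = sum_i c(i)
sum_squares = (smoothed ** 2).sum(axis=1)     # sum_i c(i)^2

gini = 1.0 - sum_squares / (sums * sums)      # 1 - sum_i (c(i)/c)^2
weighted_gini = sums - sum_squares / sums     # c * gini
print(gini)                                   # row 0 is more impure than row 1
print(weighted_gini)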
def _variance(self, sums, squares):
  """Calculate the variance for each row of the input tensors.

  Variance is V = E[x^2] - (E[x])^2.

  Args:
    sums: A tensor containing output sums, usually a slice from
      variables.node_sums.  Should contain the number of examples seen
      in index 0 so we can calculate expected value.
    squares: Same as sums, but sums of squares.

  Returns:
    A 1-D tensor of the variances for each row in the input.
  """
  total_count = array_ops.slice(sums, [0, 0], [-1, 1])
  e_x = sums / total_count
  e_x2 = squares / total_count
  return math_ops.reduce_sum(e_x2 - math_ops.square(e_x), 1)
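
# Worked example (assumed) of the variance computation above: index 0 of each
# row holds the example count, the remaining columns hold per-output sums and
# sums of squares, and V = E[x^2] - (E[x])^2 is summed over the outputs. The
# count column contributes exactly zero, so it never needs to be sliced off.
import numpy as np

#                   [count, sum_y1, sum_y2]
sums    = np.array([[4.0,   8.0,    4.0]])
squares = np.array([[4.0,  20.0,    6.0]])   # [count, sum_y1^2, sum_y2^2]

total_count = sums[:, 0:1]
e_x  = sums / total_count                    # E[x]  (first column becomes 1)
e_x2 = squares / total_count                 # E[x^2]
variance = (e_x2 - e_x ** 2).sum(axis=1)     # 1.0 + 0.5 = 1.5
print(variance)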
def sum(x, axis=None, keepdims=False):
  """Sum of the values in a tensor, alongside the specified axis.

  Arguments:
    x: A tensor or variable.
    axis: An integer, the axis to sum over.
    keepdims: A boolean, whether to keep the dimensions or not.
      If `keepdims` is `False`, the rank of the tensor is reduced by 1.
      If `keepdims` is `True`, the reduced dimension is retained with
      length 1.

  Returns:
    A tensor with the sum of `x`.
  """
  axis = _normalize_axis(axis, ndim(x))
  return math_ops.reduce_sum(x, reduction_indices=axis, keep_dims=keepdims)
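
# Usage sketch (assumed) of the keepdims semantics described above, written
# with NumPy so it runs without a TF session:
import numpy as np

x = np.arange(6.0).reshape(2, 3)
print(np.sum(x, axis=1))                 # shape (2,)   -- rank reduced by 1
print(np.sum(x, axis=1, keepdims=True))  # shape (2, 1) -- reduced dim kept as length 1
print(np.sum(x))                         # axis=None sums everything to a scalar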
def reduce_sum_n(tensors, name=None):
  """Reduce tensors to a scalar sum.

  This reduces each tensor in `tensors` to a scalar via `tf.reduce_sum`, then
  adds them via `tf.add_n`.

  Args:
    tensors: List of tensors, all of the same numeric type.
    name: Tensor name, and scope for all other ops.

  Returns:
    Total sum as a scalar `Tensor`.

  Raises:
    ValueError: if `tensors` is missing or empty.
  """
  if not tensors:
    raise ValueError('No tensors provided.')
  tensors = [math_ops.reduce_sum(t, name='%s/sum' % t.op.name) for t in tensors]
  if len(tensors) == 1:
    return tensors[0]
  with ops.name_scope(name, 'reduce_sum_n', tensors) as scope:
    return math_ops.add_n(tensors, name=scope)
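
# Equivalent computation (assumed) of reduce_sum_n in plain NumPy: reduce each
# tensor to a scalar first, then add the scalars.
import numpy as np

tensors = [np.ones((2, 3)), np.arange(4.0)]
partials = [t.sum() for t in tensors]    # reduce each tensor to a scalar
total = np.add.reduce(partials)          # add them, like tf.add_n
print(total)                             # 6.0 + 6.0 = 12.0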
def _scale_losses(losses, weight):
  """Computes the scaled loss.

  Args:
    losses: A `Tensor` of size [batch_size, d1, ... dN].
    weight: A `Tensor` of size [1], [batch_size] or [batch_size, d1, ... dN].
      The `losses` are reduced (tf.reduce_sum) until its dimension matches
      that of `weight`, at which point the reduced `losses` are element-wise
      multiplied by `weight` and a final reduce_sum is computed on the result.
      Conceptually, this operation is equivalent to broadcasting (tiling)
      `weight` to be the same size as `losses`, performing an element-wise
      multiplication, and summing the result.

  Returns:
    A scalar tf.float32 `Tensor` whose value represents the sum of the scaled
    `losses`.
  """
  # First, compute the sum of the losses over all elements:
  start_index = max(0, weight.get_shape().ndims)
  reduction_indices = list(range(start_index, losses.get_shape().ndims))
  reduced_losses = math_ops.reduce_sum(losses,
                                       reduction_indices=reduction_indices)
  reduced_losses = math_ops.mul(reduced_losses, weight)
  return math_ops.reduce_sum(reduced_losses)
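
# Worked example (assumed) of the weighting scheme above: with per-example
# weights of shape [batch_size], the trailing dimensions of `losses` are summed
# first, then each per-example sum is scaled and the result summed again.
import numpy as np

losses = np.array([[1.0, 2.0],     # shape [batch_size=2, d1=2]
                   [3.0, 4.0]])
weight = np.array([1.0, 0.5])      # shape [batch_size]

start_index = weight.ndim                                 # 1
reduction_axes = tuple(range(start_index, losses.ndim))   # (1,)
reduced = losses.sum(axis=reduction_axes)                 # [3., 7.]
scaled_sum = (reduced * weight).sum()                     # 3*1 + 7*0.5 = 6.5
print(scaled_sum)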
def approximate_duality_gap(self):
  """Add operations to compute the approximate duality gap.

  Returns:
    An Operation that computes the approximate duality gap over all
    examples.
  """
  with name_scope('sdca/approximate_duality_gap'):
    _, values_list = self._hashtable.export_sharded()
    shard_sums = []
    for values in values_list:
      with ops.device(values.device):
        shard_sums.append(
            math_ops.reduce_sum(math_ops.cast(values, dtypes.float64), 0))
    summed_values = math_ops.add_n(shard_sums)
    primal_loss = summed_values[1]
    dual_loss = summed_values[2]
    example_weights = summed_values[3]
    # Note: we return NaN if there are no weights or all weights are 0, e.g.
    # if no examples have been processed.
    return (primal_loss + dual_loss + self._l1_loss() +
            (2.0 * self._l2_loss(self._symmetric_l2_regularization()))
           ) / example_weights
def _broadcast_weights(weights, values):
  """Broadcast `weights` to the same shape as `values`.

  This returns a version of `weights` following the same broadcast rules as
  `mul(weights, values)`. When computing a weighted average, use this function
  to broadcast `weights` before summing them; e.g.,
  `reduce_sum(w * v) / reduce_sum(_broadcast_weights(w, v))`.

  Args:
    weights: `Tensor` whose shape is broadcastable to `values`.
    values: `Tensor` of any shape.

  Returns:
    `weights` broadcast to `values` shape.
  """
  weights_shape = weights.get_shape()
  values_shape = values.get_shape()
  if (weights_shape.is_fully_defined() and
      values_shape.is_fully_defined() and
      weights_shape.is_compatible_with(values_shape)):
    return weights
  return math_ops.mul(
      weights, array_ops.ones_like(values), name='broadcast_weights')
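
# Sketch (assumed) of the weighted-average pattern the docstring above
# describes: broadcast the weights against the values before summing the
# denominator, so the average is taken over the broadcast weight mass.
import numpy as np

values  = np.array([[1.0, 3.0], [5.0, 7.0]])
weights = np.array([[2.0], [1.0]])            # broadcastable to values' shape

broadcast_w = weights * np.ones_like(values)  # same as mul(weights, ones_like(values))
weighted_avg = (weights * values).sum() / broadcast_w.sum()
print(weighted_avg)                           # (2+6+5+7) / (2+2+1+1) = 20/6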
def _iqfov_via_sqrt_solve(self, x):
  """Get the inverse quadratic form on vectors via a sqrt_solve."""
  # x^T A^{-1} x = || S^{-1} x ||^2,
  # where S is a square root of A (A = S S^T).
  # Steps:
  # 1. Convert x to a matrix, flipping all extra dimensions in `x` to the
  #    final dimension of x_matrix.
  x_matrix = flip_vector_to_matrix(
      x, self.batch_shape(), self.get_batch_shape())
  # 2. Get soln_matrix = S^{-1} x_matrix.
  soln_matrix = self.sqrt_solve(x_matrix)
  # 3. Reshape back to a vector.
  soln = flip_matrix_to_vector(
      soln_matrix, extract_batch_shape(x, 1), x.get_shape()[:-1])
  # 4. L2 (batch) vector norm squared.
  result = math_ops.reduce_sum(
      math_ops.square(soln), reduction_indices=[-1])
  result.set_shape(x.get_shape()[:-1])
  return result
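
# Numeric check (assumed, NumPy) of the identity used above:
# x^T A^{-1} x == || S^{-1} x ||^2 when A = S S^T and S is its Cholesky factor.
import numpy as np

rng = np.random.default_rng(0)
B = rng.normal(size=(4, 4))
A = B @ B.T + 4.0 * np.eye(4)              # symmetric positive definite
x = rng.normal(size=4)

S = np.linalg.cholesky(A)                  # A = S S^T
lhs = x @ np.linalg.solve(A, x)            # x^T A^{-1} x
rhs = np.sum(np.linalg.solve(S, x) ** 2)   # || S^{-1} x ||^2
print(np.allclose(lhs, rhs))               # True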
def _sqrt_log_det_core(self, diag_chol_c):
  """Finish computation of Sqrt[Log[Det]]."""
  # Complete computation of ._log_det and ._batch_log_det, after the initial
  # Cholesky factor has been taken with the appropriate batch/non-batch method.
  #
  #   det(M + VDV^T) = det(D^{-1} + V^T M^{-1} V) * det(D) * det(M)
  #                  = det(C) * det(D) * det(M)
  #
  # Multiply by 2 here because this is the log-det of the Cholesky factor of C.
  log_det_c = 2 * math_ops.reduce_sum(
      math_ops.log(diag_chol_c),
      reduction_indices=[-1])
  # Add together to get Log[det(M + VDV^T)], the log-det of the updated square
  # root.
  log_det_updated_sqrt = (
      log_det_c + self._diag_operator.log_det() + self._operator.log_det())
  return log_det_updated_sqrt
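
# Numeric check (assumed, NumPy) of the determinant identity in the comment:
# det(M + V D V^T) = det(D^{-1} + V^T M^{-1} V) * det(D) * det(M).
import numpy as np

rng = np.random.default_rng(1)
M = np.diag(rng.uniform(1.0, 2.0, size=5))          # positive definite M
D = np.diag(rng.uniform(0.5, 1.5, size=2))          # positive definite D
V = rng.normal(size=(5, 2))

lhs = np.linalg.det(M + V @ D @ V.T)
C = np.linalg.inv(D) + V.T @ np.linalg.solve(M, V)  # "capacitance" matrix
rhs = np.linalg.det(C) * np.linalg.det(D) * np.linalg.det(M)
print(np.allclose(lhs, rhs))                        # True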
def _attention(self, query, attn_states):
  conv2d = nn_ops.conv2d
  reduce_sum = math_ops.reduce_sum
  softmax = nn_ops.softmax
  tanh = math_ops.tanh

  with vs.variable_scope("Attention"):
    k = vs.get_variable("AttnW", [1, 1, self._attn_size, self._attn_vec_size])
    v = vs.get_variable("AttnV", [self._attn_vec_size])
    hidden = array_ops.reshape(attn_states,
                               [-1, self._attn_length, 1, self._attn_size])
    hidden_features = conv2d(hidden, k, [1, 1, 1, 1], "SAME")
    y = _linear(query, self._attn_vec_size, True)
    y = array_ops.reshape(y, [-1, 1, 1, self._attn_vec_size])
    s = reduce_sum(v * tanh(hidden_features + y), [2, 3])
    a = softmax(s)
    d = reduce_sum(
        array_ops.reshape(a, [-1, self._attn_length, 1, 1]) * hidden, [1, 2])
    new_attns = array_ops.reshape(d, [-1, self._attn_size])
    new_attn_states = array_ops.slice(attn_states, [0, 1, 0], [-1, -1, -1])
    return new_attns, new_attn_states
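
# Shape-level sketch (assumed, NumPy) of the Bahdanau-style scoring used above:
# s = reduce_sum(v * tanh(W*h + U*q)) over the feature axes, then a softmax over
# time and a weighted sum of the attention states.
import numpy as np

attn_length, attn_size, attn_vec_size, batch = 5, 8, 6, 2
rng = np.random.default_rng(2)
hidden = rng.normal(size=(batch, attn_length, attn_size))   # attention states
Wh = rng.normal(size=(attn_size, attn_vec_size))            # like the 1x1 conv kernel
q  = rng.normal(size=(batch, attn_vec_size))                # projected query (y)
v  = rng.normal(size=(attn_vec_size,))

features = hidden @ Wh                                      # [batch, length, vec]
s = np.sum(v * np.tanh(features + q[:, None, :]), axis=-1)  # [batch, length]
a = np.exp(s - s.max(axis=1, keepdims=True))
a /= a.sum(axis=1, keepdims=True)                           # softmax over time
d = np.sum(a[:, :, None] * hidden, axis=1)                  # [batch, attn_size]
print(d.shape)                                              # (2, 8)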
def _scale_losses(losses, weights):
  """Computes the scaled loss.

  Args:
    losses: A `Tensor` of size [batch_size, d1, ... dN].
    weights: A `Tensor` of size [1], [batch_size] or [batch_size, d1, ... dN].
      The `losses` are reduced (tf.reduce_sum) until its dimension matches
      that of `weights`, at which point the reduced `losses` are element-wise
      multiplied by `weights` and a final reduce_sum is computed on the result.
      Conceptually, this operation is equivalent to broadcasting (tiling)
      `weights` to be the same size as `losses`, performing an element-wise
      multiplication, and summing the result.

  Returns:
    A scalar tf.float32 `Tensor` whose value represents the sum of the scaled
    `losses`.
  """
  # First, compute the sum of the losses over all elements:
  start_index = max(0, weights.get_shape().ndims)
  reduction_indices = list(range(start_index, losses.get_shape().ndims))
  reduced_losses = math_ops.reduce_sum(losses,
                                       reduction_indices=reduction_indices)
  reduced_losses = math_ops.mul(reduced_losses, weights)
  return math_ops.reduce_sum(reduced_losses)
def regularized_loss(self, examples):
  """Add operations to compute the loss with regularization loss included.

  Args:
    examples: Examples to compute loss on.

  Returns:
    An Operation that computes mean (regularized) loss for given set of
    examples.

  Raises:
    ValueError: if examples are not well defined.
  """
  self._assertSpecified(['example_labels', 'example_weights',
                         'sparse_features', 'dense_features'], examples)
  self._assertList(['sparse_features', 'dense_features'], examples)
  with name_scope('sdca/regularized_loss'):
    weights = convert_to_tensor(examples['example_weights'])
    return ((
        self._l1_loss() +
        # Note that here we are using the raw regularization
        # (as specified by the user) and *not*
        # self._symmetric_l2_regularization().
        self._l2_loss(self._options['symmetric_l2_regularization'])) /
            math_ops.reduce_sum(math_ops.cast(weights, dtypes.float64)) +
            self.unregularized_loss(examples))
def _iqfov_via_solve(self, x):
  """Get the inverse quadratic form on vectors via a solve."""
  # x^T A^{-1} x
  # Steps:
  # 1. Convert x to a matrix, flipping all extra dimensions in `x` to the
  #    final dimension of x_matrix.
  x_matrix = flip_vector_to_matrix(
      x, self.batch_shape(), self.get_batch_shape())
  # 2. Get soln_matrix = A^{-1} x_matrix.
  soln_matrix = self.solve(x_matrix)
  # 3. Reshape back to a vector.
  soln = flip_matrix_to_vector(
      soln_matrix, extract_batch_shape(x, 1), x.get_shape()[:-1])
  # 4. Compute the dot product: x^T soln.
  result = math_ops.reduce_sum(x * soln, reduction_indices=[-1])
  result.set_shape(x.get_shape()[:-1])
  return result
def _kl_categorical_categorical(a, b, name=None):
  """Calculate the batched KL divergence KL(a || b) with a and b Categorical.

  Args:
    a: instance of a Categorical distribution object.
    b: instance of a Categorical distribution object.
    name: (optional) Name to use for created operations.
      Default is "kl_categorical_categorical".

  Returns:
    Batchwise KL(a || b)
  """
  with ops.name_scope(
      name, "kl_categorical_categorical", [a.logits, b.logits]):
    # sum(p * ln(p / q))
    return math_ops.reduce_sum(
        nn_ops.softmax(a.logits) * (nn_ops.log_softmax(a.logits)
                                    - nn_ops.log_softmax(b.logits)),
        reduction_indices=[-1])
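
# Numeric sketch (assumed, NumPy) of the KL formula above, sum(p * ln(p / q)),
# computed directly from two sets of logits:
import numpy as np

def log_softmax(logits):
    z = logits - logits.max(axis=-1, keepdims=True)
    return z - np.log(np.exp(z).sum(axis=-1, keepdims=True))

a_logits = np.array([[1.0, 0.0, -1.0]])
b_logits = np.array([[0.5, 0.5, 0.5]])
p = np.exp(log_softmax(a_logits))
kl = np.sum(p * (log_softmax(a_logits) - log_softmax(b_logits)), axis=-1)
print(kl)   # non-negative, zero only when the two distributions match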
def _inverse_log_det_jacobian(self, y):
  # WLOG, consider the vector case:
  #   x = log(y[:-1]) - log(y[-1])
  # where,
  #   y[-1] = 1 - sum(y[:-1]).
  # We have:
  #   det{ dX/dY } = det{ diag(1 ./ y[:-1]) + 1 / y[-1] }
  #                = det{ inv{ diag(y[:-1]) - y[:-1]' y[:-1] } }           (1)
  #                = 1 / det{ diag(y[:-1]) - y[:-1]' y[:-1] }
  #                = 1 / { (1 + y[:-1]' inv(diag(y[:-1])) y[:-1]) *
  #                        det(diag(y[:-1])) }                             (2)
  #                = 1 / { y[-1] prod(y[:-1]) }
  #                = 1 / prod(y)
  # (1) - https://en.wikipedia.org/wiki/Sherman%E2%80%93Morrison_formula
  #       or by noting that det{ dX/dY } = 1 / det{ dY/dX } from the Bijector
  #       docstring "Tip".
  # (2) - https://en.wikipedia.org/wiki/Matrix_determinant_lemma
  return -math_ops.reduce_sum(math_ops.log(y), reduction_indices=-1)
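
# Numeric check (assumed, NumPy) that the log-det above really is -sum(log(y)):
# build the Jacobian dX/dY analytically at a point y on the simplex, where
# x_i = log(y_i) - log(y_n) and y_n = 1 - sum(y[:-1]).
import numpy as np

y = np.array([0.2, 0.3, 0.5])                 # on the 2-simplex; y[-1] is y_n
n = len(y) - 1
# dx_i/dy_j = delta_ij / y_i + 1 / y_n  (for i, j < n), since dy_n/dy_j = -1.
jac = np.diag(1.0 / y[:-1]) + (1.0 / y[-1]) * np.ones((n, n))
log_det = np.log(np.linalg.det(jac))          # log det{ dX/dY }
print(np.allclose(log_det, -np.sum(np.log(y))))   # True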
def _attention(self, query, attn_states):
  conv2d = nn_ops.conv2d
  reduce_sum = math_ops.reduce_sum
  softmax = nn_ops.softmax
  tanh = math_ops.tanh

  with tf.variable_scope("attention"):
    k = tf.get_variable(
        "attn_w", [1, 1, self._attn_size, self._attn_vec_size])
    v = tf.get_variable("attn_v", [self._attn_vec_size])
    hidden = array_ops.reshape(attn_states,
                               [-1, self._attn_length, 1, self._attn_size])
    hidden_features = conv2d(hidden, k, [1, 1, 1, 1], "SAME")
    y = _linear(query, self._attn_vec_size, True)
    y = array_ops.reshape(y, [-1, 1, 1, self._attn_vec_size])
    s = reduce_sum(v * tanh(hidden_features + y), [2, 3])
    a = softmax(s)
    d = reduce_sum(
        array_ops.reshape(a, [-1, self._attn_length, 1, 1]) * hidden, [1, 2])
    new_attns = array_ops.reshape(d, [-1, self._attn_size])
    new_attn_states = array_ops.slice(attn_states, [0, 1, 0], [-1, -1, -1])
    return new_attns, new_attn_states
def sequence_loss_by_mle(logits, targets, vocab_size, sequence_length,
                         batch_size, output_projection=None):
  # logits:  [seq_len, batch_size, emb_dim]
  # targets: [seq_len, batch_size] ==transpose==> [batch_size, seq_len]
  # labels = tf.to_int32(tf.transpose(targets))
  # targets: [seq_len, batch_size] ==reshape([-1])==> [seq_len * batch_size]
  labels = tf.to_int32(tf.reshape(targets, [-1]))

  if output_projection is not None:
    # logits = nn_ops.xw_plus_b(logits, output_projection[0], output_projection[1])
    logits = [tf.matmul(logit, output_projection[0]) + output_projection[1]
              for logit in logits]

  reshape_logits = tf.reshape(logits, [-1, vocab_size])  # [seq_len * batch_size, vocab_size]
  prediction = tf.clip_by_value(reshape_logits, 1e-20, 1.0)

  pretrain_loss = -tf.reduce_sum(
      # [seq_len * batch_size, vocab_size]
      tf.one_hot(labels, vocab_size, 1.0, 0.0) * tf.log(prediction)
  ) / (sequence_length * batch_size)
  return pretrain_loss
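
# Small numeric sketch (assumed, NumPy) of the final reduction above: a one-hot
# mask times log "prediction", summed and averaged over seq_len * batch_size.
import numpy as np

seq_len, batch_size, vocab_size = 2, 2, 3
prediction = np.full((seq_len * batch_size, vocab_size), 0.25)
labels = np.array([0, 2, 1, 0])
prediction[np.arange(4), labels] = 0.5         # probability of the target token

one_hot = np.eye(vocab_size)[labels]
loss = -np.sum(one_hot * np.log(prediction)) / (seq_len * batch_size)
print(loss)                                    # == -log(0.5)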
def __call__(self, inputs, state, scope=None):
  """Gated recurrent unit (GRU) with nunits cells."""
  with vs.variable_scope(scope or type(self).__name__):  # "GRUCell"
    with vs.variable_scope("Gates"):  # Reset gate and update gate.
      # We start with bias of 1.0 to not reset and not update.
      r, u, g = array_ops.split(1, 3, _linear([inputs, state],
                                              3 * self._num_units, True, 1.0))
      r, u, g = sigmoid(r), sigmoid(u), sigmoid(g)
    with vs.variable_scope("Candidate"):
      c = self._activation(_linear([inputs, r * state],
                                   self._num_units, True))
    new_h = u * state + (1 - u) * c

    eps = 1e-13
    temp = math_ops.div(
        math_ops.reduce_sum(math_ops.mul(new_h, state), 1),
        math_ops.reduce_sum(math_ops.mul(state, state), 1) + eps)
    m = array_ops.transpose(g)
    t1 = math_ops.mul(m, temp)
    t1 = array_ops.transpose(t1)

    distract_h = new_h - state * t1
  return distract_h, distract_h