import numpy as np
from scipy.spatial.distance import pdist, squareform

def svgd_kernel(self, h=-1):
    # Pairwise squared distances between the particles in self.theta (n x d).
    sq_dist = pdist(self.theta)
    pairwise_dists = squareform(sq_dist) ** 2
    if h < 0:  # if h < 0, use the median trick to pick the bandwidth
        h = np.median(pairwise_dists)
        h = np.sqrt(0.5 * h / np.log(self.theta.shape[0] + 1))

    # compute the RBF kernel
    Kxy = np.exp(-pairwise_dists / h ** 2 / 2)

    # gradient of the kernel w.r.t. the particles, summed over the second argument
    dxkxy = -np.matmul(Kxy, self.theta)
    sumkxy = np.sum(Kxy, axis=1)
    for i in range(self.theta.shape[1]):
        dxkxy[:, i] = dxkxy[:, i] + np.multiply(self.theta[:, i], sumkxy)
    dxkxy = dxkxy / (h ** 2)
    return (Kxy, dxkxy)

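# A minimal usage sketch (the `_Demo` shim and the random particles are
# illustrative assumptions, not part of the source): svgd_kernel only reads
# self.theta, so any object carrying a theta array will do.
class _Demo(object):
    pass

_demo = _Demo()
_demo.theta = np.random.randn(5, 2)      # 5 particles in 2-D
Kxy, dxkxy = svgd_kernel(_demo)          # h < 0 -> median-trick bandwidth
assert Kxy.shape == (5, 5) and dxkxy.shape == (5, 2)
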
import theano
import theano.tensor as T
from theano import config

def rbf_kernel(X0):
    XY = T.dot(X0, X0.transpose())
    x2 = T.reshape(T.sum(T.square(X0), axis=1), (X0.shape[0], 1))
    X2e = T.repeat(x2, X0.shape[0], axis=1)
    H = T.sub(T.add(X2e, X2e.transpose()), 2 * XY)

    V = H.flatten()
    # median distance
    h = T.switch(T.eq((V.shape[0] % 2), 0),
                 # if even vector
                 T.mean(T.sort(V)[((V.shape[0] // 2) - 1):((V.shape[0] // 2) + 1)]),
                 # if odd vector
                 T.sort(V)[V.shape[0] // 2])
    h = T.sqrt(0.5 * h / T.log(X0.shape[0].astype('float32') + 1.0)) / 2.

    Kxy = T.exp(-H / h ** 2 / 2.0)
    neighbors = T.argsort(H, axis=1)[:, 1]
    return Kxy, neighbors, h

def rbf_kernel(X):
    XY = T.dot(X, X.T)
    x2 = T.sum(X ** 2, axis=1).dimshuffle(0, 'x')
    X2e = T.repeat(x2, X.shape[0], axis=1)
    H = X2e + X2e.T - 2. * XY

    V = H.flatten()
    # median distance
    h = T.switch(T.eq((V.shape[0] % 2), 0),
                 # if even vector
                 T.mean(T.sort(V)[((V.shape[0] // 2) - 1):((V.shape[0] // 2) + 1)]),
                 # if odd vector
                 T.sort(V)[V.shape[0] // 2])
    h = T.sqrt(.5 * h / T.log(H.shape[0].astype('float32') + 1.))

    # compute the rbf kernel and its gradient w.r.t. the particles
    kxy = T.exp(-H / (h ** 2) / 2.0)
    dxkxy = -T.dot(kxy, X)
    sumkxy = T.sum(kxy, axis=1).dimshuffle(0, 'x')
    dxkxy = T.add(dxkxy, T.mul(X, sumkxy)) / (h ** 2)
    return kxy, dxkxy

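# Why dxkxy above is the kernel gradient: for
#     k(x_j, x_i) = exp(-||x_i - x_j||^2 / (2 h^2)),
#     sum_j grad_{x_j} k(x_j, x_i) = (x_i * sum_j k_ij - sum_j k_ij * x_j) / h^2,
# which is exactly (X * sumkxy - kxy.dot(X)) / h^2 as computed line by line above.
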
def gaussian_nll(x, mus, sigmas):
    """
    NLL for a multivariate normal with diagonal covariance matrix.
    See:
    wikipedia.org/wiki/Multivariate_normal_distribution#Likelihood_function
    where \Sigma = diag(s_1^2, ..., s_n^2).

    x, mus, sigmas should all have the same shape.
    sigmas (s_1, ..., s_n) should be strictly positive.
    The output has the same shape as the inputs, minus the last dimension.
    """
    # `lib.floatX` (a cast to theano's float type) comes from the
    # surrounding project's imports.
    nll = lib.floatX(np.log(2. * np.pi))
    nll += 2. * T.log(sigmas)
    nll += ((x - mus) / sigmas) ** 2.
    nll = nll.sum(axis=-1)
    nll *= lib.floatX(0.5)
    return nll

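# A plain-numpy cross-check of the same formula (hypothetical helper, not
# part of the source):
#     nll = 0.5 * sum_i [ log(2*pi) + 2*log(s_i) + ((x_i - mu_i) / s_i)^2 ]
def gaussian_nll_np(x, mus, sigmas):
    return 0.5 * (np.log(2. * np.pi) + 2. * np.log(sigmas)
                  + ((x - mus) / sigmas) ** 2.).sum(axis=-1)
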
def step(self, mode):
    if mode == "train" and self.mode == "test":
        raise Exception("Cannot train during test mode")

    if mode == "train":
        theano_fn = self.train_fn
        batch_gen = self.train_batch_gen
    elif mode == "test":
        theano_fn = self.test_fn
        batch_gen = self.test_batch_gen
    else:
        raise Exception("Invalid mode")

    data = next(batch_gen)
    ret = theano_fn(*data)
    return {"prediction": np.array(ret[0]),
            "answers": data[-1],
            "current_loss": ret[1],
            "log": ""}

def evaluation(self, X_test, y_test):
    # normalization
    X_test = self.normalization(X_test)

    # average over the output
    pred_y_test = np.zeros([self.M, len(y_test)])
    prob = np.zeros([self.M, len(y_test)])

    # Since we have M particles, we take a Bayesian view to calculate
    # the rmse and log-likelihood.
    for i in range(self.M):
        w1, b1, w2, b2, loggamma, loglambda = self.unpack_weights(self.theta[i, :])
        pred_y_test[i, :] = self.nn_predict(X_test, w1, b1, w2, b2) * self.std_y_train + self.mean_y_train
        prob[i, :] = np.sqrt(np.exp(loggamma)) / np.sqrt(2 * np.pi) * np.exp(-(np.power(pred_y_test[i, :] - y_test, 2) / 2) * np.exp(loggamma))
    pred = np.mean(pred_y_test, axis=0)

    # evaluation
    svgd_rmse = np.sqrt(np.mean((pred - y_test) ** 2))
    svgd_ll = np.mean(np.log(np.mean(prob, axis=0)))
    return (svgd_rmse, svgd_ll)

def parse_arguments(parser):
    parser.add_argument('seq_file', type=str, metavar='<visit_file>', help='The path to the Pickled file containing visit information of patients')
    parser.add_argument('label_file', type=str, metavar='<label_file>', help='The path to the Pickled file containing label information of patients')
    parser.add_argument('tree_file', type=str, metavar='<tree_file>', help='The path to the Pickled files containing the ancestor information of the input medical codes. Only use the prefix and exclude ".level#.pk".')
    parser.add_argument('out_file', metavar='<out_file>', help='The path to the output models. The models will be saved after every epoch')
    parser.add_argument('--embed_file', type=str, default='', help='The path to the Pickled file containing the representation vectors of medical codes. If you are not using medical code representations, do not use this option')
    parser.add_argument('--embed_size', type=int, default=128, help='The dimension size of the visit embedding. If you are providing your own medical code vectors, this value will be automatically decided. (default value: 128)')
    parser.add_argument('--rnn_size', type=int, default=128, help='The dimension size of the hidden layer of the GRU (default value: 128)')
    parser.add_argument('--attention_size', type=int, default=128, help='The dimension size of hidden layer of the MLP that generates the attention weights (default value: 128)')
    parser.add_argument('--batch_size', type=int, default=100, help='The size of a single mini-batch (default value: 100)')
    parser.add_argument('--n_epochs', type=int, default=100, help='The number of training epochs (default value: 100)')
    parser.add_argument('--L2', type=float, default=0.001, help='L2 regularization coefficient for all weights except RNN (default value: 0.001)')
    parser.add_argument('--dropout_rate', type=float, default=0.5, help='Dropout rate used for the hidden layer of RNN (default value: 0.5)')
    parser.add_argument('--log_eps', type=float, default=1e-8, help='A small value to prevent log(0) (default value: 1e-8)')
    parser.add_argument('--verbose', action='store_true', help='Print output after every 100 mini-batches (default false)')
    args = parser.parse_args()
    return args

def iou_loss(p, t):
    # Boxes are stored as [x0, y0, x1, y1] and reshaped to (batch, 2, 2).
    tp, tt = p.reshape((p.shape[0], 2, 2)), t.reshape((t.shape[0], 2, 2))
    overlaps_t0 = T.maximum(tp[:, 0, :], tt[:, 0, :])
    overlaps_t1 = T.minimum(tp[:, 1, :], tt[:, 1, :])
    intersection = overlaps_t1 - overlaps_t0
    bool_overlap = T.min(intersection, axis=1) > 0
    intersection = intersection[:, 0] * intersection[:, 1]
    intersection = T.maximum(intersection, np.float32(0.))
    dims_p = tp[:, 1, :] - tp[:, 0, :]
    areas_p = dims_p[:, 0] * dims_p[:, 1]
    dims_t = tt[:, 1, :] - tt[:, 0, :]
    areas_t = dims_t[:, 0] * dims_t[:, 1]
    union = areas_p + areas_t - intersection
    loss = 1. - T.minimum(
        T.exp(T.log(T.abs_(intersection)) -
              T.log(T.abs_(union) + np.float32(1e-5))),
        np.float32(1.)
    )
    return T.mean(loss)

def iou_loss_val(p, t):
    # Numpy twin of iou_loss, used for validation.
    tp, tt = p.reshape((p.shape[0], 2, 2)), t.reshape((t.shape[0], 2, 2))
    overlaps = np.zeros_like(tp, dtype=np.float32)
    overlaps[:, 0, :] = np.maximum(tp[:, 0, :], tt[:, 0, :])
    overlaps[:, 1, :] = np.minimum(tp[:, 1, :], tt[:, 1, :])
    intersection = overlaps[:, 1, :] - overlaps[:, 0, :]
    bool_overlap = np.min(intersection, axis=1) > 0
    intersection = intersection[:, 0] * intersection[:, 1]
    intersection = np.maximum(intersection, 0.)
    dims_p = tp[:, 1, :] - tp[:, 0, :]
    areas_p = dims_p[:, 0] * dims_p[:, 1]
    dims_t = tt[:, 1, :] - tt[:, 0, :]
    areas_t = dims_t[:, 0] * dims_t[:, 1]
    union = areas_p + areas_t - intersection
    loss = 1. - np.minimum(
        np.exp(np.log(np.abs(intersection)) - np.log(np.abs(union) + 1e-5)),
        1.
    )
    return np.mean(loss)

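# Quick numeric check of iou_loss_val (illustrative values, not from the
# source): the boxes overlap in a 1x1 square, so IoU = 1 / (4 + 4 - 1) and
# the loss is about 1 - 1/7 = 0.857.
_p = np.array([[0., 0., 2., 2.]], dtype=np.float32)
_t = np.array([[1., 1., 3., 3.]], dtype=np.float32)
print(iou_loss_val(_p, _t))
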
def negativeLogLikelihoodWeighted(self, y, weightPerClass):
    # Weight the cost of the different classes in the cost function to
    # counter class imbalance.
    e1 = np.finfo(np.float32).tiny
    addTinyProbMatrix = T.lt(self.p_y_given_x_train, 4 * e1) * e1

    weights = weightPerClass.dimshuffle('x', 0, 'x', 'x', 'x')
    log_p_y_given_x_train = T.log(self.p_y_given_x_train + addTinyProbMatrix)
    weighted_log_probs = log_p_y_given_x_train * weights

    wShape = weighted_log_probs.shape

    # Re-arrange: index the class axis by the ground-truth labels y.
    idx0 = T.arange(wShape[0]).dimshuffle(0, 'x', 'x', 'x')
    idx2 = T.arange(wShape[2]).dimshuffle('x', 0, 'x', 'x')
    idx3 = T.arange(wShape[3]).dimshuffle('x', 'x', 0, 'x')
    idx4 = T.arange(wShape[4]).dimshuffle('x', 'x', 'x', 0)
    return -T.mean(weighted_log_probs[idx0, y, idx2, idx3, idx4])

def log_marginal(self, y, h, py, q):
    '''Computes the approximate log marginal.

    Uses \log \sum p / q - \log N.

    Args:
        y: T.tensor, target values.
        h: T.tensor, latent samples.
        py: T.tensor, conditional density p(y | h).
        q: approximate posterior q(h | y).
    Returns:
        approximate log marginal.
    '''
    log_py_h = -self.conditional.neg_log_prob(y, py)
    log_ph = -self.prior.neg_log_prob(h)
    log_qh = -self.posterior.neg_log_prob(h, q)
    assert log_py_h.ndim == log_ph.ndim == log_qh.ndim

    log_p = log_py_h + log_ph - log_qh
    # Max-shift for a numerically stable log-mean-exp over the sample axis.
    log_p_max = T.max(log_p, axis=0, keepdims=True)
    w = T.exp(log_p - log_p_max)
    return (T.log(w.mean(axis=0, keepdims=True)) + log_p_max).mean()

def step_free_energy(self, x, beta, *params):
    '''Step free energy function.

    Args:
        x (T.tensor): data sample.
        beta (float): beta value for annealing.
        *params: theano shared variables.
    Returns:
        T.tensor: free energy.
    '''
    W, v_params, h_params = self.split_params(*params)

    vis_term = beta * self.v_dist.get_energy_bias(x, *v_params)
    x = self.v_dist.scale_for_energy_model(x, *v_params)
    hid_act = beta * (T.dot(x, W) + self.h_dist.get_center(*h_params))
    fe = -vis_term - T.log(1. + T.exp(hid_act)).sum(axis=1)
    return fe

def step_free_energy_h(self, h, beta, *params):
    '''Step free energy function for hidden states.

    Args:
        h (T.tensor): hidden sample.
        beta (float): beta value for annealing.
        *params: theano shared variables.
    Returns:
        T.tensor: free energy.
    '''
    W, v_params, h_params = self.split_params(*params)

    hid_term = beta * self.h_dist.get_energy_bias(h, *h_params)
    h = self.h_dist.scale_for_energy_model(h, *h_params)
    vis_act = beta * (T.dot(h, W.T) + self.v_dist.get_center(*v_params))
    fe = -hid_term - T.log(1. + T.exp(vis_act)).sum(axis=1)
    return fe

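# Note (editorial): T.log(1. + T.exp(a)) in the two free-energy functions is
# the softplus; Theano's T.nnet.softplus(a) computes the same quantity with
# better numerical behavior for large activations.
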
def _generate_train_model_function(self, scores):
    u = T.lvector('u')
    i = T.lvector('i')
    j = T.lvector('j')

    self.W = theano.shared(np.zeros((self._dim)).astype('float32'), name='W')
    self.S = theano.shared(scores, name='S')

    x_ui = T.dot(self.W, self.S[u, i, :].T)
    x_uj = T.dot(self.W, self.S[u, j, :].T)
    x_uij = x_ui - x_uj

    obj = T.sum(
        T.log(T.nnet.sigmoid(x_uij)).sum() -
        self._lambda_w * 0.5 * (self.W ** 2).sum()
    )
    cost = -obj

    g_cost_W = T.grad(cost=cost, wrt=self.W)
    updates = [
        (self.W, self.W - self._learning_rate * g_cost_W)
    ]
    self.train_model = theano.function(inputs=[u, i, j], outputs=cost, updates=updates)

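# Note (editorial): this objective has the shape of a BPR-style pairwise
# ranking loss -- maximize log sigmoid(x_ui - x_uj) over (user, positive,
# negative) triplets, with L2 regularization on W, trained by plain SGD.
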
def ctc_update_log_p(skip_idxs, zeros, active, log_p_curr, log_p_prev):
    active_skip_idxs = skip_idxs[(skip_idxs < active).nonzero()]
    active_next = T.cast(T.minimum(
        T.maximum(
            active + 1,
            T.max(T.concatenate([active_skip_idxs, [-1]])) + 2 + 1
        ), log_p_curr.shape[0]), 'int32')

    # Max-shift so the transition sums happen in probability space safely.
    common_factor = T.max(log_p_prev[:active])
    p_prev = T.exp(log_p_prev[:active] - common_factor)
    _p_prev = zeros[:active_next]
    # copy over
    _p_prev = T.set_subtensor(_p_prev[:active], p_prev)
    # previous transitions
    _p_prev = T.inc_subtensor(_p_prev[1:], _p_prev[:-1])
    # skip transitions
    _p_prev = T.inc_subtensor(_p_prev[active_skip_idxs + 2], p_prev[active_skip_idxs])
    updated_log_p_prev = T.log(_p_prev) + common_factor

    log_p_next = T.set_subtensor(
        zeros[:active_next],
        log_p_curr[:active_next] + updated_log_p_prev
    )
    return active_next, log_p_next

def ctc_path_probs(predict, Y, alpha=1e-4):
    smoothed_predict = (1 - alpha) * predict[:, Y] + alpha * np.float32(1.) / Y.shape[0]

    L = T.log(smoothed_predict)
    zeros = T.zeros_like(L[0])
    log_first = zeros

    f_skip_idxs = ctc_create_skip_idxs(Y)
    b_skip_idxs = ctc_create_skip_idxs(Y[::-1])  # there should be a shortcut to calculating this

    def step(log_f_curr, log_b_curr, f_active, log_f_prev, b_active, log_b_prev):
        f_active_next, log_f_next = ctc_update_log_p(f_skip_idxs, zeros, f_active, log_f_curr, log_f_prev)
        b_active_next, log_b_next = ctc_update_log_p(b_skip_idxs, zeros, b_active, log_b_curr, log_b_prev)
        return f_active_next, log_f_next, b_active_next, log_b_next

    [f_active, log_f_probs, b_active, log_b_probs], _ = theano.scan(
        step, sequences=[L, L[::-1, ::-1]], outputs_info=[np.int32(1), log_first, np.int32(1), log_first])

    idxs = T.arange(L.shape[1]).dimshuffle('x', 0)
    mask = (idxs < f_active.dimshuffle(0, 'x')) & (idxs < b_active.dimshuffle(0, 'x'))[::-1, ::-1]
    log_probs = log_f_probs + log_b_probs[::-1, ::-1] - L
    return log_probs, mask

def theano_logsumexp(x, axis=None):
    """
    Compute log(sum(exp(x), axis=axis)) in a numerically stable
    fashion.

    Parameters
    ----------
    x : tensor_like
        A Theano tensor (any dimension will do).
    axis : int or symbolic integer scalar, or None
        Axis over which to perform the summation. `None`, the
        default, performs over all axes.

    Returns
    -------
    result : ndarray or scalar
        The result of the log(sum(exp(...))) operation.
    """
    xmax = x.max(axis=axis, keepdims=True)
    xmax_ = x.max(axis=axis)
    return xmax_ + T.log(T.exp(x - xmax).sum(axis=axis))

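# The same max-shift trick in plain numpy, for intuition (hypothetical
# helper, not part of the source). Without the shift, np.exp overflows and
# the naive log-sum-exp returns inf for inputs around 1000.
def logsumexp_np(x, axis=None):
    xmax = np.max(x, axis=axis, keepdims=True)
    xmax_ = np.max(x, axis=axis)
    return xmax_ + np.log(np.exp(x - xmax).sum(axis=axis))

print(logsumexp_np(np.array([1000., 1000.])))   # 1000.693..., i.e. 1000 + log(2)
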
def padMatrixWithTime(seqs, labels, times, options):
    lengths = np.array([len(seq) for seq in seqs]) - 1
    n_samples = len(seqs)
    maxlen = np.max(lengths)
    inputDimSize = options['inputDimSize']
    numClass = options['numClass']

    x = np.zeros((maxlen, n_samples, inputDimSize)).astype(config.floatX)
    y = np.zeros((maxlen, n_samples, numClass)).astype(config.floatX)
    t = np.zeros((maxlen, n_samples)).astype(config.floatX)
    mask = np.zeros((maxlen, n_samples)).astype(config.floatX)
    for idx, (seq, time, label) in enumerate(zip(seqs, times, labels)):
        for xvec, subseq in zip(x[:, idx, :], seq[:-1]):
            xvec[subseq] = 1.
        for yvec, subseq in zip(y[:, idx, :], label[1:]):
            yvec[subseq] = 1.
        mask[:lengths[idx], idx] = 1.
        t[:lengths[idx], idx] = time[:-1]

    lengths = np.array(lengths, dtype=config.floatX)
    if options['useLogTime']:
        t = np.log(t + options['logEps'])

    return x, y, t, mask, lengths

def parse_arguments(parser):
    parser.add_argument('seq_file', type=str, metavar='<visit_file>', help='The path to the Pickled file containing visit information of patients')
    parser.add_argument('n_input_codes', type=int, metavar='<n_input_codes>', help='The number of unique input medical codes')
    parser.add_argument('label_file', type=str, metavar='<label_file>', help='The path to the Pickled file containing label information of patients')
    parser.add_argument('n_output_codes', type=int, metavar='<n_output_codes>', help='The number of unique label medical codes')
    parser.add_argument('out_file', metavar='<out_file>', help='The path to the output models. The models will be saved after every epoch')
    parser.add_argument('--time_file', type=str, default='', help='The path to the Pickled file containing durations between visits of patients. If you are not using duration information, do not use this option')
    parser.add_argument('--predict_time', type=int, default=0, choices=[0,1], help='Use this option if you want the GRU to also predict the time duration until the next visit (0 for false, 1 for true) (default value: 0)')
    parser.add_argument('--tradeoff', type=float, default=1.0, help='Tradeoff variable for balancing the two loss functions: code prediction function and duration prediction function (default value: 1.0)')
    parser.add_argument('--use_log_time', type=int, default=1, choices=[0,1], help='Use logarithm of time duration to dampen the impact of the outliers (0 for false, 1 for true) (default value: 1)')
    parser.add_argument('--embed_file', type=str, default='', help='The path to the Pickled file containing the representation vectors of medical codes. If you are not using medical code representations, do not use this option')
    parser.add_argument('--embed_size', type=int, default=200, help='The size of the visit embedding before passing it to the GRU layers. If you are not providing your own medical code vectors, you must specify this value (default value: 200)')
    parser.add_argument('--embed_finetune', type=int, default=1, choices=[0,1], help='If you are using randomly initialized code representations, always use this option. If you are using an external medical code representations, and you want to fine-tune them as you train the GRU, use this option as well. (0 for false, 1 for true) (default value: 1)')
    parser.add_argument('--hidden_dim_size', type=str, default='[200,200]', help='The size of the hidden layers of the GRU. This is a string argument. For example, [500,400] means you are using a two-layer GRU where the lower layer uses a 500-dimensional hidden layer, and the upper layer uses a 400-dimensional hidden layer. (default value: [200,200])')
    parser.add_argument('--batch_size', type=int, default=100, help='The size of a single mini-batch (default value: 100)')
    parser.add_argument('--n_epochs', type=int, default=10, help='The number of training epochs (default value: 10)')
    parser.add_argument('--L2_softmax', type=float, default=0.001, help='L2 regularization for the softmax function (default value: 0.001)')
    parser.add_argument('--L2_time', type=float, default=0.001, help='L2 regularization for the linear regression (default value: 0.001)')
    parser.add_argument('--dropout_rate', type=float, default=0.5, help='Dropout rate between GRU hidden layers, and between the final hidden layer and the softmax layer (default value: 0.5)')
    parser.add_argument('--log_eps', type=float, default=1e-8, help='A small value to prevent log(0) (default value: 1e-8)')
    parser.add_argument('--verbose', action='store_true', help='Print output after every 10 mini-batches (default false)')
    args = parser.parse_args()
    return args

def theano_logsumexp(x, axis=None):
    """
    Compute log(sum(exp(x), axis=axis)) in a numerically stable
    fashion.

    Parameters
    ----------
    x : tensor_like
        A Theano tensor (any dimension will do).
    axis : int or symbolic integer scalar, or None
        Axis over which to perform the summation. `None`, the
        default, performs over all axes.

    Returns
    -------
    result : ndarray or scalar
        The result of the log(sum(exp(...))) operation.
    """
    xmax = T.max(x, axis=axis, keepdims=True)
    xmax_ = T.max(x, axis=axis)
    return xmax_ + T.log(T.exp(x - xmax).sum(axis=axis))

def xent(self, inputs, inputs_mask, chars, chars_mask,
         outputs, outputs_mask, attention):
    pred_outputs, pred_attention = self(
        inputs, inputs_mask, chars, chars_mask, outputs, outputs_mask)
    outputs_xent = batch_sequence_crossentropy(
        pred_outputs, outputs[1:], outputs_mask[1:])

    # Note that pred_attention will contain zero elements for masked-out
    # character positions. To avoid trouble with log(), we add 1 to those
    # zero attention elements (the added terms are removed again by the
    # mask multiplication).
    batch_size = attention.shape[1].astype(theano.config.floatX)
    attention_mask = (inputs_mask.dimshuffle('x', 1, 0) *
                      outputs_mask[1:].dimshuffle(0, 1, 'x')
                      ).astype(theano.config.floatX)
    epsilon = 1e-6
    attention_xent = (
        -attention[1:]
        * T.log(epsilon + pred_attention + (1 - attention_mask))
        * attention_mask).sum() / batch_size
    return outputs_xent, attention_xent

def multi_label_ACE(outputs, y_labels):
    # Vectorized multi-label cross-entropy over 12 outputs (an earlier,
    # commented-out draft looped over every element with math.log and carried
    # an unused per-class weight list). Outputs are squashed by
    # (output + 0.05) / 1.05 so both log() terms stay finite at output
    # values of exactly 0 or 1. BATCH_SIZE is a module-level constant
    # in the source.
    loss_buff = 0
    for i in range(12):
        target = y_labels[:, i]
        output = outputs[:, i]
        loss_au = T.sum(-(target * T.log((output + 0.05) / 1.05) +
                          (1.0 - target) * T.log((1.05 - output) / 1.05)))
        loss_buff += loss_au
    return loss_buff / (12 * BATCH_SIZE)

def sequence_log_likelihood(y, y_hat, y_mask, y_hat_mask, blank_symbol):
    """
    Based on code from Shawn Tan.
    Credits to Kyle Kastner as well.
    """
    y_hat_mask_len = T.sum(y_hat_mask, axis=0, dtype='int32')
    y_mask_len = T.sum(y_mask, axis=0, dtype='int32')

    log_probabs = _log_path_probabs(
        y, T.log(y_hat), y_mask, y_hat_mask, blank_symbol)
    batch_size = log_probabs.shape[1]

    # The label probability is the sum (in log space) of the two valid
    # final CTC states: ending on the last label or on the trailing blank.
    log_labels_probab = _log_add(
        log_probabs[y_hat_mask_len - 1,
                    T.arange(batch_size),
                    y_mask_len - 1],
        log_probabs[y_hat_mask_len - 1,
                    T.arange(batch_size),
                    y_mask_len - 2])
    return log_labels_probab

def log_sum_exp(x, axis=1):
    # Numerically stable log-sum-exp; assumes a 2-D input when axis=1,
    # since the max is broadcast back with dimshuffle(0, 'x').
    m = T.max(x, axis=axis)
    return m + T.log(T.sum(T.exp(x - m.dimshuffle(0, 'x')), axis=axis))

def softmax_loss(p_true, output_before_softmax):
    output_before_softmax -= T.max(output_before_softmax, axis=1, keepdims=True)
    if p_true.ndim == 2:
        return T.mean(T.log(T.sum(T.exp(output_before_softmax), axis=1)) - T.sum(p_true * output_before_softmax, axis=1))
    else:
        return T.mean(T.log(T.sum(T.exp(output_before_softmax), axis=1)) - output_before_softmax[T.arange(p_true.shape[0]), p_true])

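# Note (editorial): subtracting the per-row max first is the standard
# log-sum-exp stabilization, so T.exp cannot overflow; the result is the
# cross-entropy computed directly from unnormalized logits, accepting either
# a full distribution (ndim == 2) or integer class labels for p_true.
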
def build_model(model_):
    # `th` is the source project's alias for theano; VanillaSGD, AdamSGD,
    # OZER, LEARN_RATE, TRAIN_PDPO, PROFILE, g_dataset and g_mdl are
    # module-level globals there.
    global fn_predict, fn_record
    global g_ozer, g_mdl

    g_ozer = dict(simple=VanillaSGD, adam=AdamSGD)[OZER]()
    g_ozer.lr = LEARN_RATE

    s_x = T.tensor4('x')
    s_y = T.ivector('y')
    s_pdpo = T.scalar()
    s_out = model_(s_x, s_pdpo)

    s_y_onehot = T.extra_ops.to_one_hot(s_y, len(g_dataset.label_map))
    s_loss = T.mean(-s_y_onehot * T.log(s_out + 1e-3))
    s_accr = T.mean(T.switch(
        T.eq(T.argmax(s_out, axis=1), T.argmax(s_y_onehot, axis=1)), 1, 0))

    no_dropout = [(s_pdpo, T.constant(0., dtype=th.config.floatX))]
    fn_predict = th.function(
        [s_x, s_y],
        {'pred': s_out, 'accr': s_accr, 'loss': s_loss},
        givens=no_dropout, profile=PROFILE)

    rec_fetches = {
        'x': s_x, 'y': s_y,
        'pred': s_out}
    rec_fetches.update(g_mdl.params_di)
    fn_record = th.function(
        [s_x, s_y], rec_fetches, givens=no_dropout, profile=PROFILE)

    g_ozer.compile(
        [s_x, s_y],
        s_loss,
        g_mdl.params_di.values(),
        fetches_={'pred': s_out, 'loss': s_loss, 'accr': s_accr},
        givens_=[(s_pdpo, T.constant(TRAIN_PDPO, dtype=th.config.floatX))],
        profile_=PROFILE)

def compute_loss(output, num_samples, num_entries=6, gamma=500.0):
    """Compute the loss of a dataset, given the output of the DSSM.

    Args:
        output (:class:`lasagne.layers.Layer`): the output of the DSSM
        num_samples (int): the number of samples in the dataset
        num_entries (int): the number of compared papers in the DSSM structure
        gamma (float): the coefficient applied in the softmax of the similarities

    Returns:
        theano.tensor.TensorType: the loss of the dataset
    """
    assert (num_entries > 2)
    assert (num_samples > 0)

    # Post-NN operations to compute the loss
    # First, we extract the first output of each bundle
    mask = np.zeros(num_entries * num_samples)
    mask[::num_entries] = 1
    unmask = np.ones(num_entries * num_samples) - mask
    cited = T.extra_ops.compress(mask, output, axis=0)
    odocs = T.extra_ops.compress(unmask, output, axis=0)

    # We duplicate each row 'x' num_entries-1 times
    cited = T.extra_ops.repeat(cited, num_entries - 1, axis=0)
    # Then we compute the element-wise product of x with each y, for each bundle
    sims = T.sum(cited * odocs, axis=1)
    # We reshape the similarities
    sims = T.reshape(sims, (num_samples, num_entries - 1))
    sims = gamma * sims
    # We take the softmax of each row
    probs = T.nnet.softmax(sims)
    # We compute the loss from the elements of the first column
    loss_mask = np.zeros(num_entries - 1)
    loss_mask[0] = 1
    loss = T.extra_ops.compress(loss_mask, probs, axis=1)
    return -T.log(T.prod(loss))

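# Note (editorial): -T.log(T.prod(loss)) is mathematically the same as
# -T.sum(T.log(loss)), the usual negative log-likelihood form; the latter is
# the numerically safer spelling when num_samples is large, since a product
# of many probabilities underflows long before the sum of their logs.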