def calculate_loss_mix2(self, predictions, predictions_class, predictions_encoder, labels, **unused_params):
    """Combine the label loss with a weighted MSE loss on encoded labels.

    The labels are passed through a frozen stack of dense layers whose
    parameters are loaded from text files under ``FLAGS.autoencoder_dir``.
    Intermediate layers use ReLU; the output of the last layer is
    standardised along axis 1 and serves as the regression target for
    ``predictions_encoder``.

    Args:
      predictions: passed to ``self.calculate_loss`` together with ``labels``.
      predictions_class: not used by this method.
      predictions_encoder: passed to ``self.calculate_mseloss``.
      labels: label tensor; cast to float32 before being encoded.
      **unused_params: ignored.

    Returns:
      A tuple ``(loss, float_encoders)`` where ``loss`` is
      ``0.1 * mse_loss + label_loss`` and ``float_encoders`` is the encoded
      label tensor.
    """
    layer_count = FLAGS.encoder_layers
    with tf.name_scope("loss_mix2"):
        encoded = tf.cast(labels, tf.float32)
        for layer in range(layer_count):
            table = np.loadtxt(FLAGS.autoencoder_dir+'autoencoder_layer%d.model' % layer)
            # All rows but the last hold the weights; the last row is the bias.
            kernel = tf.constant(table[:-1, :], dtype=tf.float32)
            offset = tf.reshape(tf.constant(table[-1, :], dtype=tf.float32), [-1])
            encoded = tf.nn.xw_plus_b(encoded, kernel, offset)
            if layer == layer_count - 1:
                # Final layer: standardise each row (epsilon avoids a zero divisor).
                row_mean = tf.reduce_mean(encoded, axis=1, keep_dims=True)
                row_std = tf.sqrt(tf.reduce_mean(tf.square(encoded - row_mean), axis=1, keep_dims=True))
                encoded = (encoded - row_mean) / (row_std + 1e-6)
            else:
                encoded = tf.nn.relu(encoded)
        encoder_term = 0.1 * self.calculate_mseloss(predictions_encoder, encoded)
        label_term = self.calculate_loss(predictions, labels)
        return encoder_term + label_term, encoded
# Source code examples for Python's sqrt()
def ae(x):
    """Apply the eight-layer dense network to ``x``.

    The activation is selected by the enclosing-scope ``nonlinearity_name``
    ('relu', 'elu', 'gelu' or 'silu'). Layers '1'..'7' of ``W``/``b`` are
    followed by the activation; layer '8' is left linear.

    Raises:
        NameError: if ``nonlinearity_name`` is not one of the supported names.
    """
    if nonlinearity_name == 'relu':
        activation = tf.nn.relu
    elif nonlinearity_name == 'elu':
        activation = tf.nn.elu
    elif nonlinearity_name == 'gelu':
        # tanh-based GELU; an erfc-based form,
        # x * erfc(-x / sqrt(2)) / 2, was left disabled in the original.
        def gelu_fast(_x):
            return 0.5 * _x * (1 + tf.tanh(tf.sqrt(2 / np.pi) * (_x + 0.044715 * tf.pow(_x, 3))))
        activation = gelu_fast
    elif nonlinearity_name == 'silu':
        def silu(_x):
            return _x * tf.sigmoid(_x)
        activation = silu
    else:
        raise NameError("Need 'relu', 'elu', 'gelu', or 'silu' for nonlinearity_name")

    hidden = x
    for key in ('1', '2', '3', '4', '5', '6', '7'):
        hidden = activation(tf.matmul(hidden, W[key]) + b[key])
    return tf.matmul(hidden, W['8']) + b['8']
def build_encoder(self):
    """Inference Network. q(h|X)

    Builds two ReLU layers on ``self.x``, then linear heads for ``self.mu``
    and ``self.log_sigma_sq``, and samples ``self.h = mu + sigma * eps``
    with ``eps`` drawn from a standard normal.
    """
    with tf.variable_scope("encoder"):
        # The two cells are constructed here but not used further in this method.
        q_cell = tf.nn.rnn_cell.LSTMCell(self.embed_dim, self.vocab_size)
        a_cell = tf.nn.rnn_cell.LSTMCell(self.embed_dim, self.vocab_size)

        batched_x = tf.expand_dims(self.x, 0)
        hidden1 = tf.nn.relu(
            tf.nn.rnn_cell.linear(batched_x, self.embed_dim, bias=True, scope="l1"))
        hidden2 = tf.nn.relu(
            tf.nn.rnn_cell.linear(hidden1, self.embed_dim, bias=True, scope="l2"))

        self.mu = tf.nn.rnn_cell.linear(hidden2, self.h_dim, bias=True, scope="mu")
        self.log_sigma_sq = tf.nn.rnn_cell.linear(
            hidden2, self.h_dim, bias=True, scope="log_sigma_sq")

        noise = tf.random_normal((1, self.h_dim), 0, 1, dtype=tf.float32)
        # sigma = sqrt(exp(log sigma^2))
        std_dev = tf.sqrt(tf.exp(self.log_sigma_sq))

        _ = tf.histogram_summary("mu", self.mu)
        _ = tf.histogram_summary("sigma", std_dev)

        self.h = self.mu + std_dev * noise
def build_encoder(self):
    """Inference Network. q(h|X)

    Every intermediate tensor is kept on ``self``: the two linear/ReLU
    layers, ``mu``, ``log_sigma_sq``, the noise ``eps``, ``sigma`` and the
    sample ``h = mu + sigma * eps``. Histogram summaries are registered for
    ``mu``, ``sigma``, ``h`` and ``mu + sigma``.
    """
    with tf.variable_scope("encoder"):
        batched_x = tf.expand_dims(self.x, 0)

        # First hidden layer.
        self.l1_lin = linear(batched_x, self.embed_dim, bias=True, scope="l1")
        self.l1 = tf.nn.relu(self.l1_lin)

        # Second hidden layer.
        self.l2_lin = linear(self.l1, self.embed_dim, bias=True, scope="l2")
        self.l2 = tf.nn.relu(self.l2_lin)

        # Heads for the mean and log-variance.
        self.mu = linear(self.l2, self.h_dim, bias=True, scope="mu")
        self.log_sigma_sq = linear(self.l2, self.h_dim, bias=True, scope="log_sigma_sq")

        # Sample h from mu, sigma and standard-normal noise.
        self.eps = tf.random_normal((1, self.h_dim), 0, 1, dtype=tf.float32)
        self.sigma = tf.sqrt(tf.exp(self.log_sigma_sq))
        scaled_noise = tf.mul(self.sigma, self.eps)
        self.h = tf.add(self.mu, scaled_noise)

        _ = tf.histogram_summary("mu", self.mu)
        _ = tf.histogram_summary("sigma", self.sigma)
        _ = tf.histogram_summary("h", self.h)
        shifted = self.mu + self.sigma
        _ = tf.histogram_summary("mu + sigma", shifted)
def log_variable(variable, gradient=None):
    r'''
    Register TensorBoard summaries describing a tensor variable.

    Scalar summaries are written for the mean, standard deviation, maximum
    and minimum of ``variable``, plus a histogram of its values. When
    ``gradient`` is given, a histogram of the gradient values is added as
    well; for ``tf.IndexedSlices`` the ``values`` field is used.
    '''
    tag = variable.name
    average = tf.reduce_mean(variable)
    tf.summary.scalar(name='%s/mean' % tag, tensor=average)
    deviation = tf.sqrt(tf.reduce_mean(tf.square(variable - average)))
    tf.summary.scalar(name='%s/sttdev' % tag, tensor=deviation)
    tf.summary.scalar(name='%s/max' % tag, tensor=tf.reduce_max(variable))
    tf.summary.scalar(name='%s/min' % tag, tensor=tf.reduce_min(variable))
    tf.summary.histogram(name=tag, values=variable)

    if gradient is None:
        return
    grad_values = gradient.values if isinstance(gradient, tf.IndexedSlices) else gradient
    if grad_values is not None:
        tf.summary.histogram(name='%s/gradients' % tag, values=grad_values)
def log_variable(variable, gradient=None):
    r'''
    Register TensorBoard summaries describing a tensor variable.

    Scalar summaries are written for the mean, standard deviation, maximum
    and minimum of ``variable``, plus a histogram of its values. When
    ``gradient`` is given, a histogram of the gradient values is added as
    well; for ``tf.IndexedSlices`` the ``values`` field is used.
    '''
    tag = variable.name
    average = tf.reduce_mean(variable)
    tf.summary.scalar(name='%s/mean' % tag, tensor=average)
    deviation = tf.sqrt(tf.reduce_mean(tf.square(variable - average)))
    tf.summary.scalar(name='%s/sttdev' % tag, tensor=deviation)
    tf.summary.scalar(name='%s/max' % tag, tensor=tf.reduce_max(variable))
    tf.summary.scalar(name='%s/min' % tag, tensor=tf.reduce_min(variable))
    tf.summary.histogram(name=tag, values=variable)

    if gradient is None:
        return
    grad_values = gradient.values if isinstance(gradient, tf.IndexedSlices) else gradient
    if grad_values is not None:
        tf.summary.histogram(name='%s/gradients' % tag, values=grad_values)
def log_variable(variable, gradient=None):
    r'''
    Register TensorBoard summaries describing a tensor variable.

    Scalar summaries are written for the mean, standard deviation, maximum
    and minimum of ``variable``, plus a histogram of its values. When
    ``gradient`` is given, a histogram of the gradient values is added as
    well; for ``tf.IndexedSlices`` the ``values`` field is used.
    '''
    tag = variable.name
    average = tf.reduce_mean(variable)
    tf.summary.scalar(name='%s/mean' % tag, tensor=average)
    deviation = tf.sqrt(tf.reduce_mean(tf.square(variable - average)))
    tf.summary.scalar(name='%s/sttdev' % tag, tensor=deviation)
    tf.summary.scalar(name='%s/max' % tag, tensor=tf.reduce_max(variable))
    tf.summary.scalar(name='%s/min' % tag, tensor=tf.reduce_min(variable))
    tf.summary.histogram(name=tag, values=variable)

    if gradient is None:
        return
    grad_values = gradient.values if isinstance(gradient, tf.IndexedSlices) else gradient
    if grad_values is not None:
        tf.summary.histogram(name='%s/gradients' % tag, values=grad_values)
def batchnorm(x, name, phase, updates, gamma=0.96):
    """Normalise ``x`` with batch or running statistics and apply a learned affine map.

    Args:
        x: input tensor; its second dimension sets the size of the statistics.
        name: prefix for the variables created here.
        phase: selector passed to ``switch`` to choose between the
            batch-statistics output and the running-statistics output.
        updates: list that receives the two assign ops refreshing the
            running statistics.
        gamma: decay factor of the running averages.

    Returns:
        The normalised tensor multiplied by ``<name>/scaling`` and shifted by
        ``<name>/translation``.
    """
    k = x.get_shape()[1]

    def _stat(suffix, init_value, trainable):
        # All four variables share the [1, k] shape and a constant initializer.
        return tf.get_variable(name+suffix, shape=[1, k],
                               initializer=tf.constant_initializer(init_value),
                               trainable=trainable)

    running_mean = _stat("/mean", 0.0, False)
    running_var = _stat("/var", 1e-4, False)
    eval_out = (x - running_mean) / tf.sqrt(running_var)

    batch_mean = mean(x, axis=0, keepdims=True)
    # NOTE(review): this is the mean of x**2 (uncentred second moment), not
    # the variance around batch_mean -- confirm that this is intended.
    batch_var = mean(tf.square(x), axis=0, keepdims=True)
    train_out = (x - batch_mean) / tf.sqrt(batch_var)

    keep = 1 - gamma
    updates.extend([
        tf.assign(running_mean, running_mean * gamma + batch_mean * keep),
        tf.assign(running_var, running_var * gamma + batch_var * keep),
    ])

    normalised = switch(phase, train_out, eval_out)
    return normalised * _stat("/scaling", 1.0, True) + _stat("/translation", 0.0, True)
# ================================================================
# Mathematical utils
# ================================================================
def __init__(self, embedding):
    """Build the similarity graph and cache a row-normalised copy of ``embedding``.

    Args:
        embedding: 2-D array with a ``shape`` attribute. Its second dimension
            sets the width of both placeholders. It is fed through
            ``self.inputs`` once to compute ``self.embedding``.
    """
    width = embedding.shape[1]
    self.sess = tf.Session()
    self.inputs = tf.placeholder(tf.float32, [None, width], name='inputs')
    self.test_vec = tf.placeholder(tf.float32, [1, width], name='test_vec')

    # Dot product of every input row with the single test vector.
    self.cos_distance = tf.matmul(self.inputs, tf.transpose(self.test_vec))

    # Divide each row by its Euclidean norm and evaluate the result once.
    squared_norm = tf.reduce_sum(tf.square(self.inputs), axis=1, keep_dims=True)
    self.normalized = self.inputs / tf.sqrt(squared_norm)
    self.embedding = self.sess.run(self.normalized,
                                   feed_dict={self.inputs: embedding})
#---------------------------------------------------------------------------
def __call__(self, z):
    """Return the negative log of an equal-weight mixture of six radial bumps.

    Columns 0 and 1 of ``z`` are used as the planar coordinates. The six
    centres are (+-5, 0) and (+-2.5, +-2.5*sqrt(3)).
    """
    z1 = tf.reshape(tf.slice(z, [0, 0], [-1, 1]), [-1])
    z2 = tf.reshape(tf.slice(z, [0, 1], [-1, 1]), [-1])
    root3 = np.sqrt(3)

    # Twice the Euclidean distance to each centre.
    radii = [
        tf.sqrt((z1 - 5) * (z1 - 5) + z2 * z2) * 2,
        tf.sqrt((z1 + 5) * (z1 + 5) + z2 * z2) * 2,
        tf.sqrt((z1 - 2.5) * (z1 - 2.5) + (z2 - 2.5 * root3) * (z2 - 2.5 * root3)) * 2,
        tf.sqrt((z1 + 2.5) * (z1 + 2.5) + (z2 + 2.5 * root3) * (z2 + 2.5 * root3)) * 2,
        tf.sqrt((z1 - 2.5) * (z1 - 2.5) + (z2 + 2.5 * root3) * (z2 + 2.5 * root3)) * 2,
        tf.sqrt((z1 + 2.5) * (z1 + 2.5) + (z2 - 2.5 * root3) * (z2 - 2.5 * root3)) * 2,
    ]
    densities = [tf.exp(-0.5 * v * v) / tf.sqrt(2 * np.pi * 0.25) for v in radii]

    # Accumulate left to right to keep the original summation order.
    mixture = densities[0]
    for density in densities[1:]:
        mixture = mixture + density
    return -tf.log(mixture / 6)
def batchnormalize(X, eps=1e-8, g=None, b=None):
    """Normalise ``X`` with its own batch statistics.

    Rank-4 inputs are reduced over axes [0, 1, 2]; rank-2 inputs over axis 0.
    When both ``g`` and ``b`` are given, the result is scaled by ``g`` and
    shifted by ``b`` (each reshaped to broadcast over the last axis).

    Raises:
        NotImplementedError: for any rank other than 2 or 4.
    """
    rank = X.get_shape().ndims
    if rank == 4:
        axes, affine_shape = [0, 1, 2], [1, 1, 1, -1]
    elif rank == 2:
        axes, affine_shape = 0, [1, -1]
    else:
        raise NotImplementedError

    centre = tf.reduce_mean(X, axes)
    variance = tf.reduce_mean(tf.square(X - centre), axes)
    X = (X - centre) / tf.sqrt(variance + eps)

    if g is not None and b is not None:
        g = tf.reshape(g, affine_shape)
        b = tf.reshape(b, affine_shape)
        X = X * g + b
    return X
def Grad_Penalty(real_data,fake_data,Discriminator,config):
    '''
    Gradient penalty from "Improved training of Wasserstein".

    Evaluates the discriminator on random interpolations between real and
    fake samples and penalises the squared deviation of the gradient norm
    from 1, instead of explicitly constraining the Lipschitz constant.

    Returns:
        (grad_cost, slopes): the penalty scaled by ``config.lambda_W`` and
        the per-sample gradient norms.
    '''
    batch = config.batch_size
    # One mixing coefficient per sample (requires a fixed batch size).
    mix = tf.random_uniform([batch, 1], 0., 1.)
    blended = mix*real_data + ((1-mix)*fake_data)
    # Element [1] of the discriminator output is used as the logits.
    critic_logits = Discriminator(blended, batch,
                                  n_hidden=config.critic_hidden_size,
                                  config=config, reuse=True)[1]
    grads = tf.gradients(critic_logits, [blended])[0]
    slopes = tf.sqrt(tf.reduce_sum(tf.square(grads), reduction_indices=[1]))
    penalty = tf.reduce_mean((slopes-1)**2)
    return config.lambda_W*penalty, slopes
def l2_loss(tensor, weight=1.0, scope=None, normalize=False):
    """Define a L2Loss, useful for regularization, i.e. weight decay.

    The resulting op is also added to ``LOSSES_COLLECTION``.

    Args:
      tensor: tensor to regularize.
      weight: an optional weight to modulate the loss; only applied when
        ``normalize`` is False.
      scope: Optional scope for op_scope.
      normalize: if True, return
        ``sqrt(sqrt(l2_loss(tensor)) / size(tensor))`` instead of the
        weighted loss.

    Returns:
      the L2 loss op.
    """
    with tf.op_scope([tensor], scope, 'L2Loss'):
        weight = tf.convert_to_tensor(weight,
                                      dtype=tensor.dtype.base_dtype,
                                      name='loss_weight')
        if not normalize:
            loss = tf.mul(weight, tf.nn.l2_loss(tensor), name='value')
        else:
            per_element = tf.sqrt(tf.nn.l2_loss(tensor)) / tf.to_float(tf.size(tensor))
            loss = tf.sqrt(per_element, name='value')
        tf.add_to_collection(LOSSES_COLLECTION, loss)
        return loss
def recode_cost(self, inputs, variation, eps=1e-5, **kwargs):
    """
    Cost for a batch of inputs under the current parameters.

    Encodes ``inputs`` into a latent mean and log-variance, draws a latent
    sample using the supplied noise ``variation``, decodes it, and returns
    half the batch mean of (likelihood term + latent term).

    Args:
        inputs: batch of samples.
        variation: noise multiplied by the latent standard deviation.
        eps: added inside the log of the decoded variance.
        **kwargs: ignored.
    """
    weights = self.params

    enc = self.get_h_inputs(inputs)
    z_mu = tf.matmul(enc, weights['Mhz']) + weights['bMhz']
    z_sig = tf.matmul(enc, weights['Shz']) + weights['bShz']

    # Latent-space term; z_sig is used as a log-variance (see the sample below).
    latent_term = -tf.reduce_sum(1 + z_sig - z_mu**2 - tf.exp(z_sig), 1)

    z = z_mu + tf.sqrt(tf.exp(z_sig)) * variation
    dec = self.get_h_latents(z)
    x_mu = self.decoding(tf.matmul(dec, weights['Mhx']) + weights['bMhx'])
    x_sig = self.decoding(tf.matmul(dec, weights['Shx']) + weights['bShx'])

    # Decoding likelihood term. NOTE(review): eps guards the log but not the
    # division by x_sig -- confirm x_sig cannot reach zero.
    likelihood_term = tf.reduce_sum(tf.log(x_sig + eps) +
                                    (inputs - x_mu)**2 / x_sig, 1)

    return .5 * tf.reduce_mean(likelihood_term + latent_term)
def Minibatch_Discriminator(input, num_kernels=100, dim_per_kernel=5, init=False, name='MD'):
    """Append minibatch-discrimination features to ``input``.

    ``input`` is reshaped to ``[batchsize, df_dim*4]`` (both names come from
    the enclosing module), projected with a normalised weight tensor, and
    compared across the batch with an L1 distance. The resulting features
    are concatenated to the reshaped input along axis 1.

    Args:
        input: feature tensor from the discriminator.
        num_kernels: number of kernels in the projection.
        dim_per_kernel: size of each kernel.
        init: not used by this function.
        name: prefix for the created variables.
    """
    num_inputs = df_dim*4
    theta = tf.get_variable(name+"/theta", [num_inputs, num_kernels, dim_per_kernel],
                            initializer=tf.random_normal_initializer(stddev=0.05))
    log_weight_scale = tf.get_variable(name+"/lws", [num_kernels, dim_per_kernel],
                                       initializer=tf.constant_initializer(0.0))
    # Rescale theta so each kernel column has norm exp(log_weight_scale).
    column_gain = tf.exp(log_weight_scale)/tf.sqrt(tf.reduce_sum(tf.square(theta), 0))
    W = tf.mul(theta, tf.expand_dims(column_gain, 0))
    W = tf.reshape(W, [-1, num_kernels*dim_per_kernel])

    x = tf.reshape(input, [batchsize, num_inputs])
    activation = tf.reshape(tf.matmul(x, W), [-1, num_kernels, dim_per_kernel])

    # Pairwise L1 distances between samples, with the diagonal zeroed out.
    pairwise = tf.sub(tf.expand_dims(activation, 3),
                      tf.expand_dims(tf.transpose(activation, [1, 2, 0]), 0))
    l1 = tf.reduce_sum(tf.abs(pairwise), 2)
    off_diagonal = 1-tf.expand_dims(tf.constant(np.eye(batchsize), dtype=np.float32), 1)
    abs_dif = tf.mul(l1, off_diagonal)

    numerator = tf.reduce_sum(tf.exp(-abs_dif), 2)
    denominator = tf.reduce_sum(tf.exp(-abs_dif))
    f = numerator/denominator
    print(f.get_shape())
    print(input.get_shape())
    return tf.concat(1, [x, f])
# a2_transformer_classification.py -- file source
# Project: text_classification
# Author: brightmart
# Project source
# File source
# Views: 32
# Favorites: 0
# Likes: 0
# Comments: 0
def inference(self):
    """Build the classification logits from the encoder stack.

    Building blocks (as described by the original author):
    encoder: 6 layers, each with two sub-layers -- multi-head self-attention
        followed by a position-wise fully connected feed-forward network,
        each wrapped as LayerNorm(x + Sublayer(x)); all dimensions 512.
    decoder: 6 layers, each with three sub-layers; the second performs
        multi-head attention over the output of the encoder stack.
    Only the encoder is instantiated in this method.
    """
    # 1. Embed the input, scale by sqrt(d_model) and add a learned
    #    per-position term of shape [sequence_length, 1].
    embedded = tf.nn.embedding_lookup(self.Embedding, self.input_x)  # [None, sequence_length, embed_size]
    scale = tf.sqrt(tf.cast(self.d_model, dtype=tf.float32))
    embedded = tf.multiply(embedded, scale)
    input_mask = tf.get_variable("input_mask", [self.sequence_length, 1],
                                 initializer=self.initializer)
    embedded = tf.add(embedded, input_mask)  # [None, sequence_length, embed_size]

    # 2. Encoder: the embedded input is passed for both encoder inputs.
    encoder_class = Encoder(self.d_model, self.d_k, self.d_v, self.sequence_length,
                            self.h, self.batch_size, self.num_layer,
                            embedded, embedded,
                            dropout_keep_prob=self.dropout_keep_prob,
                            use_residual_conn=self.use_residual_conn)
    Q_encoded, K_encoded = encoder_class.encoder_fn()
    # Flatten everything except the batch dimension.
    Q_encoded = tf.reshape(Q_encoded, shape=(self.batch_size, -1))

    # 3. Linear projection to the logits.
    with tf.variable_scope("output"):
        logits = tf.matmul(Q_encoded, self.W_projection) + self.b_projection
    print("logits:", logits)
    return logits
def yolo_loss(labels, predictions, mask):
    """Sum-of-squares detection loss over masked and unmasked cells.

    For cells selected by ``mask``: prediction channel 0 is the confidence
    (pushed towards 1), channels 1:3 are compared with label channels :2,
    the square roots of channels 3:5 with the square roots of label
    channels 2:4, and channels 5: with label channels 4:. For cells not
    selected, the confidence is pushed towards 0. The coordinate and size
    terms are weighted by 5.
    """
    obj_labels = tf.boolean_mask(labels, mask)
    obj_preds = tf.boolean_mask(predictions, mask)

    xy_loss = tf.reduce_sum((obj_labels[..., :2] - obj_preds[..., 1:3]) ** 2)
    root_gap = tf.sqrt(obj_preds[..., 3:5]) - tf.sqrt(obj_labels[..., 2:4])
    wh_loss = tf.reduce_sum(root_gap ** 2)
    # An IoU-based confidence target was disabled in the original; the
    # target is the constant 1.
    conf_loss = tf.reduce_sum((1 - obj_preds[..., 0]) ** 2)
    background_conf = tf.boolean_mask(predictions, ~mask)[..., 0]
    no_obj_loss = tf.reduce_sum((background_conf ** 2))
    class_loss = tf.reduce_sum((obj_preds[..., 5:] - obj_labels[..., 4:]) ** 2)

    return 5 * (xy_loss + wh_loss) + conf_loss + no_obj_loss + class_loss
def shrink_soft_threshold(r,rvar,theta):
    """
    Soft threshold function.

        y = sign(x) * max(0, abs(x) - theta[0]*sqrt(rvar)) * scaling

    where scaling is theta[1]. When theta is a scalar or has shape (1,),
    theta itself is the threshold multiplier and no scaling is applied.

    Returns:
        (xhat, dxdr): the thresholded values and the mean over axis 0 of the
        indicator that the threshold was exceeded (times the scaling, if any).
    """
    theta_shape = theta.get_shape()
    has_scale = len(theta_shape) > 0 and theta_shape != (1,)
    if has_scale:
        lam = theta[0] * tf.sqrt(rvar)
        scale = theta[1]
    else:
        lam = theta * tf.sqrt(rvar)
        scale = None

    # A negative threshold is clamped to zero.
    lam = tf.maximum(lam, 0)
    excess = tf.abs(r) - lam
    xhat = tf.sign(r) * tf.maximum(excess, 0)
    dxdr = tf.reduce_mean(tf.to_float(excess > 0), 0)
    if scale is None:
        return (xhat, dxdr)
    return (xhat*scale, dxdr*scale)
def shrink_bgest(r,rvar,theta):
    """Bernoulli-Gaussian MMSE estimator.

    Perform MMSE estimation E[x|r] for
        x   ~ BernoulliGaussian(lambda, xvar1)
        r|x ~ Normal(x, rvar)

    The parameters theta[..., 0] and theta[..., 1] represent
        the variance of non-zero x[i]:     xvar1  = abs(theta[..., 0])
        the probability of non-zero x[i]:  lambda = 1/(exp(theta[..., 1])+1)

    Returns:
        (xhat, dxdr): the estimate and its derivative with respect to r,
        averaged over axis 0.
    """
    xvar1 = abs(theta[..., 0])
    log_odds = theta[..., 1]  # log(1/lambda - 1)
    gain = 1/(1+rvar/xvar1)
    r2scale = r*r*gain/rvar
    rho = tf.exp(log_odds - .5*r2scale) * tf.sqrt(1 + xvar1/rvar)
    rho_plus_1 = rho+1
    xhat = gain*r/rho_plus_1
    slope = gain*((1+rho*(1+r2scale)) / tf.square(rho_plus_1))
    return (xhat, tf.reduce_mean(slope, 0))
def pwlin_grid(r_,rvar_,theta_,dtheta = .75):
    """Piecewise-linear gain with noise-adaptive grid spacing.

    Returns (xhat, dxdr) where, as implemented,
        q    = clip(abs(r) * dtheta / sqrt(rvar), 0, ntheta-1)
        xhat = r * interp(q, theta)
    and interp linearly blends the entries of theta at the integer grid
    points nearest to q.

    All but the last dimension of theta must broadcast to r_,
    e.g. r.shape = (500,1000) is compatible with theta.shape = (500,1,7).
    """
    n_knots = int(theta_.get_shape()[-1])
    spacing_ = dtheta / tf.sqrt(rvar_)
    position_ = tf.clip_by_value(tf.expand_dims(tf.abs(r_)*spacing_, -1), 0.0, n_knots-1.0)
    knots_ = tf.constant(np.arange(n_knots), dtype=tf.float32)
    # Triangular weights along a new last axis: 1 at a knot, 0 one step away.
    blend_ = tf.maximum(0., 1.0-tf.abs(position_ - knots_))
    gain_ = tf.reduce_sum(theta_ * blend_, axis=-1)  # learnable gain
    xhat_ = gain_ * r_
    return (xhat_, tf.gradients(xhat_, r_)[0])
def shrink_spline(r,rvar,theta):
    """Spline-based shrinkage function.

    ``r`` is scaled by ``theta[0]*sqrt(rvar)`` and a piecewise-cubic bump
    ``beta3`` is evaluated on the scaled magnitude (one polynomial on
    [0, 1), another on [1, 2), zero beyond). The output is
    ``r * (theta[1] + theta[2]*beta3)``.

    Returns:
        (xhat, auto_gradients(xhat, r))
    """
    scale = theta[0]*tf.sqrt(rvar)
    rs = tf.sign(r)  # not used below; kept from the original
    mag = tf.abs(r/scale)
    mag_sq = tf.square(mag)
    mag_cu = mag*mag_sq

    # Indicator masks for the two polynomial pieces.
    inner = tf.to_float(mag < 1)
    outer = tf.to_float(mag < 2)-inner

    rest = 2-mag
    rest_sq = tf.square(rest)
    rest_cu = rest*rest_sq

    beta3 = ((2./3 - mag_sq + .5*mag_cu)*inner + (1./6*(rest_cu))*outer)
    xhat = r*(theta[1] + theta[2]*beta3)
    return (xhat, auto_gradients(xhat, r))
def show_shrinkage(shrink_func,theta,**kwargs):
    """Plot a shrinkage function against the identity line.

    Resets the default graph, evaluates ``shrink_func`` on a grid of
    non-negative inputs spanning ``sigmas`` noise standard deviations, and
    shows the result with matplotlib (identity in yellow, output in blue).

    Args:
        shrink_func: callable ``(r, rvar, theta) -> (xhat, dxdr)``.
        theta: parameters passed to ``shrink_func`` (converted with ``tfcf``).
        **kwargs: optional settings --
            seed (default 1): TensorFlow random seed;
            N (default 500), L (default 4): the input grid has shape (N, L);
            sigmas (default 10): upper end of the grid in standard deviations;
            title: figure title, if given.
    """
    tf.reset_default_graph()
    tf.set_random_seed(kwargs.get('seed', 1))

    N = kwargs.get('N', 500)
    L = kwargs.get('L', 4)
    nsigmas = kwargs.get('sigmas', 10)
    shape = (N, L)
    rvar = 1e-4
    r = np.reshape(np.linspace(0, nsigmas, N*L)*math.sqrt(rvar), shape)
    r_ = tfcf(r)
    rvar_ = tfcf(np.ones(L)*rvar)

    # Only the estimate is plotted; the derivative output is discarded.
    xhat_, _ = shrink_func(r_, rvar_, tfcf(theta))

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        xhat = sess.run(xhat_)

    import matplotlib.pyplot as plt
    plt.figure(1)
    plt.plot(r.reshape(-1), r.reshape(-1), 'y')
    plt.plot(r.reshape(-1), xhat.reshape(-1), 'b')
    # dict.has_key() was removed in Python 3; `in` works on 2 and 3.
    if 'title' in kwargs:
        plt.suptitle(kwargs['title'])
    plt.show()
def set_input_shape(self, input_shape):
    """Create the kernel and bias variables and record the output shape.

    Args:
        input_shape: 4-element sequence
            ``(batch_size, rows, cols, input_channels)``.

    The kernels start as random normal values divided by their norm over
    axes (0, 1, 2). ``self.output_shape`` is obtained by running ``fprop``
    on a single-example batch of zeros and has its first entry set to 1.
    """
    # Unpacking also checks that exactly four dimensions were given.
    _batch, _rows, _cols, input_channels = input_shape
    kernel_shape = tuple(self.kernel_shape) + (input_channels, self.output_channels)
    assert len(kernel_shape) == 4
    assert all(isinstance(e, int) for e in kernel_shape), kernel_shape

    init = tf.random_normal(kernel_shape, dtype=tf.float32)
    norm = tf.sqrt(1e-7 + tf.reduce_sum(tf.square(init), axis=(0, 1, 2)))
    init = init / norm
    self.kernels = tf.Variable(init)
    self.b = tf.Variable(np.zeros((self.output_channels,)).astype('float32'))

    # Probe fprop with one all-zero example to discover the output shape.
    probe_shape = list(input_shape)
    probe_shape[0] = 1
    probe_output = self.fprop(tf.zeros(probe_shape))
    dims = [int(e) for e in probe_output.get_shape()]
    dims[0] = 1
    self.output_shape = tuple(dims)
def sub_sampling(data, word_counter, word_dict, sampling_rate):
    """Randomly drop frequent words from every sentence.

    Each word is discarded with probability
    ``max(0, 1 - sqrt(sampling_rate / f))``, where ``f`` is the word's count
    divided by the total number of tokens in ``data``.

    Args:
        data: list of sentences, each a list of word ids.
        word_counter: iterable of ``(word, count)`` pairs.
        word_dict: mapping from word to the id used in ``data``.
        sampling_rate: subsampling threshold.

    Returns:
        A new list of sentences with the surviving word ids, in order.

    Raises:
        KeyError: if ``data`` contains an id with no entry in ``word_counter``.
    """
    total_words = sum(len(sentence) for sentence in data)

    discard_prob = {}
    for word, count in word_counter:
        # float() forces true division on Python 2 as well; an empty corpus
        # or a zero count gives frequency 0 instead of a ZeroDivisionError.
        freq = float(count) / total_words if total_words else 0.0
        if freq > 0:
            p = max(0, 1 - math.sqrt(sampling_rate / freq))
        else:
            # Frequency 0 is the limit where the formula never discards.
            p = 0
        discard_prob[word_dict[word]] = p

    new_data = []
    for sentence in data:
        kept = []
        for word in sentence:
            prob = discard_prob[word]
            if random.random() > prob:
                kept.append(word)
        new_data.append(kept)
    return new_data
def negative_l2_distance(x1, x2, axis=1):
    """
    Negated Euclidean distance between two tensors.

    .. math:: L = - \\sqrt{\\sum_i (x1_i - x2_i)^2}

    Args:
        x1: First term.
        x2: Second term.
        axis: Reduction Indices.

    Returns:
        Similarity Value.
    """
    squared_gap = tf.square(x1 - x2)
    return - tf.sqrt(tf.reduce_sum(squared_gap, axis=axis))
def adam_updates(params, cost_or_grads, lr=0.001, mom1=0.9, mom2=0.999):
    ''' Adam optimizer.

    Args:
        params: list of variables to update.
        cost_or_grads: either a list of gradients (one per parameter) or a
            cost tensor that is differentiated with respect to ``params``.
        lr: learning rate.
        mom1: decay of the first-moment average; when not positive, the raw
            gradient is used and no first-moment variable is created.
        mom2: decay of the second-moment average.

    Returns:
        A single op grouping all moment, parameter and step-counter updates.
    '''
    if type(cost_or_grads) is list:
        grads = cost_or_grads
    else:
        grads = tf.gradients(cost_or_grads, params)

    # NOTE(review): the second positional argument of tf.Variable is
    # presumably `trainable`, not `name`, so these strings may not be applied
    # as variable names -- confirm against the TensorFlow version in use.
    step = tf.Variable(1., 'adam_t')
    ops = []
    for p, g in zip(params, grads):
        second = tf.Variable(tf.zeros(p.get_shape()), p.name + '_adam_mg')
        if mom1 > 0:
            first = tf.Variable(tf.zeros(p.get_shape()), p.name + '_adam_v')
            first_t = mom1 * first + (1. - mom1) * g
            # Bias correction for the first moment.
            first_hat = first_t / (1. - tf.pow(mom1, step))
            ops.append(first.assign(first_t))
        else:
            first_hat = g
        second_t = mom2 * second + (1. - mom2) * tf.square(g)
        # Bias correction for the second moment.
        second_hat = second_t / (1. - tf.pow(mom2, step))
        direction = first_hat / tf.sqrt(second_hat + 1e-8)
        new_p = p - lr * direction
        ops.append(second.assign(second_t))
        ops.append(p.assign(new_p))
    ops.append(step.assign_add(1))
    return tf.group(*ops)
def add_param(self, spec, shape, name, **kwargs):
    """Create a parameter, applying weight normalization to weight tensors.

    The parameter is created with ``add_param_plain``. If its name starts
    with "W" and ``self.weight_normalization`` is set, it is re-expressed as
    ``v * g / ||v||`` with a new gain parameter ``<name>_wn/g``; the norm is
    taken over axis 0 for rank-2 tensors and axes [0, 1, 2] for rank-4.

    Raises:
        NotImplementedError: for a weight tensor whose rank is not 2 or 4.
    """
    param = self.add_param_plain(spec, shape, name, **kwargs)

    # Hacky: a name starting with "W" is taken to mean a weight tensor.
    is_weight = name is not None and name.startswith("W") and self.weight_normalization
    if not is_weight:
        return param

    rank = len(param.get_shape())
    if rank == 2:
        gain_axis, gain_shape, norm_axes = 1, (1, -1), 0
    elif rank == 4:
        gain_axis, gain_shape, norm_axes = 3, (1, 1, 1, -1), [0, 1, 2]
    else:
        raise NotImplementedError

    v = param
    g = self.add_param_plain(tf.ones_initializer, (shape[gain_axis],), name=name + "_wn/g")
    v_norm = tf.sqrt(tf.reduce_sum(tf.square(v), norm_axes, keep_dims=True))
    return v * (tf.reshape(g, gain_shape) / v_norm)
def apply_ln(layer):
    """Return a layer-normalization function bound to ``layer``.

    The returned ``_normalize(x, prefix)`` normalises ``x`` along axis 1 and
    applies a scale and bias. Both are created on first use through
    ``layer.add_param`` and cached in ``layer.norm_params`` under
    ``<prefix>_ln/scale`` and ``<prefix>_ln/bias``.
    """
    def _normalize(x, prefix):
        EPS = 1e-5
        dim = x.get_shape()[-1].value
        bias_key = prefix + "_ln/bias"
        scale_key = prefix + "_ln/scale"
        cache = layer.norm_params

        # Create each parameter once, then reuse it on later calls.
        if bias_key not in cache:
            cache[bias_key] = layer.add_param(
                tf.zeros_initializer, (dim,), name=bias_key, regularizable=False)
        if scale_key not in cache:
            cache[scale_key] = layer.add_param(
                tf.ones_initializer, (dim,), name=scale_key)
        bias = cache[bias_key]
        scale = cache[scale_key]

        mean, var = tf.nn.moments(x, axes=[1], keep_dims=True)
        return (x - mean) / tf.sqrt(var + EPS) * scale + bias
    return _normalize
def __init__(self, name, shape, initial_stdev = 2.0, initial_prec_a = 5.0, initial_prec_b = 1.0, a0 = 1.0, b0 = 1.0, fixed_prec = False, mean_init_std = None):
    """Create the mean, log-variance and precision tensors.

    Args:
        name: variable scope for everything created here.
        shape: shape of the mean and log-variance variables; the precision
            terms have length ``shape[-1]``.
        initial_stdev: initial standard deviation; logvar starts at
            ``log(initial_stdev**2)``.
        initial_prec_a, initial_prec_b: initial values of the two terms whose
            ratio is ``self.prec``.
        a0, b0: stored unchanged on the instance.
        fixed_prec: if True the precision terms are constants, otherwise
            variables.
        mean_init_std: half-width of the uniform initializer for the mean;
            defaults to ``1 / sqrt(shape[-1])``.
    """
    if mean_init_std is None:
        mean_init_std = 1.0 / np.sqrt(shape[-1])
    # Constants when the precision is fixed, variables otherwise.
    make_prec = tf.constant if fixed_prec else tf.Variable
    with tf.variable_scope(name) as scope:
        self.mean = tf.Variable(tf.random_uniform(shape, minval=-mean_init_std, maxval=mean_init_std))
        self.logvar = tf.Variable(np.log(initial_stdev**2.0) * np.ones(shape), name = "logvar", dtype = tf.float32)
        self.prec_a = make_prec(initial_prec_a * np.ones(shape[-1]), name = "prec_a", dtype = tf.float32)
        self.prec_b = make_prec(initial_prec_b * np.ones(shape[-1]), name = "prec_b", dtype = tf.float32)
        self.prec = tf.div(self.prec_a, self.prec_b, name = "prec")
        self.var = tf.exp(self.logvar, name = "var")
        self.a0 = a0
        self.b0 = b0
        self.shape = shape
def __init__(self, name, shape, initial_stdev = 2.0, initial_prec = 5.0, a0 = 1.0, b0 = 1.0):
    """Create the mean and log-variance variables and a precision placeholder.

    Args:
        name: variable scope for everything created here.
        shape: shape of the mean and log-variance variables.
        initial_stdev: initial standard deviation; logvar starts at
            ``log(initial_stdev**2)``.
        initial_prec: value repeated ``shape[-1]`` times into the NumPy array
            ``self.prec``.
        a0, b0: stored unchanged on the instance.
    """
    last_dim = shape[-1]
    # The mean is drawn uniformly from [-bound, bound].
    bound = 1.0 / np.sqrt(last_dim)
    initial_logvar = np.log(initial_stdev**2.0) * np.ones(shape)
    with tf.variable_scope(name) as scope:
        self.mean = tf.Variable(tf.random_uniform(shape, minval=-bound, maxval=bound))
        self.logvar = tf.Variable(initial_logvar, name = "logvar", dtype = tf.float32)
        # The precision is kept as a NumPy array plus a placeholder of the same length.
        self.prec = np.repeat(initial_prec, last_dim)
        self.prec_ph= tf.placeholder(shape=last_dim, name="prec", dtype = tf.float32)
        self.var = tf.exp(self.logvar, name = "var")
        self.a0 = a0
        self.b0 = b0
        self.shape = shape
# def prec_div(self):
# return - tf.reduce_sum(gammaPrior(self.prec_a, self.prec_b, self.a0, self.b0))
## outputs E_q[ log N( x | 0, prec^-1) ] + Entropy(q(x))
## where x is the normally distributed variable