def _beta_update_raw_tfidf(self):
'''
Run only once - it does not depend on other parameters.
'''
for nodeid in xrange(self.D):
self.beta[nodeid] = self.W[self.node_vec == nodeid, :].sum(axis=0)
for nodeid in xrange(self.D):
for wordid in xrange(self.beta.shape[1]):
docs_cnt = np.sum(self.W[self.node_vec == nodeid, wordid] >= 1)
docs_cnt += 1  # smooth by adding one
# the 1 + log(...) keeps a nonzero weight for words that occur in every
# document, though in practice that rarely happens
self.beta[nodeid][wordid] *= 1 + np.log(self.W.shape[0] * 1. / docs_cnt)
# Laplace smoothing to avoid zeros!
self.beta += 1
self._normalize_beta_rowwise()
return self.beta
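# A minimal self-contained sketch (hypothetical, not part of the original class) of the same
# raw-TF-IDF weighting used above: per-node term counts scaled by 1 + log(N / df) with
# add-one document-frequency smoothing, then Laplace smoothing and row normalization.
import numpy as np

def tfidf_beta(W, node_vec, D):
    """W: (num_docs, vocab) count matrix; node_vec: node id per doc; D: number of nodes."""
    beta = np.zeros((D, W.shape[1]))
    for nodeid in range(D):
        beta[nodeid] = W[node_vec == nodeid].sum(axis=0)   # raw term frequency per node
    docs_cnt = (W >= 1).sum(axis=0) + 1                    # smoothed document frequency
    beta *= 1 + np.log(W.shape[0] / docs_cnt)              # idf weighting (same for every node)
    beta += 1                                              # Laplace smoothing to avoid zeros
    return beta / beta.sum(axis=1, keepdims=True)          # row-wise normalization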
def b_value(mags, mt, perc=[2.5, 97.5], n_reps=None):
"""Compute the b-value and optionally its confidence interval."""
# Extract magnitudes above completeness threshold
m = mags[mags >= mt]
# Compute b-value
b = (np.mean(m) - mt) * np.log(10)
# Draw bootstrap replicates
if n_reps is None:
return b
else:
m_bs_reps = dcst.draw_bs_reps(m, np.mean, size=n_reps)
# Compute b-value from replicates
b_bs_reps = (m_bs_reps - mt) * np.log(10)
# Compute confidence interval
conf_int = np.percentile(b_bs_reps, perc)
return b, conf_int
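# Hypothetical usage sketch on synthetic magnitudes; dcst.draw_bs_reps (dc_stat_think) is
# replaced by a plain NumPy bootstrap here so the example is self-contained.
import numpy as np

rng = np.random.default_rng(0)
mt = 3.0
mags = mt + rng.exponential(scale=0.45, size=2000)   # exponential tail above completeness
m = mags[mags >= mt]
b = (np.mean(m) - mt) * np.log(10)                   # same point estimate as b_value()
bs_means = np.array([np.mean(rng.choice(m, size=len(m))) for _ in range(1000)])
conf_int = np.percentile((bs_means - mt) * np.log(10), [2.5, 97.5])
print(b, conf_int)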
def f(w, lamb):
"""
Eq. (2) in problem 2
Non-vectorized, slow
"""
total = 0
nrows = X.shape[0]
for i in range(nrows):
current = 1 + np.exp(-y[i] * X[i, ].dot(w))
total += np.log(current)
total += (lamb / 2) * w.dot(w)
return total
def f2(w, lamb):
"""
Eq. (2) in problem 2
Vectorized (no explicit loops), fast
"""
yxTw = y * X.dot(w)
firstpart = np.log(1 + np.exp(-yxTw))
total = firstpart.sum()
total += (lamb / 2) * w.dot(w)
return total
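# Hypothetical consistency check: f and f2 above read X and y from module scope, so with
# made-up data bound to those names the loop and vectorized versions should agree.
import numpy as np

np.random.seed(0)
X = np.random.randn(50, 5)
y = np.random.choice([-1, 1], size=50)
w = np.random.randn(5)
lamb = 0.1
assert np.isclose(f(w, lamb), f2(w, lamb))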
def pac_metric (solution, prediction, task='binary.classification'):
''' Probabilistic Accuracy based on log_loss metric.
We assume the solution is in {0, 1} and prediction in [0, 1].
Otherwise, run normalize_array.'''
debug_flag=False
[sample_num, label_num] = solution.shape
if label_num==1: task='binary.classification'
eps = 1e-15
the_log_loss = log_loss(solution, prediction, task)
# Compute the base log loss (using the prior probabilities)
pos_num = 1.* sum(solution) # float conversion!
frac_pos = pos_num / sample_num # prior proba of positive class
the_base_log_loss = prior_log_loss(frac_pos, task)
# Alternative computation of the same thing (slower)
# Should always return the same thing except in the multi-label case
# For which the analytic solution makes more sense
if debug_flag:
base_prediction = np.empty(prediction.shape)
for k in range(sample_num): base_prediction[k,:] = frac_pos
base_log_loss = log_loss(solution, base_prediction, task)
diff = np.array(abs(the_base_log_loss-base_log_loss))
if len(diff.shape)>0: diff=max(diff)
if diff > 1e-10:
print('Arrggh {} != {}'.format(the_base_log_loss,base_log_loss))
# Exponentiate to turn into an accuracy-like score.
# In the multi-label case, we need to average AFTER taking the exp,
# because exp is a non-linear operation (exp of the mean != mean of the exp)
pac = mvmean(np.exp(-the_log_loss))
base_pac = mvmean(np.exp(-the_base_log_loss))
# Normalize: 0 for random, 1 for perfect
score = (pac - base_pac) / sp.maximum(eps, (1 - base_pac))
return score
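# Simplified illustration (hypothetical numbers, binary case only) of the normalization above,
# using sklearn.metrics.log_loss in place of the module's log_loss/prior_log_loss helpers.
import numpy as np
from sklearn.metrics import log_loss as sk_log_loss

y_true = np.array([0, 0, 1, 1, 1])
y_pred = np.array([0.1, 0.3, 0.7, 0.8, 0.9])
frac_pos = y_true.mean()                               # prior probability of the positive class
ll = sk_log_loss(y_true, y_pred)
ll_base = sk_log_loss(y_true, np.full_like(y_pred, frac_pos))
pac, base_pac = np.exp(-ll), np.exp(-ll_base)
score = (pac - base_pac) / max(1e-15, 1 - base_pac)    # 0 for the prior predictor, 1 for perfect
print(score)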
def log_loss(solution, prediction, task = 'binary.classification'):
''' Log loss for binary and multiclass. '''
[sample_num, label_num] = solution.shape
eps = 1e-15
pred = np.copy(prediction) # work on a copy so the caller's prediction array is not modified
sol = np.copy(solution)
if (task == 'multiclass.classification') and (label_num>1):
# Make sure the lines add up to one for multi-class classification
norma = np.sum(prediction, axis=1)
for k in range(sample_num):
pred[k,:] /= sp.maximum (norma[k], eps)
# Make sure there is a single label active per line for multi-class classification
sol = binarize_predictions(solution, task='multiclass.classification')
# For the base prediction, this solution is ridiculous in the multi-label case
# Bounding of predictions to avoid log(0),1/0,...
pred = sp.minimum (1-eps, sp.maximum (eps, pred))
# Compute the log loss
pos_class_log_loss = - mvmean(sol*np.log(pred), axis=0)
if (task != 'multiclass.classification') or (label_num==1):
# The multi-label case is a bunch of binary problems.
# The second class is the negative class for each column.
neg_class_log_loss = - mvmean((1-sol)*np.log(1-pred), axis=0)
log_loss = pos_class_log_loss + neg_class_log_loss
# Each column is an independent problem, so we average.
# The probabilities in one line do not add up to one.
# log_loss = mvmean(log_loss)
# print('binary {}'.format(log_loss))
# In the multilabel case, the right thing is to AVERAGE, not sum
# We return all the scores so we can normalize correctly later on
else:
# For the multiclass case, the probabilities in one line add up to one.
log_loss = pos_class_log_loss
# We sum the contributions of the columns.
log_loss = np.sum(log_loss)
#print('multiclass {}'.format(log_loss))
return log_loss
def prior_log_loss(frac_pos, task = 'binary.classification'):
''' Baseline log loss. For multiple classes or labels, return the values for each column.'''
eps = 1e-15
frac_pos_ = sp.maximum (eps, frac_pos)
if (task != 'multiclass.classification'): # binary case
frac_neg = 1-frac_pos
frac_neg_ = sp.maximum (eps, frac_neg)
pos_class_log_loss_ = - frac_pos * np.log(frac_pos_)
neg_class_log_loss_ = - frac_neg * np.log(frac_neg_)
base_log_loss = pos_class_log_loss_ + neg_class_log_loss_
# base_log_loss = mvmean(base_log_loss)
# print('binary {}'.format(base_log_loss))
# In the multilabel case, the right thing is to AVERAGE, not sum
# We return all the scores so we can normalize correctly later on
else: # multiclass case
fp = frac_pos_ / sum(frac_pos_) # Need to renormalize the lines in multiclass case
# Only ONE label is active (equal to 1) per line in the multiclass case
pos_class_log_loss_ = - frac_pos * np.log(fp)
base_log_loss = np.sum(pos_class_log_loss_)
return base_log_loss
# sklearn implementations for comparison
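# Hypothetical comparison (the original file's sklearn wrappers are not included in this
# excerpt): on a single binary column, log_loss() above should match sklearn's averaged
# binary log loss, assuming the module helpers (mvmean, sp, binarize_predictions) are available.
import numpy as np
from sklearn.metrics import log_loss as sk_log_loss

sol = np.array([[1], [0], [1], [1]])
pred = np.array([[0.9], [0.2], [0.6], [0.8]])
custom = log_loss(sol, pred, task='binary.classification')
reference = sk_log_loss(sol.ravel(), pred.ravel())
assert np.allclose(custom, reference)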
def sample(self, probs, temperature):
if temperature == 0:
return np.argmax(probs)
probs = probs.astype(np.float64) #convert to float64 for higher precision
probs = np.log(probs) / temperature
probs = np.exp(probs) / math.fsum(np.exp(probs))
return np.argmax(np.random.multinomial(1, probs, 1))
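# Hypothetical usage of the same temperature trick: T < 1 sharpens the distribution,
# T > 1 flattens it, before drawing a single sample.
import math
import numpy as np

probs = np.array([0.1, 0.2, 0.7], dtype=np.float64)
for temperature in (0.5, 1.0, 2.0):
    scaled = np.exp(np.log(probs) / temperature)
    scaled /= math.fsum(scaled)
    idx = np.argmax(np.random.multinomial(1, scaled, 1))
    print(temperature, scaled.round(3), idx)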
#generate a sentence given conv_hidden
def test(self, vocab_size, use_onto_lstm, S_ind_test=None, C_ind_test=None, hierarchical=False, base=2, oov_list=None):
X_test = C_ind_test[:,:-1] if use_onto_lstm else S_ind_test[:,:-1] # remove the last words' hyps in all sentences
Y_inds_test = S_ind_test[:,1:]
if hierarchical:
test_targets = self._factor_target_indices(Y_inds_test, vocab_size, base=base)
else:
test_targets = [self._make_one_hot(Y_inds_test, vocab_size)]
print >>sys.stderr, "Evaluating model on test data"
test_loss = self.model.evaluate(X_test, test_targets)
print >>sys.stderr, "Test loss: %.4f"%test_loss
if oov_list is not None:
oov_inds = [self.dp.word_index[w] for w in oov_list]
non_oov_Y_inds = numpy.copy(Y_inds_test)
for ind in oov_inds:
non_oov_Y_inds[non_oov_Y_inds == ind] = 0
non_oov_test_targets = self._factor_target_indices(non_oov_Y_inds, vocab_size, base=base)
non_oov_test_loss = self.model.evaluate(X_test, non_oov_test_targets)
print >>sys.stderr, "Non-oov test loss: %.4f"%non_oov_test_loss
factored_test_preds = [-((numpy.log(pred) * target).sum(axis=-1)) for pred, target in zip(self.model.predict(X_test), test_targets)]
test_preds = sum(factored_test_preds)
#non_null_probs = []
#for test_pred, inds in zip(test_preds, Y_inds_test):
# wanted_probs = []
# for tp, ind in zip(test_pred, inds):
# if ind != 0:
# wanted_probs.append(tp)
# non_null_probs.append(wanted_probs)
#return non_null_probs
return test_preds
def data_log_likelihood(self, dataSplit, coefficients, variances):
log_likelihood = 0.0
for k in range(self.num_components):
coef_ = coefficients[k]
Beta = coef_.ix[self.endoVar][self.endoVar]
Gamma = coef_.ix[self.endoVar][self.exoVar]
a_ = np.dot(Beta, self.fscores[self.endoVar].T) + np.dot(Gamma, self.fscores[self.exoVar].T)
invert_ = np.linalg.inv(np.array(variances[k]))
exponential = np.exp(-0.5 * np.dot(np.dot(a_.T, invert_), a_))
den = (((2 * np.pi)**(self.Q / 2)) *
np.sqrt(np.linalg.det(variances[k])))
probabilities = exponential[0] / den
log_likelihood += np.log(probabilities).sum()
print(log_likelihood)
return log_likelihood
def BTS(data):
n = data.shape[0]
p = data.shape[1]
chi2 = -(n - 1 - (2 * p + 5) / 6) * \
np.log(np.linalg.det(pd.DataFrame.corr(data)))
df = p * (p - 1) / 2
pvalue = scipy.stats.distributions.chi2.sf(chi2, df)
return [chi2, pvalue]
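# Hypothetical usage of the sphericity statistic above (Bartlett's test) on uncorrelated
# random data, where the test should not reject (large p-value). The imports bind the
# np/pd/scipy names that BTS() expects at module level.
import numpy as np
import pandas as pd
import scipy.stats

df = pd.DataFrame(np.random.randn(200, 6))
chi2, pvalue = BTS(df)
print(chi2, pvalue)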
def build_model(self):
self.x = tf.placeholder(tf.float32, [self.reader.vocab_size], name="input")
self.x_idx = tf.placeholder(tf.int32, [None], name="x_idx")
self.build_encoder()
self.build_generator()
# Kullback Leibler divergence
self.e_loss = -0.5 * tf.reduce_sum(1 + self.log_sigma_sq - tf.square(self.mu) - tf.exp(self.log_sigma_sq))
# Log likelihood
self.g_loss = -tf.reduce_sum(tf.log(tf.gather(self.p_x_i, self.x_idx) + 1e-10))
self.loss = self.e_loss + self.g_loss
self.encoder_var_list, self.generator_var_list = [], []
for var in tf.trainable_variables():
if "encoder" in var.name:
self.encoder_var_list.append(var)
elif "generator" in var.name:
self.generator_var_list.append(var)
# optimizer for alternative update
self.optim_e = tf.train.AdamOptimizer(learning_rate=self.lr) \
.minimize(self.e_loss, global_step=self.step, var_list=self.encoder_var_list)
self.optim_g = tf.train.AdamOptimizer(learning_rate=self.lr) \
.minimize(self.g_loss, global_step=self.step, var_list=self.generator_var_list)
# optimizer for one shot update
self.optim = tf.train.AdamOptimizer(learning_rate=self.lr) \
.minimize(self.loss, global_step=self.step)
_ = tf.scalar_summary("encoder loss", self.e_loss)
_ = tf.scalar_summary("generator loss", self.g_loss)
_ = tf.scalar_summary("total loss", self.loss)
def edge_logits(self):
"""Get edge log probabilities on the complete graph."""
def logprob(self, data):
"""Compute non-normalized log probabilies of many rows of data."""
def logprob(self, data):
logprobs = np.stack(
[server.logprob(data) for server in self._ensemble])
logprobs = logsumexp(logprobs, axis=0)
logprobs -= np.log(len(self._ensemble))
assert logprobs.shape == (data.shape[0], )
return logprobs
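# Hypothetical numeric check of the ensemble averaging above: logsumexp over members minus
# log(K) is a numerically stable log of the mean probability across the ensemble.
import numpy as np
from scipy.special import logsumexp

lp = np.log(np.random.dirichlet([1.0, 1.0, 1.0], size=4)).T   # fake (3 members, 4 rows) log probs
stable = logsumexp(lp, axis=0) - np.log(lp.shape[0])
naive = np.log(np.exp(lp).mean(axis=0))
assert np.allclose(stable, naive)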
def edge_logits(self):
"""A [K]-shaped array of log odds of edges in the complete graph."""
return self._server.edge_logits
def make_model_path(name):
log_path = os.path.join('log', name)
if os.path.isdir(log_path):
subprocess.call(('rm -rf %s' % log_path).split())
os.makedirs(log_path)
return log_path
def compute_log_sum(val):
min_val = np.min(val, axis=0, keepdims=True)
return np.mean(min_val - np.log(np.mean(np.exp(-val + min_val), axis=0)))
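# Hypothetical check: compute_log_sum is a numerically stable column-wise
# -log(mean(exp(-val), axis=0)), averaged over columns; for small values it matches the naive form.
import numpy as np

val = np.random.rand(8, 5)
stable = compute_log_sum(val)
naive = np.mean(-np.log(np.mean(np.exp(-val), axis=0)))
assert np.isclose(stable, naive)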
def getInitialHyps(self, X, C, y):
self.logdetXX = np.linalg.slogdet(C.T.dot(C))[1]
hyp0_sig2e = [0.5*np.log(0.5*y.var())]
Linreg = sklearn.linear_model.LinearRegression(fit_intercept=False, normalize=False, copy_X=False)
Linreg.fit(C, y)
hyp0_fixedEffects = Linreg.coef_
return hyp0_sig2e, hyp0_fixedEffects
def rankRegions(self, X, C, y, pos, regionLength, reml=True):
#get region list
regionsList = self.createRegionsList(pos, regionLength)
#precompute log determinant of covariates
XX = C.T.dot(C)
[Sxx,Uxx]= la.eigh(XX)
logdetXX = np.log(Sxx).sum()
#score each region
betas = np.zeros(len(regionsList))
for r_i, r in enumerate(regionsList):
regionSize = len(r)
if (self.verbose and r_i % 1000==0):
print 'Testing region ' + str(r_i+1)+'/'+str(len(regionsList)),
print 'with', regionSize, 'SNPs\t'
s,U = self.eigenDecompose(X[:, np.array(r)], None)
sig2g_kernel, sig2e_kernel, fixedEffects, ll = self.optSigma2(U, s, y, C, logdetXX, reml)
betas[r_i] = ll
return regionsList, betas
### this code is taken from the FastLMM package (see attached license)###