def KL_validate(data_true, data_predicted, n_bins, x_range, n_samples=10000):
    '''Bootstrap validation: p-value = Pr(KL(bootstrap original || bootstrap original) > KL(original || simulated)).'''
    n = data_true.shape[0]
    hist_true, _ = np.histogram(data_true, bins=n_bins, range=x_range)
    hist_predicted, bin_edges = np.histogram(data_predicted, bins=n_bins, range=x_range)
    # add-one smoothing so empty bins do not produce infinite KL terms
    simulated_KL = sc.entropy(hist_true + 1, hist_predicted + 1)
    subsampled_KL = []
    for i in range(n_samples):
        # two independent bootstrap resamples of the original data
        index1 = np.random.choice(n, n, replace=True)
        index2 = np.random.choice(n, n, replace=True)
        sample1 = data_true[index1]
        sample2 = data_true[index2]
        hist_sample1, _ = np.histogram(sample1, bins=n_bins, range=x_range)
        hist_sample2, _ = np.histogram(sample2, bins=n_bins, range=x_range)
        subsampled_KL.append(sc.entropy(hist_sample2 + 1, hist_sample1 + 1))
    subsampled_KL = sorted(subsampled_KL)
    # fraction of bootstrap KL values that exceed the simulated KL
    pval = sum(simulated_KL < i for i in subsampled_KL) / float(n_samples)
    conf_interval = (0, subsampled_KL[int(math.ceil(n_samples * 0.95)) - 1])
    return simulated_KL, conf_interval, pval, n
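# A minimal usage sketch (not from the original source), assuming the imports the snippet
# appears to rely on: `import math`, `import numpy as np`, and `from scipy import stats as sc`.
import math
import numpy as np
from scipy import stats as sc

rng = np.random.RandomState(0)
data_true = rng.normal(0.0, 1.0, size=2000)
data_predicted = rng.normal(0.1, 1.1, size=2000)   # model draws to validate against the data
kl, ci, pval, n = KL_validate(data_true, data_predicted,
                              n_bins=30, x_range=(-5, 5), n_samples=2000)
print(kl, ci, pval, n)   # a large pval means the simulated KL sits inside the bootstrap null range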
# CONTOUR PLOTS
Python entropy() examples (source code)
def first_order(feature, aggregates, verbose=False):
    if not isinstance(aggregates, list):
        aggregates = [aggregates]
    for aggregate in aggregates:
        if verbose:
            print('  first order computation: ' + aggregate)
        if aggregate == 'log':
            feature = np.log(feature)
        elif aggregate == 'sqrt':
            feature = np.sqrt(feature)
        elif aggregate == 'minlog':
            feature = np.log(1 - feature)
        elif aggregate == 'minsqrt':
            feature = np.sqrt(1 - feature)
        elif aggregate == 'mean':
            # feature = np.mean(feature, axis=0)
            feature = np.nanmean(feature, axis=0)
        elif aggregate == 'var':
            feature = np.var(feature, axis=0)
        elif aggregate == 'std':
            # feature = np.std(feature, axis=0)
            feature = np.nanstd(feature, axis=0)
        elif aggregate == 'stdmean':
            feature = np.hstack([np.mean(feature, axis=0), np.std(feature, axis=0)])
        elif aggregate == 'cov':
            # np.cov has no axis argument; rowvar=False treats columns as variables
            feature = np.cov(feature, rowvar=False).flatten()
        elif aggregate == 'totvar':
            feature = np.array([np.mean(np.var(feature, axis=0))])
        elif aggregate == 'totstd':
            feature = np.array([np.mean(np.std(feature, axis=0))])
        elif aggregate == 'entropy':
            feature = feature.flatten()
            feature = np.array([stats.entropy(feature)])
        elif aggregate == 'normentropy':
            feature = feature.flatten()
            feature = np.array([stats.entropy(feature) / np.log(feature.size)])
        elif aggregate == 'information':
            feature = -np.log(feature)
    return feature
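# A minimal usage sketch (not from the original source), assuming `import numpy as np`
# and `from scipy import stats`; a 100-frame, 12-dimensional feature matrix is aggregated.
import numpy as np
from scipy import stats

feats = np.random.rand(100, 12)
print(first_order(feats, 'mean').shape)        # (12,): per-column nan-mean
print(first_order(feats, ['normentropy']))     # flattened entropy divided by log(N), in [0, 1]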
def get_entropy(self, probs):
    """
    Estimate the entropy of a string in Shannons (bits). This method assumes that
    the character frequencies of the input string, given as `probs`, are exactly
    the probability mass function.
    """
    # scipy's entropy() returns the value in nats by default
    ent_nat = entropy(probs)
    # convert nats to Shannons (bits)
    ent_shan = ent_nat / np.log(2)
    return ent_shan
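# Since the surrounding class is not shown, here is a standalone sketch with a hypothetical
# helper `string_entropy_bits` showing how the probability mass function might be built and
# converted to bits, matching the nat-to-Shannon conversion above.
from collections import Counter
import numpy as np
from scipy.stats import entropy

def string_entropy_bits(text):
    # character frequencies -> probability mass function -> entropy in Shannons (bits)
    counts = np.array(list(Counter(text).values()), dtype=float)
    probs = counts / counts.sum()
    return entropy(probs) / np.log(2)

print(string_entropy_bits('abracadabra'))   # ~2.04 bits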
def diversity(dev, gen_test, beam_size, hypo_len, noise_size, per_premise, samples):
    # `Progbar`, `single_generate`, and `cut_zeros` are project-specific helpers
    step = len(dev[0]) // samples
    sind = [i * step for i in range(samples)]
    p = Progbar(per_premise * samples)
    for i in sind:
        hypos = []
        unique_words = []
        hypo_list = []
        premise = dev[0][i]
        prem_list = set(cut_zeros(list(premise)))
        while len(hypos) < per_premise:
            label = np.argmax(dev[2][i])
            words = single_generate(premise, label, gen_test, beam_size, hypo_len, noise_size)
            hypos += [str(ex) for ex in words]
            unique_words += [int(w) for ex in words for w in ex if w > 0]
            hypo_list += [set(cut_zeros(list(ex))) for ex in words]
        # pairwise Jaccard similarities between hypotheses, and hypothesis-premise similarities
        jacks = []
        prem_jacks = []
        for u in range(len(hypo_list)):
            sim_prem = len(hypo_list[u] & prem_list) / float(len(hypo_list[u] | prem_list))
            prem_jacks.append(sim_prem)
            for v in range(u + 1, len(hypo_list)):
                sim = len(hypo_list[u] & hypo_list[v]) / float(len(hypo_list[u] | hypo_list[v]))
                jacks.append(sim)
        avg_dist_hypo = 1 - np.mean(jacks)
        avg_dist_prem = 1 - np.mean(prem_jacks)
        d = entropy(list(Counter(hypos).values()))
        w = entropy(list(Counter(unique_words).values()))
        p.add(len(hypos), [('diversity', d), ('word_entropy', w), ('avg_dist_hypo', avg_dist_hypo), ('avg_dist_prem', avg_dist_prem)])
    arrd = p.sum_values['diversity']
    arrw = p.sum_values['word_entropy']
    arrj = p.sum_values['avg_dist_hypo']
    arrp = p.sum_values['avg_dist_prem']
    return arrd[0] / arrd[1], arrw[0] / arrw[1], arrj[0] / arrj[1], arrp[0] / arrp[1]
def jensen_shannon(P, Q):
    # symmetric Jensen-Shannon divergence via the mixture distribution M
    M = 0.5 * (P + Q)
    return 0.5 * (entropy(P, M) + entropy(Q, M))
# Source: plot_distances.py (project: twitter_LDA_topic_modeling, author: kenneth-orton)
def jensen_shannon_divergence(P, Q):
    # normalise both inputs to probability distributions (L1 norm) before computing the JSD
    _P = np.array(P) / norm(np.array(P), ord=1)
    _Q = np.array(Q) / norm(np.array(Q), ord=1)
    _M = 0.5 * (_P + _Q)
    return 0.5 * (entropy(_P, _M) + entropy(_Q, _M))
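# A minimal usage sketch (not from the original source), assuming `import numpy as np`,
# `from numpy.linalg import norm`, and `from scipy.stats import entropy`; raw counts are fine
# because the function L1-normalises its inputs.
import numpy as np
from numpy.linalg import norm
from scipy.stats import entropy

counts_a = np.array([10, 20, 30, 40])
counts_b = np.array([40, 30, 20, 10])
print(jensen_shannon_divergence(counts_a, counts_b))   # symmetric, bounded above by log(2) nats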
def jensen_shannon_divergence(p, q):
    """Jensen-Shannon divergence between distributions p and q.
    Note: this returns twice the conventional JSD (the 0.5 factor is omitted)."""
    m = (p + q) / 2.0
    return stats.entropy(p, m) + stats.entropy(q, m)
# Source: demo_mi.py (project: Building-Machine-Learning-Systems-With-Python-Second-Edition, author: PacktPublishing)
def mutual_info(x, y, bins=10):
    counts_xy, bins_x, bins_y = np.histogram2d(x, y, bins=(bins, bins))
    counts_x, bins = np.histogram(x, bins=bins)
    counts_y, bins = np.histogram(y, bins=bins)
    # add-one smoothing to avoid log(0)
    counts_xy += 1
    counts_x += 1
    counts_y += 1
    P_xy = counts_xy / np.sum(counts_xy, dtype=float)
    P_x = counts_x / np.sum(counts_x, dtype=float)
    P_y = counts_y / np.sum(counts_y, dtype=float)
    # mutual information in bits, normalised by the sum of the marginal entropies
    I_xy = np.sum(P_xy * np.log2(P_xy / (P_x.reshape(-1, 1) * P_y)))
    return I_xy / (entropy(counts_x) + entropy(counts_y))
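# A minimal usage sketch (not from the original source), assuming `import numpy as np`
# and `from scipy.stats import entropy`; dependent data should score well above independent data.
import numpy as np
from scipy.stats import entropy

rng = np.random.RandomState(0)
x = rng.normal(size=5000)
y_dep = x + 0.1 * rng.normal(size=5000)   # strongly dependent on x
y_ind = rng.normal(size=5000)             # independent of x
print(mutual_info(x, y_dep))              # relatively large normalised MI
print(mutual_info(x, y_ind))              # close to 0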
def create_window_based_features(data, window_size):
    central_fn = np.mean
    # rolling aggregates over windows of increasing length
    ma1 = calcuate_window_operation(data, window_size, central_fn)
    ma2 = calcuate_window_operation(data, 2 * window_size, central_fn)
    ma4 = calcuate_window_operation(data, 4 * window_size, central_fn)
    ma8 = calcuate_window_operation(data, 8 * window_size, central_fn)
    entropy = calcuate_window_operation(data, window_size, stats.entropy)
    stddev = calcuate_window_operation(data, window_size, np.std)
    median_weeks_before = value_before_period(data, 7)
    return np.column_stack((ma1, ma2, ma4, ma8, entropy, stddev, median_weeks_before))
# do cross validation: http://stackoverflow.com/questions/533905/get-the-cartesian-product-of-a-series-of-lists-in-python
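# The helpers `calcuate_window_operation` and `value_before_period` are not part of this snippet.
# A hypothetical sketch of what they might look like, assuming 1-D input and trailing windows
# (the misspelled name is kept because the function above calls it that way):
import numpy as np
import pandas as pd

def calcuate_window_operation(data, window_size, fn):
    # apply `fn` over a trailing rolling window; shorter leading windows are allowed
    return pd.Series(data).rolling(window_size, min_periods=1).apply(fn, raw=True).values

def value_before_period(data, period):
    # value observed `period` steps earlier; the first `period` entries fall back to themselves
    arr = np.asarray(data, dtype=float)
    out = np.empty_like(arr)
    out[:period] = arr[:period]
    out[period:] = arr[:-period]
    return out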
# note: this function shadows scipy's entropy; `scipy_entropy` below is assumed to be
# scipy.stats.entropy imported under an alias
def entropy(X, bins=None):
    """
    Use the Shannon entropy H to describe the distribution of the given sample.
    For calculating the Shannon entropy, the bin edges are needed and can be passed as bins.
    If bins is None, these edges will be calculated using the numpy.histogram function with bins='fd'.
    This uses the Freedman-Diaconis estimator and is fairly resilient to outliers.
    If the input data X is 2D (entropy for more than one bin needed), the histogram will be derived
    once and the same edges used in all bins.
    CAUTION: this is actually a changed behaviour compared to scikit-gstat<=0.1.4.
    :param X: np.ndarray with the given sample to calculate the Shannon entropy from
    :param bins: the bin edges for the entropy calculation, or a number of evenly spaced bins
    :return:
    """
    _X = np.array(X)
    if any([isinstance(_, (list, np.ndarray)) for _ in _X]):
        # if bins is not set, use the histogram over the full value range
        if bins is None:
            # could not figure out a better way here: the values are needed before calculating
            # the entropy in order to use the full value range in all bins
            vals = [[np.abs(_[i] - _[i + 1]) for i in np.arange(0, len(_), 2)] for _ in _X]
            bins = np.histogram(vals, bins=15)[1][1:]
        return np.array([entropy(_, bins=bins) for _ in _X])
    # check that the sample has an even length
    if len(_X) % 2 > 0:
        raise ValueError('The sample does not have an even length: {}'.format(_X))
    # calculate the pairwise absolute differences
    vals = [np.abs(_X[i] - _X[i + 1]) for i in np.arange(0, len(_X), 2)]
    # calculate the bins
    if bins is None:
        bins = 15
    pk = np.histogram(vals, bins)[0]
    return scipy_entropy(pk=pk)
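# A minimal usage sketch (not from the original source), assuming `import numpy as np` and
# `from scipy.stats import entropy as scipy_entropy`; the sample is consumed in pairs, so it
# must have an even length.
import numpy as np
from scipy.stats import entropy as scipy_entropy

rng = np.random.RandomState(42)
sample = rng.normal(size=100)           # even-length 1-D sample -> one entropy value, 15 bins
print(entropy(sample))
pairs_2d = rng.normal(size=(5, 20))     # 2-D input -> one value per row, shared bin edges
print(entropy(pairs_2d))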
def kullback_leibler(vec1, vec2, num_features=None):
    """
    A distance metric between two probability distributions.
    Returns a non-negative divergence value, where values closer to 0 mean less distance (and a higher similarity).
    Uses the scipy.stats.entropy method to compute the Kullback-Leibler divergence value.
    If the distribution draws from a certain number of docs, that value must be passed.
    """
    if scipy.sparse.issparse(vec1):
        vec1 = vec1.toarray()
    if scipy.sparse.issparse(vec2):
        vec2 = vec2.toarray()  # convert both vectors to dense in case they were sparse matrices
    if isbow(vec1) and isbow(vec2):  # if they are in bag-of-words format, make them dense
        if num_features is not None:  # if not None, make as large as the documents drawing from
            dense1 = sparse2full(vec1, num_features)
            dense2 = sparse2full(vec2, num_features)
            return entropy(dense1, dense2)
        else:
            max_len = max(len(vec1), len(vec2))
            dense1 = sparse2full(vec1, max_len)
            dense2 = sparse2full(vec2, max_len)
            return entropy(dense1, dense2)
    else:
        # this conversion is made because, if the input is not in BoW format, it might be a list
        # within a list after conversion; the scipy implementation of Kullback-Leibler fails in
        # such a case, so we pick up only the nested list.
        if len(vec1) == 1:
            vec1 = vec1[0]
        if len(vec2) == 1:
            vec2 = vec2[0]
        return scipy.stats.entropy(vec1, vec2)
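# A minimal usage sketch (not from the original source), assuming the gensim helpers this
# snippet relies on: `from gensim.matutils import sparse2full, isbow`, plus `import scipy.sparse`,
# `import scipy.stats`, and `from scipy.stats import entropy`.
import scipy.sparse
import scipy.stats
from scipy.stats import entropy
from gensim.matutils import sparse2full, isbow

bow1 = [(0, 0.5), (1, 0.3), (2, 0.2)]   # bag-of-words vectors: (term_id, weight) pairs
bow2 = [(0, 0.4), (1, 0.4), (2, 0.2)]
print(kullback_leibler(bow1, bow2, num_features=3))   # small, non-negative divergence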
def kl(x1, x2):
    assert x1.shape == x2.shape
    # x1_2d, x2_2d = reshape_2d(x1), reshape_2d(x2)
    # transpose to [?, #num_examples] so each column holds one example's distribution
    x1_2d_t = x1.transpose()
    x2_2d_t = x2.transpose()
    # pdb.set_trace()
    e = entropy(x1_2d_t, x2_2d_t)
    # replace infinite divergences (zero bins in x2) with a finite cap of 2
    e[np.where(e == np.inf)] = 2
    return e
def f_entropy(p):
    # convert integer labels to a probability distribution via bin counts
    p = np.bincount(p) / float(p.shape[0])
    ep = stats.entropy(p)
    if ep == -float('inf'):
        return 0.0
    return ep
def calc_class_entropy(y):
    # stats.itemfreq returns (value, count) pairs; column 1 holds the class counts
    # (itemfreq was removed in newer SciPy; np.unique(y, return_counts=True)[1] is the modern equivalent)
    class_counts = stats.itemfreq(y)[:, 1]
    return stats.entropy(class_counts, base=2)
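# A minimal usage sketch (not from the original source), assuming `import numpy as np` and
# `from scipy import stats`; both helpers expect integer class labels, and calc_class_entropy
# needs a SciPy version that still ships stats.itemfreq.
import numpy as np
from scipy import stats

labels = np.array([0, 0, 0, 1, 1, 2])
print(f_entropy(labels))            # entropy of the class frequencies, in nats
print(calc_class_entropy(labels))   # same class distribution, measured in bits (base 2)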
def barHhv(s):
    '''
    Conditional entropy H(h|v): the per-state entropies Hhv averaged over the visible marginal Qv.
    '''
    return np.sum(s.Qv * s.Hhv)
def barHvh(s):
    '''
    Conditional entropy H(v|h): the per-state entropies Hvh averaged over the hidden marginal Qh.
    '''
    return np.sum(s.Qh * s.Hvh)
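# For reference, a standalone sketch (not from the original source) of the same quantity computed
# directly from a joint distribution P(v, h) with plain numpy/scipy; the fields `Qv`, `Qh`, `Hhv`,
# and `Hvh` used above are specific to the surrounding RBM code.
import numpy as np
from scipy.stats import entropy

def conditional_entropy_h_given_v(P_joint):
    # H(h|v) = sum_v Q(v) * H(h | v), where rows of P_joint index v and columns index h
    Qv = P_joint.sum(axis=1)                             # visible marginal Q(v)
    Hhv = np.array([entropy(row) for row in P_joint])    # entropy() normalises each row to P(h|v)
    return np.sum(Qv * Hhv)

P = np.array([[0.25, 0.25],
              [0.10, 0.40]])
print(conditional_entropy_h_given_v(P))                  # ~0.597 nats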
# ---------------------------------------------------------------------
# Energies of samples
def short_report(s):
    Hhs = np.sum(rb.bitent(s.Ph))
    Hvs = np.sum(rb.bitent(s.Pv))
    # print a short report
    print('\nRBM dataset Ns=%s Nh=%s Nv=%s' % (s.Ns, s.Nh, s.Nv))
    print('Vis capacity, maximum', np.sum(rb.bitent(0.5 * np.ones(s.Nv))))
    print('Hid capacity, maximum', np.sum(rb.bitent(0.5 * np.ones(s.Nh))))
    print('Vis entropy , sampled', Hvs)
    print('Hid entropy , sampled', Hhs)
    print('Entropy difference ', (Hhs - Hvs))
    print('Mean hidden rate ', np.mean(s.Ph))
    print('Mean hidden complexity', rb.bitent(np.mean(s.Ph)) * s.Nh)
def long_report(s):
    lgE = np.log2(np.e)  # factor for converting nats to bits
    # Long report
    # print('\nFound dataset %s T=%s Nh=%s Nv=%s'%(DIR,T,Nh,Nv))
    # print('DKL %0.2f'%DKL)
    print('\nRBM dataset Ns=%s Nh=%s Nv=%s' % (s.Ns, s.Nh, s.Nv))
    # Hidden layer entropy
    print('==Hidden layer entropy==')
    print('Hid capacity, maximum %0.2f' % (np.sum(rb.bitent(0.5 * np.ones(s.Nh)))))
    print('Hid entropy , sampled %0.2f' % (s.Hhs))
    print('Entropy hid sample is %0.2f' % (entropy(s.Qh, base=2)))
    print('<<Eh>h|v>v sampled is %0.2f' % (s.barEhhv * lgE))
    print('<<Eh>h|v>v ufield is %0.2f' % (s.barEhhv_meanfield * lgE))
    print('Mean hidden complexity %0.2f' % (rb.bitent(np.mean(s.Ph)) * s.Nh))
    print('Mean hidden rate %0.2f' % (np.mean(s.Ph)))
    # Conditional entropy
    print('==Conditional entropy==')
    print('Entropy difference %0.2f' % (s.Hhs - s.Hvs))
    print('<H_h|v>v is %0.2f' % (s.barHhv * lgE))
    # Likelihoods
    print('==Negative log-likelihood==')
    print('<<Ev|h>h|v>v sampl is %0.2f' % (s.barEvhhv * lgE))
    print('<<Ev|h>h|v>v ufild is %0.2f' % (s.barEvhhv_meanfield * lgE))
    # KL divergences
    print('==KL divergences==')
    print('<Dkl(h|v||h)>v sam is %0.2f' % (s.barDKLhv * lgE))
    print('<Dkl(h|v||h)>v uf1 is %0.2f' % (s.barDKLhv_meanfield * lgE))
    # Visible entropy; these should be close in value
    print('==Visible layer entropy==')
    print('Vis capacity, maximum %0.2f' % (np.sum(rb.bitent(0.5 * np.ones(s.Nv)))))
    print('Vis entropy , sampled %0.2f' % (s.Hvs))
    print('Entropy vis sample is %0.2f' % (entropy(s.Qv, base=2)))
    print('<D(.)+<Ev|h>h|v>v sam %0.2f' % (s.barDKLhv * lgE + s.barEvhhv * lgE))
    print('<D(.)+<Ev|h>h|v>v uf1 %0.2f' % (s.barDKLhv_meanfield * lgE + s.barEvhhv_meanfield * lgE))
def entropy(self, filename, delimeter, itemsetSize, minsup, fun):  # fun selects the built-in (scipy) entropy or this class's own implementation
    db = DataBase()
    db.readDB(filename, delimeter)
    dbElem = db.getDBElements()
    dbSize = db.size()
    # relative support (frequency) of every k-itemset over the database
    kItemsetFreq = [float(db.getItemsetSup(set(itemset))) / dbSize
                    for itemset in combinations(dbElem, itemsetSize)]
    sumFreq = sum(kItemsetFreq)
    # normalise the frequencies into a probability distribution; this must be a list,
    # not a generator, because kItemsetFreq is cleared below before the entropy is evaluated
    kItemsetProb = [itemsetFreq / sumFreq for itemsetFreq in kItemsetFreq]
    kItemsetFreq.clear()
    db.getDataBase().clear()
    if fun == 1:
        return entropy(kItemsetProb, base=2)
    elif fun == 2:
        return self.calculateEntropy(kItemsetProb)
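# The class's own `calculateEntropy` (the `fun == 2` branch) is not part of this snippet; a
# hypothetical sketch matching the base-2 behaviour of the `fun == 1` branch could look like this:
import numpy as np

def calculateEntropy(self, probs):
    probs = np.asarray(list(probs), dtype=float)
    probs = probs[probs > 0]                      # skip zero-probability itemsets
    return float(-np.sum(probs * np.log2(probs)))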