Python entropy() usage examples (source code)

shared_functions.py (project: CustomerSim, author: sisl)
def KL_validate(data_true, data_predicted, n_bins, x_range, n_samples=10000):
    '''"Pr(KL(simulated data||original) > KL(bootstrap original||bootstrap original))'''

    n = data_true.shape[0]

    hist_true, _ = np.histogram(data_true, bins=n_bins, range=x_range)
    hist_predicted, bin_edges = np.histogram(data_predicted, bins=n_bins, range=x_range)

    # +1 Laplace smoothing avoids zero bins, which would make the KL divergence infinite
    simulated_KL = sc.entropy(hist_true + 1, hist_predicted + 1)
    subsampled_KL = []

    for i in range(n_samples):
        index1 = np.random.choice(n, n, replace=True)
        index2 = np.random.choice(n, n, replace=True)
        sample1 = data_true[index1]
        sample2 = data_true[index2]
        hist_sample1, _ = np.histogram(sample1, bins=n_bins, range=x_range)
        hist_sample2, _ = np.histogram(sample2, bins=n_bins, range=x_range)
        subsampled_KL.append(sc.entropy(hist_sample2+1,hist_sample1+1))

    subsampled_KL = sorted(subsampled_KL)
    # p-value: fraction of bootstrap KL values that exceed the simulated KL
    pval = sum(simulated_KL < i for i in subsampled_KL) / float(n_samples)
    # upper bound of the one-sided 95% bootstrap interval
    conf_interval = (0, subsampled_KL[int(math.ceil(n_samples * 0.95)) - 1])
    return simulated_KL, conf_interval, pval, n
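A minimal usage sketch for KL_validate, assuming the module imports numpy as np, scipy.stats as sc, and the standard math module (the synthetic data below is illustrative only):

import numpy as np
import scipy.stats as sc
import math

np.random.seed(0)
original = np.random.normal(0.0, 1.0, 5000)      # "true" data
simulated = np.random.normal(0.05, 1.0, 5000)    # output of a simulator

kl, conf_int, pval, n = KL_validate(original, simulated, n_bins=30,
                                    x_range=(-4, 4), n_samples=2000)
print(kl, conf_int, pval, n)   # a small p-value means the simulated KL exceeds most bootstrap KLs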

feature_transforms.py (project: catchy, author: jvbalen)
def first_order(feature, aggregates, verbose=False):
    if not isinstance(aggregates, list):
        aggregates = [aggregates]
    for aggregate in aggregates:
        if verbose:
            print('        first order computation: ' + aggregate)
        if aggregate == 'log':
            feature = np.log(feature)
        elif aggregate == 'sqrt':
            feature = np.sqrt(feature)
        elif aggregate == 'minlog':
            feature = np.log(1 - feature)
        elif aggregate == 'minsqrt':
            feature = np.sqrt(1 - feature)
        elif aggregate == 'mean':
            # feature = np.mean(feature, axis=0)
            feature = np.nanmean(feature, axis=0)
        elif aggregate == 'var':
            feature = np.var(feature, axis=0)
        elif aggregate == 'std':
            # feature = np.std(feature, axis=0)
            feature = np.nanstd(feature, axis=0)
        elif aggregate == 'stdmean':
            feature = np.hstack([np.mean(feature, axis=0), np.std(feature, axis=0)])
        elif aggregate == 'cov':
            feature = np.cov(feature, rowvar=False).flatten()  # covariance between feature columns
        elif aggregate == 'totvar':
            feature = np.array([np.mean(np.var(feature, axis=0))])
        elif aggregate == 'totstd':
            feature = np.array([np.mean(np.std(feature, axis=0))])
        elif aggregate == 'entropy':
            feature = feature.flatten()
            feature = np.array([stats.entropy(feature)])
        elif aggregate == 'normentropy':
            feature = feature.flatten()
            feature = np.array([stats.entropy(feature) / np.log(feature.size)])
        elif aggregate == 'information':
            feature = - np.log(feature)

    return feature
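A short usage sketch, assuming numpy as np and scipy.stats imported as stats (the toy feature matrix stands in for a real chroma or timbre feature):

import numpy as np
from scipy import stats

np.random.seed(0)
feature = np.random.rand(100, 12)                          # 100 frames x 12 feature bins (toy data)

print(first_order(feature, 'mean', verbose=True).shape)    # (12,), one value per bin
print(first_order(feature, ['normentropy']))               # entropy normalised by log(feature.size)
print(first_order(feature, ['sqrt', 'totvar']))            # aggregates are applied in order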
entropy.py (project: leven-squash, author: dwcoates)
def get_entropy(self, probs):
        """
        Estimate the entropy of a string in Shannons (bits). That is, this
        method assumes that the frequency of characters in the input string
        is exactly equal to the probability mass function.
        """
        # scipy's entropy() uses the natural logarithm, so this value is in nats
        ent_nat = entropy(probs)

        # convert nats to Shannons (bits)
        ent_shan = ent_nat / np.log(2)

        return ent_shan
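The conversion relies on scipy.stats.entropy using the natural logarithm by default; a standalone sketch of the same nats-to-Shannons computation (the character-frequency step is illustrative, not part of this class):

from collections import Counter
import numpy as np
from scipy.stats import entropy

text = "abracadabra"
counts = np.array(list(Counter(text).values()), dtype=float)
probs = counts / counts.sum()               # empirical probability mass function

ent_nat = entropy(probs)                    # nats
ent_shan = ent_nat / np.log(2)              # Shannons; equal to entropy(probs, base=2)
print(ent_shan)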
generative_alg.py (project: nli_generation, author: jstarc)
def diversity(dev, gen_test, beam_size, hypo_len, noise_size, per_premise, samples):
    step = len(dev[0]) // samples
    sind = [i * step for i in range(samples)]
    p = Progbar(per_premise * samples)
    for i in sind:
        hypos = []
        unique_words = []
        hypo_list = []
        premise = dev[0][i]
        prem_list = set(cut_zeros(list(premise)))        
        while len(hypos) < per_premise:
            label = np.argmax(dev[2][i])
            words = single_generate(premise, label, gen_test, beam_size, hypo_len, noise_size)
            hypos += [str(ex) for ex in words]
            unique_words += [int(w) for ex in words for w in ex if w > 0]
            hypo_list += [set(cut_zeros(list(ex))) for ex in words]

        jacks = []  
        prem_jacks = []
        for u in range(len(hypo_list)):
            sim_prem = len(hypo_list[u] & prem_list)/float(len(hypo_list[u] | prem_list))
            prem_jacks.append(sim_prem)
            for v in range(u+1, len(hypo_list)):
                sim = len(hypo_list[u] & hypo_list[v])/float(len(hypo_list[u] | hypo_list[v]))
                jacks.append(sim)
        avg_dist_hypo = 1 -  np.mean(jacks)
        avg_dist_prem = 1 -  np.mean(prem_jacks)
        d = entropy(list(Counter(hypos).values()))
        w = entropy(list(Counter(unique_words).values()))
        p.add(len(hypos), [('diversity', d),('word_entropy', w),('avg_dist_hypo', avg_dist_hypo), ('avg_dist_prem', avg_dist_prem)])
    arrd = p.sum_values['diversity']
    arrw = p.sum_values['word_entropy']
    arrj = p.sum_values['avg_dist_hypo']
    arrp = p.sum_values['avg_dist_prem']

    return arrd[0] / arrd[1], arrw[0] / arrw[1], arrj[0] / arrj[1],  arrp[0] / arrp[1]
lda_tuna.py (project: twitter_LDA_topic_modeling, author: kenneth-orton)
def jensen_shannon(P, Q):
    M = 0.5 * (P + Q)
    return 0.5 * (entropy(P, M) + entropy(Q, M))
plot_distances.py (project: twitter_LDA_topic_modeling, author: kenneth-orton)
def jensen_shannon_divergence(P, Q):
    _P = np.array(P) / norm(np.array(P), ord=1)
    _Q = np.array(Q) / norm(np.array(Q), ord=1)
    _M = 0.5 * (_P + _Q)
    return 0.5 * (entropy(_P, _M) + entropy(_Q, _M))
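A usage sketch with two discrete distributions, assuming numpy as np, a 1-norm function norm (e.g. numpy.linalg.norm), and scipy.stats.entropy imported as entropy, matching the names used above:

import numpy as np
from numpy.linalg import norm
from scipy.stats import entropy

P = [0.1, 0.4, 0.5]
Q = [0.3, 0.3, 0.4]
print(jensen_shannon_divergence(P, Q))   # symmetric and bounded by ln(2) in nats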
methods.py (project: CElegansBehaviour, author: ChristophKirst)
def jensen_shannon_divergence(p,q):
  """Jensen-Shannon distance between distributions p and q"""
  m = (p+q)/2.0;
  return stats.entropy(p,m) + stats.entropy(q,m);
demo_mi.py (project: Building-Machine-Learning-Systems-With-Python-Second-Edition, author: PacktPublishing)
def mutual_info(x, y, bins=10):
    counts_xy, bins_x, bins_y = np.histogram2d(x, y, bins=(bins, bins))
    counts_x, bins = np.histogram(x, bins=bins)
    counts_y, bins = np.histogram(y, bins=bins)

    counts_xy += 1
    counts_x += 1
    counts_y += 1
    P_xy = counts_xy / np.sum(counts_xy, dtype=float)
    P_x = counts_x / np.sum(counts_x, dtype=float)
    P_y = counts_y / np.sum(counts_y, dtype=float)

    I_xy = np.sum(P_xy * np.log2(P_xy / (P_x.reshape(-1, 1) * P_y)))

    return I_xy / (entropy(counts_x) + entropy(counts_y))
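A quick sketch comparing the normalized mutual information of dependent and independent variables (numpy as np and scipy.stats.entropy imported as entropy are assumed):

import numpy as np
from scipy.stats import entropy

np.random.seed(0)
x = np.random.normal(size=2000)
y_dep = x + 0.3 * np.random.normal(size=2000)   # strongly related to x
y_ind = np.random.normal(size=2000)             # unrelated to x

print(mutual_info(x, y_dep))   # clearly above zero
print(mutual_info(x, y_ind))   # close to zero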
__init__.py (project: mlprojects-py, author: srinathperera)
def create_window_based_features(data, window_size):
    central_fn = np.mean
    ma1 = calcuate_window_operation(data, window_size, central_fn)
    ma2 = calcuate_window_operation(data, 2 * window_size, central_fn)
    ma4 = calcuate_window_operation(data, 4 * window_size, central_fn)
    ma8 = calcuate_window_operation(data, 8 * window_size, central_fn)


    entropy = calcuate_window_operation(data, window_size, stats.entropy)
    stddev = calcuate_window_operation(data, window_size, np.std)
    medain_weeksbefore = value_before_period(data, 7)

    return np.column_stack((ma1, ma2, ma4, ma8, entropy, stddev, medain_weeksbefore))

# do cross validation http://stackoverflow.com/questions/533905/get-the-cartesian-product-of-a-series-of-lists-in-python
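The helpers calcuate_window_operation and value_before_period are defined elsewhere in the project and are not shown here. A hypothetical rolling-window helper (names and behaviour are assumptions, not the project's code) illustrating how stats.entropy can be used as the per-window statistic:

import numpy as np
from scipy import stats

def rolling_window_stat(data, window_size, fn):
    """Apply fn to each trailing window of data (hypothetical stand-in for calcuate_window_operation)."""
    data = np.asarray(data, dtype=float)
    out = np.full(data.shape[0], np.nan)
    for i in range(window_size, data.shape[0] + 1):
        out[i - 1] = fn(data[i - window_size:i])
    return out

series = np.abs(np.random.randn(60)) + 0.1                        # positive values, e.g. weekly sales
window_entropy = rolling_window_stat(series, 7, stats.entropy)    # entropy of each 7-value window
window_std = rolling_window_stat(series, 7, np.std)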
estimator.py (project: scikit-gstat, author: mmaelicke)
def entropy(X, bins=None):
    """
    Use the Shannon entropy H to describe the distribution of the given sample.
    For calculating the Shannon entropy, the bin edges are needed and can be passed as ``bins``.
    If ``bins`` is None, the edges will be calculated using the numpy.histogram function with bins='fd'.
    This uses the Freedman-Diaconis estimator and is fairly resilient to outliers.
    If the input data X is 2D (entropy for more than one bin needed), the histogram is derived once and
    the same edges are used for all bins.
    CAUTION: this is a changed behaviour compared to scikit-gstat<=0.1.4

    :param X:  np.ndarray with the given sample to calculate the Shannon entropy from
    :param bins: The bin edges for the entropy calculation, or a number of evenly spaced bins
    :return:
    """
    _X = np.array(X)

    if any([isinstance(_, (list, np.ndarray)) for _ in _X]):
        # if bins is not set, use the histogram over the full value range
        if bins is None:
            # could not figure out a better way here. I need the values before calculating the entropy
            # in order to use the full value range in all bins
            vals = [[np.abs(_[i] - _[i + 1]) for i in np.arange(0, len(_), 2)] for _ in _X]
            bins = np.histogram(vals, bins=15)[1][1:]
        return np.array([entropy(_, bins=bins) for _ in _X])

    # check even
    if len(_X) % 2 > 0:
        raise ValueError('The sample does not have an even length: {}'.format(_X))

    # calculate the values
    vals = [np.abs(_X[i] - _X[i + 1]) for i in np.arange(0, len(_X), 2)]

    # calculate the bins
    if bins is None:
        bins = 15
    pk = np.histogram(vals, bins)[0]

    return scipy_entropy(pk=pk)
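A usage sketch, assuming numpy as np and scipy.stats.entropy imported as scipy_entropy as in the module; the sample needs an even number of values because the estimator works on pairwise differences:

import numpy as np
from scipy.stats import entropy as scipy_entropy

np.random.seed(42)
sample = np.random.normal(0, 1, 100)                 # 100 values -> 50 pairwise differences

print(entropy(sample))                               # this implementation defaults to 15 histogram bins
print(entropy(sample, bins=np.linspace(0, 4, 16)))   # explicit bin edges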
matutils.py (project: nonce2vec, author: minimalparts)
def kullback_leibler(vec1, vec2, num_features=None):
    """
    A distance metric between two probability distributions.
    Returns a distance value in range <0, 1>, where values closer to 0 mean less distance (and higher similarity).
    Uses the scipy.stats.entropy method to compute the Kullback-Leibler divergence value.
    If the distribution draws from a certain number of docs, that value must be passed.
    """
    if scipy.sparse.issparse(vec1):
        vec1 = vec1.toarray()
    if scipy.sparse.issparse(vec2):
        vec2 = vec2.toarray()  # convert both vectors to dense in case they are sparse matrices
    if isbow(vec1) and isbow(vec2):  # if they are in bag-of-words format, make them dense
        if num_features is not None:  # if not None, make the dense vectors as large as the feature space
            dense1 = sparse2full(vec1, num_features)
            dense2 = sparse2full(vec2, num_features)
            return entropy(dense1, dense2)
        else:
            max_len = max(len(vec1), len(vec2))
            dense1 = sparse2full(vec1, max_len)
            dense2 = sparse2full(vec2, max_len)
            return entropy(dense1, dense2)
    else:
        # if the input is not in BoW format it may be a list nested within a list after conversion;
        # the scipy implementation of Kullback-Leibler fails in that case, so we take only the nested list
        if len(vec1) == 1:
            vec1 = vec1[0]
        if len(vec2) == 1:
            vec2 = vec2[0]
        return scipy.stats.entropy(vec1, vec2)
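A usage sketch in bag-of-words format, assuming isbow and sparse2full are available (e.g. from gensim.matutils) and that scipy.sparse and scipy.stats.entropy are imported as in the module:

from scipy.stats import entropy
from gensim.matutils import isbow, sparse2full

vec1 = [(0, 0.5), (1, 0.3), (2, 0.2)]   # sparse bag-of-words: (token_id, weight) pairs
vec2 = [(0, 0.4), (1, 0.2), (2, 0.4)]

print(kullback_leibler(vec1, vec2, num_features=3))   # KL(vec1 || vec2); note it is not symmetric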
feature_squeezing.py (project: EvadeML-Zoo, author: mzweilin)
def kl(x1, x2):
    assert x1.shape == x2.shape
    # x1_2d, x2_2d = reshape_2d(x1), reshape_2d(x2)

    # Transpose to [?, #num_examples]
    x1_2d_t = x1.transpose()
    x2_2d_t = x2.transpose()

    # column-wise KL divergence: scipy's entropy works along axis 0, giving one value per example
    e = entropy(x1_2d_t, x2_2d_t)
    e[np.where(e == np.inf)] = 2  # cap infinite divergences (zero support in x2) at a finite value
    return e
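A small sketch of the column-wise behaviour (numpy as np and scipy.stats.entropy imported as entropy are assumed); each row is one probability vector, and one KL value is returned per example:

import numpy as np
from scipy.stats import entropy

x1 = np.array([[0.7, 0.2, 0.1],
               [0.1, 0.8, 0.1]])   # e.g. predictions on the original inputs
x2 = np.array([[0.6, 0.3, 0.1],
               [0.0, 0.9, 0.1]])   # e.g. predictions on the squeezed inputs

print(kl(x1, x2))   # the second value is capped at 2 because x2 has zero support where x1 does not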
base.py (project: MLAlgorithms, author: rushter)
def f_entropy(p):
    # Convert values to probability
    p = np.bincount(p) / float(p.shape[0])

    ep = stats.entropy(p)
    if ep == -float('inf'):
        return 0.0
    return ep
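A usage sketch with integer class labels (numpy as np and scipy.stats imported as stats are assumed):

import numpy as np
from scipy import stats

labels = np.array([0, 0, 1, 1, 1, 2])
print(f_entropy(labels))                     # entropy of the empirical label distribution, in nats
print(f_entropy(np.zeros(10, dtype=int)))    # a single class gives 0.0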
mdlp.py (project: xam, author: MaxHalford)
def calc_class_entropy(y):
    class_counts = stats.itemfreq(y)[:, 1]  # itemfreq is removed in recent SciPy; np.unique(y, return_counts=True)[1] is the equivalent
    return stats.entropy(class_counts, base=2)
matutils.py (project: ohmnet, author: marinkaz)
def kullback_leibler(vec1, vec2, num_features=None):
    """
    A distance metric between two probability distributions.
    Returns a distance value in range <0, 1>, where values closer to 0 mean less distance (and higher similarity).
    Uses the scipy.stats.entropy method to compute the Kullback-Leibler divergence value.
    If the distribution draws from a certain number of docs, that value must be passed.
    """
    if scipy.sparse.issparse(vec1):
        vec1 = vec1.toarray()
    if scipy.sparse.issparse(vec2):
        vec2 = vec2.toarray()  # convert both vectors to dense in case they are sparse matrices
    if isbow(vec1) and isbow(vec2):  # if they are in bag-of-words format, make them dense
        if num_features is not None:  # if not None, make the dense vectors as large as the feature space
            dense1 = sparse2full(vec1, num_features)
            dense2 = sparse2full(vec2, num_features)
            return entropy(dense1, dense2)
        else:
            max_len = max(len(vec1), len(vec2))
            dense1 = sparse2full(vec1, max_len)
            dense2 = sparse2full(vec2, max_len)
            return entropy(dense1, dense2)
    else:
        # if the input is not in BoW format it may be a list nested within a list after conversion;
        # the scipy implementation of Kullback-Leibler fails in that case, so we take only the nested list
        if len(vec1) == 1:
            vec1 = vec1[0]
        if len(vec2) == 1:
            vec2 = vec2[0]
        return scipy.stats.entropy(vec1, vec2)
rbm_sample.py (project: neurotools, author: michaelerule)
def barHhv(s):
        '''
        Conditional entropy H(h|v)
        '''
        return np.sum(s.Qv*s.Hhv)
rbm_sample.py (project: neurotools, author: michaelerule)
def barHvh(s):
        '''
        Conditional entropy H(v|h)
        '''
        return np.sum(s.Qh*s.Hvh)

    # ---------------------------------------------------------------------
    # Energies of samples
rbm_sample.py (project: neurotools, author: michaelerule)
def short_report(s):
        Hhs = np.sum(rb.bitent(s.Ph))
        Hvs = np.sum(rb.bitent(s.Pv))
        # print a short report
        print('\nRBM dataset Ns=%s Nh=%s Nv=%s'%(s.Ns,s.Nh,s.Nv))
        print('Vis capacity, maximum',np.sum(rb.bitent(0.5*np.ones(s.Nv))))
        print('Hid capacity, maximum',np.sum(rb.bitent(0.5*np.ones(s.Nh))))
        print('Vis entropy , sampled',Hvs)
        print('Hid entropy , sampled',Hhs)
        print('Entropy difference   ',(Hhs-Hvs))
        print('Mean hidden rate     ',np.mean(s.Ph))
        print('Mean hidden complexity',rb.bitent(np.mean(s.Ph))*s.Nh)
rbm_sample.py (project: neurotools, author: michaelerule)
def long_report(s):
        lgE = np.log2(np.e)
        # Long report
        # print('\nFound dataset %s T=%s Nh=%s Nv=%s'%(DIR,T,Nh,Nv))
        # print('DKL                   %0.2f'%DKL)
        print('\nRBM dataset Ns=%s Nh=%s Nv=%s'%(s.Ns,s.Nh,s.Nv))
        # Hidden layer entropy
        print('==Hidden layer entropy==')
        print('Hid capacity, maximum %0.2f'%(np.sum(rb.bitent(0.5*np.ones(s.Nh)))))
        print('Hid entropy , sampled %0.2f'%(s.Hhs))
        print('Entropy hid sample is %0.2f'%(entropy(s.Qh,base=2)))
        print('<<Eh>h|v>v sampled is %0.2f'%(s.barEhhv*lgE))
        print('<<Eh>h|v>v ufield  is %0.2f'%(s.barEhhv_meanfield*lgE))
        print('Mean hidden complexity %0.2f'%(rb.bitent(np.mean(s.Ph))*s.Nh))
        print('Mean hidden rate      %0.2f'%(np.mean(s.Ph)))
        # Conditional entropy
        print('==Conditional entropy==')
        print('Entropy difference    %0.2f'%(s.Hhs-s.Hvs))
        print('<H_h|v>v           is %0.2f'%(s.barHhv*lgE))
        # Likelihoods
        print('==Negative log-likelihood==')
        print('<<Ev|h>h|v>v sampl is %0.2f'%(s.barEvhhv *lgE))
        print('<<Ev|h>h|v>v ufild is %0.2f'%(s.barEvhhv_meanfield*lgE))
        # KL divergences
        print('==KL divergences==')
        print('<Dkl(h|v||h)>v sam is %0.2f'%(s.barDKLhv*lgE))
        print('<Dkl(h|v||h)>v uf1 is %0.2f'%(s.barDKLhv_meanfield*lgE))
        # Visible entropy; These should be close in value
        print('==Visible layer entropy==')
        print('Vis capacity, maximum %0.2f'%(np.sum(rb.bitent(0.5*np.ones(s.Nv)))))
        print('Vis entropy , sampled %0.2f'%(s.Hvs))
        print('Entropy vis sample is %0.2f'%(entropy(s.Qv,base=2)))
        print('<D(.)+<Ev|h>h|v>v sam %0.2f'%(s.barDKLhv*lgE+s.barEvhhv *lgE))
        print('<D(.)+<Ev|h>h|v>v uf1 %0.2f'%(s.barDKLhv_meanfield*lgE+s.barEvhhv_meanfield*lgE))
test1.py (project: Database-Generation-for-Itemset-Mining, author: clezcano)
def entropy(self, filename, delimeter, itemsetSize, minsup, fun):  # fun selects the built-in entropy (1) or the custom implementation (2)
        db = DataBase()
        db.readDB(filename, delimeter)
        dbElem = db.getDBElements()
        dbSize = db.size()

        kItemsetFreq = [float(db.getItemsetSup(set(itemset))) / dbSize for itemset in combinations(dbElem, itemsetSize)]
        sumFreq = sum(kItemsetFreq)
        # use a list, not a generator: a generator would be empty once kItemsetFreq is cleared below
        kItemsetProb = [itemsetFreq / sumFreq for itemsetFreq in kItemsetFreq]
        kItemsetFreq.clear()
        db.getDataBase().clear()
        if fun == 1:
            return entropy(kItemsetProb, base=2)
        elif fun == 2:
            return self.calculateEntropy(kItemsetProb)

