Python stats() usage examples (source code)

stats.py (project: Projects, author: it2school)
def ppf(x, n):
    try:
        from scipy import stats
    except ImportError:
        stats = None

    if stats:
        if n < 30:
            return stats.t.ppf(x, n)
        return stats.norm.ppf(x)
    else:
        if n < 30:
            # TODO: implement power series:
            # http://eprints.maths.ox.ac.uk/184/1/tdist.pdf
            raise ImportError(
                'You must have scipy installed to use t-student '
                'when sample_size is below 30')
        return norm_ppf(x)

# According to http://sphweb.bumc.bu.edu/otlt/MPH-Modules/BS/
# BS704_Confidence_Intervals/BS704_Confidence_Intervals_print.html
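
A minimal usage sketch of the helper above, assuming it is importable as ppf: the multiplier for a two-sided 95% confidence interval, using the t-distribution for small samples and the normal approximation otherwise.

small = ppf(0.975, 10)   # t-distribution with df=10, ~2.228 (requires scipy)
large = ppf(0.975, 100)  # normal approximation, ~1.960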
statistics_noise.py (project: privcount, author: privcount)
def get_sanity_check_counter():
    '''
    Provide a dictionary with the standard sanity check counter values.
    It is typically used like:
        counters['ZeroCount'] = get_sanity_check_counter()
    All of these values are unused, except for:
        bins:
            - [-inf, inf] (a long counter is appended by SecureCounters,
                           it should only ever have blinding values added)
        estimated_value: 0.0 (TODO: used for checking if stats have changed)
        sigma: 0.0 (used for adding noise, 0.0 means no noise is added)
    '''
    sanity_check = {}
    sanity_check['bins'] = []
    single_bin = [float('-inf'), float('inf')]
    sanity_check['bins'].append(single_bin)
    sanity_check['sensitivity'] = 0.0
    sanity_check['estimated_value'] = 0.0
    sanity_check['sigma'] = 0.0
    sanity_check['epsilon'] = 0.0
    sanity_check['expected_noise_ratio'] = 0.0
    return sanity_check
test_stats.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def test_rank_methods_frame(self):
        tm.skip_if_no_package('scipy', '0.13', 'scipy.stats.rankdata')
        import scipy
        from scipy.stats import rankdata

        xs = np.random.randint(0, 21, (100, 26))
        xs = (xs - 10.0) / 10.0
        cols = [chr(ord('z') - i) for i in range(xs.shape[1])]

        for vals in [xs, xs + 1e6, xs * 1e-6]:
            df = DataFrame(vals, columns=cols)

            for ax in [0, 1]:
                for m in ['average', 'min', 'max', 'first', 'dense']:
                    result = df.rank(axis=ax, method=m)
                    sprank = np.apply_along_axis(
                        rankdata, ax, vals,
                        m if m != 'first' else 'ordinal')
                    sprank = sprank.astype(np.float64)
                    expected = DataFrame(sprank, columns=cols)

                    if LooseVersion(scipy.__version__) >= '0.17.0':
                        expected = expected.astype('float64')
                    tm.assert_frame_equal(result, expected)
gleu.py (project: smt-for-gec, author: cnap)
def gleu(self, stats, smooth=False):
        """Compute GLEU from collected statistics obtained by call(s) to gleu_stats"""
        # smooth 0 counts for sentence-level scores
        if smooth:
            stats = [s if s != 0 else 1 for s in stats]
        # any zero count zeroes the score (any() also works on Python 3,
        # where filter() returns a lazy iterator rather than a list)
        if any(s == 0 for s in stats):
            return 0
        (c, r) = stats[:2]
        # average log precision over the four n-gram orders
        log_gleu_prec = sum(math.log(float(x) / y)
                            for x, y in zip(stats[2::2], stats[3::2])) / 4
        # brevity penalty times the geometric mean of the precisions
        return math.exp(min([0, 1 - float(r) / c]) + log_gleu_prec)
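
A hedged worked example of the statistics layout the method expects, inferred from the slicing above: [candidate_length, reference_length, match_1, total_1, ..., match_4, total_4], with hypothetical counts.

import math

stats = [10, 10, 8, 10, 6, 9, 4, 8, 2, 7]  # hypothetical n-gram counts
log_prec = sum(math.log(float(m) / t)
               for m, t in zip(stats[2::2], stats[3::2])) / 4
score = math.exp(min(0, 1 - float(stats[1]) / stats[0]) + log_prec)  # ~0.53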
common.py (project: singlecell-dash, author: czbiohub)
def clean_mapping_stats(mapping_stats_original, convert_to_percentage=None):
    """Remove whitespace from all values and convert to numbers"""

    if convert_to_percentage is None:
        convert_to_percentage = set()

    mapping_stats_original = mapping_stats_original.applymap(
        lambda x: (x.replace(',', '').strip().strip('%')
                   if isinstance(x, str) else x))

    numeric = mapping_stats_original.apply(maybe_to_numeric)

    numeric.columns = numeric.columns.map(str.strip)

    # for 10X mapping stats
    numeric.columns = numeric.columns.map(
            lambda x: ('Percent {}'.format(x.replace('Fraction ', ''))
                       if x in convert_to_percentage else x)
    )

    return numeric
brsa.py (project: brainiak, author: brainiak)
def prior_GP_var_half_cauchy(y_invK_y, n_y, tau_range):
    """ Imposing a half-Cauchy prior onto the standard deviation (tau)
        of the Gaussian Process which is in turn a prior imposed over
        a function y = f(x).
        The scale parameter of the half-Cauchy prior is tau_range.
        The function returns the MAP estimate of tau^2 and
        log(p(tau|tau_range)) for the MAP value of tau^2,
        where tau_range describes the reasonable range of tau
        in the half-Cauchy prior.
        An alternative form of prior is inverse-Gamma prior on tau^2.
        Inverse-Gamma prior penalizes for both very small and very
        large values of tau, while half-Cauchy prior only penalizes
        for very large values of tau.
        For more information on usage, see description in BRSA class:
        `.BRSA`
    """
    tau2 = (y_invK_y - n_y * tau_range**2
            + np.sqrt(n_y**2 * tau_range**4 + (2 * n_y + 8)
                      * tau_range**2 * y_invK_y + y_invK_y**2))\
        / 2 / (n_y + 2)
    log_ptau = scipy.stats.halfcauchy.logpdf(
        tau2**0.5, scale=tau_range)
    return tau2, log_ptau
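
The closed form above is the positive root of the stationarity condition of the log-posterior in tau^2. A hedged numerical sanity check with hypothetical values:

import numpy as np
import scipy.stats

y_invK_y, n_y, tau_range = 50.0, 20, 5.0  # hypothetical values
tau2_map, _ = prior_GP_var_half_cauchy(y_invK_y, n_y, tau_range)
grid = np.linspace(0.5 * tau2_map, 1.5 * tau2_map, 1001)
# GP log-likelihood in tau^2 (up to constants) plus the half-Cauchy log-prior
objective = (-0.5 * y_invK_y / grid - 0.5 * n_y * np.log(grid)
             + scipy.stats.halfcauchy.logpdf(np.sqrt(grid), scale=tau_range))
assert abs(grid[np.argmax(objective)] - tau2_map) <= grid[1] - grid[0]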
brsa.py (project: brainiak, author: brainiak)
def _zscore(a):
    """ Calculating z-score of data on the first axis.
        If the numbers in any column are all equal, scipy.stats.zscore
        will return NaN for this column. We shall correct them all to
        be zeros.

    Parameters
    ----------
    a: numpy array

    Returns
    -------
    zscore: numpy array
        The z-scores of input "a", with any columns including non-finite
        numbers replaced by all zeros.
    """
    assert a.ndim > 1, 'a must have more than one dimension'
    zscore = scipy.stats.zscore(a, axis=0)
    zscore[:, np.logical_not(np.all(np.isfinite(zscore), axis=0))] = 0
    return zscore
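
A hedged example of the constant-column correction:

import numpy as np

a = np.array([[1.0, 5.0],
              [2.0, 5.0],
              [3.0, 5.0]])
z = _zscore(a)  # column 0 is z-scored; the constant column 1 becomes all zeros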
segment.py (project: kaggle-seizure-prediction, author: sics-lm)
def mad(self, median=None):
        """
        Return the median absolute deviation for this segment only,
        scaled by 1/Φ⁻¹(3/4) ≈ 1.4826 to approximate the standard deviation.
        :param median: If this is not None, it will be used as the median from which the deviation is calculated.
        :return: An NDArray of shape (n_channels, 1) with the MAD for the channels of this segment.
        """

        c = scipy.stats.norm.ppf(0.75)  # float literal: 3 / 4 is 0 under Python 2 integer division
        if median is None:
            subject_median = self.median()
        else:
            subject_median = median
        median_residuals = self.mat_struct.data - subject_median  # deviation between median and data
        mad = np.median(np.abs(median_residuals), axis=1)[:, np.newaxis]
        return mad / c
basic_segment_statistics.py (project: kaggle-seizure-prediction, author: sics-lm)
def median_absolute_deviation(data, subject_median=None, axis=0):
    """
    Returns the median absolute deviation (MAD) for the given data,
    a robust estimate of the standard deviation.
    :param data: A DataFrame holding the data to calculate the MAD from.
    :param subject_median: If given, these will be used as the medians from which the deviation is calculated. If None,
                           the median of the given data is used.
    :param axis: The axis to calculate over.
    :return: A Series object with the median absolute deviations over the given *axis* for the *data*.
    """
    """A robust estimate of the standard deviation"""
    # The scaling factor for estimating the standard deviation from the MAD
    c = scipy.stats.norm.ppf(0.75)  # float literal: 3 / 4 is 0 under Python 2 integer division
    if isinstance(data, pd.DataFrame):
        if subject_median is None:
            subject_median = data.median(axis=axis)
        median_residuals = data - subject_median
        mad = median_residuals.abs().median(axis=axis)
    else:
        if subject_median is None:
            subject_median = np.median(data, axis=axis)
        median_residuals = data - subject_median[:, np.newaxis]  # deviation between median and data
        mad = np.median(np.abs(median_residuals), axis=axis)
    return mad / c
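
A hedged sanity check: on normally distributed data the scaled MAD approximates the true standard deviation. The ndarray branch above indexes the median with [:, np.newaxis], so 1-D data is reshaped to (1, n) first.

import numpy as np

rng = np.random.RandomState(0)
x = rng.normal(loc=0.0, scale=2.0, size=100000)
est = median_absolute_deviation(x.reshape(1, -1), axis=1)
print(est)  # ~[2.0]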
basic_segment_statistics.py (project: kaggle-seizure-prediction, author: sics-lm)
def read_folder(stats_folder, metrics=None):
    """
    Reads all segment statistics files from the given folder and returns the statistics as a single DataFrame.
    :param stats_folder: The folder to read statistics files from.
    :param metrics: A subset of the metrics to load.
    :return: A DataFrame with all the statistics available in stats_folder.
    """
    if metrics is None:
        metrics = ['absolute mean', 'absolute median', 'kurtosis', 'skew', 'std']
    glob_pattern = os.path.join(stats_folder, '*segments_statistics.csv')
    files = glob.glob(glob_pattern)
    if files:
        stats = pd.concat([read_stats(stat_file, metrics) for stat_file in files])
        return stats
    else:
        raise IOError("No segment statistics file in folder {}".format(stats_folder))
basic_segment_statistics.py (project: kaggle-seizure-prediction, author: sics-lm)
def get_subject_metric(stats_df, metric_name, aggregator='{dataframe}.median()', channel_ordering=None, use_cache=True):
    """
    Returns the metric given by stats_df as an NDArray-like of shape (n_channels, 1), obtained by aggregating the
    given metric from the dataframe.
    :param stats_df: The statistics dataframe acquired from read_stats.
    :param metric_name: The metric to collect.
    :param aggregator: A string with an expression used to aggregate the per segment statistic to a single statistic for
                       the whole subject.
    :param channel_ordering: An optional ordered sequence of channel names, which ensures that the output
    statistics vector has the same channel order as the segment the statistic will be applied to.
    :param use_cache: If True, the metrics will be cached by the function so that calling it multiple times for the
                      same subject is fast.
    :return: An NDArray of shape (n_channels, 1) where each element along axis 0 corresponds to the aggregated
             statistic for that channel.
    """
    cache = get_subject_metric.cache
    assert isinstance(stats_df, pd.DataFrame)
    if use_cache and id(stats_df) in cache and (metric_name, aggregator) in cache[id(stats_df)]:
        return cache[id(stats_df)][(metric_name, aggregator)]

    # The stats dataframes have a 2-level column index, where the first level holds the channel names and the second
    # the metric name. To get the metric but keep the channels we slice the first level with all the entries using
    # slice(None), which is equivalent to [:] for regular slices.
    if channel_ordering is None:
        segment_metrics = stats_df.loc[:, (slice(None), metric_name)]
    else:
        segment_metrics = stats_df.loc[:, (channel_ordering, metric_name)]
    aggregated_metric = eval(aggregator.format(dataframe='segment_metrics'))
    added_axis = aggregated_metric[:, np.newaxis]
    cache[id(stats_df)][(metric_name, aggregator)] = added_axis
    return added_axis
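
A hedged usage sketch, where stats_df is a hypothetical dataframe from read_stats and the function-level cache is presumably initialized elsewhere in the module:

from collections import defaultdict

get_subject_metric.cache = defaultdict(dict)  # presumably done at module level
# Median over segments of the per-channel 'std' metric (default template);
# stats_df is a hypothetical dataframe from read_stats
subject_std = get_subject_metric(stats_df, 'std')
# The same template mechanism with a different aggregation expression
subject_mean_std = get_subject_metric(stats_df, 'std',
                                      aggregator='{dataframe}.mean()')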
base_sp.py (project: standard_precip, author: e-baumer)
def cdf_to_ppf(self, data, params):
        '''
        Take the fitted parameters of the chosen distribution and calculate
        the cdf. Apply the inverse normal distribution to the cdf to get the
        SPI / SPEI. This process is described in Lloyd-Hughes and Saunders
        (2002), which is included in the documentation.

        '''

        # Calculate the CDF of observed precipitation on a given time scale
        cdf = self.distr.cdf(
            data, *params[:-2], loc=params[-2], scale=params[-1]
        )

        # Apply inverse normal distribution
        norm_ppf = scipy.stats.norm.ppf(cdf)

        return norm_ppf
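
A standalone hedged sketch of the same transform on hypothetical monthly precipitation, assuming a gamma fit as is typical for the SPI:

import numpy as np
import scipy.stats

precip = np.random.gamma(shape=2.0, scale=10.0, size=360)  # hypothetical data
params = scipy.stats.gamma.fit(precip, floc=0)             # (shape, loc, scale)
cdf = scipy.stats.gamma.cdf(precip, *params[:-2],
                            loc=params[-2], scale=params[-1])
spi = scipy.stats.norm.ppf(cdf)  # standardized index, approximately N(0, 1)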
bayes.py (project: nmmn, author: rsnemmen)
def joint_density(X, Y, bounds=None):
    """
Plots the joint distribution of two variables.
Adapted from a method in the src/graphics.py module of the project
git://github.com/aflaxman/pymc-example-tfr-hdi.git
    """
    if bounds:
        X_min, X_max, Y_min, Y_max = bounds
    else:
        X_min = X.min()
        X_max = X.max()
        Y_min = Y.min()
        Y_max = Y.max()

    pylab.plot(X, Y, linestyle='none', marker='o', color='green', mec='green', alpha=.2, zorder=-99)

    gkde = scipy.stats.gaussian_kde([X, Y])
    x,y = pylab.mgrid[X_min:X_max:(X_max-X_min)/25.,Y_min:Y_max:(Y_max-Y_min)/25.]
    z = pylab.array(gkde.evaluate([x.flatten(), y.flatten()])).reshape(x.shape)
    pylab.contour(x, y, z, linewidths=2)

    pylab.axis([X_min, X_max, Y_min, Y_max])
stats.py (project: nmmn, author: rsnemmen)
def scatterfit(x,y,a=None,b=None):
    """
Compute the mean deviation of the data about the linear model *y = ax + b*,
if a and b are provided as arguments. Otherwise, compute the mean deviation
about the best-fit line.

:param x,y: assumed to be Numpy arrays.
:param a,b: scalars.
:returns: float sd, the mean deviation.
    """

    if a is None:
        # Perform a linear regression to find the best-fit line
        a, b, r, p, err = scipy.stats.linregress(x,y)

    # Std. deviation of an individual measurement (Bevington, eq. 6.15)
    N=numpy.size(x)
    sd=1./(N-2.)* numpy.sum((y-a*x-b)**2)
    sd=numpy.sqrt(sd)

    return sd
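
A hedged usage example on synthetic data:

import numpy

x = numpy.linspace(0., 10., 50)
y = 3. * x + 1. + numpy.random.randn(50)
sd = scatterfit(x, y)  # mean deviation about the best-fit line, ~1.0 here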
stats.py (project: nmmn, author: rsnemmen)
def r2p(r,n):
    """
Given a value of the Pearson r coefficient, this method gives the
corresponding p-value of the null hypothesis (i.e. no correlation).

Code copied from scipy.stats.pearsonr source.

Usage:

>>> p=r2p(r,n)

where 'r' is the Pearson coefficient and n is the number of data
points.

:returns: p-value of the null hypothesis.
    """
    df = n-2
    if abs(r) == 1.0:
        prob = 0.0
    else:
        t_squared = r*r * (df / ((1.0 - r) * (1.0 + r)))
        # scipy.stats.betai was removed in SciPy 1.0; the regularized
        # incomplete beta function in scipy.special is the equivalent.
        from scipy import special
        prob = special.betainc(0.5*df, 0.5, df / (df + t_squared))

    return prob
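
A hedged cross-check against scipy's own two-sided p-value:

import numpy as np
import scipy.stats

x = np.random.randn(50)
y = x + np.random.randn(50)
r, p = scipy.stats.pearsonr(x, y)
assert np.isclose(p, r2p(r, len(x)))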
test_graynet.py (project: graynet, author: raamana)
def test_run_roi_stats_via_API():
    "Tests whether roi stats can be computed (not their accuracy) and the return values match in size."

    summary_methods = ['median', 'mean', 'std', 'variation', 'entropy', 'skew', 'kurtosis']
    # 'mode' returns more than one value; 'gmean' requires only positive values,
    # 'hmean' can not always be computed
    from scipy.stats import trim_mean, kstat
    from functools import partial
    trimmed_mean = partial(trim_mean, proportiontocut=0.05)
    third_kstat = partial(kstat, n=3)

    summary_methods.extend([trimmed_mean, third_kstat])
    # checking support for nan-handling callables
    summary_methods.extend([np.nanmedian, np.nanmean])

    for summary_method in summary_methods:
        roi_medians = graynet.roiwise_stats_indiv(subject_id_list, fs_dir, base_feature=base_feature,
                                                  chosen_roi_stats=summary_method, atlas=atlas,
                                                  smoothing_param=fwhm, out_dir=out_dir, return_results=True)
        for sub in subject_id_list:
            if roi_medians[sub].size != num_roi_wholebrain:
                raise ValueError('invalid summary stats - #nodes do not match.')
models.py (project: pyprocessmacro, author: QuentinAndre)
def coeff_summary(self):
        """
        Get the estimates of the terms in the model.
        :return: A DataFrame of betas, se, t (or z), p, llci, ulci for all variables of the model.
        """
        results = self.estimation_results
        if results:
            if "t" in results.keys():  # Model has t-stats rather than z-stats
                coeffs = np.array(
                    [results["betas"], results["se"], results["t"], results["p"], results["llci"], results["ulci"]]).T
                df = pd.DataFrame(coeffs, index=results["names"],
                                  columns=["coeff", "se", "t", "p", "LLCI", "ULCI"])
            else:  # Model has z-stats.
                coeffs = np.array(
                    [results["betas"], results["se"], results["z"], results["p"], results["llci"], results["ulci"]]).T
                df = pd.DataFrame(coeffs, index=results["names"],
                                  columns=["coeff", "se", "Z", "p", "LLCI", "ULCI"])
        else:
            raise NotImplementedError(
                "The model has not been estimated yet. Please estimate the model first."
            )
        return df
models.py (project: pyprocessmacro, author: QuentinAndre)
def _estimate(self):
        """
        Estimate the direct effect of X on Y, and return the results as a dictionary.
        :return: dict
            A dictionary of parameters and model estimates.
        """
        mod_values = [i for i in product(*self._moderators_values)]
        mod_symb = self._moderators_symb
        betas, se, llci, ulci = self._get_conditional_direct_effects(mod_symb, mod_values)
        t = betas / se
        if self._is_logit:
            p = stats.norm.sf(np.abs(t)) * 2
        else:
            df_e = self._model.estimation_results["df_e"]
            p = stats.t.sf(np.abs(t), df_e) * 2
        estimation_results = {"betas": betas,
                              "se": se,
                              "t": t,
                              "p": p,
                              "llci": llci,
                              "ulci": ulci}
        return estimation_results
pohmm.py (project: pohmm, author: vmonaco)
def _compute_log_likelihood(self, obs, pstates_idx):

        q = np.zeros(shape=(len(obs), self.n_hidden_states, self.n_features))

        for col, (feature_name, feature_distr) in enumerate(zip(self.emission_name, self.emission_distr)):
            if feature_distr == 'normal':
                mu = self.emission[feature_name]['mu'][pstates_idx]
                sigma = self.emission[feature_name]['sigma'][pstates_idx]
                for j in range(self.n_hidden_states):
                    q[:, j, col] = np.log(
                        np.maximum(MIN_PROBA, stats.norm.pdf(obs[:, col], loc=mu[:, j], scale=sigma[:, j])))
            if feature_distr == 'lognormal':
                logmu = self.emission[feature_name]['logmu'][pstates_idx]
                logsigma = self.emission[feature_name]['logsigma'][pstates_idx]
                for j in range(self.n_hidden_states):
                    q[:, j, col] = np.log(np.maximum(MIN_PROBA,
                                                     stats.lognorm.pdf(obs[:, col], logsigma[:, j], loc=0,
                                                                       scale=np.exp(logmu[:, j]))))

        q = q.sum(axis=2)
        return q
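
The lognormal branch above relies on scipy's parameterization, where the shape argument is the log-space sigma and scale = exp(logmu). A hedged check against the textbook density:

import numpy as np
from scipy import stats

logmu, logsigma, x = 0.5, 0.3, 2.0
manual = np.exp(-(np.log(x) - logmu) ** 2 / (2 * logsigma ** 2)) \
    / (x * logsigma * np.sqrt(2 * np.pi))
assert np.isclose(manual, stats.lognorm.pdf(x, logsigma, loc=0,
                                            scale=np.exp(logmu)))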
test_stats.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def test_rank_methods_series(self):
        tm.skip_if_no_package('scipy', '0.13', 'scipy.stats.rankdata')
        import scipy
        from scipy.stats import rankdata

        xs = np.random.randn(9)
        xs = np.concatenate([xs[i:] for i in range(0, 9, 2)])  # add duplicates
        np.random.shuffle(xs)

        index = [chr(ord('a') + i) for i in range(len(xs))]

        for vals in [xs, xs + 1e6, xs * 1e-6]:
            ts = Series(vals, index=index)

            for m in ['average', 'min', 'max', 'first', 'dense']:
                result = ts.rank(method=m)
                sprank = rankdata(vals, m if m != 'first' else 'ordinal')
                expected = Series(sprank, index=index)

                if LooseVersion(scipy.__version__) >= '0.17.0':
                    expected = expected.astype('float64')
                tm.assert_series_equal(result, expected)
test_analytics.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def test_skew(self):
        tm._skip_if_no_scipy()

        from scipy.stats import skew
        alt = lambda x: skew(x, bias=False)
        self._check_stat_op('skew', alt)

        # test corner cases, skew() returns NaN unless there's at least 3
        # values
        min_N = 3
        for i in range(1, min_N + 1):
            s = Series(np.ones(i))
            df = DataFrame(np.ones((i, i)))
            if i < min_N:
                self.assertTrue(np.isnan(s.skew()))
                self.assertTrue(np.isnan(df.skew()).all())
            else:
                self.assertEqual(0, s.skew())
                self.assertTrue((df.skew() == 0).all())
test_analytics.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def test_kurt(self):
        tm._skip_if_no_scipy()

        from scipy.stats import kurtosis
        alt = lambda x: kurtosis(x, bias=False)
        self._check_stat_op('kurt', alt)

        index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]],
                           labels=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2],
                                   [0, 1, 0, 1, 0, 1]])
        s = Series(np.random.randn(6), index=index)
        self.assertAlmostEqual(s.kurt(), s.kurt(level=0)['bar'])

        # test corner cases, kurt() returns NaN unless there's at least 4
        # values
        min_N = 4
        for i in range(1, min_N + 1):
            s = Series(np.ones(i))
            df = DataFrame(np.ones((i, i)))
            if i < min_N:
                self.assertTrue(np.isnan(s.kurt()))
                self.assertTrue(np.isnan(df.kurt()).all())
            else:
                self.assertEqual(0, s.kurt())
                self.assertTrue((df.kurt() == 0).all())
goftest.py (project: kernel-gof, author: wittawatj)
def perform_test(self, dat):
        """
        dat: an instance of Data
        """
        with util.ContextTimer() as t:
            alpha = self.alpha
            X = dat.data()
            n = X.shape[0]

            # H: length-n vector
            _, H = self.compute_stat(dat, return_pointwise_stats=True)
            test_stat = np.sqrt(n/2)*np.mean(H)
            stat_var = np.mean(H**2)
            pvalue = stats.norm.sf(test_stat, loc=0, scale=np.sqrt(stat_var))

        results = {'alpha': self.alpha, 'pvalue': pvalue, 'test_stat': test_stat,
                 'h0_rejected': pvalue < alpha, 'time_secs': t.secs, 
                 }
        return results
stats.py (project: SynBioMTS, author: reisalex)
def correlation(x,y,name="Pearson"):
    # Linear or rank correlation
    # Returns:
    #   r,rho, or tau (float) = the test statistic
    #   pvalue (float) = the p-value for the test

    assert len(x)==len(y), "Arrays x & y must be equal length."

    if name == "Pearson":
        (r,pvalue) = scipy.stats.pearsonr(x,y)
        return r,pvalue

    elif name == "Spearman":
        (rho,pvalue) = scipy.stats.spearmanr(x,y)
        return rho,pvalue

    elif name == "Kendall":
        (tau,pvalue) = scipy.stats.kendalltau(x,y)
        return tau,pvalue

    else:
        error = ("The {} correlation is not available."
                 "Please use 'Pearson', 'Spearman', or 'Kendall.'")
        error.format(name)
        raise ValueError(error)
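
A hedged usage example on toy data:

import numpy as np

x = np.arange(20.0)
y = 2 * x + np.random.randn(20)
r, p_r = correlation(x, y, name="Pearson")
rho, p_s = correlation(x, y, name="Spearman")
tau, p_k = correlation(x, y, name="Kendall")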
cluster_level.py (project: decoding_challenge_cortana_2016_3rd, author: kingjr)
def _pval_from_histogram(T, H0, tail):
    """Get p-values from stats values given an H0 distribution

    For each stat compute a p-value as percentile of its statistics
    within all statistics in surrogate data
    """
    if tail not in [-1, 0, 1]:
        raise ValueError('invalid tail parameter')

    if tail == -1:  # lower tail: fraction of surrogates at or below t
        pval = np.array([np.sum(H0 <= t) for t in T])
    elif tail == 1:  # upper tail: fraction of surrogates at or above t
        pval = np.array([np.sum(H0 >= t) for t in T])
    else:  # both tails
        pval = np.array([np.sum(abs(H0) >= abs(t)) for t in T])

    pval = (pval + 1.0) / (H0.size + 1.0)  # the initial (observed) data counts as one resampling
    return pval
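
A hedged usage sketch with a toy surrogate (null) distribution:

import numpy as np

rng = np.random.RandomState(0)
H0 = rng.randn(999)       # surrogate statistics under the null
T = np.array([2.5, 0.1])  # observed statistics
pvals = _pval_from_histogram(T, H0, tail=0)  # two-sided, roughly [0.013, 0.92]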
iop.py (project: blackbox, author: wrwrwr)
def instantiate_dist(name, *opts):
    """
    Create a frozen scipy.stats distribution from command-line arguments:
    the distribution name, then optional loc, scale, and shape parameters.
    """
    try:
        dist = getattr(stats, name)
    except AttributeError:
        raise ValueError("No such distribution {}".format(name))
    opts = list(opts)
    try:
        loc = float(opts.pop(0))
    except IndexError:
        loc = 0
    try:
        scale = float(opts.pop(0))
    except IndexError:
        scale = 1
    return dist(*map(float, opts), loc=loc, scale=scale)
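
Hedged usage, mirroring the argument order the parser implies (name, then loc, scale, and any shape parameters) and assuming scipy.stats is imported as stats at module level:

frozen = instantiate_dist('norm', '0', '2')                # N(0, 2)
print(frozen.ppf(0.975))                                   # ~3.92
frozen_gamma = instantiate_dist('gamma', '0', '1', '2.5')  # gamma, shape a=2.5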
distributions.py (project: siHMM, author: Ardavans)
def resample(self,data=[],temperature=1.,stats=None):
        stats = self._get_statistics(data) if stats is None else stats

        alphas_n, betas_n, mu_n, nus_n = self._natural_to_standard(
                self.natural_hypparam + stats / temperature)

        D = mu_n.shape[0]
        self.sigmas = 1/np.random.gamma(alphas_n,scale=1/betas_n)
        self.mu = np.sqrt(self.sigmas/nus_n)*np.random.randn(D) + mu_n

        assert not np.isnan(self.mu).any()
        assert not np.isnan(self.sigmas).any()

        # NOTE: next line is to use Gibbs sampling to initialize mean field
        self.mf_mu = self.mu

        assert self.sigmas.ndim == 1
        return self
distributions.py (project: siHMM, author: Ardavans)
def max_likelihood(self,data,weights=None,stats=None):
        if stats is not None:
            n, tot = stats
        elif weights is None:
            n, tot = super(NegativeBinomialIntegerR,self)._get_statistics(data)
        else:
            n, tot = super(NegativeBinomialIntegerR,self)._get_weighted_statistics(data,weights)

        if n > 1:
            rs = self.r_support
            ps = self._max_likelihood_ps(n,tot,rs)

            # TODO TODO this isn't right for weighted data: do weighted sums
            if isinstance(data,np.ndarray):
                likelihoods = np.array([self.log_likelihood(data,r=r,p=p).sum()
                                            for r,p in zip(rs,ps)])
            else:
                likelihoods = np.array([sum(self.log_likelihood(d,r=r,p=p).sum()
                                            for d in data) for r,p in zip(rs,ps)])

            argmax = likelihoods.argmax()
            self.r = self.r_support[argmax]
            self.p = ps[argmax]
        return self
analyzeAngle.py (project: livespin, author: biocompibens)
def bootstrap_extradata(self, nBoot, extradataA, nbins=20):
        pops = []
        meanpop = [[] for i in data.cat]
        pylab.figure(figsize=(14, 14))
        for i in range(min(4, len(extradataA))):
            if i == 0:
                pylab.title("Bootstrap on means", fontsize=20.)
            pop = extradataA[i]
            for index in range(nBoot):
                # resample the population with replacement
                newpop = np.random.choice(pop, size=len(pop), replace=True)
                #meanpop[i].append(np.mean(newpop))
            pops.append(newpop)
            pylab.legend()
            pylab.xlabel("Angle(degree)", fontsize=15)
            pylab.xlim([0., 90.])
        # Welch's t-test between each pair of bootstrapped populations
        for i in range(len(extradataA)):
            for j in range(i + 1, len(extradataA)):
                statT, pvalue = scipy.stats.ttest_ind(pops[i], pops[j], equal_var=False)
                print("cat{0} & cat{1} get {2} ({3})".format(i, j, pvalue, statT))
        pylab.savefig("/users/biocomp/frose/frose/Graphics/FINALRESULTS-diff-f3/mean_nBootstrap{0}_bins{1}_GFPsup{2}_FLO_{3}.png".format(nBoot, nbins, 'all', randint(0, 999)))
statistics_noise.py (project: privcount, author: privcount)
def satisfies_dp(sensitivity, epsilon, delta, std):
    '''
    Return True if (epsilon, delta)-differential privacy is satisfied.
    '''
    # find lowest value at which epsilon differential-privacy is satisfied
    lower_x = -(float(epsilon) * (std**2.0) / sensitivity) + sensitivity/2.0
    # determine lower tail probability of normal distribution w/ mean of zero
    lower_tail_prob = scipy.stats.norm.cdf(lower_x, 0, std)
    # explicitly return Boolean value to avoid returning numpy type
    if (lower_tail_prob <= delta):
        return True
    else:
        return False
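
A hedged example with hypothetical parameters: for a counter with sensitivity 1, Gaussian noise with std 12 passes this check for (epsilon=0.3, delta=1e-3), since the lower-tail probability is roughly 1.9e-4.

print(satisfies_dp(sensitivity=1.0, epsilon=0.3, delta=1e-3, std=12.0))  # True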

