def _p_test(self,v,grouped_data,is_continuous,is_categorical,
is_normal,min_observed,catlevels,
pval=np.nan,ptest='Not tested'):
"""
Compute p value
"""
# do not test if any sub-group has no observations
if min_observed == 0:
warnings.warn('No p-value was computed for {} due to the low number of observations.'.format(v))
return pval,ptest
# continuous
if is_continuous and is_normal:
# normally distributed
ptest = 'One-way ANOVA'
test_stat, pval = stats.f_oneway(*grouped_data)
elif is_continuous and not is_normal:
# non-normally distributed
ptest = 'Kruskal-Wallis'
test_stat, pval = stats.kruskal(*grouped_data)
# categorical
elif is_categorical:
# default to chi-squared
ptest = 'Chi-squared'
chi2, pval, dof, expected = stats.chi2_contingency(grouped_data)
# if any expected cell counts are < 5, chi2 may not be valid
# if this is a 2x2, switch to fisher exact
if expected.min() < 5:
if grouped_data.shape == (2,2):
ptest = 'Fisher''s exact'
oddsratio, pval = stats.fisher_exact(grouped_data)
else:
ptest = 'Chi-squared (warning: expected count < 5)'
warnings.warn('No p-value was computed for {} due to the low number of observations.'.format(v))
return pval,ptest
python类fisher_exact()的实例源码
def test_bin_fisher(intv_bin_ip, intv_bin_con, with_control=True, correction_method='fdr_bh'):
    """Score every bin of a single-row IP track against a control track.

    Each bin is compared with the rest of its row through a one-sided
    ('greater') Fisher's exact test on the 2x2 table
    ``[[ip_bin, ip_rest], [con_bin, con_rest]]``.

    Args
        intv_bin_ip: 2-D array whose first dimension must be exactly 1.
        intv_bin_con: 2-D array with the same number of columns as
            ``intv_bin_ip``; only row 0 is read.
        with_control: when True the signal of a bin is
            ``ip_bin / ip_rest / con_bin * con_rest``; otherwise it is the
            raw ``ip_bin`` value.
        correction_method: forwarded to ``multipletests`` as ``method``.
    Returns
        ``(binsignal, binscore_adj)``: the per-bin signal array and the
        second element of the ``multipletests`` result computed from the
        per-bin p-values (presumably the corrected p-values). Bins whose
        IP value is 0 get signal NaN and an uncorrected p-value of 1.0.
    Raises
        Exception: if ``intv_bin_ip`` has more than one row.
        AssertionError: if the two inputs differ in column count.
    """
    if intv_bin_ip.shape[0] != 1:
        raise Exception('Fisher exact test does not deal with replicates.')
    n_bins = intv_bin_ip.shape[1]
    assert n_bins == intv_bin_con.shape[1]

    binscore = np.empty(n_bins)
    binsignal = np.empty(n_bins)
    ip_total = np.sum(intv_bin_ip[0,])
    con_total = np.sum(intv_bin_con[0,])

    for idx in range(n_bins):
        ip_here = intv_bin_ip[0, idx]
        if ip_here == 0:
            # nothing observed in this bin: no signal, least significant p
            binsignal[idx] = np.nan
            binscore[idx] = 1.0
            continue
        ip_rest = ip_total - ip_here
        con_here = intv_bin_con[0, idx]
        con_rest = con_total - con_here
        table = [[ip_here, ip_rest], [con_here, con_rest]]
        _, binscore[idx] = fisher_exact(table, alternative='greater')
        # same left-to-right evaluation as the unparenthesised original
        binsignal[idx] = ((ip_here / ip_rest) / con_here) * con_rest if with_control else ip_here

    binscore_adj = multipletests(binscore, alpha=0.05, method=correction_method)[1]
    return binsignal, binscore_adj
def test_bin_fisher(intv_bin_ip, intv_bin_con, with_control=True, correction_method='fdr_bh'):
    """Score every bin of a single-row IP track against a control track.

    Each bin is compared with the rest of its row through a one-sided
    ('greater') Fisher's exact test on the 2x2 table
    ``[[ip_bin, ip_rest], [con_bin, con_rest]]``.

    Args
        intv_bin_ip: 2-D array whose first dimension must be exactly 1.
        intv_bin_con: 2-D array with the same number of columns as
            ``intv_bin_ip``; only row 0 is read.
        with_control: when True the signal of a bin is
            ``ip_bin / ip_rest / con_bin * con_rest``; otherwise it is the
            raw ``ip_bin`` value.
        correction_method: forwarded to ``multipletests`` as ``method``.
    Returns
        ``(binsignal, binscore_adj)``: the per-bin signal array and the
        second element of the ``multipletests`` result computed from the
        per-bin p-values (presumably the corrected p-values). Bins whose
        IP value is 0 get signal NaN and an uncorrected p-value of 1.0.
    Raises
        Exception: if ``intv_bin_ip`` has more than one row.
        AssertionError: if the two inputs differ in column count.
    """
    if intv_bin_ip.shape[0] != 1:
        raise Exception('Fisher exact test does not deal with replicates.')
    n_bins = intv_bin_ip.shape[1]
    assert n_bins == intv_bin_con.shape[1]

    binscore = np.empty(n_bins)
    binsignal = np.empty(n_bins)
    ip_total = np.sum(intv_bin_ip[0,])
    con_total = np.sum(intv_bin_con[0,])

    for idx in range(n_bins):
        ip_here = intv_bin_ip[0, idx]
        if ip_here == 0:
            # nothing observed in this bin: no signal, least significant p
            binsignal[idx] = np.nan
            binscore[idx] = 1.0
            continue
        ip_rest = ip_total - ip_here
        con_here = intv_bin_con[0, idx]
        con_rest = con_total - con_here
        table = [[ip_here, ip_rest], [con_here, con_rest]]
        _, binscore[idx] = fisher_exact(table, alternative='greater')
        # same left-to-right evaluation as the unparenthesised original
        binsignal[idx] = ((ip_here / ip_rest) / con_here) * con_rest if with_control else ip_here

    binscore_adj = multipletests(binscore, alpha=0.05, method=correction_method)[1]
    return binsignal, binscore_adj
def fisher_exact(*_args, **_kwargs):
    """Placeholder for a Fisher's exact test routine.

    Accepts any positional and keyword arguments, ignores them, and always
    raises ``NotImplementedError``.
    """
    raise NotImplementedError()
### Indices to marginals arguments:
def fisher(cls, *marginals):
    """Score a bigram with Fisher's Exact Test (Pedersen 1996).

    The marginals are converted to the four contingency cells by
    ``cls._contingency`` and the p-value of the one-sided ('less') test on
    that 2x2 table is returned. Less sensitive to small counts than PMI or
    Chi Sq, but also more expensive to compute. Requires scipy.
    """
    cells = cls._contingency(*marginals)
    n_ii, n_io, n_oi, n_oo = cells
    table = [[n_ii, n_io], [n_oi, n_oo]]
    _, p_value = fisher_exact(table, alternative='less')
    return p_value
def get_tf_associations(self, test):
    """Collect (position, term, annotation) triples that pass a Fisher test.

    Only records whose ontological class is 'body' or 'thing' are examined,
    and the annotation 'UF' is skipped. For each of the 30 positions, each
    term seen at that position and each remaining annotation, a 2x2 table
    of term presence versus annotation membership is built from the records
    of the current ontological class and tested with Fisher's exact test.

    Parameters
    ----------
    test : str
        'not dissociated' keeps a triple when the one-sided 'less' test has
        p > .05 (i.e. we cannot be sure that term and function are not
        associated); 'associated' keeps it when the one-sided 'greater'
        test has p < .05. Any other value runs no test and yields an empty
        set.

    Returns
    -------
    set of tuple
        ``(position, term, annotation)`` triples that were kept.

    Notes
    -----
    Reads ``self.ontological``, ``self.annotation`` and ``self.data``.
    # assumes these are equally long numpy arrays and that every record in
    # self.data is indexable by position 0..29, yielding a collection of
    # terms -- TODO confirm against the constructor
    """
    tf_set = set()
    for onto in set(self.ontological):
        if onto not in ['body', 'thing']:
            continue
        onto_mask = self.ontological == onto
        d_onto = self.data[onto_mask]
        # The per-annotation subsets depend on neither position nor term,
        # so build them once per ontological class instead of once per term.
        annot_subsets = [
            (annot, self.data[onto_mask * (self.annotation == annot)])
            for annot in set(self.annotation)
            if annot != 'UF'
        ]
        for li in range(30):
            terms = {w for dd in d_onto for w in dd[li]}
            for term in terms:
                # records of this class that contain the term at position li
                with_term = sum(1 for t in d_onto if term in t[li])
                for annot, d_onto_annot in annot_subsets:
                    aa = sum(1 for t in d_onto_annot if term in t[li])  # + term + function
                    ab = len(d_onto_annot) - aa                         # - term + function
                    ba = with_term - aa                                 # + term - function
                    bb = len(d_onto) - (aa + ab + ba)                   # - term - function
                    table = [[aa, ab], [ba, bb]]
                    if test == 'not dissociated':
                        keep = fisher_exact(table, alternative='less')[1] > .05
                    elif test == 'associated':
                        keep = fisher_exact(table, alternative='greater')[1] < .05
                    else:
                        keep = False
                    if keep:
                        tf_set.add((li, term, annot))
    return tf_set
association.py 文件源码
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
作者: SignalMedia
项目源码
文件源码
阅读 30
收藏 0
点赞 0
评论 0
def fisher_exact(*_args, **_kwargs):
    """Placeholder for a Fisher's exact test routine.

    Accepts any positional and keyword arguments, ignores them, and always
    raises ``NotImplementedError``.
    """
    raise NotImplementedError()
### Indices to marginals arguments:
association.py 文件源码
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
作者: SignalMedia
项目源码
文件源码
阅读 23
收藏 0
点赞 0
评论 0
def fisher(cls, *marginals):
    """Score a bigram with Fisher's Exact Test (Pedersen 1996).

    The marginals are converted to the four contingency cells by
    ``cls._contingency`` and the p-value of the one-sided ('less') test on
    that 2x2 table is returned. Less sensitive to small counts than PMI or
    Chi Sq, but also more expensive to compute. Requires scipy.
    """
    cells = cls._contingency(*marginals)
    n_ii, n_io, n_oi, n_oo = cells
    table = [[n_ii, n_io], [n_oi, n_oo]]
    _, p_value = fisher_exact(table, alternative='less')
    return p_value
def _get_fisher_scores_from_counts(self, cat_word_counts, not_cat_word_counts):
cat_not_word_counts = cat_word_counts.sum() - cat_word_counts
not_cat_not_word_counts = not_cat_word_counts.sum() - not_cat_word_counts
def do_fisher_exact(x):
return fisher_exact([[x[0], x[1]], [x[2], x[3]]], alternative='greater')
odds_ratio, p_values = np.apply_along_axis(
do_fisher_exact,
0,
np.array([cat_word_counts, cat_not_word_counts, not_cat_word_counts, not_cat_not_word_counts]))
return odds_ratio, p_values
def fisher_exact(*_args, **_kwargs):
    """Placeholder for a Fisher's exact test routine.

    Accepts any positional and keyword arguments, ignores them, and always
    raises ``NotImplementedError``.
    """
    raise NotImplementedError()
### Indices to marginals arguments:
def fisher(cls, *marginals):
    """Score a bigram with Fisher's Exact Test (Pedersen 1996).

    The marginals are converted to the four contingency cells by
    ``cls._contingency`` and the p-value of the one-sided ('less') test on
    that 2x2 table is returned. Less sensitive to small counts than PMI or
    Chi Sq, but also more expensive to compute. Requires scipy.
    """
    cells = cls._contingency(*marginals)
    n_ii, n_io, n_oi, n_oo = cells
    table = [[n_ii, n_io], [n_oi, n_oo]]
    _, p_value = fisher_exact(table, alternative='less')
    return p_value
def run_test(q1_pos, q2_pos, q1_neg, q2_neg):
    '''
    Run one significance test per 2x2 contingency table and Bonferroni-adjust
    the resulting p-values.

    The four arguments are parallel sequences; position ``i`` of each one
    forms the table ``[[q1_pos[i], q2_pos[i]], [q1_neg[i], q2_neg[i]]]``.
    The number of tests equals the length of the sequences. For each table
    either Fisher's exact test or a chi-squared test is run, depending on
    whether ``useFisherExact`` reports that the requirements for a reliable
    chi-squared test are not satisfied.

    Bonferroni correction is applied by multiplying every p-value by the
    number of tests and capping the result at 1.0. (Earlier code divided by
    the number of tests, which made results look more significant instead
    of less.)

    Returns a list of two parallel lists: the adjusted p-values, and the
    test values (the chi-squared statistic or the Fisher odds ratio).

    Raises ValueError when the four inputs do not share the same length.
    '''
    columns = [q1_pos, q2_pos, q1_neg, q2_neg]
    n = len(columns[0])
    if not all(len(col) == n for col in columns):
        raise ValueError("length of input lists must be of same length")

    pvalues = []
    test_values = []
    for a, b, c, d in zip(q1_pos, q2_pos, q1_neg, q2_neg):
        obs = np.array([[a, b], [c, d]])
        # run the chosen test once and read both outputs from its result
        if useFisherExact(obs):
            result = fisher_exact(obs)
        else:
            result = chi2_contingency(obs)
        test_values.append(result[0])
        pvalues.append(result[1])

    # Bonferroni: p_adj = min(p * n, 1). The product comes first in min()
    # so that a NaN p-value stays NaN instead of being turned into 1.0.
    adjusted_pvalues = [min(float(p) * n, 1.0) for p in pvalues]
    return [adjusted_pvalues, test_values]
def fisher_exact(*_args, **_kwargs):
    """Placeholder for a Fisher's exact test routine.

    Accepts any positional and keyword arguments, ignores them, and always
    raises ``NotImplementedError``.
    """
    raise NotImplementedError()
### Indices to marginals arguments:
def fisher(cls, *marginals):
    """Score a bigram with Fisher's Exact Test (Pedersen 1996).

    The marginals are converted to the four contingency cells by
    ``cls._contingency`` and the p-value of the one-sided ('less') test on
    that 2x2 table is returned. Less sensitive to small counts than PMI or
    Chi Sq, but also more expensive to compute. Requires scipy.
    """
    cells = cls._contingency(*marginals)
    n_ii, n_io, n_oi, n_oo = cells
    table = [[n_ii, n_io], [n_oi, n_oo]]
    _, p_value = fisher_exact(table, alternative='less')
    return p_value
def fisher_exact(*_args, **_kwargs):
    """Placeholder for a Fisher's exact test routine.

    Accepts any positional and keyword arguments, ignores them, and always
    raises ``NotImplementedError``.
    """
    raise NotImplementedError()
### Indices to marginals arguments:
def fisher(cls, *marginals):
    """Score a bigram with Fisher's Exact Test (Pedersen 1996).

    The marginals are converted to the four contingency cells by
    ``cls._contingency`` and the p-value of the one-sided ('less') test on
    that 2x2 table is returned. Less sensitive to small counts than PMI or
    Chi Sq, but also more expensive to compute. Requires scipy.
    """
    cells = cls._contingency(*marginals)
    n_ii, n_io, n_oi, n_oo = cells
    table = [[n_ii, n_io], [n_oi, n_oo]]
    _, p_value = fisher_exact(table, alternative='less')
    return p_value
def fisher_exact(*_args, **_kwargs):
    """Placeholder for a Fisher's exact test routine.

    Accepts any positional and keyword arguments, ignores them, and always
    raises ``NotImplementedError``.
    """
    raise NotImplementedError()
### Indices to marginals arguments:
def fisher(cls, *marginals):
    """Score a bigram with Fisher's Exact Test (Pedersen 1996).

    The marginals are converted to the four contingency cells by
    ``cls._contingency`` and the p-value of the one-sided ('less') test on
    that 2x2 table is returned. Less sensitive to small counts than PMI or
    Chi Sq, but also more expensive to compute. Requires scipy.
    """
    cells = cls._contingency(*marginals)
    n_ii, n_io, n_oi, n_oo = cells
    table = [[n_ii, n_io], [n_oi, n_oo]]
    _, p_value = fisher_exact(table, alternative='less')
    return p_value
def fisher_exact(*_args, **_kwargs):
    """Placeholder for a Fisher's exact test routine.

    Accepts any positional and keyword arguments, ignores them, and always
    raises ``NotImplementedError``.
    """
    raise NotImplementedError()
### Indices to marginals arguments:
def fisher(cls, *marginals):
    """Score a bigram with Fisher's Exact Test (Pedersen 1996).

    The marginals are converted to the four contingency cells by
    ``cls._contingency`` and the p-value of the one-sided ('less') test on
    that 2x2 table is returned. Less sensitive to small counts than PMI or
    Chi Sq, but also more expensive to compute. Requires scipy.
    """
    cells = cls._contingency(*marginals)
    n_ii, n_io, n_oi, n_oo = cells
    table = [[n_ii, n_io], [n_oi, n_oo]]
    _, p_value = fisher_exact(table, alternative='less')
    return p_value
def fisher_exact(*_args, **_kwargs):
    """Placeholder for a Fisher's exact test routine.

    Accepts any positional and keyword arguments, ignores them, and always
    raises ``NotImplementedError``.
    """
    raise NotImplementedError()
### Indices to marginals arguments:
def fisher(cls, *marginals):
    """Score a bigram with Fisher's Exact Test (Pedersen 1996).

    The marginals are converted to the four contingency cells by
    ``cls._contingency`` and the p-value of the one-sided ('less') test on
    that 2x2 table is returned. Less sensitive to small counts than PMI or
    Chi Sq, but also more expensive to compute. Requires scipy.
    """
    cells = cls._contingency(*marginals)
    n_ii, n_io, n_oi, n_oo = cells
    table = [[n_ii, n_io], [n_oi, n_oo]]
    _, p_value = fisher_exact(table, alternative='less')
    return p_value
def concordance(series1, series2, method, nreps=1000):
    """
    Quantify how well two pandas Series agree, using the chosen method.

    Parameters
    ----------
    series1, series2 : pandas Series
        Series with matching indexes.
    method : str
        One of 'fisher', 'spearman', 'kendalltau', 'empirical', 'cohen'.
    nreps : int
        Number of repetitions used to build the null distribution. Only
        consulted when method is 'empirical'.

    Returns
    -------
    measure : float
        A method-specific measure of concordance (e.g. r for the
        correlation methods, the kappa score for 'cohen').
    p : float
        p value of the observed concordance; NaN for 'cohen'.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if method == 'fisher':
        # Note: the cross-tabulation automatically ignores any bugs which
        # were not present in both series.
        table = pd.crosstab(series1, series2)
        return fisher_exact(table)

    if method == 'spearman':
        return spearmanr(series1, series2)

    if method == 'kendalltau':
        return kendalltau(series1, series2, nan_policy='omit')

    if method == 'empirical':
        return empirical_pval(series1, series2, nreps)

    if method == 'cohen':
        # keep only rows where both series have a value
        paired = pd.concat((series1, series2), axis=1).dropna()
        kappa = cohen_kappa_score(paired.iloc[:, 0], paired.iloc[:, 1])
        return kappa, np.nan

    raise ValueError('Unknown concordance method.')
feature_pathway_overrepresentation.py 文件源码
项目:PathCORE-T
作者: greenelab
项目源码
文件源码
阅读 29
收藏 0
点赞 0
评论 0
def single_side_pathway_enrichment(pathway_definitions,
                                   gene_signature,
                                   n_genes):
    """Test each pathway for overrepresentation in a gene signature with a
    one-sided ('greater') Fisher's exact test.
    (FDR correction for multiple testing is applied in
    `_significant_pathways_dataframe`).

    Parameters
    -----------
    pathway_definitions : dict(str -> set(str))
        Pathway definitions, *post*-overlap-correction if this function
        is called from `pathway_enrichment_with_overlap_correction`.
        A pathway (key) is defined by a set of genes (value). A value that
        is a tuple of sets is replaced by the union of those sets.
    gene_signature : set(str)
        The set of genes we consider to be enriched in a feature.
    n_genes : int
        The total number of genes for which we have assigned weights in the
        features of an unsupervised model.

    Returns
    -----------
    pandas.Series named "p-value", indexed by pathway, holding the p-value
    of the Fisher's exact test for each pathway. An empty Series is
    returned when `gene_signature` is empty.
    """
    if not gene_signature:
        return pd.Series(name="p-value")

    signature_size = len(gene_signature)
    pvalues = []
    for definition in pathway_definitions.values():
        if isinstance(definition, tuple):
            definition = set.union(*definition)

        overlap = len(definition & gene_signature)
        definition_only = len(definition) - overlap
        signature_only = signature_size - overlap
        neither = n_genes - overlap - definition_only - signature_only

        table = np.array([[overlap, signature_only],
                          [definition_only, neither]])
        try:
            _, pvalue = stats.fisher_exact(table, alternative="greater")
        except FloatingPointError:
            # FPE can occur when `neither` is very large and `overlap` is
            # very small (near zero); treat the pathway as not enriched
            pvalue = 1.0
        pvalues.append(pvalue)

    return pd.Series(
        pvalues, index=pathway_definitions.keys(), name="p-value")
def fishers_exact_plot(data, condition1, condition2, ax=None,
                       condition1_value=None,
                       alternative="two-sided", **kwargs):
    """
    Perform a Fisher's exact test to compare two binary columns, and draw a
    bar plot of the second column grouped by the first.

    Parameters
    ----------
    data: Pandas dataframe
        Dataframe to retrieve information from
    condition1: str
        First binary column to compare (and used for test sidedness)
    condition2: str
        Second binary column to compare
    ax : Axes, default None
        Axes to plot on
    condition1_value:
        If `condition1` is not a binary column, split on =/!= to condition1_value
        # NOTE(review): only the returned series use this split; the count
        # table handed to the test is built from the raw `condition1`
        # column -- confirm that is intended.
    alternative:
        Sidedness of the test. Only "two-sided" is currently accepted;
        "less" and "greater" are rejected with a ValueError.
    **kwargs:
        Forwarded to the bar plot call.

    Returns
    -------
    FishersExactResults carrying the odds ratio, the p-value, the sidedness
    string, the `condition2` values with and without condition 1, and the
    plot.

    Raises
    ------
    ValueError
        If `alternative` is anything other than "two-sided".
    """
    # Reject unsupported sidedness up front, before anything is drawn,
    # computed or printed, so a bad argument has no side effects.
    if alternative != "two-sided":
        raise ValueError("We need to better understand the one-sided Fisher's Exact test")
    sided_str = "two-sided"

    plot = sb.barplot(
        x=condition1,
        y=condition2,
        ax=ax,
        data=data,
        **kwargs
    )
    plot.set_ylabel("Percent %s" % condition2)

    condition1_mask = get_condition_mask(data, condition1, condition1_value)
    count_table = pd.crosstab(data[condition1], data[condition2])
    print(count_table)
    oddsratio, p_value = fisher_exact(count_table, alternative=alternative)

    add_significance_indicator(plot=plot, significant=p_value <= 0.05)
    only_percentage_ticks(plot)

    print("Fisher's Exact Test: OR: {}, p-value={} ({})".format(oddsratio, p_value, sided_str))
    return FishersExactResults(oddsratio=oddsratio,
                               p_value=p_value,
                               sided_str=sided_str,
                               with_condition1_series=data[condition1_mask][condition2],
                               without_condition1_series=data[~condition1_mask][condition2],
                               plot=plot)