def anova(data):
    """Report whether at least one group mean differs from the others.

    Runs a one-way ANOVA (``scipy.stats.f_oneway``) over the groups in
    ``data``, prints the F statistic and the p-value, and compares the
    p-value against a fixed 0.05 significance level.

    https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.f_oneway.html

    Args:
        data: Indexable container of two or more samples, one per group.

    Returns:
        True if the p-value is below 0.05, otherwise False.

    Raises:
        ValueError: If fewer than two groups are supplied.
    """
    n_groups = len(data)
    if n_groups < 2:
        raise ValueError(
            "ANOVA requires at least two groups, got " + str(n_groups))
    # f_oneway is variadic, so any number of groups can be forwarded at once.
    # Groups are fetched by position (data[i]) rather than by iterating data,
    # so containers whose iteration differs from indexing behave as before.
    groups = [data[i] for i in range(n_groups)]
    statistic, pvalue = stats.f_oneway(*groups)
    print("ANOVA Statistic " + str(statistic) + " and p-value " + str(pvalue))
    return bool(pvalue < 0.05)
# Example source code for Python's f_oneway()
def _p_test(self,v,grouped_data,is_continuous,is_categorical,
is_normal,min_observed,catlevels,
pval=np.nan,ptest='Not tested'):
"""
Compute p value
"""
# do not test if any sub-group has no observations
if min_observed == 0:
warnings.warn('No p-value was computed for {} due to the low number of observations.'.format(v))
return pval,ptest
# continuous
if is_continuous and is_normal:
# normally distributed
ptest = 'One-way ANOVA'
test_stat, pval = stats.f_oneway(*grouped_data)
elif is_continuous and not is_normal:
# non-normally distributed
ptest = 'Kruskal-Wallis'
test_stat, pval = stats.kruskal(*grouped_data)
# categorical
elif is_categorical:
# default to chi-squared
ptest = 'Chi-squared'
chi2, pval, dof, expected = stats.chi2_contingency(grouped_data)
# if any expected cell counts are < 5, chi2 may not be valid
# if this is a 2x2, switch to fisher exact
if expected.min() < 5:
if grouped_data.shape == (2,2):
ptest = 'Fisher''s exact'
oddsratio, pval = stats.fisher_exact(grouped_data)
else:
ptest = 'Chi-squared (warning: expected count < 5)'
warnings.warn('No p-value was computed for {} due to the low number of observations.'.format(v))
return pval,ptest
def anova(data):
    """Run a one-way ANOVA for every column, grouping rows by index level 1.

    For each column of ``data`` the rows are split by the second index
    level and the resulting groups are passed to ``f_oneway``.

    Returns a DataFrame indexed by the column names of ``data`` with the
    columns ``f`` and ``p`` holding the two values returned by ``f_oneway``.

    Raises ``Exception`` when the second index level yields two groups or
    fewer.
    """
    if not len(data.groupby(level=1)) > 2:
        raise Exception('ANOVA requires a secondary index with three or more values')

    results = []
    for column in data.columns:
        # One sample per value of the secondary index level.
        samples = [group for _, group in data[column].groupby(level=1)]
        results.append(f_oneway(*samples))

    return pd.DataFrame(results, index=data.columns, columns=['f', 'p'])
def main():
    """Compare two fixed samples with the Wilcoxon rank-sum test and ANOVA.

    Prints the p-value of each test on a single line and returns nothing.

    Samples labelled "1st phase", kept here for reference only:
        top1 = [70.0, 71.1, 72.5, 70.8, 68.1, 71.9, 71.1, 71.3, 68.4, 70.2]
        top3 = [75.8, 78.4, 77.8, 77.7, 80.0, 77.8, 78.7, 76.4, 79.1, 77.3]
    The samples used below are the ones labelled "2nd phase".
    """
    x = [53.6, 54.5, 53.7, 52.7, 53.1, 55.5, 55.5, 52.8, 53.7, 52.7]
    y = [89.7, 89.1, 89.5, 88.7, 89.4, 88.6, 89.8, 89.5, 89.2, 89.7]
    # Compute the Wilcoxon rank-sum statistic for two samples.
    rank_sum = stats.ranksums(x, y)
    one_way = stats.f_oneway(x, y)
    # Index 1 of each result is the p-value. print() with one parenthesised
    # argument produces the same output on Python 2 and Python 3.
    print("Wilcoxon: " + str(rank_sum[1]) + "; ANOVA: " + str(one_way[1]))
def anovaTest(nAlgorithms,hyperVolumeList):
    """Run a one-way ANOVA on every unordered pair of algorithms.

    Args:
        nAlgorithms: Number of leading entries of ``hyperVolumeList`` to
            compare.
        hyperVolumeList: Indexable container holding one sample per
            algorithm.

    Returns:
        List of ``stats.f_oneway`` results, ordered (0, 1), (0, 2), ...,
        (1, 2), ... by algorithm position. Empty when fewer than two
        algorithms are requested. The list is also printed.
    """
    # Convert every sample once instead of once per pair.
    samples = [np.array(hyperVolumeList[i]) for i in range(nAlgorithms)]
    results = []
    for i in range(nAlgorithms):
        for j in range(i + 1, nAlgorithms):
            results.append(stats.f_oneway(samples[i], samples[j]))
    # print() with a single argument behaves the same on Python 2 and 3.
    print('esto es anova')
    print(results)
    return results
def test_f_oneway_vs_scipy_stats():
    """Check that the f_oneway in scope agrees with scipy.stats.f_oneway.

    Two seeded random (10, 3) samples, the second shifted by 1, are passed
    to both implementations; the F values and p-values must be close.
    """
    random_state = np.random.RandomState(0)
    sample_a = random_state.randn(10, 3)
    sample_b = 1 + random_state.randn(10, 3)

    expected_f, expected_p = stats.f_oneway(sample_a, sample_b)
    actual_f, actual_p = f_oneway(sample_a, sample_b)

    for expected, actual in ((expected_f, actual_f), (expected_p, actual_p)):
        assert_true(np.allclose(expected, actual))
def test_f_oneway_ints():
    """Smoke test: f_oneway accepts integer input without casting errors.

    Runs f_oneway on seeded random integer data and checks that the result
    matches, to four decimals, the one obtained after casting to float.
    """
    rng = np.random.RandomState(0)
    X = rng.randint(10, size=(10, 10))
    y = np.arange(10)
    fint, pint = f_oneway(X, y)
    # Test that it gives the same result as with float input. The builtin
    # float is used because the np.float alias was removed in NumPy 1.24.
    f, p = f_oneway(X.astype(float), y)
    assert_array_almost_equal(f, fint, decimal=4)
    assert_array_almost_equal(p, pint, decimal=4)
def smart_hypothesis_testing(*samples, **options):
    """Do a smart hypothesis testing.

    When every sample passes the normality check and equal variances are
    assumed, runs a one-way ANOVA and, if requested, writes a textual
    report of the hypotheses, assumptions and outcome.

    Args:
        *samples: The samples to compare; each is converted to a float array.
        **options: Optional settings:
            fancy (default True): write the textual report.
            out (default sys.stdout): destination handed to ``_flush_output``.
            alpha (default 0.05): significance level.
            equal_var (default True): assume equal variances.
            latex (default True): forwarded to ``_flush_output``.

    Returns:
        ``(statistic, pvalue, f_oneway)`` -- the ANOVA result together with
        the test function that produced it.

    NOTE(review): only the "all samples normal and equal_var" path is
    visible here. Any other combination falls through and returns None;
    confirm whether further branches belong to this function.
    """
    fancy = options.get('fancy', True)
    out = options.get('out', sys.stdout)
    alpha = options.get('alpha', 0.05)
    equal_var = options.get('equal_var', True)
    latex = options.get('latex', True)
    samples = [np.array(sample, dtype='float') for sample in samples]
    len_samples = len(samples)
    out_buffer = StringIO()
    normality_results = samples_are_normal(*samples)
    # The first item of each normality result is read as the "is normal" flag.
    if all(map(itemgetter(0), normality_results)):
        # all our samples are normal
        if equal_var:
            if fancy:
                # Two passes: Template fills the $name symbols from
                # GREEK_ALPHABET, then str.format fills the {name} fields
                # from the local variables.
                out_buffer.write(Template(
                    u"Hypothesis testing:\n\n"
                    "\t$H0: ${mu}1 = ${mu}2{ellipsis} = $mu{len_samples}. "
                    "The means for all groups are equal.\n"
                    "\t$H1: $exists a,b $elementof Samples: ${mu}a $neq ${mu}b. "
                    "At least two of the means are not equal.\n\n"
                    "The significance test one-way analysis of variance (ANOVA) "
                    "was used with a significance level of $alpha={alpha:.2f}.\n"
                    "This test requires that the following "
                    "assumptions are satisfied:\n\n"
                    "1. Samples are independent.\n"
                    "2. Samples are drawn from a normally distributed population.\n"
                    "3. All populations have equal standard deviation.\n\n"
                    "For the assumption of normal distribution two tests were "
                    "performed ($alpha={alpha}): Shapiro Wilk's test "
                    "and D'Agostino and Pearson's test.\n"
                    "None of these tests reject the null hypothesis with "
                    "significance level of $alpha={alpha}, thus it is assumed that data "
                    "follows a normal distribution.\n\n"
                    "").substitute(GREEK_ALPHABET).format(
                        ellipsis=" = ..." if len_samples > 3 else "",
                        **locals()
                    ))
            statistic, pvalue = f_oneway(*samples)
            if fancy:
                if pvalue < alpha:
                    out_buffer.write(
                        u"One can say that samples come from populations "
                        "with different means, since ANOVA rejects the "
                        "null hypothesis "
                        "(statistic={statistic:.2f}, {pvalue_str}).\n"
                        "".format(pvalue_str=_pvalue_to_str(pvalue), **locals())
                    )
                else:
                    # The p-value is rendered with _pvalue_to_str, as in the
                    # rejecting branch. The previous "{rho}" field had no
                    # matching local variable, so format() raised KeyError.
                    out_buffer.write(
                        u"Thus, it was not possible to find evidence that"
                        " the means of populations are different "
                        "(statistic={statistic:.2f}, {pvalue_str}).\n"
                        "".format(pvalue_str=_pvalue_to_str(pvalue), **locals())
                    )
            _flush_output(out, out_buffer, latex)
            return statistic, pvalue, f_oneway
def feature_importance_regression(features, target, n_neighbors=3, random_state=None):
    """Score features against a regression target, split by dtype.

    Floating-point columns get the Pearson correlation (r and its p-value)
    and mutual information. Integer and boolean columns get a one-way
    ANOVA F-test, with the target grouped by the feature's values, and
    mutual information computed with ``discrete_features=True``.

    Args:
        features: DataFrame of candidate features.
        target: Target values; indexed with the row labels of each feature
            group, so it presumably shares the index of ``features``
            (TODO confirm against callers).
        n_neighbors: Forwarded to ``mutual_info_regression``.
        random_state: Forwarded to ``mutual_info_regression``.

    Returns:
        ``(cont_imp, disc_imp)``: two DataFrames indexed by feature name,
        one for the continuous and one for the discrete features. A frame
        has no columns when there are no features of its kind.
    """
    cont = features.select_dtypes(include=[np.floating])
    # Builtin bool replaces the np.bool alias, which was removed in NumPy 1.24.
    disc = features.select_dtypes(include=[np.integer, bool])
    cont_imp = pd.DataFrame(index=cont.columns)
    disc_imp = pd.DataFrame(index=disc.columns)

    # Continuous features
    if cont_imp.index.size > 0:
        # Pearson correlation. DataFrame.items() replaces iteritems(),
        # which was removed in pandas 2.0.
        pearson = np.array([stats.pearsonr(feature, target) for _, feature in cont.items()])
        cont_imp['pearson_r'] = pearson[:, 0]
        cont_imp['pearson_r_p_value'] = pearson[:, 1]
        # Mutual information
        mut_inf = feature_selection.mutual_info_regression(cont, target, discrete_features=False,
                                                           n_neighbors=n_neighbors,
                                                           random_state=random_state)
        cont_imp['mutual_information'] = mut_inf

    # Discrete features
    if disc_imp.index.size > 0:
        # F-test: one target sample per distinct value of the feature
        f_tests = {}
        for feature in disc.columns:
            groups = [target[idxs] for idxs in disc.groupby(feature).groups.values()]
            statistic, p_value = stats.f_oneway(*groups)
            f_tests[feature] = {'f_statistic': statistic, 'f_p_value': p_value}
        f_tests_df = pd.DataFrame.from_dict(f_tests, orient='index')
        # Column assignment aligns on the feature-name index.
        disc_imp['f_statistic'] = f_tests_df['f_statistic']
        disc_imp['f_p_value'] = f_tests_df['f_p_value']
        # Mutual information
        mut_inf = feature_selection.mutual_info_regression(disc, target, discrete_features=True,
                                                           n_neighbors=n_neighbors,
                                                           random_state=random_state)
        disc_imp['mutual_information'] = mut_inf

    return cont_imp, disc_imp