def _p_test(self, v, grouped_data, is_continuous, is_categorical,
            is_normal, min_observed, catlevels,
            pval=np.nan, ptest='Not tested'):
    """
    Compute the p-value using the appropriate statistical test.
    """
    # do not test if any sub-group has no observations
    if min_observed == 0:
        warnings.warn('No p-value was computed for {} due to the low number of observations.'.format(v))
        return pval, ptest
# continuous
if is_continuous and is_normal:
# normally distributed
ptest = 'One-way ANOVA'
test_stat, pval = stats.f_oneway(*grouped_data)
elif is_continuous and not is_normal:
# non-normally distributed
ptest = 'Kruskal-Wallis'
test_stat, pval = stats.kruskal(*grouped_data)
# categorical
elif is_categorical:
# default to chi-squared
ptest = 'Chi-squared'
chi2, pval, dof, expected = stats.chi2_contingency(grouped_data)
# if any expected cell counts are < 5, chi2 may not be valid
# if this is a 2x2, switch to fisher exact
        if expected.min() < 5:
            if grouped_data.shape == (2, 2):
                ptest = "Fisher's exact"
                oddsratio, pval = stats.fisher_exact(grouped_data)
            else:
                ptest = 'Chi-squared (warning: expected count < 5)'
                warnings.warn('The chi-squared result for {} may be unreliable due to expected cell counts below 5.'.format(v))
    return pval, ptest
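
# A minimal standalone sketch (not from the original source) of the same test
# hierarchy as _p_test above, on hypothetical toy data.
import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
group_a = rng.normal(0.0, 1.0, size=50)
group_b = rng.normal(0.5, 1.0, size=50)

# continuous and normally distributed -> one-way ANOVA
stat, pval = stats.f_oneway(group_a, group_b)

# continuous and non-normally distributed -> Kruskal-Wallis
stat, pval = stats.kruskal(group_a, group_b)

# categorical -> chi-squared, falling back to Fisher's exact when a
# 2x2 table has an expected cell count below 5
table = np.array([[1, 9], [11, 4]])
chi2, pval, dof, expected = stats.chi2_contingency(table)
if expected.min() < 5 and table.shape == (2, 2):
    oddsratio, pval = stats.fisher_exact(table)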
Example source code for Python's chi2_contingency()
def fsel(data=[]):  # (feature selection, using chi2)
    """ Returns a {feature: p-value} dict
        for the given set of (vector, label)-tuples.
    """
    import collections
    from scipy.stats import chi2_contingency as chi2
    f1 = collections.defaultdict(float)  # {label: count}
    f2 = collections.defaultdict(float)  # {feature: count}
    f3 = collections.defaultdict(float)  # {(feature, label): count}
p = {}
for v, label in data:
f1[label] += 1
for v, label in data:
for f in v:
f2[f] += 1
f3[f, label] += 1
    for f in f2:
        # 0.1 stands in for empty cells so that no row of the table sums to
        # zero (chi2_contingency raises on zero expected frequencies)
        p[f] = chi2([[f1[label] - f3[f, label] or 0.1 for label in f1],
                     [f3[f, label] or 0.1 for label in f1]])[1]
return p
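
# Hypothetical usage of fsel (not from the original source): each item is a
# (feature set, label) pair; features specific to one label get smaller
# p-values than features shared across labels.
data = [
    ({'meow', 'purr'}, 'cat'),
    ({'meow', 'hiss'}, 'cat'),
    ({'woof', 'wag'}, 'dog'),
    ({'woof', 'purr'}, 'dog'),
]
scores = fsel(data)
ranked = sorted(scores, key=scores.get)  # most label-specific features first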
def _func(self, x, size, ext_size, width):
    # assumes numpy is imported as np at module level
    obs = np.array(
        [[x.freq1, x.freq2],
         [size - x.freq1 * width, ext_size - x.freq2 * width]])
    try:
        tmp = stats.chi2_contingency(obs,
                                     lambda_="log-likelihood")
    except ValueError as e:
        # the derived cells can go negative, which chi2_contingency rejects
        print(e)
        return np.nan
    return tmp[0]
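
# Standalone illustration (hypothetical table, not from the original source):
# lambda_="log-likelihood" makes chi2_contingency return the G-test
# (log-likelihood ratio) statistic instead of Pearson's chi-squared.
import numpy as np
from scipy import stats

obs = np.array([[10, 20],
                [30, 40]])
g, p, dof, expected = stats.chi2_contingency(obs, lambda_="log-likelihood")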
def test_random_circuits(self):
local_simulator = qasm_simulator.QasmSimulator()
for circuit in self.rqg.get_circuits(format='QuantumCircuit'):
self.log.info(circuit.qasm())
compiled_circuit = openquantumcompiler.compile(circuit.qasm())
shots = 100
min_cnts = int(shots / 10)
job_pq = QuantumJob(compiled_circuit,
backend='local_projectq_simulator',
seed=1, shots=shots)
job_py = QuantumJob(compiled_circuit,
backend='local_qasm_simulator',
seed=1, shots=shots)
result_pq = pq_simulator.run(job_pq)
result_py = local_simulator.run(job_py)
counts_pq = result_pq.get_counts(result_pq.get_names()[0])
counts_py = result_py.get_counts(result_py.get_names()[0])
# filter states with few counts
            counts_pq = {key: cnt for key, cnt in counts_pq.items() if cnt > min_cnts}
            counts_py = {key: cnt for key, cnt in counts_py.items() if cnt > min_cnts}
self.log.info('local_projectq_simulator: ' + str(counts_pq))
self.log.info('local_qasm_simulator: ' + str(counts_py))
self.assertTrue(counts_pq.keys() == counts_py.keys())
states = counts_py.keys()
# contingency table
ctable = numpy.array([[counts_pq[key] for key in states],
[counts_py[key] for key in states]])
result = chi2_contingency(ctable)
self.log.info('chi2_contingency: ' + str(result))
with self.subTest():
self.assertGreater(result[1], 0.01)
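
# The core of the test above is a chi-squared homogeneity check between two
# empirical count distributions; a standalone sketch with made-up counts:
import numpy as np
from scipy.stats import chi2_contingency

counts_a = {'00': 52, '11': 48}
counts_b = {'00': 47, '11': 53}
states = sorted(counts_a.keys() & counts_b.keys())
ctable = np.array([[counts_a[s] for s in states],
                   [counts_b[s] for s in states]])
chi2, p, dof, expected = chi2_contingency(ctable)
assert p > 0.01  # the same threshold used in the test above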
def run_test(q1_pos, q2_pos, q1_neg, q2_neg):
    '''
    This method takes four parallel arrays representing a 2x2 contingency table.
    The length of these parallel arrays denotes the number of tests that will be
    run; either a chi-squared test or a Fisher exact test is run, depending on
    whether the requirements for a reliable chi-squared test are satisfied.
    Bonferroni correction is then applied by adjusting the p-values for all of
    the tests.
    We return two parallel arrays: the first holds the p-values for the tests,
    the second holds the test statistics, e.g. the chi-squared value or the
    Fisher exact odds ratio.
    '''
    inputs = [q1_pos, q2_pos, q1_neg, q2_neg]
    n = len(inputs[0])
    if not all(len(x) == n for x in inputs):
        raise ValueError("input lists must all have the same length")
    pvalues = []
    test_values = []
    for i in range(n):
        obs = np.array([[inputs[0][i], inputs[1][i]],
                        [inputs[2][i], inputs[3][i]]])
        if useFisherExact(obs):
            t, p = fisher_exact(obs)
        else:
            result = chi2_contingency(obs)
            t, p = result[0], result[1]
        pvalues.append(p)
        test_values.append(t)
    # applying Bonferroni correction: multiply each p-value by the number of
    # tests, capping at 1
    adjustedPValues = [min(1.0, float(p) * n) for p in pvalues]
    return [adjustedPValues, test_values]
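
# useFisherExact is not defined in this snippet; a plausible sketch, assuming
# the usual rule of thumb that Fisher's exact test is preferred whenever any
# expected cell count of the 2x2 table falls below 5:
import numpy as np

def useFisherExact(obs):
    obs = np.asarray(obs, dtype=float)
    expected = np.outer(obs.sum(axis=1), obs.sum(axis=0)) / obs.sum()
    return (expected < 5).any()

# Hypothetical usage: two tests over parallel arrays of cell counts. The first
# table has all expected counts >= 5 (chi-squared); the second does not (Fisher).
pvalues, test_values = run_test([20, 3], [5, 2], [10, 40], [25, 30])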
def get_p_vals(role1, champ1, single_counts=True, span=3):
    # Use a chi-squared test to calculate p-values to compare the recommendation
    # distributions for the top 3 champs vs the next few recommendations.
    champ1 = str(champ2id.get(champ1, champ1))
    p_vals = {}
    for role2 in recs[tier][role1][champ1]:
        # skip the bookkeeping keys before creating entries for them
        if role2 == 'TOTAL' or role2 == 'DATA':
            continue
        p_vals[role2] = {}
        for idx in range(1, 4):
values = []
for pos_to_compare in range(idx+1,idx+1+span):
# Get ids from recs:
champ2_1 = str(champ2id[recs[tier][role1][champ1][role2][idx]['champ']])
champ2_2 = str(champ2id[recs[tier][role1][champ1][role2][pos_to_compare]['champ']])
# Get data:
N = recs[tier][role1][champ1][role2]['N']
if N > 10:
data = sliding_count_recs[tier][role1][champ1][role2]
champ2_1_data = np.array(data['DATA'][champ2_1] + [0]*(N-len(data['DATA'][champ2_1])))
champ2_2_data = np.array(data['DATA'][champ2_2] + [0]*(N-len(data['DATA'][champ2_2])))
                    if single_counts:
                        # binarize: count each sample at most once
                        champ2_1_data[champ2_1_data > 0] = 1
                        champ2_2_data[champ2_2_data > 0] = 1
                    contingency_mat = np.array([[sum(champ2_1_data), N - sum(champ2_1_data)],
                                                [sum(champ2_2_data), N - sum(champ2_2_data)]])
                    values.append(chi2_contingency(contingency_mat)[1])
else:
values.append(1)
p_vals[role2][idx] = values
return p_vals
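
# At the heart of the loop above, two recommendation count vectors over the
# same N samples are reduced to a 2x2 presence/absence table. A standalone
# sketch with hypothetical vectors (not from the original source):
import numpy as np
from scipy.stats import chi2_contingency

N = 100
counts_1 = np.zeros(N, dtype=int)
counts_1[:40] = 1  # champ A recommended in 40 of N samples
counts_2 = np.zeros(N, dtype=int)
counts_2[:25] = 1  # champ B recommended in 25 of N samples
table = np.array([[counts_1.sum(), N - counts_1.sum()],
                  [counts_2.sum(), N - counts_2.sum()]])
p = chi2_contingency(table)[1]  # small p -> the two distributions differ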
def cramers_v_stat(confusion_matrix):
    """Calculate Cramér's V statistic for categorical-categorical association."""
chi2 = stats.chi2_contingency(confusion_matrix)[0]
n = confusion_matrix.sum()
phi2 = chi2 / n
r, k = confusion_matrix.shape
return math.sqrt(phi2 / min((r-1), (k-1)))
def cramers_v_corrected_stat(confusion_matrix):
    """Calculate the bias-corrected Cramér's V statistic for
    categorical-categorical association, using the correction from
    Bergsma, Wicher, Journal of the Korean Statistical Society
    42 (2013): 323-328.
    """
chi2 = stats.chi2_contingency(confusion_matrix)[0]
n = confusion_matrix.sum()
phi2 = chi2 / n
r, k = confusion_matrix.shape
phi2_corr = max(0, phi2 - ((k-1)*(r-1)) / (n-1))
r_corr = r - ((r-1)**2) / (n-1)
k_corr = k - ((k-1)**2) / (n-1)
return math.sqrt(phi2_corr / min((r_corr-1), (k_corr-1)))
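
# Hypothetical usage (not from the original source): on small samples the
# bias-corrected estimate is typically smaller than the raw statistic.
import numpy as np

confusion = np.array([[10, 4],
                      [3, 11]])
v_raw = cramers_v_stat(confusion)             # ~0.43
v_corr = cramers_v_corrected_stat(confusion)  # ~0.39, shrunk by the correction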
def feature_importance_classification(features, target, n_neighbors=3, random_state=None):
    cont = features.select_dtypes(include=[np.floating])
    disc = features.select_dtypes(include=[np.integer, bool])
cont_imp = pd.DataFrame(index=cont.columns)
disc_imp = pd.DataFrame(index=disc.columns)
# Continuous features
if cont_imp.index.size > 0:
# F-test
f_test = feature_selection.f_classif(cont, target)
cont_imp['f_statistic'] = f_test[0]
cont_imp['f_p_value'] = f_test[1]
# Mutual information
mut_inf = feature_selection.mutual_info_classif(cont, target, discrete_features=False,
n_neighbors=n_neighbors,
random_state=random_state)
cont_imp['mutual_information'] = mut_inf
# Discrete features
if disc_imp.index.size > 0:
# Chi²-test
chi2_tests = defaultdict(dict)
        for feature in disc.columns:
            crosstab = pd.crosstab(disc[feature], target)
            statistic, p_value, _, _ = stats.chi2_contingency(crosstab)
            chi2_tests[feature]['chi2_statistic'] = statistic
            chi2_tests[feature]['chi2_p_value'] = p_value
chi2_tests_df = pd.DataFrame.from_dict(chi2_tests, orient='index')
disc_imp['chi2_statistic'] = chi2_tests_df['chi2_statistic']
disc_imp['chi2_p_value'] = chi2_tests_df['chi2_p_value']
# Cramér's V (corrected)
        disc_imp['cramers_v'] = [
            cramers_v_corrected_stat(pd.crosstab(feature, target).values)
            for _, feature in disc.items()
        ]
# Mutual information
mut_inf = feature_selection.mutual_info_classif(disc, target, discrete_features=True,
n_neighbors=n_neighbors,
random_state=random_state)
disc_imp['mutual_information'] = mut_inf
return cont_imp, disc_imp
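
# Hypothetical usage (not from the original source): continuous columns must
# be floats and discrete ones integers/booleans, matching the dtype-based
# split at the top of the function.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
X = pd.DataFrame({
    'age': rng.normal(40.0, 10.0, 200),  # continuous -> F-test + mutual information
    'smoker': rng.integers(0, 2, 200),   # discrete -> chi2 + Cramér's V + mutual information
})
y = pd.Series(rng.integers(0, 2, 200), name='target')
cont_imp, disc_imp = feature_importance_classification(X, y)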