import numpy as np
from numpy.testing import assert_almost_equal, assert_equal
from sklearn.metrics import cohen_kappa_score


def test_cohen_kappa():
# These label vectors reproduce the contingency matrix from Artstein and
# Poesio (2008), Table 1: np.array([[20, 20], [10, 50]]).
y1 = np.array([0] * 40 + [1] * 60)
y2 = np.array([0] * 20 + [1] * 20 + [0] * 10 + [1] * 50)
kappa = cohen_kappa_score(y1, y2)
assert_almost_equal(kappa, .348, decimal=3)
assert_equal(kappa, cohen_kappa_score(y2, y1))
# Add spurious labels and ignore them.
y1 = np.append(y1, [2] * 4)
y2 = np.append(y2, [2] * 4)
assert_equal(cohen_kappa_score(y1, y2, labels=[0, 1]), kappa)
assert_almost_equal(cohen_kappa_score(y1, y1), 1.)
# Multiclass example: Artstein and Poesio, Table 4.
y1 = np.array([0] * 46 + [1] * 44 + [2] * 10)
y2 = np.array([0] * 52 + [1] * 32 + [2] * 16)
assert_almost_equal(cohen_kappa_score(y1, y2), .8013, decimal=4)
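# --- Not part of the original test: a minimal hand computation, assuming only
# numpy, showing where the 0.348 value above comes from. Kappa is
# (p_observed - p_expected) / (1 - p_expected), with p_expected taken from the
# two raters' marginal distributions.
import numpy as np

contingency = np.array([[20, 20], [10, 50]], dtype=float)  # Artstein & Poesio, Table 1
n = contingency.sum()

p_observed = np.trace(contingency) / n               # (20 + 50) / 100 = 0.70
rater1_marginals = contingency.sum(axis=1) / n       # [0.4, 0.6]
rater2_marginals = contingency.sum(axis=0) / n       # [0.3, 0.7]
p_expected = rater1_marginals @ rater2_marginals     # 0.4*0.3 + 0.6*0.7 = 0.54

kappa = (p_observed - p_expected) / (1 - p_expected)
print(round(kappa, 3))                               # 0.348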
def print_metrics_regression(y_true, predictions, verbose=1):
predictions = np.array(predictions)
predictions = np.maximum(predictions, 0).flatten()
y_true = np.array(y_true)
y_true_bins = [get_bin_custom(x, CustomBins.nbins) for x in y_true]
prediction_bins = [get_bin_custom(x, CustomBins.nbins) for x in predictions]
cf = metrics.confusion_matrix(y_true_bins, prediction_bins)
if verbose:
print "Custom bins confusion matrix:"
print cf
kappa = metrics.cohen_kappa_score(y_true_bins, prediction_bins,
weights='linear')
mad = metrics.mean_absolute_error(y_true, predictions)
mse = metrics.mean_squared_error(y_true, predictions)
mape = mean_absolute_percentage_error(y_true, predictions)
if verbose:
print "Mean absolute deviation (MAD) =", mad
print "Mean squared error (MSE) =", mse
print "Mean absolute percentage error (MAPE) =", mape
print "Cohen kappa score =", kappa
return {"mad": mad,
"mse": mse,
"mape": mape,
"kappa": kappa}
import numpy as np
from sklearn.metrics import cohen_kappa_score


def _kappa_helper(y_true, y_pred, weights=None):
# weights can be None, 'linear', or 'quadratic'
def flatten(y):
if len(y.shape) > 1 and y.shape[1] > 1:
y = np.argmax(y, axis=1)
y = y.reshape(-1)
return y
y_true = flatten(y_true)
y_pred = flatten(y_pred)
return cohen_kappa_score(y_true, y_pred, weights=weights)
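# Illustrative call, assuming the helper above (and its numpy / sklearn imports)
# is in scope: it accepts either integer labels or one-hot / probability
# matrices, which flatten() collapses with argmax before scoring.
y_true_onehot = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1], [0, 1, 0]])
y_pred_labels = np.array([0, 1, 2, 2])

print(_kappa_helper(y_true_onehot, y_pred_labels))                       # unweighted
print(_kappa_helper(y_true_onehot, y_pred_labels, weights='quadratic'))  # quadratic weights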
def results2df(results, dataset, n_ctrl, n_case, n_features):
"""
Converts the results dictionary into tidy dataframe format.
Parameters
----------
results : dict
results from cv_and_roc function
dataset : str
dataset ID
n_ctrl, n_case, n_features : int
number of controls, cases, and features
Returns
-------
resultsdf : pandas DataFrame
dataframe with 'mean_fpr', 'mean_tpr', 'fisher_p', and 'roc_auc' columns
from the results dict, 'kappa' from
cohen_kappa_score(results['y_preds']), and 'dataset', 'H_smpls',
'dis_smpls', and 'num_features' from the input parameters
"""
    # Directly calling pd.DataFrame.from_dict doesn't work because
    # this dictionary contains arrays, matrices, etc.
resultsdf = pd.DataFrame(data=np.array((results['mean_fpr'],
results['mean_tpr'])).T,
columns=['mean_fpr', 'mean_tpr'])
resultsdf['roc_auc'] = results['roc_auc']
resultsdf['fisher_p'] = results['fisher_p']
resultsdf['dataset'] = dataset
resultsdf['H_smpls'] = n_ctrl
resultsdf['dis_smpls'] = n_case
resultsdf['num_features'] = n_features
resultsdf['kappa'] = cohen_kappa_score(
results['y_true'], results['y_preds'])
return resultsdf
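# Hypothetical minimal input (a real dict comes from the project's cv_and_roc
# function, which is not shown here); the keys below are the only ones
# results2df reads. Assumes the function above plus its pandas / numpy /
# sklearn imports are in scope.
import numpy as np

results = {
    'mean_fpr': np.linspace(0, 1, 5),
    'mean_tpr': np.array([0.0, 0.5, 0.7, 0.9, 1.0]),
    'roc_auc': 0.75,
    'fisher_p': 0.03,
    'y_true': [0, 0, 1, 1, 1],
    'y_preds': [0, 1, 1, 1, 0],
}
resultsdf = results2df(results, dataset='demo_dataset', n_ctrl=2, n_case=3, n_features=10)
print(resultsdf.head())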
def concordance(series1, series2, method, nreps=1000):
"""
Measures the concordance between two pandas Series and returns a pvalue
and measure of concordance.
Parameters
----------
series1, series2 : pandas Series
Series with matching indexes.
method : str
['fisher', 'spearman', 'kendalltau', 'empirical', 'cohen']
nreps : int
        number of repetitions to build the null. Only needed if method is
'empirical'
Returns
-------
    measure : float
        measure of concordance (the odds ratio for 'fisher', the correlation
        coefficient for 'spearman' and 'kendalltau', n_observed -
        mean(n_expected) for 'empirical', kappa for 'cohen')
    p : float
        p value of the observed concordance between series1 and series2
        (NaN for 'cohen')
"""
if method == 'fisher':
# Note: this automatically ignores any bugs which were not present
# in both series.
mat = pd.crosstab(series1, series2)
return fisher_exact(mat)
elif method == 'spearman':
return spearmanr(series1, series2)
elif method == 'kendalltau':
return kendalltau(series1, series2, nan_policy='omit')
elif method == 'empirical':
return empirical_pval(series1, series2, nreps)
elif method == 'cohen':
tmp = pd.concat((series1, series2), axis=1).dropna()
return cohen_kappa_score(tmp.iloc[:, 0], tmp.iloc[:, 1]), np.nan
else:
raise ValueError('Unknown concordance method.')
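# Illustrative call of the 'cohen' branch, assuming the function above and its
# pandas / numpy / scipy / sklearn imports are in scope. Indexes that are
# missing in either series are dropped before kappa is computed, and the
# returned p value is NaN because no p value is defined for kappa here.
import numpy as np
import pandas as pd

s1 = pd.Series([1, 0, 1, 1, np.nan, 0], index=list('abcdef'))
s2 = pd.Series([1, 0, 0, 1, 1, np.nan], index=list('abcdef'))

measure, p = concordance(s1, s2, method='cohen')
print(measure, p)   # kappa over the four shared non-missing entries, then NaN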
def evaluate_ensemble(models_dir, test_images, out_dir, rf=False):
# Get images and true classes
img_arr, y_true = imgs_by_class_to_th_array(test_images, CLASS_LABELS)
    print(img_arr.shape)
y_pred_all = []
# Load each model
for i, model_dir in enumerate(get_subdirs(models_dir)):
# Load model
        if rf:
            print("Loading CNN+RF #{}".format(i))
            model_config, rf_pkl = locate_config(model_dir)
            model = RetinaRF(model_config, rf_pkl=rf_pkl)
        else:
            print("Loading CNN #{}".format(i))
            config_file = glob(join(model_dir, '*.yaml'))[0]
            model = RetiNet(config_file).model
        # Predicted probabilities
        print("Making predictions...")
ypred_out = join(out_dir, 'ypred_{}.npy'.format(i))
if not exists(ypred_out):
y_preda = model.predict(img_arr)
np.save(ypred_out, y_preda)
else:
y_preda = np.load(ypred_out)
y_pred_all.append(y_preda)
y_pred = np.argmax(y_preda, axis=1)
kappa = cohen_kappa_score(y_true, y_pred, weights='quadratic')
confusion(y_true, y_pred, CLASS_LABELS, join(out_dir, 'confusion_split{}_k={:.3f}.png'.format(i, kappa)))
# Evaluate ensemble
y_preda_ensemble = np.mean(np.dstack(y_pred_all), axis=2)
y_pred_ensemble = np.argmax(y_preda_ensemble, axis=1)
kappa = cohen_kappa_score(y_true, y_pred_ensemble)
confusion(y_true, y_pred_ensemble, CLASS_LABELS, join(out_dir, 'confusion_ensemble_k={:.3f}.png'.format(kappa)))
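# Standalone sketch of just the ensembling step above: per-model class
# probabilities are stacked along a third axis and averaged before argmax.
# The probabilities here are made up; in the function they come from
# model.predict().
import numpy as np

y_pred_all = [
    np.array([[0.7, 0.2, 0.1], [0.1, 0.6, 0.3]]),   # model 0
    np.array([[0.5, 0.4, 0.1], [0.2, 0.2, 0.6]]),   # model 1
]

y_preda_ensemble = np.mean(np.dstack(y_pred_all), axis=2)  # (n_samples, n_classes)
y_pred_ensemble = np.argmax(y_preda_ensemble, axis=1)
print(y_pred_ensemble)   # [0 2] for these made-up probabilities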
def cohens_kappa(results, workers):
"""
Compute Cohen's Kappa on all workers that answered at least 5 HITs
:param results:
:return:
"""
answers_per_worker = { worker_id : { key : results[key][worker_id] for key in results.keys()
if worker_id in results[key] }
for worker_id in workers }
    answers_per_worker = { worker_id : answers for worker_id, answers in answers_per_worker.items()
                           if len(answers) >= 5 }
curr_workers = answers_per_worker.keys()
worker_pairs = [(worker1, worker2) for worker1 in curr_workers for worker2 in curr_workers if worker1 != worker2]
label_index = { True : 1, False : 0 }
pairwise_kappa = { worker_id : { } for worker_id in answers_per_worker.keys() }
# Compute pairwise Kappa
for (worker1, worker2) in worker_pairs:
mutual_hits = set(answers_per_worker[worker1].keys()).intersection(set(answers_per_worker[worker2].keys()))
mutual_hits = set([hit for hit in mutual_hits if not pandas.isnull(hit)])
if len(mutual_hits) >= 5:
worker1_labels = np.array([label_index[answers_per_worker[worker1][key][0]] for key in mutual_hits])
worker2_labels = np.array([label_index[answers_per_worker[worker2][key][0]] for key in mutual_hits])
curr_kappa = cohen_kappa_score(worker1_labels, worker2_labels)
if not math.isnan(curr_kappa):
pairwise_kappa[worker1][worker2] = curr_kappa
pairwise_kappa[worker2][worker1] = curr_kappa
# Remove worker answers with low agreement to others
workers_to_remove = set()
    for worker, kappas in pairwise_kappa.items():
        if np.mean(list(kappas.values())) < 0.1:
            print('Removing %s' % worker)
workers_to_remove.add(worker)
    kappa = np.mean([k for worker1 in pairwise_kappa.keys() for worker2, k in pairwise_kappa[worker1].items()
                     if worker1 not in workers_to_remove and worker2 not in workers_to_remove])
# Return the average
return kappa, workers_to_remove
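# Toy input in the shape the function expects (hypothetical worker and HIT ids),
# assuming the function above and its imports are in scope: `results` maps each
# HIT key to a {worker_id: answer} dict, and only the boolean at answer[0] is
# used for agreement.
w1_answers = [True, True, False, False, True, False]
w2_answers = [True, True, False, False, True, True]
results = {'hit%d' % i: {'w1': (a1,), 'w2': (a2,)}
           for i, (a1, a2) in enumerate(zip(w1_answers, w2_answers))}

kappa, removed = cohens_kappa(results, workers=['w1', 'w2'])
print(kappa, removed)   # pairwise kappa of ~0.667, no workers removed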