# Assumed imports for the snippets below.
import numpy as np
from sklearn.metrics import fbeta_score


def optimise_f2_thresholds(y, p, verbose=True, resolution=100):
    # Greedy per-label threshold search for the sample-averaged F2 score
    # (this variant hard-codes 17 labels).
    def mf(x):
        p2 = np.zeros_like(p)
        for i in range(17):
            p2[:, i] = (p[:, i] > x[i]).astype(int)  # np.int is removed in recent NumPy
        score = fbeta_score(y, p2, beta=2, average='samples')
        return score

    x = [0.2] * 17
    for i in range(17):
        best_i2 = 0
        best_score = 0
        for i2 in range(resolution):
            i2 /= resolution
            x[i] = i2
            score = mf(x)
            if score > best_score:
                best_i2 = i2
                best_score = score
        x[i] = best_i2
        if verbose:
            print(i, best_i2, best_score)
    return x

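# Usage sketch (not from the original project): the function above hard-codes
# 17 labels, so the made-up arrays below have 17 columns.
rng = np.random.RandomState(0)
y_true = (rng.rand(100, 17) > 0.8).astype(int)  # hypothetical multi-label targets
probs = rng.rand(100, 17)                       # hypothetical predicted probabilities
thresholds = optimise_f2_thresholds(y_true, probs, verbose=False)
print(thresholds)  # one threshold per label, each a multiple of 1/resolution
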
def validate(net, loader, criterion):
    # Note: written for the legacy (pre-0.4) PyTorch Variable/volatile API.
    net.eval()
    running_loss = 0
    running_accuracy = 0
    targets = torch.FloatTensor(0, 17)      # accumulated labels for fscore calculation
    predictions = torch.FloatTensor(0, 17)  # accumulated network outputs
    for i, (X, y) in enumerate(loader):
        if cuda:
            X, y = X.cuda(), y.cuda()
        X, y = Variable(X, volatile=True), Variable(y)
        output = net(X)
        loss = criterion(output, y)
        acc = utils.get_multilabel_accuracy(output, y)
        targets = torch.cat((targets, y.cpu().data), 0)
        predictions = torch.cat((predictions, output.cpu().data), 0)
        running_loss += loss.data[0]
        running_accuracy += acc
    fscore = fbeta_score(targets.numpy(), predictions.numpy() > 0.23,
                         beta=2, average='samples')
    return running_loss / len(loader), running_accuracy / len(loader), fscore

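# On PyTorch >= 0.4 the same evaluation loop no longer needs Variable or
# volatile. A minimal sketch of the equivalent inference block (the names
# net, loader, criterion are the same hypothetical objects as above):
import torch

def validate_modern(net, loader, criterion, device="cpu"):
    net.eval()
    running_loss, targets, predictions = 0.0, [], []
    with torch.no_grad():                            # replaces volatile=True
        for X, y in loader:
            X, y = X.to(device), y.to(device)
            output = net(X)
            running_loss += criterion(output, y).item()  # replaces loss.data[0]
            targets.append(y.cpu())
            predictions.append(output.cpu())
    return running_loss / len(loader), torch.cat(targets), torch.cat(predictions)
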
def f2_score(y_true, y_preds):
    return fbeta_score(y_true, y_preds, beta=2, average='samples')

def get_scores(clf, X_t_train, y_train, X_t_test, y_test):
    clf.fit(X_t_train, y_train)
    app = dict()
    score = fbeta_score(y_test, clf.predict(X_t_test), beta=2, average=None)
    avg_sample_score = fbeta_score(y_test, clf.predict(X_t_test), beta=2, average='samples')
    prec_score = precision_score(y_test, clf.predict(X_t_test), average='micro')
    rec_score = recall_score(y_test, clf.predict(X_t_test), average='micro')
    avg_prec = average_precision_score(y_test, clf.predict(X_t_test))
    metrics = [score, avg_sample_score, roc_auc_score(y_test, clf.predict_proba(X_t_test))]
    #app['Classwise Scores'] = ([(mlb.classes_[l], score[l]) for l in score.argsort()[::-1]])
    app['F2 Score'] = avg_sample_score
    app['ROC_AUC'] = roc_auc_score(y_test, clf.predict_proba(X_t_test))
    app['P_AUPR'] = avg_prec
    app['Precision'] = prec_score
    app['Recall'] = rec_score
    return app

def find_f_measure_threshold2(probs, labels, num_iters=100, seed=0.21):
    _, num_classes = labels.shape[0:2]
    best_thresholds = [seed] * num_classes
    best_scores = [0] * num_classes
    for t in range(num_classes):
        thresholds = list(best_thresholds)  # [seed]*num_classes
        for i in range(num_iters):
            th = i / float(num_iters)
            thresholds[t] = th
            f2 = fbeta_score(labels, probs > thresholds, beta=2, average='samples')
            if f2 > best_scores[t]:
                best_scores[t] = f2
                best_thresholds[t] = th
        print('\t(t, best_thresholds[t], best_scores[t])=%2d, %0.3f, %f' % (t, best_thresholds[t], best_scores[t]))
    print('')
    return best_thresholds, best_scores

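# Call sketch with made-up data (the function prints the best threshold per
# class as it sweeps; it relies on fbeta_score being imported in its module):
rng = np.random.RandomState(0)
labels = (rng.rand(200, 17) > 0.8).astype(int)   # hypothetical multi-label targets
probs = rng.rand(200, 17)                        # hypothetical predicted probabilities
best_thresholds, best_scores = find_f_measure_threshold2(probs, labels, num_iters=100)
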
def test_fscore_warnings():
    clean_warning_registry()
    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter('always')
        for score in [f1_score, partial(fbeta_score, beta=2)]:
            score(np.array([[1, 1], [1, 1]]),
                  np.array([[0, 0], [0, 0]]),
                  average='micro')
            assert_equal(str(record.pop().message),
                         'F-score is ill-defined and '
                         'being set to 0.0 due to no predicted samples.')
            score(np.array([[0, 0], [0, 0]]),
                  np.array([[1, 1], [1, 1]]),
                  average='micro')
            assert_equal(str(record.pop().message),
                         'F-score is ill-defined and '
                         'being set to 0.0 due to no true samples.')

def fbeta(_, predictions_binary, labels, parameters):
    return metrics.fbeta_score(labels, predictions_binary, **parameters)

def fbeta(true_label, prediction):
    return fbeta_score(true_label, prediction, beta=2, average='samples')

def fscore(prediction):
    """ Get the fscore of the validation set. Gives a good indication
    of the score on the public leaderboard."""
    target = torch.FloatTensor(0, 17)
    for i, (_, y) in enumerate(val_loader):
        target = torch.cat((target, y), 0)
    fscore = fbeta_score(target.numpy(), prediction.numpy() > 0.23,
                         beta=2, average='samples')
    return fscore

def fbeta_score(y_true, y_pred, beta=1):
    """Computes the F score.

    The F score is the weighted harmonic mean of precision and recall.
    Here it is only computed as a batch-wise average, not globally.

    This is useful for multi-label classification, where input samples can be
    classified as sets of labels. By only using accuracy (precision) a model
    would achieve a perfect score by simply assigning every class to every
    input. In order to avoid this, a metric should penalize incorrect class
    assignments as well (recall). The F-beta score (ranging from 0.0 to 1.0)
    computes this, as a weighted mean of the proportion of correct class
    assignments vs. the proportion of incorrect class assignments.

    With beta = 1, this is equivalent to an F-measure. With beta < 1, assigning
    correct classes becomes more important, and with beta > 1 the metric is
    instead weighted towards penalizing incorrect class assignments.
    """
    if beta < 0:
        raise ValueError('The lowest choosable beta is zero (only precision).')
    # If there are no true positives, fix the F score at 0 like sklearn.
    if K.sum(K.round(K.clip(y_true, 0, 1))) == 0:
        return 0
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    bb = beta ** 2
    fbeta_score = (1 + bb) * (p * r) / (bb * p + r + K.epsilon())
    return fbeta_score

def fmeasure(y_true, y_pred):
    """Computes the f-measure, the harmonic mean of precision and recall.
    Here it is only computed as a batch-wise average, not globally.
    """
    return fbeta_score(y_true, y_pred, beta=1)

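# The metrics above call precision, recall, and the backend K without defining
# them. A minimal sketch of the batch-wise helpers they assume (the old
# Keras-1.x-style metrics; requires a Keras install exposing keras.backend):
from keras import backend as K

def precision(y_true, y_pred):
    # Batch-wise precision: true positives over predicted positives.
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return true_positives / (predicted_positives + K.epsilon())

def recall(y_true, y_pred):
    # Batch-wise recall: true positives over actual positives.
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return true_positives / (possible_positives + K.epsilon())
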
def fbeta(model, X_valid, y_valid):
    p_valid = model.predict(X_valid)
    return fbeta_score(y_valid, np.array(p_valid) > 0.2, beta=2, average='samples')

def f2_score(output, target, threshold):
    output = (output > threshold)
    return fbeta_score(target, output, beta=2, average='samples')

def optimise_f2_thresholds(y, p, verbose=True, resolution=100):
    """ Find optimal threshold values for f2 score. Thanks Anokas
    https://www.kaggle.com/c/planet-understanding-the-amazon-from-space/discussion/32475
    """
    size = y.shape[1]

    def mf(x):
        p2 = np.zeros_like(p)
        for i in range(size):
            p2[:, i] = (p[:, i] > x[i]).astype(int)  # np.int is removed in recent NumPy
        score = fbeta_score(y, p2, beta=2, average='samples')
        return score

    x = [0.2] * size
    for i in range(size):
        best_i2 = 0
        best_score = 0
        for i2 in range(resolution):
            i2 /= resolution
            x[i] = i2
            score = mf(x)
            if score > best_score:
                best_i2 = i2
                best_score = score
        x[i] = best_i2
        if verbose:
            print(i, best_i2, best_score)
    return x, best_score

def get_scores(clf, X_t_train, y_train, X_t_test, y_test):
    clf.fit(X_t_train, y_train)
    y_score = clf.predict_proba(X_t_test)
    app = dict()
    score = fbeta_score(y_test, clf.predict(X_t_test), beta=2, average=None)
    #auc_score = roc_auc_score(y_test, clf.predict(X_t_test), average='samples')
    avg_sample_score = fbeta_score(y_test, clf.predict(X_t_test), beta=2, average='samples')
    prec_score = precision_score(y_test, clf.predict(X_t_test), average='micro')
    rec_score = recall_score(y_test, clf.predict(X_t_test), average='micro')
    avg_prec = average_precision_score(y_test, clf.predict(X_t_test))
    metrics = [score, avg_sample_score, roc_auc_score(y_test, clf.predict_proba(X_t_test))]
    #app['Classwise Scores'] = ([(mlb.classes_[l], score[l]) for l in score.argsort()[::-1]])
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(len(mlb.classes_)):  # per-class ROC curve and AUC
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
        roc_auc[mlb.classes_[i]] = auc(fpr[i], tpr[i])
    app['F2 Score'] = avg_sample_score
    app['ROC_AUC'] = roc_auc_score(y_test, clf.predict_proba(X_t_test))
    app['Classwise F2 Scores'] = ([(mlb.classes_[l], score[l]) for l in score.argsort()[::-1]])
    app['P_AUPR'] = avg_prec
    app['Precision'] = prec_score
    app['Recall'] = rec_score
    app['ROC_AUC_samples'] = roc_auc
    return app

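# Usage sketch: get_scores relies on a module-level mlb (a fitted
# MultiLabelBinarizer) and a classifier exposing predict_proba. The names and
# data below are illustrative, not from the original project.
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

train_tags = [{'clear', 'primary'}, {'haze'}, {'clear'}, {'primary', 'water'}]
test_tags = [{'clear'}, {'haze', 'primary'}]
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(train_tags)
y_test = mlb.transform(test_tags)

clf = OneVsRestClassifier(LogisticRegression(max_iter=1000))
# X_t_train / X_t_test would be the corresponding feature matrices, e.g.
# TF-IDF or image features; with those in hand:
# scores = get_scores(clf, X_t_train, y_train, X_t_test, y_test)
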
def get_scores(clf, X_t_train, y_train, X_t_test, y_test):
    clf.fit(X_t_train, y_train)
    app = dict()
    score = fbeta_score(y_test, clf.predict(X_t_test), beta=2, average=None)
    avg_sample_score = fbeta_score(y_test, clf.predict(X_t_test), beta=2, average='samples')
    avg_prec = average_precision_score(y_test, clf.predict(X_t_test))
    metrics = [score, avg_sample_score, roc_auc_score(y_test, clf.predict_proba(X_t_test))]
    app['Classwise Scores'] = ([(mlb.classes_[l], score[l]) for l in score.argsort()[::-1]])
    app['F2 Score'] = avg_sample_score
    app['ROC_AUC'] = roc_auc_score(y_test, clf.predict_proba(X_t_test))
    app['Precision Score Avg (PR Curve)'] = avg_prec
    return app

def test_precision_recall_f1_score_binary():
    # Test Precision Recall and F1 Score for binary classification task
    y_true, y_pred, _ = make_prediction(binary=True)

    # detailed measures for each class
    p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None)
    assert_array_almost_equal(p, [0.73, 0.85], 2)
    assert_array_almost_equal(r, [0.88, 0.68], 2)
    assert_array_almost_equal(f, [0.80, 0.76], 2)
    assert_array_equal(s, [25, 25])

    # individual scoring function that can be used for grid search: in the
    # binary class case the score is the value of the measure for the positive
    # class (e.g. label == 1). This is deprecated for average != 'binary'.
    assert_dep_warning = partial(assert_warns, DeprecationWarning)
    for kwargs, my_assert in [({}, assert_no_warnings),
                              ({'average': 'binary'}, assert_no_warnings),
                              ({'average': 'micro'}, assert_dep_warning)]:
        ps = my_assert(precision_score, y_true, y_pred, **kwargs)
        assert_array_almost_equal(ps, 0.85, 2)
        rs = my_assert(recall_score, y_true, y_pred, **kwargs)
        assert_array_almost_equal(rs, 0.68, 2)
        fs = my_assert(f1_score, y_true, y_pred, **kwargs)
        assert_array_almost_equal(fs, 0.76, 2)
        assert_almost_equal(my_assert(fbeta_score, y_true, y_pred, beta=2,
                                      **kwargs),
                            (1 + 2 ** 2) * ps * rs / (2 ** 2 * ps + rs), 2)

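# The last assertion just restates the definition
# F_beta = (1 + beta^2) * p * r / (beta^2 * p + r). A quick stand-alone check
# with the precision/recall values asserted above:
p, r, beta = 0.85, 0.68, 2
f2 = (1 + beta ** 2) * p * r / (beta ** 2 * p + r)
print(round(f2, 2))  # ~0.71: with beta=2 the score sits much closer to recall
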
def test_precision_recall_f1_no_labels():
    y_true = np.zeros((20, 3))
    y_pred = np.zeros_like(y_true)

    # tp = [0, 0, 0]
    # fn = [0, 0, 0]
    # fp = [0, 0, 0]
    # support = [0, 0, 0]
    # |y_hat_i inter y_i | = [0, 0, 0]
    # |y_i| = [0, 0, 0]
    # |y_hat_i| = [0, 0, 0]
    for beta in [1]:
        p, r, f, s = assert_warns(UndefinedMetricWarning,
                                  precision_recall_fscore_support,
                                  y_true, y_pred, average=None, beta=beta)
        assert_array_almost_equal(p, [0, 0, 0], 2)
        assert_array_almost_equal(r, [0, 0, 0], 2)
        assert_array_almost_equal(f, [0, 0, 0], 2)
        assert_array_almost_equal(s, [0, 0, 0], 2)

        fbeta = assert_warns(UndefinedMetricWarning, fbeta_score,
                             y_true, y_pred, beta=beta, average=None)
        assert_array_almost_equal(fbeta, [0, 0, 0], 2)

        for average in ["macro", "micro", "weighted", "samples"]:
            p, r, f, s = assert_warns(UndefinedMetricWarning,
                                      precision_recall_fscore_support,
                                      y_true, y_pred, average=average,
                                      beta=beta)
            assert_almost_equal(p, 0)
            assert_almost_equal(r, 0)
            assert_almost_equal(f, 0)
            assert_equal(s, None)

            fbeta = assert_warns(UndefinedMetricWarning, fbeta_score,
                                 y_true, y_pred,
                                 beta=beta, average=average)
            assert_almost_equal(fbeta, 0)

def test_precision_recall_f1_score_multiclass():
    # Test Precision Recall and F1 Score for multiclass classification task
    y_true, y_pred, _ = make_prediction(binary=False)

    # compute scores with default labels introspection
    p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None)
    assert_array_almost_equal(p, [0.83, 0.33, 0.42], 2)
    assert_array_almost_equal(r, [0.79, 0.09, 0.90], 2)
    assert_array_almost_equal(f, [0.81, 0.15, 0.57], 2)
    assert_array_equal(s, [24, 31, 20])

    # averaging tests
    ps = precision_score(y_true, y_pred, pos_label=1, average='micro')
    assert_array_almost_equal(ps, 0.53, 2)
    rs = recall_score(y_true, y_pred, average='micro')
    assert_array_almost_equal(rs, 0.53, 2)
    fs = f1_score(y_true, y_pred, average='micro')
    assert_array_almost_equal(fs, 0.53, 2)

    ps = precision_score(y_true, y_pred, average='macro')
    assert_array_almost_equal(ps, 0.53, 2)
    rs = recall_score(y_true, y_pred, average='macro')
    assert_array_almost_equal(rs, 0.60, 2)
    fs = f1_score(y_true, y_pred, average='macro')
    assert_array_almost_equal(fs, 0.51, 2)

    ps = precision_score(y_true, y_pred, average='weighted')
    assert_array_almost_equal(ps, 0.51, 2)
    rs = recall_score(y_true, y_pred, average='weighted')
    assert_array_almost_equal(rs, 0.53, 2)
    fs = f1_score(y_true, y_pred, average='weighted')
    assert_array_almost_equal(fs, 0.47, 2)

    assert_raises(ValueError, precision_score, y_true, y_pred,
                  average="samples")
    assert_raises(ValueError, recall_score, y_true, y_pred, average="samples")
    assert_raises(ValueError, f1_score, y_true, y_pred, average="samples")
    assert_raises(ValueError, fbeta_score, y_true, y_pred, average="samples",
                  beta=0.5)

    # same prediction but with an explicit label ordering
    p, r, f, s = precision_recall_fscore_support(
        y_true, y_pred, labels=[0, 2, 1], average=None)
    assert_array_almost_equal(p, [0.83, 0.41, 0.33], 2)
    assert_array_almost_equal(r, [0.79, 0.90, 0.10], 2)
    assert_array_almost_equal(f, [0.81, 0.57, 0.15], 2)
    assert_array_equal(s, [24, 20, 31])

def test_precision_recall_f1_score_with_an_empty_prediction():
    y_true = np.array([[0, 1, 0, 0], [1, 0, 0, 0], [0, 1, 1, 0]])
    y_pred = np.array([[0, 0, 0, 0], [0, 0, 0, 1], [0, 1, 1, 0]])

    # true_pos = [ 0.  1.  1.  0.]
    # false_pos = [ 0.  0.  0.  1.]
    # false_neg = [ 1.  1.  0.  0.]
    p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
                                                 average=None)
    assert_array_almost_equal(p, [0.0, 1.0, 1.0, 0.0], 2)
    assert_array_almost_equal(r, [0.0, 0.5, 1.0, 0.0], 2)
    assert_array_almost_equal(f, [0.0, 1 / 1.5, 1, 0.0], 2)
    assert_array_almost_equal(s, [1, 2, 1, 0], 2)

    f2 = fbeta_score(y_true, y_pred, beta=2, average=None)
    support = s
    assert_array_almost_equal(f2, [0, 0.55, 1, 0], 2)

    p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
                                                 average="macro")
    assert_almost_equal(p, 0.5)
    assert_almost_equal(r, 1.5 / 4)
    assert_almost_equal(f, 2.5 / (4 * 1.5))
    assert_equal(s, None)
    assert_almost_equal(fbeta_score(y_true, y_pred, beta=2,
                                    average="macro"),
                        np.mean(f2))

    p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
                                                 average="micro")
    assert_almost_equal(p, 2 / 3)
    assert_almost_equal(r, 0.5)
    assert_almost_equal(f, 2 / 3 / (2 / 3 + 0.5))
    assert_equal(s, None)
    assert_almost_equal(fbeta_score(y_true, y_pred, beta=2,
                                    average="micro"),
                        (1 + 4) * p * r / (4 * p + r))

    p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
                                                 average="weighted")
    assert_almost_equal(p, 3 / 4)
    assert_almost_equal(r, 0.5)
    assert_almost_equal(f, (2 / 1.5 + 1) / 4)
    assert_equal(s, None)
    assert_almost_equal(fbeta_score(y_true, y_pred, beta=2,
                                    average="weighted"),
                        np.average(f2, weights=support))

    p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
                                                 average="samples")
    # |h(x_i) inter y_i | = [0, 0, 2]
    # |y_i| = [1, 1, 2]
    # |h(x_i)| = [0, 1, 2]
    assert_almost_equal(p, 1 / 3)
    assert_almost_equal(r, 1 / 3)
    assert_almost_equal(f, 1 / 3)
    assert_equal(s, None)
    assert_almost_equal(fbeta_score(y_true, y_pred, beta=2,
                                    average="samples"),
                        0.333, 2)

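# Where the 0.333 comes from: only the third sample has any overlap between
# prediction and truth, and there it is perfect, so the sample-averaged F2 is
# (0 + 0 + 1) / 3. A quick manual check (requires scikit-learn >= 0.22 for
# zero_division):
import numpy as np
from sklearn.metrics import fbeta_score

y_true = np.array([[0, 1, 0, 0], [1, 0, 0, 0], [0, 1, 1, 0]])
y_pred = np.array([[0, 0, 0, 0], [0, 0, 0, 1], [0, 1, 1, 0]])
per_sample = [fbeta_score(t, p, beta=2, average='binary', zero_division=0)
              for t, p in zip(y_true, y_pred)]
print(per_sample, np.mean(per_sample))  # [0.0, 0.0, 1.0] 0.333...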