def check_alternative_lrap_implementation(lrap_score, n_classes=5,
                                          n_samples=20, random_state=0):
    _, y_true = make_multilabel_classification(n_features=1,
                                               allow_unlabeled=False,
                                               random_state=random_state,
                                               n_classes=n_classes,
                                               n_samples=n_samples)

    # Score with ties
    y_score = sparse_random_matrix(n_components=y_true.shape[0],
                                   n_features=y_true.shape[1],
                                   random_state=random_state)
    if hasattr(y_score, "toarray"):
        y_score = y_score.toarray()
    score_lrap = label_ranking_average_precision_score(y_true, y_score)
    score_my_lrap = _my_lrap(y_true, y_score)
    assert_almost_equal(score_lrap, score_my_lrap)

    # Uniform score
    random_state = check_random_state(random_state)
    y_score = random_state.uniform(size=(n_samples, n_classes))
    score_lrap = label_ranking_average_precision_score(y_true, y_score)
    score_my_lrap = _my_lrap(y_true, y_score)
    assert_almost_equal(score_lrap, score_my_lrap)
Python examples of label_ranking_average_precision_score()
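As a quick reminder of the sklearn API used throughout these snippets, label_ranking_average_precision_score(y_true, y_score) takes a binary indicator matrix of true labels and a matrix of per-label scores, and returns a value in (0, 1]. A minimal toy example (the same one used in the sklearn documentation):

import numpy as np
from sklearn.metrics import label_ranking_average_precision_score

y_true = np.array([[1, 0, 0], [0, 0, 1]])
y_score = np.array([[0.75, 0.5, 1.0], [1.0, 0.2, 0.1]])
# the relevant labels are ranked 2nd and 3rd, so the score is (1/2 + 1/3) / 2
print(label_ranking_average_precision_score(y_true, y_score))  # ~0.4167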
def _batch_MAP_MRR(self,
                   s_label,   # [batch_size, sent_num]
                   s_preds,   # [batch_size, sent_num]
                   mask):     # [batch_size, sent_num]
    """Calculate the Mean Average Precision and Mean Reciprocal Rank."""
    average_precisions = []
    reciprocal_ranks = []
    for i in xrange(s_label.shape[0]):  # For each question in the batch
        # Only keep those not padded
        label = np.take(s_label[i], np.where(mask[i] == 1)[0])
        preds = np.take(s_preds[i], np.where(mask[i] == 1)[0])
        assert label.shape == preds.shape

        # MAP only makes sense for positive bags
        try:
            assert np.max(label) > 0
        except AssertionError as e:
            print(s_label)
            raise e

        # TODO: is this correct???
        ap = label_ranking_average_precision_score([label],   # true binary labels
                                                   [preds])   # target scores
        rr = label_ranking_reciprocal_rank(label, preds)
        try:
            assert not np.isnan(ap) and not np.isnan(rr)
        except:
            pdb.set_trace()
        average_precisions.append(ap)
        reciprocal_ranks.append(rr)
    return average_precisions, reciprocal_ranks
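label_ranking_reciprocal_rank is a project-specific helper that is not shown in this snippet. A minimal sketch of what such a helper could look like, assuming it returns the reciprocal of the 1-based rank of the highest-scoring relevant item:

import numpy as np

def label_ranking_reciprocal_rank(label, preds):
    # Hypothetical helper: reciprocal rank of the best-ranked relevant item.
    order = np.argsort(-np.asarray(preds))                  # indices sorted by descending score
    positions = np.where(np.asarray(label)[order] > 0)[0]   # 0-based positions of relevant items
    return 1.0 / (positions[0] + 1)                         # convert to a 1-based rank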
def label_ranking_average_precision_score(self, predictor, batch_size=50):
    from sklearn.metrics import label_ranking_average_precision_score

    # Predict scores batch by batch
    p = []
    for xq_batch, xa_batch, _ in super(QaPairsTest, self).sampling(batch_size):
        delta = predictor(xq_batch, xa_batch)
        p += delta[0].tolist()
    p = np.array(p)

    # Track two degenerate cases per question:
    # 1. questions with no correct answer (skipped)
    # 2. questions where every answer is correct (counted, but not skipped here)
    map_record = []
    skip1 = 0
    skip2 = 0
    for question, entry in self.questions.items():
        idx = np.array(entry['idx'])
        if self.y_np[idx].max() == 0:
            skip1 += 1
            continue
        if self.y_np[idx].min() != 0:
            skip2 += 1
            # continue
        score = p[idx].reshape(idx.shape).tolist()
        map = label_ranking_average_precision_score(np.array([entry['label']]),
                                                    np.array([score]))
        map_record.append(map)
    logging.info('Skip1 %d Skip2 %d' % (skip1, skip2))
    return np.array(map_record).mean()
def label_ranking_average_precision_score2(self, model, batch_size=50):
    def label_ranking_average_precision_score(label, score):
        assert len(label) == len(score)
        data = zip(label, score)
        data = sorted(data, key=lambda x: x[1], reverse=True)
        count = 0.0
        values = []
        for i in range(len(data)):
            if data[i][0]:
                count += 1
                values.append(count / (i + 1))
        assert len(values)
        return sum(values) / count, values[0]

    p = model.predict(
        {'q_input': self.xq_np, 'a_input': self.xa_np},
        batch_size=batch_size
    )

    map_record = []
    for question, entry in self.questions.items():
        idx = np.array(entry['idx'])
        if self.y_np[idx].max() == 0:
            continue
        score = p[idx].reshape(idx.shape).tolist()
        map, _ = label_ranking_average_precision_score(entry['label'], score)
        map_record.append(map)
        self.saveResult(question, map, score)

    map = np.array(map_record).mean()
    self.saveResult('__TOTAL_MAP__', map)
    return map
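The inner label_ranking_average_precision_score above shadows the sklearn function: it sorts one question's answers by score, averages the precision at each correct answer (the AP), and additionally returns the precision at the first correct answer, which equals the reciprocal rank. For a single question without tied scores it should agree with sklearn; a quick sanity check using made-up labels and scores:

import numpy as np
from sklearn.metrics import label_ranking_average_precision_score as sk_lrap

label = [1, 0, 1, 0]
score = [0.9, 0.8, 0.4, 0.1]
# correct answers sit at ranks 1 and 3, so AP = (1/1 + 2/3) / 2 = 0.8333...
print(sk_lrap(np.array([label]), np.array([score])))  # ~0.8333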
def _generate_classification_reports(y_true, y_pred, target_names=None):
    # Calculate additional stats
    total_accuracy = accuracy_score(y_true, y_pred)
    cov_error = coverage_error(y_true, y_pred)
    lrap = label_ranking_average_precision_score(y_true, y_pred)

    report = metrics.multilabel_prediction_report(y_true, y_pred)
    report += '\n\n'
    report += metrics.multilabel_classification_report(y_true, y_pred, target_names=target_names)
    report += '\n\n'
    report += 'coverage error: %.3f' % cov_error
    report += '\n'
    report += 'LRAP: %.3f' % lrap
    report += '\n'
    report += 'total accuracy: %.3f' % total_accuracy
    return report
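Note that coverage_error and label_ranking_average_precision_score are ranking metrics and are normally fed continuous scores (e.g. predicted probabilities) rather than binarized predictions; passing y_pred as above still computes, but the heavily tied ranking makes the numbers less informative. A small sketch of feeding probabilities instead, using a hypothetical one-vs-rest classifier that is not part of this project's code:

from sklearn.datasets import make_multilabel_classification
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import coverage_error, label_ranking_average_precision_score

X, y = make_multilabel_classification(n_samples=100, n_classes=5, random_state=0)
clf = OneVsRestClassifier(LogisticRegression(max_iter=1000)).fit(X, y)
y_proba = clf.predict_proba(X)  # continuous scores, shape (n_samples, n_classes)

print('coverage error: %.3f' % coverage_error(y, y_proba))
print('LRAP: %.3f' % label_ranking_average_precision_score(y, y_proba))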
# def run_train_test(path_train, path_test, args):
# print('Loading train data set "%s"...' % path_train)
# X_train, y_train, tags_train, _ = dataset.load_manifest(path_train)
#
# print('\nLoading test data set "%s" ...' % path_test)
# X_test, y_test, tags_test, _ = dataset.load_manifest(path_test)
#
# report_base_name = args.model + '_kfold_%d' % rnd
# validate(X_train, y_train, X_test, y_test, report_base_name, target_names=tags_train)
def test_label_ranking_avp():
    for fn in [label_ranking_average_precision_score, _my_lrap]:
        yield check_lrap_toy, fn
        yield check_lrap_without_tie_and_increasing_score, fn
        yield check_lrap_only_ties, fn
        yield check_zero_or_all_relevant_labels, fn
    yield check_lrap_error_raised, label_ranking_average_precision_score

    for n_samples, n_classes, random_state in product((1, 2, 8, 20),
                                                       (2, 5, 10),
                                                       range(1)):
        yield (check_alternative_lrap_implementation,
               label_ranking_average_precision_score,
               n_classes, n_samples, random_state)
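_my_lrap is a reference implementation defined in sklearn's test module, so it cannot be imported from the library itself. For readers who want a standalone cross-check, here is a compact sketch in the same spirit (an assumption-laden reimplementation, not sklearn's actual _my_lrap): for every relevant label, count the relevant labels ranked at or above it and divide by its rank, treating rows with no relevant or all-relevant labels as scoring 1.

import numpy as np

def lrap_reference(y_true, y_score):
    # Naive per-sample LRAP; ties handled by counting scores >= the target score.
    y_true = np.asarray(y_true)
    y_score = np.asarray(y_score)
    n_samples, n_labels = y_true.shape
    total = 0.0
    for i in range(n_samples):
        relevant = np.flatnonzero(y_true[i])
        if relevant.size in (0, n_labels):
            total += 1.0          # degenerate rows score 1 by convention
            continue
        sample = 0.0
        for j in relevant:
            at_or_above = y_score[i] >= y_score[i, j]
            rank = at_or_above.sum()                   # 1-based rank of label j
            sample += y_true[i, at_or_above].sum() / rank
        total += sample / relevant.size
    return total / n_samples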
Source file: eval_performance.py (project: Neural-Architecture-Search-with-RL, author: dhruvramani)
def evaluate(predictions, labels, threshold=0.4, multi_label=True):
    '''
    True Positive  : Label : 1, Prediction : 1
    False Positive : Label : 0, Prediction : 1
    False Negative : Label : 1, Prediction : 0
    True Negative  : Label : 0, Prediction : 0
    Precision : TP / (TP + FP)
    Recall : TP / (TP + FN)
    F Score : 2.P.R / (P + R)
    Ranking Loss : The average number of label pairs that are incorrectly ordered given the predictions
    Hamming Loss : The fraction of labels that are incorrectly predicted (Hamming distance between predictions and labels)
    '''
    assert predictions.shape == labels.shape, "Shapes: %s, %s" % (predictions.shape, labels.shape,)
    metrics = dict()

    if not multi_label:
        metrics['bae'] = BAE(labels, predictions)
        labels, predictions = np.argmax(labels, axis=1), np.argmax(predictions, axis=1)
        metrics['accuracy'] = accuracy_score(labels, predictions)
        metrics['micro_precision'], metrics['micro_recall'], metrics['micro_f1'], _ = \
            precision_recall_fscore_support(labels, predictions, average='micro')
        metrics['macro_precision'], metrics['macro_recall'], metrics['macro_f1'], metrics['coverage'], \
            metrics['average_precision'], metrics['ranking_loss'], metrics['pak'], metrics['hamming_loss'] \
            = 0, 0, 0, 0, 0, 0, 0, 0
    else:
        # Ranking metrics are computed on the raw scores, before thresholding
        metrics['coverage'] = coverage_error(labels, predictions)
        metrics['average_precision'] = label_ranking_average_precision_score(labels, predictions)
        metrics['ranking_loss'] = label_ranking_loss(labels, predictions)

        # Binarize the predictions for the classification metrics
        for i in range(predictions.shape[0]):
            predictions[i, :][predictions[i, :] >= threshold] = 1
            predictions[i, :][predictions[i, :] < threshold] = 0

        metrics['bae'] = 0
        metrics['patk'] = patk(predictions, labels)
        metrics['micro_precision'], metrics['micro_recall'], metrics['micro_f1'], metrics['macro_precision'], \
            metrics['macro_recall'], metrics['macro_f1'] = bipartition_scores(labels, predictions)
    return metrics
def evaluate(experiment_path, meta_data=False, xml_dir="", train_dir="",
             submission_file=""):
    pickle_path = os.path.join(experiment_path, "predictions.pkl")
    with open(pickle_path, 'rb') as input_file:
        y_trues = pickle.load(input_file)
        y_scores = pickle.load(input_file)
        training_segments = pickle.load(input_file)

    if meta_data:
        elevation_scores = compute_elevation_scores(training_segments, xml_dir,
                                                    train_dir)
        ## Combine the scores using Bayes' theorem
        normalize = np.array([np.sum(y_s * e_s) for y_s, e_s in zip(y_scores,
                                                                    elevation_scores)])
        y_scores = y_scores * elevation_scores / normalize[:, None]

    if submission_file:
        write_to_submission_file(submission_file, y_scores, training_segments,
                                 train_dir)
        return

    map_score = mean_average_precision(y_trues, y_scores)
    auroc_score = area_under_roc_curve(y_trues, y_scores)
    # coverage error
    coverage_error = metrics.coverage_error(y_trues, y_scores)
    # label ranking average precision
    lrap = metrics.label_ranking_average_precision_score(y_trues, y_scores)
    # ranking loss
    ranking_loss = metrics.label_ranking_loss(y_trues, y_scores)

    print("")
    print("- Top 1:", top_n(y_trues, y_scores, 1))
    print("- Top 2:", top_n(y_trues, y_scores, 2))
    print("- Top 3:", top_n(y_trues, y_scores, 3))
    print("- Top 4:", top_n(y_trues, y_scores, 4))
    print("- Top 5:", top_n(y_trues, y_scores, 5))
    print("")
    print("Mean Average Precision: ", map_score)
    print("Area Under ROC Curve: ", auroc_score)
    print("Coverage Error: ", coverage_error)
    print("Label Ranking Average Precision: ", lrap)
    print("Ranking Loss: ", ranking_loss)
    print("Total predictions: ", len(y_scores))

    return {
        "map": map_score,
        "auroc": auroc_score,
        "coverage_error": coverage_error,
        "lrap": lrap,
        "ranking_loss": ranking_loss,
        "top_1": top_n(y_trues, y_scores, 1),
        "top_5": top_n(y_trues, y_scores, 5),
    }
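top_n, mean_average_precision, and area_under_roc_curve are helpers from this project that are not shown here. As a rough illustration only, a hypothetical top_n-style metric might measure the fraction of segments whose true label appears among the n highest-scoring labels:

import numpy as np

def top_n_accuracy(y_trues, y_scores, n):
    # Hypothetical helper: fraction of rows with at least one true label in the top-n scores.
    hits = 0
    for y_true, y_score in zip(y_trues, y_scores):
        top = np.argsort(-np.asarray(y_score))[:n]      # indices of the n largest scores
        hits += bool(np.any(np.asarray(y_true)[top] > 0))
    return hits / float(len(y_scores))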