methods.py 文件源码

python
阅读 25 收藏 0 点赞 0 评论 0

项目:WhatsMissingInGeoparsing 作者: milangritta 项目源码 文件源码
def calculate_scores(predicted, gold, inspect=False, topocluster=False):
    """
    Given the predictions and the gold annotations, calculate precision, recall, F Score and accuracy.
    :param topocluster: Topocluster geoparser produces NON-STANDARD output so has to be treated differently
    :param inspect: If True, the differences between gold and predicted files will be printed
    :param predicted: path to the file with parser predictions
    :param gold: path to the file with gold annotations
    :return: a list of errors per toponym i.e how far away is each correctly identified toponym from
    the gold location. This is used to measure the accuracy of the geocoding part
    """
    tp, fp, fn = 0.0, 0.0, 0.0
    accuracy = {}
    wiki = True if "wiki" in predicted else False
    predictions_file = codecs.open(predicted)
    gold = codecs.open(gold)
    toponym_index = -1
    for predicted, gold in zip(predictions_file, gold):
        predicted_tops = predicted.split("||")[:-1]
        gold_tops = gold.split("||")[:-1]
        for gold_top in gold_tops[:]:
            toponym_index += 1
            gold_top_items = gold_top.split(",,")
            for predicted_top in predicted_tops[:]:
                predicted_top_items = predicted_top.split(",,")
                mean_g = (int(gold_top_items[4]) + int(gold_top_items[5])) / 2.0
                mean_p = (int(predicted_top_items[4]) + int(predicted_top_items[5])) / 2.0
                #  If the toponym position (its mean) is no more than 9 characters from gold AND the two
                #  strings are equal then it's a match. For reasons to do with UTF-8 encoding and decoding,
                #  the toponym indices may, in a few instances, be off by a few positions when using Web APIs.
                match = False  # A flag to establish whether this is a matching prediction
                if topocluster:  # Only match for the toponym name in this instance
                    if predicted_top_items[1].lower() == gold_top_items[1].lower():
                        match = True
                elif abs(mean_g - mean_p) < 10 and predicted_top_items[1].lower() == gold_top_items[1].lower():
                    match = True   # Change the number above to 0 for EXACT matches, 10 for INEXACT matches
                if match:
                    tp += 1
                    predicted_tops.remove(predicted_top)
                    gold_tops.remove(gold_top)
                    predicted_coord = (float(predicted_top_items[2]), float(predicted_top_items[3]))
                    gold_coord = (float(gold_top_items[2]), float(gold_top_items[3]))
                    accuracy[toponym_index] = numpy.log(1 + great_circle(predicted_coord, gold_coord).kilometers)
                    break
        if not wiki:
            fp += len(predicted_tops)
        fn += len(gold_tops)
        if inspect:
            if len(predicted_tops) > 0 or 0 < len(gold_tops):
                print "Predicted:", " - ".join(predicted_tops)
                print "Gold Tops:", " - ".join(gold_tops)
    f_score = (tp, fp, fn)
    output = {"f_score": f_score, "accuracy": accuracy}
    return output
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号