import math

import pyspark.sql.types
from pyspark.sql import functions as F


def _ndcg_at(k, label_col):
    # Note: k is not applied inside this function; the rows passed in are
    # presumably already truncated to the top k upstream of the UDF.
    def ndcg_at_k(predicted, actual):
        # TODO: Taking in rn and then re-sorting might not be necessary, but I can't
        # find any real guarantee that they would come in order after a groupBy + collect_list,
        # since they were only ordered within the window function.
        predicted = [row[label_col] for row in sorted(predicted, key=lambda r: r.rn)]
        actual = [row[label_col] for row in sorted(actual, key=lambda r: r.rn)]
        dcg = 0.
        for i, label in enumerate(predicted):
            # The (2^label - 1) / log2(i + 2) gain/discount form is used to match
            # EvalNDCG in xgboost; labels are expected to be non-negative integer grades.
            dcg += ((1 << label) - 1) / math.log(i + 2.0, 2)
        idcg = 0.
        for i, label in enumerate(actual):
            idcg += ((1 << label) - 1) / math.log(i + 2.0, 2)
        if idcg == 0:
            # Return a float explicitly so the DoubleType UDF does not yield nulls.
            return 0.0
        else:
            return dcg / idcg
    return F.udf(ndcg_at_k, pyspark.sql.types.DoubleType())
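
To show how this UDF factory might be wired up end to end, here is a minimal sketch: a window function assigns the per-query rank `rn` (by model score for the predicted ordering, by label for the ideal ordering), the rows are truncated to the top k, collected into arrays of structs, and the two arrays are passed to the UDF. The DataFrame, its column names (`query`, `label`, `score`), and the sample values are illustrative assumptions, not part of the original code.

from pyspark.sql import SparkSession, Window, functions as F

spark = SparkSession.builder.getOrCreate()

# Hypothetical input: one row per (query, document) with a model score and an
# integer relevance label.
df = spark.createDataFrame(
    [("q1", 3, 0.9), ("q1", 1, 0.7), ("q1", 2, 0.4),
     ("q2", 0, 0.8), ("q2", 2, 0.3)],
    ["query", "label", "score"],
)

k = 10
# rn ranks documents by the model's score; the ideal ranking orders by the true label.
by_score = Window.partitionBy("query").orderBy(F.col("score").desc())
by_label = Window.partitionBy("query").orderBy(F.col("label").desc())

predicted = (
    df.withColumn("rn", F.row_number().over(by_score))
    .where(F.col("rn") <= k)
    .groupBy("query")
    .agg(F.collect_list(F.struct("rn", "label")).alias("predicted"))
)
ideal = (
    df.withColumn("rn", F.row_number().over(by_label))
    .where(F.col("rn") <= k)
    .groupBy("query")
    .agg(F.collect_list(F.struct("rn", "label")).alias("actual"))
)

ndcg_udf = _ndcg_at(k, "label")
result = (
    predicted.join(ideal, on="query")
    .withColumn("ndcg", ndcg_udf("predicted", "actual"))
)
result.show()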