sklearn GridSearchCV在得分函数中未使用sample_weight

发布于 2021-01-29 15:13:01

我有每个样本权重不同的数据。在我的应用程序中,重要的是在估计模型和比较替代模型时要考虑这些权重。

sklearn用来估计模型并比较替代的超参数选择。但是该单元测试表明,GridSearchCV这不适sample_weights用于估算分数。

有没有办法有sklearn使用sample_weight得分模式?

单元测试:

from __future__ import division

import numpy as np
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import GridSearchCV, RepeatedKFold


def grid_cv(X_in, y_in, w_in, cv, max_features_grid, use_weighting):
  out_results = dict()

  for k in max_features_grid:
    clf = RandomForestClassifier(n_estimators=256,
                                 criterion="entropy",
                                 warm_start=False,
                                 n_jobs=-1,
                                 random_state=RANDOM_STATE,
                                 max_features=k)
    for train_ndx, test_ndx in cv.split(X=X_in, y=y_in):
      X_train = X_in[train_ndx, :]
      y_train = y_in[train_ndx]
      w_train = w_in[train_ndx]
      y_test = y[test_ndx]

      clf.fit(X=X_train, y=y_train, sample_weight=w_train)

      y_hat = clf.predict_proba(X=X_in[test_ndx, :])
      if use_weighting:
        w_test = w_in[test_ndx]
        w_i_sum = w_test.sum()
        score = w_i_sum / w_in.sum() * log_loss(y_true=y_test, y_pred=y_hat, sample_weight=w_test)
      else:
        score = log_loss(y_true=y_test, y_pred=y_hat)

      results = out_results.get(k, [])
      results.append(score)
      out_results.update({k: results})

  for k, v in out_results.items():
    if use_weighting:
      mean_score = sum(v)
    else:
      mean_score = np.mean(v)
    out_results.update({k: mean_score})

  best_score = min(out_results.values())
  best_param = min(out_results, key=out_results.get)
  return best_score, best_param


if __name__ == "__main__":
  RANDOM_STATE = 1337
  X, y = load_iris(return_X_y=True)
  sample_weight = np.array([1 + 100 * (i % 25) for i in range(len(X))])
  # sample_weight = np.array([1 for _ in range(len(X))])

  inner_cv = RepeatedKFold(n_splits=3, n_repeats=1, random_state=RANDOM_STATE)

  outer_cv = RepeatedKFold(n_splits=3, n_repeats=1, random_state=RANDOM_STATE)

  rfc = RandomForestClassifier(n_estimators=256,
                               criterion="entropy",
                               warm_start=False,
                               n_jobs=-1,
                               random_state=RANDOM_STATE)
  search_params = {"max_features": [1, 2, 3, 4]}


  fit_params = {"sample_weight": sample_weight}
  my_scorer = make_scorer(log_loss, 
               greater_is_better=False, 
               needs_proba=True, 
               needs_threshold=False)

  grid_clf = GridSearchCV(estimator=rfc,
                          scoring=my_scorer,
                          cv=inner_cv,
                          param_grid=search_params,
                          refit=True,
                          return_train_score=False,
                          iid=False)  # in this usage, the results are the same for `iid=True` and `iid=False`
  grid_clf.fit(X, y, **fit_params)
  print("This is the best out-of-sample score using GridSearchCV: %.6f." % -grid_clf.best_score_)

  msg = """This is the best out-of-sample score %s weighting using grid_cv: %.6f."""
  score_with_weights, param_with_weights = grid_cv(X_in=X,
                                                   y_in=y,
                                                   w_in=sample_weight,
                                                   cv=inner_cv,
                                                   max_features_grid=search_params.get(
                                                     "max_features"),
                                                   use_weighting=True)
  print(msg % ("WITH", score_with_weights))

  score_without_weights, param_without_weights = grid_cv(X_in=X,
                                                         y_in=y,
                                                         w_in=sample_weight,
                                                         cv=inner_cv,
                                                         max_features_grid=search_params.get(
                                                           "max_features"),
                                                         use_weighting=False)
  print(msg % ("WITHOUT", score_without_weights))

产生输出:

This is the best out-of-sample score using GridSearchCV: 0.135692.
This is the best out-of-sample score WITH weighting using grid_cv: 0.099367.
This is the best out-of-sample score WITHOUT weighting using grid_cv: 0.135692.

说明:由于手动计算损失而不进行加权会产生与相同的评分GridSearchCV,因此我们知道未使用样本权重。

关注者
0
被浏览
43
1 个回答
  • 面试哥
    面试哥 2021-01-29
    为面试而生,有面试问题,就找面试哥。

    GridSearchCV需要一个scoring作为输入,其可以是可调用。您可以在此处查看有关如何更改评分功能以及如何传递自己的评分功能的详细信息。为了完整起见,以下是该页面中的相关代码:

    在此处输入图片说明

    编辑fit_params
    仅传递给fit函数,而不传递给score函数。如果有应该传递给的参数scorer,则应将其传递给make_scorer。但这仍然不能解决问题,因为那将意味着将整个sample_weight参数传递给log_loss,而只y_test应传递与计算损失时对应的部分。

    sklearn不支持这样的事情,但是您可以使用来破解padas.DataFrame。好消息是,您可以sklearn理解DataFrame并保持这种方式。这意味着您可以利用的indexDataFrame如您在此处的代码中所见:

      # more code
    
      X, y = load_iris(return_X_y=True)
      index = ['r%d' % x for x in range(len(y))]
      y_frame = pd.DataFrame(y, index=index)
      sample_weight = np.array([1 + 100 * (i % 25) for i in range(len(X))])
      sample_weight_frame = pd.DataFrame(sample_weight, index=index)
    
      # more code
    
      def score_f(y_true, y_pred, sample_weight):
          return log_loss(y_true.values, y_pred,
                          sample_weight=sample_weight.loc[y_true.index.values].values.reshape(-1),
                          normalize=True)
    
      score_params = {"sample_weight": sample_weight_frame}
      my_scorer = make_scorer(score_f,
                              greater_is_better=False, 
                              needs_proba=True, 
                              needs_threshold=False,
                              **score_params)
    
      grid_clf = GridSearchCV(estimator=rfc,
                              scoring=my_scorer,
                              cv=inner_cv,
                              param_grid=search_params,
                              refit=True,
                              return_train_score=False,
                              iid=False)  # in this usage, the results are the same for `iid=True` and `iid=False`
      grid_clf.fit(X, y_frame)
    
      # more code
    

    如您所见,score_f使用的indexofy_true查找sample_weight要使用的部分。为了完整起见,下面是整个代码:

    from __future__ import division
    
    import numpy as np
    from sklearn.datasets import load_iris
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import log_loss
    from sklearn.model_selection import GridSearchCV, RepeatedKFold
    from sklearn.metrics import  make_scorer
    import pandas as pd
    
    def grid_cv(X_in, y_in, w_in, cv, max_features_grid, use_weighting):
      out_results = dict()
    
      for k in max_features_grid:
        clf = RandomForestClassifier(n_estimators=256,
                                     criterion="entropy",
                                     warm_start=False,
                                     n_jobs=1,
                                     random_state=RANDOM_STATE,
                                     max_features=k)
        for train_ndx, test_ndx in cv.split(X=X_in, y=y_in):
          X_train = X_in[train_ndx, :]
          y_train = y_in[train_ndx]
          w_train = w_in[train_ndx]
          y_test = y_in[test_ndx]
    
          clf.fit(X=X_train, y=y_train, sample_weight=w_train)
    
          y_hat = clf.predict_proba(X=X_in[test_ndx, :])
          if use_weighting:
            w_test = w_in[test_ndx]
            w_i_sum = w_test.sum()
            score = w_i_sum / w_in.sum() * log_loss(y_true=y_test, y_pred=y_hat, sample_weight=w_test)
          else:
            score = log_loss(y_true=y_test, y_pred=y_hat)
    
          results = out_results.get(k, [])
          results.append(score)
          out_results.update({k: results})
    
      for k, v in out_results.items():
        if use_weighting:
          mean_score = sum(v)
        else:
          mean_score = np.mean(v)
        out_results.update({k: mean_score})
    
      best_score = min(out_results.values())
      best_param = min(out_results, key=out_results.get)
      return best_score, best_param
    
    
    #if __name__ == "__main__":
    if True:
      RANDOM_STATE = 1337
      X, y = load_iris(return_X_y=True)
      index = ['r%d' % x for x in range(len(y))]
      y_frame = pd.DataFrame(y, index=index)
      sample_weight = np.array([1 + 100 * (i % 25) for i in range(len(X))])
      sample_weight_frame = pd.DataFrame(sample_weight, index=index)
      # sample_weight = np.array([1 for _ in range(len(X))])
    
      inner_cv = RepeatedKFold(n_splits=3, n_repeats=1, random_state=RANDOM_STATE)
    
      outer_cv = RepeatedKFold(n_splits=3, n_repeats=1, random_state=RANDOM_STATE)
    
      rfc = RandomForestClassifier(n_estimators=256,
                                   criterion="entropy",
                                   warm_start=False,
                                   n_jobs=1,
                                   random_state=RANDOM_STATE)
      search_params = {"max_features": [1, 2, 3, 4]}
    
    
      def score_f(y_true, y_pred, sample_weight):
          return log_loss(y_true.values, y_pred,
                          sample_weight=sample_weight.loc[y_true.index.values].values.reshape(-1),
                          normalize=True)
    
      score_params = {"sample_weight": sample_weight_frame}
      my_scorer = make_scorer(score_f,
                              greater_is_better=False, 
                              needs_proba=True, 
                              needs_threshold=False,
                              **score_params)
    
      grid_clf = GridSearchCV(estimator=rfc,
                              scoring=my_scorer,
                              cv=inner_cv,
                              param_grid=search_params,
                              refit=True,
                              return_train_score=False,
                              iid=False)  # in this usage, the results are the same for `iid=True` and `iid=False`
      grid_clf.fit(X, y_frame)
      print("This is the best out-of-sample score using GridSearchCV: %.6f." % -grid_clf.best_score_)
    
      msg = """This is the best out-of-sample score %s weighting using grid_cv: %.6f."""
      score_with_weights, param_with_weights = grid_cv(X_in=X,
                                                       y_in=y,
                                                       w_in=sample_weight,
                                                       cv=inner_cv,
                                                       max_features_grid=search_params.get(
                                                         "max_features"),
                                                       use_weighting=True)
      print(msg % ("WITH", score_with_weights))
    
      score_without_weights, param_without_weights = grid_cv(X_in=X,
                                                             y_in=y,
                                                             w_in=sample_weight,
                                                             cv=inner_cv,
                                                             max_features_grid=search_params.get(
                                                               "max_features"),
                                                             use_weighting=False)
      print(msg % ("WITHOUT", score_without_weights))
    

    代码的输出为:

    This is the best out-of-sample score using GridSearchCV: 0.095439.
    This is the best out-of-sample score WITH weighting using grid_cv: 0.099367.
    This is the best out-of-sample score WITHOUT weighting using grid_cv: 0.135692.
    

    编辑2 :正如下面的评论中所说:

    使用此解决方案时,我的分数和sklearn分数之差源自我计算分数的加权平均值的方式。如果省略代码的加权平均部分,则两个输出将与机器精度匹配。



知识点
面圈网VIP题库

面圈网VIP题库全新上线,海量真题题库资源。 90大类考试,超10万份考试真题开放下载啦

去下载看看