sklearn GridSearchCV not using sample_weight in the score function
I have data with differing weights for each sample. In my application, it is important that these weights are accounted for both when estimating the model and when comparing alternative models.

I use sklearn to estimate models and to compare alternative hyperparameter choices. But the unit test below shows that GridSearchCV does not apply sample_weight when estimating scores.

Is there a way to have sklearn use sample_weight to score the models?
Unit test:
from __future__ import division

import numpy as np
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss, make_scorer
from sklearn.model_selection import GridSearchCV, RepeatedKFold


def grid_cv(X_in, y_in, w_in, cv, max_features_grid, use_weighting):
    out_results = dict()
    for k in max_features_grid:
        clf = RandomForestClassifier(n_estimators=256,
                                     criterion="entropy",
                                     warm_start=False,
                                     n_jobs=-1,
                                     random_state=RANDOM_STATE,
                                     max_features=k)
        for train_ndx, test_ndx in cv.split(X=X_in, y=y_in):
            X_train = X_in[train_ndx, :]
            y_train = y_in[train_ndx]
            w_train = w_in[train_ndx]
            y_test = y_in[test_ndx]
            clf.fit(X=X_train, y=y_train, sample_weight=w_train)
            y_hat = clf.predict_proba(X=X_in[test_ndx, :])
            if use_weighting:
                # Weight each fold's weighted log loss by the fold's share of
                # the total sample weight, so that summing over folds yields
                # the weighted mean loss over all out-of-fold predictions.
                w_test = w_in[test_ndx]
                w_i_sum = w_test.sum()
                score = w_i_sum / w_in.sum() * log_loss(y_true=y_test, y_pred=y_hat, sample_weight=w_test)
            else:
                score = log_loss(y_true=y_test, y_pred=y_hat)
            results = out_results.get(k, [])
            results.append(score)
            out_results.update({k: results})
    for k, v in out_results.items():
        if use_weighting:
            mean_score = sum(v)
        else:
            mean_score = np.mean(v)
        out_results.update({k: mean_score})
    best_score = min(out_results.values())
    best_param = min(out_results, key=out_results.get)
    return best_score, best_param
if __name__ == "__main__":
    RANDOM_STATE = 1337
    X, y = load_iris(return_X_y=True)
    sample_weight = np.array([1 + 100 * (i % 25) for i in range(len(X))])
    # sample_weight = np.array([1 for _ in range(len(X))])
    inner_cv = RepeatedKFold(n_splits=3, n_repeats=1, random_state=RANDOM_STATE)
    outer_cv = RepeatedKFold(n_splits=3, n_repeats=1, random_state=RANDOM_STATE)

    rfc = RandomForestClassifier(n_estimators=256,
                                 criterion="entropy",
                                 warm_start=False,
                                 n_jobs=-1,
                                 random_state=RANDOM_STATE)
    search_params = {"max_features": [1, 2, 3, 4]}
    fit_params = {"sample_weight": sample_weight}

    my_scorer = make_scorer(log_loss,
                            greater_is_better=False,
                            needs_proba=True,
                            needs_threshold=False)

    grid_clf = GridSearchCV(estimator=rfc,
                            scoring=my_scorer,
                            cv=inner_cv,
                            param_grid=search_params,
                            refit=True,
                            return_train_score=False,
                            iid=False)  # in this usage, the results are the same for `iid=True` and `iid=False`
    grid_clf.fit(X, y, **fit_params)
    print("This is the best out-of-sample score using GridSearchCV: %.6f." % -grid_clf.best_score_)

    msg = """This is the best out-of-sample score %s weighting using grid_cv: %.6f."""

    score_with_weights, param_with_weights = grid_cv(X_in=X,
                                                     y_in=y,
                                                     w_in=sample_weight,
                                                     cv=inner_cv,
                                                     max_features_grid=search_params.get("max_features"),
                                                     use_weighting=True)
    print(msg % ("WITH", score_with_weights))

    score_without_weights, param_without_weights = grid_cv(X_in=X,
                                                           y_in=y,
                                                           w_in=sample_weight,
                                                           cv=inner_cv,
                                                           max_features_grid=search_params.get("max_features"),
                                                           use_weighting=False)
    print(msg % ("WITHOUT", score_without_weights))
which produces the output:
This is the best out-of-sample score using GridSearchCV: 0.135692.
This is the best out-of-sample score WITH weighting using grid_cv: 0.099367.
This is the best out-of-sample score WITHOUT weighting using grid_cv: 0.135692.
Note: since manually computing the loss without weighting produces the same score as GridSearchCV, we know that the sample weights are not being used.
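To see concretely that sample_weight changes the metric itself, here is a minimal standalone sketch (toy numbers of my own, not part of the unit test) comparing log_loss with and without weights:

import numpy as np
from sklearn.metrics import log_loss

y_true = [0, 1, 1]
y_prob = [[0.9, 0.1],   # confident, correct
          [0.2, 0.8],   # confident, correct
          [0.6, 0.4]]   # wrong-leaning prediction

print(log_loss(y_true, y_prob))  # unweighted mean loss
# Up-weighting the poorly predicted third sample increases the loss:
print(log_loss(y_true, y_prob, sample_weight=np.array([1.0, 1.0, 10.0])))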
---

The GridSearchCV takes scoring as input, which can be a callable. You can see the details of how to change the scoring function, and how to pass your own scoring function, here. For the sake of completeness, here is the relevant code from that page:
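(The snippet below is my reconstruction of that documentation example; the fbeta_score and LinearSVC names come from the scikit-learn docs, not from this thread.)

from sklearn.metrics import fbeta_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC

# make_scorer turns a metric into a scorer callable; extra keyword
# arguments (here beta=2) are forwarded to the metric on every call.
ftwo_scorer = make_scorer(fbeta_score, beta=2)
grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]},
                    scoring=ftwo_scorer)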
EDIT: fit_params is passed only to the fit function, and not to the score function. If there are parameters which are supposed to be passed to the scorer, they should be passed to make_scorer. But that still does not solve the issue here, since it would mean passing the whole sample_weight parameter to log_loss, while only the part that corresponds to y_test at the time the loss is computed should be passed. sklearn does NOT support such a thing, but you can hack your way through using a pandas.DataFrame. The good news is that sklearn understands DataFrames and keeps them that way, which means you can exploit the index of the DataFrame, as you see in the code here:

# more code
X, y = load_iris(return_X_y=True)
index = ['r%d' % x for x in range(len(y))]
y_frame = pd.DataFrame(y, index=index)
sample_weight = np.array([1 + 100 * (i % 25) for i in range(len(X))])
sample_weight_frame = pd.DataFrame(sample_weight, index=index)
# more code

def score_f(y_true, y_pred, sample_weight):
    # y_true arrives as a slice of y_frame, so its index identifies the
    # test rows; use it to pick out the matching sample weights.
    return log_loss(y_true.values,
                    y_pred,
                    sample_weight=sample_weight.loc[y_true.index.values].values.reshape(-1),
                    normalize=True)

score_params = {"sample_weight": sample_weight_frame}

my_scorer = make_scorer(score_f,
                        greater_is_better=False,
                        needs_proba=True,
                        needs_threshold=False,
                        **score_params)

grid_clf = GridSearchCV(estimator=rfc,
                        scoring=my_scorer,
                        cv=inner_cv,
                        param_grid=search_params,
                        refit=True,
                        return_train_score=False,
                        iid=False)  # in this usage, the results are the same for `iid=True` and `iid=False`
grid_clf.fit(X, y_frame)
# more code
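The hack works because positional slicing of a DataFrame preserves its row labels, so the scorer can recover which rows it was handed. A minimal sketch of that mechanism (my own toy illustration, using KFold directly rather than GridSearchCV):

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

y = pd.DataFrame(np.arange(6), index=['r%d' % i for i in range(6)])
for _, test_ndx in KFold(n_splits=3).split(y):
    y_test = y.iloc[test_ndx]      # positional slice, as done during CV
    print(list(y_test.index))      # the string index survives the slice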
As you can see, score_f uses the index of y_true to find which parts of sample_weight to use. For the sake of completeness, here is the whole code:

from __future__ import division

import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss, make_scorer
from sklearn.model_selection import GridSearchCV, RepeatedKFold


def grid_cv(X_in, y_in, w_in, cv, max_features_grid, use_weighting):
    out_results = dict()
    for k in max_features_grid:
        clf = RandomForestClassifier(n_estimators=256,
                                     criterion="entropy",
                                     warm_start=False,
                                     n_jobs=1,
                                     random_state=RANDOM_STATE,
                                     max_features=k)
        for train_ndx, test_ndx in cv.split(X=X_in, y=y_in):
            X_train = X_in[train_ndx, :]
            y_train = y_in[train_ndx]
            w_train = w_in[train_ndx]
            y_test = y_in[test_ndx]
            clf.fit(X=X_train, y=y_train, sample_weight=w_train)
            y_hat = clf.predict_proba(X=X_in[test_ndx, :])
            if use_weighting:
                w_test = w_in[test_ndx]
                w_i_sum = w_test.sum()
                score = w_i_sum / w_in.sum() * log_loss(y_true=y_test, y_pred=y_hat, sample_weight=w_test)
            else:
                score = log_loss(y_true=y_test, y_pred=y_hat)
            results = out_results.get(k, [])
            results.append(score)
            out_results.update({k: results})
    for k, v in out_results.items():
        if use_weighting:
            mean_score = sum(v)
        else:
            mean_score = np.mean(v)
        out_results.update({k: mean_score})
    best_score = min(out_results.values())
    best_param = min(out_results, key=out_results.get)
    return best_score, best_param


# if __name__ == "__main__":
if True:
    RANDOM_STATE = 1337
    X, y = load_iris(return_X_y=True)
    index = ['r%d' % x for x in range(len(y))]
    y_frame = pd.DataFrame(y, index=index)
    sample_weight = np.array([1 + 100 * (i % 25) for i in range(len(X))])
    sample_weight_frame = pd.DataFrame(sample_weight, index=index)
    # sample_weight = np.array([1 for _ in range(len(X))])
    inner_cv = RepeatedKFold(n_splits=3, n_repeats=1, random_state=RANDOM_STATE)
    outer_cv = RepeatedKFold(n_splits=3, n_repeats=1, random_state=RANDOM_STATE)

    rfc = RandomForestClassifier(n_estimators=256,
                                 criterion="entropy",
                                 warm_start=False,
                                 n_jobs=1,
                                 random_state=RANDOM_STATE)
    search_params = {"max_features": [1, 2, 3, 4]}

    def score_f(y_true, y_pred, sample_weight):
        # y_true is a slice of y_frame; its index identifies the test rows.
        return log_loss(y_true.values,
                        y_pred,
                        sample_weight=sample_weight.loc[y_true.index.values].values.reshape(-1),
                        normalize=True)

    score_params = {"sample_weight": sample_weight_frame}

    my_scorer = make_scorer(score_f,
                            greater_is_better=False,
                            needs_proba=True,
                            needs_threshold=False,
                            **score_params)

    grid_clf = GridSearchCV(estimator=rfc,
                            scoring=my_scorer,
                            cv=inner_cv,
                            param_grid=search_params,
                            refit=True,
                            return_train_score=False,
                            iid=False)  # in this usage, the results are the same for `iid=True` and `iid=False`
    grid_clf.fit(X, y_frame)
    print("This is the best out-of-sample score using GridSearchCV: %.6f." % -grid_clf.best_score_)

    msg = """This is the best out-of-sample score %s weighting using grid_cv: %.6f."""

    score_with_weights, param_with_weights = grid_cv(X_in=X,
                                                     y_in=y,
                                                     w_in=sample_weight,
                                                     cv=inner_cv,
                                                     max_features_grid=search_params.get("max_features"),
                                                     use_weighting=True)
    print(msg % ("WITH", score_with_weights))

    score_without_weights, param_without_weights = grid_cv(X_in=X,
                                                           y_in=y,
                                                           w_in=sample_weight,
                                                           cv=inner_cv,
                                                           max_features_grid=search_params.get("max_features"),
                                                           use_weighting=False)
    print(msg % ("WITHOUT", score_without_weights))
The output of the code is:
This is the best out-of-sample score using GridSearchCV: 0.095439.
This is the best out-of-sample score WITH weighting using grid_cv: 0.099367.
This is the best out-of-sample score WITHOUT weighting using grid_cv: 0.135692.
EDIT 2: As said in the comment below:

The difference between my score and the sklearn score using this solution originates from the way that I was computing a weighted average of scores. If you omit the weighted-average portion of the code, the two outputs match to machine precision.
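For intuition about where that difference comes from, here is a toy sketch (numbers are mine, not from the thread): with iid=False, GridSearchCV reports a plain mean of the per-fold scores, while grid_cv weights each fold's score by that fold's share of the total sample weight:

import numpy as np

fold_scores = np.array([0.10, 0.08, 0.12])       # per-fold weighted log losses
fold_wsums = np.array([3000.0, 2500.0, 2000.0])  # sum of sample weights per fold

print(fold_scores.mean())                                    # plain mean, GridSearchCV-style
print((fold_wsums / fold_wsums.sum() * fold_scores).sum())   # weight-share average, grid_cv-style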