def _best_f1_threshold(precision, recall, thresholds, skip=2):
    """Return ``(best_f1, threshold)`` for the F1-maximising curve point.

    ``precision``, ``recall`` and ``thresholds`` are the three arrays
    returned by ``precision_recall_curve``.  The first ``skip`` curve
    points are ignored when searching for the maximum.
    """
    # NOTE(review): the original code skips the first two curve points; the
    # reason is not evident from this file -- confirm before changing `skip`.
    p = precision[skip:]
    r = recall[skip:]
    denom = p + r
    # Where precision and recall are both zero, F1 would be 0/0 = NaN and
    # np.argmax would select that NaN entry.  Define F1 as 0 there instead.
    f1 = np.zeros(denom.shape, dtype="float64")
    valid = denom > 0
    f1[valid] = 2 * p[valid] * r[valid] / denom[valid]
    best = np.argmax(f1)
    return f1[best], thresholds[skip + best]


def threshold_estimate_cv(x, y, k_fold):
    """Estimate an F1-maximising decision threshold by stratified CV.

    For each of ``k_fold`` stratified folds an ``xgb.XGBClassifier`` is
    fitted on the training split with the positive samples re-weighted by
    the negative/positive count ratio, the held-out split is scored, and
    the probability threshold with the highest F1 on the precision-recall
    curve is recorded.

    Args:
        x: Feature matrix, indexed by integer index arrays
            (presumably a NumPy array -- confirm against the caller).
        y: Label vector with classes ``0`` and ``1``, indexed like ``x``.
        k_fold: Number of cross-validation folds.

    Returns:
        A tuple ``(mean_threshold, thresholds)`` where ``thresholds`` is a
        ``float32`` array of length ``k_fold`` holding the per-fold best
        threshold and ``mean_threshold`` is its mean.

    Raises:
        ZeroDivisionError: If a training split contains no positive sample.
    """
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print("%d %d %d" % (y.shape[0], np.sum(y == 1), np.sum(y == 0)))
    # NOTE(review): this is the old scikit-learn call signature
    # (labels passed to the constructor, `n_folds`); kept because the
    # file's imports are not visible here.
    kf1 = StratifiedKFold(y, n_folds=k_fold, shuffle=True, random_state=0)
    threshold = np.zeros((k_fold), dtype="float32")
    for cnt, (train_index, test_index) in enumerate(kf1):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Balance the classes: each positive sample weighs n_neg / n_pos.
        n_neg = np.sum(y_train == 0)
        n_pos = np.sum(y_train == 1)
        weight = float(n_neg) / float(n_pos)
        # The weight vector must be floating point; with an integer array
        # the fractional part of `weight` would be silently truncated.
        w1 = np.ones(y_train.shape[0], dtype="float64")
        w1[y_train == 1] = weight

        estimator = xgb.XGBClassifier(max_depth=10, learning_rate=0.1,
                                      n_estimators=1000, nthread=50)
        estimator.fit(x_train, y_train, sample_weight=w1)
        # Column 1 of predict_proba is used as the positive-class score.
        y_scores = estimator.predict_proba(x_test)[:, 1]

        precision, recall, thresholds = precision_recall_curve(y_test, y_scores)
        best_f1, best_threshold = _best_f1_threshold(precision, recall, thresholds)
        threshold[cnt] = best_threshold
        print("%d %f %f" % (precision.shape[0], best_f1, best_threshold))
    return np.mean(threshold), threshold
# Cross validation using gradient tree boosting
# Comment list
# Article table of contents