def parametered_cv(x, y, k_fold, k_fold1):
    """Cross-validate a class-weighted XGBoost classifier over stratified folds.

    For every outer fold a decision threshold is chosen on the training
    part, an ``xgb.XGBClassifier`` is fitted with per-sample weights that
    up-weight the positive class by the negative/positive ratio, and the
    held-out part is scored with ``score_function``.

    Args:
        x: 2-D feature array, indexed by row with the fold index arrays.
        y: 1-D label array aligned with ``x``; the code compares labels
            against 0 and 1.
        k_fold: Number of outer folds passed to ``StratifiedKFold``.
        k_fold1: Threshold selection mode. ``> 1`` calls
            ``threshold_estimate_cv(x_train, y_train, k_fold1)``, ``== 1``
            calls ``threshold_estimate(x_train, y_train)``, anything else
            uses the fixed threshold 0.5.

    Returns:
        A 4-tuple ``(metrics_1, pred, predicted, features1)``:

        * ``metrics_1`` -- float32 array of shape ``(k_fold + 1, 5)`` with
          columns (threshold, precision, recall, f1, mcc); one row per fold
          followed by a final row holding the column means.
        * ``pred`` -- float array with one row per held-out sample and
          columns (sample index, true label, thresholded prediction).
        * ``predicted`` -- the ``predict_proba`` outputs of all folds
          stacked row-wise, in fold order.
        * ``features1`` -- for each fold, rows of (feature index,
          importance) sorted by decreasing importance, stacked row-wise.

    Raises:
        ZeroDivisionError: If a training fold contains no sample with
            label 1 (the class-weight ratio cannot be computed).
    """
    print("samples: %d %d %d %d" % (x.shape[0], x.shape[1], k_fold, k_fold1))
    # NOTE(review): labels are passed to the constructor and the object is
    # iterated directly, which looks like the legacy
    # sklearn.cross_validation API -- confirm against the file's imports.
    kf = StratifiedKFold(y, n_folds=k_fold, shuffle=True, random_state=0)

    metrics = np.zeros((k_fold, 5), dtype="float32")
    # Per-fold pieces are collected in lists and stacked once at the end.
    fold_indices = []
    fold_labels = []
    fold_decisions = []
    fold_probs = []
    fold_features = []

    print("Positive: %d Negative: %d" % (sum(y == 1), sum(y == 0)))
    for cnt, (train_index, test_index) in enumerate(kf):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        print(y_train.shape)
        print("%d %d %d %d" % (x_train.shape[0], x_train.shape[1],
                               x_test.shape[0], x_test.shape[1]))

        # Pick the decision threshold using the training part only.
        if k_fold1 > 1:
            thresh, _ = threshold_estimate_cv(x_train, y_train, k_fold1)
        elif k_fold1 == 1:
            thresh = threshold_estimate(x_train, y_train)
        else:
            thresh = 0.5
        print("%d %f" % (x_train.shape[0], thresh))

        # Up-weight positives by the negative/positive ratio. The weight
        # vector must be float: assigning the ratio into an integer array
        # would silently truncate it (e.g. 3.7 -> 3).
        n_negative = float(len(y_train[y_train == 0]))
        n_positive = float(len(y_train[y_train == 1]))
        weight = n_negative / n_positive
        w1 = np.ones(y_train.shape[0], dtype=float)
        w1[y_train == 1] = weight

        clf = xgb.XGBClassifier(max_depth=10, learning_rate=0.1,
                                n_estimators=1000, nthread=50)
        clf.fit(x_train, y_train, sample_weight=w1)
        prob = clf.predict_proba(x_test)
        # Column 1 of predict_proba is compared against the threshold.
        yfit1 = prob[:, 1] > thresh

        fold_indices.append(test_index)
        fold_labels.append(y_test)
        fold_decisions.append(yfit1)
        fold_probs.append(prob)

        precision, recall, f1, mcc = score_function(y_test, yfit1)
        metrics[cnt, :] = np.array((thresh, precision, recall, f1, mcc))
        print(metrics[cnt, :])

        # Rank features of this fold's model by decreasing importance.
        importances = clf.feature_importances_
        indices1 = np.argsort(importances)[::-1]
        fold_features.append(
            np.transpose(np.array((indices1, importances[indices1]))))

    # One row per held-out sample: (index, label, decision), all as float.
    pred = np.transpose(np.array((np.concatenate(fold_indices, axis=0),
                                  np.concatenate(fold_labels, axis=0),
                                  np.concatenate(fold_decisions, axis=0)),
                                 dtype=float))
    # Cast to float so the output dtype does not depend on the classifier's.
    predicted = np.asarray(np.concatenate(fold_probs, axis=0), dtype=float)
    features1 = np.asarray(np.concatenate(fold_features, axis=0), dtype=float)

    # Append the per-column mean as an extra summary row.
    aver_metrics = np.reshape(np.mean(metrics, axis=0), (1, metrics.shape[1]))
    metrics_1 = np.concatenate((metrics, aver_metrics), axis=0)
    print(aver_metrics)
    return metrics_1, pred, predicted, features1
# Single run using gradient tree boosting
# (web-page residue, not code) Comment list
# (web-page residue, not code) Table of contents