# mainPEP.py source file - Python code snippet

def parametered_cv(x,y,k_fold,k_fold1):
    """Run stratified k-fold cross-validation of an XGBoost classifier.

    For every outer fold a decision threshold is chosen on the training
    part, a class-weighted ``xgb.XGBClassifier`` is fitted, and the test
    part is scored with ``score_function``.

    Parameters
    ----------
    x : array indexable by integer index arrays, with a two-entry ``shape``
        Feature matrix (assumed samples x features -- confirm with caller).
    y : array of labels compared against 0 and 1
        Class labels; 1 is treated as the positive class.
    k_fold : int
        Number of outer stratified folds.
    k_fold1 : int
        Threshold selection mode: ``> 1`` calls
        ``threshold_estimate_cv(x_train, y_train, k_fold1)``, ``== 1`` calls
        ``threshold_estimate(x_train, y_train)``, anything else uses 0.5.

    Returns
    -------
    metrics_1 : ndarray, shape (k_fold + 1, 5), float32
        One row per fold holding (threshold, precision, recall, f1, mcc),
        followed by a final row with the column-wise mean.
    pred : ndarray, shape (n_tested, 3), float64
        Columns are sample index, true label and thresholded prediction.
    predicted : ndarray, float64
        ``predict_proba`` outputs of all folds stacked row-wise.
    features1 : ndarray, float64
        For each fold, rows of (feature index, importance) ordered by
        decreasing importance, stacked row-wise over the folds.

    Raises
    ------
    ZeroDivisionError
        If a training fold contains no samples labelled 1.
    """
    print("samples: %d %d %d %d" % (x.shape[0],x.shape[1],k_fold,k_fold1))
    kf = StratifiedKFold(y, n_folds=k_fold, shuffle=True, random_state=0)
    metrics = np.zeros((k_fold,5),dtype="float32")
    # Per-fold pieces are collected in lists and concatenated once at the end.
    fold_indices = []
    fold_labels = []
    fold_decisions = []
    fold_probs = []
    fold_features = []
    print("Positive: %d Negative: %d" % (sum(y==1), sum(y==0)))
    for cnt, (train_index, test_index) in enumerate(kf):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        print(y_train.shape)
        print("%d %d %d %d" % (x_train.shape[0], x_train.shape[1], x_test.shape[0], x_test.shape[1]))

        # Pick the decision threshold using the training part only.
        if k_fold1>1:
            thresh, _ = threshold_estimate_cv(x_train,y_train,k_fold1)
        elif k_fold1==1:
            thresh = threshold_estimate(x_train,y_train)
        else:
            thresh = 0.5
        print("%d %f" % (x_train.shape[0], thresh))

        # Weight positives by the negative/positive ratio. The weight vector
        # must be floating point: assigning the ratio into an integer array
        # would silently truncate it (e.g. 2.7 -> 2, 0.4 -> 0).
        n_negative = float(len(y_train[y_train == 0]))
        n_positive = float(len(y_train[y_train == 1]))
        weight = n_negative/n_positive
        w1 = np.ones(y_train.shape[0], dtype="float64")
        w1[y_train==1] = weight

        clf = xgb.XGBClassifier(max_depth=10, learning_rate=0.1, n_estimators=1000, nthread=50)
        clf.fit(x_train, y_train, sample_weight=w1)
        prob = clf.predict_proba(x_test)
        yfit1 = (prob[:,1]>thresh)

        fold_indices.append(test_index)
        fold_labels.append(y_test)
        fold_decisions.append(yfit1)
        fold_probs.append(prob)

        precision, recall, f1, mcc = score_function(y_test,yfit1)
        metrics[cnt,:] = np.array((thresh,precision,recall,f1,mcc))
        print(metrics[cnt,:])

        # Feature ranking of this fold, most important first.
        importances = clf.feature_importances_
        indices1 = np.argsort(importances)[::-1]
        fold_features.append(np.transpose(np.array((indices1,importances[indices1]))))

    # Cast to float64 so the returned arrays keep the dtypes callers received
    # before (the previous accumulation always promoted to float64).
    index = np.concatenate(fold_indices,axis=0).astype("float64")
    label = np.concatenate(fold_labels,axis=0).astype("float64")
    yfit = np.concatenate(fold_decisions,axis=0).astype("float64")
    pred = np.transpose(np.array((index,label,yfit)))
    predicted = np.concatenate(fold_probs,axis=0).astype("float64")
    features1 = np.concatenate(fold_features,axis=0).astype("float64")

    # Append the across-fold average as an extra last row.
    aver_metrics = np.mean(metrics,axis=0)
    aver_metrics = np.reshape(aver_metrics,(1,metrics.shape[1]))
    metrics_1 = np.concatenate((metrics,aver_metrics),axis=0)
    print(aver_metrics)
    return metrics_1, pred, predicted, features1

# Single run using gradient tree boosting