main.py 文件源码-python代码片段

main.py 文件源码

python

阅读 24 收藏 0 点赞 0 评论 0

项目：Kaggle-DS-Bowl-17 作者: Zephyr-D 项目源码文件源码

def train_xgboost():
    """Train five XGBoost regressors on mean-pooled per-sample features.

    Reads ``data/stage1_labels.csv`` (columns ``id`` and ``cancer``) and, for
    every id that has a matching ``data/stage1/<id>.npy`` file, loads the
    array, reshapes it to ``(n, 2048)`` and averages over the first axis to
    obtain one 2048-value feature vector.  Ids without a feature file are
    skipped.  The samples are split 90/10 (stratified on the label) and five
    ``xgb.XGBRegressor`` models that differ only in their seed are fitted
    with early stopping evaluated on the held-out 10%.

    The shape of the assembled feature matrix is printed to stdout.

    Returns:
        list: the five fitted ``xgb.XGBRegressor`` instances, in seed order
        (822, 823, ..., 826).

    Raises:
        ValueError: if none of the labelled ids has a feature file on disk.
    """
    df = pd.read_csv('data/stage1_labels.csv')

    x = []
    y = []
    for did, label in zip(df['id'].tolist(), df['cancer'].tolist()):
        path = 'data/stage1/%s.npy' % did
        # Labels may list ids whose features were never extracted; skip them.
        if not os.path.isfile(path):
            continue
        f = np.load(path)
        f = f.reshape(f.shape[0], 2048)
        # Collapse the per-row features into a single vector for this id.
        x.append(np.mean(f, axis=0))
        y.append(label)

    if not x:
        # Fail early with a clear message instead of an obscure error from
        # the splitter when handed empty arrays.
        raise ValueError('no feature files found under data/stage1/ for the '
                         'ids listed in data/stage1_labels.csv')

    x = np.array(x)
    # Parenthesised form prints the same tuple on Python 2 and Python 3.
    print(x.shape)
    y = np.array(y)

    # NOTE(review): `cross_validation` is presumably scikit-learn's legacy
    # module (superseded by `sklearn.model_selection`) -- confirm against the
    # file's imports before upgrading scikit-learn.
    trn_x, val_x, trn_y, val_y = cross_validation.train_test_split(
        x, y, random_state=822, stratify=y, test_size=0.1)

    clfs = []
    for s in range(5):
        # Some parameters were taken from discussion.
        # NOTE(review): logloss is evaluated on a regressor's raw output;
        # confirm the configured objective keeps predictions inside (0, 1).
        clf = xgb.XGBRegressor(n_estimators=1000, max_depth=10, min_child_weight=10,
                               learning_rate=0.01, subsample=0.80, colsample_bytree=0.70,
                               seed=822 + s, reg_alpha=0.1)

        clf.fit(trn_x, trn_y, eval_set=[(val_x, val_y)], verbose=True,
                eval_metric='logloss', early_stopping_rounds=100)
        clfs.append(clf)
    return clfs