def train_xgboost(labels_path='data/stage1_labels.csv', features_dir='data/stage1',
                  n_models=5, seed=822):
    """Train several XGBoost regressors on mean-pooled per-patient features.

    The labels CSV must provide an ``id`` and a ``cancer`` column. For every
    id that has a matching ``<features_dir>/<id>.npy`` file, the array is
    reshaped to ``(n, 2048)`` and averaged over its first axis, giving one
    2048-element feature vector per id. Ids without a feature file are
    skipped silently.
    (Presumably each row of the .npy file is a per-slice CNN feature vector;
    confirm against the feature-extraction step.)

    The samples are split 90/10 (stratified on the label) and ``n_models``
    regressors are fitted on the training part. The models share all
    hyper-parameters except the random seed.

    Args:
        labels_path: Path of the CSV file holding the ``id``/``cancer`` columns.
        features_dir: Directory that contains one ``<id>.npy`` file per sample.
        n_models: Number of regressors to train.
        seed: Random state of the train/validation split; model ``s`` is
            seeded with ``seed + s``.

    Returns:
        A list with the ``n_models`` fitted ``xgb.XGBRegressor`` instances.

    Raises:
        ValueError: If none of the ids in the labels file has a feature file.
    """
    labels = pd.read_csv(labels_path)

    x = []
    y = []
    for patient_id, label in zip(labels['id'].tolist(), labels['cancer'].tolist()):
        feature_path = os.path.join(features_dir, '%s.npy' % patient_id)
        if not os.path.isfile(feature_path):
            # No extracted features for this id: leave it out of the data set.
            continue
        feats = np.load(feature_path)
        feats = feats.reshape(feats.shape[0], 2048)
        x.append(np.mean(feats, axis=0))
        y.append(label)

    if not x:
        # Fail with a clear message instead of an obscure error from the split below.
        raise ValueError('no feature files found in %r for the ids listed in %r'
                         % (features_dir, labels_path))

    x = np.array(x)
    # Parenthesised form prints the same text on Python 2 and parses on Python 3.
    print(x.shape)
    y = np.array(y)

    # NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20;
    # switch the file-level import to sklearn.model_selection when upgrading.
    trn_x, val_x, trn_y, val_y = cross_validation.train_test_split(
        x, y, random_state=seed, stratify=y, test_size=0.1)

    clfs = []
    for s in range(n_models):
        # Some parameters were taken from discussion.
        clf = xgb.XGBRegressor(n_estimators=1000, max_depth=10, min_child_weight=10,
                               learning_rate=0.01, subsample=0.80, colsample_bytree=0.70,
                               seed=seed + s, reg_alpha=0.1)
        # Validation log-loss is monitored; training stops early once it has
        # not improved for 100 consecutive rounds.
        clf.fit(trn_x, trn_y, eval_set=[(val_x, val_y)], verbose=True,
                eval_metric='logloss', early_stopping_rounds=100)
        clfs.append(clf)
    return clfs
评论列表
文章目录