def fit(self,
        train_fs, train_labels,
        valid_fs, valid_labels):
    # Fixed-size query groups: every rank_k consecutive rows form one group.
    rank_k = self.config.getint('RANK', 'rank_k')
    train_DMatrix = xgb.DMatrix(train_fs, label=train_labels)
    train_DMatrix.set_group([rank_k] * (len(train_labels) // rank_k))
    valid_DMatrix = xgb.DMatrix(valid_fs, label=valid_labels)
    valid_DMatrix.set_group([rank_k] * (len(valid_labels) // rank_k))
    watchlist = [(train_DMatrix, 'train'), (valid_DMatrix, 'valid')]
    # self.__lock()
    self.model = xgb.train(self.params,
                           train_DMatrix,
                           self.params['num_round'],
                           watchlist,
                           early_stopping_rounds=self.params['early_stop'],
                           verbose_eval=self.params['verbose_eval'])
    LogUtil.log('INFO', 'best_ntree_limit=%d' % self.model.best_ntree_limit)
    # self.__unlock()
    valid_preds = self.model.predict(valid_DMatrix, ntree_limit=self.model.best_ntree_limit)
    return valid_preds
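The fit() above depends on attributes defined elsewhere in its class (self.config, self.params, LogUtil). Below is a minimal, self-contained sketch of the same grouping idea on synthetic data, assuming the pairwise ranking objective; shapes and values are illustrative only.

# Minimal learning-to-rank sketch on invented data: fixed-size query groups.
import numpy as np
import xgboost as xgb

rank_k = 5                                        # documents per query (assumed)
X = np.random.rand(100, 10)                       # 20 queries x 5 docs each
y = np.random.randint(0, 2, size=100)             # binary relevance labels

dtrain = xgb.DMatrix(X, label=y)
dtrain.set_group([rank_k] * (len(y) // rank_k))   # one entry per query group

params = {'objective': 'rank:pairwise', 'eta': 0.1, 'max_depth': 4}
bst = xgb.train(params, dtrain, num_boost_round=50)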
Python train() example source code
def xgb_train(train_config, X_train, y_train, X_test, y_test):
    import numpy as np
    import xgboost as xgb
    LOGGER.info("X_train.shape={}, y_train.shape={}, X_test.shape={}, y_test.shape={}".format(
        X_train.shape, y_train.shape, X_test.shape, y_test.shape))
    param = train_config["param"]
    xg_train = xgb.DMatrix(X_train, label=y_train)
    xg_test = xgb.DMatrix(X_test, label=y_test)
    num_round = int(train_config["num_round"])
    watchlist = [(xg_train, 'train'), (xg_test, 'test')]
    try:
        bst = xgb.train(param, xg_train, num_round, watchlist)
    except KeyboardInterrupt:
        LOGGER.info("Canceled by user's Ctrl-C action")
        return
    # Multiclass probability output: one column per class, take the argmax as the label.
    y_pred = np.argmax(bst.predict(xg_test), axis=1)
    acc = 100. * np.sum(y_pred == y_test) / len(y_test)
    LOGGER.info("accuracy={}%".format(acc))
def Options():
    from optparse import OptionParser
    op = OptionParser()
    op.add_option('-E', '--events')
    op.add_option('-l', '--train')
    op.add_option('-F', '--fstr')
    op.add_option('-B', '--bags')
    op.add_option('-R', '--seed')
    op.add_option('-T', '--trees')
    op.add_option('-O', '--out')
    op.add_option('-d', '--depth')
    op.add_option('-e', '--eta')
    op.add_option('-S', '--subsample')
    op.add_option('-v', '--toeval')
    op.add_option("-V", action="store_true", dest="verbose")
    op.add_option("-D", action="store_true", dest="dump")
    op.add_option("-Q", action="store_true", dest="names")
    op.add_option("-P", action="store_true", dest="production")
    return op.parse_args()[0]
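The parsed options are presumably mapped onto xgboost parameters elsewhere in the script. A hypothetical sketch of such a mapping, using the destinations defined above (eta, depth, subsample, seed, trees, verbose); the default values here are assumptions, not the script's actual defaults.

# Hypothetical mapping of the parsed options onto an xgboost parameter dict.
opts = Options()
params = {
    'eta': float(opts.eta or 0.1),
    'max_depth': int(opts.depth or 6),
    'subsample': float(opts.subsample or 1.0),
    'seed': int(opts.seed or 0),
    'silent': 0 if opts.verbose else 1,
}
num_round = int(opts.trees or 100)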
def predict():
    saved = state.load('model')
    #saved = None
    if debug_mode:
        saved = None
    if saved is None:
        train, y, test, _ = data.get()
        z = pd.DataFrame()
        z['id'] = test.id
        z['y'] = 0
        v = pd.DataFrame()
        v['id'] = train.id
        v['y'] = y
        cv, _ = run(train, y, test, v, z)
        state.save('model', (v, z, cv, None))
    else:
        v, z, cv, _ = saved
    return v, z, cv, _
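predict() relies on a small `state` module with load/save caching helpers that is not shown in this excerpt. A plausible pickle-based stand-in follows, assuming an on-disk cache directory; the module layout and path are assumptions, not the project's actual implementation.

# Hypothetical stand-in for the `state` helper used above: a thin pickle cache.
import os
import pickle

_CACHE_DIR = 'state_cache'          # assumed cache location

def load(name):
    path = os.path.join(_CACHE_DIR, name + '.pkl')
    if not os.path.exists(path):
        return None                  # predict() treats None as "not cached yet"
    with open(path, 'rb') as f:
        return pickle.load(f)

def save(name, obj):
    os.makedirs(_CACHE_DIR, exist_ok=True)
    with open(os.path.join(_CACHE_DIR, name + '.pkl'), 'wb') as f:
        pickle.dump(obj, f)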
def predict():
    saved = state.load('model')
    #saved = None
    if saved is None:
        train, y, test, _ = data.get()
        z = pd.DataFrame()
        z['id'] = test.id
        z['y'] = 0
        v = pd.DataFrame()
        v['id'] = train.id
        v['y'] = y
        cv, _ = run(train, y, test, v, z)
        state.save('model', (v, z, cv, None))
    else:
        v, z, cv, _ = saved
    return v, z, cv, _
def predict():
    saved = state.load('model')
    #saved = None
    if debug_mode:
        saved = None
    if saved is None:
        train, y, test, _ = data.get()
        ftrain, ftest, _ = fea_1.get()
        ftrain2, ftest2, _ = fea_2.get()
        train = pd.concat([train, ftrain, ftrain2], axis=1)
        test = pd.concat([test, ftest, ftest2], axis=1)
        print(train.shape, test.shape)
        z = pd.DataFrame()
        z['id'] = test.id
        z['y'] = 0
        v = pd.DataFrame()
        v['id'] = train.id
        v['y'] = y
        cv, _ = run(train, y, test, v, z)
        state.save('model', (v, z, cv, None))
    else:
        v, z, cv, _ = saved
    return v, z, cv, _
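This variant joins engineered feature frames onto train/test with pd.concat(axis=1), which aligns rows by index rather than by position. A small illustrative example (the frames are invented for the demo) shows why resetting the index first avoids silently introducing NaN rows when indices differ.

# Column-wise concat aligns on the index; resetting the index first keeps the
# rows paired positionally when the two frames carry different indices.
import pandas as pd

base = pd.DataFrame({'id': [10, 11, 12]})
extra = pd.DataFrame({'f1': [0.1, 0.2, 0.3]}, index=[5, 6, 7])

joined = pd.concat([base.reset_index(drop=True),
                    extra.reset_index(drop=True)], axis=1)
print(joined)            # id and f1 now line up row by row, no NaN padding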
def xgb_base(train2, y, test2, v, z, xgb_params, N_splits, N_seeds, cname, base_seed=42):
    # Out-of-fold predictions accumulate in v[cname]; averaged test predictions in z[cname].
    v[cname], z[cname] = 0, 0
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + base_seed
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            dtrain = xgb.DMatrix(train2.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch, early_stopping_rounds=100, verbose_eval=False)
            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: ' % (xgb_params['seed'], n + 1, skf.n_splits), score, now())
            scores.append(score)
    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
    print('validation loss: ', metrics.log_loss(y, prestore(v[cname])))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
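xgb_base() expects an xgb_params dict plus pconvert/prestore helpers defined elsewhere in the pipeline. Below is a hedged example of parameters under the assumption of a binary log-loss setup; the values are illustrative, and the call is commented out because train2, test2, v, and z come from the surrounding code.

# Illustrative binary-classification parameters for xgb_base(); the values are
# not taken from the source, and pconvert/prestore must be defined elsewhere.
xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'eta': 0.05,
    'max_depth': 5,
    'subsample': 0.9,
    'colsample_bytree': 0.8,
    'silent': 1,
}
# xgb_base(train2, y, test2, v, z, xgb_params,
#          N_splits=5, N_seeds=3, cname='xgb1')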
def xgb_base(train2, y, test2, v, z, xgb_params, N_splits, N_seeds, cname, base_seed=42):
    # Variant that reseeds the fold split per seed, so each seed sees different folds.
    v[cname], z[cname] = 0, 0
    scores = []
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + base_seed
        skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True, random_state=s + base_seed)
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            dtrain = xgb.DMatrix(train2.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch, early_stopping_rounds=100, verbose_eval=False)
            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: ' % (xgb_params['seed'], n + 1, skf.n_splits), score, now())
            scores.append(score)
    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
    print('validation loss: ', metrics.log_loss(y, prestore(v[cname])))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
def save_results(v, z):
    pred_path = '../submissions/p' + base_data_name() + '.csv'
    all_data_path = '../data/output/model' + base_data_name() + '.csv.gz'
    # Clip probabilities away from 0/1 so log-loss on the submission stays finite.
    z.y = np.clip(z.y.values, 1e-5, 1 - 1e-5)
    z[['y']].to_csv(pred_path, header=None, index=False)
    v['train'] = 1
    z['train'] = 0
    q = pd.concat([v, z], axis=0)
    q.to_csv(all_data_path, index=False, compression='gzip')
    for c in z.columns:
        if c in {'id', 'train', 'y'}:
            continue
        z[c] = prestore(z[c])
    print(z.head(20))
    print('saved', pred_path, all_data_path)
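save_results() clips the submitted probabilities to [1e-5, 1 - 1e-5]. A tiny self-contained check with invented numbers shows why: a prediction of exactly 0 or 1 on the wrong side makes the log-loss infinite, while clipping caps each term at about -log(1e-5), roughly 11.5.

# Why clip: log(0) blows up a single confidently wrong prediction.
import numpy as np

y_true = np.array([1, 0, 1, 1], dtype=float)
raw = np.array([0.0, 0.1, 0.9, 0.8])            # first prediction is confidently wrong

def manual_log_loss(y, p):
    return -np.mean(y * np.log(p) + (1 - y) * np.log(1 - p))

clipped = np.clip(raw, 1e-5, 1 - 1e-5)
print(manual_log_loss(y_true, raw))              # inf (numpy warns about log(0))
print(manual_log_loss(y_true, clipped))          # finite, dominated by the clipped term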
def fit(self, X, y, Xt=None, yt=None,
        load_model=None, save_model=None,
        obj=None, feval=None, print_fscore=True, evalx=None):
    print(X.shape, y.shape)
    num_round = self.params.get('num_round', 100)
    early_stopping_rounds = self.params.get('early_stopping_rounds', None)
    maximize = self.params.get('maximize', False)
    dtrain = xgb.DMatrix(X, y)
    vb = self.params.get('verbose_eval', 1)
    if Xt is not None:
        # With a validation set: watch both matrices and allow early stopping.
        dvalid = xgb.DMatrix(Xt, yt)
        watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
        bst = xgb.train(self.params, dtrain, num_round, evals=watchlist,
                        early_stopping_rounds=early_stopping_rounds, verbose_eval=vb,
                        xgb_model=load_model, obj=obj, feval=feval, maximize=maximize)
    else:
        watchlist = [(dtrain, 'train')]
        bst = xgb.train(self.params, dtrain, num_round, evals=watchlist,
                        verbose_eval=vb, xgb_model=load_model, obj=obj, feval=feval)
    self.bst = bst
    if save_model is not None:
        bst.save_model(save_model)
    fscore = self.feature_importance()
    if print_fscore:
        print("Feature Importance:")
        for i in fscore:
            print(i)
    if Xt is not None and evalx is not None:
        yp = self.predict(Xt)
        score = evalx(yt, yp)
        print(score)
        return score
    return 0
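A hedged usage sketch for the fit() wrapper above. The class name (here XGBWrapper), its constructor, and the data variables are assumptions; only the fit() signature comes from the snippet, so the actual call is left commented out.

# Hypothetical usage of the fit() wrapper; XGBWrapper and the data are assumed.
from sklearn import metrics

params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'eta': 0.1,
    'max_depth': 6,
    'num_round': 200,               # read via self.params.get('num_round', 100)
    'early_stopping_rounds': 20,
    'verbose_eval': 10,
}
# model = XGBWrapper(params)        # assumed wrapper class holding self.params
# score = model.fit(X_train, y_train, X_valid, y_valid,
#                   save_model='xgb.model',
#                   evalx=lambda yt, yp: metrics.log_loss(yt, yp))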
Source: two_sigma_financial_modelling.py (project: PortfolioTimeSeriesAnalysis, author: MizioAnd)
def save_dataframe(self, df):
    with pd.HDFStore(''.join([TwoSigmaFinModTools._save_path, 'train_debug', self.timestamp, '.h5']), "w") as train:
        train.put("train_debug", df)
Source: two_sigma_financial_modelling.py (project: PortfolioTimeSeriesAnalysis, author: MizioAnd)
def load_dataframe():
    dataframe_name = 'train_debug'
    # one-hot encoded
    # not one-hot
    # date_time = '20170613_19h09m40s'
    # date_time = '20170613_19h34m31s'
    # date_time = '20170614_00h07m32s'
    date_time = '20170619_11h47m22s'
    with pd.HDFStore(''.join([TwoSigmaFinModTools._save_path, dataframe_name, date_time, '.h5']), 'r') as train:
        return train.get(dataframe_name)
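load_dataframe() hard-codes a single timestamp. Below is a hypothetical variant that picks the newest matching HDF5 file instead; the helper name and glob pattern are assumptions, only TwoSigmaFinModTools._save_path and the 'train_debug' key come from the snippets above.

# Hypothetical variant: load the most recently written 'train_debug*.h5' file.
import glob
import os
import pandas as pd

def load_latest_dataframe(save_path=TwoSigmaFinModTools._save_path,
                          name='train_debug'):
    candidates = glob.glob(''.join([save_path, name, '*.h5']))
    if not candidates:
        raise FileNotFoundError('no saved %s frames under %s' % (name, save_path))
    newest = max(candidates, key=os.path.getmtime)   # newest by modification time
    with pd.HDFStore(newest, 'r') as store:
        return store.get(name)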