def fit(self, X, y, Xg, Xt=None, yt=None, Xgt=None, load_model=None, save_model=None):
    print(X.shape, y.shape)
    num_round = self.params['num_round']
    early_stopping_rounds = self.params['early_stopping_rounds']
    dtrain = xgb.DMatrix(X, y)
    dtrain.set_group(Xg)
    if Xt is not None:
        dvalid = xgb.DMatrix(Xt, yt)
        dvalid.set_group(Xgt)
        watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
        bst = xgb.train(self.params, dtrain, num_round, evals=watchlist,
                        early_stopping_rounds=early_stopping_rounds,
                        verbose_eval=1, xgb_model=load_model, maximize=True)
    else:
        watchlist = [(dtrain, 'train')]
        bst = xgb.train(self.params, dtrain, num_round, evals=watchlist,
                        verbose_eval=1, xgb_model=load_model)
    self.bst = bst
    if save_model is not None:
        bst.save_model(save_model)
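The wrapper above targets a ranking objective: set_group supplies the per-query group sizes that xgb.train needs. A minimal usage sketch with synthetic data (the rank:pairwise objective and all values below are illustrative assumptions, not taken from the source):

import numpy as np
import xgboost as xgb

X = np.random.rand(8, 3)
y = np.array([0, 1, 0, 1, 0, 1, 0, 1], dtype=float)
groups = [4, 4]  # two query groups of four documents each; must sum to len(y)

dtrain = xgb.DMatrix(X, y)
dtrain.set_group(groups)
params = {'objective': 'rank:pairwise', 'eta': 0.1, 'eval_metric': 'map'}
bst = xgb.train(params, dtrain, 10)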
Python train() usage examples

two_sigma_financial_modelling.py (project: PortfolioTimeSeriesAnalysis, author: MizioAnd)
def predicted_vs_actual_y_xgb(self, xgb, best_nrounds, xgb_params, x_train_split, x_test_split, y_train_split,
                              y_test_split, title_name):
    # Split the training data into an extra test set
    # x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, y_train)
    dtrain_split = xgb.DMatrix(x_train_split, label=y_train_split)
    dtest_split = xgb.DMatrix(x_test_split)
    print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), np.shape(y_test_split))
    gbdt = xgb.train(xgb_params, dtrain_split, best_nrounds)
    y_predicted = gbdt.predict(dtest_split)
    plt.figure(figsize=(10, 5))
    plt.scatter(y_test_split, y_predicted, s=20)
    rmse_pred_vs_actual = self.rmse(y_predicted, y_test_split)
    plt.title(''.join([title_name, ', Predicted vs. Actual.', ' rmse = ', str(rmse_pred_vs_actual)]))
    plt.xlabel('Actual y')
    plt.ylabel('Predicted y')
    plt.plot([min(y_test_split), max(y_test_split)], [min(y_test_split), max(y_test_split)])
    plt.tight_layout()
def run_gbm(dtrain, dvalid, param):
    # check training arguments in param
    n_round = param.get('num_boost_round', 5000)
    early_stop = param.get('early_stopping_rounds', 5)
    verbose_eval = param.get('verbose_eval', 100)
    # specify validation set to watch performance
    watchlist = [(dvalid, 'eval')]
    bst = xgb.train(params=param,
                    dtrain=dtrain,
                    num_boost_round=n_round,
                    evals=watchlist,
                    early_stopping_rounds=early_stop,
                    verbose_eval=verbose_eval)
    return bst
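A usage sketch for run_gbm with synthetic binary data; the keys mirror the .get() defaults read above, and every value is illustrative:

import numpy as np
import xgboost as xgb

X = np.random.rand(200, 5)
y = np.random.randint(0, 2, 200)
dtrain = xgb.DMatrix(X[:150], label=y[:150])
dvalid = xgb.DMatrix(X[150:], label=y[150:])
param = {'objective': 'binary:logistic', 'eta': 0.1,
         'num_boost_round': 100, 'early_stopping_rounds': 10, 'verbose_eval': 50}
bst = run_gbm(dtrain, dvalid, param)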
def crate_pre_train_model(x_, y_):
    (x_train, x_test) = train_test_split(x_, test_size=0.1, random_state=1)
    (y_train, y_test) = train_test_split(y_, test_size=0.1, random_state=1)
    dtrain = xgb.DMatrix(x_train, label=y_train)
    dtest = xgb.DMatrix(x_test, label=y_test)
    evallist = [(dtrain, 'train'), (dtest, 'eval')]
    param = {'objective': 'reg:linear', 'max_depth': 3}
    param['nthread'] = 64
    # param['min_child_weight'] = 15
    # param['subsample'] = 1
    # param['num_class'] = 7
    plst = list(param.items())  # list of (key, value) pairs for xgb.train
    num_round = 5000
    bst = xgb.train(plst, dtrain, num_round,
                    evallist, early_stopping_rounds=100,
                    # obj=logregobj,
                    feval=evalerror  # custom eval metric, defined elsewhere
                    )
    return bst
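The call above passes feval=evalerror without defining it. A hedged sketch of what such a custom metric could look like (the actual implementation is not shown in the source): an xgb.train feval receives the raw predictions and the evaluation DMatrix and returns a (name, value) pair.

import numpy as np

def evalerror(preds, dtrain):
    # illustrative metric only: mean absolute error against the labels
    labels = dtrain.get_label()
    return 'mae', float(np.mean(np.abs(preds - labels)))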
# %% main
svm_classification.py (project: Video-Classification-Action-Recognition, author: qijiezhao)
def get_data(item='train', id=1, is_shuffle=False, is_subtrain=1):
    file_path = os.path.join(metadata_root, item + '_list0' + str(id) + '.txt')
    files = []
    labels = []
    with open(file_path, 'r') as fp:
        lines = fp.readlines()
    if is_shuffle:
        np.random.shuffle(lines)
    if is_subtrain != 1:  # keep only a random fraction of the list
        lines = random.sample(lines, int(len(lines) * is_subtrain))
    for line in lines:
        tmp_prefix = line.strip().split('.')[0].split('/')[1]
        label_tmp = line.strip().split(' ')[1]
        files.append(os.path.join(feature_root, tmp_prefix + '.npy'))
        labels.append(int(label_tmp) - 1)  # labels in the list file are 1-based
    return files, np.array(labels, dtype=np.float64)
def tune_num_boost_round():
    # global watchlist
    global num_boost_round
    global evals_result
    global eval_metric_xgb_format
    evals_result = {}
    xgb.train(params=params_no_sklearn, dtrain=dtrain, num_boost_round=num_boost_round,
              evals=watchlist, evals_result=evals_result)
    evals_result = evals_result['eval'][eval_metric_xgb_format]
    # pprint.pprint(evals_result)
    best_score = 0.0
    best_loc = 0
    for i, v in enumerate(evals_result):
        # print('%d ...... %d : %f' % (i, best_loc, best_score))
        if v > best_score:
            best_score = v
            best_loc = i
    # print("best_loc : %s , best_score : %s" % (best_loc, best_score))
    num_boost_round = best_loc + 1
    print('**** num_boost_round : ', num_boost_round)
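The manual scan above can usually be replaced by xgboost's built-in early stopping, which tracks the best iteration directly. A sketch under the same assumptions (the params_no_sklearn, dtrain, and watchlist globals used above; the patience of 50 is illustrative):

bst = xgb.train(params=params_no_sklearn, dtrain=dtrain,
                num_boost_round=num_boost_round, evals=watchlist,
                early_stopping_rounds=50)
num_boost_round = bst.best_iteration + 1  # best_iteration is 0-based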
def tune_num_boost_round():
    # global watchlist
    global num_boost_round
    global evals_result
    evals_result = {}
    xgb.train(params=params_no_sklearn, dtrain=dtrain, num_boost_round=num_boost_round,
              evals=watchlist, evals_result=evals_result)
    evals_result = evals_result['eval']['map']
    pprint.pprint(evals_result)
    best_score = 0.0
    best_loc = 0
    for i, v in enumerate(evals_result):
        # print('%d ...... %d : %f' % (i, best_loc, best_score))
        if v > best_score:
            best_score = v
            best_loc = i
    print("best_loc : %d , best_score : %f" % (best_loc, best_score))
    num_boost_round = best_loc + 1
    print('**** num_boost_round : ', num_boost_round)
def load_data():
    train_data = pd.read_csv(os.path.join(data_folder, 'train.csv'), delimiter=';', skip_blank_lines=True)
    test_data = pd.read_csv(os.path.join(data_folder, 'test.csv'), delimiter=';', skip_blank_lines=True,
                            na_values='None')
    ntrain = train_data.shape[0]
    ntest = test_data.shape[0]
    print('ntrain={}'.format(ntrain))
    print('ntest={}'.format(ntest))
    y_train = train_data['cardio'].values
    # --------------------------------------------------------------
    x_train = train_data.drop(["id", "cardio"], axis=1)
    x_test = test_data.drop(["id"], axis=1)
    x_test = x_test.replace('None', np.nan)  # DataFrame.replace is not in-place
    return (x_train, y_train, x_test)
# ---------------------------------------------------------------------
def main():
    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser()
    parser.add_argument('tag')
    parser.add_argument('--train-recordfile', default='train',
                        help='identifier for file with the users to train on (default: train). deprecated: specify in hps...')
    parser.add_argument('-n', '--n-rounds', type=int, default=50,
                        help='Number of rounds of boosting. Deprecated: specify this in hp config file')
    parser.add_argument('--weight', action='store_true',
                        help='Whether to do per-instance weighting. Deprecated: specify in hps')
    args = parser.parse_args()
    try:
        hps = hypers.hps_for_tag(args.tag)
    except hypers.NoHpsDefinedException:
        logging.warning('No hps found for tag {}. Creating and saving some.'.format(args.tag))
        hps = hypers.get_default_hparams()
        hps.train_file = args.train_recordfile
        hps.rounds = args.n_rounds
        hps.weight = args.weight
        hypers.save_hps(args.tag, hps)
    validate_hps(hps)
    dataset = Dataset(hps.train_file, hps)
    with time_me(mode='stderr'):
        train(dataset, args.tag, hps)
def xgb_train(train_config, X_train, y_train, X_test, y_test):
    import xgboost as xgb
    LOGGER.info("X_train.shape={}, y_train.shape={}, X_test.shape={}, y_test.shape={}".format(
        X_train.shape, y_train.shape, X_test.shape, y_test.shape))
    param = train_config["param"]
    xg_train = xgb.DMatrix(X_train, label=y_train)
    xg_test = xgb.DMatrix(X_test, label=y_test)
    num_round = int(train_config["num_round"])
    watchlist = [(xg_train, 'train'), (xg_test, 'test')]
    try:
        bst = xgb.train(param, xg_train, num_round, watchlist)
    except KeyboardInterrupt:
        LOGGER.info("Canceled by user's Ctrl-C action")
        return
    y_pred = np.argmax(bst.predict(xg_test), axis=1)
    acc = 100. * np.sum(y_pred == y_test) / len(y_test)
    LOGGER.info("accuracy={}%".format(acc))
def train_relatedness_classifier(trainX, trainY):
    xg_train = xgb.DMatrix(trainX, label=trainY)
    # setup parameters for xgboost
    param = {}
    # binary classification with probability output
    param['objective'] = 'binary:logistic'
    # learning rate
    param['eta'] = 0.1
    param['max_depth'] = 6
    param['silent'] = 1
    param['nthread'] = 20
    num_round = 1000
    relatedness_classifier = xgb.train(param, xg_train, num_round)
    return relatedness_classifier
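A usage sketch with synthetic data; with binary:logistic, predict returns the per-row probability of the positive class:

import numpy as np
import xgboost as xgb

trainX = np.random.rand(100, 5)
trainY = np.random.randint(0, 2, 100)
clf = train_relatedness_classifier(trainX, trainY)
probs = clf.predict(xgb.DMatrix(trainX))  # shape (100,), values in [0, 1]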
def predicted_vs_actual_sale_price_xgb(self, xgb_params, x_train, y_train, seed, title_name):
    # Split the training data into an extra test set
    x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, y_train)
    dtrain_split = xgb.DMatrix(x_train_split, label=y_train_split)
    dtest_split = xgb.DMatrix(x_test_split)
    res = xgb.cv(xgb_params, dtrain_split, num_boost_round=1000, nfold=4, seed=seed, stratified=False,
                 early_stopping_rounds=25, verbose_eval=10, show_stdv=True)
    best_nrounds = res.shape[0] - 1
    print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), np.shape(y_test_split))
    gbdt = xgb.train(xgb_params, dtrain_split, best_nrounds)
    y_predicted = gbdt.predict(dtest_split)
    plt.figure(figsize=(10, 5))
    plt.scatter(y_test_split, y_predicted, s=20)
    rmse_pred_vs_actual = self.rmse(y_predicted, y_test_split)
    plt.title(''.join([title_name, ', Predicted vs. Actual.', ' rmse = ', str(rmse_pred_vs_actual)]))
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    plt.plot([min(y_test_split), max(y_test_split)], [min(y_test_split), max(y_test_split)])
    plt.tight_layout()
def fit(self, X, y):
    if self.use_mspe:
        lgb_train = lgb.Dataset(X, y,
                                weight=np.ones(X.shape[0]),
                                free_raw_data=False)
        lgb_test = lgb.Dataset(X, y, reference=lgb_train,
                               weight=np.ones(X.shape[0]),
                               free_raw_data=False)
        self.gbm = lgb.train(
            self.kwargs,
            lgb_train,
            num_boost_round=10,
            fobj=mspe,  # custom objective, defined elsewhere
            feval=evalerror_lgbm,  # custom eval metric, defined elsewhere
            valid_sets=lgb_test)
    else:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3)  # note: the split is unused below; fit trains on the full X, y
        # lgb_test = lgb.Dataset(X, y, reference=lgb_train,
        #                        weight=np.ones(X.shape[0]),
        #                        free_raw_data=False)
        self.gbm.fit(X, y, early_stopping_rounds=10, eval_set=[(X, y)], verbose=False)
        # print("gbm best_iteration=", self.gbm.best_iteration)
def setUpClass(self):
    """
    Set up the unit test by loading the dataset and training a model.
    """
    if not HAS_XGBOOST:
        return
    if not HAS_SKLEARN:
        return
    scikit_data = load_boston()
    dtrain = xgboost.DMatrix(scikit_data.data, label=scikit_data.target,
                             feature_names=scikit_data.feature_names)
    xgb_model = xgboost.train({}, dtrain, 1)
    # Save the data and the model
    self.scikit_data = scikit_data
    self.xgb_model = xgb_model
    self.feature_names = self.scikit_data.feature_names
test_boosted_trees_regression_numeric.py (project: coremltools, author: apple)
def _train_convert_evaluate(self, bt_params={}, **params):
    """
    Train a model, convert it, and evaluate the converted model against
    the original predictions.
    """
    # Train a model
    xgb_model = xgboost.train(bt_params, self.dtrain, **params)
    # Convert the model
    spec = xgb_converter.convert(xgb_model, self.feature_names, self.output_name, force_32bit_float=False)
    # Get predictions
    df = pd.DataFrame(self.X, columns=self.feature_names)
    df['prediction'] = xgb_model.predict(self.dtrain)
    # Evaluate it
    metrics = evaluate_regressor(spec, df, target='target', verbose=False)
    return metrics
def regression_with_xgboost_no_cv(x_train, y_train, X_test, Y_test, features=None, xgb_params=None, num_rounds=10):
    train_data = xgb.DMatrix(x_train, label=y_train, missing=float('nan'))
    test_data = xgb.DMatrix(X_test, Y_test, missing=float('nan'))
    evallist = [(train_data, 'train'), (test_data, 'eval')]
    if xgb_params is None:
        xgb_params = get_default_xgboost_params()
        print("xgb_params not found")
    print("XGBoost, using param", xgb_params)
    gbdt = xgb.train(xgb_params, train_data, num_rounds, evallist, verbose_eval=True, early_stopping_rounds=5)
    isgbtree = xgb_params["booster"] == "gbtree"
    if isgbtree:
        ceate_feature_map_for_feature_importance(features)  # helper defined elsewhere in the project
        show_feature_importance(gbdt)
        y_pred = gbdt.predict(xgb.DMatrix(X_test, missing=float('nan')), ntree_limit=gbdt.best_ntree_limit)
    else:
        y_pred = gbdt.predict(xgb.DMatrix(X_test, missing=float('nan')))
    return XGBoostModel(gbdt), y_pred
def predict(self, X):
    train_file = os.path.join(self.tmp_dir, 'train.svm')
    pred_file = os.path.join(self.tmp_dir, 'pred.svm')
    out_file = os.path.join(self.tmp_dir, 'out.txt')
    print("Exporting pred...")
    with open(pred_file, 'w') as f:
        dump_svmlight_file(X, np.zeros(X.shape[0]), f=f)
    params = self.params.copy()
    params['iter'] = 0
    params['task'] = 'r'
    params['train'] = train_file
    params['test'] = pred_file
    params['out'] = out_file
    params['load_model'] = os.path.join(self.tmp_dir, 'model.libfm')
    params = " ".join("-{} {}".format(k, params[k]) for k in params)
    command = "{} {}".format(self.exec_path, params)
    print(command)
    os.system(command)
    return pd.read_csv(out_file, header=None).values.flatten()
def ExtGBDT(train_x, train_y, test_x, test_y):
    """ Ext-GBDT """
    num_round = 100
    param = {'objective': 'binary:logistic', 'booster': 'gbtree', 'eta': 0.03, 'max_depth': 3, 'eval_metric': 'auc',
             'silent': 1, 'min_child_weight': 0.1, 'subsample': 0.7, 'colsample_bytree': 0.8, 'nthread': 4,
             'max_delta_step': 0}
    train_X = xgb.DMatrix(train_x, train_y)
    test_X = xgb.DMatrix(test_x)
    bst = xgb.train(param, train_X, num_round)
    pred = bst.predict(test_X)
    predict_y = []
    for i in range(len(pred)):
        if pred[i] < 0.5:
            predict_y.append(0)
        else:
            predict_y.append(1)
    auc = evaluate_auc(pred, test_y)
    evaluate(predict_y, test_y)
    return auc
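The element-wise thresholding loop in ExtGBDT can be collapsed into one vectorized expression; an equivalent sketch:

import numpy as np

pred = np.array([0.2, 0.7, 0.5])
predict_y = (pred >= 0.5).astype(int).tolist()  # [0, 1, 1], same as the loop above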
def ka_bagging_2class_or_reg_lgbm(X_train, y_train, seed, bag_round, params,
                                  X_test, using_notebook=True, num_boost_round=0):
    '''
    early version
    '''
    # create array object to hold predictions
    baggedpred = np.zeros(shape=X_test.shape[0]).astype(np.float32)
    # loop for as many times as we want bags
    if using_notebook:
        for n in tqdm_notebook(range(0, bag_round)):
            # shuffle first, aids in increasing variance and forces different results
            X_train, y_train = shuffle(X_train, y_train, random_state=seed + n)
            params['seed'] = seed + n
            model = lightgbm.train(params, lightgbm.Dataset(X_train, y_train), num_boost_round=num_boost_round)
            pred = model.predict(X_test)
            baggedpred += pred / bag_round
    return baggedpred
Stock_Prediction_Model_XgBoost.py (project: StockRecommendSystem, author: doncat99)
def do_run(self, train, predict, window):
    LabelColumnName = 'label'
    data_file = "data_file_xgboost_" + str(window) + ".pkl"
    if os.path.exists(data_file):
        input = open(data_file, 'rb')
        data_feature = pickle.load(input)
        input.close()
    else:
        data_feature = get_all_stocks_feature_data(self.paras, window, LabelColumnName)
        output = open(data_file, 'wb')
        pickle.dump(data_feature, output)
        output.close()
    model = None
    train_feature = {}
    if train: model = self.train_data(data_feature, window, LabelColumnName)
    if predict: self.predict_data(model, data_feature, window, LabelColumnName)
def runXGB(train_X, train_y, seed_val=123):
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta'] = 0.05
    param['max_depth'] = 6
    param['silent'] = 1
    param['num_class'] = 22
    param['eval_metric'] = "mlogloss"
    param['min_child_weight'] = 2
    param['subsample'] = 0.9
    param['colsample_bytree'] = 0.9
    param['seed'] = seed_val
    num_rounds = 115
    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)
    model = xgb.train(plst, xgtrain, num_rounds)
    return model
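A usage sketch; with multi:softprob and num_class=22, predict returns one probability per class per row (synthetic data, illustrative only):

import numpy as np
import xgboost as xgb

train_X = np.random.rand(50, 4)
train_y = np.random.randint(0, 22, 50)
model = runXGB(train_X, train_y)
probs = model.predict(xgb.DMatrix(train_X))
print(probs.shape)  # (50, 22)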
def cross_validate(train):
    # separate training and validation set
    X_train, X_valid = split_train_validation(train)
    scores = []
    preds = []
    for i in range(len(X_train)):
        # convert X_train, Y_train etc... to xgboost matrix
        dtrain = xgb.DMatrix(X_train[i][['phone_brand', 'device_model', 'timestamp']], label=X_train[i]['group'], missing=np.nan)
        dvalid = xgb.DMatrix(X_valid[i][['phone_brand', 'device_model', 'timestamp']], label=X_valid[i]['group'], missing=np.nan)
        # predict with xgboost
        parameters = {'max_depth': 4, 'eta': 0.1, 'silent': 1, 'subsample': 0.8, 'colsample_bytree': 0.8,
                      'objective': 'multi:softprob', 'booster': 'gbtree', 'early_stopping_rounds': 50,
                      'num_class': 12, 'num_boost_round': 1000, 'eval_metric': 'mlogloss'}
        plst = list(parameters.items())
        # note: 'early_stopping_rounds' and 'num_boost_round' are ignored when passed
        # inside the parameter dict; xgb.train here runs its default of 10 rounds
        bst = xgb.train(plst, dtrain)
        pred = bst.predict(dvalid)
        scores.append(log_loss(X_valid[i]['group'].tolist(), pred))
        pred = pd.DataFrame(pred, index=X_valid[i].index, columns=target_encoder.classes_)
        preds.append(pred)
    return scores, preds
def test_basic(c, s, a, b):
    dtrain = xgb.DMatrix(df, label=labels)
    bst = xgb.train(param, dtrain)
    ddf = dd.from_pandas(df, npartitions=4)
    dlabels = dd.from_pandas(labels, npartitions=4)
    dbst = yield dxgb._train(c, param, ddf, dlabels)
    dbst = yield dxgb._train(c, param, ddf, dlabels)  # we can do this twice
    result = bst.predict(dtrain)
    dresult = dbst.predict(dtrain)
    correct = (result > 0.5) == labels
    dcorrect = (dresult > 0.5) == labels
    assert dcorrect.sum() >= correct.sum()
    predictions = dxgb.predict(c, dbst, ddf)
    assert isinstance(predictions, dd.Series)
    predictions = yield c.compute(predictions)._result()
    assert isinstance(predictions, pd.Series)
    assert ((predictions > 0.5) != labels).sum() < 2
def test_dmatrix_kwargs(c, s, a, b):
    xgb.rabit.init()  # workaround for "Doing rabit call after Finalize"
    dX = da.from_array(X, chunks=(2, 2))
    dy = da.from_array(y, chunks=(2,))
    dbst = yield dxgb._train(c, param, dX, dy, {"missing": 0.0})
    # Distributed model matches local model with dmatrix kwargs
    dtrain = xgb.DMatrix(X, label=y, missing=0.0)
    bst = xgb.train(param, dtrain)
    result = bst.predict(dtrain)
    dresult = dbst.predict(dtrain)
    assert np.abs(result - dresult).sum() < 0.02
    # Distributed model gives bad predictions without dmatrix kwargs
    dtrain_incompat = xgb.DMatrix(X, label=y)
    dresult_incompat = dbst.predict(dtrain_incompat)
    assert np.abs(result - dresult_incompat).sum() > 0.02
def test_numpy(c, s, a, b):
    xgb.rabit.init()  # workaround for "Doing rabit call after Finalize"
    dX = da.from_array(X, chunks=(2, 2))
    dy = da.from_array(y, chunks=(2,))
    dbst = yield dxgb._train(c, param, dX, dy)
    dbst = yield dxgb._train(c, param, dX, dy)  # we can do this twice
    dtrain = xgb.DMatrix(X, label=y)
    bst = xgb.train(param, dtrain)
    result = bst.predict(dtrain)
    dresult = dbst.predict(dtrain)
    correct = (result > 0.5) == y
    dcorrect = (dresult > 0.5) == y
    assert dcorrect.sum() >= correct.sum()
    predictions = dxgb.predict(c, dbst, dX)
    assert isinstance(predictions, da.Array)
    predictions = yield c.compute(predictions)._result()
    assert isinstance(predictions, np.ndarray)
    assert ((predictions > 0.5) != labels).sum() < 2
def test_synchronous_api(loop):  # noqa
    dtrain = xgb.DMatrix(df, label=labels)
    bst = xgb.train(param, dtrain)
    ddf = dd.from_pandas(df, npartitions=4)
    dlabels = dd.from_pandas(labels, npartitions=4)
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as c:
            dbst = dxgb.train(c, param, ddf, dlabels)
    result = bst.predict(dtrain)
    dresult = dbst.predict(dtrain)
    correct = (result > 0.5) == labels
    dcorrect = (dresult > 0.5) == labels
    assert dcorrect.sum() >= correct.sum()
def fit(self, X, y=None):
    """Fit the gradient boosting model

    Parameters
    ----------
    X : array-like [n_samples, n_features]
    y : array-like

    Returns
    -------
    self : the fitted Regressor

    Notes
    -----
    This differs from the XGBoost version in not supporting the ``eval_set``,
    ``eval_metric``, ``early_stopping_rounds`` and ``verbose`` fit kwargs.
    """
    client = default_client()
    xgb_options = self.get_xgb_params()
    self._Booster = train(client, xgb_options, X, y,
                          num_boost_round=self.n_estimators)
    return self
def train(params, dmatrix_train, dmatrix_validate):
    params['silent'] = 1
    params['objective'] = 'binary:logistic'  # output probabilities
    params['eval_metric'] = 'auc'
    num_rounds = params["num_rounds"]
    early_stopping_rounds = params["early_stop_rounds"]
    # early stopping is judged on the last dataset in the watchlist
    watchlist = [(dmatrix_train, 'train'), (dmatrix_validate, 'validate')]
    bst = xgb.train(params, dmatrix_train, num_rounds, watchlist, early_stopping_rounds=early_stopping_rounds)
    print("parameters: {}".format(params))
    print("best {}: {:.2f}".format(params["eval_metric"], bst.best_score))
    print("best_iteration: %d" % bst.best_iteration)
    return params, bst
def fit(self, X, y, x_val=None, y_val=None):
    dtrain = xgb.DMatrix(X, label=y)
    if x_val is not None:
        dtest = xgb.DMatrix(x_val, label=y_val)
        watchlist = [(dtrain, 'train'), (dtest, 'validation')]
        self.clf = xgb.train(params=self.params,
                             dtrain=dtrain,
                             num_boost_round=self.num_round,
                             early_stopping_rounds=self.early_stopping_rounds,
                             evals=watchlist,
                             verbose_eval=self.verbose)
    else:
        # early_stopping_rounds is dropped here: xgboost requires a non-empty
        # evals list for early stopping and would raise otherwise
        self.clf = xgb.train(params=self.params,
                             dtrain=dtrain,
                             num_boost_round=self.num_round)
    return
def fit(self, X, y, x_val=None, y_val=None):
    dtrain = xgb.DMatrix(X, label=y)
    if x_val is not None:
        dtest = xgb.DMatrix(x_val, label=y_val)
        watchlist = [(dtrain, 'train'), (dtest, 'validation')]
        self.xgb = xgb.train(params=self.params,
                             dtrain=dtrain,
                             num_boost_round=self.num_round,
                             early_stopping_rounds=self.early_stopping_rounds,
                             evals=watchlist,
                             verbose_eval=self.verbose)
    else:
        # early_stopping_rounds is dropped here: xgboost requires a non-empty
        # evals list for early stopping and would raise otherwise
        self.xgb = xgb.train(params=self.params,
                             dtrain=dtrain,
                             num_boost_round=self.num_round,
                             verbose_eval=self.verbose)
    return
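A usage sketch for this wrapper. XGBWrapper is a hypothetical host class, shown only to supply the attributes fit() reads (params, num_round, early_stopping_rounds, verbose); it is not part of the source.

import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split

class XGBWrapper:
    fit = fit  # reuse the method defined above

    def __init__(self):
        self.params = {'objective': 'binary:logistic', 'eta': 0.1}
        self.num_round = 100
        self.early_stopping_rounds = 10
        self.verbose = False

X = np.random.rand(300, 6)
y = np.random.randint(0, 2, 300)
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.25)
m = XGBWrapper()
m.fit(X_tr, y_tr, x_val=X_val, y_val=y_val)
pred = m.xgb.predict(xgb.DMatrix(X_val))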