def main():
    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser()
    parser.add_argument('tags', metavar='tag', nargs='+')
    parser.add_argument('--fold', default='test',
                        help='identifier for file with the users to test on (default: test)')
    args = parser.parse_args()
    for model_tag in args.tags:
        hps = hypers.hps_for_tag(model_tag)
        dataset = Dataset(args.fold, hps, mode=Mode.inference)
        path = common.resolve_xgboostmodel_path(model_tag)
        logging.info('Loading model with tag {}'.format(model_tag))
        model = xgb.Booster(model_file=path)
        logging.info('Computing probs for tag {}'.format(model_tag))
        with time_me('Computed probs for {}'.format(model_tag), mode='stderr'):
            pdict = get_pdict(model, dataset)
        logging.info('Got probs for {} users'.format(len(pdict)))
        # TODO: might want to enforce some namespace separation between
        # rnn-generated pdicts and ones coming from xgboost models?
        common.save_pdict_for_tag(model_tag, pdict, args.fold)
Example source code for the Python Booster() class
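The snippets below all revolve around a few recurring ways of constructing a Booster and scoring data. As a minimal sketch of those patterns (the file name model.bin and the feature matrix X are placeholders, not taken from any of the projects below):

import xgboost as xgb

# Construct a Booster directly from a saved model file (placeholder path).
bst = xgb.Booster(model_file='model.bin')

# Or create an empty Booster and load the model afterwards.
bst = xgb.Booster()
bst.load_model('model.bin')

# Predictions always go through a DMatrix wrapper around the features.
preds = bst.predict(xgb.DMatrix(X))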
File: precompute_probs.py
Project: instacart-basket-prediction
Author: colinmorris
File: s12_run_xgboost_only_train_create.py
Project: KAGGLE_AVITO_2016
Author: ZFTurbo
def run_train_with_model(train, features, model_path):
    start_time = time.time()
    gbm = xgb.Booster()
    gbm.load_model(model_path)
    print("Validating...")
    check = gbm.predict(xgb.DMatrix(train[features]))
    score = roc_auc_score(train['isDuplicate'].values, check)
    validation_df = pd.DataFrame({'itemID_1': train['itemID_1'].values, 'itemID_2': train['itemID_2'].values,
                                  'isDuplicate': train['isDuplicate'].values, 'probability': check})
    print('AUC score value: {:.6f}'.format(score))
    imp = get_importance(gbm, features)
    print('Importance array: ', imp)
    print('Prediction time: {} minutes'.format(round((time.time() - start_time)/60, 2)))
    return validation_df, score
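get_importance is a project helper that is not shown on this page. A minimal stand-in built on Booster.get_fscore(), which returns how many times each feature was used for a split, might look like the following (a sketch only, not the project's actual implementation):

def get_importance(gbm, features):
    # Rank features by XGBoost split count; `features` is kept only to match
    # the call sites above and is unused in this simplified version.
    importance = gbm.get_fscore()
    return sorted(importance.items(), key=lambda kv: kv[1], reverse=True)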
def run_test_with_model(train, test, features, model_path):
    start_time = time.time()
    gbm = xgb.Booster()
    gbm.load_model(model_path)
    print("Validating...")
    check = gbm.predict(xgb.DMatrix(train[features]))
    score = roc_auc_score(train['isDuplicate'].values, check)
    validation_df = pd.DataFrame({'isDuplicate': train['isDuplicate'].values, 'probability': check})
    # print(validation_df)
    print('AUC score value: {:.6f}'.format(score))
    # score1 = roc_auc_score(validation_df['isDuplicate'].values, validation_df['probability'])
    # print('AUC score check value: {:.6f}'.format(score1))
    imp = get_importance(gbm, features)
    print('Importance array: ', imp)
    print("Predict test set...")
    test_prediction = gbm.predict(xgb.DMatrix(test[features]))
    print('Training time: {} minutes'.format(round((time.time() - start_time)/60, 2)))
    return test_prediction.tolist(), validation_df, score
def xgboost_make_submission(retrain=False):
    sub_start_date = '2016-03-15'
    sub_end_date = '2016-04-16'
    if os.path.exists('./cache/bstmodel.bin') and not retrain:
        bst = xgb.Booster({'nthread': 4})
        bst.load_model('./cache/bstmodel.bin')
    else:
        bst = xgboost_train()
    sub_user_index, sub_trainning_data = make_test_set(sub_start_date, sub_end_date, )
    sub_trainning_data = xgb.DMatrix(sub_trainning_data.values)
    y = bst.predict(sub_trainning_data)
    sub_user_index['label'] = y
    pred = sub_user_index[sub_user_index['label'] >= 0.03]
    pred = pred[['user_id', 'sku_id']]
    pred = pred.groupby('user_id').first().reset_index()
    pred['user_id'] = pred['user_id'].astype(int)
    dt = datetime.datetime.now()
    sdt = str(dt.date()) + str(dt.hour) + str(dt.minute) + str(dt.second)
    pred.to_csv('./sub/submission_%s.csv' % sdt, index=False, index_label=False)
    # P = get_sku_ids_in_P()
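The cached model checked for above is produced by xgboost_train(), which is defined elsewhere in that project. The save side of the round trip is just Booster.save_model(); as a sketch (params, dtrain and the round count are placeholders, not the project's actual values):

# Hypothetical training counterpart: fit once, then persist the booster to the
# cache path that xgboost_make_submission() looks for on later runs.
bst = xgb.train(params, dtrain, num_boost_round=300)
bst.save_model('./cache/bstmodel.bin')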
def predict(self, Xt, Xg, load_model=None):
    print("load_model", load_model)
    dtest = xgb.DMatrix(Xt)
    dtest.set_group(Xg)
    if load_model and self.bst is None:
        self.bst = xgb.Booster(self.params, model_file=load_model)
    return self.bst.predict(dtest)
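The set_group call marks this as a learning-to-rank model: Xg holds the query-group sizes, i.e. how many consecutive rows of Xt belong to each query. For example, with illustrative sizes only:

# Three query groups containing 10, 7 and 5 candidate rows respectively;
# the sizes must sum to the number of rows in the DMatrix.
dtest = xgb.DMatrix(Xt)
dtest.set_group([10, 7, 5])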
def load_model(xgb_regressor, day, folder_path):
    booster = xgb.Booster()
    booster.load_model(folder_path + '%d.xgbmodel' % day)
    xgb_regressor._Booster = booster
def predict_eval_model(dtest, model_path):
    labels = dtest.get_label()
    bst = xgb.Booster(model_file=model_path)
    preds_prob = bst.predict(data=dtest)
    preds_original = bst.predict(data=dtest, output_margin=True)
    preds_label = []
    for pred_prob in preds_prob:
        if pred_prob >= 0.5:
            preds_label.append(1)
        else:
            preds_label.append(0)
    print 'true label:\n', labels
    from sklearn.metrics import accuracy_score
    accuracy_score = accuracy_score(labels, preds_label)
    # accuracy_score_num = accuracy_score(labels,preds_label,normalize=True)
    # print 'accuracy_score : %f\tnum of the predicted truly : %d/%d'%accuracy_score,accuracy_score_num,len(labels)
    print 'accuracy_score : %f' % accuracy_score
    print 'average_precision_score : %f' % average_precision_score(labels, preds_prob)
    print classification_report(labels, preds_label, target_names=['class 0', 'class 1'])
    print confusion_matrix(labels, preds_label, labels=[0, 1])
    f1_score_s = f1_score(labels, preds_label, pos_label=1)
    print 'f1 score : %f' % f1_score_s
    print 'precision_score : %f' % precision_score(labels, preds_label, pos_label=1)
    print 'recall_score : %f' % recall_score(labels, preds_label, pos_label=1)
    print 'roc_auc_score : %f' % roc_auc_score(labels, preds_prob)
    fpr, tpr, thresholds = roc_curve(labels, preds_prob)
    print fpr, tpr, thresholds
    roc_auc = auc(fpr, tpr)
    precision, recall, thresholds_pr = precision_recall_curve(labels, preds_prob, pos_label=1)
    plt.plot(precision, recall, label='P-R f1 score %f' % (f1_score_s))
    # plt.plot(fpr, tpr, lw=1, label='ROC (area = %0.2f)' % (roc_auc))
    # average_precision_score = average_precision_score(labels,preds_prob)
    # print 'average_precision_score : %f'%average_precision_score
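The thresholding loop above can also be written in one step with NumPy; as a sketch, equivalent in behaviour to the explicit loop:

import numpy as np

# Label a prediction 1 when its probability reaches 0.5, otherwise 0.
preds_label = (np.asarray(preds_prob) >= 0.5).astype(int).tolist()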
def predict_user():
    print('Predicting user-level probabilities...')
    xgb_model = xgb.Booster({'nthread': -1})
    xgb_model.load_model('./model/xgb_user.model')
    Online = pd.read_csv('./feat/online_user_model_feat.csv')
    donline = xgb.DMatrix(Online.drop(['user_id'], axis=1))
    xgb_proba = xgb_model.predict(donline)
    online_proba = Online[['user_id']]
    online_proba.loc[:, 'proba'] = xgb_proba
    online_proba.to_csv("./online_user_proba.csv", index=False)
def predict_sku():
    print('Predicting sku-level probabilities...')
    xgb_model = xgb.Booster({'nthread': -1})
    xgb_model.load_model('./model/xgb_sku.model')
    Online = pd.read_csv("./feat/online_sku_feat.csv")
    Online_drop_cols = ['user_id', 'sku_id', 'cate', 'brand']
    donline = xgb.DMatrix(Online.drop(Online_drop_cols, axis=1))
    # Predict
    xgb_proba = xgb_model.predict(donline)
    sku_proba = Online[['user_id', 'sku_id']]
    sku_proba.loc[:, 'sku_proba'] = xgb_proba
    # Keep only the highest-probability sku for each user
    sku_proba = sku_proba.groupby(['user_id'], as_index=False).apply(lambda t: t[t.sku_proba == t.sku_proba.max()]).reset_index()[['user_id', 'sku_id', 'sku_proba']]
    # Load the previously saved user-level probabilities
    user_proba = pd.read_csv("./online_user_proba.csv")
    # Sort both tables by probability, highest first
    sku_proba.sort_values(by="sku_proba", ascending=False, inplace=True)
    user_proba.sort_values(by="proba", ascending=False, inplace=True)
    # Take the top 500 users and the top 500 (user, sku) pairs
    Top_user = user_proba.iloc[:500]
    Top_sku = sku_proba.iloc[:500][['user_id', 'sku_id']]
    Top_user = sku_proba[sku_proba.user_id.isin(Top_user.user_id)]
    Top_user = Top_user.groupby(['user_id'], as_index=False).apply(lambda t: t[t.sku_proba == t.sku_proba.max()]).reset_index()[['user_id', 'sku_id']]
    pred = pd.concat([Top_sku, Top_user])
    pred = pred.drop_duplicates()
    pred = pred[pred.user_id.duplicated() == False]
    pred.astype(int).to_csv("online_submit.csv", index=False)
    print('Done.')
def run_test_with_model(train, test, features, target, random_state=0):
    start_time = time.time()
    test_size = 0.02
    # X_train, X_valid = train_test_split(train, test_size=test_size, random_state=random_state)
    split = round((1 - test_size) * len(train.index))
    X_train = train[0:split]
    X_valid = train[split:]
    print('Length train:', len(X_train.index))
    print('Length valid:', len(X_valid.index))
    # watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    # gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, feval=auc_xgboost, verbose_eval=True)
    # gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=True)
    gbm = xgb.Booster()
    gbm.load_model("models/model_0.968276662916_eta_0.2_md_5_test_size_0.02.bin")
    print("Validating...")
    check = gbm.predict(xgb.DMatrix(X_valid[features]))
    score = roc_auc_score(X_valid[target].values, check)
    score_kaggle = auc(X_valid[target].values, check)
    print('Check error value: {:.6f} (Kaggle: {:.6f})'.format(score, score_kaggle))
    imp = get_importance(gbm, features)
    print('Importance array: ', imp)
    print("Predict test set...")
    test_prediction = gbm.predict(xgb.DMatrix(test[features]))
    print('Training time: {} minutes'.format(round((time.time() - start_time)/60, 2)))
    return test_prediction.tolist(), score
File: single.py
Project: kaggle-yelp-restaurant-photo-classification
Author: u1234x1234
def __init__(self):
    self.clf = xgb.Booster()
File: single.py
Project: kaggle-yelp-restaurant-photo-classification
Author: u1234x1234
def fit(self, X, y):
    d = xgb.DMatrix(X, y)
    self.clf = xgb.Booster(param, [d])
    for i in range(50):
        self.clf.update(d, i)
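Creating a Booster from a parameter dict plus a cache list and then calling update() once per round is the manual form of boosting; the same 50 rounds could be expressed through xgb.train. As a sketch, assuming the same param, X and y as above:

d = xgb.DMatrix(X, y)
# Equivalent to xgb.Booster(param, [d]) followed by 50 calls to update(d, i).
clf = xgb.train(param, d, num_boost_round=50)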
File: predict_test.py
Project: kaggle-yelp-restaurant-photo-classification
Author: u1234x1234
def __init__(self):
    self.clf = xgb.Booster()
File: predict_test.py
Project: kaggle-yelp-restaurant-photo-classification
Author: u1234x1234
def fit(self, X, y):
    d = xgb.DMatrix(X, y)
    self.clf = xgb.Booster(param, [d])
    for i in range(10):
        self.clf.update(d, i)
File: ensemble.py
Project: kaggle-yelp-restaurant-photo-classification
Author: u1234x1234
def __init__(self):
    self.clf = xgb.Booster()
File: save_folds.py
Project: kaggle-yelp-restaurant-photo-classification
Author: u1234x1234
def __init__(self):
    self.clf = xgb.Booster()
File: save_folds.py
Project: kaggle-yelp-restaurant-photo-classification
Author: u1234x1234
def fit(self, X, y):
    d = xgb.DMatrix(X, y)
    self.clf = xgb.Booster(param, [d])
    for i in range(10):
        self.clf.update(d, i)
def load(self, model_fp):
    self.model = xgb.Booster(self.params)
    self.model.load_model(model_fp)
def predict(self, Xt, load_model=None):
    dtest = xgb.DMatrix(Xt)
    if load_model and self.bst is None:
        self.bst = xgb.Booster(self.params, model_file=load_model)
    return self.bst.predict(dtest)
def output_critical_tests(train, features, target, model_path, test_size):
    out_path = "cache/fails.html"
    out = open(out_path, "w", encoding='utf-8')
    gbm = xgb.Booster()
    gbm.load_model(model_path)
    types2 = {
        'itemID': np.dtype(int),
        'categoryID': np.dtype(int),
        'title': np.dtype(str),
        'description': np.dtype(str),
        'images_array': np.dtype(str),
        'attrsJSON': np.dtype(str),
        'price': np.dtype(float),
        'locationID': np.dtype(int),
        'metroID': np.dtype(float),
        'lat': np.dtype(float),
        'lon': np.dtype(float),
    }
    print("Load ItemInfo_train.csv")
    items = pd.read_csv("../input/ItemInfo_train.csv", dtype=types2)
    items.fillna(-1, inplace=True)
    split = round((1 - test_size) * len(train.index))
    X_train = train[0:split]
    X_valid = train[split:]
    print('Length train:', len(X_train.index))
    print('Length valid:', len(X_valid.index))
    print("Validating...")
    check = gbm.predict(xgb.DMatrix(X_valid[features]))
    # print(X_valid[features][:100])
    # print(check[:100])
    score = roc_auc_score(X_valid[target].values, check)
    print('Score: {}'.format(score))
    X_valid = append_items_info(X_valid, items)
    count = 0
    for i in range(len(X_valid[target].values)):
        if abs(X_valid[target].values[i] - check[i]) > 0.9:
            print(X_valid[target].values[i], check[i])
            if count > 100:
                break
            print_debug_data(out, X_valid, features, i, check[i], X_valid[target].values[i])
            count += 1
    print('Count critical: {} from {}'.format(count, len(check)))
    out.close()
File: Step11_test_ensemble_TopN.py
Project: resume_job_matching
Author: lyoshiwo
def get_ensemble_score(name):
    if os.path.exists(util.features_prefix + name + "_XXXYYY.pkl") is False:
        print 'file does not exist'
        exit()
    [X_train, X_validate, X_test, y_train, y_validate, y_test] = pd.read_pickle(
        util.features_prefix + name + '_XXXYYY.pkl')
    import xgboost as xgb
    rf_clf_2 = pd.read_pickle(util.models_prefix + name + '_rf.pkl')
    list_all = []
    rf_2_list = rf_clf_2.predict_proba(X_test)
    from sklearn.feature_selection import SelectFromModel
    list_all.append(rf_2_list)
    xgb_2 = xgb.Booster({'nthread': 4})  # init model
    xgb_2.load_model(util.models_prefix + name + '_xgb_prob.pkl')  # load model
    dtest = xgb.DMatrix(X_test)
    xgb_2_test = xgb_2.predict(dtest)
    list_all.append(xgb_2_test)
    # list_all.append(xgb_1_test)
    import copy
    [train_X, train_Y] = pd.read_pickle(util.features_prefix + name + '_XY.pkl')
    X_semantic = np.array(copy.deepcopy(X_test[:, range(95, 475)]))
    X_manual = np.array(copy.deepcopy(X_test[:, range(0, 95)]))
    X_cluster = np.array(copy.deepcopy(X_test[:, range(475, 545)]))
    X_document = np.array(copy.deepcopy(X_test[:, range(545, 547)]))
    X_document[:, [0]] = X_document[:, [0]] + train_X[:, [-1]].max()
    X_semantic = X_semantic.reshape(X_semantic.shape[0], 10, -1)
    X_semantic_1 = np.zeros((X_semantic.shape[0], X_semantic.shape[2], X_semantic.shape[1]))
    for i in range(int(X_semantic.shape[0])):
        X_semantic_1[i] = np.transpose(X_semantic[i])
    json_string = pd.read_pickle(util.models_prefix + name + '_json_string_cnn.pkl')
    model_cnn = model_from_json(json_string)
    model_cnn.load_weights(util.models_prefix + name + '_nn_weight_cnn.h5')
    cnn_list = model_cnn.predict_proba([X_document, X_cluster, X_manual, X_semantic_1])
    # cnn_list_prob = model_cnn.predict_proba([X_document, X_cluster, X_manual, X_semantic_1])
    kk = list(cnn_list)
    list_all.append(kk)
    json_string = pd.read_pickle(util.models_prefix + name + '_json_string_lstm.pkl')
    model_lstm = model_from_json(json_string)
    model_lstm.load_weights(util.models_prefix + name + '_nn_weight_lstm.h5')
    lstm_list = model_lstm.predict_proba([X_document, X_cluster, X_manual, X_semantic_1])
    # lstm_list_prob = model_lstm.predict_proba([X_document, X_cluster, X_manual, X_semantic_1])
    kk = list(lstm_list)
    list_all.append(kk)
    temp_list = []
    for i in range(len(y_test)):
        temp = np.zeros(len(list_all[0][0]))
        for z in list_all:
            temp += np.array(z[i])
        temp_list.append(temp)
    evaluate_k_recall(1, y_test, temp_list)
    print '**************************'
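The final loop simply sums the per-model probability vectors for each test example before computing top-1 recall. With NumPy the same ensemble scores can be built in one call, assuming every entry of list_all converts to an array of the same shape (a sketch, equivalent to the loop above):

import numpy as np

# Sum the class-probability matrices from all models; rows stay aligned with y_test.
temp_list = list(np.sum([np.asarray(z, dtype=float) for z in list_all], axis=0))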