python类cv()的实例源码

predict_2017_07_06_1.py 文件源码 项目:mlbootcamp_5 作者: ivan-filonov 项目源码 文件源码 阅读 21 收藏 0 点赞 0 评论 0
def xgb_common(train2, y, test2, v, z, N_seeds, N_splits, cname, xgb_params):
    """Train XGBoost over N_seeds seeds x N_splits stratified folds.

    For each seed s, writes out-of-fold predictions into v[cname + str(s)]
    and fold-averaged test-set predictions into z[cname + str(s)], both
    passed through pconvert. Prints per-fold log-loss and overall CV stats.
    Mutates xgb_params['seed'] in place.
    """
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        cname2 = cname + str(s)
        v[cname2], z[cname2] = 0, 0
        xgb_params['seed'] = s + 4242
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            # .iloc: fold indices are positional; the old .ix indexer was
            # removed in modern pandas.
            dtrain = xgb.DMatrix(train2.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch, early_stopping_rounds=100, verbose_eval=False)

            p = clf.predict(dvalid)
            # NOTE(review): assumes v has a default RangeIndex so positional
            # fold indices align with labels — TODO confirm at the caller.
            v.loc[ival, cname2] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname2] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: '%(xgb_params['seed'], n+1, skf.n_splits), score, now())
            scores.append(score)
        z[cname2] /= N_splits

    vloss = [metrics.log_loss(y, prestore(v[cname + str(i)])) for i in range(N_seeds)]
    print('validation loss: ', vloss, np.mean(vloss), np.std(vloss))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
predict_2017_07_06_1.py 文件源码 项目:mlbootcamp_5 作者: ivan-filonov 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def keras_common(train3, y, test3, v, z, num_splits, cname, build_model, seed = 1234, batch_size = 128):
    """Cross-validated Keras training helper.

    Accumulates out-of-fold predictions in v[cname] and fold-averaged
    test-set predictions in z[cname] (both passed through pconvert),
    printing the per-fold log-loss and final CV statistics.
    """
    v[cname], z[cname] = 0, 0
    np.random.seed(seed)
    # Print the architecture once before the fold loop starts.
    build_model().summary(line_length=120)
    model_path = '../data/working/' + cname + '_keras_model.h5'
    splitter = model_selection.ShuffleSplit(n_splits=num_splits, random_state=11, test_size=1/num_splits)
    fold_scores = []
    for fold, (trn_idx, val_idx) in enumerate(splitter.split(train3, y)):
        xtrn, xval = train3[trn_idx], train3[val_idx]
        ytrn, yval = y[trn_idx], y[val_idx]
        net = build_model()
        net.fit(xtrn, ytrn,
                batch_size=batch_size,
                epochs=10000,
                validation_data=(xval, yval),
                verbose=0,
                callbacks=build_keras_fit_callbacks(model_path),
                shuffle=True)
        # Reload the checkpointed weights before predicting.
        net.load_weights(model_path)
        p = net.predict(xval)
        v.loc[val_idx, cname] += pconvert(p).ravel()
        score = metrics.log_loss(y[val_idx], p)
        print(cname, 'fold %d: '%(fold+1), score, now())
        fold_scores.append(score)
        z[cname] += pconvert(net.predict(test3)).ravel()
        del net
        # Encourage release of backend resources between folds.
        for gen in range(3):
            gc.collect(gen)
    os.remove(model_path)

    cv = np.array(fold_scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits
predict_2017_07_04_4.py 文件源码 项目:mlbootcamp_5 作者: ivan-filonov 项目源码 文件源码 阅读 16 收藏 0 点赞 0 评论 0
def restore_missing(df, N_splits = 10):
    """Impute missing (-1) values in df in place.

    'active' is filled with 1 and 'alco' with 0; 'smoke' is predicted by
    an XGBoost binary classifier whose boosting-round count is chosen by
    N_splits-fold cross-validation on the rows where 'smoke' is known.
    """
    xgb_params = dict(
            max_depth = 5,
            learning_rate = 0.005,
            gamma = 1,
            alpha = 0.01,
            objective = 'binary:logistic',
            eval_metric = 'logloss',
            seed = 1,
            silent = 1
        )
    #{'gamma': 0.0, 'seed': 1, 'eval_metric': 'logloss', 'objective': 'binary:logistic', 'subsample': 0.6, 'min_child_weight': 1, 'colsample_bytree': 0.9, 'silent': 1, 'n_estimators': 10000, 'reg_alpha': 0.05, 'learning_rate': 0.005, 'max_depth': 2}
    # .loc for boolean-mask assignment; .ix was removed in modern pandas.
    df.loc[df.active == -1, 'active'] = 1
    df.loc[df.alco == -1, 'alco'] = 0

    label = 'smoke'
    print('before', label, '{{{', df[label].value_counts(), '}}}')
    xtrain = df[df[label] > -1].copy()
    ytrain = xtrain[label].astype('int32').values
    xtrain = xtrain.drop(label, axis=1)

    xpred = df[df[label] == -1].copy()
    ypred = xpred[label] * 0
    xpred = xpred.drop(label, axis=1)

    dpred = xgb.DMatrix(xpred)
    dtrain = xgb.DMatrix(xtrain, label=ytrain)

    # nfold=N_splits: previously hard-coded to 10, which silently ignored
    # the N_splits parameter.
    cv = xgb.cv(params=xgb_params,
                dtrain=dtrain,
                num_boost_round=10000,
                early_stopping_rounds=100,
                nfold=N_splits,
                metrics='error',
                stratified=True)
    print(label, 'num_boost_rounds =', len(cv))
    bst = xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=len(cv))
    ypred += bst.predict(dpred)
    df.loc[df[label] == -1, label] = (ypred > 0.5) * 1
    print('restored', label, '{{{', df[label].value_counts(), '}}}')
predict_2017_07_04_4.py 文件源码 项目:mlbootcamp_5 作者: ivan-filonov 项目源码 文件源码 阅读 18 收藏 0 点赞 0 评论 0
def xgb_common(train2, y, test2, v, z, N_seeds, N_splits, cname, xgb_params):
    """Train XGBoost over N_seeds seeds x N_splits stratified folds.

    Writes per-seed out-of-fold predictions into v[cname + str(s)] and
    fold-averaged test predictions into z[cname + str(s)]; additionally
    averages the per-seed columns into v[cname] and z[cname]. Prints
    per-fold log-loss and overall CV statistics. Mutates
    xgb_params['seed'] in place.
    """
    v[cname], z[cname] = 0, 0
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        cname2 = cname + str(s)
        v[cname2], z[cname2] = 0, 0
        xgb_params['seed'] = s + 4242
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            # .iloc: fold indices are positional; the old .ix indexer was
            # removed in modern pandas.
            dtrain = xgb.DMatrix(train2.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch, early_stopping_rounds=100, verbose_eval=False)

            p = clf.predict(dvalid)
            # NOTE(review): assumes v has a default RangeIndex so positional
            # fold indices align with labels — TODO confirm at the caller.
            v.loc[ival, cname2] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname2] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: '%(xgb_params['seed'], n+1, skf.n_splits), score, now())
            scores.append(score)
        z[cname2] /= N_splits
        # Fold each seed's column into the seed-averaged aggregate.
        z[cname] += z[cname2] / N_seeds
        v[cname] += v[cname2] / N_seeds

    vloss = [metrics.log_loss(y, prestore(v[cname + str(i)])) for i in range(N_seeds)]
    print('validation loss: ', vloss, np.mean(vloss), np.std(vloss))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
predict_2017_07_05_2.py 文件源码 项目:mlbootcamp_5 作者: ivan-filonov 项目源码 文件源码 阅读 20 收藏 0 点赞 0 评论 0
def xgb_common(train2, y, test2, v, z, N_seeds, N_splits, cname, xgb_params):
    """Train XGBoost over N_seeds seeds x N_splits stratified folds.

    For each seed s, writes out-of-fold predictions into v[cname + str(s)]
    and fold-averaged test-set predictions into z[cname + str(s)], both
    passed through pconvert. Prints per-fold log-loss and overall CV stats.
    Mutates xgb_params['seed'] in place.
    """
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        cname2 = cname + str(s)
        v[cname2], z[cname2] = 0, 0
        xgb_params['seed'] = s + 4242
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            # .iloc: fold indices are positional; the old .ix indexer was
            # removed in modern pandas.
            dtrain = xgb.DMatrix(train2.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch, early_stopping_rounds=100, verbose_eval=False)

            p = clf.predict(dvalid)
            # NOTE(review): assumes v has a default RangeIndex so positional
            # fold indices align with labels — TODO confirm at the caller.
            v.loc[ival, cname2] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname2] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: '%(xgb_params['seed'], n+1, skf.n_splits), score, now())
            scores.append(score)
        z[cname2] /= N_splits

    vloss = [metrics.log_loss(y, prestore(v[cname + str(i)])) for i in range(N_seeds)]
    print('validation loss: ', vloss, np.mean(vloss), np.std(vloss))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
predict_2017_07_05_2.py 文件源码 项目:mlbootcamp_5 作者: ivan-filonov 项目源码 文件源码 阅读 26 收藏 0 点赞 0 评论 0
def keras_common(train3, y, test3, v, z, num_splits, cname, build_model, seed = 1234, batch_size = 128):
    """Cross-validated Keras training helper.

    Accumulates out-of-fold predictions in v[cname] and fold-averaged
    test-set predictions in z[cname] (both passed through pconvert),
    printing the per-fold log-loss and final CV statistics.
    """
    v[cname], z[cname] = 0, 0
    np.random.seed(seed)
    # Print the architecture once before the fold loop starts.
    build_model().summary(line_length=120)
    model_path = '../data/working/' + cname + '_keras_model.h5'
    splitter = model_selection.ShuffleSplit(n_splits=num_splits, random_state=11, test_size=1/num_splits)
    fold_scores = []
    for fold, (trn_idx, val_idx) in enumerate(splitter.split(train3, y)):
        xtrn, xval = train3[trn_idx], train3[val_idx]
        ytrn, yval = y[trn_idx], y[val_idx]
        net = build_model()
        net.fit(xtrn, ytrn,
                batch_size=batch_size,
                epochs=10000,
                validation_data=(xval, yval),
                verbose=0,
                callbacks=build_keras_fit_callbacks(model_path),
                shuffle=True)
        # Reload the checkpointed weights before predicting.
        net.load_weights(model_path)
        p = net.predict(xval)
        v.loc[val_idx, cname] += pconvert(p).ravel()
        score = metrics.log_loss(y[val_idx], p)
        print(cname, 'fold %d: '%(fold+1), score, now())
        fold_scores.append(score)
        z[cname] += pconvert(net.predict(test3)).ravel()
        del net
        # Encourage release of backend resources between folds.
        for gen in range(3):
            gc.collect(gen)
    os.remove(model_path)

    cv = np.array(fold_scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits
predict_2017_07_06_2.py 文件源码 项目:mlbootcamp_5 作者: ivan-filonov 项目源码 文件源码 阅读 27 收藏 0 点赞 0 评论 0
def restore_missing(df, N_splits = 10):
    """Impute missing (-1) values in df in place.

    'active' is filled with 1 and 'alco' with 0; 'smoke' is predicted by
    an XGBoost binary classifier whose boosting-round count is chosen by
    N_splits-fold cross-validation on the rows where 'smoke' is known.
    """
    xgb_params = dict(
            max_depth = 5,
            learning_rate = 0.005,
            gamma = 1,
            alpha = 0.01,
            objective = 'binary:logistic',
            eval_metric = 'logloss',
            seed = 1,
            silent = 1
        )
    #{'gamma': 0.0, 'seed': 1, 'eval_metric': 'logloss', 'objective': 'binary:logistic', 'subsample': 0.6, 'min_child_weight': 1, 'colsample_bytree': 0.9, 'silent': 1, 'n_estimators': 10000, 'reg_alpha': 0.05, 'learning_rate': 0.005, 'max_depth': 2}
    # .loc for boolean-mask assignment; .ix was removed in modern pandas.
    df.loc[df.active == -1, 'active'] = 1
    df.loc[df.alco == -1, 'alco'] = 0

    label = 'smoke'
    print('before', label, '{{{', df[label].value_counts(), '}}}')
    xtrain = df[df[label] > -1].copy()
    ytrain = xtrain[label].astype('int32').values
    xtrain = xtrain.drop(label, axis=1)

    xpred = df[df[label] == -1].copy()
    ypred = xpred[label] * 0
    xpred = xpred.drop(label, axis=1)

    dpred = xgb.DMatrix(xpred)
    dtrain = xgb.DMatrix(xtrain, label=ytrain)

    # nfold=N_splits: previously hard-coded to 10, which silently ignored
    # the N_splits parameter.
    cv = xgb.cv(params=xgb_params,
                dtrain=dtrain,
                num_boost_round=10000,
                early_stopping_rounds=100,
                nfold=N_splits,
                metrics='error',
                stratified=True)
    print(label, 'num_boost_rounds =', len(cv))
    bst = xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=len(cv))
    ypred += bst.predict(dpred)
    df.loc[df[label] == -1, label] = (ypred > 0.5) * 1
    print('restored', label, '{{{', df[label].value_counts(), '}}}')
predict_2017_07_06_2.py 文件源码 项目:mlbootcamp_5 作者: ivan-filonov 项目源码 文件源码 阅读 28 收藏 0 点赞 0 评论 0
def xgb_common(train2, y, test2, v, z, N_seeds, N_splits, cname, xgb_params):
    """Train XGBoost over N_seeds seeds x N_splits stratified folds.

    For each seed s, writes out-of-fold predictions into v[cname + str(s)]
    and fold-averaged test-set predictions into z[cname + str(s)], both
    passed through pconvert. Prints per-fold log-loss and overall CV stats.
    Mutates xgb_params['seed'] in place.
    """
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        cname2 = cname + str(s)
        v[cname2], z[cname2] = 0, 0
        xgb_params['seed'] = s + 4242
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            # .iloc: fold indices are positional; the old .ix indexer was
            # removed in modern pandas.
            dtrain = xgb.DMatrix(train2.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch, early_stopping_rounds=100, verbose_eval=False)

            p = clf.predict(dvalid)
            # NOTE(review): assumes v has a default RangeIndex so positional
            # fold indices align with labels — TODO confirm at the caller.
            v.loc[ival, cname2] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname2] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: '%(xgb_params['seed'], n+1, skf.n_splits), score, now())
            scores.append(score)
        z[cname2] /= N_splits

    vloss = [metrics.log_loss(y, prestore(v[cname + str(i)])) for i in range(N_seeds)]
    print('validation loss: ', vloss, np.mean(vloss), np.std(vloss))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
predict_2017_07_06_2.py 文件源码 项目:mlbootcamp_5 作者: ivan-filonov 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def keras_common(train3, y, test3, v, z, num_splits, cname, build_model, seed = 1234, batch_size = 128):
    """Cross-validated Keras training helper.

    Accumulates out-of-fold predictions in v[cname] and fold-averaged
    test-set predictions in z[cname] (both passed through pconvert),
    printing the per-fold log-loss and final CV statistics.
    """
    v[cname], z[cname] = 0, 0
    np.random.seed(seed)
    # Print the architecture once before the fold loop starts.
    build_model().summary(line_length=120)
    model_path = '../data/working/' + cname + '_keras_model.h5'
    splitter = model_selection.ShuffleSplit(n_splits=num_splits, random_state=11, test_size=1/num_splits)
    fold_scores = []
    for fold, (trn_idx, val_idx) in enumerate(splitter.split(train3, y)):
        xtrn, xval = train3[trn_idx], train3[val_idx]
        ytrn, yval = y[trn_idx], y[val_idx]
        net = build_model()
        net.fit(xtrn, ytrn,
                batch_size=batch_size,
                epochs=10000,
                validation_data=(xval, yval),
                verbose=0,
                callbacks=build_keras_fit_callbacks(model_path),
                shuffle=True)
        # Reload the checkpointed weights before predicting.
        net.load_weights(model_path)
        p = net.predict(xval)
        v.loc[val_idx, cname] += pconvert(p).ravel()
        score = metrics.log_loss(y[val_idx], p)
        print(cname, 'fold %d: '%(fold+1), score, now())
        fold_scores.append(score)
        z[cname] += pconvert(net.predict(test3)).ravel()
        del net
        # Encourage release of backend resources between folds.
        for gen in range(3):
            gc.collect(gen)
    os.remove(model_path)

    cv = np.array(fold_scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits
predict_2017_07_02_5.py 文件源码 项目:mlbootcamp_5 作者: ivan-filonov 项目源码 文件源码 阅读 13 收藏 0 点赞 0 评论 0
def xgb1(train2, y, test2, v, z):
    """XGBoost model 1: 4 seeds x 9 stratified folds.

    Accumulates pconvert-ed out-of-fold predictions in v[cname] and
    averaged test predictions in z[cname]; prints per-fold log-loss and
    final validation loss / CV statistics.
    """
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    N_splits = 9
    N_seeds = 4
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    xgb_params = dict(
            max_depth = 5,
            learning_rate = 0.02,
            alpha = 0.01,
            objective = 'binary:logistic',
            eval_metric = 'logloss',
            seed = 1,
            silent = 1
        )
    # Hoisted out of the fold loop: the test DMatrix never changes
    # (previously rebuilt every fold).
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + 4242
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            # .iloc: fold indices are positional; the old .ix indexer was
            # removed in modern pandas.
            dtrain = xgb.DMatrix(train2.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch, early_stopping_rounds=100, verbose_eval=False)

            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: '%(xgb_params['seed'], n+1, skf.n_splits), score, now())
            scores.append(score)

    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
    print('validation loss: ', metrics.log_loss(y, prestore(v[cname])))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
predict_2017_07_02_5.py 文件源码 项目:mlbootcamp_5 作者: ivan-filonov 项目源码 文件源码 阅读 13 收藏 0 点赞 0 评论 0
def xgb2(train2, y, test2, v, z):
    """XGBoost model 2 (subsampled trees): 4 seeds x 9 stratified folds.

    Accumulates pconvert-ed out-of-fold predictions in v[cname] and
    averaged test predictions in z[cname]; prints per-fold log-loss and
    final validation loss / CV statistics.
    """
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    N_splits = 9
    N_seeds = 4
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    xgb_params = dict(
            max_depth = 4,
            learning_rate = 0.02,
            subsample = 0.7,
            alpha = 0.015,
            #colsample_bytree = 0.8,
            objective = 'binary:logistic',
            eval_metric = 'logloss',
            seed = 1,
            silent = 1
        )
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + 4242
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            # .iloc: fold indices are positional; the old .ix indexer was
            # removed in modern pandas.
            dtrain = xgb.DMatrix(train2.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch, early_stopping_rounds=100, verbose_eval=False)

            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: '%(xgb_params['seed'], n+1, skf.n_splits), score, now())
            scores.append(score)

    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
    print('validation loss: ', metrics.log_loss(y, prestore(v[cname])))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
predict_2017_07_02_5.py 文件源码 项目:mlbootcamp_5 作者: ivan-filonov 项目源码 文件源码 阅读 12 收藏 0 点赞 0 评论 0
def xgb3(train2, y, test2, v, z):
    """XGBoost model 3 (row and column subsampling): 4 seeds x 9 folds.

    Accumulates pconvert-ed out-of-fold predictions in v[cname] and
    averaged test predictions in z[cname]; prints per-fold log-loss and
    final validation loss / CV statistics.
    """
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    N_splits = 9
    N_seeds = 4
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    xgb_params = dict(
            max_depth = 4,
            learning_rate = 0.02,
            subsample = 0.8,
            colsample_bytree = 0.8,
            objective = 'binary:logistic',
            eval_metric = 'logloss',
            seed = 1,
            silent = 1
        )
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + 4242
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            # .iloc: fold indices are positional; the old .ix indexer was
            # removed in modern pandas.
            dtrain = xgb.DMatrix(train2.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch, early_stopping_rounds=100, verbose_eval=False)

            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: '%(xgb_params['seed'], n+1, skf.n_splits), score, now())
            scores.append(score)

    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
    print('validation loss: ', metrics.log_loss(y, prestore(v[cname])))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
predict_2017_07_05_5.py 文件源码 项目:mlbootcamp_5 作者: ivan-filonov 项目源码 文件源码 阅读 20 收藏 0 点赞 0 评论 0
def restore_missing(df, N_splits = 10):
    """Impute missing (-1) values in df in place.

    'active' is filled with 1 and 'alco' with 0; 'smoke' is predicted by
    an XGBoost binary classifier whose boosting-round count is chosen by
    N_splits-fold cross-validation on the rows where 'smoke' is known.
    """
    xgb_params = dict(
            max_depth = 5,
            learning_rate = 0.005,
            gamma = 1,
            alpha = 0.01,
            objective = 'binary:logistic',
            eval_metric = 'logloss',
            seed = 1,
            silent = 1
        )
    #{'gamma': 0.0, 'seed': 1, 'eval_metric': 'logloss', 'objective': 'binary:logistic', 'subsample': 0.6, 'min_child_weight': 1, 'colsample_bytree': 0.9, 'silent': 1, 'n_estimators': 10000, 'reg_alpha': 0.05, 'learning_rate': 0.005, 'max_depth': 2}
    # .loc for boolean-mask assignment; .ix was removed in modern pandas.
    df.loc[df.active == -1, 'active'] = 1
    df.loc[df.alco == -1, 'alco'] = 0

    label = 'smoke'
    print('before', label, '{{{', df[label].value_counts(), '}}}')
    xtrain = df[df[label] > -1].copy()
    ytrain = xtrain[label].astype('int32').values
    xtrain = xtrain.drop(label, axis=1)

    xpred = df[df[label] == -1].copy()
    ypred = xpred[label] * 0
    xpred = xpred.drop(label, axis=1)

    dpred = xgb.DMatrix(xpred)
    dtrain = xgb.DMatrix(xtrain, label=ytrain)

    # nfold=N_splits: previously hard-coded to 10, which silently ignored
    # the N_splits parameter.
    cv = xgb.cv(params=xgb_params,
                dtrain=dtrain,
                num_boost_round=10000,
                early_stopping_rounds=100,
                nfold=N_splits,
                metrics='error',
                stratified=True)
    print(label, 'num_boost_rounds =', len(cv))
    bst = xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=len(cv))
    ypred += bst.predict(dpred)
    df.loc[df[label] == -1, label] = (ypred > 0.5) * 1
    print('restored', label, '{{{', df[label].value_counts(), '}}}')
predict_2017_07_05_5.py 文件源码 项目:mlbootcamp_5 作者: ivan-filonov 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def xgb_common(train2, y, test2, v, z, N_seeds, N_splits, cname, xgb_params):
    """Train XGBoost over N_seeds seeds x N_splits stratified folds.

    For each seed s, writes out-of-fold predictions into v[cname + str(s)]
    and fold-averaged test-set predictions into z[cname + str(s)], both
    passed through pconvert. Prints per-fold log-loss and overall CV stats.
    Mutates xgb_params['seed'] in place.
    """
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        cname2 = cname + str(s)
        v[cname2], z[cname2] = 0, 0
        xgb_params['seed'] = s + 4242
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            # .iloc: fold indices are positional; the old .ix indexer was
            # removed in modern pandas.
            dtrain = xgb.DMatrix(train2.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch, early_stopping_rounds=100, verbose_eval=False)

            p = clf.predict(dvalid)
            # NOTE(review): assumes v has a default RangeIndex so positional
            # fold indices align with labels — TODO confirm at the caller.
            v.loc[ival, cname2] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname2] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: '%(xgb_params['seed'], n+1, skf.n_splits), score, now())
            scores.append(score)
        z[cname2] /= N_splits

    vloss = [metrics.log_loss(y, prestore(v[cname + str(i)])) for i in range(N_seeds)]
    print('validation loss: ', vloss, np.mean(vloss), np.std(vloss))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
predict_2017_07_06_3.py 文件源码 项目:mlbootcamp_5 作者: ivan-filonov 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def restore_missing(df, N_splits = 10):
    """Impute missing (-1) values in df in place.

    'active' is filled with 1 and 'alco' with 0; 'smoke' is predicted by
    an XGBoost binary classifier whose boosting-round count is chosen by
    N_splits-fold cross-validation on the rows where 'smoke' is known.
    """
    xgb_params = dict(
            max_depth = 5,
            learning_rate = 0.005,
            gamma = 1,
            alpha = 0.01,
            objective = 'binary:logistic',
            eval_metric = 'logloss',
            seed = 1,
            silent = 1
        )
    #{'gamma': 0.0, 'seed': 1, 'eval_metric': 'logloss', 'objective': 'binary:logistic', 'subsample': 0.6, 'min_child_weight': 1, 'colsample_bytree': 0.9, 'silent': 1, 'n_estimators': 10000, 'reg_alpha': 0.05, 'learning_rate': 0.005, 'max_depth': 2}
    # .loc for boolean-mask assignment; .ix was removed in modern pandas.
    df.loc[df.active == -1, 'active'] = 1
    df.loc[df.alco == -1, 'alco'] = 0

    label = 'smoke'
    print('before', label, '{{{', df[label].value_counts(), '}}}')
    xtrain = df[df[label] > -1].copy()
    ytrain = xtrain[label].astype('int32').values
    xtrain = xtrain.drop(label, axis=1)

    xpred = df[df[label] == -1].copy()
    ypred = xpred[label] * 0
    xpred = xpred.drop(label, axis=1)

    dpred = xgb.DMatrix(xpred)
    dtrain = xgb.DMatrix(xtrain, label=ytrain)

    # nfold=N_splits: previously hard-coded to 10, which silently ignored
    # the N_splits parameter.
    cv = xgb.cv(params=xgb_params,
                dtrain=dtrain,
                num_boost_round=10000,
                early_stopping_rounds=100,
                nfold=N_splits,
                metrics='error',
                stratified=True)
    print(label, 'num_boost_rounds =', len(cv))
    bst = xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=len(cv))
    ypred += bst.predict(dpred)
    df.loc[df[label] == -1, label] = (ypred > 0.5) * 1
    print('restored', label, '{{{', df[label].value_counts(), '}}}')
predict_2017_07_06_3.py 文件源码 项目:mlbootcamp_5 作者: ivan-filonov 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def xgb_common(train2, y, test2, v, z, N_seeds, N_splits, cname, xgb_params):
    """Train XGBoost over N_seeds seeds x N_splits stratified folds.

    For each seed s, writes out-of-fold predictions into v[cname + str(s)]
    and fold-averaged test-set predictions into z[cname + str(s)], both
    passed through pconvert. Prints per-fold log-loss and overall CV stats.
    Mutates xgb_params['seed'] in place.
    """
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        cname2 = cname + str(s)
        v[cname2], z[cname2] = 0, 0
        xgb_params['seed'] = s + 4242
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            # .iloc: fold indices are positional; the old .ix indexer was
            # removed in modern pandas.
            dtrain = xgb.DMatrix(train2.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch, early_stopping_rounds=100, verbose_eval=False)

            p = clf.predict(dvalid)
            # NOTE(review): assumes v has a default RangeIndex so positional
            # fold indices align with labels — TODO confirm at the caller.
            v.loc[ival, cname2] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname2] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: '%(xgb_params['seed'], n+1, skf.n_splits), score, now())
            scores.append(score)
        z[cname2] /= N_splits

    vloss = [metrics.log_loss(y, prestore(v[cname + str(i)])) for i in range(N_seeds)]
    print('validation loss: ', vloss, np.mean(vloss), np.std(vloss))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
predict_2017_07_06_3.py 文件源码 项目:mlbootcamp_5 作者: ivan-filonov 项目源码 文件源码 阅读 28 收藏 0 点赞 0 评论 0
def xgb2(train2, y, test2, v, z):
    """Tune XGBoost hyper-parameters with hyperopt TPE, then fit.

    Resumes previously saved trials if available, runs 5 more TPE
    evaluations (persisting state after each), and finally calls
    xgb_common with the best parameter set found.
    """
    cname = sys._getframe().f_code.co_name
    N_splits = 9
    N_seeds = 4
    from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
    dtrain = xgb.DMatrix(train2, y)
    def step_xgb(params):
        # One hyperopt evaluation: 10-fold CV score for this candidate.
        cv = xgb.cv(params=params,
                    dtrain=dtrain,
                    num_boost_round=10000,
                    early_stopping_rounds=100,
                    nfold=10,
                    seed=params['seed'])
        # .iloc[-1, 0]: last boosting round, first metric column
        # (.ix is removed in modern pandas).
        score = cv.iloc[-1, 0]
        print(cname, score, len(cv), params)
        return dict(loss=score, status=STATUS_OK)
    space_xgb = dict(
            max_depth = hp.choice('max_depth', range(2, 8)),
            subsample = hp.quniform('subsample', 0.6, 1, 0.05),
            colsample_bytree = hp.quniform('colsample_bytree', 0.6, 1, 0.05),
            learning_rate = hp.quniform('learning_rate', 0.005, 0.03, 0.005),
            min_child_weight = hp.quniform('min_child_weight', 1, 6, 1),
            gamma = hp.quniform('gamma', 0.5, 10, 0.05),

            objective = 'binary:logistic',
            eval_metric = 'logloss',
            seed = 1,
            silent = 1
        )
    trs = load_state(cname + '_trials')
    # 'is None': identity check for the missing-state sentinel, not '=='.
    if trs is None:
        tr = Trials()
    else:
        tr, _ = trs
    if len(tr.trials) > 0: print('reusing %d trials, best was:'%(len(tr.trials)), space_eval(space_xgb, tr.argmin))
    for n in range(5):
        best = fmin(step_xgb, space_xgb, algo=tpe.suggest, max_evals=len(tr.trials) + 1, trials = tr)
        save_state(cname + '_trials', (tr, space_xgb))
    xgb_params = space_eval(space_xgb, best)
    print(xgb_params)
    xgb_common(train2, y, test2, v, z, N_seeds, N_splits, cname, xgb_params)
predict_2017_07_06_3.py 文件源码 项目:mlbootcamp_5 作者: ivan-filonov 项目源码 文件源码 阅读 26 收藏 0 点赞 0 评论 0
def keras_common(train3, y, test3, v, z, num_splits, cname, build_model, seed = 1234, batch_size = 128):
    """Cross-validated Keras training helper.

    Accumulates out-of-fold predictions in v[cname] and fold-averaged
    test-set predictions in z[cname] (both passed through pconvert),
    printing the per-fold log-loss and final CV statistics.
    """
    v[cname], z[cname] = 0, 0
    np.random.seed(seed)
    # Print the architecture once before the fold loop starts.
    build_model().summary(line_length=120)
    model_path = '../data/working/' + cname + '_keras_model.h5'
    splitter = model_selection.ShuffleSplit(n_splits=num_splits, random_state=11, test_size=1/num_splits)
    fold_scores = []
    for fold, (trn_idx, val_idx) in enumerate(splitter.split(train3, y)):
        xtrn, xval = train3[trn_idx], train3[val_idx]
        ytrn, yval = y[trn_idx], y[val_idx]
        net = build_model()
        net.fit(xtrn, ytrn,
                batch_size=batch_size,
                epochs=10000,
                validation_data=(xval, yval),
                verbose=0,
                callbacks=build_keras_fit_callbacks(model_path),
                shuffle=True)
        # Reload the checkpointed weights before predicting.
        net.load_weights(model_path)
        p = net.predict(xval)
        v.loc[val_idx, cname] += pconvert(p).ravel()
        score = metrics.log_loss(y[val_idx], p)
        print(cname, 'fold %d: '%(fold+1), score, now())
        fold_scores.append(score)
        z[cname] += pconvert(net.predict(test3)).ravel()
        del net
        # Encourage release of backend resources between folds.
        for gen in range(3):
            gc.collect(gen)
    os.remove(model_path)

    cv = np.array(fold_scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits
predict_2017_07_06_4.py 文件源码 项目:mlbootcamp_5 作者: ivan-filonov 项目源码 文件源码 阅读 28 收藏 0 点赞 0 评论 0
def restore_missing(df, N_splits = 10):
    """Impute missing (-1) values in df in place.

    'active' is filled with 1 and 'alco' with 0; 'smoke' is predicted by
    an XGBoost binary classifier whose boosting-round count is chosen by
    N_splits-fold cross-validation on the rows where 'smoke' is known.
    """
    xgb_params = dict(
            max_depth = 5,
            learning_rate = 0.005,
            gamma = 1,
            alpha = 0.01,
            objective = 'binary:logistic',
            eval_metric = 'logloss',
            seed = 1,
            silent = 1
        )
    #{'gamma': 0.0, 'seed': 1, 'eval_metric': 'logloss', 'objective': 'binary:logistic', 'subsample': 0.6, 'min_child_weight': 1, 'colsample_bytree': 0.9, 'silent': 1, 'n_estimators': 10000, 'reg_alpha': 0.05, 'learning_rate': 0.005, 'max_depth': 2}
    # .loc for boolean-mask assignment; .ix was removed in modern pandas.
    df.loc[df.active == -1, 'active'] = 1
    df.loc[df.alco == -1, 'alco'] = 0

    label = 'smoke'
    print('before', label, '{{{', df[label].value_counts(), '}}}')
    xtrain = df[df[label] > -1].copy()
    ytrain = xtrain[label].astype('int32').values
    xtrain = xtrain.drop(label, axis=1)

    xpred = df[df[label] == -1].copy()
    ypred = xpred[label] * 0
    xpred = xpred.drop(label, axis=1)

    dpred = xgb.DMatrix(xpred)
    dtrain = xgb.DMatrix(xtrain, label=ytrain)

    # nfold=N_splits: previously hard-coded to 10, which silently ignored
    # the N_splits parameter.
    cv = xgb.cv(params=xgb_params,
                dtrain=dtrain,
                num_boost_round=10000,
                early_stopping_rounds=100,
                nfold=N_splits,
                metrics='error',
                stratified=True)
    print(label, 'num_boost_rounds =', len(cv))
    bst = xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=len(cv))
    ypred += bst.predict(dpred)
    df.loc[df[label] == -1, label] = (ypred > 0.5) * 1
    print('restored', label, '{{{', df[label].value_counts(), '}}}')
predict_2017_07_06_4.py 文件源码 项目:mlbootcamp_5 作者: ivan-filonov 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def xgb_common(train2, y, test2, v, z, N_seeds, N_splits, cname, xgb_params):
    """Train XGBoost over N_seeds seeds x N_splits stratified folds.

    For each seed s, writes out-of-fold predictions into v[cname + str(s)]
    and fold-averaged test-set predictions into z[cname + str(s)], both
    passed through pconvert. Prints per-fold log-loss and overall CV stats.
    Mutates xgb_params['seed'] in place.
    """
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        cname2 = cname + str(s)
        v[cname2], z[cname2] = 0, 0
        xgb_params['seed'] = s + 4242
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            # .iloc: fold indices are positional; the old .ix indexer was
            # removed in modern pandas.
            dtrain = xgb.DMatrix(train2.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch, early_stopping_rounds=100, verbose_eval=False)

            p = clf.predict(dvalid)
            # NOTE(review): assumes v has a default RangeIndex so positional
            # fold indices align with labels — TODO confirm at the caller.
            v.loc[ival, cname2] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname2] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: '%(xgb_params['seed'], n+1, skf.n_splits), score, now())
            scores.append(score)
        z[cname2] /= N_splits

    vloss = [metrics.log_loss(y, prestore(v[cname + str(i)])) for i in range(N_seeds)]
    print('validation loss: ', vloss, np.mean(vloss), np.std(vloss))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())


问题


面经


文章

微信
公众号

扫码关注公众号