def get_feature_importance(self, clf, model_name):
    # Map each supported model to the kind of attribute that exposes its importances.
    clfs = {'RandomForestClassifier': 'feature_importances',
            'ExtraTreesClassifier': 'feature_importances',
            'AdaBoostClassifier': 'feature_importances',
            'LogisticRegression': 'coef',
            'svm.SVC': 'coef',
            'GradientBoostingClassifier': 'feature_importances',
            'GaussianNB': None,
            'DecisionTreeClassifier': 'feature_importances',
            'SGDClassifier': 'coef',
            'KNeighborsClassifier': None,
            'linear.SVC': 'coef'}
    if clfs[model_name] == 'feature_importances':
        return list(clf.feature_importances_)
    elif clfs[model_name] == 'coef':
        return clf.coef_.tolist()  # .tolist() already returns a list; no extra list() needed
    else:
        return None
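A minimal usage sketch for the helper above, assuming it is a method of some wrapper class; the `wrapper` instance and the toy data here are hypothetical, not from the original project:

# Hypothetical usage: fit a supported estimator, then look up its importances.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=200, n_features=5, random_state=0)
rf = RandomForestClassifier(n_estimators=50, random_state=0).fit(X, y)
importances = wrapper.get_feature_importance(rf, 'RandomForestClassifier')
print(importances)  # one value per feature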
Python GradientBoostingClassifier() usage examples
def define_model(self, model, parameters, n_cores=0):
    # Note: n_cores is accepted for API compatibility but unused below;
    # the n_jobs values are hard-coded per estimator.
    clfs = {'RandomForestClassifier': RandomForestClassifier(n_estimators=50, n_jobs=7),
            'ExtraTreesClassifier': ExtraTreesClassifier(n_estimators=10, n_jobs=7, criterion='entropy'),
            'AdaBoostClassifier': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                                                     algorithm="SAMME", n_estimators=200),
            'LogisticRegression': LogisticRegression(penalty='l1', C=1e5),
            'svm.SVC': svm.SVC(kernel='linear', probability=True, random_state=0),
            'GradientBoostingClassifier': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5,
                                                                     max_depth=6, n_estimators=10),
            'GaussianNB': GaussianNB(),
            'DecisionTreeClassifier': DecisionTreeClassifier(),
            'SGDClassifier': SGDClassifier(loss="hinge", penalty="l2", n_jobs=7),
            'KNeighborsClassifier': KNeighborsClassifier(n_neighbors=3),
            'linear.SVC': svm.LinearSVC()}
    if model not in clfs:
        raise ConfigError("Unsupported model {}".format(model))
    clf = clfs[model]
    clf.set_params(**parameters)
    return clf
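A hedged usage sketch for define_model: the parameters dict may only contain keyword arguments the chosen estimator accepts, since set_params raises ValueError on unknown keys. The call below is hypothetical:

# Hypothetical call on an instance of the enclosing class:
clf = self.define_model('GradientBoostingClassifier',
                        {'n_estimators': 100, 'learning_rate': 0.1})
clf.fit(X_train, y_train)  # X_train / y_train supplied elsewhere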
Source file: gradient_boosting_blending.py · Project: DataMiningCompetitionFirstPrize · Author: lzddzh
def learn(x, y, test_x):
    # Set one sample weight per training example, chosen by its (string) label.
    weight_list = []
    for j in range(len(y)):
        if y[j] == "0":
            weight_list.append(variables.weight_0_gdbt_b)
        elif y[j] == "1000":
            weight_list.append(variables.weight_1000_gdbt_b)
        elif y[j] == "1500":
            weight_list.append(variables.weight_1500_gdbt_b)
        elif y[j] == "2000":
            weight_list.append(variables.weight_2000_gdbt_b)
    clf = GradientBoostingClassifier(loss='deviance',
                                     n_estimators=variables.n_estimators_gdbt_b,
                                     learning_rate=variables.learning_rate_gdbt_b,
                                     max_depth=variables.max_depth_gdbt_b,
                                     random_state=0,
                                     min_samples_split=variables.min_samples_split_gdbt_b,
                                     min_samples_leaf=variables.min_samples_leaf_gdbt_b,
                                     subsample=variables.subsample_gdbt_b,
                                     ).fit(x, y, sample_weight=weight_list)
    prediction_list = clf.predict(test_x)
    return prediction_list
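A hypothetical invocation of learn; the `variables` module is this project's config and must define all the weight_* and *_gdbt_b hyperparameters referenced above:

# x, y, test_x are assumed to be prepared elsewhere in the project;
# note the labels are compared as strings ("0", "1000", "1500", "2000").
predictions = learn(x, y, test_x)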
def __init__(
        self, data_block, predictors=[], cv_folds=10,
        scoring_metric='accuracy', additional_display_metrics=[]):
    base_classification.__init__(
        self, alg=GradientBoostingClassifier(), data_block=data_block,
        predictors=predictors, cv_folds=cv_folds,
        scoring_metric=scoring_metric,
        additional_display_metrics=additional_display_metrics
    )
    self.model_output = pd.Series(self.default_parameters)
    self.model_output['Feature_Importance'] = "-"
    # Set parameters to default values:
    self.set_parameters(set_default=True)
def createPipeline(self):
    self.pipeline = Pipeline([
        ('model', GradientBoostingClassifier(
            loss=self.conf.loss,
            learning_rate=self.conf.learning_rate,
            n_estimators=self.conf.n_estimators,
            criterion=self.conf.criterion,
            max_depth=self.conf.max_depth,
            min_samples_split=self.conf.min_samples_split,
            min_samples_leaf=self.conf.min_samples_leaf,
            min_weight_fraction_leaf=self.conf.min_weight_fraction_leaf,
            subsample=self.conf.subsample,
            max_features=self.conf.max_features,
            max_leaf_nodes=self.conf.max_leaf_nodes,
            # Keyword fixed: the config value is min_impurity_decrease,
            # so it must be passed to the matching parameter.
            min_impurity_decrease=self.conf.min_impurity_decrease,
            presort=self.conf.presort))])  # presort exists only in sklearn < 0.24
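A minimal sketch of the `conf` object this factory expects, with attribute names inferred from the snippet and purely illustrative values (scikit-learn defaults); presort and min_impurity_decrease coexist only in roughly sklearn 0.19-0.23:

from types import SimpleNamespace

conf = SimpleNamespace(
    loss='deviance', learning_rate=0.1, n_estimators=100,
    criterion='friedman_mse', max_depth=3, min_samples_split=2,
    min_samples_leaf=1, min_weight_fraction_leaf=0.0, subsample=1.0,
    max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0,
    presort='auto')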
def get_classifier(self):
    algo = self.algo
    if algo == "GBT":
        return GradientBoostingClassifier()
    elif algo == "RF":
        return RandomForestClassifier()
    elif algo == "ADB":
        return AdaBoostClassifier()
    elif algo == "DT":
        return DecisionTreeClassifier()
    elif algo == "NB":
        return BernoulliNB()
    elif algo == "SGD":
        return SGDClassifier()
    elif algo == "SVC":
        return LinearSVC()
    elif algo == "MLPC":
        return MLPClassifier(activation='logistic', batch_size='auto',
                             early_stopping=True, hidden_layer_sizes=(100,),
                             learning_rate='adaptive', learning_rate_init=0.1,
                             max_iter=5000, random_state=1, solver='lbfgs',
                             tol=0.0001, validation_fraction=0.1, verbose=False,
                             warm_start=False)
    return None  # unknown algo code; None is clearer than the original `return 0`
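A hedged usage sketch, assuming `self.algo` is set elsewhere (for example from a config file); the driver below is hypothetical:

# Hypothetical driver: pick the algorithm by its short code, then train.
self.algo = "GBT"
clf = self.get_classifier()
clf.fit(X_train, y_train)  # data assumed available in the caller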
Source file: test_boosted_trees_classifier_numeric.py · Project: coremltools · Author: apple
def _train_convert_evaluate(self, **scikit_params):
    """
    Train a scikit-learn model, convert it and then evaluate it with CoreML
    """
    scikit_model = GradientBoostingClassifier(random_state=1, **scikit_params)
    scikit_model.fit(self.X, self.target)
    # Convert the model
    spec = skl_converter.convert(scikit_model, self.feature_names, self.output_name)
    # Get predictions
    df = pd.DataFrame(self.X, columns=self.feature_names)
    df['prediction'] = scikit_model.predict(self.X)
    # Evaluate it
    metrics = evaluate_classifier(spec, df)
    return metrics
def setUpClass(self):
    """
    Set up the unit test by loading the dataset and training a model.
    """
    from sklearn.datasets import load_boston
    import numpy as np

    scikit_data = load_boston()
    scikit_model = GradientBoostingClassifier(random_state=1)
    t = scikit_data.target
    # Bin the continuous target into histogram buckets to obtain class labels.
    target = np.digitize(t, np.histogram(t)[1]) - 1
    scikit_model.fit(scikit_data.data, target)
    self.target = target
    # Save the data and the model
    self.scikit_data = scikit_data
    self.scikit_model = scikit_model
Source file: User_Interface.py · Project: yttresearch-machine-learning-algorithms-analysis · Author: gdemos01
def GradientBoostingDecisionTree_Export(action):
    # Setting our classifier to Gradient Boosting
    clf = GradientBoostingClassifier()
    dir = input('Give Data Directory: ')
    if int(action) == 1:
        print('Loading Data')
        PopularityClassifier.loadData(dir)
        PopularityClassifier.youtubePopular(dir, clf, 2)
        PopularityClassifier.twitterPopular(dir, clf, 2)
        PopularityClassifier.bothPopular(dir, clf, 2)
    elif int(action) == 2:
        print('Loading Data')
        ViralityClassifier.loadData(dir)
        ViralityClassifier.youtubeViral(dir, clf, 2)
        ViralityClassifier.twitterViral(dir, clf, 2)
        ViralityClassifier.bothViral(dir, clf, 2)
    else:
        print('Loading Data')
        ViralityAndPopularityClassifier.loadData(dir)
        ViralityAndPopularityClassifier.youtubeViralAndPopular(dir, clf, 2)
        ViralityAndPopularityClassifier.twitterViralAndPopular(dir, clf, 2)
        ViralityAndPopularityClassifier.bothViralAndPopular(dir, clf, 2)
Source file: GradientBoostingClassifier.py · Project: yttresearch-machine-learning-algorithms-analysis · Author: gdemos01
def classify():
    # Predict popularity
    gbdt = GradientBoostingClassifier()
    gbdt.fit(X, YP)
    print(valVir.shape)
    prediction = gbdt.predict(videos)  # a single predict call suffices; the original called it twice
    print(prediction)
    same = 0
    for i in range(0, valPop.size):
        if valPop[i] == prediction[i]:
            same = same + 1
    accuracy = same / valPop.size * 100  # typo fixed: "accurancy" -> "accuracy"
    print(accuracy)
Source file: classify_user_item.py · Project: dut_tianchi_mobile_recommend_train · Author: ningshixian
def classify_user_item(train_data_new, test_data_new, result9):
    data = np.loadtxt(train_data_new)
    X = data[:, :-1]  # select columns 0 through end-1
    y = data[:, -1]   # select the last column
    print(X)
    print(y)
    print('start train')
    clf2 = RandomForestClassifier(n_estimators=100)
    # clf2 = GradientBoostingClassifier()
    clf2.fit(X, y)
    # clf2 = LogisticRegression().fit(X, y)
    print(clf2.classes_)
    data1 = np.loadtxt(test_data_new)
    X_test = data1[:, :]
    print('testing data is ok')
    result = clf2.predict_proba(X_test)
    print('output result')
    print(result)
    with open(result9, 'w') as f_result:  # context manager closes the file
        for i in range(0, len(result)):
            f_result.write(str(result[i]) + '\n')
def GradientBoostingClassifier(X_train, y_train, X_test):
    # Note: this function shadows the sklearn class name it imports.
    from sklearn.ensemble import GradientBoostingClassifier
    now = datetime.datetime.now()
    print("GradientBoostingClassifier start in " + now.strftime('%Y-%m-%d %H:%M:%S'))
    # n_jobs removed: GradientBoostingClassifier does not accept it
    # (boosting is sequential), and passing it raises a TypeError.
    GBC = GradientBoostingClassifier(max_features='sqrt',
                                     n_estimators=300,
                                     learning_rate=0.02,
                                     max_depth=8,
                                     subsample=0.8)
    GBC.fit(X_train, y_train)
    now = datetime.datetime.now()
    print("GradientBoostingClassifier train done in " + now.strftime('%Y-%m-%d %H:%M:%S'))
    y_pred_GBC = GBC.predict_proba(X_test)
    y_pred_GBC = pd.DataFrame(y_pred_GBC[:, 1:2], columns=['GBC_predictions'])
    y_pred_GBC.to_csv('GBC_result_all.csv', index=False)
    now = datetime.datetime.now()
    print("GradientBoostingClassifier predict done in " + now.strftime('%Y-%m-%d %H:%M:%S'))
def GradientBoostingClassifier(X_train, y_train, X_test):
    from sklearn.ensemble import GradientBoostingClassifier
    now = datetime.datetime.now()
    print("GradientBoostingClassifier start in " + now.strftime('%Y-%m-%d %H:%M:%S'))
    GBC = GradientBoostingClassifier(max_features='sqrt',
                                     n_estimators=300,
                                     learning_rate=0.02,
                                     max_depth=8,
                                     subsample=0.8)
    GBC.fit(X_train, y_train)
    now = datetime.datetime.now()
    print("GradientBoostingClassifier train done in " + now.strftime('%Y-%m-%d %H:%M:%S'))
    y_pred_GBC = GBC.predict_proba(X_test)
    y_pred_GBC = pd.DataFrame(y_pred_GBC[:, 1:2], columns=['GBC_predictions'])
    y_pred_GBC.to_csv('GBC_result_1.csv', index=False)
    now = datetime.datetime.now()
    print("GradientBoostingClassifier predict done in " + now.strftime('%Y-%m-%d %H:%M:%S'))
def on_startup(app):
    connector = aiohttp.TCPConnector(limit=5, use_dns_cache=True, loop=app.loop)
    session = aiohttp.ClientSession(connector=connector, raise_for_status=True)
    bot = TelegramBot(app['config'].token, session)
    image_model = fit_model(app['config'].sample_df)

    def config_injections(binder):
        # injection bindings
        binder.bind(Config, app['config'])
        binder.bind(TelegramBot, bot)
        binder.bind(GradientBoostingClassifier, image_model)
        binder.bind_to_constructor(AsyncIOMotorDatabase, init_database)

    try:
        inject.configure(config_injections)
    except inject.InjectorException:
        log.error("Injector already configured", exc_info=True)
    setup_logging(log)
    app.loop.create_task(bot.set_hook())
def test_GradientBoostingClassifier_num(*data):
    '''
    test the performance with different n_estimators
    :param data: train_data, test_data, train_value, test_value
    :return: None
    '''
    X_train, X_test, y_train, y_test = data
    nums = np.arange(1, 100, step=2)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    testing_scores = []
    training_scores = []
    for num in nums:
        clf = ensemble.GradientBoostingClassifier(n_estimators=num)
        clf.fit(X_train, y_train)
        training_scores.append(clf.score(X_train, y_train))
        testing_scores.append(clf.score(X_test, y_test))
    ax.plot(nums, training_scores, label="Training Score")
    ax.plot(nums, testing_scores, label="Testing Score")
    ax.set_xlabel("estimator num")
    ax.set_ylabel("score")
    ax.legend(loc="lower right")
    ax.set_ylim(0, 1.05)
    plt.suptitle("GradientBoostingClassifier")
    plt.show()
def test_GradientBoostingClassifier_maxdepth(*data):
    '''
    test the performance with different max_depth
    :param data: train_data, test_data, train_value, test_value
    :return: None
    '''
    X_train, X_test, y_train, y_test = data
    maxdepths = np.arange(1, 20)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    testing_scores = []
    training_scores = []
    for maxdepth in maxdepths:
        clf = ensemble.GradientBoostingClassifier(max_depth=maxdepth, max_leaf_nodes=None)
        clf.fit(X_train, y_train)
        training_scores.append(clf.score(X_train, y_train))
        testing_scores.append(clf.score(X_test, y_test))
    ax.plot(maxdepths, training_scores, label="Training Score")
    ax.plot(maxdepths, testing_scores, label="Testing Score")
    ax.set_xlabel("max_depth")
    ax.set_ylabel("score")
    ax.legend(loc="lower right")
    ax.set_ylim(0, 1.05)
    plt.suptitle("GradientBoostingClassifier")
    plt.show()
def test_GradientBoostingClassifier_learning(*data):
    '''
    test the performance with different learning rate
    :param data: train_data, test_data, train_value, test_value
    :return: None
    '''
    X_train, X_test, y_train, y_test = data
    learnings = np.linspace(0.01, 1.0)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    testing_scores = []
    training_scores = []
    for learning in learnings:
        clf = ensemble.GradientBoostingClassifier(learning_rate=learning)
        clf.fit(X_train, y_train)
        training_scores.append(clf.score(X_train, y_train))
        testing_scores.append(clf.score(X_test, y_test))
    ax.plot(learnings, training_scores, label="Training Score")
    ax.plot(learnings, testing_scores, label="Testing Score")
    ax.set_xlabel("learning_rate")
    ax.set_ylabel("score")
    ax.legend(loc="lower right")
    ax.set_ylim(0, 1.05)
    plt.suptitle("GradientBoostingClassifier")
    plt.show()
def test_GradientBoostingClassifier_subsample(*data):
    '''
    test the performance with different subsample
    :param data: train_data, test_data, train_value, test_value
    :return: None
    '''
    X_train, X_test, y_train, y_test = data
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    subsamples = np.linspace(0.01, 1.0)
    testing_scores = []
    training_scores = []
    for subsample in subsamples:
        clf = ensemble.GradientBoostingClassifier(subsample=subsample)
        clf.fit(X_train, y_train)
        training_scores.append(clf.score(X_train, y_train))
        testing_scores.append(clf.score(X_test, y_test))
    ax.plot(subsamples, training_scores, label="Training Score")
    ax.plot(subsamples, testing_scores, label="Testing Score")  # label fixed: was a duplicate "Training Score"
    ax.set_xlabel("subsample")
    ax.set_ylabel("score")
    ax.legend(loc="lower right")
    ax.set_ylim(0, 1.05)
    plt.suptitle("GradientBoostingClassifier")
    plt.show()
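A hedged driver for the four sweep functions above; the data split is an assumption (the original snippets do not show it), here using sklearn's bundled digits dataset:

import numpy as np
import matplotlib.pyplot as plt
from sklearn import ensemble
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target, test_size=0.25, random_state=0)
# Each function unpacks (X_train, X_test, y_train, y_test) and shows a plot.
test_GradientBoostingClassifier_num(X_train, X_test, y_train, y_test)
test_GradientBoostingClassifier_maxdepth(X_train, X_test, y_train, y_test)
test_GradientBoostingClassifier_learning(X_train, X_test, y_train, y_test)
test_GradientBoostingClassifier_subsample(X_train, X_test, y_train, y_test)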
def test_friedman_mse_in_graphviz():
    clf = DecisionTreeRegressor(criterion="friedman_mse", random_state=0)
    clf.fit(X, y)
    dot_data = StringIO()
    export_graphviz(clf, out_file=dot_data)

    clf = GradientBoostingClassifier(n_estimators=2, random_state=0)
    clf.fit(X, y)
    for estimator in clf.estimators_:
        export_graphviz(estimator[0], out_file=dot_data)

    # raw string avoids invalid escape sequences in the regex
    for finding in finditer(r"\[.*?samples.*?\]", dot_data.getvalue()):
        assert_in("friedman_mse", finding.group())
def check_classification_toy(presort, loss):
    # Check classification on a toy dataset.
    clf = GradientBoostingClassifier(loss=loss, n_estimators=10,
                                     random_state=1, presort=presort)
    assert_raises(ValueError, clf.predict, T)

    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)
    assert_equal(10, len(clf.estimators_))

    deviance_decrease = (clf.train_score_[:-1] - clf.train_score_[1:])
    assert_true(np.any(deviance_decrease >= 0.0))

    leaves = clf.apply(X)
    assert_equal(leaves.shape, (6, 10, 1))
def test_probability_log():
    # Predict probabilities.
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1)
    assert_raises(ValueError, clf.predict_proba, T)

    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)

    # check if probabilities are in [0, 1].
    y_proba = clf.predict_proba(T)
    assert_true(np.all(y_proba >= 0.0))
    assert_true(np.all(y_proba <= 1.0))

    # derive predictions from probabilities
    y_pred = clf.classes_.take(y_proba.argmax(axis=1), axis=0)
    assert_array_equal(y_pred, true_result)
def test_check_inputs_predict():
    # X has wrong shape
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1)
    clf.fit(X, y)

    x = np.array([1.0, 2.0])[:, np.newaxis]
    assert_raises(ValueError, clf.predict, x)

    x = np.array([[]])
    assert_raises(ValueError, clf.predict, x)

    x = np.array([1.0, 2.0, 3.0])[:, np.newaxis]
    assert_raises(ValueError, clf.predict, x)

    clf = GradientBoostingRegressor(n_estimators=100, random_state=1)
    clf.fit(X, rng.rand(len(X)))

    x = np.array([1.0, 2.0])[:, np.newaxis]
    assert_raises(ValueError, clf.predict, x)

    x = np.array([[]])
    assert_raises(ValueError, clf.predict, x)

    x = np.array([1.0, 2.0, 3.0])[:, np.newaxis]
    assert_raises(ValueError, clf.predict, x)
def test_staged_functions_defensive():
    # test that staged_functions make defensive copies
    rng = np.random.RandomState(0)
    X = rng.uniform(size=(10, 3))
    y = (4 * X[:, 0]).astype(int) + 1  # don't predict zeros; np.int is deprecated, plain int works
    for estimator in [GradientBoostingRegressor(),
                      GradientBoostingClassifier()]:
        estimator.fit(X, y)
        for func in ['predict', 'decision_function', 'predict_proba']:
            staged_func = getattr(estimator, "staged_" + func, None)
            if staged_func is None:
                # regressor has no staged_predict_proba
                continue
            with warnings.catch_warnings(record=True):
                staged_result = list(staged_func(X))
            staged_result[1][:] = 0
            assert_true(np.all(staged_result[0] != 0))
def test_serialization():
    # Check model serialization.
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1)
    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)
    assert_equal(100, len(clf.estimators_))

    try:
        import cPickle as pickle
    except ImportError:
        import pickle

    serialized_clf = pickle.dumps(clf, protocol=pickle.HIGHEST_PROTOCOL)
    clf = None
    clf = pickle.loads(serialized_clf)
    assert_array_equal(clf.predict(T), true_result)
    assert_equal(100, len(clf.estimators_))
def test_more_verbose_output():
    # Check verbose=2 does not cause error.
    from sklearn.externals.six.moves import cStringIO as StringIO
    import sys

    old_stdout = sys.stdout
    sys.stdout = StringIO()
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1,
                                     verbose=2)
    clf.fit(X, y)
    verbose_output = sys.stdout
    sys.stdout = old_stdout

    # check output
    verbose_output.seek(0)
    header = verbose_output.readline().rstrip()
    # no OOB
    true_header = ' '.join(['%10s'] + ['%16s'] * 2) % (
        'Iter', 'Train Loss', 'Remaining Time')
    assert_equal(true_header, header)

    n_lines = sum(1 for l in verbose_output.readlines())
    # 100 lines for n_estimators==100
    assert_equal(100, n_lines)
def test_warm_start_oob():
    # Test if warm start OOB equals fit.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    for Cls in [GradientBoostingRegressor, GradientBoostingClassifier]:
        est = Cls(n_estimators=200, max_depth=1, subsample=0.5,
                  random_state=1)
        est.fit(X, y)

        est_ws = Cls(n_estimators=100, max_depth=1, subsample=0.5,
                     random_state=1, warm_start=True)
        est_ws.fit(X, y)
        est_ws.set_params(n_estimators=200)
        est_ws.fit(X, y)

        assert_array_almost_equal(est_ws.oob_improvement_[:100],
                                  est.oob_improvement_[:100])
def test_probability_exponential():
    # Predict probabilities.
    clf = GradientBoostingClassifier(loss='exponential',
                                     n_estimators=100, random_state=1)
    assert_raises(ValueError, clf.predict_proba, T)

    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)

    # check if probabilities are in [0, 1].
    y_proba = clf.predict_proba(T)
    assert_true(np.all(y_proba >= 0.0))
    assert_true(np.all(y_proba <= 1.0))

    # for exponential loss, probabilities are the logistic of twice the decision function
    score = clf.decision_function(T).ravel()
    assert_array_almost_equal(y_proba[:, 1],
                              1.0 / (1.0 + np.exp(-2 * score)))

    # derive predictions from probabilities
    y_pred = clf.classes_.take(y_proba.argmax(axis=1), axis=0)
    assert_array_equal(y_pred, true_result)
def test_partial_dependence_classifier():
    # Test partial dependence for classifier
    clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
    clf.fit(X, y)

    pdp, axes = partial_dependence(clf, [0], X=X, grid_resolution=5)

    # only 4 grid points instead of 5 because only 4 unique X[:, 0] vals
    assert pdp.shape == (1, 4)
    assert axes[0].shape[0] == 4

    # now with our own grid
    X_ = np.asarray(X)
    grid = np.unique(X_[:, 0])
    pdp_2, axes = partial_dependence(clf, [0], grid=grid)

    assert axes is None
    assert_array_equal(pdp, pdp_2)
def __init__(self, info, verbose=True, debug_mode=False):
    self.label_num = info['label_num']
    self.target_num = info['target_num']
    self.task = info['task']
    self.metric = info['metric']
    self.postprocessor = None
    # self.postprocessor = MultiLabelEnsemble(LogisticRegression(), balance=True)  # To calibrate proba
    self.postprocessor = MultiLabelEnsemble(LogisticRegression(), balance=False)  # To calibrate proba
    if debug_mode >= 2:
        self.name = "RandomPredictor"
        self.model = RandomPredictor(self.target_num)
        self.predict_method = self.model.predict_proba
        return
    if info['task'] == 'regression':
        if info['is_sparse'] == True:
            self.name = "BaggingRidgeRegressor"
            self.model = BaggingRegressor(base_estimator=Ridge(), n_estimators=1, verbose=verbose)  # unfortunately, no warm start...
        else:
            self.name = "GradientBoostingRegressor"
            self.model = GradientBoostingRegressor(n_estimators=1, max_depth=4, min_samples_split=14, verbose=verbose, warm_start=True)
        self.predict_method = self.model.predict  # regression: plain predictions
    else:
        if info['has_categorical']:  # Out of laziness, we do not convert categorical variables...
            self.name = "RandomForestClassifier"
            self.model = RandomForestClassifier(n_estimators=1, verbose=verbose)  # unfortunately, no warm start...
        elif info['is_sparse']:
            self.name = "BaggingNBClassifier"
            self.model = BaggingClassifier(base_estimator=BernoulliNB(), n_estimators=1, verbose=verbose)  # unfortunately, no warm start...
        else:
            self.name = "GradientBoostingClassifier"
            self.model = eval(self.name + "(n_estimators=1, verbose=" + str(verbose) + ", random_state=1, warm_start=True)")
        if info['task'] == 'multilabel.classification':
            self.model = MultiLabelEnsemble(self.model)
        self.predict_method = self.model.predict_proba  # classification: always predict probabilities
def try_params(n_iterations, params):
    n_estimators = int(round(n_iterations * trees_per_iteration))
    print("n_estimators:", n_estimators)
    pprint(params)
    clf = GB(n_estimators=n_estimators, verbose=0, **params)
    return train_and_eval_sklearn_classifier(clf, data)
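A hypothetical setup for try_params: `GB` is presumably an alias for GradientBoostingClassifier, while `trees_per_iteration`, `data`, and `train_and_eval_sklearn_classifier` come from the surrounding hyperparameter-search script and are assumed here:

from pprint import pprint
from sklearn.ensemble import GradientBoostingClassifier as GB

trees_per_iteration = 5  # assumed constant from the host script
params = {'learning_rate': 0.1, 'max_depth': 3, 'subsample': 0.8}
result = try_params(10, params)  # builds a ~50-tree model and evaluates it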