def classify(train=None, test=None, data=None, res_dir="res/", disp=True, outfilename=None):
"""Description of compare
compare multiple classifier and display the best one
"""
utils.print_success("Comparison of differents classifiers")
if data is not None:
train_features = data["train_features"]
train_groundtruths = data["train_groundtruths"]
test_features = data["test_features"]
test_groundtruths = data["test_groundtruths"]
else:
train = utils.abs_path_file(train)
test = utils.abs_path_file(test)
train_features, train_groundtruths = read_file(train)
test_features, test_groundtruths = read_file(test)
if not utils.create_dir(res_dir):
res_dir = utils.abs_path_dir(res_dir)
classifiers = {
"RandomForest": RandomForestClassifier(n_jobs=-1)
# "RandomForest": RandomForestClassifier(n_estimators=5),
# "KNeighbors":KNeighborsClassifier(3),
# "GaussianProcess":GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
# "DecisionTree":DecisionTreeClassifier(max_depth=5),
# "MLP":MLPClassifier(),
# "AdaBoost":AdaBoostClassifier(),
# "GaussianNB":GaussianNB(),
# "QDA":QuadraticDiscriminantAnalysis(),
# "SVM":SVC(kernel="linear", C=0.025),
# "GradientBoosting":GradientBoostingClassifier(),
# "ExtraTrees":ExtraTreesClassifier(),
# "LogisticRegression":LogisticRegression(),
# "LinearDiscriminantAnalysis":LinearDiscriminantAnalysis()
}
    for key, clf in classifiers.items():
        utils.print_success(key)
utils.print_info("\tFit")
clf.fit(train_features, train_groundtruths)
utils.print_info("\tPredict")
predictions = clf.predict(test_features)
return predictions
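A minimal usage sketch for classify() above, assuming the project's read_file() parses each line of the files into a feature vector and a ground-truth label; the paths are hypothetical:

# Hypothetical file paths; classify() resolves them via utils.abs_path_file()
predictions = classify(train="data/train.txt", test="data/test.txt", res_dir="res/")
print(predictions[:10])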
Python GradientBoostingClassifier() usage examples (source code)
def constructModel(corpus, classList, features, modelOutput):
"""
    Trains a bagging classifier on the given corpus.
Args:
corpus: A list of lists, containing the GC content, coverage, and class number.
classList: A list of class names.
features: List of variables used by each contig.
modelOutput: Location to save model as GraphViz DOT, or False to save no model.
Returns:
        classifier: A BaggingClassifier object that has been trained on the training split of the corpus.
"""
corpus.sort() # just in case
X = []
Y = []
for item in corpus:
X.append(item[:-1]) # all but the last item
Y.append(item[-1]) # only the last item
    # mscv is assumed to alias sklearn.model_selection (or the legacy sklearn.cross_validation)
    X_train, X_test, Y_train, Y_test = mscv.train_test_split(X, Y, test_size=0.3, random_state=0)
# TODO: implement classifier testing and comparison, now only baggingClassifier is used as per paper
#treeClassifier = tree.DecisionTreeClassifier()
#treeClassifier = treeClassifier.fit(X_train, Y_train)
#click.echo("Decision tree classifier built, score is %s out of 1.00" % treeClassifier.score(X_test, Y_test))
baggingClassifier = ensemble.BaggingClassifier()
baggingClassifier = baggingClassifier.fit(X_train, Y_train)
click.echo("Bagging classifier built, score is %s out of 1.00" % baggingClassifier.score(X_test, Y_test))
#forestClassifier = ensemble.RandomForestClassifier(n_estimators=10)
#forestClassifier = forestClassifier.fit(X_train, Y_train)
#click.echo("Random forest classifier built, score is %s out of 1.00" % forestClassifier.score(X_test, Y_test))
#adaClassifier = ensemble.AdaBoostClassifier(n_estimators=100)
#adaClassifier = adaClassifier.fit(X_train, Y_train)
#click.echo("AdaBoost classifier built, score is %s out of 1.00" % adaClassifier.score(X_test, Y_test))
#gradientClassifier = ensemble.GradientBoostingClassifier(n_estimators=100)
#gradientClassifier = gradientClassifier.fit(X_train, Y_train)
#click.echo("Gradient tree boosting classifier built, score is %s out of 1.00" % gradientClassifier.score(X_test, Y_test))
    if modelOutput:
        # export_graphviz only accepts a single decision tree, so export the
        # first tree from the bagging ensemble rather than the ensemble itself
        with open(modelOutput, 'w') as dotfile:
            tree.export_graphviz(baggingClassifier.estimators_[0], out_file=dotfile,
                                 feature_names=features, class_names=classList,
                                 filled=True, rounded=True, special_characters=True)
return baggingClassifier
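A small usage sketch for constructModel(), with a made-up corpus in the [GC content, coverage, class index] layout the docstring describes; the class and feature names are illustrative:

corpus = [[0.42, 30.1, 0], [0.55, 12.7, 1], [0.48, 25.3, 0],
          [0.61, 8.4, 1], [0.39, 28.6, 0], [0.58, 10.2, 1]]
clf = constructModel(corpus,
                     classList=["chromosome", "plasmid"],
                     features=["gc_content", "coverage"],
                     modelOutput=False)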
def score(self, estimator, X, y, advanced_scoring=False):
X, y = utils.drop_missing_y_vals(X, y, output_column=None)
        if isinstance(estimator, GradientBoostingClassifier):
            # GradientBoostingClassifier does not accept sparse input, so densify
            X = X.toarray()
predictions = estimator.predict_proba(X)
if self.scoring_method == 'brier_score_loss':
            # At the moment, Microsoft's LightGBM can return probabilities > 1 or < 0, which can break some scoring functions, so we clamp each predicted probability into [0, 1].
probas = [max(min(row[1], 1), 0) for row in predictions]
predictions = probas
try:
score = self.scoring_func(y, predictions)
        except ValueError:
            # bad_vals_as_strings is assumed to be a module-level set of string
            # representations of null/infinite labels (e.g. 'nan', 'inf')
            bad_val_indices = []
for idx, val in enumerate(y):
if str(val) in bad_vals_as_strings:
bad_val_indices.append(idx)
predictions = [val for idx, val in enumerate(predictions) if idx not in bad_val_indices]
y = [val for idx, val in enumerate(y) if idx not in bad_val_indices]
print('Found ' + str(len(bad_val_indices)) + ' null or infinity values in the y values. We will ignore these, and report the score on the rest of the dataset')
try:
score = self.scoring_func(y, predictions)
except ValueError:
# Sometimes, particularly for a badly fit model using either too little data, or a really bad set of hyperparameters during a grid search, we can predict probas that are > 1 or < 0. We'll cap those here, while warning the user about them, because they're unlikely to occur in a model that's properly trained with enough data and reasonable params
predictions = self.clean_probas(predictions)
score = self.scoring_func(y, predictions)
if advanced_scoring:
return (-1 * score, predictions)
else:
return -1 * score
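The clean_probas() helper referenced above is not shown here; a plausible sketch, assuming flat binary-style probabilities, would simply clamp each value into [0, 1]:

def clean_probas(self, probas):
    # Clamp out-of-range probabilities produced by a badly fit model
    return [min(max(float(p), 0.0), 1.0) for p in probas]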
def __init__(self, info, verbose=True, debug_mode=False):
self.label_num=info['label_num']
self.target_num=info['target_num']
self.task = info['task']
self.metric = info['metric']
self.postprocessor = None
#self.postprocessor = MultiLabelEnsemble(LogisticRegression(), balance=True) # To calibrate proba
self.postprocessor = MultiLabelEnsemble(LogisticRegression(), balance=False) # To calibrate proba
if debug_mode>=2:
self.name = "RandomPredictor"
self.model = RandomPredictor(self.target_num)
self.predict_method = self.model.predict_proba
return
if info['task']=='regression':
        if info['is_sparse']:
self.name = "BaggingRidgeRegressor"
self.model = BaggingRegressor(base_estimator=Ridge(), n_estimators=1, verbose=verbose) # unfortunately, no warm start...
else:
self.name = "GradientBoostingRegressor"
self.model = GradientBoostingRegressor(n_estimators=1, max_depth=4, min_samples_split=14, verbose=verbose, warm_start = True)
            self.predict_method = self.model.predict # Regression: predict continuous values directly
else:
        if info['has_categorical']: # Out of laziness, we do not convert categorical variables...
self.name = "RandomForestClassifier"
self.model = RandomForestClassifier(n_estimators=1, verbose=verbose) # unfortunately, no warm start...
elif info['is_sparse']:
self.name = "BaggingNBClassifier"
self.model = BaggingClassifier(base_estimator=BernoulliNB(), n_estimators=1, verbose=verbose) # unfortunately, no warm start...
else:
self.name = "GradientBoostingClassifier"
                self.model = GradientBoostingClassifier(n_estimators=1, verbose=verbose, random_state=1, warm_start=True)
if info['task']=='multilabel.classification':
self.model = MultiLabelEnsemble(self.model)
self.predict_method = self.model.predict_proba
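The "unfortunately, no warm start..." comments above refer to scikit-learn's warm_start pattern, where an ensemble built with n_estimators=1 can be grown one stage at a time across repeated fit() calls. A sketch of that incremental loop (X_train and y_train are placeholders):

from sklearn.ensemble import GradientBoostingClassifier

# X_train, y_train: placeholder training data
model = GradientBoostingClassifier(n_estimators=1, warm_start=True, random_state=1)
for n in range(2, 11):
    model.n_estimators = n       # request one more boosting stage
    model.fit(X_train, y_train)  # fits only the new stage, reusing earlier ones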
def get_classification():
    # clf = svm.SVC()  # superseded by the gradient boosting model below
    clf = ensemble.GradientBoostingClassifier()
return clf
def __init__(self, nr_events, case_id_col, label_col, encoder_kwargs, cls_kwargs, cls_method="rf"):
self.case_id_col = case_id_col
self.label_col = label_col
self.encoder = SequenceEncoder(nr_events=nr_events, case_id_col=case_id_col, label_col=label_col,
**encoder_kwargs)
if cls_method == "gbm":
self.cls = GradientBoostingClassifier(**cls_kwargs)
elif cls_method == "rf":
self.cls = RandomForestClassifier(**cls_kwargs)
        else:
            raise ValueError("Unknown classifier method: %s" % cls_method)
def GBDT_classify(train_dataSet_path, test_dataSet_path, train_one_and_two_result_as_proba_path):
    train_data = pd.read_csv(train_dataSet_path).values
    X_train = train_data[:, 2:-1]  # feature columns (2 through second-to-last)
    y_train = train_data[:, -1]    # last column holds the label
    test_data = pd.read_csv(test_dataSet_path).values
    X_test = test_data[:, 2:-1]
    y_test = test_data[:, -1]
    clf = GradientBoostingClassifier(n_estimators=200)
    clf.fit(X_train, y_train)
    pre_y_test = clf.predict_proba(X_test)
    print(pre_y_test)
    # precision_recall_fscore_support expects class labels, not probabilities
    print("GBDT Metrics : {0}".format(precision_recall_fscore_support(y_test, clf.predict(X_test))))
    print('Writing class-0 probabilities...')
    with open(train_one_and_two_result_as_proba_path, 'w') as f_result:
        for i in range(len(pre_y_test)):
            if i == 0 or i == len(pre_y_test) - 1:
                print(str(pre_y_test[i][0]))
            f_result.write(str(pre_y_test[i][0]) + '\n')
    return clf
def performGTBClass(X_train, y_train, X_test, y_test, fout, savemodel):
"""
Gradient Tree Boosting binary Classification
"""
clf = GradientBoostingClassifier(n_estimators=100)
clf.fit(X_train, y_train)
# if savemodel == True:
# fname_out = '{}-{}.pickle'.format(fout, datetime.now())
# with open(fname_out, 'wb') as f:
# cPickle.dump(clf, f, -1)
accuracy = clf.score(X_test, y_test)
print "GTBClass: ", accuracy
def get_data_preprocessor_balancing(params, y):
d_balancing = params['layer_dict_list'][1]
if params['balancing'] == str(d_balancing['None']) or params['balancing'] == 'None':
# for fp: ['ExtraTreesClassifier', 'LinearSVC'] + clf: ['DecisionTreeClassifier', 'ExtraTreesClassifier', 'LinearSVC', 'SVC', 'RandomForestClassifier', 'SGDClassifier']
params['class_weight'] = None
        # for clf: ['AdaBoostClassifier', 'GradientBoostingClassifier']
params['sample_weight'] = None
elif params['balancing'] == str(d_balancing['weighting']) or params['balancing'] == 'weighting':
# for fp: ['ExtraTreesClassifier', 'LinearSVC'] + clf: ['DecisionTreeClassifier', 'ExtraTreesClassifier', 'LinearSVC', 'SVC', 'RandomForestClassifier', 'SGDClassifier']
        params['class_weight'] = 'auto'  # legacy alias; newer scikit-learn spells this 'balanced'
# for clf: ['AdaBoostClassifier', 'GradientBoostingClassifier']
if len(y.shape) > 1:
offsets = [2 ** i for i in range(y.shape[1])]
y_ = np.sum(y * offsets, axis=1)
else:
y_ = y
unique, counts = np.unique(y_, return_counts=True)
cw = 1. / counts
cw = cw / np.mean(cw)
sample_weight = np.ones(y_.shape)
for i, ue in enumerate(unique):
mask = y_ == ue
sample_weight[mask] *= cw[i]
params['sample_weight'] = sample_weight
return params
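The per-sample weights computed above feed directly into estimators whose fit() accepts a sample_weight argument, e.g. GradientBoostingClassifier; a sketch with placeholder data:

from sklearn.ensemble import GradientBoostingClassifier

# params, X_train, y_train: placeholders for the pipeline's config and data
params = get_data_preprocessor_balancing(params, y_train)
clf = GradientBoostingClassifier()
clf.fit(X_train, y_train, sample_weight=params['sample_weight'])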
def model_fitting(train_set, train_labels, classifier_name, n_jobs=cpu_count()):
"""
The fitting process with sklearn algorithms.
:param train_set: numpy array, required
:param train_labels: list, required
:param classifier_name: string, required
    :param n_jobs: integer, optional (defaults to cpu_count())
:return: object
- Fit classifier model according to the given training data
"""
classifier_list = {"svm_linear": SVC(probability=True, kernel='linear', C=1.0),
"svm_poly": SVC(probability=True, kernel='poly', C=1.0),
"svm_rbf": SVC(probability=True, kernel='rbf', C=1.0, gamma=0.01),
"linear_svc": LinearSVC(penalty='l2', loss='squared_hinge', dual=True, tol=0.1, C=1.0, multi_class='ovr', fit_intercept=True,
intercept_scaling=1, random_state=None, max_iter=3000),
"knn": KNeighborsClassifier(n_neighbors=100, weights='distance', leaf_size=30, n_jobs=n_jobs),
"random_forests": RandomForestClassifier(n_estimators=350, criterion='entropy', min_samples_split=2,
min_samples_leaf=1, max_leaf_nodes=600, n_jobs=n_jobs),
"logistic_regression": LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=2.4, fit_intercept=True, intercept_scaling=1,
random_state=None, solver='liblinear', max_iter=1000, multi_class='ovr',
warm_start=False, n_jobs=n_jobs),
"decision_trees": DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None, min_samples_split=2,
min_samples_leaf=100, min_weight_fraction_leaf=0.0, max_features=None,
random_state=None, max_leaf_nodes=None, presort=False),
"sgd": SGDClassifier(alpha=.0001, n_iter=500, penalty="elasticnet", n_jobs=n_jobs),
"neural_network": Classifier(layers=[Layer("Sigmoid", units=14), Layer("Sigmoid", units=13), Layer("Sigmoid", units=12),
Layer("Sigmoid", units=10), Layer("Softmax")], learning_rate=0.01, n_iter=200,
batch_size=10, regularize='L1', n_stable=50, dropout_rate=0, verbose=True),
"GBC": GradientBoostingClassifier(max_depth=10, max_leaf_nodes=850, min_samples_leaf=15, learning_rate=0.1),
"XGB": XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
max_depth=10, min_child_weight=2, missing=None, n_estimators=100, nthread=n_jobs, reg_alpha=0,
objective='binary:logistic', reg_lambda=1, scale_pos_weight=1, seed=0, silent=True, subsample=1)}
return classifier_list[classifier_name].fit(train_set, train_labels)
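Calling model_fitting() then looks like the following; train_set, train_labels, and test_set are placeholders:

model = model_fitting(train_set, train_labels, classifier_name="GBC")
predictions = model.predict(test_set)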
def setUpClass(self):
"""
Set up the unit test by loading the dataset and training a model.
"""
        from sklearn.datasets import load_boston  # note: removed in scikit-learn 1.2
        scikit_data = load_boston()
scikit_model = GradientBoostingClassifier(random_state = 1)
target = scikit_data['target'] > scikit_data['target'].mean()
scikit_model.fit(scikit_data['data'], target)
# Save the data and the model
self.scikit_data = scikit_data
self.scikit_model = scikit_model
def test_conversion_bad_inputs(self):
# Error on converting an untrained model
with self.assertRaises(Exception):
model = GradientBoostingClassifier()
spec = skl_converter.convert(model, 'data', 'out')
        # Check the expected class during conversion.
from sklearn.preprocessing import OneHotEncoder
with self.assertRaises(Exception):
model = OneHotEncoder()
spec = skl_converter.convert(model, 'data', 'out')
def fitAndPredict(self):
# classifier = LogisticRegression()
# classifier.fit(self.trainingSet, self.trainingLabel)
# pred_labels = classifier.predict(self.testSet)
# print 'Logistic:'
# print classification_report(self.testLabel, pred_labels)
classifier = SVC()
classifier.fit(self.trainingSet, self.trainingLabel)
pred_labels = {}
for user in self.testDict:
pred_labels[user] = classifier.predict([self.model.docvecs[user]])
# print 'SVM:'
# print classification_report(self.testLabel, pred_labels)
return pred_labels
# classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
# max_depth=1, random_state=0)
# classifier.fit(self.trainingSet, self.trainingLabel)
# pred_labels = classifier.predict(self.testSet)
# print 'GBDT:'
# print classification_report(self.testLabel, pred_labels)
#
        # classifier = AdaBoostClassifier(n_estimators=100)
# classifier.fit(self.trainingSet, self.trainingLabel)
# pred_labels = classifier.predict(self.testSet)
# print 'AdaBoost:'
# print classification_report(self.testLabel, pred_labels)
#
        # classifier = RandomForestClassifier(n_estimators=10)
# classifier.fit(self.trainingSet, self.trainingLabel)
# pred_labels = classifier.predict(self.testSet)
# print 'Random Forest:'
# print classification_report(self.testLabel, pred_labels)
def makEnsemble( X, xlist, Y ):
#naive bayes
clf = MultinomialNB()
clf.fit( xlist, Y )
featureSelectModel.append (clf)
#K nearest neighbours
clf = KNeighborsClassifier()
clf.fit( xlist, Y )
featureSelectModel.append (clf)
#Logistic regression
clf = LogisticRegression(C=1)
clf.fit( xlist, Y )
featureSelectModel.append (clf)
#random forest
clf = RandomForestClassifier(n_estimators = 400)
clf.fit( X, Y )
wholeFeatureModel.append (clf)
    # extremely randomized trees (ExtraTrees)
clf = ExtraTreesClassifier(n_estimators = 400)
clf.fit( X, Y )
wholeFeatureModel.append (clf)
    # decision tree
    clf = DecisionTreeClassifier(max_depth=None, min_samples_split=2, random_state=0)  # min_samples_split must be >= 2 in scikit-learn
clf.fit( X, Y )
wholeFeatureModel.append (clf)
#gradient boosting
    # scikit-learn requires min_samples_split >= 2
    params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
              'learning_rate': 0.01}
clf = GradientBoostingClassifier(**params)
clf.fit( X, Y )
wholeFeatureModel.append (clf)
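makEnsemble() only fits and stores the models; one simple way to combine them afterwards is soft voting over predicted probabilities. A sketch, assuming every stored model exposes predict_proba() and X_test is a placeholder:

import numpy as np

def ensemble_predict(models, X):
    # Average class probabilities across the fitted models (soft voting)
    probas = np.mean([m.predict_proba(X) for m in models], axis=0)
    return np.argmax(probas, axis=1)

labels = ensemble_predict(wholeFeatureModel, X_test)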
def gradient_boosting_classifier(train_x, train_y):
from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier(n_estimators=200)
model.fit(train_x, train_y)
return model
def test_create(self):
SigOptSearchCV(
estimator=GradientBoostingClassifier,
param_domains=GradientBoostingClassifier_PARAM_DOMAIN,
client_token='client_token'
)
def test_no_token(self):
with pytest.raises(ValueError):
SigOptSearchCV(estimator=GradientBoostingClassifier, param_domains=GradientBoostingClassifier_PARAM_DOMAIN)
def test_search(self):
conn = sigopt.Connection()
n_iter = 5
folds = 3
cv = SigOptSearchCV(
estimator=GradientBoostingClassifier(),
param_domains=GradientBoostingClassifier_PARAM_DOMAIN,
client_token='client_token',
n_iter=n_iter,
cv=folds
)
assert len(conn.experiments().create.mock_calls) == 0
assert len(conn.experiments().fetch.mock_calls) == 0
assert len(conn.experiments().suggestions.create.mock_calls) == 0
assert len(conn.experiments().observations.create.mock_calls) == 0
data = sklearn.datasets.load_iris()
cv.fit(data['data'], data['target'])
assert len(conn.experiments().create.mock_calls) == 1
create_definition = conn.experiments().create.call_args[1]
assert create_definition['name'] == GradientBoostingClassifier_EXPERIMENT_DEF['name']
assert len(create_definition['parameters']) == len(GradientBoostingClassifier_EXPERIMENT_DEF['parameters'])
for p in GradientBoostingClassifier_EXPERIMENT_DEF['parameters']:
assert p in create_definition['parameters']
assert len(conn.experiments().best_assignments().fetch.mock_calls) == 1
assert len(conn.experiments().suggestions().create.mock_calls) == n_iter * folds
assert len(conn.experiments().observations().create.mock_calls) == n_iter * folds
assert cv.best_params_ == zero_corner(GradientBoostingClassifier_EXPERIMENT_DEF)