def ada_boost_classifier_err(self, data, target, learning_rate=1, n_estimators=400, show_score=False):
    ada_boost = AdaBoostClassifier(
        base_estimator=self.clf,  # renamed to `estimator` in scikit-learn >= 1.2
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        algorithm="SAMME.R")
    ada_boost.fit(data, target)
    score = ada_boost.score(data, target)
    if show_score:
        print("Fitness score: " + str(score))
    return 1.0 - score
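
A hedged, standalone sketch of the same training-error computation, with a depth-1 decision tree standing in for whatever self.clf holds (an assumption, not the original class's choice):

from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=200, random_state=0)
ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),  # stand-in base estimator
                         n_estimators=400, learning_rate=1.0)
ada.fit(X, y)
print("training error:", 1.0 - ada.score(X, y))  # same quantity the method returns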
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import BernoulliRBM
from sklearn.pipeline import Pipeline

def various_classifiers():
    # List of (classifier, parameter grid) tuples.
    clf_list = []
    clf_linearsvm = LinearSVC()
    params_linearsvm = {"C": [0.5, 1, 5, 10, 100, 10**10], "tol": [0.1, 1e-10], "class_weight": ['balanced']}
    clf_list.append((clf_linearsvm, params_linearsvm))
    clf_tree = DecisionTreeClassifier()
    params_tree = {"min_samples_split": [2, 5, 10, 20], "criterion": ('gini', 'entropy')}
    clf_list.append((clf_tree, params_tree))
    clf_random_tree = RandomForestClassifier()
    params_random_tree = {"n_estimators": [2, 3, 5], "criterion": ('gini', 'entropy')}
    clf_list.append((clf_random_tree, params_random_tree))
    clf_adaboost = AdaBoostClassifier()
    params_adaboost = {"n_estimators": [20, 30, 50, 100]}
    clf_list.append((clf_adaboost, params_adaboost))
    clf_knn = KNeighborsClassifier()
    params_knn = {"n_neighbors": [2, 5], "p": [2, 3]}
    clf_list.append((clf_knn, params_knn))
    clf_log = LogisticRegression()
    params_log = {"C": [0.5, 1, 10, 10**2, 10**10, 10**20], "tol": [0.1, 1e-5, 1e-10], "class_weight": ['balanced']}
    clf_list.append((clf_log, params_log))
    clf_lda = LinearDiscriminantAnalysis()
    params_lda = {"n_components": [0, 1, 2, 5, 10]}
    clf_list.append((clf_lda, params_lda))
    logistic = LogisticRegression()
    rbm = BernoulliRBM()
    clf_rbm = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
    params_rbm = {"logistic__tol": [1e-10, 10**-20], "logistic__C": [0.05, 1, 10, 10**2, 10**10, 10**20], "logistic__class_weight": ['balanced'], "rbm__n_components": [2, 3, 4]}
    clf_list.append((clf_rbm, params_rbm))
    return clf_list
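
A hedged sketch of how such a (classifier, grid) list is typically consumed: GridSearchCV over each pair, keeping the best estimator. `features`, `labels`, and `best_classifier` are placeholder names, not taken from the original project:

from sklearn.model_selection import GridSearchCV

def best_classifier(clf_list, features, labels):
    best_score, best_clf = -1.0, None
    for clf, params in clf_list:
        grid = GridSearchCV(clf, params, cv=3)  # exhaustive search over each grid
        grid.fit(features, labels)
        if grid.best_score_ > best_score:
            best_score, best_clf = grid.best_score_, grid.best_estimator_
    return best_clf, best_score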
Source: User_Interface.py (project: yttresearch-machine-learning-algorithms-analysis, author: gdemos01)
def exportPresentationData(classifier, action):
    dir = input('Give Data Directory: ')
    classifier = int(classifier)
    if classifier == 1:
        clf = GradientBoostingClassifier()
        classify(dir, clf, action)
    elif classifier == 2:
        clf = LogisticRegression()
        classify(dir, clf, action)
    elif classifier == 3:
        clf = KNeighborsClassifier(n_neighbors=5)
        classify(dir, clf, action)
    elif classifier == 4:
        clf = DecisionTreeClassifier()
        classify(dir, clf, action)
    elif classifier == 5:
        clf = svm.LinearSVC()
        classify_type2(dir, clf, action)
    elif classifier == 6:
        clf = RandomForestClassifier()
        classify(dir, clf, action)
    elif classifier == 7:
        clf = ExtraTreesClassifier()
        classify(dir, clf, action)
    elif classifier == 8:
        clf = IsolationForest()
        classify_type2(dir, clf, action)
    elif classifier == 9:
        clf = AdaBoostClassifier(n_estimators=100)
        classify(dir, clf, action)
    elif classifier == 10:
        clf = BaggingClassifier(DecisionTreeClassifier())
        classify(dir, clf, action)
    elif classifier == 11:
        clf1 = GradientBoostingClassifier()
        clf2 = AdaBoostClassifier()
        # labels matched to the estimators: clf1 is the gradient-boosting model, clf2 the AdaBoost one
        clf = VotingClassifier(estimators=[('gbdt', clf1), ('abdt', clf2)], voting='soft')
        classify(dir, clf, action)
Source: Exporter.py (project: yttresearch-machine-learning-algorithms-analysis, author: gdemos01)
def exportPresentationData(classifier, action, dir):
    classifier = int(classifier)
    if classifier == 1:
        clf = GradientBoostingClassifier()
        classify(dir, clf, action)
    elif classifier == 2:
        clf = LogisticRegression()
        classify(dir, clf, action)
    elif classifier == 3:
        clf = KNeighborsClassifier(n_neighbors=5)
        classify(dir, clf, action)
    elif classifier == 4:
        clf = DecisionTreeClassifier()
        classify(dir, clf, action)
    elif classifier == 5:
        clf = svm.LinearSVC()
        classify_type2(dir, clf, action)
    elif classifier == 6:
        clf = RandomForestClassifier()
        classify(dir, clf, action)
    elif classifier == 7:
        clf = ExtraTreesClassifier()
        classify(dir, clf, action)
    elif classifier == 8:
        clf = IsolationForest()
        classify_type2(dir, clf, action)
    elif classifier == 9:
        clf = AdaBoostClassifier(n_estimators=100)
        classify(dir, clf, action)
    elif classifier == 10:
        clf = BaggingClassifier(DecisionTreeClassifier())
        classify(dir, clf, action)
    elif classifier == 11:
        clf1 = GradientBoostingClassifier()
        clf2 = AdaBoostClassifier()
        # labels matched to the estimators: clf1 is the gradient-boosting model, clf2 the AdaBoost one
        clf = VotingClassifier(estimators=[('gbdt', clf1), ('abdt', clf2)], voting='soft')
        classify(dir, clf, action)
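
A hedged refactor sketch of the dispatch above: the same mapping expressed as a table, so adding a classifier is one entry rather than another elif branch. classify and classify_type2 are the project's own helpers; export_presentation_data is an illustrative name, and only a few entries are shown:

# Illustrative dispatch table; remaining entries follow the same pattern.
CLASSIFIERS = {
    1: (GradientBoostingClassifier, classify),
    5: (svm.LinearSVC, classify_type2),
    9: (lambda: AdaBoostClassifier(n_estimators=100), classify),
}

def export_presentation_data(classifier, action, dir):
    factory, handler = CLASSIFIERS[int(classifier)]
    handler(dir, factory(), action)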
def define_clfs_params(self):
    '''
    Defines all relevant parameters and classes for classifier objects.
    Edit these if you wish to change parameters.
    '''
    # These are the classifiers
    self.clfs = {
        'RF': RandomForestClassifier(n_estimators=50, n_jobs=-1),
        'ET': ExtraTreesClassifier(n_estimators=10, n_jobs=-1, criterion='entropy'),
        # max_depth must be a single value here; a grid of depths belongs in self.params
        'AB': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=200),
        'LR': LogisticRegression(penalty='l1', C=1e5),
        'SVM': svm.SVC(kernel='linear', probability=True, random_state=0),
        'GB': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=10),
        'NB': GaussianNB(),
        'DT': DecisionTreeClassifier(),
        'SGD': SGDClassifier(loss='log', penalty='l2'),
        'KNN': KNeighborsClassifier(n_neighbors=3)
    }
    # These are the parameter grids which will be run through
    self.params = {
        'RF': {'n_estimators': [1, 10, 100, 1000], 'max_depth': [10, 15, 20, 30, 40, 50, 60, 70, 100], 'max_features': ['sqrt', 'log2'], 'min_samples_split': [2, 5, 10], 'random_state': [1]},
        'LR': {'penalty': ['l1', 'l2'], 'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10], 'random_state': [1]},
        'SGD': {'loss': ['log'], 'penalty': ['l2', 'l1', 'elasticnet'], 'random_state': [1]},
        'ET': {'n_estimators': [1, 10, 100, 1000], 'criterion': ['gini', 'entropy'], 'max_depth': [1, 3, 5, 10, 15], 'max_features': ['sqrt', 'log2'], 'min_samples_split': [2, 5, 10], 'random_state': [1]},
        'AB': {'algorithm': ['SAMME', 'SAMME.R'], 'n_estimators': [1, 10, 100, 1000], 'random_state': [1]},
        'GB': {'n_estimators': [1, 10, 100, 1000], 'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.5], 'subsample': [0.1, 0.5, 1.0], 'max_depth': [1, 3, 5, 10, 20, 50, 100], 'random_state': [1]},
        'NB': {},
        'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [1, 2, 15, 20, 30, 40, 50], 'max_features': ['sqrt', 'log2'], 'min_samples_split': [2, 5, 10], 'random_state': [1]},
        'SVM': {'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10], 'kernel': ['linear'], 'random_state': [1]},
        'KNN': {'n_neighbors': [1, 5, 10, 25, 50, 100], 'weights': ['uniform', 'distance'], 'algorithm': ['auto', 'ball_tree', 'kd_tree']}
    }
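
A hedged sketch of the grid loop these two dictionaries feed. clf_loop, the cv value, and the X/y placeholders are assumptions, not taken from the original class:

from sklearn.model_selection import ParameterGrid, cross_val_score

def clf_loop(clfs, grids, X, y, models=('RF', 'LR', 'DT')):
    # clfs and grids are the two dictionaries built above
    for key in models:
        clf = clfs[key]
        for params in ParameterGrid(grids[key]):  # every combination in the grid
            clf.set_params(**params)
            scores = cross_val_score(clf, X, y, cv=3)
            print(key, params, scores.mean())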
def runner(i):
    sem.acquire()
    print("learn begin %s" % i)
    clf = ensemble.AdaBoostClassifier(naive_bayes.GaussianNB())
    clf = clf.fit(traindata, trainlabel[i])
    svms.append((i, clf))
    result[i] = clf.predict_proba(testdata)
    dbresult[i] = clf.predict_proba(dbdata)
    # print("label %s done\n%s"
    #       % (i, metrics.classification_report(testlabel[i], result[i])))
    # print(metrics.confusion_matrix(testlabel[i], result))
    sem.release()
def runner(i):
    sem.acquire()
    print("learn begin %s" % i)
    # LinearSVC has no predict_proba, so the discrete SAMME algorithm is
    # required; the default SAMME.R would raise at fit time.
    clf = ensemble.AdaBoostClassifier(svm.LinearSVC(), algorithm="SAMME")
    clf = clf.fit(traindata, trainlabel[i])
    svms.append((i, clf))
    result[i] = clf.predict_proba(testdata)
    dbresult[i] = clf.predict_proba(dbdata)
    # print("label %s done\n%s"
    #       % (i, metrics.classification_report(testlabel[i], result[i])))
    # print(metrics.confusion_matrix(testlabel[i], result))
    sem.release()
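
Both runner variants lean on module-level globals (sem, traindata, trainlabel, result, and so on). A hedged sketch of the harness they imply, with placeholder data standing in for the real globals:

import threading
import numpy as np
from sklearn import ensemble, svm  # module names the runner above expects

# Placeholder stand-ins for the snippet's globals; labels are made learnable
# so AdaBoost's fit does not abort as "worse than random" on pure noise.
traindata = np.random.rand(100, 5)
trainlabel = [(traindata[:, k] > 0.5).astype(int) for k in range(3)]
testdata = np.random.rand(20, 5)
dbdata = np.random.rand(10, 5)
svms, result, dbresult = [], {}, {}
sem = threading.Semaphore(2)  # at most two concurrent fits

threads = [threading.Thread(target=runner, args=(i,)) for i in range(len(trainlabel))]
for t in threads:
    t.start()
for t in threads:
    t.join()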
def __init__(self, genres, data, type='knn', name='', clf_kwargs=None):
    self.logger = get_logger('classifier')
    self.display_name = name
    self.genres = genres
    self.m_genres = {genre: i for i, genre in enumerate(genres)}
    self.randstate = np.random.RandomState()
    self.scaler = StandardScaler()
    clf_kwargs = {} if not clf_kwargs else clf_kwargs
    if type in ['svm', 'mlp']:
        clf_kwargs['random_state'] = self.randstate
    if type == 'knn':
        self.proto_clf = KNeighborsClassifier(**clf_kwargs)
    elif type == 'svm':
        self.proto_clf = SVC(**clf_kwargs)
    elif type == 'dtree':
        self.proto_clf = DecisionTreeClassifier(**clf_kwargs)
    elif type == 'gnb':
        self.proto_clf = GaussianNB(**clf_kwargs)
    elif type == 'perc':
        self.proto_clf = Perceptron(**clf_kwargs)
    elif type == 'mlp':
        self.proto_clf = MLPClassifier(**clf_kwargs)
    elif type == 'ada':
        self.proto_clf = AdaBoostClassifier(**clf_kwargs)
    else:
        raise LookupError('Classifier type "{}" is invalid'.format(type))
    self._convert_data(data)
    self.logger.info('Classifier: {} (params={})'.format(
        self.proto_clf.__class__.__name__,
        clf_kwargs
    ))
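
Hypothetical usage of the factory above. The enclosing class name and the shape _convert_data expects are assumptions here, not taken from the original project:

# All names below are illustrative; the real class and data layout live
# in the original project.
clf = Classifier(genres=['rock', 'jazz', 'blues'],
                 data=training_examples,  # whatever _convert_data expects
                 type='ada',
                 name='adaboost-demo',
                 clf_kwargs={'n_estimators': 50})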
def train_model(text_matrix, categories):
    # model = AdaBoostClassifier(
    #     DecisionTreeClassifier(max_depth=3),
    #     n_estimators=500,
    #     algorithm="SAMME")
    model = RandomForestClassifier(n_estimators=100, max_depth=8)
    model.fit(text_matrix, categories)
    return model
def set_adaboost_classifier(self):
    return SkLearner(ensemble.AdaBoostClassifier())
def getModels():
    return [
        "LinearRegression",
        "BayesianRidge",
        "ARDRegression",
        "ElasticNet",
        "HuberRegressor",
        "Lasso",
        "LassoLars",
        "Ridge",
        "SGDRegressor",
        "SVR",
        "MLPClassifier",
        "KNeighborsClassifier",
        "SVC",
        "GaussianProcessClassifier",
        "DecisionTreeClassifier",
        "RandomForestClassifier",
        "AdaBoostClassifier",
        "GaussianNB",
        "LogisticRegression",
        "QuadraticDiscriminantAnalysis",
    ]
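
A hedged sketch of one way these name strings get resolved into estimator instances: a plain registry dict. Only three entries are shown, and build_model is an illustrative helper, not from the original file:

from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression

MODEL_REGISTRY = {
    "AdaBoostClassifier": AdaBoostClassifier,
    "RandomForestClassifier": RandomForestClassifier,
    "LogisticRegression": LogisticRegression,
}

def build_model(name, **kwargs):
    # look up the class by its string name and instantiate it
    return MODEL_REGISTRY[name](**kwargs)

clf = build_model("AdaBoostClassifier", n_estimators=100)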
def test_AdaBoostClassifier_base_classifier(*data):
    '''
    Test AdaBoost classifiers built on different base estimators.
    :param data: train_data, test_data, train_value, test_value
    :return: None
    '''
    from sklearn.naive_bayes import GaussianNB
    X_train, X_test, y_train, y_test = data
    fig = plt.figure()
    ax = fig.add_subplot(2, 1, 1)
    clf = ensemble.AdaBoostClassifier(learning_rate=0.1)
    clf.fit(X_train, y_train)
    ## graph
    estimators_num = len(clf.estimators_)
    X = range(1, estimators_num + 1)
    ax.plot(list(X), list(clf.staged_score(X_train, y_train)), label="Training score")
    ax.plot(list(X), list(clf.staged_score(X_test, y_test)), label="Testing score")
    ax.set_xlabel("estimator num")
    ax.set_ylabel("score")
    ax.legend(loc="lower right")
    ax.set_ylim(0, 1)
    ax.set_title("AdaBoostClassifier with Decision Tree")
    ax = fig.add_subplot(2, 1, 2)
    clf = ensemble.AdaBoostClassifier(learning_rate=0.1, base_estimator=GaussianNB())
    clf.fit(X_train, y_train)
    ## graph
    estimators_num = len(clf.estimators_)
    X = range(1, estimators_num + 1)
    ax.plot(list(X), list(clf.staged_score(X_train, y_train)), label="Training score")
    ax.plot(list(X), list(clf.staged_score(X_test, y_test)), label="Testing score")
    ax.set_xlabel("estimator num")
    ax.set_ylabel("score")
    ax.legend(loc="lower right")
    ax.set_ylim(0, 1)
    ax.set_title("AdaBoostClassifier with Gaussian Naive Bayes")
    plt.show()
def test_AdaBoostClassifier_algorithm(*data):
    '''
    Test performance with different boosting algorithms.
    :param data: train_data, test_data, train_value, test_value
    :return: None
    '''
    X_train, X_test, y_train, y_test = data
    algorithms = ['SAMME.R', 'SAMME']
    fig = plt.figure()
    learning_rates = [0.05, 0.1, 0.5, 0.9]
    for i, learning_rate in enumerate(learning_rates):
        ax = fig.add_subplot(2, 2, i + 1)
        # iterate over the algorithm directly so the subplot index i is not shadowed
        for algorithm in algorithms:
            clf = ensemble.AdaBoostClassifier(learning_rate=learning_rate,
                                              algorithm=algorithm)
            clf.fit(X_train, y_train)
            ## graph
            estimators_num = len(clf.estimators_)
            X = range(1, estimators_num + 1)
            ax.plot(list(X), list(clf.staged_score(X_train, y_train)),
                    label="%s:Training score" % algorithm)
            ax.plot(list(X), list(clf.staged_score(X_test, y_test)),
                    label="%s:Testing score" % algorithm)
        ax.set_xlabel("estimator num")
        ax.set_ylabel("score")
        ax.legend(loc="lower right")
        ax.set_title("learning rate:%f" % learning_rate)
    fig.suptitle("AdaBoostClassifier")
    plt.show()
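
A hedged driver for the two plotting tests above, building the four-tuple their docstrings describe from the iris data. The split ratio and random_state are arbitrary choices here, and matplotlib.pyplot as plt plus sklearn ensemble are assumed imported as in the snippets' source module:

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

iris = load_iris()
# returns X_train, X_test, y_train, y_test, matching the functions' unpacking
data = train_test_split(iris.data, iris.target, test_size=0.25, random_state=0)
test_AdaBoostClassifier_base_classifier(*data)
test_AdaBoostClassifier_algorithm(*data)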
def ensemble_classify():
    label_list = get_labels()
    tweet_list = get_labelled_tweets()
    # vectorise using tf-idf
    vectoriser = TfidfVectorizer(min_df=3,
                                 max_features=None,
                                 strip_accents='unicode',
                                 analyzer='word',
                                 token_pattern=r'\w{1,}',
                                 ngram_range=(1, 2),
                                 use_idf=1,
                                 smooth_idf=1,
                                 sublinear_tf=1)
    ## do transformation into vector
    vectoriser.fit(tweet_list)
    vectorised_tweet_list = vectoriser.transform(tweet_list)
    train_vector, test_vector, train_labels, test_labels = train_test_split(vectorised_tweet_list,
                                                                            label_list,
                                                                            test_size=0.8,
                                                                            random_state=42)
    n_estimators = 10  # number of weak learners
    model = AdaBoostClassifier(n_estimators=n_estimators)
    ada_classifier = model.fit(train_vector, train_labels)
    result = ada_classifier.predict(test_vector)
    # output result to csv
    create_directory('data')
    result.tofile("data/tfidf_ada.csv", sep=',')
    save_model(ada_classifier, 'tfidf_ada')
    # evaluation
    binarise_result = label_binarize(result, classes=class_list)
    binarise_labels = label_binarize(test_labels, classes=class_list)
    generate_eval_metrics(binarise_result, 'tfidf_ada', binarise_labels)
def test_classification_toy():
    # Check classification on a toy dataset.
    for alg in ['SAMME', 'SAMME.R']:
        clf = AdaBoostClassifier(algorithm=alg, random_state=0)
        clf.fit(X, y_class)
        assert_array_equal(clf.predict(T), y_t_class)
        assert_array_equal(np.unique(np.asarray(y_t_class)), clf.classes_)
        assert_equal(clf.predict_proba(T).shape, (len(T), 2))
        assert_equal(clf.decision_function(T).shape, (len(T),))
def test_iris():
    # Check consistency on dataset iris.
    classes = np.unique(iris.target)
    clf_samme = prob_samme = None
    for alg in ['SAMME', 'SAMME.R']:
        clf = AdaBoostClassifier(algorithm=alg)
        clf.fit(iris.data, iris.target)
        assert_array_equal(classes, clf.classes_)
        proba = clf.predict_proba(iris.data)
        if alg == "SAMME":
            clf_samme = clf
            prob_samme = proba
        assert_equal(proba.shape[1], len(classes))
        assert_equal(clf.decision_function(iris.data).shape[1], len(classes))
        score = clf.score(iris.data, iris.target)
        assert score > 0.9, "Failed with algorithm %s and score = %f" % (alg, score)
    # Somewhat hacky regression test: prior to
    # ae7adc880d624615a34bafdb1d75ef67051b8200,
    # predict_proba returned SAMME.R values for SAMME.
    clf_samme.algorithm = "SAMME.R"
    assert_array_less(0,
                      np.abs(clf_samme.predict_proba(iris.data) - prob_samme))
def test_pickle():
    # Check picklability.
    import pickle
    # AdaBoost classifier
    for alg in ['SAMME', 'SAMME.R']:
        obj = AdaBoostClassifier(algorithm=alg)
        obj.fit(iris.data, iris.target)
        score = obj.score(iris.data, iris.target)
        s = pickle.dumps(obj)
        obj2 = pickle.loads(s)
        assert_equal(type(obj2), obj.__class__)
        score2 = obj2.score(iris.data, iris.target)
        assert_equal(score, score2)
    # AdaBoost regressor
    obj = AdaBoostRegressor(random_state=0)
    obj.fit(boston.data, boston.target)
    score = obj.score(boston.data, boston.target)
    s = pickle.dumps(obj)
    obj2 = pickle.loads(s)
    assert_equal(type(obj2), obj.__class__)
    score2 = obj2.score(boston.data, boston.target)
    assert_equal(score, score2)
def test_error():
    # Test that it gives proper exception on deficient input.
    assert_raises(ValueError,
                  AdaBoostClassifier(learning_rate=-1).fit,
                  X, y_class)
    assert_raises(ValueError,
                  AdaBoostClassifier(algorithm="foo").fit,
                  X, y_class)
    assert_raises(ValueError,
                  AdaBoostClassifier().fit,
                  X, y_class, sample_weight=np.asarray([-1]))
def test_base_estimator():
    # Test different base estimators.
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.svm import SVC
    # XXX doesn't work with y_class because RF doesn't support classes_
    # Shouldn't AdaBoost run a LabelBinarizer?
    clf = AdaBoostClassifier(RandomForestClassifier())
    clf.fit(X, y_regr)
    clf = AdaBoostClassifier(SVC(), algorithm="SAMME")
    clf.fit(X, y_class)
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.svm import SVR
    clf = AdaBoostRegressor(RandomForestRegressor(), random_state=0)
    clf.fit(X, y_regr)
    clf = AdaBoostRegressor(SVR(), random_state=0)
    clf.fit(X, y_regr)
    # Check that an empty discrete ensemble fails in fit, not predict.
    X_fail = [[1, 1], [1, 1], [1, 1], [1, 1]]
    y_fail = ["foo", "bar", 1, 2]
    clf = AdaBoostClassifier(SVC(), algorithm="SAMME")
    assert_raises_regexp(ValueError, "worse than random",
                         clf.fit, X_fail, y_fail)
def test_sample_weight_missing():
    from sklearn.cluster import KMeans
    clf = AdaBoostClassifier(KMeans(), algorithm="SAMME")
    assert_raises(ValueError, clf.fit, X, y_regr)
    clf = AdaBoostRegressor(KMeans())
    assert_raises(ValueError, clf.fit, X, y_regr)
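
The failure these assertions expect comes from AdaBoost's requirement that the base estimator's fit accept sample_weight, which KMeans did not in the scikit-learn version this test targets. A hedged sketch of the underlying check, using a real scikit-learn utility (whether KMeans passes it is version-dependent):

from sklearn.utils.validation import has_fit_parameter
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier

print(has_fit_parameter(KMeans(), "sample_weight"))                   # version-dependent
print(has_fit_parameter(DecisionTreeClassifier(), "sample_weight"))  # True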