from sklearn import datasets
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

def trained_models():
    # Fit a set of baseline classifiers on the breast-cancer dataset.
    dataset = datasets.load_breast_cancer()
    X = dataset.data
    y = dataset.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=12345)
    rf = RandomForestClassifier()
    rf.fit(X_train, y_train)
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    svc_w_linear_kernel = SVC(kernel='linear')
    svc_w_linear_kernel.fit(X_train, y_train)
    svc_wo_linear_kernel = SVC()
    svc_wo_linear_kernel.fit(X_train, y_train)
    dummy = DummyClassifier()
    dummy.fit(X_train, y_train)
    return {'RF': rf, 'LR': lr, 'SVC_w_linear_kernel': svc_w_linear_kernel,
            'Dummy': dummy, 'SVC_wo_linear_kernel': svc_wo_linear_kernel}
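A minimal usage sketch follows; because it re-creates the split with the same random_state=12345, the hold-out set matches the one used inside trained_models() (the evaluation loop itself is an illustrative assumption):

models = trained_models()
X, y = datasets.load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=12345)
for name, model in models.items():
    # score() reports mean accuracy on the identical hold-out split
    print(name, model.score(X_test, y_test))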
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_selection import GenericUnivariateSelect, RFECV, SelectFromModel

def get_feature_selection_model_from_name(type_of_estimator, model_name):
    # 'KeepAll' is a sentinel string handled by the caller, not an estimator.
    model_map = {
        'classifier': {
            'SelectFromModel': SelectFromModel(RandomForestClassifier(n_jobs=-1, max_depth=10, n_estimators=15), threshold='20*mean'),
            'RFECV': RFECV(estimator=RandomForestClassifier(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect': GenericUnivariateSelect(),
            'KeepAll': 'KeepAll'
        },
        'regressor': {
            'SelectFromModel': SelectFromModel(RandomForestRegressor(n_jobs=-1, max_depth=10, n_estimators=15), threshold='0.7*mean'),
            'RFECV': RFECV(estimator=RandomForestRegressor(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect': GenericUnivariateSelect(),
            'KeepAll': 'KeepAll'
        }
    }
    return model_map[type_of_estimator][model_name]
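A hedged usage sketch on synthetic data (the dataset below is an assumption, purely for illustration):

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, n_features=20, random_state=0)
selector = get_feature_selection_model_from_name('classifier', 'RFECV')
# RFECV picks the feature count by cross-validation, then transforms X
X_reduced = selector.fit_transform(X, y)
print(X_reduced.shape)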
def get_feature_importance(self, clf, model_name):
    # Map each supported model to the attribute that exposes feature weights.
    clfs = {'RandomForestClassifier': 'feature_importances',
            'ExtraTreesClassifier': 'feature_importances',
            'AdaBoostClassifier': 'feature_importances',
            'LogisticRegression': 'coef',
            'svm.SVC': 'coef',
            'GradientBoostingClassifier': 'feature_importances',
            'GaussianNB': None,
            'DecisionTreeClassifier': 'feature_importances',
            'SGDClassifier': 'coef',
            'KNeighborsClassifier': None,
            'linear.SVC': 'coef'}
    if clfs[model_name] == 'feature_importances':
        return list(clf.feature_importances_)
    elif clfs[model_name] == 'coef':
        return clf.coef_.tolist()
    else:
        return None
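A sketch of how this method would be called; `pipeline` stands in for an instance of the surrounding class, and X_train/y_train are assumed data arrays:

rf = RandomForestClassifier(n_estimators=50).fit(X_train, y_train)
# returns one weight per input feature, as a plain list
importances = pipeline.get_feature_importance(rf, 'RandomForestClassifier')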
def test_improvement(self):
    np.random.seed(4)
    data, target = make_classification(n_samples=100,
                                       n_features=45,
                                       n_informative=15,
                                       n_redundant=5,
                                       class_sep=1,
                                       n_clusters_per_class=4,
                                       flip_y=0.4)
    model = RandomForestClassifier(max_depth=5)
    model.fit(data, target)
    start_score = clf_score(target, model.predict(data))
    p1 = Parameter('max_depth', 'integer', lower=1, upper=10)
    hyperopt = HyperoptOptimizer(model, [p1], clf_score)
    best_params, best_model = hyperopt.fit(X_train=data, y_train=target, n_iters=10)
    best_model.fit(data, target)
    final_score = clf_score(target, best_model.predict(data))
    self.assertTrue(final_score > start_score)
    for status in hyperopt.trials.statuses():
        self.assertEqual(status, 'ok')
def get_feature_selection_model_from_name(type_of_estimator, model_name):
    # Note: RandomizedLogisticRegression / RandomizedLasso were removed from
    # scikit-learn (0.21+); this variant requires an older release.
    model_map = {
        'classifier': {
            'SelectFromModel': SelectFromModel(RandomForestClassifier(n_jobs=-1, max_depth=10, n_estimators=15), threshold='20*mean'),
            'RFECV': RFECV(estimator=RandomForestClassifier(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect': GenericUnivariateSelect(),
            'RandomizedSparse': RandomizedLogisticRegression(),
            'KeepAll': 'KeepAll'
        },
        'regressor': {
            'SelectFromModel': SelectFromModel(RandomForestRegressor(n_jobs=-1, max_depth=10, n_estimators=15), threshold='0.7*mean'),
            'RFECV': RFECV(estimator=RandomForestRegressor(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect': GenericUnivariateSelect(),
            'RandomizedSparse': RandomizedLasso(),
            'KeepAll': 'KeepAll'
        }
    }
    return model_map[type_of_estimator][model_name]
def get_classifier_class(class_name):
    name_table = {
        'svm': SVC,
        'k_neighbors': KNeighborsClassifier,
        'gaussian_process': GaussianProcessClassifier,
        'decision_tree': DecisionTreeClassifier,
        'random_forest': RandomForestClassifier,
        'ada_boost': AdaBoostClassifier,
        'mlp': MLPClassifier,
        'gaussian_naive_bayes': GaussianNB,
        'quadratic_discriminant_analysis': QuadraticDiscriminantAnalysis
    }
    if class_name not in name_table:
        raise ValueError('No such classifier: {}'.format(class_name))
    return name_table[class_name]
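Usage sketch: resolve the name to a class, then instantiate and fit as usual (the training arrays are assumptions):

clf_cls = get_classifier_class('random_forest')
clf = clf_cls(n_estimators=100)  # any constructor kwargs of the resolved class
clf.fit(X_train, y_train)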
classify.py (project: oss-github-analysis-project, author: itu-oss-project-team)
def __create_classifiers(self):
    classifiers = list()
    # loss="log" selects logistic regression; newer scikit-learn (1.1+) spells it "log_loss".
    classifiers.append({"func": linear_model.SGDClassifier(loss="log"),
                        "name": "sgd"})
    classifiers.append({"func": neighbors.KNeighborsClassifier(1, weights='distance'),
                        "name": "knn1"})
    classifiers.append({"func": neighbors.KNeighborsClassifier(3, weights='distance'),
                        "name": "knn3"})
    classifiers.append({"func": neighbors.KNeighborsClassifier(5, weights='distance'),
                        "name": "knn5"})
    classifiers.append({"func": GaussianNB(),
                        "name": "naive_bayes"})
    # classifiers.append({"func": tree.DecisionTreeClassifier(), "name": "decision_tree"})
    # classifiers.append({"func": MLPClassifier(max_iter=10000), "name": "mlp"})
    # classifiers.append({"func": RandomForestClassifier(), "name": "random_forest"})
    return classifiers
def define_model(self, model, parameters, n_cores=0):
    # NB: n_cores is accepted but unused here; n_jobs is hard-coded per estimator.
    clfs = {'RandomForestClassifier': RandomForestClassifier(n_estimators=50, n_jobs=7),
            'ExtraTreesClassifier': ExtraTreesClassifier(n_estimators=10, n_jobs=7, criterion='entropy'),
            'AdaBoostClassifier': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=200),
            'LogisticRegression': LogisticRegression(penalty='l1', C=1e5),
            'svm.SVC': svm.SVC(kernel='linear', probability=True, random_state=0),
            'GradientBoostingClassifier': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=10),
            'GaussianNB': GaussianNB(),
            'DecisionTreeClassifier': DecisionTreeClassifier(),
            'SGDClassifier': SGDClassifier(loss="hinge", penalty="l2", n_jobs=7),
            'KNeighborsClassifier': KNeighborsClassifier(n_neighbors=3),
            'linear.SVC': svm.LinearSVC()}
    if model not in clfs:
        raise ConfigError("Unsupported model {}".format(model))
    clf = clfs[model]
    clf.set_params(**parameters)
    return clf
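A hypothetical call, with `experiment` standing in for an instance of the surrounding class; the parameter dict is applied on top of the defaults via set_params():

clf = experiment.define_model('RandomForestClassifier',
                              {'n_estimators': 100, 'max_depth': 10})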
def do_ml(ticker):
    X, y, df = extract_featuresets(ticker)
    # sklearn.cross_validation was removed in scikit-learn 0.20; on modern
    # versions import train_test_split from sklearn.model_selection instead.
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X,
                                                                         y,
                                                                         test_size=0.25)
    #clf = neighbors.KNeighborsClassifier()
    clf = VotingClassifier([('lsvc', svm.LinearSVC()),
                            ('knn', neighbors.KNeighborsClassifier()),
                            ('rfor', RandomForestClassifier())])
    clf.fit(X_train, y_train)
    confidence = clf.score(X_test, y_test)
    print('accuracy:', confidence)
    predictions = clf.predict(X_test)
    print('predicted class counts:', Counter(predictions))
    print()
    return confidence
# examples of running:
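One hedged invocation (the ticker symbol is an assumption; extract_featuresets must be defined in the surrounding module):

confidence = do_ml('AAPL')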
def run_forests():
    print('random forest: \n')
    params = []
    scores = []
    for _ in range(5):
        max_features = np.random.randint(400, 800)
        # None is repeated so an unbounded depth is four times as likely.
        max_depth = np.random.choice([None, None, None, None, 30, 40, 60])
        forest = RandomForestClassifier(n_estimators=50,
                                        max_features=max_features,
                                        max_depth=max_depth)
        forest_fit = forest.fit(X_train, Y_train)
        pred = forest_fit.predict(X_test)
        print('\n params:', dict(max_features=max_features, max_depth=max_depth))
        print('forest train: ', zero_one_score(Y_train, forest_fit.predict(X_train)), ' test: ',
              zero_one_score(Y_test, pred))
        params.append((max_features, max_depth))
        scores.append(zero_one_score(Y_test, pred))
    # zero_one_score is an accuracy-style score, so the best run maximizes it.
    print('best:', params[np.argmax(scores)])
from sklearn.ensemble import RandomForestClassifier as RFC

def train_clf(x_train, y_train, best_depth):
    """Train classifier.

    Parameters
    ----------
    x_train : np.array [n_samples, n_features]
        Training features.
    y_train : np.array [n_samples]
        Training labels.
    best_depth : int
        Optimal max_depth parameter.

    Returns
    -------
    clf : classifier
        Trained scikit-learn classifier.
    """
    # class_weight='auto' was removed from scikit-learn; 'balanced' is the
    # modern equivalent.
    clf = RFC(n_estimators=100, max_depth=best_depth, n_jobs=-1,
              class_weight='balanced', max_features=None)
    clf = clf.fit(x_train, y_train)
    return clf
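A minimal sketch on random data, assuming the RFC alias imported above (shapes and labels are arbitrary):

import numpy as np

x = np.random.rand(100, 5)
y = np.random.randint(0, 2, 100)
clf = train_clf(x, y, best_depth=8)
print(clf.predict(x[:3]))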
def __init__(
        self, data_block, predictors=[], cv_folds=10,
        scoring_metric='accuracy', additional_display_metrics=[]):
    base_classification.__init__(
        self, alg=RandomForestClassifier(), data_block=data_block,
        predictors=predictors, cv_folds=cv_folds,
        scoring_metric=scoring_metric,
        additional_display_metrics=additional_display_metrics
    )
    self.model_output = pd.Series(self.default_parameters)
    self.model_output['Feature_Importance'] = "-"
    self.model_output['OOB_Score'] = "-"
    # Set parameters to default values:
    self.set_parameters(set_default=True)
def test_improvement(self):
    np.random.seed(4)
    data, target = make_classification(n_samples=100,
                                       n_features=45,
                                       n_informative=15,
                                       n_redundant=5,
                                       class_sep=1,
                                       n_clusters_per_class=4,
                                       flip_y=0.4)
    model = RandomForestClassifier(max_depth=5)
    model.fit(data, target)
    start_score = clf_score(target, model.predict(data))
    p1 = Parameter('max_depth', 'integer', lower=1, upper=10)
    grid_sizes = {'max_depth': 5}
    grid_search = GridSearchOptimizer(model, [p1], clf_score, grid_sizes)
    best_params, best_model = grid_search.fit(X_train=data, y_train=target)
    best_model.fit(data, target)
    final_score = clf_score(target, best_model.predict(data))
    self.assertTrue(final_score > start_score)
def test_objective_function(self):
    np.random.seed(4)
    data, target = make_classification(n_samples=100,
                                       n_features=10,
                                       n_informative=10,
                                       n_redundant=0,
                                       class_sep=100,
                                       n_clusters_per_class=1,
                                       flip_y=0.0)
    model = RandomForestClassifier(max_depth=5)
    model.fit(data, target)
    fun = partial(objective, model,
                  'sklearn',
                  clf_score,
                  data, target, data, target)
    # model should fit the data perfectly
    final_score = fun(model.get_params())[0]
    self.assertEqual(final_score, 1)
def test_expected_improvement_tractable(self):
    np.random.seed(5)
    data, target = make_classification(n_samples=100,
                                       n_features=45,
                                       n_informative=15,
                                       n_redundant=5,
                                       class_sep=1,
                                       n_clusters_per_class=4,
                                       flip_y=0.4)
    model = RandomForestClassifier(max_depth=5)
    model.fit(data, target)
    start_score = clf_score(target, model.predict(data))
    p1 = Parameter('max_depth', 'integer', lower=1, upper=10)
    bayesOpt = BayesianOptimizer(model, [p1], clf_score, method='expected_improvement')
    best_params, best_model = bayesOpt.fit(X_train=data, y_train=target, n_iters=10)
    self.assertTrue(bayesOpt.success)
    best_model.fit(data, target)
    final_score = clf_score(target, best_model.predict(data))
    self.assertTrue(final_score > start_score)
def test_probability_of_improvement_tractable(self):
    np.random.seed(5)
    data, target = make_classification(n_samples=100,
                                       n_features=45,
                                       n_informative=15,
                                       n_redundant=5,
                                       class_sep=1,
                                       n_clusters_per_class=4,
                                       flip_y=0.4)
    model = RandomForestClassifier(max_depth=5)
    model.fit(data, target)
    start_score = clf_score(target, model.predict(data))
    p1 = Parameter('max_depth', 'integer', lower=1, upper=10)
    bayesOpt = BayesianOptimizer(model, [p1], clf_score, method='probability_of_improvement')
    best_params, best_model = bayesOpt.fit(X_train=data, y_train=target, n_iters=10)
    self.assertTrue(bayesOpt.success)
    best_model.fit(data, target)
    final_score = clf_score(target, best_model.predict(data))
    self.assertTrue(final_score > start_score)
def test_upper_confidence_bound_tractable(self):
    np.random.seed(5)
    data, target = make_classification(n_samples=100,
                                       n_features=45,
                                       n_informative=15,
                                       n_redundant=5,
                                       class_sep=1,
                                       n_clusters_per_class=4,
                                       flip_y=0.4)
    model = RandomForestClassifier(max_depth=5)
    model.fit(data, target)
    start_score = clf_score(target, model.predict(data))
    p1 = Parameter('max_depth', 'integer', lower=1, upper=10)
    bayesOpt = BayesianOptimizer(model, [p1], clf_score, method='upper_confidence_bound')
    best_params, best_model = bayesOpt.fit(X_train=data, y_train=target, n_iters=10)
    self.assertTrue(bayesOpt.success)
    best_model.fit(data, target)
    final_score = clf_score(target, best_model.predict(data))
    self.assertTrue(final_score > start_score)
def test_improvement(self):
    np.random.seed(4)
    data, target = make_classification(n_samples=100,
                                       n_features=45,
                                       n_informative=15,
                                       n_redundant=5,
                                       class_sep=1,
                                       n_clusters_per_class=4,
                                       flip_y=0.4)
    model = RandomForestClassifier(max_depth=5)
    model.fit(data, target)
    start_score = clf_score(target, model.predict(data))
    p1 = Parameter('max_depth', 'integer', lower=1, upper=10)
    rand_search = RandomSearchOptimizer(model, [p1], clf_score)
    best_params, best_model = rand_search.fit(X_train=data, y_train=target, n_iters=10)
    best_model.fit(data, target)
    final_score = clf_score(target, best_model.predict(data))
    self.assertTrue(final_score > start_score)
def __init__(self, task: Task, scorer: Scorer, opt_logger: OptimizationLogger = VoidLogger(None)):
    if task.task == "classification":
        space = RandomForestOptimizer.Params.classification_space
        model = ensemble.RandomForestClassifier()
    else:
        space = RandomForestOptimizer.Params.regression_space
        model = ensemble.RandomForestRegressor()
    super().__init__(model, task, space, scorer, opt_logger)
def learns(tests, trains, indep=lambda x: x[:-1],
           dep=lambda x: x[-1],
           rf=Abcd(),
           lg=Abcd(),
           dt=Abcd(),
           nb=Abcd()):
    # NB: the Abcd() defaults are evaluated once at definition time, so repeated
    # calls without explicit arguments accumulate into the same scorer objects.
    x1, y1, x2, y2 = trainTest(tests, trains, indep, dep)
    forest = RandomForestClassifier(n_estimators=50)
    forest = forest.fit(x1, y1)
    for n, got in enumerate(forest.predict(x2)):
        rf(predicted=got, actual=y2[n])
    logreg = linear_model.LogisticRegression(C=1e5)
    logreg.fit(x1, y1)
    for n, got in enumerate(logreg.predict(x2)):
        lg(predicted=got, actual=y2[n])
    bayes = GaussianNB()
    bayes.fit(x1, y1)
    for n, got in enumerate(bayes.predict(x2)):
        nb(predicted=got, actual=y2[n])
    dectree = DecisionTreeClassifier(criterion="entropy",
                                     random_state=1)
    dectree.fit(x1, y1)
    for n, got in enumerate(dectree.predict(x2)):
        dt(predicted=got, actual=y2[n])
def rforest(train, test, tunings=None, smoteit=True, duplicate=True):
    "Random forest classifier: predict which modules are buggy."
    if smoteit:
        # Rebalance the training data before fitting.
        train = SMOTE(train, atleast=50, atmost=101, resample=duplicate)
    if not tunings:
        clf = RandomForestClassifier(n_estimators=100, random_state=1)
    else:
        clf = RandomForestClassifier(n_estimators=int(tunings[0]),
                                     max_features=tunings[1] / 100,
                                     min_samples_leaf=int(tunings[2]),
                                     min_samples_split=int(tunings[3]))
    train_DF = formatData(train)
    test_DF = formatData(test)
    features = train_DF.columns[:-2]
    klass = train_DF[train_DF.columns[-2]]
    clf.fit(train_DF[features], klass)
    preds = clf.predict(test_DF[test_DF.columns[:-2]])
    return preds
def __init__(self,
             estimator=RandomForestClassifier(n_estimators=50,
                                              n_jobs=-1,
                                              max_features=1.,
                                              min_samples_leaf=5,
                                              max_depth=5),
             n_folds=2,
             stratify=True,
             random_state=1):
    self.estimator = estimator
    self.n_folds = n_folds
    self.stratify = stratify
    self.random_state = random_state
    self.__cv = None
    self.__pred = None
    self.__target = None
    self.__fitOK = False
def test_stacked_classfier_extkfold(self):
    bclf = LogisticRegression(random_state=1)
    clfs = [RandomForestClassifier(n_estimators=40, criterion='gini', random_state=1),
            RidgeClassifier(random_state=1),
            ]
    # StratifiedKFold(labels, n_folds) is the pre-0.18 scikit-learn signature.
    sl = StackedClassifier(bclf,
                           clfs,
                           n_folds=3,
                           verbose=0,
                           Kfold=StratifiedKFold(self.iris.target, 3),
                           stack_by_proba=False,
                           oob_score_flag=True,
                           oob_metrics=log_loss)
    sl.fit(self.iris.data, self.iris.target)
    score = sl.score(self.iris.data, self.iris.target)
    self.assertGreater(score, 0.9, "Failed with score = {0}".format(score))
def test_fwls_classfier(self):
    feature_func = lambda x: np.ones(x.shape)
    bclf = LogisticRegression(random_state=1)
    clfs = [RandomForestClassifier(n_estimators=40, criterion='gini', random_state=1),
            RidgeClassifier(random_state=1),
            ]
    sl = FWLSClassifier(bclf,
                        clfs,
                        feature_func=feature_func,
                        n_folds=3,
                        verbose=0,
                        Kfold=StratifiedKFold(self.iris.target, 3),
                        stack_by_proba=False)
    sl.fit(self.iris.data, self.iris.target)
    score = sl.score(self.iris.data, self.iris.target)
    self.assertGreater(score, 0.9, "Failed with score = {0}".format(score))
def test_classifier(self):
    index = [i for i in range(len(self.iris.data))]
    rf = RandomForestClassifier()
    jrf = JoblibedClassifier(rf, "rf", cache_dir='')
    jrf.fit(self.iris.data, self.iris.target, index)
    prediction = jrf.predict(self.iris.data, index)
    score = accuracy_score(self.iris.target, prediction)
    self.assertGreater(score, 0.9, "Failed with score = {0}".format(score))
    rf = RandomForestClassifier(n_estimators=20)
    jrf = JoblibedClassifier(rf, "rf", cache_dir='')
    jrf.fit(self.iris.data, self.iris.target)
    index = [i for i in range(len(self.iris.data))]
    prediction2 = jrf.predict(self.iris.data, index)
    self.assertTrue((prediction == prediction2).all())
def prec_rf(n_trees, X_train, y_train, X_test, y_test):
    """
    Random forest baseline: fit on flattened inputs and report test accuracy.
    """
    from sklearn.ensemble import RandomForestClassifier
    if not issparse(X_train):
        X_train = X_train.reshape((X_train.shape[0], -1))
    if not issparse(X_test):
        X_test = X_test.reshape((X_test.shape[0], -1))
    LOGGER.info('start predict: n_trees={},X_train.shape={},y_train.shape={},X_test.shape={},y_test.shape={}'.format(
        n_trees, X_train.shape, y_train.shape, X_test.shape, y_test.shape))
    clf = RandomForestClassifier(n_estimators=n_trees, max_depth=None, n_jobs=-1, verbose=1)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    prec = float(np.sum(y_pred == y_test)) / len(y_test)
    LOGGER.info('prec_rf{}={:.6f}%'.format(n_trees, prec * 100.0))
    return clf, y_pred
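A usage sketch on the digits dataset (the dataset choice is an assumption; issparse, LOGGER, and np must already be available in the surrounding module):

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

X, y = load_digits(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
clf, y_pred = prec_rf(500, X_tr, y_tr, X_te, y_te)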