def get_feature_importance(self, clf, model_name):
    # Map each supported model to the attribute that exposes its importances.
    clfs = {'RandomForestClassifier': 'feature_importances',
            'ExtraTreesClassifier': 'feature_importances',
            'AdaBoostClassifier': 'feature_importances',
            'LogisticRegression': 'coef',
            'svm.SVC': 'coef',
            'GradientBoostingClassifier': 'feature_importances',
            'GaussianNB': None,
            'DecisionTreeClassifier': 'feature_importances',
            'SGDClassifier': 'coef',
            'KNeighborsClassifier': None,
            'linear.SVC': 'coef'}
    if clfs[model_name] == 'feature_importances':
        return list(clf.feature_importances_)
    elif clfs[model_name] == 'coef':
        # coef_ is 2-D for linear models; return it as nested lists.
        return clf.coef_.tolist()
    else:
        return None
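A quick hedged check of the helper above (toy data; passing None for the unused self parameter is only for illustration):

import numpy as np
from sklearn.ensemble import RandomForestClassifier

X = np.random.rand(100, 5)
y = np.random.randint(0, 2, 100)
forest = RandomForestClassifier(n_estimators=50, random_state=0).fit(X, y)
# Tree ensembles resolve to the feature_importances_ attribute:
print(get_feature_importance(None, forest, 'RandomForestClassifier'))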
Python ExtraTreesClassifier() example source code
def __init__(
        self, data_block, predictors=[], cv_folds=10,
        scoring_metric='accuracy', additional_display_metrics=[]):
    base_classification.__init__(
        self, alg=ExtraTreesClassifier(), data_block=data_block,
        predictors=predictors, cv_folds=cv_folds,
        scoring_metric=scoring_metric,
        additional_display_metrics=additional_display_metrics)
    self.model_output = pd.Series(self.default_parameters)
    self.model_output['Feature_Importance'] = "-"
    self.model_output['OOB_Score'] = "-"
    # Set parameters to default values:
    self.set_parameters(set_default=True)
def define_model(self, model, parameters, n_cores=0):
    clfs = {
        'RandomForestClassifier': RandomForestClassifier(n_estimators=50, n_jobs=7),
        'ExtraTreesClassifier': ExtraTreesClassifier(n_estimators=10, n_jobs=7, criterion='entropy'),
        'AdaBoostClassifier': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=200),
        'LogisticRegression': LogisticRegression(penalty='l1', C=1e5),
        'svm.SVC': svm.SVC(kernel='linear', probability=True, random_state=0),
        'GradientBoostingClassifier': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=10),
        'GaussianNB': GaussianNB(),
        'DecisionTreeClassifier': DecisionTreeClassifier(),
        'SGDClassifier': SGDClassifier(loss="hinge", penalty="l2", n_jobs=7),
        'KNeighborsClassifier': KNeighborsClassifier(n_neighbors=3),
        'linear.SVC': svm.LinearSVC()}
    if model not in clfs:
        raise ConfigError("Unsupported model {}".format(model))
    clf = clfs[model]
    clf.set_params(**parameters)
    return clf
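The factory only instantiates a preset estimator and then overrides it with set_params. A standalone sketch of the ExtraTreesClassifier branch (the override values are illustrative, not the project's defaults):

from sklearn.ensemble import ExtraTreesClassifier

clf = ExtraTreesClassifier(n_estimators=10, n_jobs=7, criterion='entropy')  # preset from the table above
clf.set_params(n_estimators=200, max_depth=8)  # what define_model(...) does with the parameters argument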
Source file: extra_trees_preproc_for_classification.py
Project: AutoML-Challenge
Author: postech-mlg-exbrain
def fit(self, X, Y, sample_weight=None):
    from sklearn.ensemble import ExtraTreesClassifier
    from sklearn.feature_selection import SelectFromModel
    num_features = X.shape[1]
    max_features = int(
        float(self.max_features) * (np.log(num_features) + 1))
    # Use at most half of the features
    max_features = max(1, min(int(X.shape[1] / 2), max_features))
    preprocessor = ExtraTreesClassifier(
        n_estimators=self.n_estimators, criterion=self.criterion,
        max_depth=self.max_depth, min_samples_split=self.min_samples_split,
        min_samples_leaf=self.min_samples_leaf, bootstrap=self.bootstrap,
        max_features=max_features, max_leaf_nodes=self.max_leaf_nodes,
        oob_score=self.oob_score, n_jobs=self.n_jobs, verbose=self.verbose,
        random_state=self.random_state, class_weight=self.class_weight
    )
    preprocessor.fit(X, Y, sample_weight=sample_weight)
    self.preprocessor = SelectFromModel(preprocessor, prefit=True)
    return self
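A minimal sketch of how the fitted preprocessor is typically used afterwards (toy data; the hyperparameters are illustrative assumptions, not the class defaults above):

import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

X = np.random.rand(200, 40)
Y = np.random.randint(0, 2, 200)
forest = ExtraTreesClassifier(n_estimators=100, random_state=0).fit(X, Y)
selector = SelectFromModel(forest, prefit=True)
X_reduced = selector.transform(X)  # keeps only features above the mean importance (the default threshold)
print(X_reduced.shape)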
def prec_ets(n_trees, X_train, y_train, X_test, y_test, random_state=None):
    """
    ExtraTrees
    """
    from sklearn.ensemble import ExtraTreesClassifier
    if not issparse(X_train):
        X_train = X_train.reshape((X_train.shape[0], -1))
    if not issparse(X_test):
        X_test = X_test.reshape((X_test.shape[0], -1))
    LOGGER.info('start predict: n_trees={},X_train.shape={},y_train.shape={},X_test.shape={},y_test.shape={}'.format(
        n_trees, X_train.shape, y_train.shape, X_test.shape, y_test.shape))
    clf = ExtraTreesClassifier(n_estimators=n_trees, max_depth=None, n_jobs=-1, verbose=1, random_state=random_state)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # Fraction of correct predictions (accuracy), despite the "prec" name.
    prec = float(np.sum(y_pred == y_test)) / len(y_test)
    LOGGER.info('prec_ets{}={:.6f}%'.format(n_trees, prec * 100.0))
    return clf, y_pred
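A hedged example call (shapes and labels are made up; LOGGER and issparse must be in scope, as in the original module):

import logging
import numpy as np
from scipy.sparse import issparse

logging.basicConfig(level=logging.INFO)
LOGGER = logging.getLogger(__name__)

X_train = np.random.rand(100, 8, 8)   # trailing dimensions get flattened by prec_ets
y_train = np.random.randint(0, 3, 100)
X_test = np.random.rand(30, 8, 8)
y_test = np.random.randint(0, 3, 30)
clf, y_pred = prec_ets(50, X_train, y_train, X_test, y_test, random_state=0)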
def analyseReasonWithTreeBaesd(anamolySample, normalSample, name):
    # Label anomalous samples 1 and normal samples 0, then use tree-based
    # feature importances to pick out the features that separate the two.
    data = anamolySample
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data.extend(normalSample)
    for i in range(0, len(normalSample)):
        target.append(0)
    clf = ExtraTreesClassifier()
    clf = clf.fit(data, target)
    model = SelectFromModel(clf, prefit=True)
    outcome = model.get_support()
    for i in range(0, len(name)):
        if outcome[i]:
            print(name[i])
Source file: onlinedetectWithlittleData.py
Project: onlineDetectForHadoop
Author: DawnsonLi
def analyseReasonWithTreeBaesd(anamolySample, normalSample, name):
    data = anamolySample
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    # DataFrame.append returns a new frame (deprecated in recent pandas; pd.concat is the modern equivalent).
    data = data.append(normalSample)
    for i in range(0, len(normalSample)):
        target.append(0)
    clf = ExtraTreesClassifier()
    clf = clf.fit(data, target)
    model = SelectFromModel(clf, prefit=True)
    outcome = model.get_support()
    for i in range(0, len(name)):
        if outcome[i]:
            print(name[i])
Source file: birchForChangeWindowSize.py
Project: onlineDetectForHadoop
Author: DawnsonLi
def analyseReasonWithTreeBaesd(anamolySample, normalSample):
    data = anamolySample
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data = data.append(normalSample)
    for i in range(0, len(normalSample)):
        target.append(0)
    # Feature names are taken from the DataFrame columns.
    name = []
    for i in data.columns:
        name.append(i)
    clf = ExtraTreesClassifier()
    clf = clf.fit(data, target)
    model = SelectFromModel(clf, prefit=True)
    outcome = model.get_support()
    for i in range(0, len(name)):
        if outcome[i]:
            print(name[i])
def analyseReasonWithTreeBaesd(anamolySample, normalSample, name):
    data = anamolySample
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data.extend(normalSample)
    for i in range(0, len(normalSample)):
        target.append(0)
    clf = ExtraTreesClassifier()
    clf = clf.fit(data, target)
    model = SelectFromModel(clf, prefit=True)
    outcome = model.get_support()
    # Collect the names of the selected features into a warning string.
    warnstr = ""
    for i in range(0, len(name)):
        if outcome[i]:
            warnstr += name[i]
            warnstr += " ; "
    return warnstr
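As a rough illustration of the variant above that returns a warning string (the feature values and names are made up):

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

anomalies = [[0.95, 0.90, 0.10], [0.97, 0.88, 0.12]]   # hypothetical [cpu, memory, network] usage
normal = [[0.20, 0.30, 0.15], [0.25, 0.28, 0.11], [0.22, 0.31, 0.14]]
print(analyseReasonWithTreeBaesd(anomalies, normal, ["cpu", "memory", "network"]))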
def analyseReasonWithTreeBaesd(anamolySample, normalSample, name):
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data = pd.concat([anamolySample, normalSample])
    for i in range(0, len(normalSample)):
        target.append(0)
    clf = ExtraTreesClassifier()
    clf = clf.fit(data, target)
    model = SelectFromModel(clf, prefit=True)
    outcome = model.get_support()
    warnstr = ""
    for i in range(0, len(name)):
        if outcome[i]:
            warnstr += name[i]
            warnstr += " ; "
    return warnstr
def analyseReasonWithTreeBaesd(anamolySample, normalSample, name):
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data = pd.concat([anamolySample, normalSample])
    for i in range(0, len(normalSample)):
        target.append(0)
    clf = ExtraTreesClassifier()
    clf = clf.fit(data, target)
    model = SelectFromModel(clf, prefit=True)
    outcome = model.get_support()
    warnstr = ""
    for i in range(0, len(name)):
        if outcome[i]:
            warnstr += name[i]
            warnstr += " ; "
    print(warnstr)
    return warnstr
def runET(train_X, train_y, test_X, test_y=None, validation=1, n_est_val=50, depth_val=None, split_val=2, leaf_val=1, feat_val='auto', jobs_val=4, random_state_val=0):
    clf = ensemble.ExtraTreesClassifier(
        n_estimators=n_est_val,
        max_depth=depth_val,
        min_samples_split=split_val,
        min_samples_leaf=leaf_val,
        max_features=feat_val,
        criterion='entropy',
        n_jobs=jobs_val,
        random_state=random_state_val)
    clf.fit(train_X, train_y)
    # Probability of the positive class for train and test sets.
    pred_train_y = clf.predict_proba(train_X)[:, 1]
    pred_test_y = clf.predict_proba(test_X)[:, 1]
    if validation:
        train_loss = log_loss(train_y, pred_train_y)
        loss = log_loss(test_y, pred_test_y)
        print("Train, Test loss : ", train_loss, loss)
        return pred_test_y, loss
    else:
        return pred_test_y
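An illustrative call on a synthetic binary split (everything below is made up; the function expects `from sklearn import ensemble` and `from sklearn.metrics import log_loss` in its module):

import numpy as np
from sklearn import ensemble
from sklearn.metrics import log_loss

X = np.random.rand(500, 20)
y = (X[:, 0] + np.random.rand(500) > 1.0).astype(int)
# feat_val='sqrt' avoids the legacy 'auto' default, which newer scikit-learn versions reject
pred, val_loss = runET(X[:400], y[:400], X[400:], y[400:], validation=1, n_est_val=100, feat_val='sqrt')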
def extratreescv(n_estimators,
                 min_samples_split,
                 min_samples_leaf,
                 max_features,
                 max_depth,
                 min_weight_fraction_leaf):
    # x0, y0 (train) and x1, y1 (validation) are module-level splits in the original file.
    clf = ExtraTreesClassifier(n_estimators=int(n_estimators),
                               min_samples_split=int(min_samples_split),
                               min_samples_leaf=int(min_samples_leaf),
                               max_features=int(max_features),
                               max_depth=int(max_depth),
                               min_weight_fraction_leaf=min_weight_fraction_leaf,
                               n_jobs=-1,
                               random_state=1234,
                               verbose=1)
    clf.fit(x0, y0)
    # Negative log loss on the held-out split, so that larger is better for a maximizing optimizer.
    ll = -log_loss(y1, clf.predict_proba(x1)[:, 1])
    return ll
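This objective returns a negative log loss and casts its float arguments to int, the usual shape for a Bayesian-optimisation target. A sketch of driving it with the bayes_opt package, assuming x0/y0 and x1/y1 already exist and that the bounds below are illustrative:

from bayes_opt import BayesianOptimization  # assumption: the bayes_opt package is installed

bo = BayesianOptimization(
    f=extratreescv,
    pbounds={'n_estimators': (100, 500),
             'min_samples_split': (2, 20),
             'min_samples_leaf': (1, 10),
             'max_features': (1, 20),
             'max_depth': (3, 20),
             'min_weight_fraction_leaf': (0.0, 0.5)},
    random_state=1234)
bo.maximize(init_points=5, n_iter=25)
print(bo.max)   # best negative log loss and the parameters that produced it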
def try_params(n_iterations, params):
    # XT is presumably an alias for ExtraTreesClassifier; trees_per_iteration, pprint,
    # data and train_and_eval_sklearn_classifier come from the surrounding module.
    n_estimators = int(round(n_iterations * trees_per_iteration))
    print("n_estimators:", n_estimators)
    pprint(params)
    clf = XT(n_estimators=n_estimators, verbose=0, n_jobs=-1, **params)
    return train_and_eval_sklearn_classifier(clf, data)
def classify(train=None, test=None, data=None, res_dir="res/", disp=True, outfilename=None):
    """Compare multiple classifiers and display the best one."""
    utils.print_success("Comparison of different classifiers")
    if data is not None:
        train_features = data["train_features"]
        train_groundtruths = data["train_groundtruths"]
        test_features = data["test_features"]
        test_groundtruths = data["test_groundtruths"]
    else:
        train = utils.abs_path_file(train)
        test = utils.abs_path_file(test)
        train_features, train_groundtruths = read_file(train)
        test_features, test_groundtruths = read_file(test)
    if not utils.create_dir(res_dir):
        res_dir = utils.abs_path_dir(res_dir)
    classifiers = {
        "RandomForest": RandomForestClassifier(n_jobs=-1)
        # "RandomForest": RandomForestClassifier(n_estimators=5),
        # "KNeighbors": KNeighborsClassifier(3),
        # "GaussianProcess": GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
        # "DecisionTree": DecisionTreeClassifier(max_depth=5),
        # "MLP": MLPClassifier(),
        # "AdaBoost": AdaBoostClassifier(),
        # "GaussianNB": GaussianNB(),
        # "QDA": QuadraticDiscriminantAnalysis(),
        # "SVM": SVC(kernel="linear", C=0.025),
        # "GradientBoosting": GradientBoostingClassifier(),
        # "ExtraTrees": ExtraTreesClassifier(),
        # "LogisticRegression": LogisticRegression(),
        # "LinearDiscriminantAnalysis": LinearDiscriminantAnalysis()
    }
    for key in classifiers:
        utils.print_success(key)
        clf = classifiers[key]
        utils.print_info("\tFit")
        clf.fit(train_features, train_groundtruths)
        utils.print_info("\tPredict")
        predictions = clf.predict(test_features)
    return predictions
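A hedged sketch of the `data` dictionary this function expects (toy values; the utils helpers from the original project must be importable for the call to run):

data = {
    "train_features": [[0.1, 0.2], [0.9, 0.8], [0.2, 0.1]],
    "train_groundtruths": ["classical", "rock", "classical"],
    "test_features": [[0.15, 0.22]],
    "test_groundtruths": ["classical"],
}
predictions = classify(data=data)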
Source file: trainer.py
Project: Python-Machine-Learning-Cookbook
Author: PacktPublishing
def __init__(self, X, label_words):
    self.le = preprocessing.LabelEncoder()
    self.clf = ExtraTreesClassifier(n_estimators=100,
                                    max_depth=16, random_state=0)
    y = self.encode_labels(label_words)
    self.clf.fit(np.asarray(X), y)
def iterative_fit(self, X, y, sample_weight=None, n_iter=1, refit=False):
    from sklearn.ensemble import ExtraTreesClassifier as ETC
    if refit:
        self.estimator = None
    if self.estimator is None:
        num_features = X.shape[1]
        max_features = int(
            float(self.max_features) * (np.log(num_features) + 1))
        # Use at most half of the features
        max_features = max(1, min(int(X.shape[1] / 2), max_features))
        self.estimator = ETC(
            n_estimators=0, criterion=self.criterion,
            max_depth=self.max_depth, min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf, bootstrap=self.bootstrap,
            max_features=max_features, max_leaf_nodes=self.max_leaf_nodes,
            oob_score=self.oob_score, n_jobs=self.n_jobs, verbose=self.verbose,
            random_state=self.random_state,
            class_weight=self.class_weight,
            warm_start=True
        )
    tmp = self.estimator  # TODO copy ?
    # Grow the warm-started forest by n_iter additional trees.
    tmp.n_estimators += n_iter
    tmp.fit(X, y, sample_weight=sample_weight)
    self.estimator = tmp
    return self
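The warm_start trick above can be reproduced standalone; a minimal sketch with toy data and illustrative numbers:

import numpy as np
from sklearn.ensemble import ExtraTreesClassifier

X = np.random.rand(300, 10)
y = np.random.randint(0, 2, 300)
clf = ExtraTreesClassifier(n_estimators=0, warm_start=True, random_state=0)
for _ in range(5):
    clf.n_estimators += 20   # grow the forest by 20 trees without retraining existing ones
    clf.fit(X, y)
print(len(clf.estimators_))  # 100 trees after five incremental fits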
def __init__(self, name, kwargs):
    from sklearn.ensemble import ExtraTreesClassifier
    super(GCExtraTreesClassifier, self).__init__(name, ExtraTreesClassifier, kwargs)
Source file: model.py
Project: 5th_place_solution_facebook_check_ins
Author: aikinogard
def et_opt1(df_cell_train_feats, y_train, df_cell_test_feats):
    logging.info("train et_opt1 model")
    clf = ExtraTreesClassifier(n_estimators=500, n_jobs=-1, max_features="log2", min_samples_split=5, min_samples_leaf=1)
    clf.fit(df_cell_train_feats, y_train)
    y_test_pred = clf.predict_proba(df_cell_test_feats)
    return y_test_pred
def get_data_preprocessor_balancing(params, y):
    d_balancing = params['layer_dict_list'][1]
    if params['balancing'] == str(d_balancing['None']) or params['balancing'] == 'None':
        # for fp: ['ExtraTreesClassifier', 'LinearSVC'] + clf: ['DecisionTreeClassifier', 'ExtraTreesClassifier', 'LinearSVC', 'SVC', 'RandomForestClassifier', 'SGDClassifier']
        params['class_weight'] = None
        # for clf: ['AdaBoostClassifier', 'GradientBoostingClassifier']
        params['sample_weight'] = None
    elif params['balancing'] == str(d_balancing['weighting']) or params['balancing'] == 'weighting':
        # for fp: ['ExtraTreesClassifier', 'LinearSVC'] + clf: ['DecisionTreeClassifier', 'ExtraTreesClassifier', 'LinearSVC', 'SVC', 'RandomForestClassifier', 'SGDClassifier']
        params['class_weight'] = 'auto'
        # for clf: ['AdaBoostClassifier', 'GradientBoostingClassifier']
        if len(y.shape) > 1:
            # Collapse multilabel targets into a single integer code per sample.
            offsets = [2 ** i for i in range(y.shape[1])]
            y_ = np.sum(y * offsets, axis=1)
        else:
            y_ = y
        unique, counts = np.unique(y_, return_counts=True)
        # Inverse-frequency weights, normalised to mean 1.
        cw = 1. / counts
        cw = cw / np.mean(cw)
        sample_weight = np.ones(y_.shape)
        for i, ue in enumerate(unique):
            mask = y_ == ue
            sample_weight[mask] *= cw[i]
        params['sample_weight'] = sample_weight
    return params
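A toy walk-through of the inverse-frequency weighting branch above (all numbers are illustrative):

import numpy as np

y_ = np.array([0, 0, 0, 0, 1, 1])                      # imbalanced labels
unique, counts = np.unique(y_, return_counts=True)     # counts = [4, 2]
cw = (1. / counts) / np.mean(1. / counts)              # [0.667, 1.333], normalised to mean 1
sample_weight = np.ones(y_.shape)
for i, ue in enumerate(unique):
    sample_weight[y_ == ue] *= cw[i]
print(sample_weight)   # minority-class samples get twice the weight of majority ones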
def __init__(self):
    SingleClassifier.SingleClassifier.__init__(self)
    # weak classifier
    self.clf = ExtraTreesClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)
def makEnsemble(X, xlist, Y):
    # naive Bayes
    clf = MultinomialNB()
    clf.fit(xlist, Y)
    featureSelectModel.append(clf)
    # K nearest neighbours
    clf = KNeighborsClassifier()
    clf.fit(xlist, Y)
    featureSelectModel.append(clf)
    # logistic regression
    clf = LogisticRegression(C=1)
    clf.fit(xlist, Y)
    featureSelectModel.append(clf)
    # random forest
    clf = RandomForestClassifier(n_estimators=400)
    clf.fit(X, Y)
    wholeFeatureModel.append(clf)
    # extra trees
    clf = ExtraTreesClassifier(n_estimators=400)
    clf.fit(X, Y)
    wholeFeatureModel.append(clf)
    # decision tree
    clf = DecisionTreeClassifier(max_depth=None, min_samples_split=1, random_state=0)
    clf.fit(X, Y)
    wholeFeatureModel.append(clf)
    # gradient boosting
    params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 1,
              'learning_rate': 0.01}
    clf = GradientBoostingClassifier(**params)
    clf.fit(X, Y)
    wholeFeatureModel.append(clf)
def clf_extra_trees(data, random_state, calibrated=False, ext_name=""):
    """
    Application of the extra trees classifier. For details, see the
    'clf_sklearn' function.
    """
    et = ExtraTreesClassifier(n_estimators=500, n_jobs=-1,
                              max_depth=17,
                              max_features=0.2,
                              min_samples_split=80,
                              random_state=random_state, verbose=10)
    return clf_sklearn(et, data, random_state, calibrated, clf_name='ET',
                       ext_name=ext_name)