def get_feature_selection_model_from_name(type_of_estimator, model_name):
    model_map = {
        'classifier': {
            'SelectFromModel': SelectFromModel(RandomForestClassifier(n_jobs=-1, max_depth=10, n_estimators=15), threshold='20*mean'),
            'RFECV': RFECV(estimator=RandomForestClassifier(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect': GenericUnivariateSelect(),
            'KeepAll': 'KeepAll'
        },
        'regressor': {
            'SelectFromModel': SelectFromModel(RandomForestRegressor(n_jobs=-1, max_depth=10, n_estimators=15), threshold='0.7*mean'),
            'RFECV': RFECV(estimator=RandomForestRegressor(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect': GenericUnivariateSelect(),
            'KeepAll': 'KeepAll'
        }
    }
    return model_map[type_of_estimator][model_name]
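A minimal usage sketch (X_train and y_train are hypothetical placeholders; the caller fits the returned selector itself, and the 'KeepAll' sentinel means no selection is applied):

selector = get_feature_selection_model_from_name('classifier', 'SelectFromModel')
if selector != 'KeepAll':
    X_reduced = selector.fit_transform(X_train, y_train)  # placeholder data
else:
    X_reduced = X_train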
Example source code for the Python class SelectFromModel()
def get_feature_selection_model_from_name(type_of_estimator, model_name):
    model_map = {
        'classifier': {
            'SelectFromModel': SelectFromModel(RandomForestClassifier(n_jobs=-1, max_depth=10, n_estimators=15), threshold='20*mean'),
            'RFECV': RFECV(estimator=RandomForestClassifier(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect': GenericUnivariateSelect(),
            'RandomizedSparse': RandomizedLogisticRegression(),
            'KeepAll': 'KeepAll'
        },
        'regressor': {
            'SelectFromModel': SelectFromModel(RandomForestRegressor(n_jobs=-1, max_depth=10, n_estimators=15), threshold='0.7*mean'),
            'RFECV': RFECV(estimator=RandomForestRegressor(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect': GenericUnivariateSelect(),
            'RandomizedSparse': RandomizedLasso(),
            'KeepAll': 'KeepAll'
        }
    }
    return model_map[type_of_estimator][model_name]
Source file: extra_trees_preproc_for_regression.py (project: AutoML-Challenge, author: postech-mlg-exbrain)
def fit(self, X, Y):
    from sklearn.ensemble import ExtraTreesRegressor
    from sklearn.feature_selection import SelectFromModel
    num_features = X.shape[1]
    max_features = int(
        float(self.max_features) * (np.log(num_features) + 1))
    # Use at most half of the features
    max_features = max(1, min(int(X.shape[1] / 2), max_features))
    preprocessor = ExtraTreesRegressor(
        n_estimators=self.n_estimators, criterion=self.criterion,
        max_depth=self.max_depth, min_samples_split=self.min_samples_split,
        min_samples_leaf=self.min_samples_leaf, bootstrap=self.bootstrap,
        max_features=max_features, max_leaf_nodes=self.max_leaf_nodes,
        oob_score=self.oob_score, n_jobs=self.n_jobs, verbose=self.verbose,
        random_state=self.random_state)
    preprocessor.fit(X, Y)
    self.preprocessor = SelectFromModel(preprocessor, prefit=True)
    return self
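The class presumably pairs this fit with a transform method along the usual auto-sklearn lines; a sketch, not taken from the source:

def transform(self, X):
    # Assumes fit() has already populated self.preprocessor.
    if self.preprocessor is None:
        raise NotImplementedError()
    return self.preprocessor.transform(X)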
Source file: extra_trees_preproc_for_classification.py (project: AutoML-Challenge, author: postech-mlg-exbrain)
def fit(self, X, Y, sample_weight=None):
    from sklearn.ensemble import ExtraTreesClassifier
    from sklearn.feature_selection import SelectFromModel
    num_features = X.shape[1]
    max_features = int(
        float(self.max_features) * (np.log(num_features) + 1))
    # Use at most half of the features
    max_features = max(1, min(int(X.shape[1] / 2), max_features))
    preprocessor = ExtraTreesClassifier(
        n_estimators=self.n_estimators, criterion=self.criterion,
        max_depth=self.max_depth, min_samples_split=self.min_samples_split,
        min_samples_leaf=self.min_samples_leaf, bootstrap=self.bootstrap,
        max_features=max_features, max_leaf_nodes=self.max_leaf_nodes,
        oob_score=self.oob_score, n_jobs=self.n_jobs, verbose=self.verbose,
        random_state=self.random_state, class_weight=self.class_weight
    )
    preprocessor.fit(X, Y, sample_weight=sample_weight)
    self.preprocessor = SelectFromModel(preprocessor, prefit=True)
    return self
def analyseReasonWithTreeBaesd(anamolySample, normalSample, name):
    data = anamolySample
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data.extend(normalSample)
    for i in range(0, len(normalSample)):
        target.append(0)
    clf = ExtraTreesClassifier()
    clf = clf.fit(data, target)
    model = SelectFromModel(clf, prefit=True)
    outcome = model.get_support()
    for i in range(0, len(name)):
        if outcome[i]:
            print(name[i])
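A toy invocation, assuming both samples are plain lists of numeric feature vectors (the names and values here are hypothetical):

anomaly = [[5.0, 0.1], [4.8, 0.2]]
normal = [[1.0, 0.9], [1.1, 1.0]]
analyseReasonWithTreeBaesd(anomaly, normal, ['load', 'latency'])  # prints the informative feature names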
Source file: onlinedetectWithlittleData.py (project: onlineDetectForHadoop, author: DawnsonLi)
def analyseReasonWithTreeBaesd(anamolySample, normalSample, name):
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data = pd.concat([anamolySample, normalSample])  # DataFrame.append is deprecated
    for i in range(0, len(normalSample)):
        target.append(0)
    clf = ExtraTreesClassifier()
    clf = clf.fit(data, target)
    model = SelectFromModel(clf, prefit=True)
    outcome = model.get_support()
    for i in range(0, len(name)):
        if outcome[i]:
            print(name[i])
Source file: birchForChangeWindowSize.py (project: onlineDetectForHadoop, author: DawnsonLi)
def analyseReasonWithTreeBaesd(anamolySample, normalSample):
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data = pd.concat([anamolySample, normalSample])  # DataFrame.append is deprecated
    for i in range(0, len(normalSample)):
        target.append(0)
    name = []
    for i in data.columns:
        name.append(i)
    clf = ExtraTreesClassifier()
    clf = clf.fit(data, target)
    model = SelectFromModel(clf, prefit=True)
    outcome = model.get_support()
    for i in range(0, len(name)):
        if outcome[i]:
            print(name[i])
def analyseReasonWithTreeBaesd(anamolySample, normalSample, name):
    data = anamolySample
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data.extend(normalSample)
    for i in range(0, len(normalSample)):
        target.append(0)
    clf = ExtraTreesClassifier()
    clf = clf.fit(data, target)
    model = SelectFromModel(clf, prefit=True)
    outcome = model.get_support()
    warnstr = ""
    for i in range(0, len(name)):
        if outcome[i]:
            warnstr += name[i]
            warnstr += " ; "
    return warnstr
def analyseReasonWithTreeBaesd(anamolySample, normalSample, name):
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data = pd.concat([anamolySample, normalSample])
    for i in range(0, len(normalSample)):
        target.append(0)
    clf = ExtraTreesClassifier()
    clf = clf.fit(data, target)
    model = SelectFromModel(clf, prefit=True)
    outcome = model.get_support()
    warnstr = ""
    for i in range(0, len(name)):
        if outcome[i]:
            warnstr += name[i]
            warnstr += " ; "
    return warnstr
def analyseReasonWithTreeBaesd(anamolySample, normalSample, name):
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data = pd.concat([anamolySample, normalSample])
    for i in range(0, len(normalSample)):
        target.append(0)
    clf = ExtraTreesClassifier()
    clf = clf.fit(data, target)
    model = SelectFromModel(clf, prefit=True)
    outcome = model.get_support()
    warnstr = ""
    for i in range(0, len(name)):
        if outcome[i]:
            warnstr += name[i]
            warnstr += " ; "
    print(warnstr)
    return warnstr
def __init__(self, type_of_estimator, column_descriptions, feature_selection_model='SelectFromModel'):
    self.column_descriptions = column_descriptions
    self.type_of_estimator = type_of_estimator
    self.feature_selection_model = feature_selection_model
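A fit method to go with this constructor would presumably look up the selector by name; a sketch only, where everything beyond the attributes above is an assumption:

def fit(self, X, y=None):
    # Hypothetical: reuses the lookup helper defined earlier on this page.
    self.selector = get_feature_selection_model_from_name(
        self.type_of_estimator, self.feature_selection_model)
    if self.selector != 'KeepAll':
        self.selector.fit(X, y)
    return self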
def test():
    # Construct toy data
    np.random.seed(13)
    X = pd.DataFrame(np.random.randn(20, 10))
    X.columns = ['x%d' % i for i in range(10)]
    y = pd.Series(np.random.choice([0, 1], 20))
    # Single selectors: wrap scikit-learn selectors in SklearnSelector
    clf_sklearn = feature_selection.SelectKBest(feature_selection.f_classif, k=4)
    clf = SklearnSelector(estimator=clf_sklearn)
    clf.fit(X, y)
    clf.transform(X)
    print(clf.feature_selected)
    clf_sklearn = SelectFromModel(LogisticRegression())
    clf = SklearnSelector(estimator=clf_sklearn)
    clf.fit(X, y)
    clf.transform(X)
    print(clf.feature_selected)
    # Voting selector: combine several selectors by majority vote
    clf_selectkbest = feature_selection.SelectKBest(feature_selection.f_classif, k=4)
    clf_selectfrommodel = SelectFromModel(LogisticRegression())
    clf_baseselector = SklearnSelector(clf_selectkbest)
    clf = VotingSelector(selectors=[('clf_selectkbest', clf_selectkbest),
                                    ('clf_selectfrommodel', clf_selectfrommodel),
                                    ('clf_baseselector', clf_baseselector)], threshold=0.5)
    clf.fit(X, y)
    clf.transform(X)
    print(clf.feature_selected)
    print(clf.df_voting)
    print(clf.score)
Source file: liblinear_svc_preprocessor.py (project: AutoML-Challenge, author: postech-mlg-exbrain)
def fit(self, X, Y):
    import sklearn.svm
    from sklearn.feature_selection import SelectFromModel
    self.C = float(self.C)
    self.tol = float(self.tol)
    self.dual = self.dual == 'True'
    self.fit_intercept = self.fit_intercept == 'True'
    self.intercept_scaling = float(self.intercept_scaling)
    if self.class_weight == "None":
        self.class_weight = None
    preprocessor = sklearn.svm.LinearSVC(penalty=self.penalty,
                                         loss=self.loss,
                                         dual=self.dual,
                                         tol=self.tol,
                                         C=self.C,
                                         class_weight=self.class_weight,
                                         fit_intercept=self.fit_intercept,
                                         intercept_scaling=self.intercept_scaling,
                                         multi_class=self.multi_class,
                                         random_state=self.random_state)
    preprocessor.fit(X, Y)
    self.preprocessor = SelectFromModel(preprocessor, prefit=True)
    return self
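For comparison, the same l1-penalised selection can be written without the prefit pattern by letting SelectFromModel clone and fit the estimator inside a Pipeline; a sketch under standard scikit-learn semantics, not the project's code:

from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel

pipe = Pipeline([
    ('select', SelectFromModel(LinearSVC(penalty='l1', dual=False))),  # l1 zeroes out weak coefficients
    ('clf', LogisticRegression()),
])
# pipe.fit(X, y) fits the LinearSVC, keeps the features whose coefficients
# survive the threshold, then trains the classifier on the reduced matrix.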
def lgb_feature_selection(fe_name, matrix_x_temp, label_y, th):
    # SelectFromModel
    clf = LGBMClassifier(n_estimators=400)
    clf.fit(matrix_x_temp, label_y)
    sfm = SelectFromModel(clf, prefit=True, threshold=th)
    matrix_x = sfm.transform(matrix_x_temp)
    # Record each feature's importance score
    feature_score_dict = {}
    for fn, s in zip(fe_name, clf.feature_importances_):
        feature_score_dict[fn] = s
    m = 0
    for k in feature_score_dict:
        if feature_score_dict[k] == 0.0:
            m += 1
    print('number of non-zero features: ' + str(len(feature_score_dict) - m))
    # Sort the features by importance, descending
    feature_score_dict_sorted = sorted(feature_score_dict.items(),
                                       key=lambda d: d[1], reverse=True)
    print('feature_importance:')
    for ii in range(len(feature_score_dict_sorted)):
        print(feature_score_dict_sorted[ii][0], feature_score_dict_sorted[ii][1])
    print('\n')
    f = open('../eda/lgb_feature_importance.txt', 'w')
    f.write(str(th))
    f.write('\nRank\tFeature Name\tFeature Importance\n')
    for i in range(len(feature_score_dict_sorted)):
        f.write(str(i) + '\t' + str(feature_score_dict_sorted[i][0]) + '\t' + str(feature_score_dict_sorted[i][1]) + '\n')
    f.close()
    # Names of the features that were kept
    how_long = matrix_x.shape[1]  # number of columns kept by SelectFromModel
    feature_used_dict_temp = feature_score_dict_sorted[:how_long]
    feature_used_name = []
    for ii in range(len(feature_used_dict_temp)):
        feature_used_name.append(feature_used_dict_temp[ii][0])
    print('features chosen:')
    for ii in range(len(feature_used_name)):
        print(feature_used_name[ii])
    print('\n')
    f = open('../eda/lgb_feature_chose.txt', 'w')
    f.write('Feature Chose Name :\n')
    for i in range(len(feature_used_name)):
        f.write(str(feature_used_name[i]) + '\n')
    f.close()
    # Record the features that were not selected
    feature_not_used_name = []
    for i in range(len(fe_name)):
        if fe_name[i] not in feature_used_name:
            feature_not_used_name.append(fe_name[i])
    return matrix_x, feature_not_used_name[:], len(feature_used_name)
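A hypothetical call (the argument names and the threshold are placeholders, not from the source):

matrix_x, not_used, n_used = lgb_feature_selection(
    fe_name=feature_names,   # list of column names
    matrix_x_temp=X_train,   # 2-D feature matrix
    label_y=y_train,
    th='mean')               # passed straight through to SelectFromModel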
def fit():
    X, y = generate()
    dX = dd.from_pandas(X, npartitions=10)
    y = dd.from_pandas(y, npartitions=10)
    # Preprocessing only; the SGDRegressor is trained incrementally below,
    # so it cannot sit at the end of a transform pipeline.
    pre_pipe = make_pipeline(
        CategoricalEncoder(),
        DummyEncoder(),
        Imputer(),
    )
    clf = SGDRegressor()
    X_ = pre_pipe.fit_transform(dX)
    # Several passes of partial_fit over each dask partition.
    for i in range(X_.npartitions):
        for j in range(5):
            print(i, j)
            X_sub = X_.get_partition(i).compute()
            y_sub = y.get_partition(i).compute()
            clf.partial_fit(X_sub, y_sub)
    sfm = SelectFromModel(clf, prefit=True)
    pipe = make_pipeline(sfm, GradientBoostingRegressor())
    return pipe, clf, sfm
def featuresByInformationGain(features, labels):
    treeCL = tree.DecisionTreeClassifier(criterion="entropy")
    treeCL = treeCL.fit(features, labels)
    transformed_features = SelectFromModel(treeCL, prefit=True).transform(features)
    return transformed_features
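For example, on the iris data (a sketch; tree and SelectFromModel imported as in the snippet):

from sklearn.datasets import load_iris

iris = load_iris()
X_sel = featuresByInformationGain(iris.data, iris.target)
print(X_sel.shape)  # fewer columns than iris.data when some importances fall below the mean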
def plot_feature_importances(columns, X_train, y_train):
    feat_labels = columns[1:]
    forest = RandomForestClassifier(n_estimators=10000, random_state=0)
    forest.fit(X_train, y_train)
    importances = forest.feature_importances_
    indices = np.argsort(importances)[::-1]
    for f in range(X_train.shape[1]):
        print("%2d) %-*s %f" % (
            f + 1,
            30,
            feat_labels[indices[f]],
            importances[indices[f]],
        ))
    print()
    plt.title('Feature Importances')
    plt.bar(
        range(X_train.shape[1]),
        importances[indices],
        color='lightblue',
        align='center',
    )
    plt.xticks(range(X_train.shape[1]), feat_labels[indices], rotation=90)
    plt.xlim([-1, X_train.shape[1]])
    plt.show()
    feature_selector = SelectFromModel(forest, threshold=0.15, prefit=True)
    X_selected = feature_selector.transform(X_train)
    print(X_selected.shape)
def tree_based_selection(self, data_set, data_target, feature_names):
    """
    :param data_set:
    :return:
    """
    clf = ExtraTreesClassifier()
    clf = clf.fit(data_set, data_target)
    print(clf.feature_importances_)
    model = SelectFromModel(clf, prefit=True)
    feature_set = model.transform(data_set)
    fea_index = []
    for A_col in np.arange(data_set.shape[1]):
        for B_col in np.arange(feature_set.shape[1]):
            if (data_set[:, A_col] == feature_set[:, B_col]).all():
                fea_index.append(A_col)
    check = {}
    for i in fea_index:
        check[feature_names[i]] = data_set[0][i]
    print(check)
    return feature_set, fea_index
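The column-by-column equality scan above is quadratic and can mismatch duplicated columns; the selector's own mask gives the same indices directly (a sketch):

fea_index = model.get_support(indices=True).tolist()
# equivalently: list(np.flatnonzero(model.get_support()))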
def test_SelectFromModel():
    '''
    test the method of SelectFromModel
    :return: None
    '''
    digits = load_digits()
    X = digits.data
    y = digits.target
    estimator = LinearSVC(penalty='l1', dual=False)
    selector = SelectFromModel(estimator=estimator, threshold='mean')
    selector.fit(X, y)
    selector.transform(X)
    print("Threshold %s" % selector.threshold_)
    print("Support is %s" % selector.get_support(indices=True))
def test_invalid_input():
    clf = SGDClassifier(alpha=0.1, n_iter=10, shuffle=True, random_state=None)
    for threshold in ["gobbledigook", ".5 * gobbledigook"]:
        model = SelectFromModel(clf, threshold=threshold)
        model.fit(data, y)
        assert_raises(ValueError, model.transform, data)
def test_input_estimator_unchanged():
    """
    Test that SelectFromModel fits on a clone of the estimator.
    """
    est = RandomForestClassifier()
    transformer = SelectFromModel(estimator=est)
    transformer.fit(data, y)
    assert_true(transformer.estimator is est)
def test_feature_importances():
    X, y = datasets.make_classification(
        n_samples=1000, n_features=10, n_informative=3, n_redundant=0,
        n_repeated=0, shuffle=False, random_state=0)
    est = RandomForestClassifier(n_estimators=50, random_state=0)
    for threshold, func in zip(["mean", "median"], [np.mean, np.median]):
        transformer = SelectFromModel(estimator=est, threshold=threshold)
        transformer.fit(X, y)
        assert_true(hasattr(transformer.estimator_, 'feature_importances_'))
        X_new = transformer.transform(X)
        assert_less(X_new.shape[1], X.shape[1])
        importances = transformer.estimator_.feature_importances_
        feature_mask = np.abs(importances) > func(importances)
        assert_array_almost_equal(X_new, X[:, feature_mask])
    # Check with sample weights
    sample_weight = np.ones(y.shape)
    sample_weight[y == 1] *= 100
    est = RandomForestClassifier(n_estimators=50, random_state=0)
    transformer = SelectFromModel(estimator=est)
    transformer.fit(X, y, sample_weight=sample_weight)
    importances = transformer.estimator_.feature_importances_
    transformer.fit(X, y, sample_weight=3 * sample_weight)
    importances_bis = transformer.estimator_.feature_importances_
    assert_almost_equal(importances, importances_bis)
    # For the Lasso and related models, the threshold defaults to 1e-5
    transformer = SelectFromModel(estimator=Lasso(alpha=0.1))
    transformer.fit(X, y)
    X_new = transformer.transform(X)
    mask = np.abs(transformer.estimator_.coef_) > 1e-5
    assert_array_equal(X_new, X[:, mask])
def test_partial_fit():
    est = PassiveAggressiveClassifier(random_state=0, shuffle=False)
    transformer = SelectFromModel(estimator=est)
    transformer.partial_fit(data, y,
                            classes=np.unique(y))
    old_model = transformer.estimator_
    transformer.partial_fit(data, y,
                            classes=np.unique(y))
    new_model = transformer.estimator_
    assert_true(old_model is new_model)
    X_transform = transformer.transform(data)
    transformer.fit(np.vstack((data, data)), np.concatenate((y, y)))
    assert_array_equal(X_transform, transformer.transform(data))
def test_warm_start():
    est = PassiveAggressiveClassifier(warm_start=True, random_state=0)
    transformer = SelectFromModel(estimator=est)
    transformer.fit(data, y)
    old_model = transformer.estimator_
    transformer.fit(data, y)
    new_model = transformer.estimator_
    assert_true(old_model is new_model)
def test_threshold_string():
    est = RandomForestClassifier(n_estimators=50, random_state=0)
    model = SelectFromModel(est, threshold="0.5*mean")
    model.fit(data, y)
    X_transform = model.transform(data)
    # Calculate the threshold from the estimator directly.
    est.fit(data, y)
    threshold = 0.5 * np.mean(est.feature_importances_)
    mask = est.feature_importances_ > threshold
    assert_array_equal(X_transform, data[:, mask])
def test_threshold_without_refitting():
    """Test that the threshold can be set without refitting the model."""
    clf = SGDClassifier(alpha=0.1, n_iter=10, shuffle=True, random_state=0)
    model = SelectFromModel(clf, threshold=0.1)
    model.fit(data, y)
    X_transform = model.transform(data)
    # Set a higher threshold to filter out more features.
    model.threshold = 1.0
    assert_greater(X_transform.shape[1], model.transform(data).shape[1])