import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import RFE


def greedy_elim(df):
    # do feature selection using recursive feature elimination (RFE)
    X = df[[x for x in df.columns if x != 'SalePrice']]
    y = df['SalePrice']
    # model = RandomForestRegressor(n_estimators=50)
    model = GradientBoostingRegressor(n_estimators=50, learning_rate=0.05)
    # 150 features seems to work best at the moment; why that is remains
    # unclear (see the RFECV sketch below)
    feat_selector = RFE(estimator=model, step=1, n_features_to_select=150)
    # fit the selector and reduce X to the selected features
    feat_selector.fit_transform(X.values, y.values)
    # boolean mask of the selected features
    features_bool = np.array(feat_selector.support_)
    features = np.array(X.columns)
    result = features[features_bool]
    # ranking of the selected features (1 = selected), kept for inspection
    features_rank = feat_selector.ranking_
    rank = features_rank[features_bool]
    return result
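
The comment above notes that 150 features happens to work best without an obvious reason. One way to avoid hard-coding that number is RFECV, which picks the feature count by cross-validation; the sketch below is a minimal variant under the same df/SalePrice layout, not part of the original project.

import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import RFECV


def greedy_elim_cv(df):
    # sketch: let cross-validation choose the feature count instead of fixing 150
    X = df[[x for x in df.columns if x != 'SalePrice']]
    y = df['SalePrice']
    model = GradientBoostingRegressor(n_estimators=50, learning_rate=0.05)
    selector = RFECV(estimator=model, step=1, cv=5, scoring='neg_mean_squared_error')
    selector.fit(X.values, y.values)
    return np.array(X.columns)[selector.support_]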
Python RFE class: example source code
from sklearn.datasets import load_iris
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC


def test_compare_with_no_feature_selection():
    '''
    compare the result before the selection and after
    :return: None
    '''
    iris = load_iris()
    X, y = iris.data, iris.target
    estimator = LinearSVC()
    selector = RFE(estimator=estimator, n_features_to_select=2)
    # note: fitting the selector on the full data set before splitting
    # leaks test information; see the pipeline sketch below
    X_t = selector.fit_transform(X, y)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=0, stratify=y)
    X_train_t, X_test_t, y_train_t, y_test_t = train_test_split(
        X_t, y, test_size=0.25, random_state=0, stratify=y)
    clf = LinearSVC()
    clf_t = LinearSVC()
    clf.fit(X_train, y_train)
    clf_t.fit(X_train_t, y_train_t)
    print("Original DataSet: test score=%s" % (clf.score(X_test, y_test)))
    print("Selected DataSet: test score=%s" % (clf_t.score(X_test_t, y_test_t)))
def recursive_index(self, clf):
    # rank all features, i.e. continue the elimination until only one is left
    rfe = RFE(clf, n_features_to_select=1)
    rfe.fit(self.features, self.labels)
    # map each feature name to its RFE rank (1 = best)
    rfedict = dict(zip(self.features.columns.tolist(), rfe.ranking_))
    return rfedict
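
Hypothetical follow-up, assuming an object obj that exposes features and labels as the method expects: sort the returned dict to list features from strongest (rank 1) to weakest.

rfedict = obj.recursive_index(LinearSVC())  # obj is illustrative, not from the source
for name, rank in sorted(rfedict.items(), key=lambda kv: kv[1]):
    print(name, rank)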
feature_selection.py (project: Default-Credit-Card-Prediction, author: AlexPnt)
def rfe_selection(X, y, n_features):
    """
    Performs Recursive Feature Elimination and selects the top-ranked features.

    Keyword arguments:
    X -- The feature vectors
    y -- The target vector
    n_features -- number of best-ranked features to keep
    """
    if verbose:
        print('\nPerforming Feature Selection based on the Recursive Feature Elimination method ...')
    clf = RandomForestClassifierWithCoef(n_estimators=10, n_jobs=-1)
    fs = RFE(clf, n_features_to_select=n_features, step=1)
    fs = fs.fit(X, y)
    ranks = fs.ranking_
    # indexes (in the original column order) of the features ranked 1, i.e. selected
    feature_indexes = [i for i in range(len(ranks)) if ranks[i] == 1]
    # return the selected feature columns and their original indexes
    return X[:, feature_indexes[:n_features]], feature_indexes[:n_features]
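
RandomForestClassifierWithCoef is defined elsewhere in that project. A plausible minimal stand-in (an assumption, not the project's exact code) is the common wrapper that mirrors feature_importances_ into coef_, which older RFE versions required from the estimator:

from sklearn.ensemble import RandomForestClassifier


class RandomForestClassifierWithCoef(RandomForestClassifier):
    # sketch: older RFE looked for coef_, which forests lack, so the wrapper
    # copies feature_importances_ into coef_ after fitting
    def fit(self, *args, **kwargs):
        super().fit(*args, **kwargs)
        self.coef_ = self.feature_importances_
        return self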
def __init__(self, conf):
    SemiSupervisedFeatureSelection.__init__(self, conf)
    self.projection = RFE(estimator=conf.model,
                          n_features_to_select=conf.num_components,
                          step=conf.step)
import pickle
import numpy as np
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.utils import shuffle

import ml_feature_name  # project-local module


def featureRank(useFeature, trueSet, falseSet):
    # load data and split
    X_true = []
    for dn in trueSet:
        with open("./learn/data/" + useFeature + "_" + dn + ".pkl", "rb") as fin:
            X_true.append(pickle.load(fin))
    X_true = np.vstack(X_true)
    print(X_true.shape)
    X_false = []
    for dn in falseSet:
        with open("./learn/data/" + useFeature + "_" + dn + ".pkl", "rb") as fin:
            X_false.append(pickle.load(fin))
    X_false = np.vstack(X_false)
    print(X_false.shape)
    test_size = 0.3
    X_true_train, X_true_test = train_test_split(X_true, test_size=test_size)
    # sample the negative class to match the positive class split sizes
    X_false_train, X_false_test = train_test_split(
        X_false, train_size=len(X_true_train), test_size=len(X_true_test))
    X = np.vstack([X_true_train, X_false_train])
    X_ = np.vstack([X_true_test, X_false_test])
    Y = [1] * len(X_true_train) + [0] * len(X_false_train)
    Y_ = [1] * len(X_true_test) + [0] * len(X_false_test)
    X, Y = shuffle(X, Y)
    X_, Y_ = shuffle(X_, Y_)
    featNames = ml_feature_name.getFeatureName(useFeature)
    clf = LinearSVC(C=0.1)
    # rank every feature by eliminating down to a single one
    rfe = RFE(estimator=clf, n_features_to_select=1, step=1)
    rfe.fit(X, Y)
    ranks = rfe.ranking_
    if useFeature == "rp":
        with open("./learn/feature/rp_feature_rank.txt", "w") as fout:
            for i, r in enumerate(ranks):
                fout.write("{0} {1}\n".format(i, r))
    rankFeat = sorted(zip(ranks, featNames))
    for rf in rankFeat:
        # for tf-idf n-gram features, only print the diatonic ones
        if useFeature in ["tfidf_1gram", "tfidf_2gram", "tfidf_3gram", "tfidf_4gram"]:
            if ml_feature_name.isDiatonic(rf[1]):
                print(rf)
        else:
            print(rf)
def test_RFE():
    '''
    test the RFE method, aiming for 2 selected features
    :return: None
    '''
    iris = load_iris()
    X = iris.data
    y = iris.target
    estimator = LinearSVC()
    selector = RFE(estimator=estimator, n_features_to_select=2)
    selector.fit(X, y)
    print("N_features %s" % selector.n_features_)
    print("Support is %s" % selector.support_)
    print("Ranking %s" % selector.ranking_)
from sklearn import datasets
from sklearn.linear_model import LogisticRegression


def sk_feature_ref():
    # load the iris dataset
    dataset = datasets.load_iris()
    # create a base classifier used to evaluate a subset of attributes
    model_lr = LogisticRegression()
    # create the RFE model and select 3 attributes
    rfe = RFE(model_lr, n_features_to_select=3)
    rfe = rfe.fit(dataset.data, dataset.target)
    # summarize the selection of the attributes
    print(rfe.support_)
    # [False True True True]
    print(rfe.ranking_)
    # [2 1 1 1]
    print(sorted(zip(map(lambda x: round(x, 4), rfe.ranking_), dataset.feature_names)))
    # [(1.0, 'petal length (cm)'), (1.0, 'petal width (cm)'), (1.0, 'sepal width (cm)'), (2.0, 'sepal length (cm)')]
def sk_feature_ref_v2():
    X, Y = get_dummy_data()
    names = ['f1', 'f2', 'f3']
    model_lr = LogisticRegression()
    rfe = RFE(model_lr, n_features_to_select=2)
    rfe = rfe.fit(X, Y)
    print(rfe.support_)
    print(rfe.ranking_)
    print(sorted(zip(map(lambda x: round(x, 4), rfe.ranking_), names)))
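
get_dummy_data is project-local and not shown here; a minimal stand-in (purely illustrative, just enough to run the example) could be:

import numpy as np


def get_dummy_data(n=100, seed=0):
    # three features; the target depends only on f1 and f2,
    # so RFE should rank f3 last
    rng = np.random.RandomState(seed)
    X = rng.rand(n, 3)
    Y = (X[:, 0] + X[:, 1] > 1).astype(int)
    return X, Y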
def recursive_feature_elimination(self, nfeat=None, step=1, inplace=False):
    """A method to implement recursive feature elimination on the model.
    Note that CV is not performed in this function. The method will
    continue to eliminate some features (specified by the step parameter)
    at each iteration until the specified number of features is reached.

    Parameters
    ----------
    nfeat : int or None, default=None
        The number of top features to select. If None, half of the
        features are selected.
    step : int or float, default=1
        If int, then step corresponds to the number of features to remove
        at each iteration.
        If float and within (0.0, 1.0), then step corresponds to the
        percentage (rounded down) of features to remove at each
        iteration.
        If float and greater than one, its integer part is used.
    inplace : bool, default=False
        If True, the predictors of the class are modified to those
        selected by the RFE procedure.

    Returns
    -------
    selected : pandas.Series
        The selected features as index and their rank in selection
        as values.
    """
    rfe = RFE(self.alg, n_features_to_select=nfeat, step=step)
    rfe.fit(
        self.datablock.train[self.predictors],
        self.datablock.train[self.datablock.target]
    )
    ranks = pd.Series(rfe.ranking_, index=self.predictors)
    selected = ranks.loc[rfe.support_]
    if inplace:
        self.set_predictors(selected.index.tolist())
    return selected
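
Hypothetical usage, assuming a fitted model wrapper m built on this class (the name and data are illustrative, not from the source): keep the top 20 predictors, dropping 10% of the remaining features per iteration.

selected = m.recursive_feature_elimination(nfeat=20, step=0.1, inplace=True)
print(selected.sort_values().head())  # rank 1 = kept by RFE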