def get_feature_selection_model_from_name(type_of_estimator, model_name):
    """Return a freshly built feature-selection model for the given type and name.

    Args:
        type_of_estimator: Either ``'classifier'`` or ``'regressor'``.
        model_name: One of ``'SelectFromModel'``, ``'RFECV'``,
            ``'GenericUnivariateSelect'``, ``'RandomizedSparse'`` or
            ``'KeepAll'``.

    Returns:
        A new, unfitted selector instance, or the literal string
        ``'KeepAll'`` when ``model_name`` is ``'KeepAll'``.

    Raises:
        KeyError: If ``type_of_estimator`` or ``model_name`` is not one of
            the keys listed above.
    """
    # The map stores zero-argument factories rather than instances, so a call
    # constructs only the requested model instead of all eight of them.
    model_factories = {
        'classifier': {
            'SelectFromModel': lambda: SelectFromModel(
                RandomForestClassifier(n_jobs=-1, max_depth=10, n_estimators=15),
                threshold='20*mean',
            ),
            'RFECV': lambda: RFECV(estimator=RandomForestClassifier(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect': lambda: GenericUnivariateSelect(),
            'RandomizedSparse': lambda: RandomizedLogisticRegression(),
            'KeepAll': lambda: 'KeepAll',
        },
        'regressor': {
            'SelectFromModel': lambda: SelectFromModel(
                RandomForestRegressor(n_jobs=-1, max_depth=10, n_estimators=15),
                threshold='0.7*mean',
            ),
            'RFECV': lambda: RFECV(estimator=RandomForestRegressor(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect': lambda: GenericUnivariateSelect(),
            'RandomizedSparse': lambda: RandomizedLasso(),
            'KeepAll': lambda: 'KeepAll',
        },
    }
    return model_factories[type_of_estimator][model_name]()
# Example source code for the Python class RandomizedLogisticRegression()
def LogisticRegression(result):
    """Select features with randomized logistic regression, then fit and score a model.

    The ``'df'`` column of ``result`` is used as the target; every remaining
    column is a candidate feature. The names of the selected features and the
    score of a logistic regression fitted on them are printed.

    Note:
        ``result`` is modified in place: the ``'df'`` column is popped from it.
        The score is computed on the same rows the model was fitted on.

    Args:
        result: A pandas DataFrame containing the feature columns plus a
            ``'df'`` label column.

    Returns:
        None. Results are reported via ``print``.
    """
    # Aliased imports: this function's own name shadows sklearn's
    # LogisticRegression at module level.
    from sklearn.linear_model import LogisticRegression as LR
    from sklearn.linear_model import RandomizedLogisticRegression as RLR

    # The original shuffled ``result.as_matrix()`` with ``random.shuffle`` and
    # sliced an 80/20 train/test split that was never used. ``random.shuffle``
    # swaps row *views* of a 2-D ndarray, which duplicates rows (and can
    # corrupt the frame when the matrix shares memory with it), so that dead
    # prelude has been removed.

    data = result  # same object as ``result``; loses the 'df' column below too
    df_flag = result.pop('df')
    x = result.values
    y = df_flag.values

    rlr = RLR()
    rlr.fit(x, y)
    support = rlr.get_support()  # boolean mask over the feature columns
    print(u'??????????????')
    print(u'??????%s' % ','.join(data.columns[support]))

    # ``.values`` replaces ``DataFrame.as_matrix()``, which pandas 1.0 removed.
    x = data[data.columns[support]].values
    lr = LR()
    lr.fit(x, y)
    print(u'????')
    print(u'???????%s' % lr.score(x, y))
def log_reg_feat_selection(X_train, y_train, X_valid, y_valid, random_state):
    """Score features with RandomizedLogisticRegression and persist the scores.

    The scores are written to ``save/feat_sel_log_reg.npy`` (the ``save``
    directory must already exist) and also returned.

    Args:
        X_train: Training feature matrix passed to ``fit``.
        y_train: Training labels passed to ``fit``.
        X_valid: Unused; kept for interface compatibility.
        y_valid: Unused; kept for interface compatibility.
        random_state: Seed forwarded to ``RandomizedLogisticRegression``.

    Returns:
        The fitted model's ``scores_`` attribute.
    """
    rlr = RandomizedLogisticRegression(
        C=[0.001, 0.01, 0.1, 1.],
        sample_fraction=0.7,
        n_resampling=200,
        selection_threshold=0.25,
        verbose=5,
        n_jobs=-1,
        # Previously hard-coded to 0, which silently ignored the caller's seed.
        random_state=random_state,
    )
    rlr.fit(X_train, y_train)

    feature_scores = rlr.scores_
    np.save('save/feat_sel_log_reg.npy', feature_scores)
    return feature_scores
def stabilty_index(self, clf):
    """Fit ``clf`` and return its per-feature scores keyed by feature name.

    ``clf`` is fitted on ``self.features`` / ``self.labels`` and must expose a
    ``scores_`` attribute afterwards (sklearn offers this on its randomized /
    stability-selection estimators).

    Args:
        clf: Estimator with ``fit(X, y)`` that sets ``scores_``.

    Returns:
        dict mapping each column name of ``self.features`` to the matching
        entry of ``clf.scores_``, rounded to 4 decimal places.
    """
    clf.fit(self.features, self.labels)

    feature_names = self.features.columns.tolist()
    rounded_scores = [round(score, 4) for score in clf.scores_]
    # Pair names and scores positionally.
    return dict(zip(feature_names, rounded_scores))
def _get_clfs(self):
    """Return a new dict of short classifier keys mapped to unfitted estimators.

    Keys: ``"rlrclf"`` (randomized logistic regression), ``"rfclf"`` (random
    forest, entropy criterion), ``"dtrclf"`` (decision tree, entropy
    criterion) and ``"lrclf"`` (logistic regression).
    """
    randomized_lr = RandomizedLogisticRegression()
    forest = RandomForestClassifier(criterion='entropy')
    tree = DecisionTreeClassifier(criterion='entropy')
    logistic = LogisticRegression()
    return {
        "rlrclf": randomized_lr,
        "rfclf": forest,
        "dtrclf": tree,
        "lrclf": logistic,
    }
def _sort_applicable_ngrams(self, list_of_ngrams, examples, labels):
    """Given an intent classification problem and a list of ngrams,
    creates ordered list of most useful ngrams.

    Labels with fewer than
    ``self.min_intent_examples_for_ngram_classification`` examples are
    dropped; the rest are used to fit a RandomizedLogisticRegression whose
    ``scores_`` rank the ngrams.

    Args:
        list_of_ngrams: Candidate ngrams to rank.
        examples: Training examples, aligned index-wise with ``labels``.
        labels: Intent label of each example.

    Returns:
        A numpy array of the ngrams ordered by descending score, or an empty
        list when there are no ngrams, fewer than two usable labels, or the
        randomized fit drew a sample containing a single class.

    Raises:
        ValueError: Any ValueError from the fit other than the
            "needs samples of at least 2 classes" case.
    """
    if not list_of_ngrams:
        return []

    import numpy as np

    examples_arr = np.array(examples)
    labels_arr = np.array(labels)

    # filter examples where we do not have enough labeled instances for cv;
    # one counting pass instead of rebuilding the arrays for every label
    unique_labels, counts = np.unique(labels_arr, return_counts=True)
    min_examples = self.min_intent_examples_for_ngram_classification
    usable_labels = [label for label, count in zip(unique_labels, counts)
                     if count >= min_examples]

    # Every usable label occurs in ``labels``, so having two of them also
    # guarantees that the mask below selects something.
    if len(usable_labels) < 2:
        # there is no example we can use for the cross validation
        return []

    # Deferred so the early exits above do not require scikit-learn.
    from sklearn import linear_model, preprocessing

    mask = np.isin(labels_arr, usable_labels)
    examples_arr = examples_arr[mask]
    labels_arr = labels_arr[mask]

    # NOTE(review): assumes _ngrams_in_sentences returns one feature row per
    # example - confirm against its definition.
    X = np.array(self._ngrams_in_sentences(examples_arr, list_of_ngrams))
    intent_encoder = preprocessing.LabelEncoder()
    intent_encoder.fit(labels_arr)
    y = intent_encoder.transform(labels_arr)

    clf = linear_model.RandomizedLogisticRegression(C=1)
    try:
        clf.fit(X, y)
    except ValueError as e:
        if "needs samples of at least 2 classes" in str(e):
            # we got unlucky during the random sampling :( and selected a
            # slice that only contains one class
            return []
        raise  # bare raise keeps the original traceback

    scores = clf.scores_
    # Stable sort: ngrams with equal scores keep their input order.
    sort_idx = sorted(range(len(scores)), key=lambda idx: -1 * scores[idx])
    return np.array(list_of_ngrams)[sort_idx]
def _sort_applicable_ngrams(self, list_of_ngrams, examples, labels):
    """Given an intent classification problem and a list of ngrams,
    creates ordered list of most useful ngrams.

    Labels with fewer than
    ``self.min_intent_examples_for_ngram_classification`` examples are
    dropped; the rest are used to fit a RandomizedLogisticRegression whose
    ``scores_`` rank the ngrams.

    Args:
        list_of_ngrams: Candidate ngrams to rank.
        examples: Training examples, aligned index-wise with ``labels``.
        labels: Intent label of each example.

    Returns:
        A numpy array of the ngrams ordered by descending score, or an empty
        list when there are no ngrams, fewer than two usable labels, or the
        randomized fit drew a sample containing a single class.

    Raises:
        ValueError: Any ValueError from the fit other than the
            "needs samples of at least 2 classes" case.
    """
    if not list_of_ngrams:
        return []

    import numpy as np

    examples_arr = np.array(examples)
    labels_arr = np.array(labels)

    # filter examples where we do not have enough labeled instances for cv;
    # one counting pass instead of rebuilding the arrays for every label
    unique_labels, counts = np.unique(labels_arr, return_counts=True)
    min_examples = self.min_intent_examples_for_ngram_classification
    usable_labels = [label for label, count in zip(unique_labels, counts)
                     if count >= min_examples]

    # Every usable label occurs in ``labels``, so having two of them also
    # guarantees that the mask below selects something.
    if len(usable_labels) < 2:
        # there is no example we can use for the cross validation
        return []

    # Deferred so the early exits above do not require scikit-learn.
    from sklearn import linear_model, preprocessing

    mask = np.isin(labels_arr, usable_labels)
    examples_arr = examples_arr[mask]
    labels_arr = labels_arr[mask]

    # NOTE(review): assumes _ngrams_in_sentences returns one feature row per
    # example - confirm against its definition.
    X = np.array(self._ngrams_in_sentences(examples_arr, list_of_ngrams))
    intent_encoder = preprocessing.LabelEncoder()
    intent_encoder.fit(labels_arr)
    y = intent_encoder.transform(labels_arr)

    clf = linear_model.RandomizedLogisticRegression(C=1)
    try:
        clf.fit(X, y)
    except ValueError as e:
        if "needs samples of at least 2 classes" in str(e):
            # we got unlucky during the random sampling :( and selected a
            # slice that only contains one class
            return []
        raise  # bare raise keeps the original traceback

    scores = clf.scores_
    # Stable sort: ngrams with equal scores keep their input order.
    sort_idx = sorted(range(len(scores)), key=lambda idx: -1 * scores[idx])
    return np.array(list_of_ngrams)[sort_idx]