def greedy_select_features(self):
    # Greedy forward feature selection: repeatedly add the single feature that
    # most improves the cross-validated log-loss of a BayesianRidge fit.
    # assumes: import numpy as np; from sklearn import linear_model, metrics, model_selection
    saved = None if self.debug_ else self.load('chosen_features')
    if saved is None:
        print('initial shapes:', self.train_.shape, self.test_.shape)
        num_columns = self.train_.shape[1]
        col_names = [str(c) for c in range(num_columns)]
        self.train_.columns = col_names
        self.test_.columns = col_names
        g_best_score = 1e9
        g_best_features = None
        y = self.y_.ravel()
        current = set()
        # raw log-loss: lower is better, so the loop below minimizes by hand
        scorer = metrics.make_scorer(metrics.log_loss)
        for _ in range(num_columns):
            avail = set(col_names).difference(current)
            best_score = 1e9
            best_features = None
            for f in avail:
                newf = list(current | {f})
                cv = model_selection.cross_val_score(linear_model.BayesianRidge(),
                                                     self.train_[newf], y,
                                                     cv=self.n_fold_, n_jobs=-2,
                                                     scoring=scorer)
                score = np.mean(cv)
                if best_score > score:
                    best_score = score
                    best_features = newf
            current = set(best_features)
            if g_best_score > best_score:
                g_best_score = best_score
                g_best_features = best_features
                print('new best:', g_best_score, g_best_features, self.now())
            if len(best_features) - len(g_best_features) > 15:
                break  # more than 15 features added since the last improvement
        self.save('chosen_features', (g_best_features, None))
    else:
        g_best_features, _ = saved
    print('feature selection complete.', self.now())
    self.train_ = self.train_[g_best_features]
    self.test_ = self.test_[g_best_features]
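
# --- Aside: scorer orientation (a minimal sketch, not part of the class
# above). The method wraps metrics.log_loss in make_scorer with default
# arguments, so the scorer returns the raw loss and the caller minimizes the
# mean fold score by hand. When the scorer is handed to a maximizing utility
# such as GridSearchCV, the usual recipe is greater_is_better=False plus
# needs_proba=True (spelled response_method='predict_proba' in recent
# scikit-learn). The dataset and classifier below are illustrative only.
import numpy as np
from sklearn import datasets, linear_model, metrics, model_selection

X, y = datasets.load_breast_cancer(return_X_y=True)
# Negated log-loss on predicted probabilities: higher is now better, so it
# plugs directly into any maximizing search.
scorer = metrics.make_scorer(metrics.log_loss,
                             greater_is_better=False,
                             needs_proba=True)
clf = linear_model.LogisticRegression(max_iter=1000)
scores = model_selection.cross_val_score(clf, X, y, cv=5, scoring=scorer)
print('mean negated log-loss:', np.mean(scores))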
def greedy_select_features(self):
    # Resumable variant: checkpoints partial results so an interrupted run
    # can pick up where it left off.
    print('initial shapes:', self.train_.shape, self.test_.shape)
    saved = None if self.debug_ else self.load('chosen_features')
    if saved is None:
        g_best_score = 1e9
        g_best_features = []
        current = set()
        finished = False
    else:
        g_best_features, g_best_score, finished = saved
        current = set(g_best_features)
        print('SFS REUSE:', g_best_score, g_best_features, self.now())
    num_columns = self.train_.shape[1]
    col_names = [str(c) for c in range(num_columns)]
    self.train_.columns = col_names
    self.test_.columns = col_names
    if not finished:
        y = self.y_.ravel()
        scorer = metrics.make_scorer(metrics.log_loss)
        loop_count = len(col_names) - len(g_best_features)
        for _ in range(loop_count):
            avail = set(col_names).difference(current)
            best_score = 1e9
            best_features = None
            for f in avail:
                newf = list(current | {f})
                score, _ = self.ccv(linear_model.BayesianRidge(), self.train_[newf], y, scorer)
                if best_score > score:
                    best_score = score
                    best_features = newf
            current = set(best_features)
            if g_best_score > best_score:
                g_best_score = best_score
                g_best_features = best_features
                print('new best:', g_best_score, g_best_features, self.now())
            if len(best_features) - len(g_best_features) > 5:
                break  # more than 5 features added since the last improvement
            # checkpoint progress so an interrupted run can resume
            self.save('chosen_features', (g_best_features, g_best_score, False))
        # selection finished; persist the final feature set
        self.save('chosen_features', (g_best_features, g_best_score, True))
    print('feature selection complete.', self.now())
    self.train_ = self.train_[g_best_features]
    self.test_ = self.test_[g_best_features]
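
# --- Aside: the resumable variant above depends on save/load helpers that
# are not shown. A plausible pickle-backed sketch follows; the cache_dir_
# attribute and the on-disk layout are assumptions, not the original code.
import os
import pickle

def save(self, name, payload):
    # persist an intermediate result under a stable key
    with open(os.path.join(self.cache_dir_, name + '.pkl'), 'wb') as f:
        pickle.dump(payload, f)

def load(self, name):
    # return the cached payload, or None when no checkpoint exists yet
    path = os.path.join(self.cache_dir_, name + '.pkl')
    if not os.path.exists(path):
        return None
    with open(path, 'rb') as f:
        return pickle.load(f)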
def test_permutation_score():
    iris = load_iris()
    X = iris.data
    X_sparse = coo_matrix(X)
    y = iris.target
    svm = SVC(kernel='linear')
    cv = StratifiedKFold(2)

    score, scores, pvalue = permutation_test_score(
        svm, X, y, n_permutations=30, cv=cv, scoring="accuracy")
    assert_greater(score, 0.9)
    assert_almost_equal(pvalue, 0.0, 1)

    score_label, _, pvalue_label = permutation_test_score(
        svm, X, y, n_permutations=30, cv=cv, scoring="accuracy",
        labels=np.ones(y.size), random_state=0)
    assert_true(score_label == score)
    assert_true(pvalue_label == pvalue)

    # check that we obtain the same results with a sparse representation
    svm_sparse = SVC(kernel='linear')
    cv_sparse = StratifiedKFold(2)
    score_label, _, pvalue_label = permutation_test_score(
        svm_sparse, X_sparse, y, n_permutations=30, cv=cv_sparse,
        scoring="accuracy", labels=np.ones(y.size), random_state=0)
    assert_true(score_label == score)
    assert_true(pvalue_label == pvalue)

    # test with custom scoring object
    def custom_score(y_true, y_pred):
        return (((y_true == y_pred).sum() - (y_true != y_pred).sum())
                / y_true.shape[0])

    scorer = make_scorer(custom_score)
    score, _, pvalue = permutation_test_score(
        svm, X, y, n_permutations=100, scoring=scorer, cv=cv, random_state=0)
    assert_almost_equal(score, .93, 2)
    assert_almost_equal(pvalue, 0.01, 3)

    # set random y
    y = np.mod(np.arange(len(y)), 3)
    score, scores, pvalue = permutation_test_score(
        svm, X, y, n_permutations=30, cv=cv, scoring="accuracy")
    assert_less(score, 0.5)
    assert_greater(pvalue, 0.2)
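
# --- Aside: the same call outside the test scaffolding, as a minimal
# runnable sketch using the estimator and data from the test above.
# permutation_test_score refits the model on shuffled labels; the returned
# p-value is roughly the fraction of permutations (with add-one smoothing)
# that score at least as well as the true labels.
from sklearn.datasets import load_iris
from sklearn.model_selection import StratifiedKFold, permutation_test_score
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)
score, perm_scores, pvalue = permutation_test_score(
    SVC(kernel='linear'), X, y, cv=StratifiedKFold(2),
    n_permutations=30, scoring='accuracy', random_state=0)
print('true score: %.3f  p-value: %.3f' % (score, pvalue))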
def test_permutation_score():
    # Same test against the legacy sklearn.cross_validation API (pre-0.18),
    # imported here as cval; note the old StratifiedKFold(y, 2) signature.
    iris = load_iris()
    X = iris.data
    X_sparse = coo_matrix(X)
    y = iris.target
    svm = SVC(kernel='linear')
    cv = cval.StratifiedKFold(y, 2)

    score, scores, pvalue = cval.permutation_test_score(
        svm, X, y, n_permutations=30, cv=cv, scoring="accuracy")
    assert_greater(score, 0.9)
    assert_almost_equal(pvalue, 0.0, 1)

    score_label, _, pvalue_label = cval.permutation_test_score(
        svm, X, y, n_permutations=30, cv=cv, scoring="accuracy",
        labels=np.ones(y.size), random_state=0)
    assert_true(score_label == score)
    assert_true(pvalue_label == pvalue)

    # check that we obtain the same results with a sparse representation
    svm_sparse = SVC(kernel='linear')
    cv_sparse = cval.StratifiedKFold(y, 2)
    score_label, _, pvalue_label = cval.permutation_test_score(
        svm_sparse, X_sparse, y, n_permutations=30, cv=cv_sparse,
        scoring="accuracy", labels=np.ones(y.size), random_state=0)
    assert_true(score_label == score)
    assert_true(pvalue_label == pvalue)

    # test with custom scoring object
    def custom_score(y_true, y_pred):
        return (((y_true == y_pred).sum() - (y_true != y_pred).sum())
                / y_true.shape[0])

    scorer = make_scorer(custom_score)
    score, _, pvalue = cval.permutation_test_score(
        svm, X, y, n_permutations=100, scoring=scorer, cv=cv, random_state=0)
    assert_almost_equal(score, .93, 2)
    assert_almost_equal(pvalue, 0.01, 3)

    # set random y
    y = np.mod(np.arange(len(y)), 3)
    score, scores, pvalue = cval.permutation_test_score(
        svm, X, y, n_permutations=30, cv=cv, scoring="accuracy")
    assert_less(score, 0.5)
    assert_greater(pvalue, 0.2)
def test_rfecv():
    generator = check_random_state(0)
    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = list(iris.target)   # regression test: list should be supported

    # Test using the score function
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5)
    rfecv.fit(X, y)
    # non-regression test for missing worst feature:
    assert_equal(len(rfecv.grid_scores_), X.shape[1])
    assert_equal(len(rfecv.ranking_), X.shape[1])
    X_r = rfecv.transform(X)

    # All the noisy variables were filtered out
    assert_array_equal(X_r, iris.data)

    # same in sparse
    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5)
    X_sparse = sparse.csr_matrix(X)
    rfecv_sparse.fit(X_sparse, y)
    X_r_sparse = rfecv_sparse.transform(X_sparse)
    assert_array_equal(X_r_sparse.toarray(), iris.data)

    # Test using a customized loss function
    scoring = make_scorer(zero_one_loss, greater_is_better=False)
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5,
                  scoring=scoring)
    ignore_warnings(rfecv.fit)(X, y)
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    # Test using a scorer
    scorer = get_scorer('accuracy')
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5,
                  scoring=scorer)
    rfecv.fit(X, y)
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    # Test fix on grid_scores
    def test_scorer(estimator, X, y):
        return 1.0

    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5,
                  scoring=test_scorer)
    rfecv.fit(X, y)
    assert_array_equal(rfecv.grid_scores_, np.ones(len(rfecv.grid_scores_)))

    # Same as the first two tests, but with step=2
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=2, cv=5)
    rfecv.fit(X, y)
    assert_equal(len(rfecv.grid_scores_), 6)
    assert_equal(len(rfecv.ranking_), X.shape[1])
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=2, cv=5)
    X_sparse = sparse.csr_matrix(X)
    rfecv_sparse.fit(X_sparse, y)
    X_r_sparse = rfecv_sparse.transform(X_sparse)
    assert_array_equal(X_r_sparse.toarray(), iris.data)