def learn_decision_tree(data):
    DT = tree.DecisionTreeClassifier(max_depth=7)
    scorer = make_scorer(matthews_corrcoef)
    for i in range(5):
        scores = cross_val_score(DT, data.X_train, data.y_train, cv=10, scoring=scorer)
        print("iteration", i, "dt mean:", scores.mean())
    scores = list(scores)
    print("Decision Tree train scores:\n", scores)
    return DT
    # DT = DT.fit(train_data[:, :-1], train_data[:, -1])
    # predictionsDT = DT.predict(validation_data[:, :-1])
    # validating predictions
    # dtError = 0
    # for i in range(0, len(validation_data)):
    #     if(validation_data[i][20] != predictionsDT[i]):
    #         dtError = dtError + 1
    # print("DT Error : ", float(dtError)/len(validation_data)*100.0)
Python make_scorer() usage examples (source code)
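Before the project-specific snippets that follow, here is a minimal, self-contained sketch (my own illustration, not taken from any of the listed projects) of what make_scorer does: it wraps a plain metric with the signature f(y_true, y_pred) into a scorer with the signature scorer(estimator, X, y), which is what cross_val_score and GridSearchCV expect.

# Minimal illustration of make_scorer (assumed example, not from the projects below).
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, matthews_corrcoef
from sklearn.model_selection import cross_val_score

X, y = make_classification(n_samples=200, random_state=0)
mcc_scorer = make_scorer(matthews_corrcoef)            # metric -> scorer
clf = LogisticRegression(max_iter=1000).fit(X, y)
print(mcc_scorer(clf, X, y))                           # scorer(estimator, X, y)
print(cross_val_score(clf, X, y, scoring=mcc_scorer, cv=5))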
def fit_model(X, y):
    classifier = svm.SVC()
    parameters = {'kernel': ['poly', 'rbf', 'sigmoid'], 'degree': [1, 2, 3], 'C': [0.1, 1, 10]}
    f1_scorer = make_scorer(performance_metric,
                            greater_is_better=True)
    clf = GridSearchCV(classifier,
                       param_grid=parameters,
                       scoring=f1_scorer)
    clf.fit(X, y)
    return clf

# Read student data
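The performance_metric passed to make_scorer above is defined elsewhere in that project and is not shown in this snippet; given the f1_scorer name, a plausible, purely hypothetical stand-in would be a thin F1 wrapper:

# Hypothetical stand-in for the performance_metric referenced above (not part of the original snippet).
from sklearn.metrics import f1_score

def performance_metric(y_true, y_pred):
    # average/pos_label may need adjusting to the project's actual label encoding
    return f1_score(y_true, y_pred)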
def rf_from_cfg(cfg, seed):
    """
    Creates a random forest regressor from sklearn and fits the given data on it.
    This is the function-call we try to optimize. Chosen values are stored in
    the configuration (cfg).

    Parameters:
    -----------
    cfg: Configuration
        configuration chosen by smac
    seed: int or RandomState
        used to initialize the rf's random generator

    Returns:
    -----------
    np.mean(rmses): float
        mean of root mean square errors of random-forest test predictions
        per cv-fold
    """
    rfr = RandomForestRegressor(
        n_estimators=cfg["num_trees"],
        criterion=cfg["criterion"],
        min_samples_split=cfg["min_samples_to_split"],
        min_samples_leaf=cfg["min_samples_in_leaf"],
        min_weight_fraction_leaf=cfg["min_weight_frac_leaf"],
        max_features=cfg["max_features"],
        max_leaf_nodes=cfg["max_leaf_nodes"],
        bootstrap=cfg["do_bootstrapping"],
        random_state=seed)

    def rmse(y, y_pred):
        return np.sqrt(np.mean((y_pred - y)**2))

    # Create a root-mean-square-error scorer for sklearn's cross-validation
    rmse_scorer = make_scorer(rmse, greater_is_better=False)
    score = cross_val_score(rfr, boston.data, boston.target, cv=11, scoring=rmse_scorer)
    return -1 * np.mean(score)  # Because cross_validation sign-flips the score
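The sign handling above is easy to get wrong, so here is a tiny stand-alone check (my own, using toy data and sklearn's DummyRegressor) of the convention: make_scorer(..., greater_is_better=False) negates the metric, so cross-validation returns -RMSE per fold and the final mean has to be negated again to report a positive RMSE.

# Assumed toy example demonstrating the greater_is_better=False sign convention.
import numpy as np
from sklearn.dummy import DummyRegressor
from sklearn.metrics import make_scorer

def rmse(y, y_pred):
    return np.sqrt(np.mean((y_pred - y) ** 2))

neg_rmse = make_scorer(rmse, greater_is_better=False)
X = np.arange(10).reshape(-1, 1)
y = np.arange(10, dtype=float)
est = DummyRegressor(strategy="mean").fit(X, y)
print(rmse(y, est.predict(X)))   # positive RMSE
print(neg_rmse(est, X, y))       # same value, negated by the scorer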
def __init__(self, corpus, relationtype, modelname="scikit_classifier"):
    super(ScikitRE, self).__init__()
    self.modelname = relationtype + "_" + modelname
    self.relationtype = relationtype
    self.pairtype = relationtype
    self.corpus = corpus
    self.pairs = []
    self.features = []
    self.labels = []
    self.pred = []
    self.clusters = word2vec.load_clusters("corpora/Thaliana/documents-processed-clusters.txt")
    self.posfmeasure = make_scorer(f1_score, average='binary', pos_label=True)
    self.generate_data(corpus, modelname, relationtype)
    self.text_clf = Pipeline([('vect', CountVectorizer(analyzer='char_wb', ngram_range=(3, 20), min_df=0.0, max_df=0.7)),
                              # ('vect', CountVectorizer(ngram_range=(1,3), binary=False, max_features=None)),
                              # ('tfidf', TfidfTransformer(use_idf=True, norm="l2")),
                              # ('clf', SGDClassifier(loss='hinge', penalty='l1', alpha=0.0001, n_iter=5, random_state=42)),
                              # ('clf', SGDClassifier())
                              # ('clf', svm.NuSVC(nu=0.01))
                              # ('clf', RandomForestClassifier(class_weight={False: 1, True: 2}, n_jobs=-1))
                              ('clf', MultinomialNB(alpha=0.01, fit_prior=False))
                              # ('clf', DummyClassifier(strategy="constant", constant=True))
                              ])
def build_grid_search(X, y):
    parameters = {
        "estimator__criterion": ['gini', 'entropy'],
        "estimator__max_depth": [10, 15, 20, 25, None],
        "estimator__max_features": ['auto', 'sqrt', 'log2', None]
    }
    ovr = OneVsRestClassifier(RandomForestClassifier(n_estimators=1000,
                                                     oob_score=True, n_jobs=-1, verbose=1))
    model_tunning = GridSearchCV(ovr, param_grid=parameters, verbose=1,
                                 n_jobs=-1, cv=10,
                                 scoring=make_scorer(f1_score))
    model_tunning.fit(X, y)
    test_score = model_tunning.best_score_
    print 'The best test score: ', test_score
    y_score = model_tunning.predict_proba(X_test)
    multiclass_roc(y_score, 'grid_search_02')
    return model_tunning
def __init__(self, name, classifier=None, number_gen=20,
             verbose=0, repeat=1, parallel=False,
             make_logbook=False, random_state=None,
             cv_metric_function=make_scorer(matthews_corrcoef),
             features_metric_function=None):
    self._name = name
    self.estimator = SVC(kernel='linear', max_iter=10000) if classifier is None else clone(classifier)
    self.number_gen = number_gen
    self.verbose = verbose
    self.repeat = repeat
    self.parallel = parallel
    self.make_logbook = make_logbook
    self.random_state = random_state
    self.cv_metric_function = cv_metric_function
    self.features_metric_function = features_metric_function
    self._random_object = check_random_state(self.random_state)
    random.seed(self.random_state)
def test_cross_val_score_with_score_func_regression():
    X, y = make_regression(n_samples=30, n_features=20, n_informative=5,
                           random_state=0)
    reg = Ridge()

    # Default score of the Ridge regression estimator
    scores = cross_val_score(reg, X, y, cv=5)
    assert_array_almost_equal(scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)

    # R2 score (aka. determination coefficient) - should be the
    # same as the default estimator score
    r2_scores = cross_val_score(reg, X, y, scoring="r2", cv=5)
    assert_array_almost_equal(r2_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)

    # Mean squared error; this is a loss function, so "scores" are negative
    mse_scores = cross_val_score(reg, X, y, cv=5, scoring="mean_squared_error")
    expected_mse = np.array([-763.07, -553.16, -274.38, -273.26, -1681.99])
    assert_array_almost_equal(mse_scores, expected_mse, 2)

    # Explained variance
    scoring = make_scorer(explained_variance_score)
    ev_scores = cross_val_score(reg, X, y, cv=5, scoring=scoring)
    assert_array_almost_equal(ev_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)
def test_cross_val_score_with_score_func_regression():
    X, y = make_regression(n_samples=30, n_features=20, n_informative=5,
                           random_state=0)
    reg = Ridge()

    # Default score of the Ridge regression estimator
    scores = cval.cross_val_score(reg, X, y, cv=5)
    assert_array_almost_equal(scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)

    # R2 score (aka. determination coefficient) - should be the
    # same as the default estimator score
    r2_scores = cval.cross_val_score(reg, X, y, scoring="r2", cv=5)
    assert_array_almost_equal(r2_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)

    # Mean squared error; this is a loss function, so "scores" are negative
    mse_scores = cval.cross_val_score(reg, X, y, cv=5,
                                      scoring="mean_squared_error")
    expected_mse = np.array([-763.07, -553.16, -274.38, -273.26, -1681.99])
    assert_array_almost_equal(mse_scores, expected_mse, 2)

    # Explained variance
    scoring = make_scorer(explained_variance_score)
    ev_scores = cval.cross_val_score(reg, X, y, cv=5, scoring=scoring)
    assert_array_almost_equal(ev_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)
def test_cross_val_score_multilabel():
    X = np.array([[-3, 4], [2, 4], [3, 3], [0, 2], [-3, 1],
                  [-2, 1], [0, 0], [-2, -1], [-1, -2], [1, -2]])
    y = np.array([[1, 1], [0, 1], [0, 1], [0, 1], [1, 1],
                  [0, 1], [1, 0], [1, 1], [1, 0], [0, 0]])
    clf = KNeighborsClassifier(n_neighbors=1)
    scoring_micro = make_scorer(precision_score, average='micro')
    scoring_macro = make_scorer(precision_score, average='macro')
    scoring_samples = make_scorer(precision_score, average='samples')
    score_micro = cval.cross_val_score(clf, X, y, scoring=scoring_micro, cv=5)
    score_macro = cval.cross_val_score(clf, X, y, scoring=scoring_macro, cv=5)
    score_samples = cval.cross_val_score(clf, X, y,
                                         scoring=scoring_samples, cv=5)
    assert_almost_equal(score_micro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 3])
    assert_almost_equal(score_macro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4])
    assert_almost_equal(score_samples, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4])
def main():
    import sys
    import numpy as np
    from sklearn import cross_validation
    from sklearn import svm
    import cPickle

    data_dir = sys.argv[1]
    fet_list = load_list(osp.join(data_dir, 'c3d.list'))
    pos_list = load_list(osp.join(data_dir, 'pos.urls'))
    features = np.load(osp.join(data_dir, 'c3d.npy'))
    fet_set = set(fet_list)
    pos_idx = [fet_list.index(i) for i in pos_list if i in fet_set]
    y = np.zeros(features.shape[0])
    y[pos_idx] = 1
    print 'n_pos', np.sum(y), 'n_neg', np.sum(1 - y)

    params = {'n_estimators': [2, 4, 5, 6, 8, 10, 30]}
    # params = {'n_estimators': [50, 70, 100, 120, 150, 200]}
    clf = grid_search.GridSearchCV(
        RandomForestClassifier(n_estimators=2, n_jobs=4), params,
        scoring=metrics.make_scorer(lambda yt, yp: metrics.f1_score(yt, yp, pos_label=0)),
        cv=5)
    clf.fit(features, y)
    print clf.best_score_
    print clf.best_estimator_
    cPickle.dump(clf.best_estimator_, open(osp.join(data_dir, 'c3d-models-rfc.pkl'), 'w'))
def opt_classifier(clf, params, features_train, labels_train, optimize=True):
    '''
    GridSearchCV to find optimal parameters of the classifier.
    '''
    if optimize:
        scorer = make_scorer(f1_score)
        clf = GridSearchCV(clf, params, scoring=scorer)
        clf = clf.fit(features_train, labels_train)
        clf = clf.best_estimator_
    else:
        clf = clf.fit(features_train, labels_train)
    return clf
Source file: MLNPCapstone.py (project: machine-learning-nanodegree-program-capstone, author: harrylippy)
def cross_validate(self):
    clf = self._clf[self._learner]
    (X_train, y_train) = self._train_data
    print " + Cross-validating classifier (learner = %s)..." \
        % self._learner,; stdout.flush()
    scores = cross_val_score(
        self._clf[self._learner],
        X_train, y_train,
        scoring=make_scorer(roc_auc_score),
        cv=3)
    print "done.\n * Scores: %r" % scores
def hierarchical_f_measure_scorer(graph):
    measure = partial(hierarchical_f_measure, graph)
    return make_scorer(measure)
def make_scoring(scoring):
    """
    Score is reversed if greater_is_better is False.
    """
    if scoring == 'r2':
        return metrics.make_scorer(metrics.r2_score)
    elif scoring == 'mean_absolute_error':
        return metrics.make_scorer(metrics.mean_absolute_error, greater_is_better=False)
    elif scoring == 'mean_squared_error':
        return metrics.make_scorer(metrics.mean_squared_error, greater_is_better=False)
    elif scoring == 'median_absolute_error':
        return metrics.make_scorer(metrics.median_absolute_error, greater_is_better=False)
    else:
        raise ValueError("Not supported scoring")
def make_scoring(scoring):
    if scoring == 'r2':
        return make_scorer(metrics.r2_score)
    elif scoring == 'mean_absolute_error':
        return make_scorer(metrics.mean_absolute_error, greater_is_better=False)
    elif scoring == 'mean_squared_error':
        return make_scorer(metrics.mean_squared_error, greater_is_better=False)
    elif scoring == 'median_absolute_error':
        return make_scorer(metrics.median_absolute_error, greater_is_better=False)
    else:
        raise ValueError("Not supported scoring")
def _make_scoring_r0(scoring):
    if scoring == 'r2':
        return metrics.make_scorer(metrics.r2_score)
    elif scoring == 'mean_absolute_error':
        return metrics.make_scorer(metrics.mean_absolute_error, greater_is_better=False)
    elif scoring == 'mean_squared_error':
        return metrics.make_scorer(metrics.mean_squared_error, greater_is_better=False)
    elif scoring == 'median_absolute_error':
        return metrics.make_scorer(metrics.median_absolute_error, greater_is_better=False)
    else:
        raise ValueError("Not supported scoring")
Source file: evaluation_custom-scoring-function-grid-search-runtime.py (project: Machine-and-Deep-Learning-Code-Notes, author: Dvshah13)
def my_custom_log_loss_func(ground_truth, p_predictions, penalty=list(), eps=1e-15):
    # As a general rule, the first parameter of your scoring function should be the
    # actual answer (ground_truth) and the second the predictions or predicted
    # probabilities (p_predictions).
    adj_p = np.clip(p_predictions, eps, 1 - eps)
    lb = LabelBinarizer()
    g = lb.fit_transform(ground_truth)
    if g.shape[1] == 1:
        g = np.append(1 - g, g, axis=1)
    if penalty:
        g[:, penalty] = g[:, penalty] * 2
    summation = np.sum(g * np.log(adj_p))
    return summation * (-1.0 / len(ground_truth))

# my_custom_scorer = make_scorer(my_custom_log_loss_func, greater_is_better=False, needs_proba=True, penalty=[4, 9])
# Here the penalty is set on the highly confusable digits 4 and 9 (change it, or leave it
# empty, to check that the resulting loss matches the previous experiment with
# sklearn.metrics.log_loss). This new loss function doubles the log_loss contribution
# of the classes for digits 4 and 9.
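The commented-out make_scorer call is the piece that plugs this custom loss into scikit-learn. A hedged sketch of how the resulting scorer could be used follows; the dataset and estimator (digits, LogisticRegression) are illustrative assumptions rather than part of the original notes, and needs_proba is the older make_scorer keyword used above (newer scikit-learn releases express the same thing via response_method="predict_proba").

# Assumed usage sketch for the custom scorer above (digits/LogisticRegression are illustrative choices).
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score

digits = load_digits()
# needs_proba=True hands predict_proba() output to the metric; greater_is_better=False
# flips the sign so grid search still maximizes the score.
my_custom_scorer = make_scorer(my_custom_log_loss_func, greater_is_better=False,
                               needs_proba=True, penalty=[4, 9])
clf = LogisticRegression(max_iter=1000)
print(cross_val_score(clf, digits.data, digits.target,
                      scoring=my_custom_scorer, cv=3))  # negative penalized log-loss per fold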
def search(X, y):
    rmse = make_scorer(RMSE, greater_is_better=False)
    param_test1 = {'n_estimators': range(150, 401, 50)}
    gsearch1 = GridSearchCV(
        estimator=RandomForestRegressor(min_samples_split=30, min_samples_leaf=20,
                                        max_depth=8, max_features='sqrt',
                                        random_state=10),
        param_grid=param_test1, scoring=rmse, cv=5)
    gsearch1.fit(X, y)
    print gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_
def crossV(model, X, y, folds=5):
    rmse = make_scorer(RMSE, greater_is_better=False)
    scores = cross_val_score(model, X, y, cv=folds, scoring=rmse, n_jobs=1)
    print scores
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
def test_grid_search_sparse_scoring():
    X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)

    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring="f1")
    cv.fit(X_[:180], y_[:180])
    y_pred = cv.predict(X_[180:])
    C = cv.best_estimator_.C

    X_ = sp.csr_matrix(X_)
    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring="f1")
    cv.fit(X_[:180], y_[:180])
    y_pred2 = cv.predict(X_[180:])
    C2 = cv.best_estimator_.C

    assert_array_equal(y_pred, y_pred2)
    assert_equal(C, C2)
    # Smoke test the score
    # np.testing.assert_allclose(f1_score(cv.predict(X_[:180]), y[:180]),
    #                            cv.score(X_[:180], y[:180]))

    # test loss where greater is worse
    def f1_loss(y_true_, y_pred_):
        return -f1_score(y_true_, y_pred_)

    F1Loss = make_scorer(f1_loss, greater_is_better=False)
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring=F1Loss)
    cv.fit(X_[:180], y_[:180])
    y_pred3 = cv.predict(X_[180:])
    C3 = cv.best_estimator_.C
    assert_equal(C, C3)
    assert_array_equal(y_pred, y_pred3)
def test_cross_val_score_score_func():
    clf = MockClassifier()
    _score_func_args = []

    def score_func(y_test, y_predict):
        _score_func_args.append((y_test, y_predict))
        return 1.0

    with warnings.catch_warnings(record=True):
        scoring = make_scorer(score_func)
        score = cross_val_score(clf, X, y, scoring=scoring)
    assert_array_equal(score, [1.0, 1.0, 1.0])
    assert len(_score_func_args) == 3
def test_cross_val_score_multilabel():
    X = np.array([[-3, 4], [2, 4], [3, 3], [0, 2], [-3, 1],
                  [-2, 1], [0, 0], [-2, -1], [-1, -2], [1, -2]])
    y = np.array([[1, 1], [0, 1], [0, 1], [0, 1], [1, 1],
                  [0, 1], [1, 0], [1, 1], [1, 0], [0, 0]])
    clf = KNeighborsClassifier(n_neighbors=1)
    scoring_micro = make_scorer(precision_score, average='micro')
    scoring_macro = make_scorer(precision_score, average='macro')
    scoring_samples = make_scorer(precision_score, average='samples')
    score_micro = cross_val_score(clf, X, y, scoring=scoring_micro, cv=5)
    score_macro = cross_val_score(clf, X, y, scoring=scoring_macro, cv=5)
    score_samples = cross_val_score(clf, X, y, scoring=scoring_samples, cv=5)
    assert_almost_equal(score_micro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 3])
    assert_almost_equal(score_macro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4])
    assert_almost_equal(score_samples, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4])
def test_make_scorer():
    # Sanity check on the make_scorer factory function.
    f = lambda *args: 0
    assert_raises(ValueError, make_scorer, f, needs_threshold=True,
                  needs_proba=True)
def test_raises_on_score_list():
    # Test that when a list of scores is returned, we raise proper errors.
    X, y = make_blobs(random_state=0)
    f1_scorer_no_average = make_scorer(f1_score, average=None)
    clf = DecisionTreeClassifier()
    assert_raises(ValueError, cross_val_score, clf, X, y,
                  scoring=f1_scorer_no_average)
    grid_search = GridSearchCV(clf, scoring=f1_scorer_no_average,
                               param_grid={'max_depth': [1, 2]})
    assert_raises(ValueError, grid_search.fit, X, y)
def fit_cv(self, data, labels, cv_params, epochs=10, **kwargs):
    n_jobs = kwargs.get('n_jobs', 1)
    iid = kwargs.get('iid', True)
    refit = kwargs.get('refit', True)
    cv = kwargs.get('cv', None)
    verbose = kwargs.get('verbose', 0)
    pre_dispatch = kwargs.get('pre_dispatch', '2*n_jobs')
    error_score = kwargs.get('error_score', 'raise')
    return_train_score = kwargs.get('return_train_score', True)

    param_dct = self.get_params()
    param_dct.update({'bootstrap_fraction': 1.0})
    rscv = GridSearchCV(SGDBolasso(**param_dct),
                        scoring=make_scorer(accuracy_score),
                        verbose=verbose,
                        param_grid=cv_params,
                        fit_params={'epochs': 1, 'verbose': 0},
                        cv=cv,
                        return_train_score=return_train_score,
                        n_jobs=n_jobs,
                        iid=iid,
                        refit=refit,
                        pre_dispatch=pre_dispatch,
                        error_score=error_score)
    rscv.fit(data, labels)

    param_dct = rscv.best_params_.copy()
    param_dct.update({'bootstrap_fraction': self.bootstrap_fraction})
    best_estim = SGDBolasso(**param_dct)
    best_estim.fit(data, labels, epochs=epochs)
    return best_estim, rscv
def __grid_search_model(self, clf_factory, documents, labels, pos_label):
    boolndarr = labels.values == pos_label
    n = documents.size
    n_pos = labels[boolndarr].size
    n_neg = n - n_pos

    param_grid = {
        'vect__binary': [False, True],
        'vect__min_df': [1, 2],
        'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
        'vect__smooth_idf': [False, True],
        'vect__stop_words': [None, 'english'],
        'vect__sublinear_tf': [False, True],
        'vect__use_idf': [False, True],
        'clf__alpha': [0, 0.01, 0.05, 0.1, 0.5, 1]
    }

    k = 5
    cv = ShuffleSplit(
        n,
        n_iter=k,
        test_size=1 / k,
        random_state=0
    )

    pos_weight = n_neg / n_pos
    sample_weight = np.ones(n)
    sample_weight[boolndarr] *= pos_weight
    fit_params = {'clf__sample_weight': sample_weight}

    f1_scorer = make_scorer(f1_score, pos_label=pos_label)

    grid_search = GridSearchCV(
        clf_factory,
        param_grid,
        cv=cv,
        fit_params=fit_params,
        n_jobs=-1,
        scoring=f1_scorer
    )
    grid_search.fit(documents, labels)

    best_estimator = grid_search.best_estimator_
    best_score = grid_search.best_score_
    best_params = grid_search.best_params_

    print("Best F1 score: {0:04.3f}".format(best_score))
    print("Parameters: {0}".format(best_params))

    return best_estimator
def train(self, a_train_data, a_dev_data=None, a_n_y=-1,
          a_i=-1, a_train_out=None, a_dev_out=None):
    """Method for training the model.

    Args:
      a_train_data (tuple[list, dict]):
        list of training JSON data
      a_dev_data (tuple[list, dict] or None):
        list of development JSON data
      a_n_y (int):
        number of distinct classes
      a_i (int):
        row index for the output predictions
      a_train_out (np.array or None):
        predictions for the training set
      a_dev_out (np.array or None):
        predictions for the development set

    Returns:
      void:

    Note:
      updates ``a_train_out`` and ``a_dev_out`` in place

    """
    self.n_y = a_n_y
    x_train, y_train = self._generate_ts(a_train_data)
    x_dev, y_dev = self._generate_ts(a_dev_data)
    # determine cross-validation and grid-search strategy and fit the model
    if self._gs:
        if a_dev_data is None or not a_dev_data[0]:
            cv = StratifiedKFold(y_train, n_folds=NFOLDS, shuffle=True)
        else:
            cv = self._devset_cv(y_train, len(y_dev), NFOLDS)
            x_train = x_train + x_dev
            y_train = y_train + y_dev
        scorer = make_scorer(f1_score, average="macro")
        self._model = GridSearchCV(self._model, self.PARAM_GRID,
                                   scoring=scorer,
                                   cv=cv, n_jobs=self.N_JOBS, verbose=1)
    self._model.fit([el[-1] for el in x_train], y_train)
    # output best hyper-parameters
    if self._gs:
        print("Best params:", repr(self._model.best_params_),
              file=sys.stderr)
    if a_i >= 0:
        if a_train_out is not None:
            if self._gs and a_dev_data and a_dev_data[0]:
                x_train = x_train[:-len(x_dev)]
            for i, x_i in x_train:
                self._predict(x_i, a_train_out[i], a_i)
        if a_dev_out is not None:
            for i, x_i in x_dev:
                self._predict(x_i, a_dev_out[i], a_i)
def greedy_select_features(self):
    print('initial shapes:', self.train_.shape, self.test_.shape)
    saved = None if self.debug_ else self.load('chosen_features')
    if saved == None:
        g_best_score = 1e9
        g_best_features = []
        current = set()
        finished = False
    else:
        g_best_features, g_best_score, finished = saved
        current = set(g_best_features)
        print('SFS REUSE:', g_best_score, len(current), sorted(g_best_features), self.now())
    if not finished:
        col_names = self.train_.columns
        y = self.y_.ravel()
        scorer = metrics.make_scorer(metrics.log_loss)
        loop_count = len(col_names) - len(g_best_features)
        for _ in range(loop_count):
            avail = set(col_names).difference(current)
            best_score = 1e9
            best_features = None
            for f in avail:
                newf = list(current | {f})
                score, _ = self.ccv(linear_model.BayesianRidge(), self.train_[newf], y, scorer)
                if best_score > score:
                    best_score = score
                    best_features = newf
            current = set(best_features)
            if g_best_score > best_score:
                g_best_score = best_score
                g_best_features = best_features
                print('new best:', g_best_score, sorted(g_best_features), self.now())
            else:
                print('no luck', len(current), self.now())
                if len(best_features) - len(g_best_features) >= 5:
                    break
            self.save('chosen_features', (g_best_features, g_best_score, False))
        # now
        self.save('chosen_features', (g_best_features, g_best_score, True))
    print('feature selection complete.', self.now())
    self.train_ = self.train_[g_best_features]
    self.test_ = self.test_[g_best_features]