def greedy_select_features(self):
    """Choose a feature subset by greedy forward selection and apply it.

    Columns of ``self.train_`` and ``self.test_`` are first renamed to the
    strings ``'0'``, ``'1'``, ... (both frames must therefore have the same
    number of columns).  Starting from the cached selection, if any, each
    round adds the single remaining column that yields the lowest score
    from ``self.ccv`` with a ``BayesianRidge`` model and a ``log_loss``
    scorer.  The best-scoring subset seen so far is tracked separately
    from the growing working set, and the search stops early once the
    working set is more than 5 features larger than that best subset.

    Progress is persisted under the key ``'chosen_features'`` as a tuple
    ``(features, score, finished)`` after every round, so a later call can
    resume or skip the search.  When ``self.debug_`` is truthy the cache
    is not read (it is still written).

    Side effects: prints progress, calls ``self.save``, renames the
    columns of both frames, and finally replaces ``self.train_`` and
    ``self.test_`` with their restriction to the selected columns.

    Raises:
        ValueError: if, in some round, no candidate feature produces a
            score below the 1e9 sentinel (e.g. every score is NaN).
    """
    print('initial shapes:', self.train_.shape, self.test_.shape)
    saved = None if self.debug_ else self.load('chosen_features')
    if saved is None:
        # 1e9 acts as "worse than any real score"; lower scores are better.
        g_best_score = 1e9
        g_best_features = []
        current = set()
        finished = False
    else:
        g_best_features, g_best_score, finished = saved
        # Resume from the best subset found so far.
        current = set(g_best_features)
        print('SFS REUSE:', g_best_score, g_best_features, self.now())

    # Normalise column labels to positional strings so that the saved
    # feature names are valid for both frames.
    num_columns = self.train_.shape[1]
    col_names = [str(c) for c in range(num_columns)]
    self.train_.columns = col_names
    self.test_.columns = col_names

    if not finished:
        y = self.y_.ravel()
        scorer = metrics.make_scorer(metrics.log_loss)
        all_columns = set(col_names)
        loop_count = len(col_names) - len(g_best_features)
        for _ in range(loop_count):
            # NOTE: set iteration order is arbitrary, so ties between
            # equally scoring candidates are broken arbitrarily.
            avail = all_columns.difference(current)
            best_score = 1e9
            best_features = None
            for f in avail:
                newf = list(current | {f})
                score, _ccv_extra = self.ccv(
                    linear_model.BayesianRidge(), self.train_[newf], y, scorer)
                if best_score > score:
                    best_score = score
                    best_features = newf
            if best_features is None:
                # Previously this surfaced as an opaque
                # "TypeError: 'NoneType' object is not iterable".
                raise ValueError(
                    'greedy_select_features: no candidate feature produced '
                    'a score below 1e9 in this round')
            # Always grow the working set, even if the round did not beat
            # the global best; the early-stop check below bounds how far
            # the search may run without improvement.
            current = set(best_features)
            if g_best_score > best_score:
                g_best_score = best_score
                g_best_features = best_features
                print('new best:', g_best_score, g_best_features, self.now())
            if len(best_features) - len(g_best_features) > 5:
                break
            # Checkpoint after each round so an interrupted run can resume.
            self.save('chosen_features', (g_best_features, g_best_score, False))
        # Search is over: mark the cached result as final.
        self.save('chosen_features', (g_best_features, g_best_score, True))

    print('feature selection complete.', self.now())
    self.train_ = self.train_[g_best_features]
    self.test_ = self.test_[g_best_features]
评论列表
文章目录