def evaluate_pca(dataset, iterator, args):
    """Evaluate the measure for every candidate number of PCA components.

    For each ``n_components`` in ``1 .. dataset.n_features - 1`` the function
    sets ``args.pca_components``, computes the log-likelihood statistics and
    the resulting measure, prints a progress report, and finally reports the
    best-scoring number of components. If ``args.output_dir`` is set, all
    results are written to ``<output_dir>/_results.csv`` (``;``-delimited),
    with the best row marked by ``*``.

    Args:
        dataset: Object exposing ``n_features``, ``feature_names`` and
            ``dataset_from_feature_names(features)``.
        iterator: Passed through unchanged to
            ``_compute_averaged_pos_and_neg_lls``.
        args: Options object. Reads ``features``, ``transformers`` and
            ``output_dir``; ``'pca'`` is appended to ``args.transformers`` if
            missing and ``args.pca_components`` is overwritten on every step.

    Returns:
        None. Results are printed and optionally written to disk.
    """
    # Local import so this block does not depend on the module's import list.
    import sys

    # Select features
    if args.features is not None and args.features != dataset.feature_names:
        print('selecting features ...')
        features = _explode_features(args.features)
        start = timeit.default_timer()
        dataset = dataset.dataset_from_feature_names(features)
        print('done, took %fs' % (timeit.default_timer() - start))
        print('')

    # NOTE(review): the sweep stops at n_features - 1, i.e. the full-rank
    # projection is never evaluated -- confirm this is intended.
    pca_components = list(range(1, dataset.n_features))
    total_steps = len(pca_components)
    if 'pca' not in args.transformers:
        args.transformers.append('pca')

    measures = []
    for curr_step, n_components in enumerate(pca_components, start=1):
        prefix = '%.3d' % curr_step
        print('(%.3d/%.3d) evaluating with %d pca components ...' % (curr_step, total_steps, n_components))
        start = timeit.default_timer()
        args.pca_components = n_components
        try:
            ll_stats = _compute_averaged_pos_and_neg_lls(dataset, iterator, prefix, args)
            measure = _compute_measure(ll_stats, dataset, args)
        except Exception as exc:
            # Best effort: a failing configuration must not abort the whole
            # sweep, but the reason is reported instead of silently dropped.
            # KeyboardInterrupt/SystemExit are deliberately not caught.
            print('evaluation failed: %r' % (exc,))
            measure = np.nan

        if np.isnan(measure):
            print('measure: not computable')
        else:
            print('measure: %f' % measure)
            # Correct score. The problem is that it is computed given the
            # dataset, which has too many features.
            measure = (measure * float(dataset.n_features)) / float(n_components)
        measures.append(measure)
        print('done, took %fs' % (timeit.default_timer() - start))
        print('')

    # np.nanargmax raises ValueError on an empty or all-NaN array, so guard
    # against sweeps where nothing could be evaluated.
    scores = np.array(measures, dtype=float)
    if scores.size == 0 or np.all(np.isnan(scores)):
        best_idx = None
        print('no valid result: the measure was not computable for any number of PCA components')
    else:
        best_idx = int(np.nanargmax(scores))  # argmax ignoring NaNs
        print('best result with score %f: %d PCA components' % (measures[best_idx], pca_components[best_idx]))
    print('detailed reports have been saved')

    # Save results
    if args.output_dir is not None:
        filename = '_results.csv'
        path = os.path.join(args.output_dir, filename)
        # The csv module wants a binary file on Python 2 but a text file
        # opened with newline='' on Python 3.
        if sys.version_info[0] >= 3:
            f = open(path, 'w', newline='')
        else:
            f = open(path, 'wb')
        with f:
            writer = csv.writer(f, delimiter=';')
            writer.writerow(['', 'idx', 'measure', 'components'])
            for idx, (measure, n_components) in enumerate(zip(measures, pca_components)):
                selected = '*' if best_idx == idx else ''
                writer.writerow([selected, '%d' % idx, '%f' % measure, '%d' % n_components])
evaluate_new.py 文件源码
python
阅读 28
收藏 0
点赞 0
评论 0
评论列表
文章目录