def vectorize_fold(fold, tags, meta_df, use_metafeats=True):
    """Build the design matrix X and label vector y for one cross-validation fold.

    Loads the per-tag score pdicts, restricts ``meta_df`` to the given fold,
    and (optionally) crosses each predictor's scores with every metafeature.

    Args:
        fold: fold identifier; selects rows of ``meta_df`` and which pdicts load.
        tags: iterable of predictor tags — one score set is loaded per tag.
        meta_df: DataFrame with at least 'fold' and 'label' columns plus the
            ``metavectorize.metafeature_columns`` (when ``use_metafeats``).
        use_metafeats: if False, X is just the raw per-predictor score matrix.

    Returns:
        (X, y) where y is the 'label' column for the fold, and X has shape
        (n_rows, n_predictors) when ``use_metafeats`` is False, otherwise
        (n_rows, n_predictors * n_metafeats) laid out 'metafeat major' (see
        inline comment below).

    Raises:
        ValueError: if ``meta_df`` has no rows for the given fold.
    """
    with time_me('Loaded pdicts'):
        scoreses = [common.pdict_for_tag(tag, fold) for tag in tags]
    df = meta_df[meta_df['fold']==fold]
    # Explicit check rather than assert: asserts vanish under `python -O`,
    # and an empty fold would otherwise surface as a confusing shape error later.
    if not len(df):
        raise ValueError('No rows in meta_df for fold {!r}'.format(fold))
    y = df['label']
    n_predictors = len(scoreses)
    with time_me('Munged scores for {} predictors'.format(n_predictors), mode='print'):
        # TODO: could use the logit loading fn added to user_wrapper module
        scores = munge_scoreses(scoreses, df)
    if not use_metafeats:
        X = scores
    else:
        meta_cols = metavectorize.metafeature_columns
        meta = df[meta_cols].values
        # Special f_0 dummy meta feature (constant 1) so the model can learn a
        # vanilla per-predictor weight alongside the metafeature interactions.
        metafeats = np.hstack([np.ones( (len(df), 1) ), meta])
        # NOTE: sklearn.preprocessing.PolynomialFeatures could compute a similar
        # cross product, but the explicit tile/repeat below is simple enough.
        n_metafeats = metafeats.shape[1]
        # Lazy %-style args so the message is only formatted if INFO is enabled.
        logging.info('%s predictors x %s metafeatures -> %s coefs',
                n_predictors, n_metafeats, n_predictors*n_metafeats)
        # X is 'metafeat major'. i.e. the first n_p values for each vector are the
        # raw scores for each predictor, they're followed by each predictor's score
        # multiplied by the first metafeature and so on.
        X = np.tile(scores, n_metafeats) * np.repeat(metafeats, n_predictors, axis=1)
    return X, y