def _vectorize_chunk(dsid_dir, k, pars, pretend=False):
    """Extract hashed text features for the k-th chunk of input files.

    Parameters
    ----------
    dsid_dir : pathlib.Path
        dataset directory; the chunk's feature matrix is dumped to
        ``dsid_dir / 'features-{k:05}'``
    k : int
        zero-based chunk index
    pars : dict
        must contain 'filenames_abs' (list of absolute file paths),
        'chunk_size' and 'n_samples'; optional vectorizer options:
        'stop_words', 'n_features', 'analyzer' (the legacy misspelling
        'analyser' is also accepted) and 'ngram_range'
    pretend : bool, optional
        if True, build and return the configured vectorizer without
        reading any files or writing anything to disk

    Returns
    -------
    HashingVectorizer or None
        the vectorizer when ``pretend`` is True, otherwise None
        (features are written to disk as a side effect)
    """
    from sklearn.feature_extraction.text import HashingVectorizer
    try:
        # sklearn.externals.joblib was deprecated in scikit-learn 0.21
        # and removed in 0.23; prefer the standalone joblib package.
        import joblib
    except ImportError:
        from sklearn.externals import joblib

    filenames = pars['filenames_abs']
    chunk_size = pars['chunk_size']
    n_samples = pars['n_samples']
    # Slice of the global file list covered by this chunk; the final
    # chunk may be shorter than chunk_size.
    mslice = slice(k*chunk_size, min((k+1)*chunk_size, n_samples))

    hash_opts = {key: vals for key, vals in pars.items()
                 if key in ['stop_words', 'n_features',
                            'analyzer', 'ngram_range']}
    # BUG FIX: HashingVectorizer's keyword is spelled 'analyzer'; the
    # previous filter used 'analyser', so any pars carrying that option
    # raised TypeError when passed through **hash_opts. Accept the old
    # spelling for backward compatibility.
    if 'analyser' in pars and 'analyzer' not in hash_opts:
        hash_opts['analyzer'] = pars['analyser']
    # Non-negative counts are required by downstream consumers
    # (alternate_sign=True would produce signed hashed features).
    hash_opts['alternate_sign'] = False
    fe = HashingVectorizer(input='content', norm=None, **hash_opts)
    if pretend:
        return fe
    # Generator expression streams file contents lazily, so only one
    # chunk's worth of raw text is in memory at a time.
    fset_new = fe.transform(_read_file(fname) for fname in filenames[mslice])
    fset_new.eliminate_zeros()
    joblib.dump(fset_new, str(dsid_dir / 'features-{:05}'.format(k)))
# NOTE(review): removed two lines of page-scrape residue ("评论列表" /
# "文章目录" — a blog's comment-list / table-of-contents labels) that were
# not Python and broke parsing of this file.