import numpy as np
import pytest
import scipy.sparse
from numpy.testing import assert_allclose
from sklearn.preprocessing import normalize

# FeatureVectorizer, check_cache and data_dir come from the surrounding test
# module; the parametrize values below are illustrative placeholders.
@pytest.mark.parametrize('analyzer, ngram_range, use_hashing',
                         [('word', (1, 1), 'hashed'), ('word', (1, 1), 'non-hashed')])
def test_feature_extraction_tokenization(analyzer, ngram_range, use_hashing):
    cache_dir = check_cache()
    use_hashing = (use_hashing == 'hashed')  # map the string flag to a bool
    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup(analyzer=analyzer, ngram_range=ngram_range,
                    use_hashing=use_hashing)
    fe.ingest(data_dir, file_pattern=r'.*\d.txt')  # raw string: \d is a regex escape
    res2 = fe._load_features(uuid)
    assert isinstance(res2, np.ndarray) or scipy.sparse.issparse(res2), \
        "not an array {}".format(res2)
    assert np.isfinite(res2.data).all()  # no NaN or inf among the stored values
    # rows are L2-normalized, so re-normalizing must leave the data unchanged
    assert_allclose(normalize(res2).data, res2.data)
    fe.delete()
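
For reference, the L2-normalization invariant asserted above can be reproduced with plain scikit-learn. This is a standalone sketch, not part of the test suite: TfidfVectorizer (which applies norm='l2' by default) and the toy corpus are illustrative stand-ins for the vectorizer under test.

import numpy as np
from numpy.testing import assert_allclose
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize

# toy corpus; TfidfVectorizer L2-normalizes each row by default
docs = ["the cat sat", "the dog ran", "cats and dogs"]
X = TfidfVectorizer().fit_transform(docs)

# every row of X has unit L2 norm ...
row_norms = np.sqrt(np.asarray(X.multiply(X).sum(axis=1)).ravel())
assert_allclose(row_norms, 1.0)
# ... so re-normalizing is a no-op, exactly what the test checks
assert_allclose(normalize(X).data, X.data)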