def test_bag_of_words_for_series_pipeline():
    """Smoke-test a tokenise -> bag-of-words -> PCA pipeline on an XSeries.

    The first 100 documents of the 20 newsgroups corpus (headers, footers
    and quotes removed) are wrapped in an ``XSeries``, then a three-step
    ``PipeLineChain`` is fitted on them and applied back to the same series.

    The test passes when fitting and transforming complete without raising.
    NOTE(review): nothing is asserted about the transformed output because
    the return contract of ``PipeLineChain.transform`` is not visible here;
    add a shape/type assertion once it is confirmed.
    """
    n_documents = 100
    dataset = fetch_20newsgroups(
        shuffle=True,
        random_state=1,
        remove=('headers', 'footers', 'quotes'),
    )
    series = XSeries(dataset.data[:n_documents])
    assert series.data_type == str

    # Translation table that deletes every character in string.punctuation.
    punctuation_stripper = str.maketrans('', '', string.punctuation)

    def tokenize(text):
        """Lower-case *text*, drop punctuation and split on whitespace."""
        return text.lower().translate(punctuation_stripper).strip().split()

    # Every step is fitted from the series alone; no labels are passed.
    pipeline = PipeLineChain([
        ('preprocessing', XSeriesTransformer(transform_function=tokenize)),
        ('extractor', BagOfWordsTransformer()),
        ('pca', PCA(n_components=10)),
    ])
    pipeline = pipeline.fit(series)
    pipeline.transform(series)
test_bag_of_features.py 文件源码
python
阅读 27
收藏 0
点赞 0
评论 0
评论列表
文章目录