test_bag_of_features.py 文件源码-python代码片段

test_bag_of_features.py 文件源码

python

阅读 35 收藏 0 点赞 0 评论 0

项目：xpandas 作者: alan-turing-institute 项目源码文件源码

def test_bag_of_words_for_series_pipeline():
    """Smoke-test a tokenise -> bag-of-words -> PCA pipeline on text data.

    The first ``n`` documents returned by ``fetch_20newsgroups`` are wrapped
    in an ``XSeries`` and pushed through a ``PipeLineChain`` made of:

    1. an ``XSeriesTransformer`` that turns each document into a list of
       lower-cased, punctuation-free tokens,
    2. a ``BagOfWordsTransformer``,
    3. a ``PCA`` reducing the result to 10 components.

    The test passes when fitting and transforming complete without raising
    and the pipeline hands back a result.
    """
    n = 100
    dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                                 remove=('headers', 'footers', 'quotes'))
    series = XSeries(dataset.data[:n])
    assert series.data_type == str

    # Build the translation table once; it deletes every character listed in
    # string.punctuation when passed to str.translate.
    translator = str.maketrans('', '', string.punctuation)

    def tokenize(text):
        """Lower-case *text*, drop punctuation and split on whitespace."""
        return text.lower().translate(translator).strip().split()

    pipeline = PipeLineChain([
        ('preprocessing', XSeriesTransformer(transform_function=tokenize)),
        ('extractor', BagOfWordsTransformer()),
        ('pca', PCA(n_components=10)),
    ])

    pipeline = pipeline.fit(series)
    transformed_series = pipeline.transform(series)

    # Minimal sanity check so the transform result is actually inspected.
    assert transformed_series is not None