如何向矢量化数据集添加功能?

发布于 2021-01-29 14:59:29

我想写一个Naive Base文本分类器。由于sklearn不接受“文本格式”功能,因此我正在使用TfidfVectorizer对其进行转换。

我仅使用转换后的数据作为特征就能够成功创建此类分类。代码如下:

### text vectorization--go from strings to lists of numbers
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                         stop_words='english')

X_train_transformed = vectorizer.fit_transform(X_train_raw['url'])
X_test_transformed  = vectorizer.transform(X_test_raw['url'])

### feature selection, because text is super high dimensional and 
### can be really computationally chewy as a result
selector = SelectPercentile(f_classif, percentile=1)
selector.fit(X_train_transformed, y_train_raw)

X_train = selector.transform(X_train_transformed).toarray()
X_test  = selector.transform(X_test_transformed).toarray()

clf = GaussianNB()
clf.fit(X_train, y_train_raw)
.....

一切都按预期工作,但是当我想添加其他功能时遇到问题。指示天气的标志,给定的文本包含某个关键字。我尝试了多种方法来正确地转换“
url”功能,然后将转换后的功能与另一个布尔功能组合在一起,但是我没有成功。假设我有一个包含两个功能的熊猫框架,应该如何完成操作的任何提示:“
url”(我要转换)和“ contains_keyword”标志?

失败的解决方案如下所示:

vectorizer = CountVectorizer(min_df=1)
X_train_transformed = vectorizer.fit_transform(X_train_raw['url'])
X_test_transformed  = vectorizer.transform(X_test_raw['url'])
selector = SelectPercentile(f_classif, percentile=1)
selector.fit(X_train_transformed, y_train_raw)

X_train_selected = selector.transform(X_train_transformed)
X_test_selected  = selector.transform(X_test_transformed)

X_train_raw['transformed_url'] = X_train_selected.toarray().tolist()
X_train_without = X_train_raw.drop(['url'], axis=1)
X_train = X_train_without.values

这将产生包含布尔标志和列表的行,该列表是sklearn模型的错误输入。我不知道我应该如何正确地改变它。感谢您的帮助。

以下是测试数据:

url,target,ads_keyword
googleadapis l google com,1,True
googleadapis l google com,1,True
clients1 google com,1,False
c go-mpulse net,1,False
translate google pl,1,False

url-从DNS查询中获取的拆分域

target-分类的目标类

ads_keyword-表示天气的标记,“ url”包含“ ads”一词。

我想使用TfidfVectorizer转换“ url”,并将转换后的数据与“
ads_keyword”(以及将来可能更多的功能)一起用作训练朴素贝叶斯模型的功能。

关注者
0
被浏览
109
1 个回答
  • 面试哥
    面试哥 2021-01-29
    为面试而生,有面试问题,就找面试哥。

    这是一个演示,展示了如何结合特征以及如何使用调整超参数GridSearchCV

    不幸的是,您的样本数据集太小而无法训练真实模型。

    try:
        from pathlib import Path
    except ImportError:             # Python 2
        from pathlib2 import Path
    import os
    import re
    from pprint import pprint
    import pandas as pd
    import numpy as np
    from sklearn.base import BaseEstimator, TransformerMixin
    from sklearn.preprocessing import FunctionTransformer, LabelEncoder, LabelBinarizer, StandardScaler
    from sklearn.model_selection import train_test_split
    from sklearn.feature_selection import SelectPercentile
    from sklearn.feature_extraction import DictVectorizer
    from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
    from sklearn.model_selection import GridSearchCV
    from sklearn.linear_model import SGDClassifier
    from sklearn.naive_bayes import MultinomialNB, GaussianNB
    from sklearn.neural_network import MLPClassifier
    from sklearn.svm import SVC
    from sklearn.pipeline import Pipeline, FeatureUnion
    from sklearn.externals import joblib
    from scipy.sparse import csr_matrix, hstack
    
    
    class ColumnSelector(BaseEstimator, TransformerMixin):
    
        def __init__(self, name=None, position=None,
                     as_cat_codes=False, sparse=False):
            self.name = name
            self.position = position
            self.as_cat_codes = as_cat_codes
            self.sparse = sparse
    
        def fit(self, X, y=None):
            return self
    
        def transform(self, X, **kwargs):
            if self.name is not None:
                col_pos = X.columns.get_loc(self.name)
            elif self.position is not None:
                col_pos = self.position
            else:
                raise Exception('either [name] or [position] parameter must be not-None')
            if self.as_cat_codes and X.dtypes.iloc[col_pos] == 'category':
                    ret = X.iloc[:, col_pos].cat.codes
            else:
                ret = X.iloc[:, col_pos]
            if self.sparse:
                ret = csr_matrix(ret.values.reshape(-1,1))
            return ret
    
    union = FeatureUnion([
                ('text', 
                 Pipeline([
                    ('select', ColumnSelector('url')),
                    #('pct', SelectPercentile(percentile=1)),
                    ('vect', TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                             stop_words='english')),
                 ]) ),
                ('ads',
                 Pipeline([
                    ('select', ColumnSelector('ads_keyword', sparse=True,
                                              as_cat_codes=True)),
                    #('scale', StandardScaler(with_mean=False)),
                 ]) )
            ])
    
    pipe = Pipeline([
        ('union', union),
        ('clf', MultinomialNB())
    ])
    
    param_grid = [
        {
            'union__text__vect': [TfidfVectorizer(sublinear_tf=True,
                                                  max_df=0.5,
                                                  stop_words='english')],
            'clf': [SGDClassifier(max_iter=500)],
            'union__text__vect__ngram_range': [(1,1), (2,5)],
            'union__text__vect__analyzer': ['word','char_wb'],
            'clf__alpha': np.logspace(-5, 0, 6),
            #'clf__max_iter': [500],
        },
        {
            'union__text__vect': [TfidfVectorizer(sublinear_tf=True,
                                                  max_df=0.5,
                                                  stop_words='english')],
            'clf': [MultinomialNB()],
            'union__text__vect__ngram_range': [(1,1), (2,5)],
            'union__text__vect__analyzer': ['word','char_wb'],
            'clf__alpha': np.logspace(-4, 2, 7),
        },
        #{        # NOTE: does NOT support sparse matrices!
        #    'union__text__vect': [TfidfVectorizer(sublinear_tf=True,
        #                                          max_df=0.5,
        #                                          stop_words='english')],
        #    'clf': [GaussianNB()],
        #    'union__text__vect__ngram_range': [(1,1), (2,5)],
        #    'union__text__vect__analyzer': ['word','char_wb'],
        #},
    ]
    
    gs_kwargs = dict(scoring='roc_auc', cv=3, n_jobs=1, verbose=2)
    X_train, X_test, y_train, y_test = \
        train_test_split(df[['url','ads_keyword']], df['target'], test_size=0.33)
    grid = GridSearchCV(pipe, param_grid=param_grid, **gs_kwargs)
    grid.fit(X_train, y_train)
    
    # prediction
    predicted = grid.predict(X_test)
    


知识点
面圈网VIP题库

面圈网VIP题库全新上线,海量真题题库资源。 90大类考试,超10万份考试真题开放下载啦

去下载看看