def onehot_encode_bar(tr, te, cols=None, bar=10000):
    if cols is None:
        cols = [i for i in tr.columns.values if i in te.columns.values]
    vec = DictVectorizer()
    cat, num = [], []
    for col in cols:
        nu = tr[col].unique().shape[0]
        if (nu < bar and nu > 2) or tr[col].dtype == 'object':
            cat.append(col)
            tr[col] = tr[col].map(str)
            te[col] = te[col].map(str)
        else:
            num.append(col)
    print("start fitting num of cat features:", len(cat))
    X = vec.fit_transform(tr[cat].T.to_dict().values())
    Xt = vec.transform(te[cat].T.to_dict().values())
    print("done fitting", X.shape, Xt.shape)
    X = sparse.hstack([X, tr[num].values], format='csr')
    Xt = sparse.hstack([Xt, te[num].values], format='csr')
    return X, Xt
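A quick, hypothetical usage sketch for onehot_encode_bar; it assumes the imports the snippet relies on (pandas as pd, scipy.sparse as sparse, and DictVectorizer from sklearn.feature_extraction), and the toy frames are placeholders.

# Hedged example: 'city' and the low-cardinality 'clicks' become one-hot columns, 'is_weekend' stays numeric.
train = pd.DataFrame({'city': ['NY', 'LA', 'NY', 'SF'],
                      'clicks': [3, 5, 1, 7],
                      'is_weekend': [0, 1, 0, 1]})
test = pd.DataFrame({'city': ['LA', 'SF'],
                     'clicks': [2, 7],
                     'is_weekend': [1, 0]})
X_train, X_test = onehot_encode_bar(train, test)
print(X_train.shape, X_test.shape)   # train and test now share the same one-hot columns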
Python DictVectorizer() usage examples (source code)
def __init__(self, a_clf=None, a_grid_search=False):
    """Class constructor.

    Args:
      a_clf (classifier or None):
        classifier to use or None for default
      a_grid_search (bool): use grid search for estimating
        hyper-parameters

    """
    classifier = a_clf
    self._gs = a_grid_search
    if a_clf is None:
        classifier = XGBClassifier(max_depth=MAX_DEPTH,
                                   n_estimators=NTREES,
                                   learning_rate=ALPHA,
                                   objective="multi:softprob")
    self._clf = classifier
    # latest version of XGBoost cannot deal with non-sparse feature vectors
    self._model = Pipeline([("vect", DictVectorizer()),
                            ("clf", classifier)])
def _consolidate_pipeline(self, transformation_pipeline, final_model=None):
    # First, restrict our DictVectorizer or DataFrameVectorizer
    # This goes through and has DV only output the items that have passed our support mask
    # This has a number of benefits: speeds up computation, reduces memory usage, and combines several transforms into a single, easy step
    # It also significantly reduces the size of dv.vocabulary_ which can get quite large
    dv = transformation_pipeline.named_steps['dv']
    try:
        feature_selection = transformation_pipeline.named_steps['feature_selection']
        feature_selection_mask = feature_selection.support_mask
        dv.restrict(feature_selection_mask)
    except KeyError:
        pass

    # We have overloaded our _construct_pipeline method to work both to create a new pipeline from scratch at the start of training, and to go through a trained pipeline in exactly the same order and steps to take a dedicated FeatureSelection model out of an already trained pipeline
    # In this way, we ensure that we only have to maintain a single centralized piece of logic for the correct order a pipeline should follow
    trained_pipeline_without_feature_selection = self._construct_pipeline(trained_pipeline=transformation_pipeline, final_model=final_model)
    return trained_pipeline_without_feature_selection
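For reference, a standalone sketch of the restrict() call used above, with SelectKBest standing in for the feature-selection step; the data is purely illustrative.

# Hedged sketch of DictVectorizer.restrict(): drop vectorizer features that a
# feature-selection mask rejected, so transform() only emits the kept columns.
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectKBest, chi2

rows = [{'a': 1, 'b': 2, 'c': 0}, {'a': 0, 'b': 1, 'c': 3}]
labels = [0, 1]
dv = DictVectorizer()
X = dv.fit_transform(rows)
selector = SelectKBest(chi2, k=2).fit(X, labels)
dv.restrict(selector.get_support())   # keep only the selected columns
print(dv.get_feature_names())         # now lists just the surviving features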
def ohEncoding(data, cols, replace=False):
    if cols is None:
        cols = []
        for el, v in data.dtypes.items():
            if v == 'object':
                if el == 'key':
                    pass
                else:
                    cols.append(el)
        print("Categorical features not set, detected as categorical: %s" % str(cols))
    vec = DictVectorizer()
    mkdict = lambda row: dict((col, row[col]) for col in cols)
    vecData = pd.DataFrame(vec.fit_transform(data[cols].apply(mkdict, axis=1)).toarray())
    vecData.columns = vec.get_feature_names()
    vecData.index = data.index
    if replace is True:
        data = data.drop(cols, axis=1)
        data = data.join(vecData)
    return data, vecData, vec

# df, t, v = ohEncoding(df, col, replace=True)
def ohEncoding(data, cols=None, replace=False):
    if cols is None:
        cols = []
        for el, v in data.dtypes.items():
            if v == 'object':
                cols.append(el)
        print("Categorical features not set, detected as categorical: %s" % str(cols))
    vec = DictVectorizer()
    mkdict = lambda row: dict((col, row[col]) for col in cols)
    vecData = pd.DataFrame(vec.fit_transform(data[cols].apply(mkdict, axis=1)).toarray())
    vecData.columns = vec.get_feature_names()
    vecData.index = data.index
    if replace is True:
        data = data.drop(cols, axis=1)
        data = data.join(vecData)
    return data, vecData, vec
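A small, hypothetical usage sketch for ohEncoding (pandas imported as pd is assumed, as in the snippet above).

# Hedged example: one-hot encode the 'color' column and replace it in the frame.
df = pd.DataFrame({'color': ['red', 'blue', 'red'], 'size': [1, 2, 3]})
encoded_df, onehot_cols, vectorizer = ohEncoding(df, cols=['color'], replace=True)
print(encoded_df.columns.tolist())   # ['size', 'color=blue', 'color=red']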
def __init__(self, language='en', window_width=2, collapse_fes=True, target_size=None):
    """ Initializes the extractor.

    :param language: The language of the sentences that will be used
    :param window_width: how many tokens to look before and after each
     token when building its features.
    :param collapse_fes: Whether to collapse FEs to a single token
     or to keep them split.
    """
    self.language = language
    self.tagger = TTPosTagger(language)
    self.window_width = window_width
    self.collapse_fes = collapse_fes
    self.unk_feature = 'UNK'
    self.vectorizer = DictVectorizer()
    self.target_size = target_size
    self.reducer = TruncatedSVD(target_size) if target_size else None
    self.vocabulary = set()
    self.label_index = {}
    self.lu_index = {}
    self.stopwords = set(w.lower() for w in StopWords().words(language))
    self.start()
def __init__(self, a_clf=None, a_grid_search=False):
    """Class constructor.

    Initialize classifier.

    Args:
      a_clf (classifier or None):
        classifier to use or None for default
      a_grid_search (bool): use grid search for estimating hyper-parameters

    """
    classifier = a_clf or LinearSVC(C=DFLT_C,
                                    **DFLT_PARAMS)
    self._gs = a_grid_search
    self._model = Pipeline([("vect", DictVectorizer()),
                            ("clf", classifier)])
def _execute(self, sources, alignment_stream, interval):
    time_interval = TimeInterval(MIN_DATE, interval.end)
    param_doc = sources[0].window(time_interval, force_calculation=True).last()
    if param_doc is None:
        logging.debug("No model found in {} for time interval {}".format(sources[0].stream_id, time_interval))
        return

    steps = deserialise_json_pipeline({
        'vectorisation': DictVectorizer(sparse=False),
        'fill_missing': FillZeros(),
        'classifier': LinearDiscriminantAnalysis(),
        'label_encoder': LabelEncoder()
    }, param_doc.value)

    clf = Pipeline([(kk, steps[kk]) for kk in ('vectorisation', 'fill_missing', 'classifier')])
    locations = steps['label_encoder'].classes_
    data = sources[1].window(interval, force_calculation=True)
    for tt, dd in data:
        yield StreamInstance(tt, {locations[ii]: pp for ii, pp in enumerate(clf.predict_proba(dd)[0])})
def __init__(self, estimator, dtype=float, sparse=True):
    """
    :param estimator: scikit-learn classifier object.

    :param dtype: data type used when building feature array.
        scikit-learn estimators work exclusively on numeric data. The
        default value should be fine for almost all situations.

    :param sparse: Whether to use sparse matrices internally.
        The estimator must support these; not all scikit-learn classifiers
        do (see their respective documentation and look for "sparse
        matrix"). The default value is True, since most NLP problems
        involve sparse feature sets. Setting this to False may take a
        great amount of memory.
    :type sparse: boolean.
    """
    self._clf = estimator
    self._encoder = LabelEncoder()
    self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)
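For illustration, the effect of the sparse flag on the underlying DictVectorizer (a standalone sketch with toy data).

# Hedged sketch: DictVectorizer(sparse=True) yields a scipy.sparse matrix,
# while sparse=False yields a dense numpy array of the same values.
from sklearn.feature_extraction import DictVectorizer

measurements = [{'city': 'Dubai', 'temperature': 33.0},
                {'city': 'London', 'temperature': 12.0}]
X_sparse = DictVectorizer(sparse=True).fit_transform(measurements)   # CSR matrix
X_dense = DictVectorizer(sparse=False).fit_transform(measurements)   # numpy.ndarray
print(type(X_sparse), type(X_dense))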
def predict_function():
    x_list = []
    line_list = []
    line_dict = {}
    predict_doc = joblib.load('logreg.pkl')      # trained classifier
    feature_doc = joblib.load("word_vec.pkl")    # saved feature data (loaded but not used below)
    y_train, x_train = get_feature()
    line = "bad bad good good"
    line_list = line.split()
    # Zero-initialise every feature seen in training, then overlay the test line's word counts,
    # so the freshly fitted DictVectorizer below yields the same (sorted) columns as in training.
    for line in x_train:
        for key in line:
            line_dict[key] = 0
    line_dict.update(dict(Counter(line_list)))
    for a in sorted(line_dict.items(), key=lambda x: x[1]):
        print(a)
    x_list.append(line_dict)
    # print(x_list)  # debug output
    # exit()         # debug stop; commented out so the prediction below actually runs
    X = DictVectorizer().fit_transform(x_list)
    pred = predict_doc.predict(X)
    prob = predict_doc.predict_proba(X)
    for pred, prob in zip(pred, prob):
        print(pred, prob)
def dimension_reduction():
    X = PPMI_matrix()
    word_list = list()
    vecdict_list = list()
    for word, vector in sorted(X.items()):
        word_list.append(word)
        vecdict_list.append(dict(vector))
    Dic2Vec = DictVectorizer(sparse=True)
    vector_list = Dic2Vec.fit_transform(vecdict_list)
    X_svd = svds(vector_list, 300)
    X_pca = np.dot(X_svd[0], np.diag(X_svd[1]))
    word_matrix = dict()
    for word, vector in zip(word_list, X_pca):
        word_matrix[word] = vector
    return word_matrix
def dimension_compression():
    X_t_c = make_matrix()
    token_list = []
    contexts_list = []
    for token, contexts in sorted(X_t_c.items()):
        token_list.append(token)
        contexts_list.append(contexts)
    pca = PCA(n_components=300)
    DictoVec = DictVectorizer(sparse=True)
    sparse = DictoVec.fit_transform(contexts_list)
    print(sparse.shape)
    vec_list = pca.fit_transform(sparse.toarray())   # PCA needs a dense array input
    word_vec = {}
    for token, vec in zip(token_list, vec_list):
        word_vec[token] = vec
    return word_vec
def dim_reduction():
    dic2vec = DictVectorizer(sparse=True)
    PPMI = getPPMI()
    tc = list()
    token_list = list()
    for token, contexts in sorted(PPMI.items()):
        token_list.append(token)
        contexts = dict(contexts)
        tc.append(contexts)
    tc_vec = dic2vec.fit_transform(tc)
    tc_svd = svds(tc_vec, 300)
    tc_pca = np.dot(tc_svd[0], np.diag(tc_svd[1]))
    word_vec = dict()
    for token, vec in zip(token_list, tc_pca):
        word_vec[token] = vec
    return word_vec
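As a side note, sklearn's TruncatedSVD can produce an equivalent 300-dimensional embedding of the vectorized PPMI matrix in a single call; a hedged alternative sketch, where tc_vec is the sparse matrix built in dim_reduction above.

# Hedged alternative: TruncatedSVD.fit_transform corresponds to the U * diag(S) product computed from svds.
from sklearn.decomposition import TruncatedSVD

tc_embedded = TruncatedSVD(n_components=300, algorithm='arpack').fit_transform(tc_vec)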
def _consolidate_pipeline(self, transformation_pipeline, final_model=None):
    # First, restrict our DictVectorizer or DataFrameVectorizer
    # This goes through and has DV only output the items that have passed our support mask
    # This has a number of benefits: speeds up computation, reduces memory usage, and combines several transforms into a single, easy step
    # It also significantly reduces the size of dv.vocabulary_ which can get quite large
    try:
        feature_selection = transformation_pipeline.named_steps['feature_selection']
        feature_selection_mask = feature_selection.support_mask
        transformation_pipeline.named_steps['dv'].restrict(feature_selection_mask)
    except KeyError:
        pass

    # We have overloaded our _construct_pipeline method to work both to create a new pipeline from scratch at the start of training, and to go through a trained pipeline in exactly the same order and steps to take a dedicated FeatureSelection model out of an already trained pipeline
    # In this way, we ensure that we only have to maintain a single centralized piece of logic for the correct order a pipeline should follow
    trained_pipeline_without_feature_selection = self._construct_pipeline(trained_pipeline=transformation_pipeline, final_model=final_model)
    return trained_pipeline_without_feature_selection
def data2Vector(self):
    vec = DictVectorizer()
    dummy_x = vec.fit_transform(self.feature_list).toarray()
    lb = LabelBinarizer()
    dummy_y = lb.fit_transform(self.label_list)
    return dummy_x, dummy_y

# Here the decision tree uses the ID3 algorithm, which selects features by information gain.
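A toy sketch of the conversion data2Vector performs; the shapes of feature_list and label_list are assumptions based on the snippet.

# Hedged sketch: dict features to a dense one-hot array, string labels to binary indicators.
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelBinarizer

feature_list = [{'outlook': 'sunny', 'windy': 'false'},
                {'outlook': 'rainy', 'windy': 'true'}]
label_list = ['no', 'yes']
dummy_x = DictVectorizer().fit_transform(feature_list).toarray()
dummy_y = LabelBinarizer().fit_transform(label_list)
print(dummy_x.shape, dummy_y.shape)   # (2, 4) and (2, 1)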
def transform(self, documents):
    """
    Returns a dictionary of text features in advance of a DictVectorizer.
    """
    for document in documents:
        # Collect token and vocabulary counts
        counts = Counter(
            item[0] for para in document for sent in para for item in sent
        )

        # Yield structured information about the document
        yield {
            'paragraphs': len(document),
            'sentences': sum(len(para) for para in document),
            'words': sum(counts.values()),
            'vocab': len(counts),
        }
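A minimal sketch of consuming this generator with a DictVectorizer; the extractor instance and the nested document format are assumptions taken from the snippet.

# Hedged sketch: vectorize the per-document statistic dicts yielded by transform().
from sklearn.feature_extraction import DictVectorizer

vec = DictVectorizer()
X = vec.fit_transform(list(extractor.transform(documents)))   # one row per document
print(vec.get_feature_names())   # ['paragraphs', 'sentences', 'vocab', 'words']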
##########################################################################
## Model Building Functions
##########################################################################
def create_feature(sent_list):
    feature_ = []
    polarity = []
    # list of feature dicts, one per sentence
    features_ = []
    # vectorizer that maps feature dicts to a sparse matrix
    vec = DictVectorizer()
    for line in sent_list:
        sentence = line.strip('\n').split()
        sentence2 = sentence.pop(0)          # the leading token is the polarity label
        polarity.append(int(sentence2))
        # print(polarity)
        feature_ = feature(sentence)
        '''
        for word in feature(sentence):
            feature_.append(word)
        print(feature_)
        '''
        features_.append(feature_vector(feature_))
    x_feature = vec.fit_transform(features_)
    return x_feature, polarity
def getFeatures(numWordsToUse, allTweets, allTweetsSentiment):
    # each corpus's getFeatures function is responsible for somehow loading in its own allTweets and allTweetsSentiment data
    # then it has to ensure that data is tokenized (leveraging the modular tokenization functionality in utils)
    # then shuffle the dataset
    # then create the frequency distribution and popularWords
    # then extract features from each tweet, and un-combine the sentiment again
    global popularWords

    formattedTweets, sentiment, popularWords = utils.nlpFeatureEngineering(
            allTweets, allTweetsSentiment, 0, numWordsToUse, 'counts'
        )

    # right now we have a data structure roughly equivalent to a dense matrix, except each row is a dictionary
    # DictVectorizer performs two key functions for us:
    # 1. transforms each row from a dictionary into a vector, using consistent placing of keys into indexed positions within each vector
    # 2. returns sparse vectors, saving enormous amounts of memory, which becomes very useful when training our models
    sparseFeatures = dv.fit_transform(formattedTweets)

    return sparseFeatures, sentiment
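A toy illustration of those two behaviours (standalone; dv here is a fresh DictVectorizer rather than the module-level one assumed by the snippet).

# Hedged sketch: consistent column positions per key, and sparse output by default.
from sklearn.feature_extraction import DictVectorizer

rows = [{'good': 2, 'movie': 1}, {'bad': 1, 'movie': 1}]
dv = DictVectorizer()
X = dv.fit_transform(rows)        # scipy.sparse matrix, shape (2, 3)
print(dv.vocabulary_)             # {'bad': 0, 'good': 1, 'movie': 2} -- fixed column per key
print(X.toarray())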
def test_unseen_or_no_features():
    D = [{"camelot": 0, "spamalot": 1}]
    for sparse in [True, False]:
        v = DictVectorizer(sparse=sparse).fit(D)

        X = v.transform({"push the pram a lot": 2})
        if sparse:
            X = X.toarray()
        assert_array_equal(X, np.zeros((1, 2)))

        X = v.transform({})
        if sparse:
            X = X.toarray()
        assert_array_equal(X, np.zeros((1, 2)))

        try:
            v.transform([])
        except ValueError as e:
            assert_in("empty", str(e))
def onehot_encode(tr, te, cols=None):
    if cols is None:
        cols = [i for i in tr.columns.values if i in te.columns.values]
    vec = DictVectorizer()
    for col in cols:
        tr[col] = tr[col].map(str)
        te[col] = te[col].map(str)
    print("start fitting")
    X = vec.fit_transform(tr[cols].T.to_dict().values())
    Xt = vec.transform(te[cols].T.to_dict().values())
    print("done fitting", X.shape, Xt.shape)
    return X, Xt
def _validate_input_col_descriptions(self):
    found_output_column = False
    self.cols_to_ignore = []
    expected_vals = set(['categorical', 'text', 'nlp'])

    for key, value in self.column_descriptions.items():
        value = value.lower()
        self.column_descriptions[key] = value
        if value == 'output':
            self.output_column = key
            found_output_column = True
        elif value == 'date':
            self.date_cols.append(key)
        elif value == 'ignore':
            self.cols_to_ignore.append(key)
        elif value in expected_vals:
            pass
        else:
            raise ValueError('We are not sure how to process this column of data: ' + str(value) + '. Please pass in "output", "categorical", "ignore", "nlp", or "date".')

    if found_output_column is False:
        print('Here is the column_descriptions that was passed in:')
        print(self.column_descriptions)
        raise ValueError('In your column_descriptions, please make sure exactly one column has the value "output", which is the value we will be training models to predict.')

    # We will be adding one new categorical variable for each date col
    # Be sure to add it here so the rest of the pipeline knows to handle it as a categorical column
    for date_col in self.date_cols:
        self.column_descriptions[date_col + '_day_part'] = 'categorical'

# We use _construct_pipeline at both the start and end of our training.
# At the start, it constructs the pipeline from scratch
# At the end, it takes FeatureSelection out after we've used it to restrict DictVectorizer, and adds final_model back in if we did grid search on it
def chiSquare(train_data, train_classes, topK):
    vectorizer = DictVectorizer()

    # Fit and transform the train data.
    x_train = vectorizer.fit_transform(train_data)
    y_train = train_classes

    if (x_train.shape[1] < topK):
        topK = x_train.shape[1]

    selector = SelectKBest(chi2, k=topK)
    x_new = selector.fit_transform(x_train, y_train)

    return vectorizer.inverse_transform(selector.inverse_transform(x_new))
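A hypothetical usage sketch of chiSquare: feature dicts in, dicts restricted to the top-K chi-squared features out (the toy data below is illustrative).

# Hedged example: keep the 2 features most associated with the class labels.
train_data = [{'good': 2, 'plot': 1}, {'bad': 3, 'plot': 1}, {'good': 1, 'fun': 1}]
train_classes = [1, 0, 1]
reduced = chiSquare(train_data, train_classes, topK=2)
print(reduced)   # list of dicts containing only the selected features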