Example source code for the Python Pipeline() class

cross_genre_profiler.py (project: magic, author: pan-webis-de)
def __init__(self, lang=None, method=None, features=None):
        fs = []
        if 'unigram' in features:
            fs.append(word_unigrams())
        if 'bigram' in features:
            fs.append(word_bigrams())
        if 'spelling' in features:
            fs.append(avg_spelling_error(lang=lang))
        if 'punctuation' in features:
            fs.append(punctuation_features())
        if 'char' in features:
            fs.append(char_ngrams())

        fu = FeatureUnion(fs, n_jobs=1)
        self.pipeline = Pipeline([('features', fu),
                                  ('scale', Normalizer()),
                                  ('classifier', get_classifier(method=method))])
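For context, a minimal usage sketch of the constructor above; the enclosing class name (here Profiler), the training data, and the method value are hypothetical, and the feature helpers (word_unigrams() and friends) are assumed importable from the project:

# Hypothetical usage sketch of the profiler defined above
profiler = Profiler(lang='en', method='svm', features=['unigram', 'punctuation'])
profiler.pipeline.fit(train_docs, train_labels)      # train_docs: list of raw text strings
predictions = profiler.pipeline.predict(test_docs)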
04_sent.py (project: Building-Machine-Learning-Systems-With-Python-Second-Edition, author: PacktPublishing)
def create_union_model(params=None):
    def preprocessor(tweet):
        tweet = tweet.lower()

        for k in emo_repl_order:
            tweet = tweet.replace(k, emo_repl[k])
        for r, repl in re_repl.items():  # .items() works under both Python 2 and 3
            tweet = re.sub(r, repl, tweet)

        return tweet.replace("-", " ").replace("_", " ")

    tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor,
                                   analyzer="word")
    ling_stats = LinguisticVectorizer()
    all_features = FeatureUnion(
        [('ling', ling_stats), ('tfidf', tfidf_ngrams)])
    #all_features = FeatureUnion([('tfidf', tfidf_ngrams)])
    #all_features = FeatureUnion([('ling', ling_stats)])
    clf = MultinomialNB()
    pipeline = Pipeline([('all', all_features), ('clf', clf)])

    if params:
        pipeline.set_params(**params)

    return pipeline
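Hyperparameters of the nested steps are addressed with sklearn's step__param naming convention; a hedged sketch (the parameter values are illustrative):

# 'all' is the FeatureUnion step, 'tfidf' the vectorizer inside it, 'clf' the classifier
model = create_union_model(params={
    'all__tfidf__ngram_range': (1, 2),
    'all__tfidf__min_df': 1,
    'clf__alpha': 0.01,
})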
03_clean.py (project: Building-Machine-Learning-Systems-With-Python-Second-Edition, author: PacktPublishing)
def create_ngram_model(params=None):
    def preprocessor(tweet):
        global emoticons_replaced
        tweet = tweet.lower()

        for k in emo_repl_order:
            tweet = tweet.replace(k, emo_repl[k])
        for r, repl in re_repl.items():  # .items() works under both Python 2 and 3
            tweet = re.sub(r, repl, tweet)

        return tweet

    tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor,
                                   analyzer="word")
    clf = MultinomialNB()
    pipeline = Pipeline([('tfidf', tfidf_ngrams), ('clf', clf)])

    if params:
        pipeline.set_params(**params)

    return pipeline
model.py (project: wende, author: h404bi)
def init_model():
        # "Question trunk" features
        f_trunk = QuestionTrunkVectorizer(tokenizer=tokenize)

        # Word2Vec embedding features
        f_word2vec = Question2VecVectorizer(tokenizer=tokenize)

        # Combined features (400 dimensions)
        union_features = FeatureUnion([
            ('f_trunk_lsa', Pipeline([
                ('trunk', f_trunk),
                # Dimensionality reduction: latent semantic analysis (LSA)
                ('lsa', TruncatedSVD(n_components=200, n_iter=10))
            ])),
            ('f_word2vec', f_word2vec),
        ])

        model = Pipeline([('union', union_features), ('clf', LinearSVC(C=0.02))])
        return model
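A minimal usage sketch (the training-data names are hypothetical); the model consumes raw question strings and assumes the project's tokenize function and vectorizers are available:

model = init_model()
model.fit(train_questions, train_labels)   # train_questions: list of raw question strings
predicted = model.predict(test_questions)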
data_preparation.py (project: healthcareai-py, author: HealthCatalyst)
def full_pipeline(model_type, predicted_column, grain_column, impute=True, verbose=True):
    """
    Builds the data preparation pipeline. Sequentially runs transformers and filters to clean and prepare the data.

    Note advanced users may wish to use their own custom pipeline.
    """

    # Note: this could be done more elegantly using FeatureUnions _if_ the later pipelines did not take
    #   pandas dataframes as inputs, since FeatureUnion intrinsically converts its outputs to numpy arrays.
    pipeline = Pipeline([
        ('remove_DTS_columns', hcai_filters.DataframeColumnSuffixFilter()),
        ('remove_grain_column', hcai_filters.DataframeColumnRemover(grain_column)),
        # Perform one of two basic imputation methods
        # TODO we need to think about making this optional to solve the problem of rare and very predictive values
        ('imputation', hcai_transformers.DataFrameImputer(impute=impute, verbose=verbose)),
        ('null_row_filter', hcai_filters.DataframeNullValueFilter(excluded_columns=None)),
        ('convert_target_to_binary', hcai_transformers.DataFrameConvertTargetToBinary(model_type, predicted_column)),
        ('prediction_to_numeric', hcai_transformers.DataFrameConvertColumnToNumeric(predicted_column)),
        ('create_dummy_variables', hcai_transformers.DataFrameCreateDummyVariables(excluded_columns=[predicted_column])),
    ])
    return pipeline
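A hedged usage sketch (the input file, DataFrame, and column names are hypothetical); the pipeline consumes and returns a pandas DataFrame:

import pandas as pd

df = pd.read_csv('patients.csv')   # hypothetical input data
pipeline = full_pipeline('classification', 'thirty_day_readmit', 'patient_id')
clean_df = pipeline.fit_transform(df)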
RBFTrainer.py (project: Steal-ML, author: ftramer)
def grid_retrain_in_f(self, n_dim=500):
        rbf_map = RBFSampler(n_dim, random_state=1)
        fourier_approx_svm = pipeline.Pipeline([("mapper", rbf_map),
                                                ("svm", LinearSVC())])

        # C_range = np.logspace(-5, 15, 21, base=2)
        # gamma_range = np.logspace(-15, 3, 19, base=2)
        # param_grid = dict(mapper__gamma=gamma_range, svm__C=C_range)
        # cv = StratifiedShuffleSplit(Y, n_iter=5, test_size=0.2, random_state=42)
        # grid = GridSearchCV(fourier_approx_svm, param_grid=param_grid, cv=cv)
        # grid.fit(X, Y)
        #
        # rbf_svc2 = grid.best_estimator_

        rbf_svc2 = fourier_approx_svm
        rbf_svc2.fit(self.X_ex, self.y_ex)

        self.set_clf2(rbf_svc2)
        return self.benchmark()
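The commented-out block above sketches a kernel-parameter grid search over the pipeline; restored here as a runnable sketch under the modern sklearn.model_selection API (X and Y stand for the training data, self.X_ex and self.y_ex in this class):

from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
import numpy as np

C_range = np.logspace(-5, 15, 21, base=2)
gamma_range = np.logspace(-15, 3, 19, base=2)
param_grid = dict(mapper__gamma=gamma_range, svm__C=C_range)
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
grid = GridSearchCV(fourier_approx_svm, param_grid=param_grid, cv=cv)
grid.fit(X, Y)
rbf_svc2 = grid.best_estimator_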
model_test.py (project: odin, author: imito)
def test_transform_then_prediction(self):
        with TemporaryDirectory() as temp:
            from sklearn.pipeline import Pipeline
            path = os.path.join(temp, 'audio.sph')
            urlretrieve(filename=path,
                        url='https://s3.amazonaws.com/ai-datasets/sw02001.sph')
            f = Pipeline([
                ('mspec', model.SpeechTransform('mspec', fs=8000, vad=False)),
                ('slice', model.Transform(lambda x: x[:, :40])),
                ('pred', model.SequentialModel(N.Dropout(0.3),
                                               N.Dense(20, activation=K.relu),
                                               N.Dense(10, activation=K.softmax))
                )
            ])
            x1 = f.predict(path)
            x2 = f.predict_proba(path)

            f = cPickle.loads(cPickle.dumps(f))
            y1 = f.predict(path)
            y2 = f.predict_proba(path)
            self.assertEqual(np.array_equal(x1, y1), True)
            self.assertEqual(np.array_equal(x2, y2), True)
model_test.py (project: odin, author: imito)
def test_complex_transform(self):
        with TemporaryDirectory() as temp:
            from sklearn.pipeline import Pipeline
            path = os.path.join(temp, 'audio.sph')
            urlretrieve(filename=path,
                        url='https://s3.amazonaws.com/ai-datasets/sw02001.sph')
            f = Pipeline([
                ('step1', model.SpeechTransform('mspec', fs=8000, vad=True)),
                ('step2', model.Transform(lambda x: (x[0][:, :40],
                                                     x[1].astype(str)))),
                ('step3', model.Transform(lambda x: (np.sum(x[0]),
                                                    ''.join(x[1].tolist()))))
            ])
            x = f.transform(path)
            f = cPickle.loads(cPickle.dumps(f))
            y = f.transform(path)
            self.assertEqual(x[0], y[0])
            self.assertEqual(y[0], -3444229.0)
            self.assertEqual(x[1], y[1])
enron_poi_ml_ci.py (project: machine-learning, author: cinserra)
def transform_pca(clf_list):
    '''
    From classifier list to pipeline list of the same classifiers and PCA.
    '''

    pca = PCA()
    params_pca = {"pca__n_components":[2, 3, 4, 5, 10, 15, 20], "pca__whiten": [False]}

    for j in range(len(clf_list)):

        name = "clf_" + str(j)
        clf, params = clf_list[j]

        # GridSearchCV parameter names for pipeline steps need a double
        # underscore between the step name and the parameter name.
        new_params = {}
        for key, value in params.items():  # .items() works under both Python 2 and 3
            new_params[name + "__" + key] = value

        new_params.update(params_pca)
        clf_list[j] = (Pipeline([("pca", pca), (name, clf)]), new_params)

    return clf_list
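A hedged sketch of how the returned (pipeline, params) pairs might feed a grid search (the training-data names and scoring choice are illustrative):

from sklearn.model_selection import GridSearchCV

for pipe, params in transform_pca(clf_list):
    grid = GridSearchCV(pipe, param_grid=params, scoring='f1')
    grid.fit(features_train, labels_train)
    print(grid.best_params_, grid.best_score_)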
pca.py (project: yellowbrick, author: DistrictDataLabs)
def __init__(self, X=None, y=None, ax=None, scale=True, color=None, proj_dim=2,
                 colormap=palettes.DEFAULT_SEQUENCE, **kwargs):
        super(PCADecomposition, self).__init__(ax=ax, **kwargs)
        # Data Parameters
        if proj_dim not in (2, 3):
            raise YellowbrickValueError("proj_dim object is not 2 or 3.")

        self.color = color
        self.pca_features_ = None
        self.scale = scale
        self.proj_dim = proj_dim
        self.pca_transformer = Pipeline([('scale', StandardScaler(with_std=self.scale)),
                                         ('pca', PCA(self.proj_dim, ))
                                         ])
        # Visual Parameters
        self.colormap = colormap
test_bestfit.py (project: yellowbrick, author: DistrictDataLabs)
def test_select_best(self):
        """
        Test the select best fit estimator
        """
        X, y = ANSCOMBE[1]
        X = np.array(X)
        y = np.array(y)
        X = X[:,np.newaxis]

        model = fit_select_best(X, y)
        self.assertIsNotNone(model)
        self.assertIsInstance(model, Pipeline)

        X, y = ANSCOMBE[3]
        X = np.array(X)
        y = np.array(y)
        X = X[:,np.newaxis]

        model = fit_select_best(X, y)
        self.assertIsNotNone(model)
        self.assertIsInstance(model, LinearRegression)
model.py (project: fake-news-detection, author: aldengolab)
def run(self):
        '''
        Runs a model with params p.
        '''
        self.clf.set_params(**self.params)
        # f = get_feature_transformer(self.parser)
        # self.X_train_fts = f.fit_transform(self.X_train)
        # self.X_test_fts = f.transform(self.X_test)
        self.pipeline = Pipeline([
            # ('feature_gen', f),
            ('clf', self.clf),
        ])
        self.y_pred_probs = self.pipeline.fit(self.X_train, self.y_train).predict_proba(self.X_test)[:, 1]
        if self.model_type in ['RF', 'ET', 'AB', 'GB', 'DT']:
            self.importances = self.clf.feature_importances_
        elif self.model_type in ['SVM', 'LR', 'SGD']:
            self.importances = self.clf.coef_[0]
ensemble.py (project: IBRel, author: lasigeBioTM)
def __init__(self, path, etype, **kwargs):
        super(EnsembleModel, self).__init__(path, etype=etype, **kwargs)
        self.basedir = "models/ensemble/"
        self.goldstd = kwargs.get("goldstd")
        self.data = {}
        self.offsets = []
        self.pipeline = Pipeline(
            [
                #('clf', SGDClassifier(loss='hinge', penalty='l1', alpha=0.0001, n_iter=5, random_state=42)),
                #('clf', SGDClassifier())
                # ('clf', svm.NuSVC(nu=0.01 ))
                ('clf', RandomForestClassifier(class_weight={False:1, True:1}, n_jobs=-1, criterion="entropy", warm_start=True))
                # ('clf', tree.DecisionTreeClassifier(criterion="entropy")),
                # ('clf', MultinomialNB())
                # ('clf', GaussianNB())
                #('clf', svm.SVC(kernel="rbf", degree=2, C=1)),
                #('clf', svm.SVC(kernel="linear", C=2))
                #('clf', DummyClassifier(strategy="constant", constant=True))
            ])
scikitre.py (project: IBRel, author: lasigeBioTM)
def __init__(self, corpus, relationtype, modelname="scikit_classifier"):
        super(ScikitRE, self).__init__()
        self.modelname = relationtype + "_" + modelname
        self.relationtype = relationtype
        self.pairtype = relationtype
        self.corpus = corpus
        self.pairs = []
        self.features = []
        self.labels = []
        self.pred = []
        self.clusters = word2vec.load_clusters("corpora/Thaliana/documents-processed-clusters.txt")
        self.posfmeasure = make_scorer(f1_score, average='binary', pos_label=True)
        self.generate_data(corpus, modelname, relationtype)
        self.text_clf = Pipeline([('vect', CountVectorizer(analyzer='char_wb', ngram_range=(3,20), min_df=0.0, max_df=0.7)),
                                  #('vect', CountVectorizer(ngram_range=(1,3), binary=False, max_features=None)),
                                  #('tfidf', TfidfTransformer(use_idf=True, norm="l2")),
                                  #('clf', SGDClassifier(loss='hinge', penalty='l1', alpha=0.0001, n_iter=5, random_state=42)),
                                  #('clf', SGDClassifier())
                                  #('clf', svm.NuSVC(nu=0.01 ))
                                   #('clf', RandomForestClassifier(class_weight={False:1, True:2}, n_jobs=-1))
                                  ('clf', MultinomialNB(alpha=0.01, fit_prior=False))
                                  #('clf', DummyClassifier(strategy="constant", constant=True))
                                 ])
ensemble_ner.py (project: IBRel, author: lasigeBioTM)
def __init__(self, path, goldset, base_model, features=None, types=None):
        self.ensemble_pipeline = Pipeline([
            ('clf', ensemble.RandomForestClassifier(criterion="gini", n_estimators=1000))
            ])
        self.base_model = base_model
        self.path = path
        self.predicted = []
        self.res = None
        self.ids, self.data, self.labels = [], [], []
        self.goldset = goldset
        if types: # types is a list of classifier names
            self.types = types
        else:
            self.types = []
        self.feature_names = []
        for t in self.types:
            self.feature_names.append(t)
            self.feature_names.append(t + "_ssm")
        for f in features:
            self.feature_names.append(f)
test_estimators.py (project: catwalk, author: dssg)
def test_cutoff_inside_a_pipeline(data):
    minmax_scaler = preprocessing.MinMaxScaler()
    dsapp_cutoff = CutOff()

    pipeline = Pipeline([
        ('minmax_scaler', minmax_scaler),
        ('dsapp_cutoff', dsapp_cutoff)
    ])

    pipeline.fit(data['X_train'], data['y_train'])

    X_fake_new_data = data['X_test'][-1,:].reshape(1,-1) + 0.5

    mms = preprocessing.MinMaxScaler().fit(data['X_train'])

    assert np.all(( mms.transform(X_fake_new_data) > 1  ) == (pipeline.transform(X_fake_new_data) == 1))
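For reference, a minimal sketch of a CutOff-style transformer consistent with this test; the real catwalk implementation may differ, and clipping at the lower bound is an assumption, since the test only exercises values above 1:

from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

class CutOffSketch(BaseEstimator, TransformerMixin):
    """Hypothetical stand-in: clip feature values into [0, 1]."""
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return np.clip(X, 0, 1)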
test_estimators.py (project: catwalk, author: dssg)
def test_dsapp_lr(data):
    dsapp_lr = ScaledLogisticRegression()
    dsapp_lr.fit(data['X_train'], data['y_train'])

    minmax_scaler = preprocessing.MinMaxScaler()
    dsapp_cutoff = CutOff()
    lr = linear_model.LogisticRegression()

    pipeline = Pipeline([
        ('minmax_scaler', minmax_scaler),
        ('dsapp_cutoff', dsapp_cutoff),
        ('lr', lr)
    ])

    pipeline.fit(data['X_train'], data['y_train'])

    assert np.all(dsapp_lr.predict(data['X_test']) == pipeline.predict(data['X_test']))
sindy.py (project: sparsereg, author: Ohjeah)
def fit(self, x, y=None):
        if y is not None:
            xdot = y
        else:
            xdot = self.derivative.transform(x)

        if self.operators is not None:
            feature_transformer = SymbolicFeatures(exponents=np.linspace(1, self.degree, self.degree), operators=self.operators)
        else:
            feature_transformer = PolynomialFeatures(degree=self.degree, include_bias=False)

        steps = [("features", feature_transformer),
                 ("model", STRidge(alpha=self.alpha, threshold=self.threshold, **self.kw))]
        self.model = MultiOutputRegressor(Pipeline(steps), n_jobs=self.n_jobs)
        self.model.fit(x, xdot)

        self.n_input_features_ = self.model.estimators_[0].steps[0][1].n_input_features_
        self.n_output_features_ = self.model.estimators_[0].steps[0][1].n_output_features_
        return self
optimizer_base.py (project: OptML, author: johannespetrat)
def get_best_params_and_model(self):
        """
        Returns the best parameters and model after optimization.
        Keyword arguments:
            None
        """
        best_params_idx = np.argmax([score for score, params in self.hyperparam_history])
        best_params = self.hyperparam_history[best_params_idx][1]
        if isinstance(self.model, Pipeline):
            all_params = self.model.get_params()
            all_params.update(best_params)
            best_model = self.model.set_params(**all_params)
        else:
            best_model = self.model.__class__(**dict(self.model.get_params(), **best_params))
        return best_params, best_model
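The Pipeline branch exists because a Pipeline cannot be rebuilt with model.__class__(**params): its constructor expects a steps list, while the tuned parameters use flattened step__param names that only set_params understands. An illustrative sketch with hypothetical optimizer state:

# Hypothetical: two evaluated (score, params) pairs for a Pipeline with step 'clf'
optimizer.hyperparam_history = [
    (0.81, {'clf__C': 1.0}),
    (0.86, {'clf__C': 0.1}),
]
best_params, best_model = optimizer.get_best_params_and_model()
# best_params == {'clf__C': 0.1}; best_model is the pipeline with C=0.1 applied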

