def __init__(self, lang=None, method=None, features=None):
    fs = []
    if 'unigram' in features:
        fs.append(word_unigrams())
    if 'bigram' in features:
        fs.append(word_bigrams())
    if 'spelling' in features:
        fs.append(avg_spelling_error(lang=lang))
    if 'punctuation' in features:
        fs.append(punctuation_features())
    if 'char' in features:
        fs.append(char_ngrams())
    fu = FeatureUnion(fs, n_jobs=1)
    self.pipeline = Pipeline([('features', fu),
                              ('scale', Normalizer()),
                              ('classifier', get_classifier(method=method))])
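word_unigrams(), word_bigrams(), avg_spelling_error(), punctuation_features(), char_ngrams() and get_classifier() are project helpers, assumed here to return (name, transformer) pairs and an estimator respectively. A minimal, self-contained sketch of the same layout (a FeatureUnion of named feature branches, followed by Normalizer and a classifier) using only stock scikit-learn components and toy data:

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import Normalizer

# Each branch is a (name, transformer) pair, mirroring what word_unigrams()
# and char_ngrams() are assumed to return in the original code.
fs = [
    ('unigram', CountVectorizer(ngram_range=(1, 1))),
    ('char', TfidfVectorizer(analyzer='char', ngram_range=(2, 4))),
]
fu = FeatureUnion(fs, n_jobs=1)
pipe = Pipeline([('features', fu),
                 ('scale', Normalizer()),
                 ('classifier', LogisticRegression())])

# Toy usage on two-class text data.
X = ["good movie", "bad movie", "great film", "awful film"]
y = [1, 0, 1, 0]
pipe.fit(X, y)
print(pipe.predict(["really good film"]))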
Example source code for the Python Pipeline() class

Source file: 04_sent.py
Project: Building-Machine-Learning-Systems-With-Python-Second-Edition
Author: PacktPublishing
def create_union_model(params=None):
    def preprocessor(tweet):
        tweet = tweet.lower()
        for k in emo_repl_order:
            tweet = tweet.replace(k, emo_repl[k])
        for r, repl in re_repl.items():
            tweet = re.sub(r, repl, tweet)
        return tweet.replace("-", " ").replace("_", " ")

    tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor,
                                   analyzer="word")
    ling_stats = LinguisticVectorizer()
    all_features = FeatureUnion(
        [('ling', ling_stats), ('tfidf', tfidf_ngrams)])
    # all_features = FeatureUnion([('tfidf', tfidf_ngrams)])
    # all_features = FeatureUnion([('ling', ling_stats)])
    clf = MultinomialNB()
    pipeline = Pipeline([('all', all_features), ('clf', clf)])
    if params:
        pipeline.set_params(**params)
    return pipeline
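Because the returned Pipeline names its steps 'all' (the FeatureUnion) and 'clf', the params dict passed to create_union_model can address any nested parameter with scikit-learn's double-underscore convention. A small, self-contained sketch of that mechanism (a single TF-IDF branch stands in for the project's union; the parameter values are illustrative):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import FeatureUnion, Pipeline

# Stand-in for the project's union: one TF-IDF branch is enough to show
# how nested parameter names are resolved.
all_features = FeatureUnion([('tfidf', TfidfVectorizer(analyzer="word"))])
pipeline = Pipeline([('all', all_features), ('clf', MultinomialNB())])

# step -> sub-step -> parameter, joined by double underscores
params = {
    "all__tfidf__ngram_range": (1, 2),   # illustrative values
    "all__tfidf__min_df": 1,
    "clf__alpha": 0.1,
}
pipeline.set_params(**params)
print(pipeline.get_params()["all__tfidf__ngram_range"])  # (1, 2)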
Source file: 03_clean.py
Project: Building-Machine-Learning-Systems-With-Python-Second-Edition
Author: PacktPublishing
def create_ngram_model(params=None):
    def preprocessor(tweet):
        global emoticons_replaced
        tweet = tweet.lower()
        for k in emo_repl_order:
            tweet = tweet.replace(k, emo_repl[k])
        for r, repl in re_repl.items():
            tweet = re.sub(r, repl, tweet)
        return tweet

    tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor,
                                   analyzer="word")
    clf = MultinomialNB()
    pipeline = Pipeline([('tfidf', tfidf_ngrams), ('clf', clf)])
    if params:
        pipeline.set_params(**params)
    return pipeline
def init_model():
    # Question "trunk" features
    f_trunk = QuestionTrunkVectorizer(tokenizer=tokenize)
    # Word2Vec features
    f_word2vec = Question2VecVectorizer(tokenizer=tokenize)
    # Combined features (400 dimensions)
    union_features = FeatureUnion([
        ('f_trunk_lsa', Pipeline([
            ('trunk', f_trunk),
            # Dimensionality reduction: latent semantic analysis (LSA)
            ('lsa', TruncatedSVD(n_components=200, n_iter=10))
        ])),
        ('f_word2vec', f_word2vec),
    ])
    model = Pipeline([('union', union_features), ('clf', LinearSVC(C=0.02))])
    return model
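QuestionTrunkVectorizer and Question2VecVectorizer are project-specific, but the structure above can be reproduced with stock components: a FeatureUnion whose first branch is itself a Pipeline ending in TruncatedSVD (LSA), concatenated with a second feature branch and fed to LinearSVC. A sketch under that assumption, with TfidfVectorizer standing in for both custom vectorizers and toy data:

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.svm import LinearSVC

union_features = FeatureUnion([
    # Branch 1: sparse word features reduced by LSA (TruncatedSVD)
    ('f_trunk_lsa', Pipeline([
        ('trunk', TfidfVectorizer()),  # stand-in for QuestionTrunkVectorizer
        ('lsa', TruncatedSVD(n_components=2, n_iter=10)),
    ])),
    # Branch 2: a second feature space (character n-grams here, standing in
    # for the Word2Vec-based Question2VecVectorizer)
    ('f_char', Pipeline([
        ('char', TfidfVectorizer(analyzer='char', ngram_range=(2, 3))),
        ('lsa', TruncatedSVD(n_components=2)),
    ])),
])
model = Pipeline([('union', union_features), ('clf', LinearSVC(C=0.02))])

X = ["how do I reset my password", "where is the office",
     "reset password help", "office address please"]
y = [0, 1, 0, 1]
model.fit(X, y)
print(model.predict(["password reset"]))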
def full_pipeline(model_type, predicted_column, grain_column, impute=True, verbose=True):
    """
    Builds the data preparation pipeline. Sequentially runs transformers and filters to clean and prepare the data.

    Note: advanced users may wish to use their own custom pipeline.
    """
    # Note: this could be done more elegantly using FeatureUnions _if_ you are not using pandas dataframes for
    # inputs of the later pipelines, as FeatureUnion intrinsically converts outputs to numpy arrays.
    pipeline = Pipeline([
        ('remove_DTS_columns', hcai_filters.DataframeColumnSuffixFilter()),
        ('remove_grain_column', hcai_filters.DataframeColumnRemover(grain_column)),
        # Perform one of two basic imputation methods
        # TODO: we need to think about making this optional to solve the problem of rare and very predictive values
        ('imputation', hcai_transformers.DataFrameImputer(impute=impute, verbose=verbose)),
        ('null_row_filter', hcai_filters.DataframeNullValueFilter(excluded_columns=None)),
        ('convert_target_to_binary', hcai_transformers.DataFrameConvertTargetToBinary(model_type, predicted_column)),
        ('prediction_to_numeric', hcai_transformers.DataFrameConvertColumnToNumeric(predicted_column)),
        ('create_dummy_variables', hcai_transformers.DataFrameCreateDummyVariables(excluded_columns=[predicted_column])),
    ])
    return pipeline
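The hcai_filters / hcai_transformers steps belong to the surrounding project, but the comment about FeatureUnion is a general scikit-learn point: a Pipeline simply hands each step's output to the next step, so transformers that take and return pandas DataFrames chain cleanly, whereas FeatureUnion stacks outputs into a numpy array. A minimal sketch of two such DataFrame-in/DataFrame-out steps (the column names and fill rule are illustrative, not the project's implementation):

import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

class SimpleDataFrameImputer(BaseEstimator, TransformerMixin):
    """Fill numeric NaNs with column means, keeping the DataFrame type."""
    def fit(self, X, y=None):
        self.means_ = X.mean(numeric_only=True)
        return self

    def transform(self, X):
        return X.fillna(self.means_)

class DummyEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode object columns with pandas.get_dummies (simplified: the
    dummy columns are not locked to the training set)."""
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return pd.get_dummies(X)

prep = Pipeline([('impute', SimpleDataFrameImputer()),
                 ('dummies', DummyEncoder())])

df = pd.DataFrame({'age': [30, None, 50], 'gender': ['F', 'M', 'F']})
print(prep.fit_transform(df))  # still a DataFrame at every step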
def grid_retrain_in_f(self, n_dim=500):
    # RBFSampler's first positional argument is gamma, so n_dim must be passed
    # as n_components to get an n_dim-dimensional feature map.
    rbf_map = RBFSampler(n_components=n_dim, random_state=1)
    fourier_approx_svm = pipeline.Pipeline([("mapper", rbf_map),
                                            ("svm", LinearSVC())])
    # C_range = np.logspace(-5, 15, 21, base=2)
    # gamma_range = np.logspace(-15, 3, 19, base=2)
    # param_grid = dict(mapper__gamma=gamma_range, svm__C=C_range)
    # cv = StratifiedShuffleSplit(Y, n_iter=5, test_size=0.2, random_state=42)
    # grid = GridSearchCV(fourier_approx_svm, param_grid=param_grid, cv=cv)
    # grid.fit(X, Y)
    #
    # rbf_svc2 = grid.best_estimator_
    rbf_svc2 = fourier_approx_svm
    rbf_svc2.fit(self.X_ex, self.y_ex)
    self.set_clf2(rbf_svc2)
    return self.benchmark()
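The commented-out block sketches the intended grid search over the RBF map's gamma and the SVM's C. A self-contained version of that search with the current scikit-learn API (StratifiedShuffleSplit now takes n_splits rather than n_iter; the data and the narrower grids are only illustrative):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.kernel_approximation import RBFSampler
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

X, Y = make_classification(n_samples=200, n_features=10, random_state=0)

fourier_approx_svm = Pipeline([("mapper", RBFSampler(n_components=100, random_state=1)),
                               ("svm", LinearSVC())])

# Narrow grids keep the example fast; the original ranges were logspace(-5, 15)
# for C and logspace(-15, 3) for gamma, both base 2.
param_grid = dict(mapper__gamma=np.logspace(-3, 1, 5, base=2),
                  svm__C=np.logspace(-2, 2, 5, base=2))
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
grid = GridSearchCV(fourier_approx_svm, param_grid=param_grid, cv=cv)
grid.fit(X, Y)
print(grid.best_params_, grid.best_score_)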
def test_transform_then_prediction(self):
    with TemporaryDirectory() as temp:
        from sklearn.pipeline import Pipeline
        path = os.path.join(temp, 'audio.sph')
        urlretrieve(filename=path,
                    url='https://s3.amazonaws.com/ai-datasets/sw02001.sph')
        f = Pipeline([
            ('mspec', model.SpeechTransform('mspec', fs=8000, vad=False)),
            ('slice', model.Transform(lambda x: x[:, :40])),
            ('pred', model.SequentialModel(N.Dropout(0.3),
                                           N.Dense(20, activation=K.relu),
                                           N.Dense(10, activation=K.softmax)))
        ])
        x1 = f.predict(path)
        x2 = f.predict_proba(path)
        f = cPickle.loads(cPickle.dumps(f))
        y1 = f.predict(path)
        y2 = f.predict_proba(path)
        self.assertEqual(np.array_equal(x1, y1), True)
        self.assertEqual(np.array_equal(x2, y2), True)
def test_complex_transform(self):
    with TemporaryDirectory() as temp:
        from sklearn.pipeline import Pipeline
        path = os.path.join(temp, 'audio.sph')
        urlretrieve(filename=path,
                    url='https://s3.amazonaws.com/ai-datasets/sw02001.sph')
        f = Pipeline([
            ('step1', model.SpeechTransform('mspec', fs=8000, vad=True)),
            ('step2', model.Transform(lambda x: (x[0][:, :40],
                                                 x[1].astype(str)))),
            ('step3', model.Transform(lambda x: (np.sum(x[0]),
                                                 ''.join(x[1].tolist()))))
        ])
        x = f.transform(path)
        f = cPickle.loads(cPickle.dumps(f))
        y = f.transform(path)
        self.assertEqual(x[0], y[0])
        self.assertEqual(y[0], -3444229.0)
        self.assertEqual(x[1], y[1])
def transform_pca(clf_list):
    '''
    From a classifier list, build a list of pipelines that run PCA before each classifier.
    '''
    pca = PCA()
    params_pca = {"pca__n_components": [2, 3, 4, 5, 10, 15, 20], "pca__whiten": [False]}
    for j in range(len(clf_list)):
        name = "clf_" + str(j)
        clf, params = clf_list[j]
        # Parameter names passed to GridSearchCV need a double underscore
        # between the step name and the parameter name.
        new_params = {}
        for key, value in params.items():
            new_params[name + "__" + key] = value
        new_params.update(params_pca)
        clf_list[j] = (Pipeline([("pca", pca), (name, clf)]), new_params)
    return clf_list
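A hedged example of how the (pipeline, params) pairs built above are typically consumed: each pair is handed to GridSearchCV, which resolves the 'pca__' and 'clf_0__' prefixes to the matching pipeline steps (the dataset and grids here are toy, not from the project):

from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

X, y = load_iris(return_X_y=True)

# One (pipeline, params) pair as transform_pca would build it
name = "clf_0"
pipe = Pipeline([("pca", PCA()), (name, LogisticRegression(max_iter=1000))])
params = {
    "pca__n_components": [2, 3, 4],
    "pca__whiten": [False],
    name + "__C": [0.1, 1.0, 10.0],
}

grid = GridSearchCV(pipe, param_grid=params, cv=3)
grid.fit(X, y)
print(grid.best_params_)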
def __init__(self, X=None, y=None, ax=None, scale=True, color=None, proj_dim=2,
             colormap=palettes.DEFAULT_SEQUENCE, **kwargs):
    super(PCADecomposition, self).__init__(ax=ax, **kwargs)
    # Data Parameters
    if proj_dim not in (2, 3):
        raise YellowbrickValueError("proj_dim must be either 2 or 3.")
    self.color = color
    self.pca_features_ = None
    self.scale = scale
    self.proj_dim = proj_dim
    self.pca_transformer = Pipeline([('scale', StandardScaler(with_std=self.scale)),
                                     ('pca', PCA(self.proj_dim))])
    # Visual Parameters
    self.colormap = colormap
def test_select_best(self):
    """
    Test the select best fit estimator
    """
    X, y = ANSCOMBE[1]
    X = np.array(X)
    y = np.array(y)
    X = X[:, np.newaxis]

    model = fit_select_best(X, y)
    self.assertIsNotNone(model)
    self.assertIsInstance(model, Pipeline)

    X, y = ANSCOMBE[3]
    X = np.array(X)
    y = np.array(y)
    X = X[:, np.newaxis]

    model = fit_select_best(X, y)
    self.assertIsNotNone(model)
    self.assertIsInstance(model, LinearRegression)
def run(self):
    '''
    Runs the model with the parameters in self.params.
    '''
    self.clf.set_params(**self.params)
    # f = get_feature_transformer(self.parser)
    # self.X_train_fts = f.fit_transform(self.X_train)
    # self.X_test_fts = f.transform(self.X_test)
    self.pipeline = Pipeline([
        # ('feature_gen', f),
        ('clf', self.clf),
    ])
    self.y_pred_probs = self.pipeline.fit(self.X_train, self.y_train).predict_proba(self.X_test)[:, 1]
    if self.model_type in ['RF', 'ET', 'AB', 'GB', 'DT']:
        self.importances = self.clf.feature_importances_
    elif self.model_type in ['SVM', 'LR', 'SGD']:
        self.importances = self.clf.coef_[0]
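The feature_importances_ / coef_ lookups at the end work because Pipeline fits its steps in place rather than cloning them, so self.clf is the same, now fitted, object that sits inside the pipeline. A compact illustration with synthetic data and RandomForestClassifier standing in for the project's model types:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

X_train, y_train = make_classification(n_samples=100, n_features=5, random_state=0)

clf = RandomForestClassifier(random_state=0)
pipe = Pipeline([('clf', clf)])
pipe.fit(X_train, y_train)

# The pipeline holds a reference to the same estimator object, so the fitted
# attributes are visible on `clf` directly ...
print(clf.feature_importances_)
# ... and equivalently through the pipeline's named_steps.
print(np.array_equal(pipe.named_steps['clf'].feature_importances_,
                     clf.feature_importances_))  # True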
def __init__(self, path, etype, **kwargs):
    super(EnsembleModel, self).__init__(path, etype=etype, **kwargs)
    self.basedir = "models/ensemble/"
    self.goldstd = kwargs.get("goldstd")
    self.data = {}
    self.offsets = []
    self.pipeline = Pipeline(
        [
            # ('clf', SGDClassifier(loss='hinge', penalty='l1', alpha=0.0001, n_iter=5, random_state=42)),
            # ('clf', SGDClassifier())
            # ('clf', svm.NuSVC(nu=0.01))
            ('clf', RandomForestClassifier(class_weight={False: 1, True: 1}, n_jobs=-1, criterion="entropy", warm_start=True))
            # ('clf', tree.DecisionTreeClassifier(criterion="entropy")),
            # ('clf', MultinomialNB())
            # ('clf', GaussianNB())
            # ('clf', svm.SVC(kernel="rbf", degree=2, C=1)),
            # ('clf', svm.SVC(kernel="linear", C=2))
            # ('clf', DummyClassifier(strategy="constant", constant=True))
        ])
def __init__(self, corpus, relationtype, modelname="scikit_classifier"):
    super(ScikitRE, self).__init__()
    self.modelname = relationtype + "_" + modelname
    self.relationtype = relationtype
    self.pairtype = relationtype
    self.corpus = corpus
    self.pairs = []
    self.features = []
    self.labels = []
    self.pred = []
    self.clusters = word2vec.load_clusters("corpora/Thaliana/documents-processed-clusters.txt")
    self.posfmeasure = make_scorer(f1_score, average='binary', pos_label=True)
    self.generate_data(corpus, modelname, relationtype)
    self.text_clf = Pipeline([('vect', CountVectorizer(analyzer='char_wb', ngram_range=(3, 20), min_df=0.0, max_df=0.7)),
                              # ('vect', CountVectorizer(ngram_range=(1,3), binary=False, max_features=None)),
                              # ('tfidf', TfidfTransformer(use_idf=True, norm="l2")),
                              # ('clf', SGDClassifier(loss='hinge', penalty='l1', alpha=0.0001, n_iter=5, random_state=42)),
                              # ('clf', SGDClassifier())
                              # ('clf', svm.NuSVC(nu=0.01))
                              # ('clf', RandomForestClassifier(class_weight={False:1, True:2}, n_jobs=-1))
                              ('clf', MultinomialNB(alpha=0.01, fit_prior=False))
                              # ('clf', DummyClassifier(strategy="constant", constant=True))
                              ])
def __init__(self, path, goldset, base_model, features=None, types=None):
    self.ensemble_pipeline = Pipeline([
        ('clf', ensemble.RandomForestClassifier(criterion="gini", n_estimators=1000))
    ])
    self.base_model = base_model
    self.path = path
    self.predicted = []
    self.res = None
    self.ids, self.data, self.labels = [], [], []
    self.goldset = goldset
    if types:  # types is a list of classifier names
        self.types = types
    else:
        self.types = []
    self.feature_names = []
    for t in self.types:
        self.feature_names.append(t)
        self.feature_names.append(t + "_ssm")
    for f in features:
        self.feature_names.append(f)
def test_cutoff_inside_a_pipeline(data):
    minmax_scaler = preprocessing.MinMaxScaler()
    dsapp_cutoff = CutOff()
    pipeline = Pipeline([
        ('minmax_scaler', minmax_scaler),
        ('dsapp_cutoff', dsapp_cutoff)
    ])
    pipeline.fit(data['X_train'], data['y_train'])
    X_fake_new_data = data['X_test'][-1, :].reshape(1, -1) + 0.5
    mms = preprocessing.MinMaxScaler().fit(data['X_train'])
    assert np.all((mms.transform(X_fake_new_data) > 1) == (pipeline.transform(X_fake_new_data) == 1))
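CutOff is a project-specific transformer; the assertion above suggests it clips scaled values so that nothing exceeds 1. A minimal sketch of a transformer with that behaviour (the [0, 1] clipping bounds are an assumption inferred from the test, not the project's actual implementation):

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

class ClipToUnitInterval(BaseEstimator, TransformerMixin):
    """Clip features into [0, 1] after scaling (assumed CutOff-like behaviour)."""
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return np.clip(X, 0.0, 1.0)

pipe = Pipeline([('minmax_scaler', MinMaxScaler()),
                 ('cutoff', ClipToUnitInterval())])

X_train = np.array([[0.0], [1.0], [2.0]])
pipe.fit(X_train)
# A value above the training maximum scales to > 1 and is clipped back to 1.
print(pipe.transform(np.array([[3.0]])))  # [[1.]]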
def test_dsapp_lr(data):
    dsapp_lr = ScaledLogisticRegression()
    dsapp_lr.fit(data['X_train'], data['y_train'])

    minmax_scaler = preprocessing.MinMaxScaler()
    dsapp_cutoff = CutOff()
    lr = linear_model.LogisticRegression()
    pipeline = Pipeline([
        ('minmax_scaler', minmax_scaler),
        ('dsapp_cutoff', dsapp_cutoff),
        ('lr', lr)
    ])
    pipeline.fit(data['X_train'], data['y_train'])
    assert np.all(dsapp_lr.predict(data['X_test']) == pipeline.predict(data['X_test']))
def fit(self, x, y=None):
    if y is not None:
        xdot = y
    else:
        xdot = self.derivative.transform(x)
    if self.operators is not None:
        feature_transformer = SymbolicFeatures(exponents=np.linspace(1, self.degree, self.degree), operators=self.operators)
    else:
        feature_transformer = PolynomialFeatures(degree=self.degree, include_bias=False)
    steps = [("features", feature_transformer),
             ("model", STRidge(alpha=self.alpha, threshold=self.threshold, **self.kw))]
    self.model = MultiOutputRegressor(Pipeline(steps), n_jobs=self.n_jobs)
    self.model.fit(x, xdot)
    self.n_input_features_ = self.model.estimators_[0].steps[0][1].n_input_features_
    self.n_output_features_ = self.model.estimators_[0].steps[0][1].n_output_features_
    return self
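The attribute lookups on self.model.estimators_[0].steps[0][1] rely on MultiOutputRegressor fitting one clone of the Pipeline per output column and exposing the fitted clones in estimators_. A small stand-alone illustration of that layout with stock components (PolynomialFeatures plus Ridge replaces the project's SymbolicFeatures / STRidge):

import numpy as np
from sklearn.linear_model import Ridge
from sklearn.multioutput import MultiOutputRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

x = np.random.rand(50, 2)
xdot = np.column_stack([x[:, 0] + x[:, 1], x[:, 0] * x[:, 1]])

steps = [("features", PolynomialFeatures(degree=2, include_bias=False)),
         ("model", Ridge(alpha=0.1))]
model = MultiOutputRegressor(Pipeline(steps), n_jobs=1)
model.fit(x, xdot)

# One fitted clone of the pipeline per output dimension
features = model.estimators_[0].steps[0][1]
print(len(model.estimators_))  # 2
# n_features_in_ is the current name for the older n_input_features_ attribute
print(features.n_features_in_, features.n_output_features_)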
def get_best_params_and_model(self):
    """
    Returns the best parameters and model after optimization.

    Keyword arguments:
    None
    """
    best_params_idx = np.argmax([score for score, params in self.hyperparam_history])
    best_params = self.hyperparam_history[best_params_idx][1]
    if isinstance(self.model, Pipeline):
        all_params = self.model.get_params()
        all_params.update(best_params)
        best_model = self.model.set_params(**all_params)
    else:
        best_model = self.model.__class__(**dict(self.model.get_params(), **best_params))
    return best_params, best_model