def unscaled_pipelines():
# Random forest parameters
random_forest_kwargs = {
'n_estimators': 10,
'criterion': 'mse',  # renamed to 'squared_error' in scikit-learn >= 1.0
'random_state': _RANDOM_STATE,
'n_jobs': cpu_count(),
'verbose': True,
}
# Gradient boosting parameters
gradient_boost_kwargs = {
'random_state': _RANDOM_STATE,
'verbose': 1,
}
models = [
DecisionTreeRegressor(max_depth=3, random_state=_RANDOM_STATE),
# RandomForestRegressor(**random_forest_kwargs),
# GradientBoostingRegressor(**gradient_boost_kwargs),
]
pipelines = []
for m in models:
# Steps
pipelines.append(make_pipeline(m))
return pipelines
Python make_pipeline() usage examples
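Every snippet on this page relies on sklearn.pipeline.make_pipeline, which chains the given estimators into a Pipeline and names each step after its lowercased class. A minimal, self-contained sketch with toy data (illustrative only, not taken from any of the projects below):
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# make_pipeline auto-generates step names from the lowercased class names
X, y = make_classification(n_samples=100, n_features=5, random_state=0)
pipe = make_pipeline(StandardScaler(), LogisticRegression())
pipe.fit(X, y)
print([name for name, _ in pipe.steps])  # ['standardscaler', 'logisticregression']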
classification.py (project: decoding-brain-challenge-2016, author: alexandrebarachant)
def fit(self, X, y):
"""Fit TSclassifier.
Parameters
----------
X : ndarray, shape (n_trials, n_channels, n_channels)
ndarray of SPD matrices.
y : ndarray shape (n_trials, 1)
labels corresponding to each trial.
Returns
-------
self : TSclassifier instance
The TSclassifier instance.
"""
ts = TangentSpace(metric=self.metric, tsupdate=self.tsupdate)
self._pipe = make_pipeline(ts, self.clf)
self._pipe.fit(X, y)
return self
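Since fit() above only wraps TangentSpace and the stored classifier in make_pipeline, the same pipeline can be sketched standalone. This assumes pyriemann's TangentSpace; the random SPD matrices and labels are purely illustrative:
import numpy as np
from pyriemann.tangentspace import TangentSpace
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# Build a toy set of SPD matrices, shape (n_trials, n_channels, n_channels)
rng = np.random.RandomState(42)
A = rng.randn(20, 8, 8)
X = np.array([a @ a.T + 8 * np.eye(8) for a in A])  # symmetric positive definite
y = np.tile([0, 1], 10)                              # two illustrative classes

pipe = make_pipeline(TangentSpace(metric='riemann'), LogisticRegression())
pipe.fit(X, y)
print(pipe.predict(X[:2]))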
def sample_pipelines(pca_kernels=None, svr_kernels=None):
"""
Pipelines that can't be fit in a reasonable amount of time on the whole
dataset
"""
# Model instances
model_steps = []
if pca_kernels is None:
pca_kernels = ['poly', 'rbf', 'sigmoid', 'cosine']
for pca_kernel in pca_kernels:
model_steps.append([
KernelPCA(n_components=2, kernel=pca_kernel),
LinearRegression(),
])
if svr_kernels is None:
svr_kernels = ['poly', 'rbf', 'sigmoid']
for svr_kernel in svr_kernels:
model_steps.append(SVR(kernel=svr_kernel, verbose=True, cache_size=1000))
# Pipelines
pipelines = []
for m in model_steps:
# Steps
common_steps = [
StandardScaler(),
]
per_model_steps = m if isinstance(m, list) else [m]  # avoid rebinding the outer model_steps list
steps = common_steps + per_model_steps
pipelines.append(make_pipeline(*steps))
return pipelines
def build_pipeline(base_estimator, parameters):
"""
Builds a pipeline where the base estimator is initialized with the given parameters. The `@preprocessor` entry
is a special parameter that determines which pre-processing steps to use.
:param base_estimator: The base estimator of the pipeline
:param parameters: The parameters for the base estimator, including special parameters for the pipeline itself
:return: The (pipeline with the) base estimator, initialized with given parameters
"""
params = copy(parameters)
preprocessors = Builder.extract_preprocessors(params)
estimator = Builder.setup_estimator(base_estimator, params)
if preprocessors is None:
return estimator
return make_pipeline(*preprocessors, estimator)
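Builder.extract_preprocessors and Builder.setup_estimator are project-specific, so the following is only a hypothetical sketch of the @preprocessor convention described in the docstring; the key name and helper logic are assumptions, not the project's actual API:
from copy import copy
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

def build_pipeline_sketch(base_estimator, parameters):
    # Pop the special key; everything left is passed to the estimator.
    params = copy(parameters)
    preprocessors = params.pop('@preprocessor', None)  # hypothetical key name
    estimator = base_estimator.set_params(**params)
    if not preprocessors:
        return estimator
    return make_pipeline(*preprocessors, estimator)

clf = build_pipeline_sketch(SVC(), {'C': 10.0, '@preprocessor': [StandardScaler()]})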
def decompose(doc_vecs, n_features=100, normalize=False, flip=False):
svd = TruncatedSVD(n_features)
if normalize:
if flip:
lsa = make_pipeline(svd, Normalizer(copy=False))
doc_mat = lsa.fit_transform(doc_vecs.transpose())
doc_mat = doc_mat.transpose()
else:
lsa = make_pipeline(svd, Normalizer(copy=False))
doc_mat = lsa.fit_transform(doc_vecs)
return doc_mat
else:
if flip:
doc_mat = svd.fit_transform(doc_vecs.transpose())
doc_mat = doc_mat.transpose()
else:
doc_mat = svd.fit_transform(doc_vecs)
return doc_mat
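A hedged usage sketch for decompose(); the corpus and the TfidfVectorizer choice are illustrative, not from the original project:
from sklearn.feature_extraction.text import TfidfVectorizer

# Reduce TF-IDF document vectors to 2 LSA components and L2-normalize them
docs = ["the cat sat on the mat", "dogs chase cats", "the dog barked at the cat"]
doc_vecs = TfidfVectorizer().fit_transform(docs)
doc_mat = decompose(doc_vecs, n_features=2, normalize=True)
print(doc_mat.shape)  # (3, 2)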
def test_mdr_sklearn_pipeline():
"""Ensure that MDR can be used as a transformer in a scikit-learn pipeline"""
features = np.array([[2, 0],
[0, 0],
[0, 1],
[0, 0],
[0, 0],
[0, 0],
[0, 1],
[0, 0],
[0, 0],
[0, 1],
[0, 0],
[0, 0],
[0, 0],
[1, 1],
[1, 1]])
classes = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
clf = make_pipeline(MDR(), LogisticRegression())
cv_scores = cross_val_score(clf, features, classes, cv=StratifiedKFold(n_splits=5, shuffle=True))
assert np.mean(cv_scores) > 0.
def test_mdr_sklearn_pipeline_parallel():
"""Ensure that MDR can be used as a transformer in a parallelized scikit-learn pipeline"""
features = np.array([[2, 0],
[0, 0],
[0, 1],
[0, 0],
[0, 0],
[0, 0],
[0, 1],
[0, 0],
[0, 0],
[0, 1],
[0, 0],
[0, 0],
[0, 0],
[1, 1],
[1, 1]])
classes = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
clf = make_pipeline(MDR(), LogisticRegression())
cv_scores = cross_val_score(clf, features, classes, cv=StratifiedKFold(n_splits=5, shuffle=True), n_jobs=-1)
assert np.mean(cv_scores) > 0.
classification.py (project: decoding_challenge_cortana_2016_3rd, author: kingjr)
def fit(self, X, y):
"""Fit TSclassifier.
Parameters
----------
X : ndarray, shape (n_trials, n_channels, n_channels)
ndarray of SPD matrices.
y : ndarray shape (n_trials, 1)
labels corresponding to each trial.
Returns
-------
self : TSclassifier instance
The TSclassifier instance.
"""
ts = TangentSpace(metric=self.metric, tsupdate=self.tsupdate)
self._pipe = make_pipeline(ts, self.clf)
self._pipe.fit(X, y)
return self
def __init__(self, model_type=DEFAULT_MODEL_TYPE):
"""
Sets up the model and pipeline for learning and predicting.
:param model_type: only 'SVR' model is supported for now
"""
assert (model_type == 'SVR'), "Model '{}' is not supported. " \
"We support only SVR for now.".format(model_type)
self._model_type = model_type
self._model_params = BTCForecast.DEFAULT_SVR_MODEL_PARAMS
# set up SVR pipeline
self._scaler = preprocessing.StandardScaler(copy=True, with_mean=True, with_std=True)
self._model = SVR(kernel=self._model_params['kernel'],
epsilon=self._model_params['epsilon'],
C=self._model_params['c'],
gamma=self._model_params['gamma'])
self._pipeline = make_pipeline(self._scaler, self._model)
self.has_learned = False
def test_check_scoring_gridsearchcv():
# test that check_scoring works on GridSearchCV and pipeline.
# slightly redundant non-regression test.
grid = GridSearchCV(LinearSVC(), param_grid={'C': [.1, 1]})
scorer = check_scoring(grid, "f1")
assert_true(isinstance(scorer, _PredictScorer))
pipe = make_pipeline(LinearSVC())
scorer = check_scoring(pipe, "f1")
assert_true(isinstance(scorer, _PredictScorer))
# check that cross_val_score definitely calls the scorer
# and doesn't make any assumptions about the estimator apart from having a
# fit.
scores = cross_val_score(EstimatorWithFit(), [[1], [2], [3]], [1, 0, 1],
scoring=DummyScorer())
assert_array_equal(scores, 1)
def LDAPageVctorizer(*,
n_topics: int,
min_df: int,
max_features: int,
max_iter: int,
ngram_range: Tuple[int, int],
vocabulary=None,
batch_size: int=4096,
verbose=1):
vec = _vectorizer(min_df=min_df, max_features=max_features,
ngram_range=ngram_range, vocabulary=vocabulary)
lda = LatentDirichletAllocation(
learning_method='online',
n_topics=n_topics,  # renamed to n_components in newer scikit-learn
batch_size=batch_size,
evaluate_every=2,
verbose=verbose,
max_iter=max_iter,
n_jobs=1,
)
return make_pipeline(vec, lda)
def test_in_pipeline():
X, y = make_classification(n_samples=100, n_features=5, chunks=50)
pipe = make_pipeline(DoNothingTransformer(), LogisticRegression())
pipe.fit(X, y)
def test_gridsearch():
X, y = make_classification(n_samples=100, n_features=5, chunks=50)
grid = {
'logisticregression__C': [1000, 100, 10, 2]
}
pipe = make_pipeline(DoNothingTransformer(), LogisticRegression())
search = dcv.GridSearchCV(pipe, grid, cv=3)
search.fit(X, y)
def get_pipeline(clf=RandomForestClassifier(n_estimators=100, class_weight="balanced")):
return make_pipeline(DictVectorizer(sparse=False), clf)
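A hedged usage sketch for get_pipeline(); the feature dicts and labels are illustrative only:
# DictVectorizer turns feature dicts into a dense numeric array,
# which the classifier then consumes.
X_dicts = [{'length': 4, 'upper': 1}, {'length': 7, 'upper': 0},
           {'length': 3, 'upper': 1}, {'length': 8, 'upper': 0}]
y = [1, 0, 1, 0]
pipe = get_pipeline()
pipe.fit(X_dicts, y)
print(pipe.predict([{'length': 5, 'upper': 1}]))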
def truncated_svd(self):
# https://github.com/chrisjmccormick/LSA_Classification/blob/master/inspect_LSA.py
svd = TruncatedSVD(self.dimensions)
lsa = make_pipeline(svd, Normalizer(copy=False))
X_reduced = lsa.fit_transform(self.bag_of_words_matrix)
print(svd.components_[0])
print(svd.explained_variance_ratio_)
print(svd.explained_variance_ratio_.sum())
def scaled_pipelines():
# Model parameters
# RANSAC parameters
# 500 max trials takes 90s
ransac_kwargs = {
'max_trials': 1000,
'min_samples': 5000,
'loss': 'absolute_loss',  # renamed to 'absolute_error' in scikit-learn >= 1.0
'residual_threshold': 2.0,
'random_state': _RANDOM_STATE,
}
# Ridge CV parameters
alphas = [.01, .1, 1, 10]
# Model instances
model_steps = [
LinearRegression(),
# [PolynomialFeatures(degree=2), LinearRegression()],
# [PolynomialFeatures(degree=3), LinearRegression()],
# RANSACRegressor(base_estimator=LinearRegression(), **ransac_kwargs),
# RANSACRegressor with polynomial regression?
# RidgeCV(alphas=alphas),
# LassoCV(), # Alphas set automatically by default
# ElasticNetCV(l1_ratio=0.5), # Same as default
# [PolynomialFeatures(degree=2), ElasticNetCV(l1_ratio=0.5)],
# SGDRegressor(),
]
# Pipelines
pipelines = []
for m in model_steps:
# Steps
common_steps = [
StandardScaler(),
PCA(**_PCA_KWARGS)
]
per_model_steps = m if isinstance(m, list) else [m]  # avoid rebinding the outer model_steps list
steps = common_steps + per_model_steps
pipelines.append(make_pipeline(*steps))
return pipelines
def fit():
X, y = generate()
dX = dd.from_pandas(X, npartitions=10)
y = dd.from_pandas(y, npartitions=10)
# Keep the incremental learner out of the preprocessing pipeline so that
# fit_transform below works and clf.partial_fit can be called on it.
pre_pipe = make_pipeline(
CategoricalEncoder(),
DummyEncoder(),
Imputer(),
)
clf = SGDRegressor()
pipe = make_pipeline(
SelectFromModel(clf),  # select features with the incrementally fitted linear model
GradientBoostingRegressor(),
)
X_ = pre_pipe.fit_transform(dX)
for i in range(X_.npartitions):
for j in range(5):
print(i, j)
X_sub = X_.get_partition(i).compute()
y_sub = y.get_partition(i).compute()
clf.partial_fit(X_sub, y_sub)
sfm = SelectFromModel(clf, prefit=True)
return pipe, clf, sfm
def regression_pipeline(regression_model):
return make_pipeline(StandardScaler(), regression_model)  # make_pipeline takes steps as separate arguments, not a list
def build_classifier(base_clf=svm.SVC()):
# The imputer is for "use_taxonomy" and shouldn't affect results when that option is False.
# TODO: should also try with other imputer strategies
return pipeline.make_pipeline(preprocessing.Imputer(strategy='most_frequent'), preprocessing.StandardScaler(),
base_clf)
# noinspection PyPep8Naming
def make_ward_clustering(self, short_filenames, input_texts):
output_dir = self.output_dir + 'WARD/'
if not os.path.exists(output_dir):
os.makedirs(output_dir)
if self.need_tf_idf:
self.signals.PrintInfo.emit("?????? TF-IDF...")
idf_filename = output_dir + 'tf_idf.csv'
msg = self.calculate_and_write_tf_idf(idf_filename, input_texts)
self.signals.PrintInfo.emit(msg)
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(input_texts)
svd = TruncatedSVD(2)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
X = lsa.fit_transform(X)
ward = AgglomerativeClustering(n_clusters=self.ward_clusters_count, linkage='ward')
predict_result = ward.fit_predict(X)
self.signals.PrintInfo.emit('\nDocuments by cluster:\n')
clasters_output = ''
for claster_index in range(max(predict_result) + 1):
clasters_output += ('Cluster ' + str(claster_index) + ':\n')
for predict, document in zip(predict_result, short_filenames):
if predict == claster_index:
clasters_output += (' ' + str(document) + '\n')
clasters_output += '\n'
self.signals.PrintInfo.emit(clasters_output)
self.signals.PrintInfo.emit('Saved to: ' + str(output_dir + 'clusters.txt'))
writeStringToFile(clasters_output, output_dir + 'clusters.txt')
self.draw_clusters_plot(X, predict_result, short_filenames)
def make_spectral_clustering(self, short_filenames, input_texts):
output_dir = self.output_dir + 'spectral/'
if not os.path.exists(output_dir):
os.makedirs(output_dir)
if self.need_tf_idf:
self.signals.PrintInfo.emit("?????? TF-IDF...")
idf_filename = output_dir + 'tf_idf.csv'
msg = self.calculate_and_write_tf_idf(idf_filename, input_texts)
self.signals.PrintInfo.emit(msg)
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(input_texts)
svd = TruncatedSVD(2)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
X = lsa.fit_transform(X)
spectral = SpectralClustering(n_clusters=self.spectral_clusters_count)
predict_result = spectral.fit_predict(X)
self.signals.PrintInfo.emit('\nDocuments by cluster:\n')
clasters_output = ''
for claster_index in range(max(predict_result) + 1):
clasters_output += ('Cluster ' + str(claster_index) + ':\n')
for predict, document in zip(predict_result, short_filenames):
if predict == claster_index:
clasters_output += (' ' + str(document) + '\n')
clasters_output += '\n'
self.signals.PrintInfo.emit(clasters_output)
self.signals.PrintInfo.emit('Saved to: ' + str(output_dir + 'clusters.txt'))
writeStringToFile(clasters_output, output_dir + 'clusters.txt')
self.draw_clusters_plot(X, predict_result, short_filenames)
# aa = Affinity Propagation
def make_aa_clustering(self, short_filenames, input_texts):
output_dir = self.output_dir + 'affinity_propagation/'
if not os.path.exists(output_dir):
os.makedirs(output_dir)
if self.need_tf_idf:
self.signals.PrintInfo.emit("?????? TF-IDF...")
idf_filename = output_dir + 'tf_idf.csv'
msg = self.calculate_and_write_tf_idf(idf_filename, input_texts)
self.signals.PrintInfo.emit(msg)
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(input_texts)
svd = TruncatedSVD(2)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
X = lsa.fit_transform(X)
aa_clusterizator = AffinityPropagation(damping=self.aa_damping,
max_iter=self.aa_max_iter,
convergence_iter=self.aa_no_change_stop)
predict_result = aa_clusterizator.fit_predict(X)
self.signals.PrintInfo.emit('\nDocuments by cluster:\n')
clasters_output = ''
for claster_index in range(max(predict_result) + 1):
clasters_output += ('Cluster ' + str(claster_index) + ':\n')
for predict, document in zip(predict_result, short_filenames):
if predict == claster_index:
clasters_output += (' ' + str(document) + '\n')
clasters_output += '\n'
self.signals.PrintInfo.emit(clasters_output)
self.signals.PrintInfo.emit('Saved to: ' + str(output_dir + 'clusters.txt'))
writeStringToFile(clasters_output, output_dir + 'clusters.txt')
self.draw_clusters_plot(X, predict_result, short_filenames)
def make_birch_clustering(self, short_filenames, input_texts):
output_dir = self.output_dir + 'birch/'
if not os.path.exists(output_dir):
os.makedirs(output_dir)
if self.need_tf_idf:
self.signals.PrintInfo.emit("?????? TF-IDF...")
idf_filename = output_dir + 'tf_idf.csv'
msg = self.calculate_and_write_tf_idf(idf_filename, input_texts)
self.signals.PrintInfo.emit(msg)
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(input_texts)
svd = TruncatedSVD(2)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
X = lsa.fit_transform(X)
birch = Birch(threshold=self.birch_threshold,
branching_factor=self.birch_branching_factor,
n_clusters=self.birch_clusters_count)
predict_result = birch.fit_predict(X)
self.signals.PrintInfo.emit('\nDocuments by cluster:\n')
clasters_output = ''
for claster_index in range(max(predict_result) + 1):
clasters_output += ('Cluster ' + str(claster_index) + ':\n')
for predict, document in zip(predict_result, short_filenames):
if predict == claster_index:
clasters_output += (' ' + str(document) + '\n')
clasters_output += '\n'
self.signals.PrintInfo.emit(clasters_output)
self.signals.PrintInfo.emit('Saved to: ' + str(output_dir + 'clusters.txt'))
writeStringToFile(clasters_output, output_dir + 'clusters.txt')
self.draw_clusters_plot(X, predict_result, short_filenames)
def doPCA(X, output_columns_count):
# Reduce dimensionality with truncated SVD (LSA) and L2-normalize; despite the name, this is not true PCA since the data is not mean-centered
svd = TruncatedSVD(output_columns_count)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
X = lsa.fit_transform(X)
return X
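A hedged usage sketch for doPCA() with a small count matrix (the corpus is illustrative):
from sklearn.feature_extraction.text import CountVectorizer

texts = ["one two two", "two three", "one three three"]
counts = CountVectorizer().fit_transform(texts)   # shape (3, 3)
reduced = doPCA(counts, output_columns_count=2)
print(reduced.shape)  # (3, 2)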
def test_train():
# define input streams
names = ['v', 'p', 't', 'w', 'a']
# define first transformation
units = ["knot","in_Hg","celsius","force_pound","degree"]
tounits = ["m/s", "pascal", "kelvin", "newton", "radian"]
tuple_units = []
for i, unit in enumerate(units):
tuple_units.append((unit, tounits[i]))
s1 = UnitTransformer(tuple_units)
# second layer of transformation
constants = {"s": 61.0, "R": 286.9}
labels = ["2*w/(v**2*(p/R/t)*s)"]
s2 = FormulaTransformer(labels, names, constants)
# sink (any sink transformation could be used to predict)
# it has no fit_transform rule and can only predict
features = ["a"]
s3 = make_pipeline(FormulaTransformer(features, names), LinearRegression())
# train the pipeline on the training data
with open("data/training.csv") as f:
df = pd.read_csv(f, names=names, header=0)
# awkward transformation from DataFrame to numpy matrix;
# sklearn-pandas could be used to avoid this
ndarray = df.as_matrix(names)  # as_matrix() is removed in newer pandas; df[names].to_numpy() is the modern equivalent
rawX = s1.fit_transform(ndarray)
y = s2.fit_transform(rawX)
X = rawX
s3.fit(X, y)
y_ = s3.predict(X)
print(X.shape, y_.shape)
#plt.scatter(FormulaTransformer(features, names).fit_transform(X), y_)
#plt.show()
# wrap the process as StreamPipeline for learning machine
sp = StreamPipeline(names, s3)
sp.predict(v=1.0, p=2.0, t=3.0, w=4.0, a=5.0)
def _get_pipeline(self, name):
return make_pipeline(self.pipeline, classifiers[name])
def featuresByLSA(features,ncomponents=100):
svd = TruncatedSVD(n_components=ncomponents)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
dtm_lsa = lsa.fit_transform(features)
return dtm_lsa
def fit_quadratic(X, y):
"""
Fits ordinary least squares on degree-2 polynomial features.
"""
model = make_pipeline(
PolynomialFeatures(2), linear_model.LinearRegression()
)
model.fit(X, y)
return model
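A short usage sketch for fit_quadratic() on synthetic data (illustrative only):
import numpy as np

X = np.linspace(-3, 3, 50).reshape(-1, 1)
y = 2.0 * X.ravel() ** 2 - X.ravel() + 0.5
model = fit_quadratic(X, y)
print(model.predict([[2.0]]))  # approximately 2*4 - 2 + 0.5 = 6.5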
def test_relieff_pipeline():
"""Ensure that ReliefF works in a sklearn pipeline when it is parallelized"""
np.random.seed(49082)
clf = make_pipeline(ReliefF(n_features_to_select=2, n_neighbors=100, n_jobs=-1),
RandomForestClassifier(n_estimators=100, n_jobs=-1))
assert np.mean(cross_val_score(clf, features, labels, cv=3)) > 0.7
def test_relieff_pipeline_parallel():
"""Ensure that ReliefF works in a sklearn pipeline where cross_val_score is parallelized"""
np.random.seed(49082)
clf = make_pipeline(ReliefF(n_features_to_select=2, n_neighbors=100),
RandomForestClassifier(n_estimators=100, n_jobs=-1))
assert np.mean(cross_val_score(clf, features, labels, cv=3, n_jobs=-1)) > 0.7