def rfr_feature_select():
    import numpy as np
    from sklearn.datasets import load_boston
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.cross_validation import cross_val_score, ShuffleSplit
    boston = load_boston()
    X = boston["data"]
    Y = boston["target"]
    names = boston["feature_names"]
    rf = RandomForestRegressor(n_estimators=20, max_depth=4)
    scores = []
    # Score each feature on its own with cross-validated R^2
    for i in range(X.shape[1]):
        score = cross_val_score(rf, X[:, i:i + 1], Y, scoring="r2",
                                cv=ShuffleSplit(len(X), 3, .3))
        scores.append((round(np.mean(score), 3), names[i]))
    print(sorted(scores, reverse=True))
Python ShuffleSplit() class: example source code
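All of the snippets on this page target the legacy sklearn.cross_validation module, where ShuffleSplit(n, n_iter, test_size, ...) takes the sample count and is iterated directly. That module was deprecated in scikit-learn 0.18 and removed in 0.20. A minimal sketch of the modern equivalent (the class keeps its name but lives in sklearn.model_selection, takes n_splits, and yields indices from .split()):
    # Modern API sketch (scikit-learn >= 0.18); not part of the snippets below.
    import numpy as np
    from sklearn.model_selection import ShuffleSplit

    X = np.arange(20).reshape(10, 2)
    ss = ShuffleSplit(n_splits=3, test_size=0.3, random_state=0)
    for train_idx, test_idx in ss.split(X):  # .split(X), not "for ... in ss"
        print(train_idx, test_idx)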
def data_split(inputfile):
    # assumes module-level imports: numpy as np, hickle as hkl
    data = hkl.load(inputfile)
    X = data['mat']
    X_kspec = data['kmer']
    y = data['y']
    rs = ShuffleSplit(len(y), n_iter=1, random_state=1)
    X_kspec = X_kspec.reshape((X_kspec.shape[0], 1024, 4))
    X = np.concatenate((X, X_kspec), axis=1)
    X = X[:, np.newaxis]
    X = X.transpose((0, 1, 3, 2))
    for train_idx, test_idx in rs:
        X_train = X[train_idx, :]
        y_train = y[train_idx]
        X_test = X[test_idx, :]
        y_test = y[test_idx]
    X_train = X_train.astype('float32')
    y_train = y_train.astype('int32')
    X_test = X_test.astype('float32')
    y_test = y_test.astype('int32')
    return [X_train, y_train, X_test, y_test]
#define the network architecture
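Since the split above runs for a single iteration, the same effect can be had with train_test_split. A sketch under the modern API, with a hypothetical array shape standing in for the hickle payload; note the legacy ShuffleSplit default test_size was 0.1, matched here explicitly:
    import numpy as np
    from sklearn.model_selection import train_test_split

    X = np.random.rand(100, 1, 4, 1024).astype('float32')  # hypothetical shape
    y = np.random.randint(0, 2, 100).astype('int32')
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=1)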
def train(self):
    """
    Train SVM
    """
    print("Starting Training")
    rs = ShuffleSplit(self.length, n_iter=self.args.fold,
                      test_size=self.args.test_size,
                      random_state=self.args.random_state)
    self.fold = 1
    for train_index, test_index in rs:
        self.train_images, self.train_labels = self.images[train_index, ...], self.labels[train_index, ...]
        self.valid_images, self.valid_labels = self.images[test_index, ...], self.labels[test_index, ...]
        #pdb.set_trace()
        self.svm_classifier = self.classifier.fit(self.train_images, self.train_labels)
        self.test(self.valid_images)
        self.fold += 1
Source: 04_sent.py (project: Building-Machine-Learning-Systems-With-Python-Second-Edition, author: PacktPublishing)
def __grid_search_model(clf_factory, X, Y):
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)
    param_grid = dict(vect__ngram_range=[(1, 1), (1, 2), (1, 3)],
                      vect__min_df=[1, 2],
                      vect__smooth_idf=[False, True],
                      vect__use_idf=[False, True],
                      vect__sublinear_tf=[False, True],
                      vect__binary=[False, True],
                      clf__alpha=[0, 0.01, 0.05, 0.1, 0.5, 1],
                      )
    grid_search = GridSearchCV(clf_factory(),
                               param_grid=param_grid,
                               cv=cv,
                               score_func=f1_score,  # pre-0.18 GridSearchCV API
                               verbose=10)
    grid_search.fit(X, Y)
    clf = grid_search.best_estimator_
    print(clf)
    return clf
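Both score_func= and the indexed ShuffleSplit(n=...) are pre-0.18 constructs. A hedged sketch of the same search in the modern API, assuming clf_factory() builds a vect/clf pipeline as the parameter names suggest:
    from sklearn.model_selection import GridSearchCV, ShuffleSplit
    from sklearn.pipeline import Pipeline
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.naive_bayes import MultinomialNB

    pipeline = Pipeline([('vect', TfidfVectorizer()), ('clf', MultinomialNB())])
    cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=0)
    grid_search = GridSearchCV(pipeline,
                               param_grid={'clf__alpha': [0.01, 0.1, 1.0]},
                               cv=cv, scoring='f1', verbose=10)
    # grid_search.fit(texts, y)  # texts/y are whatever the factory was fed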
Source: 02_tuning.py (project: Building-Machine-Learning-Systems-With-Python-Second-Edition, author: PacktPublishing)
def grid_search_model(clf_factory, X, Y):
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)
    param_grid = dict(vect__ngram_range=[(1, 1), (1, 2), (1, 3)],
                      vect__min_df=[1, 2],
                      vect__stop_words=[None, "english"],
                      vect__smooth_idf=[False, True],
                      vect__use_idf=[False, True],
                      vect__sublinear_tf=[False, True],
                      vect__binary=[False, True],
                      clf__alpha=[0, 0.01, 0.05, 0.1, 0.5, 1],
                      )
    grid_search = GridSearchCV(clf_factory(),
                               param_grid=param_grid,
                               cv=cv,
                               score_func=f1_score,  # pre-0.18 GridSearchCV API
                               verbose=10)
    grid_search.fit(X, Y)
    clf = grid_search.best_estimator_
    print(clf)
    return clf
Source: evaluate_features.py (project: motion-classification, author: matthiasplappert)
def evaluate(X, args):
    enum = ShuffleSplit(len(X), n_iter=args.n_iterations, test_size=args.test_size)
    train_scores = []
    test_scores = []
    for train_index, test_index in enum:
        X_train = [X[idx] for idx in train_index]
        X_test = [X[idx] for idx in test_index]
        X_train, X_test = preprocess_datasets(X_train, X_test, args)
        model = GaussianHMM(n_states=args.n_states,
                            n_training_iterations=args.n_training_iterations,
                            topology=args.topology)
        model.fit(X_train)
        train_scores.extend([model.loglikelihood(X_curr) for X_curr in X_train])
        test_scores.extend([model.loglikelihood(X_curr) for X_curr in X_test])
    train_scores_array = np.array(train_scores)
    train_mean = float(np.mean(train_scores_array))
    train_std = float(np.std(train_scores_array))
    test_scores_array = np.array(test_scores)
    test_mean = float(np.mean(test_scores_array))
    test_std = float(np.std(test_scores_array))
    return train_mean, train_std, test_mean, test_std
def optimize_learner_dad(learner, X, U, iters, train_size=0.5):
    num_traj = X.shape[2]
    if train_size < 1.0:
        from sklearn import cross_validation
        rs = cross_validation.ShuffleSplit(num_traj, n_iter=1, train_size=train_size,
                                           random_state=0, test_size=1. - train_size)
        for train_index, test_index in rs:
            pass
        Xtrain = X[:, :, train_index]; Xtest = X[:, :, test_index]
        Utrain = U[:, :, train_index]; Utest = U[:, :, test_index]
    elif train_size == 1.0:
        Xtrain = X; Xtest = X
        Utrain = U; Utest = U
    else:
        raise Exception('Train size must be in (0,1]')
    dad = DaDControl()
    dad.learn(Xtrain, Utrain, learner, iters, Xtest, Utest, verbose=False)
    print('  DaD (iters:{:d}). Initial Err: {:.4g}, Best: {:.4g}'.format(
        iters, dad.initial_test_err, dad.min_test_error))
    return dad
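The "for ... pass" loop above is just a way to pull the indices out of the one-iteration generator; with the legacy iterable splitter, next(iter(...)) does the same thing explicitly:
    train_index, test_index = next(iter(rs))  # legacy API; no .split() needed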
def test_cross_val_generator_with_indices():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    # explicitly passing indices value is deprecated
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    ps = cval.PredefinedSplit([1, 1, 2, 2])
    ss = cval.ShuffleSplit(2)
    for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]:
        for train, test in cv:
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(test).dtype.kind, 'b')
            X[train], X[test]
            y[train], y[test]
def test_cross_val_generator_with_default_indices():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    ss = cval.ShuffleSplit(2)
    ps = cval.PredefinedSplit([1, 1, 2, 2])
    for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]:
        for train, test in cv:
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(test).dtype.kind, 'b')
            X[train], X[test]
            y[train], y[test]
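For reference, the same generators in the modern API take only their parameters and produce indices from .split(X, y); LeaveOneLabelOut and LeavePLabelOut became LeaveOneGroupOut and LeavePGroupsOut. A sketch:
    import numpy as np
    from sklearn.model_selection import (KFold, LeaveOneOut, LeavePOut,
                                         ShuffleSplit, StratifiedKFold)

    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    for cv in [LeaveOneOut(), LeavePOut(2), KFold(2), StratifiedKFold(2),
               ShuffleSplit(n_splits=2, random_state=0)]:
        for train, test in cv.split(X, y):
            assert np.asarray(train).dtype.kind != 'b'
            assert np.asarray(test).dtype.kind != 'b'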
def fit(self, X, y, test_size=0.3):
    # Grid search cross-val (best C param)
    cv = ShuffleSplit(len(X), n_iter=1, test_size=test_size,  # use the test_size argument
                      random_state=self.seed_)
    clf_cv = GridSearchCV(self.clf_base_, self.clf_hyparams_, cv=cv, n_jobs=-1, verbose=4)
    print('====> Training Classifier (with grid search hyperparam tuning) .. ')
    print('====> BATCH Training (in-memory): {:4.3f} MB'.format(X.nbytes / 1024.0 / 1024.0))
    clf_cv.fit(X, y)
    print('BEST: {}, {}'.format(clf_cv.best_score_, clf_cv.best_params_))
    # Setting clf to best estimator
    self.clf_ = clf_cv.best_estimator_
    # # Calibrating classifier
    # print('Calibrating Classifier ... ')
    # self.clf_prob_ = CalibratedClassifierCV(self.clf_, cv=cv, method='sigmoid')
    # self.clf_prob_.fit(X, y)
    # # Setting clf to best estimator
    # self.clf_ = clf_cv.best_estimator_
    # pred_targets = self.clf_.predict(X)
    if self.epoch_no_ % 10 == 0:
        self.save(self.filename_.replace('.h5', '_iter_{}.h5'.format(self.epoch_no_)))
    self.save(self.filename_)
    self.epoch_no_ += 1
def train_test_split(X, y, test_size=0.25, random_state=42, stratify=True):
    if stratify:
        n_folds = int(round(1 / test_size))
        sss = StratifiedKFold(y, n_folds=n_folds, random_state=random_state)
    else:
        sss = ShuffleSplit(len(y), test_size=test_size, random_state=random_state)
    train_idx, test_idx = next(iter(sss))  # works on Python 2 and 3; iterators have no .next()
    return X[train_idx], X[test_idx], y[train_idx], y[test_idx]
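Modern scikit-learn folds this whole helper into one call: train_test_split grew a stratify= argument. A sketch with toy data:
    import numpy as np
    from sklearn.model_selection import train_test_split

    X = np.arange(40).reshape(20, 2)
    y = np.array([0] * 10 + [1] * 10)
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.25, random_state=42, stratify=y)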
def __call__(self, X, y, net):
    if self.eval_size is not None:
        if net.regression or not self.stratify:
            # test_size = self.eval_size
            # kf = ShuffleSplit(
            #     y.shape[0], test_size=test_size,
            #     random_state=self.random_state
            # )
            # train_indices, valid_indices = next(iter(kf))
            # valid_indices = shuffle(valid_indices)
            # Inverted split: request a "test" fold of size (1 - eval_size)
            # and read the pair back as (valid, train).
            test_size = 1 - self.eval_size
            kf = ShuffleSplit(
                y.shape[0], test_size=test_size,
                random_state=self.random_state
            )
            valid_indices, train_indices = next(iter(kf))
        else:
            n_folds = int(round(1 / self.eval_size))
            kf = StratifiedKFold(y, n_folds=n_folds, random_state=self.random_state)
            train_indices, valid_indices = next(iter(kf))
        X_train, y_train = X[train_indices], y[train_indices]
        X_valid, y_valid = X[valid_indices], y[valid_indices]
    else:
        X_train, y_train = X, y
        X_valid, y_valid = X[len(X):], y[len(y):]  # empty validation set
    return X_train, X_valid, y_train, y_valid
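The inverted split works because ShuffleSplit's train part is whatever the test fold leaves behind: requesting a test fold of (1 - eval_size) leaves a train part of exactly eval_size * n, which the code reads back as the validation set. A short demonstration under the modern API (a sketch):
    import numpy as np
    from sklearn.model_selection import ShuffleSplit

    n, eval_size = 100, 0.2
    ss = ShuffleSplit(n_splits=1, test_size=1 - eval_size, random_state=0)
    valid_idx, train_idx = next(ss.split(np.zeros((n, 1))))  # swap on purpose
    assert len(valid_idx) == int(n * eval_size)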
def data_split(inputfile, reads_count):
    # assumes module-level imports: numpy as np, hickle as hkl
    data = hkl.load(inputfile)
    reads_count = hkl.load(reads_count)
    X = data['mat']
    X_kspec = data['kmer']
    reads_count = np.array(reads_count)
    y = np.mean(reads_count, axis=1)
    y = np.log(y + 1e-3)
    rs = ShuffleSplit(len(y), n_iter=1, random_state=1)
    X_kspec = X_kspec.reshape((X_kspec.shape[0], 1024, 4))
    X = np.concatenate((X, X_kspec), axis=1)
    X = X[:, np.newaxis]
    X = X.transpose((0, 1, 3, 2))
    for train_idx, test_idx in rs:
        X_train = X[train_idx, :]
        y_train = y[train_idx]
        X_test = X[test_idx, :]
        y_test = y[test_idx]
    X_train = X_train.astype('float32')
    y_train = y_train.astype('float32')
    X_test = X_test.astype('float32')
    y_test = y_test.astype('float32')
    print('Data preparation done!')
    return [X_train, y_train, X_test, y_test]
#define the network architecture
def cv(model, X, y, n_iter=5, test_size=0.3):
    split = cross_validation.ShuffleSplit(
        len(X), n_iter=n_iter, test_size=test_size,
    )
    return cross_validation.cross_val_score(model, X, y, cv=split,
                                            scoring='accuracy', n_jobs=-1)
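The same helper under the modern API, for comparison (a sketch):
    from sklearn.model_selection import ShuffleSplit, cross_val_score

    def cv(model, X, y, n_iter=5, test_size=0.3):
        split = ShuffleSplit(n_splits=n_iter, test_size=test_size)
        return cross_val_score(model, X, y, cv=split,
                               scoring='accuracy', n_jobs=-1)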
def load_images(image_h5_file, n_images=-1, shuffle_seed=1):
    """Load images and auxiliary data from h5 file.
    Args:
        image_h5_file: location of h5 file containing images.
        n_images: number of images to load, -1 loads all.
        shuffle_seed: random seed for the subsampling split.
    Returns:
        images: array of image arrays.
        aux_data: dict of auxvar arrays.
    TODO: add support for multiple classes.
    """
    with h5py.File(image_h5_file, 'r') as h5file:
        images = h5file['images']
        auxvars = h5file['auxvars']
        if n_images < 0:
            n_images = len(images)
        elif n_images > len(images):
            print("Cannot load {0} images. Only {1} images in {2}".format(
                n_images, len(images), image_h5_file))
            n_images = len(images)
        if n_images < len(images):
            rs = cross_validation.ShuffleSplit(
                len(images), n_iter=1, test_size=n_images,
                random_state=shuffle_seed)
            for train, test in rs:
                keep = test
            images = np.take(images, keep, axis=0)
            auxvars = np.take(auxvars, keep, axis=0)
        else:
            images = h5file['images'][:]
            auxvars = h5file['auxvars'][:]
    return images, auxvars
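Here ShuffleSplit is repurposed as a subsampler: with test_size=n_images, the "test" fold is simply a random subset of row indices. A plain NumPy generator achieves the same goal, though the exact indices drawn will differ (counts here are hypothetical):
    import numpy as np

    total, n_images = 1000, 100          # hypothetical counts
    rng = np.random.RandomState(1)       # shuffle_seed in the snippet above
    keep = np.sort(rng.choice(total, size=n_images, replace=False))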
def _get_split(X, y):
    split = ShuffleSplit(y.shape[0], n_iter=1)
    train, validate = list(split)[0]
    X_train, X_validate, y_train, y_validate = X[train], X[validate], y[train], y[validate]
    return X_train, X_validate, y_train, y_validate
def test_shuffle_split():
    ss1 = cval.ShuffleSplit(10, test_size=0.2, random_state=0)
    ss2 = cval.ShuffleSplit(10, test_size=2, random_state=0)
    ss3 = cval.ShuffleSplit(10, test_size=np.int32(2), random_state=0)
    for typ in six.integer_types:
        ss4 = cval.ShuffleSplit(10, test_size=typ(2), random_state=0)
        for t1, t2, t3, t4 in zip(ss1, ss2, ss3, ss4):
            assert_array_equal(t1[0], t2[0])
            assert_array_equal(t2[0], t3[0])
            assert_array_equal(t3[0], t4[0])
            assert_array_equal(t1[1], t2[1])
            assert_array_equal(t2[1], t3[1])
            assert_array_equal(t3[1], t4[1])
def test_shufflesplit_errors():
    assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=2.0)
    assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=1.0)
    assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=0.1,
                  train_size=0.95)
    assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=11)
    assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=10)
    assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=8, train_size=3)
    assert_raises(ValueError, cval.ShuffleSplit, 10, train_size=1j)
    assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=None,
                  train_size=None)
def test_shufflesplit_reproducible():
    # Check that iterating twice on the ShuffleSplit gives the same
    # sequence of train-test when the random_state is given
    ss = cval.ShuffleSplit(10, random_state=21)
    assert_array_equal(list(a for a, b in ss), list(a for a, b in ss))
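The modern class keeps this property: with a fixed random_state, repeated calls to .split() reproduce the same index sequence. A sketch:
    import numpy as np
    from sklearn.model_selection import ShuffleSplit

    X = np.zeros((10, 1))
    ss = ShuffleSplit(n_splits=10, random_state=21)
    first = [train.tolist() for train, _ in ss.split(X)]
    second = [train.tolist() for train, _ in ss.split(X)]
    assert first == second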
def __grid_search_model(self, clf_factory, documents, labels, pos_label):
    boolndarr = labels.values == pos_label
    n = documents.size
    n_pos = labels[boolndarr].size
    n_neg = n - n_pos
    param_grid = {
        'vect__binary':       [False, True],
        'vect__min_df':       [1, 2],
        'vect__ngram_range':  [(1, 1), (1, 2), (1, 3)],
        'vect__smooth_idf':   [False, True],
        'vect__stop_words':   [None, 'english'],
        'vect__sublinear_tf': [False, True],
        'vect__use_idf':      [False, True],
        'clf__alpha':         [0, 0.01, 0.05, 0.1, 0.5, 1]
    }
    k = 5
    cv = ShuffleSplit(
        n,
        n_iter=k,
        test_size=1 / k,
        random_state=0
    )
    # Upweight the positive class so that both classes contribute equally
    pos_weight = n_neg / n_pos
    sample_weight = np.ones(n)
    sample_weight[boolndarr] *= pos_weight
    fit_params = {'clf__sample_weight': sample_weight}
    f1_scorer = make_scorer(f1_score, pos_label=pos_label)
    grid_search = GridSearchCV(
        clf_factory,
        param_grid,
        cv=cv,
        fit_params=fit_params,
        n_jobs=-1,
        scoring=f1_scorer
    )
    grid_search.fit(documents, labels)
    best_estimator = grid_search.best_estimator_
    best_score = grid_search.best_score_
    best_params = grid_search.best_params_
    print("Best F1 score: {0:04.3f}".format(best_score))
    print("Parameters: {0}".format(best_params))
    return best_estimator
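In current scikit-learn the fit_params constructor argument is gone from GridSearchCV; per-sample weights are forwarded through fit() instead and routed to the named pipeline step. A hedged sketch with toy data, assuming a recent release that slices array-like fit parameters per CV split (the pipeline here is an assumption, not the project's own clf_factory):
    import numpy as np
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.model_selection import GridSearchCV, ShuffleSplit
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.pipeline import Pipeline

    docs = ["good movie", "bad film", "great plot", "awful acting",
            "fine work", "poor pacing"]
    labels = np.array([1, 0, 1, 0, 1, 0])
    sample_weight = np.ones(len(docs))
    pipe = Pipeline([('vect', TfidfVectorizer()), ('clf', MultinomialNB())])
    cv = ShuffleSplit(n_splits=2, test_size=0.5, random_state=0)
    grid = GridSearchCV(pipe, {'clf__alpha': [0.1, 1.0]}, cv=cv, scoring='f1')
    grid.fit(docs, labels, clf__sample_weight=sample_weight)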
Source: 04_sent.py (project: Building-Machine-Learning-Systems-With-Python-Second-Edition, author: PacktPublishing)
def train_model(clf, X, Y, name="NB ngram", plot=False):
    # create it again for plotting
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)
    train_errors = []
    test_errors = []
    scores = []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []
    clfs = []  # just to later get the median
    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]
        clf.fit(X_train, y_train)
        clfs.append(clf)
        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)
        scores.append(test_score)
        proba = clf.predict_proba(X_test)
        fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, 1])
        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)
    if plot:
        scores_to_sort = pr_scores
        median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]
        plot_pr(pr_scores[median], name, phase, precisions[median],
                recalls[median], label=name)
        log_false_positives(clfs[median], X_test, y_test, name)
    summary = (np.mean(scores), np.std(scores),
               np.mean(pr_scores), np.std(pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)
    return np.mean(train_errors), np.mean(test_errors)
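The PR-AUC bookkeeping above reduces to precision_recall_curve plus auc; a minimal standalone sketch:
    import numpy as np
    from sklearn.metrics import auc, precision_recall_curve

    y_true = np.array([0, 0, 1, 1])
    proba = np.array([0.1, 0.4, 0.35, 0.8])
    precision, recall, _ = precision_recall_curve(y_true, proba)
    print(auc(recall, precision))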
Source: 03_clean.py (project: Building-Machine-Learning-Systems-With-Python-Second-Edition, author: PacktPublishing)
def train_model(clf, X, Y, name="NB ngram", plot=False):
    # create it again for plotting
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)
    train_errors = []
    test_errors = []
    scores = []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []
    clfs = []  # just to later get the median
    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]
        clf.fit(X_train, y_train)
        clfs.append(clf)
        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)
        scores.append(test_score)
        proba = clf.predict_proba(X_test)
        fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, 1])
        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)
    if plot:
        scores_to_sort = pr_scores
        median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]
        plot_pr(pr_scores[median], name, phase, precisions[median],
                recalls[median], label=name)
        log_false_positives(clfs[median], X_test, y_test, name)
    summary = (np.mean(scores), np.std(scores),
               np.mean(pr_scores), np.std(pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)
    return np.mean(train_errors), np.mean(test_errors)
Source: 02_tuning.py (project: Building-Machine-Learning-Systems-With-Python-Second-Edition, author: PacktPublishing)
def train_model(clf, X, Y, name="NB ngram", plot=False):
    # create it again for plotting
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)
    train_errors = []
    test_errors = []
    scores = []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []
    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]
        clf.fit(X_train, y_train)
        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)
        scores.append(test_score)
        proba = clf.predict_proba(X_test)
        fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, 1])
        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)
    if plot:
        scores_to_sort = pr_scores
        median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]
        plot_pr(pr_scores[median], name, phase, precisions[median],
                recalls[median], label=name)
    summary = (np.mean(scores), np.std(scores),
               np.mean(pr_scores), np.std(pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)
    return np.mean(train_errors), np.mean(test_errors)
def analyze(clf, labels=None):
    def _do(matrix, test_ratio=0.0):
        if labels:  # Learning mode
            # Split train & test folds
            shuffle = ShuffleSplit(len(matrix), test_size=test_ratio)
            trainlist, testlist = [(a, b) for (a, b) in shuffle][-1]
            X_train = [x for x in map(lambda i: matrix[i], trainlist)]
            Y_train = [y for y in map(lambda i: labels[i], trainlist)]
            X_valid = [x for x in map(lambda i: matrix[i], testlist)]
            Y_valid = [y for y in map(lambda i: labels[i], testlist)]
            # Display what the underlying classifier is
            print(colored(clf[-1], 'yellow'))
            # Display the dimension of the training elements
            print(colored('Trainset:', 'cyan'))
            print(colored('X: {0}'.format(np.shape(X_train)), 'yellow'))
            print(colored('y: {0}'.format(np.shape(Y_train)), 'yellow'))
            # Process trainset
            for opr in clf[:-1]:
                print(colored(opr, 'yellow'))
                X_train = opr.fit_transform(X_train, Y_train)
            # NOTE: The last operation of the CLF is always a clustering algo
            clf[-1].fit(X_train, Y_train)
            # Display the dimension of the validation elements
            print(colored('Validation set:', 'cyan'))
            print(colored('X: {0}'.format(np.shape(X_valid)), 'yellow'))
            print(colored('y: {0}'.format(np.shape(Y_valid)), 'yellow'))
            # Process validation set
            for opr in clf[:-1]:
                print(colored(opr, 'yellow'))
                X_valid = opr.transform(X_valid)
            # Return tuple of [actual], [prediction]
            # on the validation set
            return (Y_valid, clf[-1].predict(X_valid))
        else:  # Classification mode
            X = matrix
            # Feature transformations
            for opr in clf[:-1]:
                X = opr.transform(X)
            # NOTE: Predict the clusters with the last operation
            y = clf[-1].predict(X)
            return iter(y)
    return _do
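A hypothetical end-to-end use of the closure above; the pipeline list and data here are assumptions, and running it requires the snippet's own environment (scikit-learn < 0.20 for the legacy ShuffleSplit import, plus numpy and termcolor):
    import numpy as np
    from sklearn.decomposition import PCA
    from sklearn.cluster import KMeans

    matrix = np.random.rand(30, 8)                      # toy feature matrix
    labels = list(np.random.randint(0, 3, size=30))     # toy cluster labels
    run = analyze([PCA(n_components=2), KMeans(n_clusters=3)], labels)
    actual, predicted = run(matrix, test_ratio=0.25)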