def __init__(self, pdr, E_QC="E_QC", Em="Em", type_name="Type", type_l=[1, 2, 3, 4],
             disp=False, graph=False):
    # These parameters are used later by the run() function.
    self.type_l = type_l
    self.disp = disp
    self.graph = graph
    self.xMa = {}
    self.yVa = {}
    # self.kfa = {}
    for type_id in type_l:
        pdr_new = pdr[pdr[type_name] == type_id]
        self.xMa[type_id] = np.mat(pdr_new[E_QC].values).T
        self.yVa[type_id] = np.mat(pdr_new[Em].values).T
        # kfa[type_id] = cross_validation.KFold(np.shape(yVa[type_id])[0], n_folds=5, shuffle=True)
Python KFold() usage examples (source code)
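Note: most of the snippets below target the pre-0.18 scikit-learn interface (sklearn.cross_validation, sklearn.grid_search), which was deprecated in 0.18 and removed in 0.20. A minimal sketch of the API difference, assuming scikit-learn >= 0.18:

import numpy as np
from sklearn.model_selection import KFold

X = np.arange(20).reshape(10, 2)

# Old API: cross_validation.KFold(n, n_folds=5, shuffle=True) took the
# sample count and was iterated over directly.
# New API: KFold takes n_splits, and the data goes to .split().
kf = KFold(n_splits=5, shuffle=True, random_state=0)
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)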
def _gs_SVC_r0(xM, yVc, params):
    """
    Since this is classification, yVc holds discrete class labels,
    whereas yV may hold floating-point values.
    """
    print(xM.shape, yVc.shape)
    clf = svm.SVC()
    # params (the search grid) is supplied by the caller.
    kf5 = cross_validation.KFold(xM.shape[0], n_folds=5, shuffle=True)
    gs = grid_search.GridSearchCV(clf, params, cv=kf5, n_jobs=-1)
    gs.fit(xM, yVc)
    return gs
def gs_SVC(xM, yVc, params, n_folds=5):
    """
    Since this is classification, yVc holds discrete class labels,
    whereas yV may hold floating-point values.
    """
    print(xM.shape, yVc.shape)
    clf = svm.SVC()
    # params (the search grid) is supplied by the caller.
    kf5 = cross_validation.KFold(xM.shape[0], n_folds=n_folds, shuffle=True)
    gs = grid_search.GridSearchCV(clf, params, cv=kf5, n_jobs=-1)
    gs.fit(xM, yVc)
    return gs
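For reference, the same search written against the modern API, as a sketch only; the data and the 'C' grid below are synthetic stand-ins, since params is caller-supplied in the original:

import numpy as np
from sklearn import svm
from sklearn.model_selection import GridSearchCV, KFold

xM = np.random.rand(40, 3)                 # synthetic features
yVc = np.random.randint(0, 2, size=40)     # synthetic class labels
params = {'C': np.logspace(-1, 1, 5)}      # hypothetical SVC grid

kf5 = KFold(n_splits=5, shuffle=True)
gs = GridSearchCV(svm.SVC(), params, cv=kf5, n_jobs=-1)
gs.fit(xM, yVc)
print(gs.best_params_)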
def gs_Ridge_BIKE(A_list, yV, XX=None, alphas_log=(1, -1, 9), n_folds=5, n_jobs=-1):
    """
    A_list is a list of A matrices, where each A is a similarity matrix.
    XX holds the concatenated linear descriptors;
    if no descriptors are used, XX can be None.
    """
    clf = binary_model.BIKE_Ridge(A_list, XX)
    params = {'alpha': np.logspace(*alphas_log)}
    ln = A_list[0].shape[0]  # ln is the number of molecules.
    kf_n = cross_validation.KFold(ln, n_folds=n_folds, shuffle=True)
    gs = grid_search.GridSearchCV(clf, params, scoring='r2', cv=kf_n, n_jobs=n_jobs)
    # Pass a column of row indices so the CV splitter slices indices,
    # and the estimator looks up the corresponding kernel rows itself.
    AX_idx = np.array([list(range(ln))]).T
    gs.fit(AX_idx, yV)
    return gs
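BIKE_Ridge keeps the similarity matrices inside the estimator, so the function above cross-validates over a column of row indices (AX_idx) rather than over the kernels themselves. A standard scikit-learn alternative to that index trick, sketched below with a synthetic kernel, is a pairwise estimator with kernel='precomputed', in which case the CV machinery slices the kernel rows and columns for you:

import numpy as np
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import GridSearchCV, KFold

n = 30
A = np.random.rand(n, n)
A = A @ A.T                                # synthetic PSD similarity matrix
yV = np.random.rand(n)

gs = GridSearchCV(KernelRidge(kernel='precomputed'),
                  {'alpha': np.logspace(1, -1, 9)},
                  scoring='r2', cv=KFold(n_splits=5, shuffle=True))
gs.fit(A, yV)                              # sklearn slices A by train/test indices
print(gs.best_params_)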
def cv(method, xM, yV, alpha, n_folds=5, n_jobs=-1, grid_std=None, graph=True):
    """
    method can be 'Ridge' or 'Lasso'.
    Cross-validation is performed so as to generate a prediction output
    for every input molecule.
    """
    print(xM.shape, yV.shape)
    clf = getattr(linear_model, method)(alpha=alpha)
    kf_n = cross_validation.KFold(xM.shape[0], n_folds=n_folds, shuffle=True)
    yV_pred = cross_validation.cross_val_predict(clf, xM, yV, cv=kf_n, n_jobs=n_jobs)
    if graph:
        print('The prediction output using cross-validation is given by:')
        jutil.cv_show(yV, yV_pred, grid_std=grid_std)
    return yV_pred
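The modern equivalent of this out-of-fold prediction loop, as a sketch with synthetic data (cross_val_predict moved to sklearn.model_selection in 0.18):

import numpy as np
from sklearn import linear_model
from sklearn.model_selection import KFold, cross_val_predict

xM = np.random.rand(50, 4)
yV = np.random.rand(50)

clf = linear_model.Ridge(alpha=0.5)
kf_n = KFold(n_splits=5, shuffle=True)
yV_pred = cross_val_predict(clf, xM, yV, cv=kf_n, n_jobs=-1)
print(yV_pred.shape)  # one out-of-fold prediction per sample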
def cv_Ridge_BIKE(A_list, yV, XX=None, alpha=0.5, n_folds=5, n_jobs=-1, grid_std=None):
    clf = binary_model.BIKE_Ridge(A_list, XX, alpha=alpha)
    ln = A_list[0].shape[0]  # ln is the number of molecules.
    kf_n = cross_validation.KFold(ln, n_folds=n_folds, shuffle=True)
    AX_idx = np.array([list(range(ln))]).T
    yV_pred = cross_validation.cross_val_predict(clf, AX_idx, yV, cv=kf_n, n_jobs=n_jobs)
    print('The prediction output using cross-validation is given by:')
    jutil.cv_show(yV, yV_pred, grid_std=grid_std)
    return yV_pred
def gs_BIKE_Ridge(A_list, yV, alphas_log=(1, -1, 9), X_concat=None, n_folds=5, n_jobs=-1):
    """
    A_list is a list of A matrices, where each A is a similarity matrix.
    X_concat holds the concatenated linear descriptors;
    if no descriptors are used, X_concat can be None.
    """
    clf = binary_model.BIKE_Ridge(A_list, X_concat)
    params = {'alpha': np.logspace(*alphas_log)}
    ln = A_list[0].shape[0]  # ln is the number of molecules.
    kf_n = cross_validation.KFold(ln, n_folds=n_folds, shuffle=True)
    gs = grid_search.GridSearchCV(clf, params, scoring='r2', cv=kf_n, n_jobs=n_jobs)
    AX_idx = np.array([list(range(ln))]).T
    gs.fit(AX_idx, yV)
    return gs
def cv(method, xM, yV, alpha, n_folds=5, n_jobs=-1, grid_std=None, graph=True, shuffle=True):
    """
    method can be 'Ridge' or 'Lasso'.
    Cross-validation is performed so as to generate a prediction output
    for every input molecule.
    """
    print(xM.shape, yV.shape)
    clf = getattr(linear_model, method)(alpha=alpha)
    kf_n = cross_validation.KFold(xM.shape[0], n_folds=n_folds, shuffle=shuffle)
    yV_pred = cross_validation.cross_val_predict(clf, xM, yV, cv=kf_n, n_jobs=n_jobs)
    if graph:
        print('The prediction output using cross-validation is given by:')
        jutil.cv_show(yV, yV_pred, grid_std=grid_std)
    return yV_pred
def _cv_LOO_r0(method, xM, yV, alpha, n_jobs=-1, grid_std=None, graph=True):
    """
    method can be 'Ridge' or 'Lasso'.
    n_folds is set to the number of samples, so this is leave-one-out
    cross-validation: one prediction is generated for every input molecule.
    """
    n_folds = xM.shape[0]
    print(xM.shape, yV.shape)
    clf = getattr(linear_model, method)(alpha=alpha)
    kf_n = cross_validation.KFold(xM.shape[0], n_folds=n_folds)
    yV_pred = cross_validation.cross_val_predict(clf, xM, yV, cv=kf_n, n_jobs=n_jobs)
    if graph:
        print('The prediction output using cross-validation is given by:')
        jutil.cv_show(yV, yV_pred, grid_std=grid_std)
    return yV_pred
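Setting n_folds to the sample count, as above, is leave-one-out cross-validation; the modern API has a dedicated splitter for it. A sketch with synthetic data:

import numpy as np
from sklearn import linear_model
from sklearn.model_selection import LeaveOneOut, cross_val_predict

xM = np.random.rand(20, 3)
yV = np.random.rand(20)

clf = linear_model.Ridge(alpha=0.5)
yV_pred = cross_val_predict(clf, xM, yV, cv=LeaveOneOut())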
def create_cv_id(target, n_folds_=5, cv_id_name=cv_id_name, seed=407):
    # cv_id_name and INPUT_PATH are module-level constants in the source project.
    try:
        # The pre-0.18 StratifiedKFold exposes the per-sample fold id directly.
        a = StratifiedKFold(target['target'], n_folds=n_folds_, shuffle=True, random_state=seed)
        cv_index = a.test_folds
        print('Done StratifiedKFold')
    except Exception:
        # Fall back to plain KFold and build the fold-id array by hand.
        cv_index = np.empty(len(target))
        a = KFold(len(target), n_folds=n_folds_, shuffle=True, random_state=seed)
        for idx, i in enumerate(a):
            cv_index[i[1]] = idx
        cv_index = cv_index.astype(int)
        print('Done KFold')
    np.save(INPUT_PATH + cv_id_name, cv_index)
    return
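The test_folds attribute used above belongs to the pre-0.18 StratifiedKFold; its modern counterpart exposes no such attribute, so the per-sample fold id has to be assembled from split(). A sketch with a synthetic target:

import numpy as np
from sklearn.model_selection import StratifiedKFold

y = np.random.randint(0, 2, size=100)      # synthetic binary target
cv_index = np.empty(len(y), dtype=int)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=407)
for fold_id, (_, test_idx) in enumerate(skf.split(np.zeros((len(y), 1)), y)):
    cv_index[test_idx] = fold_id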
######### Utils #########
def test_kfold_no_shuffle():
    # Manually check that KFold preserves the data ordering on toy datasets
    splits = iter(cval.KFold(4, 2))
    train, test = next(splits)
    assert_array_equal(test, [0, 1])
    assert_array_equal(train, [2, 3])
    train, test = next(splits)
    assert_array_equal(test, [2, 3])
    assert_array_equal(train, [0, 1])
    splits = iter(cval.KFold(5, 2))
    train, test = next(splits)
    assert_array_equal(test, [0, 1, 2])
    assert_array_equal(train, [3, 4])
    train, test = next(splits)
    assert_array_equal(test, [3, 4])
    assert_array_equal(train, [0, 1, 2])
def test_predefinedsplit_with_kfold_split():
    # Check that PredefinedSplit can reproduce a split generated by KFold.
    folds = -1 * np.ones(10)
    kf_train = []
    kf_test = []
    for i, (train_ind, test_ind) in enumerate(cval.KFold(10, 5, shuffle=True)):
        kf_train.append(train_ind)
        kf_test.append(test_ind)
        folds[test_ind] = i
    ps_train = []
    ps_test = []
    ps = cval.PredefinedSplit(folds)
    for train_ind, test_ind in ps:
        ps_train.append(train_ind)
        ps_test.append(test_ind)
    assert_array_equal(ps_train, kf_train)
    assert_array_equal(ps_test, kf_test)
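The same round trip in the modern API, as a sketch. With shuffle=True the test indices inside a fold are not necessarily sorted, while PredefinedSplit returns them sorted, so the comparison here is done on sorted arrays:

import numpy as np
from sklearn.model_selection import KFold, PredefinedSplit

X = np.zeros((10, 1))
folds = -1 * np.ones(10, dtype=int)
kf_splits = list(KFold(n_splits=5, shuffle=True).split(X))
for i, (_, test_ind) in enumerate(kf_splits):
    folds[test_ind] = i
for (_, te_kf), (_, te_ps) in zip(kf_splits, PredefinedSplit(folds).split()):
    assert np.array_equal(np.sort(te_kf), np.sort(te_ps))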
def calculate_roc(thresholds, embeddings1, embeddings2, actual_issame, nrof_folds=10):
    assert(embeddings1.shape[0] == embeddings2.shape[0])
    assert(embeddings1.shape[1] == embeddings2.shape[1])
    nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
    nrof_thresholds = len(thresholds)
    k_fold = KFold(n_splits=nrof_folds, shuffle=False)
    tprs = np.zeros((nrof_folds, nrof_thresholds))
    fprs = np.zeros((nrof_folds, nrof_thresholds))
    accuracy = np.zeros((nrof_folds))
    diff = np.subtract(embeddings1, embeddings2)
    dist = np.sum(np.square(diff), 1)
    indices = np.arange(nrof_pairs)
    for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):
        # Find the best threshold for the fold
        acc_train = np.zeros((nrof_thresholds))
        for threshold_idx, threshold in enumerate(thresholds):
            _, _, acc_train[threshold_idx] = calculate_accuracy(threshold, dist[train_set], actual_issame[train_set])
        best_threshold_index = np.argmax(acc_train)
        for threshold_idx, threshold in enumerate(thresholds):
            tprs[fold_idx, threshold_idx], fprs[fold_idx, threshold_idx], _ = calculate_accuracy(threshold, dist[test_set], actual_issame[test_set])
        _, _, accuracy[fold_idx] = calculate_accuracy(thresholds[best_threshold_index], dist[test_set], actual_issame[test_set])
    tpr = np.mean(tprs, 0)
    fpr = np.mean(fprs, 0)
    return tpr, fpr, accuracy
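A hedged usage sketch for calculate_roc. calculate_accuracy is defined elsewhere in the original file; the stand-in below merely matches the call shape (threshold, dist, actual_issame) -> (tpr, fpr, acc), and all data is synthetic:

import numpy as np

def calculate_accuracy(threshold, dist, actual_issame):
    # Stand-in: pairs closer than the threshold are predicted "same".
    predict_issame = np.less(dist, threshold)
    tp = np.sum(np.logical_and(predict_issame, actual_issame))
    fp = np.sum(np.logical_and(predict_issame, np.logical_not(actual_issame)))
    tn = np.sum(np.logical_and(np.logical_not(predict_issame),
                               np.logical_not(actual_issame)))
    fn = np.sum(np.logical_and(np.logical_not(predict_issame), actual_issame))
    tpr = 0 if (tp + fn == 0) else float(tp) / float(tp + fn)
    fpr = 0 if (fp + tn == 0) else float(fp) / float(fp + tn)
    return tpr, fpr, float(tp + tn) / dist.size

emb1 = np.random.rand(100, 128)
emb2 = np.random.rand(100, 128)
issame = np.random.rand(100) > 0.5
thresholds = np.arange(0, 4, 0.01)
tpr, fpr, accuracy = calculate_roc(thresholds, emb1, emb2, issame, nrof_folds=10)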
def calculate_val(thresholds, embeddings1, embeddings2, actual_issame, far_target, nrof_folds=10):
    assert(embeddings1.shape[0] == embeddings2.shape[0])
    assert(embeddings1.shape[1] == embeddings2.shape[1])
    nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
    nrof_thresholds = len(thresholds)
    k_fold = KFold(n_splits=nrof_folds, shuffle=False)
    val = np.zeros(nrof_folds)
    far = np.zeros(nrof_folds)
    diff = np.subtract(embeddings1, embeddings2)
    dist = np.sum(np.square(diff), 1)
    indices = np.arange(nrof_pairs)
    for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):
        # Find the threshold that gives FAR = far_target
        far_train = np.zeros(nrof_thresholds)
        for threshold_idx, threshold in enumerate(thresholds):
            _, far_train[threshold_idx] = calculate_val_far(threshold, dist[train_set], actual_issame[train_set])
        if np.max(far_train) >= far_target:
            f = interpolate.interp1d(far_train, thresholds, kind='slinear')
            threshold = f(far_target)
        else:
            threshold = 0.0
        val[fold_idx], far[fold_idx] = calculate_val_far(threshold, dist[test_set], actual_issame[test_set])
    val_mean = np.mean(val)
    far_mean = np.mean(far)
    val_std = np.std(val)
    return val_mean, val_std, far_mean
def get_kfold_bydate(self, df, n_folds=10):
    df.sort_values(by=['time_date', 'time_id', 'start_district_id'], axis=0, inplace=True)
    df.reset_index(drop=True, inplace=True)
    kf = KFold(df.shape[0], n_folds=n_folds, shuffle=False)
    for train_index, test_index in kf:
        print("TRAIN:", train_index, "TEST:", test_index)
    return kf
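Since the frame is sorted by date and the folds are not shuffled, each fold is a contiguous time block; note, though, that plain KFold still trains on future rows when predicting past ones. TimeSeriesSplit (scikit-learn >= 0.18) is the forward-chaining alternative, sketched here:

import numpy as np
from sklearn.model_selection import TimeSeriesSplit

X = np.arange(12).reshape(-1, 1)   # rows already sorted by date
for train_index, test_index in TimeSeriesSplit(n_splits=3).split(X):
    print("TRAIN:", train_index, "TEST:", test_index)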
def test_large_grid():
    """In this test, we purposely overfit a RandomForest to completely random data
    in order to assert that the test error will far exceed the train error.
    """
    if not SK18:
        custom_cv = KFold(n=y_train.shape[0], n_folds=3, shuffle=True, random_state=42)
    else:
        custom_cv = KFold(n_splits=3, shuffle=True, random_state=42)
    # define the pipe
    pipe = Pipeline([
        ('scaler', SelectiveScaler()),
        ('pca', SelectivePCA(weight=True)),
        ('rf', RandomForestClassifier(random_state=42))
    ])
    # define hyper parameters
    hp = {
        'scaler__scaler': [StandardScaler(), RobustScaler(), MinMaxScaler()],
        'pca__whiten': [True, False],
        'pca__weight': [True, False],
        'pca__n_components': uniform(0.75, 0.15),
        'rf__n_estimators': randint(5, 10),
        'rf__max_depth': randint(5, 15)
    }
    # define the grid
    grid = RandomizedSearchCV(pipe, hp, n_iter=2, scoring='accuracy', n_jobs=1, cv=custom_cv, random_state=42)
    # this will fail because we haven't fit yet
    assert_fails(grid.score, (ValueError, AttributeError), X_train, y_train)
    # fit the grid
    grid.fit(X_train, y_train)
    # score for coverage -- this might warn...
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        grid.score(X_train, y_train)
    # coverage:
    assert grid._estimator_type == 'classifier'
    # get predictions
    tr_pred, te_pred = grid.predict(X_train), grid.predict(X_test)
    # evaluate score (SHOULD be better than random...)
    accuracy_score(y_train, tr_pred), accuracy_score(y_test, te_pred)
    # grid score reports:
    # assert fails for bad percentile
    assert_fails(report_grid_score_detail, ValueError, **{'random_search': grid, 'percentile': 0.0})
    assert_fails(report_grid_score_detail, ValueError, **{'random_search': grid, 'percentile': 1.0})
    # assert fails for bad y_axis
    assert_fails(report_grid_score_detail, ValueError, **{'random_search': grid, 'y_axis': 'bad_axis'})
    # assert passes otherwise
    report_grid_score_detail(grid, charts=True, percentile=0.95)  # just ensure percentile works
def crossvalidate(problem, dataset, set_size, uid, w, var, cov,
                  transform, old_alpha, lmbda=0.5):
    """Finds the best hyperparameters using cross-validation.

    Parameters
    ----------
    WRITEME

    Returns
    -------
    alpha : tuple
        The best hyperparameter.
    """
    if len(dataset) % _NUM_FOLDS != 0:
        return old_alpha
    kfold = KFold(len(dataset), n_folds=_NUM_FOLDS)
    f = compute_transform(uid, w, var, cov, transform, lmbda=lmbda)
    avg_accuracy = np.zeros(len(_ALPHAS))
    for i, alpha in enumerate(_ALPHAS):
        accuracies = []
        for tr_indices, ts_indices in kfold:
            w, _ = problem.select_query(dataset[tr_indices], set_size, alpha,
                                        transform=f)
            utilities = np.dot(w, dataset[ts_indices].T)
            accuracies.append((utilities > 0).mean())
        avg_accuracy[i] = sum(accuracies) / len(accuracies)
    alpha = _I_TO_ALPHA[np.argmax(avg_accuracy)]
    _LOG.debug('''\
        alpha accuracies = {avg_accuracy}
        best alpha = {alpha}
        '''.format(**locals()))
    return alpha
def cross_validate(model, X, y, n_folds, batch_size, num_epoch, func_for_evaluation=None):
    # Shuffle X and y together by reseeding between the two shuffles.
    seed = 5
    np.random.seed(seed)
    np.random.shuffle(X)
    np.random.seed(seed)
    np.random.shuffle(y)
    X = np.array(X)
    y = np.array(y)
    scores = np.zeros(n_folds)
    kf = KFold(len(y), n_folds=n_folds)
    for i, (train_index, test_index) in enumerate(kf):
        X_train, y_train = X[train_index, :], y[train_index]
        X_test, y_test = X[test_index, :], y[test_index]
        model.fit(X_train, y_train,
                  batch_size=batch_size,
                  nb_epoch=num_epoch)
        predictions = model.predict(X_test)
        score = func_for_evaluation(predictions[:, 0].tolist(), y_test)
        try:
            scores[i] = score[0]
        except IndexError:
            scores[i] = score
    print("{}-Fold cross validation score: {}".format(n_folds, scores.mean()))