def _cv_r0( method, xM, yV, alpha, n_folds = 5, n_jobs = -1, grid_std = None, graph = True):
"""
method can be 'Ridge', 'Lasso'
cross validation is performed so as to generate prediction output for all input molecules
"""
print(xM.shape, yV.shape)
clf = getattr( linear_model, method)( alpha = alpha)
kf_n = cross_validation.KFold( xM.shape[0], n_folds=n_folds, shuffle=True)
yV_pred = cross_validation.cross_val_predict( clf, xM, yV, cv = kf_n, n_jobs = n_jobs)
if graph:
print('The prediction output using cross-validation is given by:')
jutil.cv_show( yV, yV_pred, grid_std = grid_std)
return yV_pred
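All of the snippets on this page use the legacy sklearn.cross_validation module, which was deprecated in scikit-learn 0.18 and removed in 0.20. A minimal sketch of the same Ridge/Lasso pattern against the current sklearn.model_selection API (the synthetic xM/yV data is illustrative only):

import numpy as np
from sklearn import linear_model
from sklearn.model_selection import KFold, cross_val_predict

# Synthetic stand-ins for the descriptor matrix xM and target vector yV.
rng = np.random.RandomState(0)
xM = rng.randn(100, 10)
yV = xM @ rng.randn(10) + 0.1 * rng.randn(100)

def cv_modern(method, xM, yV, alpha, n_splits=5):
    # Same dispatch trick as above: look the estimator class up by name.
    clf = getattr(linear_model, method)(alpha=alpha)
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=0)
    # cross_val_predict stitches together the held-out predictions from
    # every fold, so each sample is predicted exactly once.
    return cross_val_predict(clf, xM, yV, cv=kf)

yV_pred = cv_modern('Ridge', xM, yV, alpha=1.0)
print(yV_pred.shape)  # (100,)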
def test_model(self, n_folds=10):
""" ?? `??K-??????Stratified K-folds cross-validating?`
???????
"""
logging.debug("testing model with {}-folds CV".format(n_folds))
model = self.init_model()
X = self.data.data
y = self.data.target
        # Note: random_state only takes effect together with shuffle=True.
        cv = cross_validation.StratifiedKFold(y, n_folds=n_folds, shuffle=True, random_state=42)
t0 = time()
y_pred = cross_validation.cross_val_predict(model, X=X, y=y, n_jobs=-1, cv=cv)
t = time() - t0
print("=" * 52)
print("time cost: {}".format(t))
print()
print("confusion matrix\n", metrics.confusion_matrix(y, y_pred))
print()
print("\t\taccuracy: {}".format(metrics.accuracy_score(y, y_pred)))
print()
print("\t\tclassification report")
print("-" * 52)
print(metrics.classification_report(y, y_pred))
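The StratifiedKFold call above uses the pre-0.18 labels-first signature. A sketch of the same check against the current API, with LogisticRegression on iris standing in for self.init_model() and self.data (both are assumptions, not part of the original class):

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_predict

X, y = load_iris(return_X_y=True)
model = LogisticRegression(max_iter=1000)
# Current API: the splitter takes n_splits, and y is supplied at split
# time (cross_val_predict forwards it automatically).
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
print(confusion_matrix(y, y_pred))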
def get_logistic_regression_coefs_l2(self, category,
clf=RidgeClassifierCV()):
        ''' Computes an l2-penalized linear classification score
        (despite the name, the default classifier is RidgeClassifierCV,
        not logistic regression).
        Parameters
        ----------
        category : str
            category name to score
        Returns
        -------
        (coefficient array, accuracy, majority class baseline accuracy)
        '''
from sklearn.cross_validation import cross_val_predict
y = self._get_mask_from_category(category)
X = TfidfTransformer().fit_transform(self._X)
clf.fit(X, y)
y_hat = cross_val_predict(clf, X, y)
acc, baseline = self._get_accuracy_and_baseline_accuracy(y, y_hat)
return clf.coef_[0], acc, baseline
def get_logistic_regression_coefs_l1(self, category,
clf=LassoCV(alphas=[0.1, 0.001],
max_iter=10000,
n_jobs=-1)):
        ''' Computes an l1-penalized regression score. Despite the name,
        it approximates l1 logistic regression by fitting LassoCV on a
        continuous recoding of the boolean labels.
        Parameters
        ----------
        category : str
            category name to score
        Returns
        -------
        (coefficient array, accuracy, majority class baseline accuracy)
        '''
from sklearn.cross_validation import cross_val_predict
y = self._get_mask_from_category(category)
y_continuous = self._get_continuous_version_boolean_y(y)
# X = TfidfTransformer().fit_transform(self._X)
X = self._X
clf.fit(X, y_continuous)
y_hat = (cross_val_predict(clf, X, y_continuous) > 0)
acc, baseline = self._get_accuracy_and_baseline_accuracy(y, y_hat)
return clf.coef_, acc, baseline
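The method above approximates an l1 logistic fit by regressing LassoCV on a continuous recoding of the boolean labels and thresholding the cross-validated predictions at zero. A sketch of the more direct route, fitting an actual l1-penalized logistic regression (the helper and its names are illustrative, not part of the original class):

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

def l1_logistic_coefs(X, y, C=1.0):
    # liblinear and saga are the solvers that support the l1 penalty.
    clf = LogisticRegression(penalty='l1', solver='liblinear', C=C)
    y_hat = cross_val_predict(clf, X, y)  # predicted labels directly
    acc = float(np.mean(y_hat == y))
    baseline = float(max(np.mean(y), 1 - np.mean(y)))  # majority class
    clf.fit(X, y)
    return clf.coef_[0], acc, baseline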
def fit(self, xy_file, fname_out):
"""
All grid results will be saved later,
although only the best result is saved.
"""
df = read_csv( xy_file)
X = df['X'].values
y = df['y'].values
super().fit( X, y)
yp = cross_validation.cross_val_predict( self.best_estimator_, X, y)
m_idx = pd.MultiIndex.from_product([['yp'], df['y'].columns])
yp_df = pd.DataFrame( yp, index = df.index, columns=m_idx)
df_out = pd.concat([df, yp_df], axis = 1)
df_out.to_csv( fname_out)
return self
def cross_val_predict(self, fname_out = None):
"""
This function is added to save the result of the predicted values.
"""
yp = cross_validation.cross_val_predict( self.best_estimator_, self.X, self.y)
idx = pd.MultiIndex.from_product([['yp'], self.df['y'].columns])
yp_df = pd.DataFrame( yp, index = self.df.index, columns=idx)
df_out_org = self.df.merge( yp_df, left_index = True, right_index = True)
self.df_out = DataFrame( df_out_org[["X", "y", "yp", "param"]])
# df_out = pd.concat([self.df, yp_df], axis = 1)
self.df_out.to_csv_excel( '_out', self.fname, fname_out)
return yp
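The column construction above assumes self.df carries two-level columns such as ('X', …) and ('y', …); the predictions are mirrored under a new top-level 'yp' key. A toy sketch of that pattern:

import numpy as np
import pandas as pd

# Toy frame with the assumed two-level column layout.
df = pd.DataFrame(np.arange(8.0).reshape(4, 2),
                  columns=pd.MultiIndex.from_product([['X', 'y'], ['v0']]))
yp = df[('y', 'v0')].values * 2  # stand-in for the CV predictions

# Mirror the 'y' sub-columns under a new top-level 'yp' key.
idx = pd.MultiIndex.from_product([['yp'], df['y'].columns])
yp_df = pd.DataFrame(yp, index=df.index, columns=idx)
print(df.merge(yp_df, left_index=True, right_index=True))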
def cv( method, xM, yV, alpha, n_folds = 5, n_jobs = -1, grid_std = None, graph = True):
"""
method can be 'Ridge', 'Lasso'
cross validation is performed so as to generate prediction output for all input molecules
"""
print(xM.shape, yV.shape)
clf = getattr( linear_model, method)( alpha = alpha)
kf_n = cross_validation.KFold( xM.shape[0], n_folds=n_folds, shuffle=True)
yV_pred = cross_validation.cross_val_predict( clf, xM, yV, cv = kf_n, n_jobs = n_jobs)
if graph:
print('The prediction output using cross-validation is given by:')
jutil.cv_show( yV, yV_pred, grid_std = grid_std)
return yV_pred
def cv_Ridge_BIKE( A_list, yV, XX = None, alpha = 0.5, n_folds = 5, n_jobs = -1, grid_std = None):
clf = binary_model.BIKE_Ridge( A_list, XX, alpha = alpha)
    ln = A_list[0].shape[0]  # ln is the number of molecules.
kf_n = cross_validation.KFold( ln, n_folds=n_folds, shuffle=True)
AX_idx = np.array([list(range( ln))]).T
yV_pred = cross_validation.cross_val_predict( clf, AX_idx, yV, cv = kf_n, n_jobs = n_jobs)
print('The prediction output using cross-validation is given by:')
jutil.cv_show( yV, yV_pred, grid_std = grid_std)
return yV_pred
def cv( method, xM, yV, alpha, n_folds = 5, n_jobs = -1, grid_std = None, graph = True, shuffle = True):
"""
method can be 'Ridge', 'Lasso'
cross validation is performed so as to generate prediction output for all input molecules
"""
print(xM.shape, yV.shape)
clf = getattr( linear_model, method)( alpha = alpha)
kf_n = cross_validation.KFold( xM.shape[0], n_folds=n_folds, shuffle=shuffle)
yV_pred = cross_validation.cross_val_predict( clf, xM, yV, cv = kf_n, n_jobs = n_jobs)
if graph:
print('The prediction output using cross-validation is given by:')
jutil.cv_show( yV, yV_pred, grid_std = grid_std)
return yV_pred
def _cv_LOO_r0( method, xM, yV, alpha, n_jobs = -1, grid_std = None, graph = True):
"""
method can be 'Ridge', 'Lasso'
cross validation is performed so as to generate prediction output for all input molecules
"""
n_folds = xM.shape[0]
print(xM.shape, yV.shape)
clf = getattr( linear_model, method)( alpha = alpha)
kf_n = cross_validation.KFold( xM.shape[0], n_folds=n_folds)
yV_pred = cross_validation.cross_val_predict( clf, xM, yV, cv = kf_n, n_jobs = n_jobs)
if graph:
print('The prediction output using cross-validation is given by:')
jutil.cv_show( yV, yV_pred, grid_std = grid_std)
return yV_pred
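Setting n_folds to the number of samples, as _cv_LOO_r0 does, is leave-one-out cross-validation. The current API has a dedicated splitter; a minimal sketch with synthetic data:

import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import LeaveOneOut, cross_val_predict

rng = np.random.RandomState(0)
xM = rng.randn(30, 4)
yV = xM @ rng.randn(4)

# LeaveOneOut() is equivalent to KFold(n_splits=len(xM)): each sample is
# predicted by a model trained on all the other samples.
yV_pred = cross_val_predict(Ridge(alpha=1.0), xM, yV, cv=LeaveOneOut())
print(yV_pred.shape)  # (30,)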
def cross_predict(feat, f_name, X=X, y=y):
if os.name == 'nt':
n_jobs = 1
else:
n_jobs = -1
    # classifiers
# clf_1 = MultinomialNB(alpha=5)
clf_2 = LinearSVC(C=0.02)
    # cross-validation (CV)
# This cross-validation object is a merge of StratifiedKFold and ShuffleSplit,
# which returns stratified randomized folds. The folds are made by preserving
# the percentage of samples for each class.
#
# Note: like the ShuffleSplit strategy, stratified random splits do not guarantee
# that all folds will be different, although this is still
# very likely for sizeable datasets.
#
    # Passing this cv to cross_val_predict would raise:
    #   ValueError: cross_val_predict only works for partitions
    #
    # i.e. the folds produced by this kind of cv may overlap, so the test
    # sets do not partition the samples.
# cv = cross_validation.StratifiedShuffleSplit(y, test_size=0.2, random_state=42)
# This cross-validation object is a variation of KFold that returns stratified folds.
# The folds are made by preserving the percentage of samples for each class.
cv = cross_validation.StratifiedKFold(y, n_folds=5, random_state=42)
model = Pipeline([('feat', feat), ('clf', clf_2)])
t0 = time()
y_pred = cross_validation.cross_val_predict(model, X=X, y=y, n_jobs=n_jobs, cv=cv)
t = time() - t0
print("=" * 20, f_name, "=" * 20)
print("time cost: {}".format(t))
# print("y_predict: {}".format(y_pred))
print()
print('confusion matrix:\n', confusion_matrix(y, y_pred))
print()
print('\t\taccuracy: {}'.format(accuracy_score(y, y_pred)))
print()
print("\t\tclassification report")
print("-" * 52)
print(classification_report(y, y_pred))
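The comment block in cross_predict is worth demonstrating: cross_val_predict requires every sample to fall in exactly one test fold, which shuffle-split strategies do not guarantee. A minimal sketch of the failure, using the current model_selection API:

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_predict

X, y = load_iris(return_X_y=True)
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
try:
    cross_val_predict(LogisticRegression(max_iter=1000), X, y, cv=cv)
except ValueError as e:
    # The shuffled test folds may overlap, so they do not partition the
    # data set, and cross_val_predict refuses to run.
    print(e)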
# features
# feature definition (tfidf: baseline feature)
def cv_BIKE_Ridge( A_list, yV, alpha = 0.5, XX = None, n_folds = 5, n_jobs = -1, grid_std = None):
clf = binary_model.BIKE_Ridge( A_list, XX, alpha = alpha)
    ln = A_list[0].shape[0]  # ln is the number of molecules.
kf_n = cross_validation.KFold( ln, n_folds=n_folds, shuffle=True)
AX_idx = np.array([list(range( ln))]).T
yV_pred = cross_validation.cross_val_predict( clf, AX_idx, yV, cv = kf_n, n_jobs = n_jobs)
print('The prediction output using cross-validation is given by:')
jutil.cv_show( yV, yV_pred, grid_std = grid_std)
return yV_pred
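The AX_idx trick in the BIKE functions deserves a note: the kernel matrices in A_list cannot be row-sliced by the generic CV machinery, so the estimator is handed a single column of row indices and resolves them against the full kernels internally. binary_model.BIKE_Ridge is project-specific; a minimal sketch of the same pattern with a hypothetical index-based estimator:

import numpy as np
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold, cross_val_predict

class IndexedRidge(BaseEstimator, RegressorMixin):
    """Hypothetical estimator: X is a column of row indices into a
    feature (or kernel) matrix held by the estimator itself."""
    def __init__(self, features=None, alpha=0.5):
        self.features = features
        self.alpha = alpha

    def fit(self, X_idx, y):
        self.model_ = Ridge(alpha=self.alpha)
        self.model_.fit(self.features[X_idx.ravel()], y)
        return self

    def predict(self, X_idx):
        return self.model_.predict(self.features[X_idx.ravel()])

rng = np.random.RandomState(0)
features = rng.randn(50, 8)
y = features @ rng.randn(8)
AX_idx = np.arange(50).reshape(-1, 1)  # the index column, as in AX_idx above
clf = IndexedRidge(features=features, alpha=0.5)
y_pred = cross_val_predict(clf, AX_idx, y,
                           cv=KFold(n_splits=5, shuffle=True, random_state=0))
print(y_pred.shape)  # (50,)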
def multireg(self,Xtrain,ytrain, Xtest, ytest):
self.normalize(Xtrain)
'''
# polynomial try
poly = PolynomialFeatures(degree=2)
Xtrain = poly.fit_transform(Xtrain)
Xtest = poly.fit_transform(Xtest)
'''
# normal clf fit
clf = linear_model.LinearRegression()
clf.fit (Xtrain, ytrain)
        coefficients = clf.coef_
        print("coefficients:", coefficients)
        print("intercept:", clf.intercept_)
        print("train score", clf.score(Xtrain, ytrain))
        print("test score", clf.score(Xtest, ytest))
# manual calculate train accuracy
train_results = clf.predict(Xtrain)
print "first x:", Xtrain[0]
print "first result:", train_results[0]
correct = 0
for i in range(len(train_results)):
if round(train_results[i], 1) == round(ytrain[i], 1):
correct += 1
accuracy = correct * 1.0 / len(ytrain)
print "train accuracy: ", accuracy * 100, "%"
# cross validation
score = cross_validation.cross_val_score(clf, Xtrain, ytrain, scoring='mean_squared_error', cv = 5)
print "cross validation score: ", score
predict = cross_val_predict(clf, Xtrain, ytrain, cv = 5)
correct = 0
for i in range(len(predict)):
if round(predict[i], 1) == round(ytrain[i], 1):
correct += 1
accuracy = correct * 1.0 / len(ytrain)
print "cross validation accuracy: ", accuracy * 100, "%"
# manual calculate test accuracy
self.normalize(Xtest)
results = clf.predict(Xtest)
correct = 0
for i in range(len(results)):
if round(results[i], 1) == round(ytest[i], 1):
correct += 1
accuracy = correct * 1.0 / len(ytest)
print "test accuracy: ", accuracy * 100, "%"
return coeffients
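The rounding-and-count loops in multireg (and in the other *_multireg methods below) compute the fraction of predictions that match the target after rounding; the same quantity collapses to one vectorized line. A sketch, assuming numpy-array inputs:

import numpy as np

def rounded_accuracy(y_true, y_pred, decimals=1):
    # Fraction of predictions matching the target after rounding --
    # exactly what the manual correct/len loops compute.
    return float(np.mean(np.round(y_pred, decimals)
                         == np.round(np.asarray(y_true), decimals)))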
def lasso_multireg(self,Xtrain,ytrain, Xtest, ytest):
self.normalize(Xtrain)
clf = linear_model.Lasso(alpha = 0.5)
clf.fit (Xtrain, ytrain)
        coefficients = clf.coef_
        print("coefficients: ", coefficients)
        print("train score", clf.score(Xtrain, ytrain))
        print("test score", clf.score(Xtest, ytest))
# manual calculate train accuracy
train_results = clf.predict(Xtrain)
correct = 0
for i in range(len(train_results)):
if round(train_results[i], 1) == round(ytrain[i], 1):
correct += 1
accuracy = correct * 1.0 / len(ytrain)
print "train accuracy: ", accuracy * 100, "%"
# cross validation
predict = cross_val_predict(clf, Xtrain, ytrain, cv = 5)
correct = 0
for i in range(len(predict)):
if round(predict[i], 1) == round(ytrain[i], 1):
correct += 1
accuracy = correct * 1.0 / len(ytrain)
print "cross validation accuracy: ", accuracy * 100, "%"
# manual calculate test accuracy
self.normalize(Xtest)
results = clf.predict(Xtest)
correct = 0
for i in range(len(results)):
            # print(round(results[i], 1), round(ytest[i], 1))
if round(results[i], 1) == round(ytest[i], 1):
correct += 1
accuracy = correct * 1.0 / len(ytest)
print "test accuracy: ", accuracy * 100, "%"
return coeffients
def _generate_cross_val_predict_test(X, y, est, pd_est, must_match):
def test(self):
self.assertEqual(
hasattr(est, 'predict'),
hasattr(pd_est, 'predict'))
if not hasattr(est, 'predict'):
return
pd_y_hat = pd_cross_val_predict(pd_est, X, y)
self.assertTrue(isinstance(pd_y_hat, pd.Series))
self.assertTrue(pd_y_hat.index.equals(X.index))
if must_match:
            y_hat = cross_val_predict(est, X.values, y.values)
np.testing.assert_allclose(pd_y_hat, y_hat)
return test
def test_cross_val_predict():
boston = load_boston()
X, y = boston.data, boston.target
cv = cval.KFold(len(boston.target))
est = Ridge()
# Naive loop (should be same as cross_val_predict):
preds2 = np.zeros_like(y)
for train, test in cv:
est.fit(X[train], y[train])
preds2[test] = est.predict(X[test])
preds = cval.cross_val_predict(est, X, y, cv=cv)
assert_array_almost_equal(preds, preds2)
preds = cval.cross_val_predict(est, X, y)
assert_equal(len(preds), len(y))
cv = cval.LeaveOneOut(len(y))
preds = cval.cross_val_predict(est, X, y, cv=cv)
assert_equal(len(preds), len(y))
Xsp = X.copy()
Xsp *= (Xsp > np.median(Xsp))
Xsp = coo_matrix(Xsp)
preds = cval.cross_val_predict(est, Xsp, y)
    assert_equal(len(preds), len(y))
preds = cval.cross_val_predict(KMeans(), X)
assert_equal(len(preds), len(y))
def bad_cv():
for i in range(4):
yield np.array([0, 1, 2, 3]), np.array([4, 5, 6, 7, 8])
assert_raises(ValueError, cval.cross_val_predict, est, X, y, cv=bad_cv())
def test_cross_val_predict_input_types():
clf = Ridge()
# Smoke test
predictions = cval.cross_val_predict(clf, X, y)
assert_equal(predictions.shape, (10,))
    # test with sparse X and multioutput y
    predictions = cval.cross_val_predict(clf, X_sparse, X)
    assert_equal(predictions.shape, (10, 2))
    # test with sparse X and single-output y
    predictions = cval.cross_val_predict(clf, X_sparse, y)
    assert_array_equal(predictions.shape, (10,))
    # same multioutput check, via assert_array_equal
    predictions = cval.cross_val_predict(clf, X_sparse, X)
    assert_array_equal(predictions.shape, (10, 2))
# test with X and y as list
list_check = lambda x: isinstance(x, list)
clf = CheckingClassifier(check_X=list_check)
predictions = cval.cross_val_predict(clf, X.tolist(), y.tolist())
clf = CheckingClassifier(check_y=list_check)
predictions = cval.cross_val_predict(clf, X, y.tolist())
    # test with 3d X
X_3d = X[:, :, np.newaxis]
check_3d = lambda x: x.ndim == 3
clf = CheckingClassifier(check_X=check_3d)
predictions = cval.cross_val_predict(clf, X_3d, y)
assert_array_equal(predictions.shape, (10,))
def test_cross_val_predict_pandas():
    # check that cross_val_predict doesn't destroy the pandas dataframe
types = [(MockDataFrame, MockDataFrame)]
try:
from pandas import Series, DataFrame
types.append((Series, DataFrame))
except ImportError:
pass
for TargetType, InputFeatureType in types:
# X dataframe, y series
X_df, y_ser = InputFeatureType(X), TargetType(y)
check_df = lambda x: isinstance(x, InputFeatureType)
check_series = lambda x: isinstance(x, TargetType)
clf = CheckingClassifier(check_X=check_df, check_y=check_series)
cval.cross_val_predict(clf, X_df, y_ser)
def test_cross_val_predict_sparse_prediction():
# check that cross_val_predict gives same result for sparse and dense input
X, y = make_multilabel_classification(n_classes=2, n_labels=1,
allow_unlabeled=False,
return_indicator=True,
random_state=1)
X_sparse = csr_matrix(X)
y_sparse = csr_matrix(y)
classif = OneVsRestClassifier(SVC(kernel='linear'))
preds = cval.cross_val_predict(classif, X, y, cv=10)
preds_sparse = cval.cross_val_predict(classif, X_sparse, y_sparse, cv=10)
preds_sparse = preds_sparse.toarray()
assert_array_almost_equal(preds_sparse, preds)
def validate(self, features, labels, number_folds):
"""
Compute a model's performance metrics based on k-fold cross-validation technique.
Parameters
----------
features: array-like of shape = [number_samples, number_features]
The validation input samples.
labels: array-like of shape = [number_samples] or [number_samples, number_outputs]
The target values (class labels in classification).
number_folds: int
The amount of folds for the k-fold cross-validation.
If 0 compute metrics withput folds.
If > 0 compute metrics with n folds, n=number_folds.
Return
----------
accuracy: float
The accuracy of the model based on it's confusion matrix.
precision: float
The precision of the model based on it's confusion matrix.
sensitivity: float
The sensitivity of the model based on it's confusion matrix.
specificity: float
The specificity of the model based on it's confusion matrix.
kappa: float
The Cohen's Kappa of the model based on it's confusion matrix.
"""
if number_folds == 0:
predictions = self.model.predict(features)
else:
predictions = cross_val_predict(self.model, features, labels, cv = number_folds)
matrix = confusion_matrix(labels, predictions)
sum_columns = numpy.sum(matrix, 0)
sum_rows = numpy.sum(matrix, 1)
diagonal_sum = numpy.trace(matrix)
total_sum = numpy.sum(sum_rows)
accuracy = diagonal_sum / total_sum
temp_precision = []
temp_sensitivity = []
temp_specificity = []
for i in range(len(matrix)):
temp_precision.append(matrix[i][i] / sum_columns[i])
temp_sensitivity.append(matrix[i][i] / sum_rows[i])
temp_reduced_sum = total_sum - sum_rows[i] - sum_columns[i] + matrix[i][i]
temp_specificity.append(temp_reduced_sum / (temp_reduced_sum + sum_columns[i] - matrix[i][i]))
precision = sum(temp_precision * sum_rows) / total_sum
sensitivity = sum(temp_sensitivity * sum_rows) / total_sum
specificity = sum(temp_specificity * sum_rows) / total_sum
kappa_sum = sum(sum_rows * sum_columns)
kappa_numerator = (total_sum * diagonal_sum) - kappa_sum
kappa_denominator = (total_sum * total_sum) - kappa_sum
kappa = kappa_numerator / kappa_denominator
return accuracy, precision, sensitivity, specificity, kappa
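The hand-rolled metrics in validate can be cross-checked against scikit-learn: the weighted-average precision/recall use the same row-sum (support) weighting, and the kappa expression is the standard Cohen's kappa with numerator and denominator scaled by total_sum squared. A small sketch of the cross-check on made-up labels:

import numpy as np
from sklearn.metrics import (accuracy_score, cohen_kappa_score,
                             precision_score, recall_score)

y_true = np.array([0, 0, 1, 1, 2, 2, 2, 0])
y_pred = np.array([0, 1, 1, 1, 2, 0, 2, 0])
print(accuracy_score(y_true, y_pred))                       # diagonal_sum / total_sum
print(precision_score(y_true, y_pred, average='weighted'))  # row-sum weighted precision
print(recall_score(y_true, y_pred, average='weighted'))     # row-sum weighted sensitivity
print(cohen_kappa_score(y_true, y_pred))                    # the kappa formula above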
def ridge_multireg(self,Xtrain,ytrain, Xtest, ytest):
self.normalize(Xtrain)
'''
# polynomial try
poly = PolynomialFeatures(degree=2)
Xtrain = poly.fit_transform(Xtrain)
Xtest = poly.fit_transform(Xtest)
'''
# normal clf try
clf = linear_model.Ridge(alpha = 10000)
clf.fit (Xtrain, ytrain)
        coefficients = clf.coef_
        print("train score", clf.score(Xtrain, ytrain))
        print("test score", clf.score(Xtest, ytest))
# manual calculate train accuracy
train_results = clf.predict(Xtrain)
correct = 0
for i in range(len(train_results)):
if round(train_results[i], 1) == round(ytrain[i], 1):
correct += 1
accuracy = correct * 1.0 / len(ytrain)
print "train accuracy: ", accuracy * 100, "%"
# cross validation
score = cross_validation.cross_val_score(clf, Xtrain, ytrain, scoring='mean_squared_error', cv = 5)
print "cross validation score: ", score
'''
predict = cross_val_predict(clf, Xtrain, ytrain, cv = 5)
correct = 0
for i in range(len(predict)):
if round(predict[i]) == round(ytrain[i]):
correct += 1
accuracy = correct * 1.0 / len(ytrain)
print "cross validation accuracy: ", accuracy * 100, "%"
'''
# manual calculate test accuracy
self.normalize(Xtest)
results = clf.predict(Xtest)
correct = 0
for i in range(len(results)):
if round(results[i], 1) == round(ytest[i], 1):
correct += 1
accuracy = correct * 1.0 / len(ytest)
print "test accuracy: ", accuracy * 100, "%"
return coeffients