def learn_decision_tree(data):
    """Cross-validate a depth-7 decision tree and return the estimator.

    Runs 10-fold cross-validation on ``data.X_train`` / ``data.y_train``
    five times, scoring with the Matthews correlation coefficient. The mean
    of every round is printed, followed by the fold scores of the last round.

    Args:
        data: object exposing ``X_train`` and ``y_train`` attributes.

    Returns:
        The ``DecisionTreeClassifier`` instance that was passed to
        ``cross_val_score``. This function never calls ``fit`` on it.
    """
    classifier = tree.DecisionTreeClassifier(max_depth=7)
    mcc_scorer = make_scorer(matthews_corrcoef)
    for round_idx in range(5):
        scores = cross_val_score(
            classifier,
            data.X_train,
            data.y_train,
            cv=10,
            scoring=mcc_scorer,
        )
        print("iteration", round_idx, "dt mean:", scores.mean())
    fold_scores = list(scores)
    print("Decision Tree train scores:\n", fold_scores)
    return classifier
    # NOTE: an earlier, disabled variant fitted the tree on
    # train_data[:, :-1] / train_data[:, -1], predicted on
    # validation_data[:, :-1] and printed the percentage of rows whose
    # column 20 differed from the prediction ("DT Error").
python类cross_val_score()的实例源码
def _sfn(l, mask, myrad, bcast_var):
    """Return the mean cross-validated score for one searchlight.

    ``bcast_var`` is read positionally: the labels are ``bcast_var[0]``, the
    number of cross-validation folds is ``bcast_var[1]`` and the classifier
    is ``bcast_var[2]``. ``myrad`` is accepted but not used in this function.
    """
    classifier = bcast_var[2]
    # Masked entries of the first data item, transposed before scoring
    # (presumably so that samples end up in rows -- confirm with caller).
    samples = l[0][mask, :].T
    folds = model_selection.StratifiedKFold(
        n_splits=bcast_var[1], shuffle=False)
    fold_scores = model_selection.cross_val_score(
        classifier, samples, y=bcast_var[0], cv=folds, n_jobs=1)
    return np.mean(fold_scores)
def example_of_cross_validation_using_model_selection(raw_data, labels, num_subjects, num_epochs_per_subj):
    """Cross-validate a precomputed-kernel SVM with one fold per subject.

    The fold scores are printed and their mean is logged at INFO level.

    NOTE (kept from the original author): this approach does not work for
    ``sklearn.svm.SVC`` with a precomputed kernel when the kernel matrix is
    computed in portions, and it only works for self-correlation, i.e.
    correlation between the same data matrix.
    """
    # Shrinking disabled, C fixed to 1.
    base_svm = svm.SVC(kernel='precomputed', shrinking=False, C=1)
    classifier = Classifier(base_svm, epochs_per_subj=num_epochs_per_subj)

    # As many folds as subjects, no shuffling (intended as
    # leave-one-subject-out cross validation).
    folds = model_selection.StratifiedKFold(
        n_splits=num_subjects, shuffle=False)

    # Every sample is paired with itself: (raw_data[i], raw_data[i]).
    paired_data = list(zip(raw_data, raw_data))
    scores = model_selection.cross_val_score(
        classifier, paired_data, y=labels, cv=folds)
    print(scores)

    mean_accuracy = np.mean(scores)
    logger.info(
        'the overall cross validation accuracy is %.2f' % mean_accuracy
    )
def test_cv():
    """Check that a Gaussian GLM works with cross_val_score and GridSearchCV."""
    # XXX: don't use scikit-learn for tests.
    X, y = make_regression()
    n_samples = X.shape[0]
    folds = KFold(n_samples, 5)
    glm_normal = GLM(distr='gaussian', alpha=0.01, reg_lambda=0.1)

    # One score is expected per fold.
    fold_scores = cross_val_score(glm_normal, X, y, cv=folds)
    assert_equal(len(fold_scores), 5)

    # Two independent grids: one over alpha, one over reg_lambda.
    alpha_grid = {'alpha': np.linspace(0.01, 0.99, 2)}
    lambda_grid = {
        'reg_lambda': np.logspace(np.log(0.5), np.log(0.01),
                                  10, base=np.exp(1)),
    }
    glmcv = GridSearchCV(glm_normal, [alpha_grid, lambda_grid], cv=folds)
    glmcv.fit(X, y)
def model_cross_valid(X,Y):
    """Print the mean 10-fold negative MSE of each candidate regressor.

    Every model class is instantiated with its default arguments and
    evaluated with ``cross_val_score`` using the
    ``'neg_mean_squared_error'`` scorer. Nothing is returned.

    Args:
        X: feature matrix passed straight to ``cross_val_score``.
        Y: target values passed straight to ``cross_val_score``.
    """
    # The splitter does not shuffle, so the folds are deterministic without
    # a seed. Passing random_state together with shuffle=False has no effect
    # and is rejected with a ValueError by current scikit-learn releases.
    kfold = model_selection.KFold(n_splits=10)
    scoring = 'neg_mean_squared_error'
    # Further candidates noted by the original author: Ridge, Lasso,
    # KNeighborsRegressor, DecisionTreeRegressor, SVR,
    # RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor.
    for model_class in (LinearRegression, ElasticNet):
        model = model_class()
        results = model_selection.cross_val_score(
            model, X, Y, cv=kfold, scoring=scoring)
        print(model_class, results.mean())
def fit_regression(X, y, regression_class=LinearRegression, regularization_const=.001):
    """Fit a regressor on (X, y) and report its cross-validated RMSE.

    Args:
        X (pandas DataFrame): The data.
        y (pandas DataFrame or Series): The answers.
        regression_class (class): One of
            sklearn.linear_model.[LinearRegression, Ridge, Lasso].
        regularization_const: regularization strength, passed as ``alpha``
            to every class other than ``LinearRegression`` (which takes no
            such argument and is built with its defaults).

    Returns:
        tuple: ``(fitted_regressor, mean_rmse)`` where ``mean_rmse`` is the
        mean over the cross-validation folds of the square root of the
        negated ``'neg_mean_squared_error'`` score.
    """
    uses_alpha = regression_class is not LinearRegression
    if uses_alpha:
        predictor = regression_class(alpha=regularization_const, normalize=True)
    else:
        predictor = regression_class()
    predictor.fit(X, y)

    neg_mse_per_fold = cross_val_score(
        predictor, X, y=y, scoring='neg_mean_squared_error')
    # The scorer yields negated MSE values; flip the sign, then take the root.
    rmse_per_fold = np.sqrt(-1 * neg_mse_per_fold)
    return (predictor, np.mean(rmse_per_fold))
def test_mdr_sklearn_pipeline():
    """Ensure that MDR can be used as a transformer in a scikit-learn pipeline"""
    # 15 samples with two features each.
    features = np.array([
        [2, 0], [0, 0], [0, 1], [0, 0], [0, 0],
        [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
        [0, 0], [0, 0], [0, 0], [1, 1], [1, 1],
    ])
    # Ten samples of class 1 followed by five of class 0.
    classes = np.array([1] * 10 + [0] * 5)

    pipeline = make_pipeline(MDR(), LogisticRegression())
    folds = StratifiedKFold(n_splits=5, shuffle=True)
    cv_scores = cross_val_score(pipeline, features, classes, cv=folds)
    assert np.mean(cv_scores) > 0.
def test_mdr_sklearn_pipeline_parallel():
    """Ensure that MDR can be used as a transformer in a parallelized scikit-learn pipeline"""
    # 15 samples with two features each.
    features = np.array([
        [2, 0], [0, 0], [0, 1], [0, 0], [0, 0],
        [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
        [0, 0], [0, 0], [0, 0], [1, 1], [1, 1],
    ])
    # Ten samples of class 1 followed by five of class 0.
    classes = np.array([1] * 10 + [0] * 5)

    pipeline = make_pipeline(MDR(), LogisticRegression())
    folds = StratifiedKFold(n_splits=5, shuffle=True)
    # n_jobs=-1 is what distinguishes this from the serial pipeline test.
    cv_scores = cross_val_score(
        pipeline, features, classes, cv=folds, n_jobs=-1)
    assert np.mean(cv_scores) > 0.
def eval_models(eda_objs, clfs):
    """Cross-validate every estimator against every pipeline's data.

    Parameters
    ----------
    eda_objs : iterable of (str, object) pairs
        ``(pipe_name, obj)`` tuples. Each ``obj`` must provide ``df``
        (indexable by column names), ``y`` (the target column key) and
        ``_get_input_features()`` (the feature column keys).
    clfs : list of (str, estimator) pairs
        ``(clf_name, clf)`` tuples. A value that is not a list is wrapped
        in a one-element list, so a single pair may be passed directly.

    Returns
    -------
    list of tuple
        One ``(clf_name, pipe_name, score)`` entry per cross-validation
        fold, using 5 folds and the ``'r2'`` scorer. Entries are ordered by
        estimator, then pipeline, then fold.
    """
    if not isinstance(clfs, list):
        clfs = [clfs]
    acc = []
    for clf_name, clf in clfs:
        for pipe_name, obj in eda_objs:
            X, y = obj.df[obj._get_input_features()], obj.df[obj.y]
            cv_score = cross_val_score(
                estimator=clf, X=X, y=y, cv=5, scoring='r2')
            # Extend directly instead of appending sub-lists and
            # flattening them afterwards.
            acc.extend((clf_name, pipe_name, v) for v in cv_score)
    return acc
def __init__(self, model, ax=None, alphas=None,
             cv=None, scoring=None, **kwargs):
    """Set up manual alpha selection for a non-CV regularization model.

    Args:
        model: the estimator to wrap. Its class name must not end in
            ``"CV"``.
        ax: forwarded to the parent initializer.
        alphas: candidate alpha values. When ``None`` or empty, 200
            values from ``np.logspace(-10, -2, 200)`` are used.
        cv: forwarded to ``cross_val_score`` through ``score_method``.
        scoring: forwarded to ``cross_val_score`` through
            ``score_method``.
        **kwargs: forwarded to the parent initializer.

    Raises:
        YellowbrickTypeError: if ``model`` is a "...CV" estimator.
    """
    # Check to make sure this is not a "RegressorCV"
    name = model.__class__.__name__
    if name.endswith("CV"):
        raise YellowbrickTypeError((
            "'{}' is a CV regularization model;"
            " try AlphaSelection instead."
        ).format(name))

    # Call super to initialize the class
    super(ManualAlphaSelection, self).__init__(model, ax=ax, **kwargs)

    # Set manual alpha selection parameters. ``alphas or default`` would
    # raise ValueError for a multi-element numpy array (ambiguous truth
    # value), so test for None / emptiness explicitly.
    if alphas is None or np.size(alphas) == 0:
        alphas = np.logspace(-10, -2, 200)
    self.alphas = alphas
    self.errors = None
    self.score_method = partial(cross_val_score, cv=cv, scoring=scoring)
def train_model(team_stats, result_data, test_data):
    """Fit a logistic regression and predict each row of ``test_data``.

    The model is fitted on the data set built from ``team_stats`` and
    ``result_data``, its mean 10-fold cross-validated accuracy is printed,
    and ``predict_winner`` is then called for every test row.

    Returns:
        list: ``pred[0][0]`` for each row, where ``pred`` is the value
        returned by ``predict_winner``.
    """
    # Build the training set.
    X, y = build_dataSet(team_stats, result_data)

    # Fit the model on all samples.
    print("Fitting on %d game samples.." % len(X))
    model = LogisticRegression()
    model.fit(X, y)

    # Report the mean accuracy over 10 cross-validation folds.
    print("Doing cross-validation..")
    cv_accuracy = cross_val_score(
        model, X, y, cv = 10, scoring='accuracy', n_jobs=-1)
    print(cv_accuracy.mean())

    # Use the fitted model on every matchup in the test data.
    print('Predicting on test data..')
    result = []
    for _, game in test_data.iterrows():
        pred = predict_winner(
            game['Vteam'], game['Hteam'], model, team_stats)
        result.append(pred[0][0])
    return result
k_fold_predictor.py 文件源码
项目:movie-quality-profitability-predictor
作者: wbowditch
项目源码
文件源码
阅读 22
收藏 0
点赞 0
评论 0
def compute_cross_fold(data):
    """Cross-validate a linear SVM predicting the 'Profitable' column.

    ``data`` is standardized with a ``StandardScaler`` fitted on ``data``
    itself. The targets are read from the ``'Profitable'`` column of
    ``total_set.csv`` (presumably row-aligned with ``data`` -- confirm with
    the caller).

    Returns:
        tuple: ``(mean, 2 * std)`` of the 10-fold cross-validation scores.
    """
    data_table = pd.read_csv("total_set.csv", index_col=0)

    standardizer = preprocessing.StandardScaler().fit(data)
    data_scaled = standardizer.transform(data)

    profitability_target = data_table['Profitable']

    # An RBF-kernel SVC (C=0.8, gamma=5) was tried earlier by the author;
    # the linear SVM below is the variant in use.
    clf_profit = svm.LinearSVC(C=0.001, verbose=True, tol=.1)
    clf_profit.fit(data_scaled, profitability_target)

    scores = cross_val_score(
        clf_profit, data_scaled, profitability_target, cv=10)
    return (scores.mean(), scores.std() * 2)
def cross_validation():
    """Pick the best k for k-NN by 10-fold cross-validation and plot it.

    For every k in 1..29 a ``KNeighborsClassifier`` is cross-validated on
    the training split returned by ``load_data()``. The k with the lowest
    misclassification error (first one on ties) is printed, and the mean
    accuracy is plotted against k.
    """
    x_train, x_test, y_train, y_test = load_data()
    k_lst = list(range(1, 30))
    lst_scores = []
    for k in k_lst:
        knn = KNeighborsClassifier(n_neighbors=k)
        scores = cross_val_score(
            knn, x_train, y_train, cv=10, scoring='accuracy')
        lst_scores.append(scores.mean())

    # Misclassification error is the complement of the mean accuracy.
    misclassification = [1 - score for score in lst_scores]
    optimal_k = k_lst[misclassification.index(min(misclassification))]
    # print() call instead of the Python 2 print statement, which is a
    # SyntaxError on Python 3.
    print("The optimal number of neighbors is %d" % optimal_k)

    # Plot the mean accuracy (not the error) against k.
    plt.plot(k_lst, lst_scores)
    plt.xlabel('Number of Neighbors K')
    plt.ylabel('correct classification rate')
    plt.show()
def test_cross_val_score_predict_labels():
    """Label-based splitters must raise ValueError when no labels are given.

    Checked for both ``cross_val_score`` and ``cross_val_predict`` with
    each label-aware cross-validation object.
    """
    expected_msg = "The labels parameter should not be None"
    X, y = make_classification(n_samples=20, n_classes=2, random_state=0)
    clf = SVC(kernel="linear")

    label_cvs = [LeaveOneLabelOut(), LeavePLabelOut(2), LabelKFold(),
                 LabelShuffleSplit()]
    for cv in label_cvs:
        for cv_func in (cross_val_score, cross_val_predict):
            assert_raise_message(ValueError, expected_msg,
                                 cv_func, estimator=clf, X=X, y=y, cv=cv)
def test_cross_val_score_pandas():
    """cross_val_score must hand DataFrame/Series inputs through unchanged."""
    types = [(MockDataFrame, MockDataFrame)]
    try:
        from pandas import Series, DataFrame
    except ImportError:
        # pandas is optional; only the mock types are exercised then.
        pass
    else:
        types.append((Series, DataFrame))

    for TargetType, InputFeatureType in types:
        # X dataframe, y series
        # 3 fold cross val is used so we need at least 3 samples per class
        X_df, y_ser = InputFeatureType(X), TargetType(y2)

        def check_df(x):
            return isinstance(x, InputFeatureType)

        def check_series(x):
            return isinstance(x, TargetType)

        clf = CheckingClassifier(check_X=check_df, check_y=check_series)
        cross_val_score(clf, X_df, y_ser)
def test_cross_val_score_precomputed():
    """cross_val_score with a precomputed kernel must match a linear SVC."""
    iris = load_iris()
    X, y = iris.data, iris.target

    # The Gram matrix of the data is the linear kernel.
    linear_kernel = np.dot(X, X.T)
    score_precomputed = cross_val_score(
        SVC(kernel="precomputed"), linear_kernel, y)
    score_linear = cross_val_score(SVC(kernel="linear"), X, y)
    assert_array_equal(score_precomputed, score_linear)

    svm = SVC(kernel="precomputed")
    # Error raised for non-square X
    assert_raises(ValueError, cross_val_score, svm, X, y)
    # Error raised when the precomputed kernel is neither array-like
    # nor sparse
    assert_raises(ValueError, cross_val_score, svm,
                  linear_kernel.tolist(), y)
def test_cross_val_score_with_score_func_classification():
    """Default, accuracy and weighted-F1 scoring must agree on iris."""
    iris = load_iris()
    clf = SVC(kernel='linear')
    expected = [0.97, 1., 0.97, 0.97, 1.]

    # Default score (should be the accuracy score)
    scores = cross_val_score(clf, iris.data, iris.target, cv=5)
    assert_array_almost_equal(scores, expected, 2)

    # Correct classification score (aka. zero / one score) - should be the
    # same as the default estimator score
    zo_scores = cross_val_score(
        clf, iris.data, iris.target, scoring="accuracy", cv=5)
    assert_array_almost_equal(zo_scores, expected, 2)

    # F1 score (classes are balanced, so f1_score should be equal to the
    # zero/one score)
    f1_scores = cross_val_score(
        clf, iris.data, iris.target, scoring="f1_weighted", cv=5)
    assert_array_almost_equal(f1_scores, expected, 2)
two_sigma_financial_modelling.py 文件源码
项目:PortfolioTimeSeriesAnalysis
作者: MizioAnd
项目源码
文件源码
阅读 22
收藏 0
点赞 0
评论 0
def rmse_cv(model, x_train, y_train):
    """Return the per-fold RMSE of ``model`` from 5-fold cross-validation."""
    neg_mse_per_fold = cross_val_score(
        model, x_train, y_train,
        scoring='neg_mean_squared_error', cv=5)
    # The scorer reports negated MSE; negate back before taking the root.
    return np.sqrt(-neg_mse_per_fold)
def KFold_CrossValidation(self, scoring_metric):
    """Cross-validate ``self.alg`` on the training data and summarize it.

    Args:
        scoring_metric: passed to ``cross_val_score`` as ``scoring``.

    Returns:
        dict: ``'mean_error'`` and ``'std_error'`` (mean and standard
        deviation of the fold scores) plus ``'all_error'`` (the fold
        scores as returned by ``cross_val_score``).
    """
    # Predictor columns and target column of the training frame, as arrays.
    features = self.datablock.train[self.predictors].values
    target = self.datablock.train[self.datablock.target].values

    fold_scores = model_selection.cross_val_score(
        estimator=self.alg,
        X=features,
        y=target,
        cv=self.cv_folds, scoring=scoring_metric, n_jobs=-1
    )
    summary = {
        'mean_error': np.mean(fold_scores),
        'std_error': np.std(fold_scores),
        'all_error': fold_scores
    }
    return summary
def feval(d):
    """Score a random-forest configuration by mean minus std of CV accuracy.

    Args:
        d: mapping with the keys ``'max_depth'`` and ``'n_estimators'``.

    Returns:
        Mean 5-fold accuracy on the module-level ``data_X`` / ``data_y``
        minus the standard deviation of the fold accuracies.
    """
    clf = RandomForestClassifier(
        n_jobs=-1,
        max_depth=d['max_depth'],
        n_estimators=d['n_estimators'],
    )
    fold_accuracy = cross_val_score(
        clf, data_X, data_y, cv=5, scoring='accuracy')
    return np.mean(fold_accuracy) - np.std(fold_accuracy)
def _cross_validation_for_one_voxel(clf, vid, num_folds, subject_data, labels):
    """Cross-validate ``clf`` on one voxel's data.

    Returns:
        tuple: ``(vid, mean_score)`` where ``mean_score`` is the mean of
        the scores over ``num_folds`` stratified, unshuffled folds.
    """
    folds = model_selection.StratifiedKFold(
        n_splits=num_folds, shuffle=False)
    fold_scores = model_selection.cross_val_score(
        clf, subject_data, y=labels, cv=folds, n_jobs=1)
    logger.debug('cross validation for voxel %d is done' % vid)
    mean_score = fold_scores.mean()
    return (vid, mean_score)
def adaBoost(self, settings, data=None, dropna=True):
    """Cross-validate and fit a 500-estimator AdaBoost classifier.

    The last column of the loaded frame is the target and all preceding
    columns are the features. Three things are printed: the K-fold
    splitter, the mean 10-fold cross-validation score, and the score of
    the fitted model on the same data it was trained on.

    Args:
        settings: not used by this method.
        data: forwarded to the private data loader.
        dropna: forwarded to the private data loader.

    Returns:
        bool: always ``True``.
    """
    df = self.__loadData(data, dropna)
    features = df.columns[:-1]
    X = df[features]
    y = df.iloc[:, -1].values

    seed = 7
    num_trees = 500
    # The splitter does not shuffle, so no seed is needed; random_state
    # with shuffle=False has no effect and is rejected with a ValueError
    # by current scikit-learn releases.
    kfold = model_selection.KFold(n_splits=10)
    # print() calls instead of Python 2 print statements, which are a
    # SyntaxError on Python 3.
    print(kfold)

    model = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
    results = model_selection.cross_val_score(model, X, y, cv=kfold)
    model.fit(X, y)
    print(results.mean())
    print(model.score(X, y))
    return True
8voting_classifier.py 文件源码
项目:Machine-Learning-Algorithms
作者: PacktPublishing
项目源码
文件源码
阅读 29
收藏 0
点赞 0
评论 0
def compute_accuracies(lr, dt, svc, vc, X, Y):
    """Return the mean 10-fold CV accuracy of the four given estimators.

    The accuracies are printed as a numpy array and returned as a list in
    the order ``lr``, ``dt``, ``svc``, ``vc``.
    """
    accuracies = [
        cross_val_score(estimator, X, Y, scoring='accuracy', cv=10).mean()
        for estimator in (lr, dt, svc, vc)
    ]
    print('Accuracies:')
    print(np.array(accuracies))
    return accuracies
def multiprocessing_grid_search(queue, shared_list, persistent_object):
    """Consume grid points from ``queue`` and collect their CV scores.

    Each queue item is ``(grid, (estimator, x), cvs_kwargs)``. ``grid`` is
    applied to the estimator with ``set_params``. The scores are taken from
    ``persistent_object`` when it already holds them; otherwise they are
    computed with ``cross_val_score`` and stored there. A copy of ``grid``
    extended with a ``'scores'`` entry is appended to ``shared_list``.

    The loop ends when ``None`` is received from the queue.
    """
    while True:
        task = queue.get()
        if task is None:
            # Sentinel: no more work.
            return

        grid, positional, cvs_kwargs = task
        estimator, x = positional
        estimator.set_params(**grid)

        # Reuse a stored result when one exists for this estimator/grid.
        scores = persistent_object.retrieve(estimator, grid)
        if scores is None:
            scores = cross_val_score(estimator, x, **cvs_kwargs)
            persistent_object.update(estimator, grid, scores)

        grid_result = grid.copy()
        grid_result['scores'] = scores
        shared_list.append(grid_result)
dsb_create_voxel_model_predictions.py 文件源码
项目:data-science-bowl-2017
作者: tondonia
项目源码
文件源码
阅读 31
收藏 0
点赞 0
评论 0
def score(self, params):
    """Evaluate one parameter set with 5-fold cross-validation.

    ``params`` is first passed to ``self.change_to_int`` together with
    ``self.to_int_params`` and then applied to ``self.level0``.

    Returns:
        dict: ``{'loss': <mean CV score>, 'status': STATUS_OK}``.
    """
    self.change_to_int(params, self.to_int_params)
    self.level0.set_params(**params)
    cv_scores = model_selection.cross_val_score(
        self.level0, self.trainX, self.trainY, cv=5, n_jobs=-1)
    print('%s ------ Score Mean:%f, Std:%f' % (params, cv_scores.mean(), cv_scores.std()))
    # NOTE(review): the mean CV score is returned as 'loss' without
    # negation -- confirm that the optimizer's direction matches the
    # estimator's scoring.
    return {'loss': cv_scores.mean(), 'status': STATUS_OK}
def rmse_cv(model, x_train, y_train):
    """Return the per-fold RMSE of ``model`` from 5-fold cross-validation."""
    neg_mse_per_fold = cross_val_score(
        model, x_train, y_train,
        scoring='neg_mean_squared_error', cv=5)
    # The scorer reports negated MSE; negate back before taking the root.
    return np.sqrt(-neg_mse_per_fold)
def evaluateModel(C, gamma):
    """Return the mean CV score of an SVC with log10-scaled parameters.

    ``C`` and ``gamma`` are exponents: the classifier is built with
    ``C=10**C`` and ``gamma=10**gamma`` and cross-validated on the
    module-level ``X`` and ``y``.
    """
    classifier = SVC(C=10 ** C, gamma=10 ** gamma)
    fold_scores = cross_val_score(classifier, X, y)
    return np.average(fold_scores)
def rmsle_cv(model):
    """Return the per-fold RMSE of ``model`` on the module-level train data.

    Uses a shuffled, seeded ``KFold`` with ``n_folds`` splits over
    ``train.values`` / ``y_train``.
    """
    # Pass the splitter itself as ``cv``. Calling ``.get_n_splits(...)`` on
    # it returns only the number of folds, which makes ``cross_val_score``
    # build its own unshuffled K-fold and drops shuffle/random_state.
    kf = KFold(n_folds, shuffle=True, random_state=42)
    rmse = np.sqrt(-cross_val_score(
        model, train.values, y_train,
        scoring="neg_mean_squared_error", cv=kf))
    return rmse
def cross_validate(estimator, training_data, training_targets):
    """Cross-validate ``estimator`` with a custom scorer and with R^2.

    Returns:
        tuple: ``(-mean(custom scores), mean(r2 scores))`` where the custom
        scores come from the ``root_mean_log_squared_error`` scorer.
    """
    custom_scores = cross_val_score(
        estimator, X=training_data, y=training_targets,
        scoring=root_mean_log_squared_error)
    r2_scores = cross_val_score(
        estimator, X=training_data, y=training_targets, scoring='r2')
    return (-1 * np.mean(custom_scores), np.mean(r2_scores))
Adaboost.py 文件源码
项目:Machine-Learning-Tools-on-Iris-Dataset
作者: debjitpaul
项目源码
文件源码
阅读 21
收藏 0
点赞 0
评论 0
def perform_adaboost(self,X_train_std,y_train,X_test_std, y_test):
    """Fit AdaBoost, print its accuracy and plot its decision regions.

    A 10-estimator ``AdaBoostClassifier`` is fitted on the training data.
    The cross-validated accuracy on the training data and the accuracy of
    the fitted model on the test data are printed. The decision surface is
    then drawn over the test points.

    Only columns 0 and 1 of ``X_test_std`` are used for the plot, and the
    classifier is asked to predict on two-column grid points, so the data
    is assumed to have exactly two features (TODO confirm). At most five
    classes can be drawn, since five markers and colors are defined.
    """
    ada = AdaBoostClassifier(n_estimators=10)
    ada.fit(X_train_std, y_train)

    train_score = cross_val_score(ada, X_train_std, y_train)
    print('The training accuracy is {:.2f}%'.format(train_score.mean()*100))

    # Score the fitted model on the held-out data. Cross-validating on the
    # test set would re-train on test folds and not measure this model.
    test_score = ada.score(X_test_std, y_test)
    print('The test accuracy is {:.2f}%'.format(test_score*100))

    X = X_test_std
    y = y_test
    resolution = 0.01
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'green', 'gray', 'cyan')
    cmap = ListedColormap(colors[:len(np.unique(y_test))])

    # plot the decision surface
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = ada.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.3, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # One scatter series per class. ``color=`` is used because a single
    # RGBA tuple passed as ``c`` is ambiguous when a class happens to
    # contain as many points as the tuple has entries.
    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1],
                    alpha=0.5, color=cmap(idx),
                    marker=markers[idx], label=cl)
    plt.show()