# Assumes the pre-0.18 scikit-learn API; the `cross_validation` module was
# later replaced by `sklearn.model_selection`.
from sklearn import cross_validation

def print_accuracy_report(classifier, X, y, num_validations=5):
    accuracy = cross_validation.cross_val_score(classifier,
            X, y, scoring='accuracy', cv=num_validations)
    print("Accuracy: " + str(round(100 * accuracy.mean(), 2)) + "%")

    f1 = cross_validation.cross_val_score(classifier,
            X, y, scoring='f1_weighted', cv=num_validations)
    print("F1: " + str(round(100 * f1.mean(), 2)) + "%")

    precision = cross_validation.cross_val_score(classifier,
            X, y, scoring='precision_weighted', cv=num_validations)
    print("Precision: " + str(round(100 * precision.mean(), 2)) + "%")

    recall = cross_validation.cross_val_score(classifier,
            X, y, scoring='recall_weighted', cv=num_validations)
    print("Recall: " + str(round(100 * recall.mean(), 2)) + "%")
Python cross_val_score() usage examples
Source file: utilities.py (project: Python-Machine-Learning-Cookbook, author: PacktPublishing)
def rfr_feature_select():
    import numpy as np
    from sklearn.datasets import load_boston
    from sklearn.ensemble import RandomForestRegressor
    # Pre-0.18 API; later releases use sklearn.model_selection instead.
    from sklearn.cross_validation import cross_val_score, ShuffleSplit

    boston = load_boston()
    X = boston["data"]
    Y = boston["target"]
    names = boston["feature_names"]

    rf = RandomForestRegressor(n_estimators=20, max_depth=4)
    scores = []
    # Score each feature on its own: fit the forest on a single column
    # and record the mean cross-validated R^2.
    for i in range(X.shape[1]):
        score = cross_val_score(rf, X[:, i:i + 1], Y, scoring="r2",
                                cv=ShuffleSplit(len(X), 3, .3))
        scores.append((round(np.mean(score), 3), names[i]))
    print(sorted(scores, reverse=True))
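On scikit-learn 0.18+ the cv object above needs the model_selection API, where ShuffleSplit takes split counts as keywords rather than the sample count as its first argument. A one-line sketch of the equivalent:

from sklearn.model_selection import ShuffleSplit
cv = ShuffleSplit(n_splits=3, test_size=0.3)  # modern equivalent of ShuffleSplit(len(X), 3, .3)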
def evaluate_model(model, X_train, y_train):
    '''
    INPUT
    - model: this is a classification model from sklearn
    - X_train: 2d array of the features
    - y_train: 1d array of the target
    OUTPUT
    - information about the model's accuracy using 10
      fold cross validation
    - model: the fit model
    Returns the model
    '''
    print(np.mean(cross_val_score(model, X_train, y_train,
                                  cv=10, n_jobs=-1, verbose=10)))
    model.fit(X_train, y_train)
    return model
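A minimal usage sketch for the helper above, assuming np and cross_val_score are imported at module level as the function body expects; the iris data and logistic-regression choice are illustrative only:

import numpy as np
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

X_train, y_train = load_iris(return_X_y=True)
fitted = evaluate_model(LogisticRegression(max_iter=1000), X_train, y_train)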
def experiment(model_class, vectorizer, xval):
    name = model_class.__class__.__name__ + '.' + model_class.penalty
    model = model_class.fit(X, y)
    model_weights = vectorizer.inverse_transform(model.coef_)[0]
    with open('weights.%s.txt' % name, 'w') as f:
        f.write('%s\t%f\n' % ('(intercept)', model.intercept_))
        f.writelines('%s\t%f\n' % k for k in model_weights.items())

    acc_scores = cross_validation.cross_val_score(model, X, y, cv=xval)
    auc_scores = cross_validation.cross_val_score(model, X, y, scoring='roc_auc', cv=xval)
    prec_scores = cross_validation.cross_val_score(model, X, y, scoring='precision', cv=xval)
    recall_scores = cross_validation.cross_val_score(model, X, y, scoring='recall', cv=xval)
    f1_scores = cross_validation.cross_val_score(model, X, y, scoring='f1', cv=xval)

    print('-' * 80)
    print('acc\t%.4f\t%s' % (np.mean(acc_scores), name))
    print('auc\t%.4f\t%s' % (np.mean(auc_scores), name))
    print('prec\t%.4f\t%s' % (np.mean(prec_scores), name))
    print('recall\t%.4f\t%s' % (np.mean(recall_scores), name))
    print('f1\t%.4f\t%s' % (np.mean(f1_scores), name))
def trainLimited(self, featureFile, n_datapoints):
    (label_vector, input_vector) = loadData(featureFile)
    trainData, testData, trainLabels, testLabels = \
        cross_validation.train_test_split(input_vector, label_vector, test_size=0)
    n_totalrows = int(len(label_vector) / n_datapoints)
    # Train on progressively larger prefixes of the data to see how the
    # cross-validated score changes with sample size.
    for n in range(0, n_totalrows):
        limited_label_vector = trainLabels[0:(n + 1) * n_datapoints]
        limited_input_vector = trainData[0:(n + 1) * n_datapoints]
        kNNClassifier = neighbors.KNeighborsClassifier(self.n_neighbors, weights='distance')
        kNNClassifier.fit(limited_input_vector, limited_label_vector)
        scores = cross_validation.cross_val_score(
            kNNClassifier, limited_input_vector, limited_label_vector, cv=5)
        print('%f on %d datapoints' % (sum(scores) / len(scores), len(limited_label_vector)))
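scikit-learn has a built-in utility for exactly this grow-the-training-set evaluation; a minimal modern sketch, assuming X and y arrays are available (the k-NN settings mirror the snippet above):

import numpy as np
from sklearn.model_selection import learning_curve
from sklearn.neighbors import KNeighborsClassifier

train_sizes, train_scores, valid_scores = learning_curve(
    KNeighborsClassifier(weights='distance'), X, y,
    train_sizes=np.linspace(0.1, 1.0, 5), cv=5)
print(train_sizes, valid_scores.mean(axis=1))  # mean CV score per training size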
def run_model(model, dtrain, predictor_var, target, scoring_method='mean_squared_error'):
    cv_method = KFold(len(dtrain), 5)
    cv_scores = cross_val_score(model, dtrain[predictor_var], dtrain[target],
                                cv=cv_method, scoring=scoring_method)
    #print cv_scores, np.mean(cv_scores), np.sqrt((-1)*np.mean(cv_scores))
    # Hold out the post-1999 years as a validation set.
    dtrain_for_val = dtrain[dtrain['Year'] < 2000]
    dtest_for_val = dtrain[dtrain['Year'] > 1999]
    #cv_method = KFold(len(dtrain_for_val),5)
    #cv_scores_2 = cross_val_score(model,dtrain_for_val[predictor_var],dtrain_for_val[target],cv=cv_method,scoring=scoring_method)
    #print cv_scores_2, np.mean(cv_scores_2)
    dtrain_for_val_ini = dtrain_for_val[predictor_var]
    dtest_for_val_ini = dtest_for_val[predictor_var]
    model.fit(dtrain_for_val_ini, dtrain_for_val[target])
    pred_for_val = model.predict(dtest_for_val_ini)
    #print math.sqrt(mean_squared_error(dtest_for_val['Footfall'],pred_for_val))
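Note that KFold(len(dtrain), 5) is the pre-0.18 constructor, and the 'mean_squared_error' scoring string was later renamed 'neg_mean_squared_error'. The modern cv object would be, as a sketch:

from sklearn.model_selection import KFold
cv_method = KFold(n_splits=5)  # modern equivalent of KFold(len(dtrain), 5)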
def eval_model(name, model, data):
    print('=' * 20)
    print(name, 'training')
    model.fit(data, train.target, sample_weight=sample_weights)
    print(name, 'trained')
    predictions = model.predict(processed_test_data)
    print(name, 'accuracy', np.mean(predictions == test.target))
    print(metrics.classification_report(test.target, predictions))
    print(metrics.confusion_matrix(test.target, predictions))
    print(name, 'f1 cross validation',
          cross_validation.cross_val_score(model, grammar_processed_data, train.target, scoring='f1'))
    print(name, 'precision cross validation',
          cross_validation.cross_val_score(model, grammar_processed_data, train.target, scoring='precision'))
    return model, predictions
# SVMs are sensitive to feature scaling: inputs should have comparable ranges and variances
def cross_validation_report(clf, dataset):
    data = count_vectorizer.transform([row[0] for row in dataset])
    target = [row[1] for row in dataset]
    return cross_validation.cross_val_score(clf, data, target)
Source file: 04_model_preparation.py (project: uda-da-p5-enron-fraud-detection, author: watanabe8760)
def evaluate(model, name):
    """
    Evaluates model by cross validation.
    """
    # Get scores through cross validation
    score_f1 = cross_val_score(model, X, y, scoring='f1', cv=splitter_)
    score_pr = cross_val_score(model, X, y, scoring='precision', cv=splitter_)
    score_re = cross_val_score(model, X, y, scoring='recall', cv=splitter_)
    # Save image of score distributions
    save_dist(name, score_f1, score_pr, score_re)
    # Compute mean and std of each score
    result = DataFrame(index=['f1', 'precision', 'recall'],
                       columns=['mean', 'std'])
    result.loc['f1', 'mean'] = np.mean(score_f1)
    result.loc['precision', 'mean'] = np.mean(score_pr)
    result.loc['recall', 'mean'] = np.mean(score_re)
    result.loc['f1', 'std'] = np.std(score_f1)
    result.loc['precision', 'std'] = np.std(score_pr)
    result.loc['recall', 'std'] = np.std(score_re)
    print(model)
    print(result)
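The snippet assumes module-level X, y, save_dist, and a splitter_ cv object defined elsewhere in the file. A hypothetical splitter_ consistent with the usage above (the exact splitter class and parameters are a guess):

from sklearn.model_selection import StratifiedShuffleSplit
splitter_ = StratifiedShuffleSplit(n_splits=100, test_size=0.3, random_state=42)  # hypothetical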
def rf_from_cfg(cfg, seed):
    """
    Creates a random forest regressor from sklearn and fits the given data on it.
    This is the function-call we try to optimize. Chosen values are stored in
    the configuration (cfg).

    Parameters:
    -----------
    cfg: Configuration
        configuration chosen by smac
    seed: int or RandomState
        used to initialize the rf's random generator

    Returns:
    -----------
    np.mean(rmses): float
        mean of root mean square errors of random-forest test predictions
        per cv-fold
    """
    rfr = RandomForestRegressor(
        n_estimators=cfg["num_trees"],
        criterion=cfg["criterion"],
        min_samples_split=cfg["min_samples_to_split"],
        min_samples_leaf=cfg["min_samples_in_leaf"],
        min_weight_fraction_leaf=cfg["min_weight_frac_leaf"],
        max_features=cfg["max_features"],
        max_leaf_nodes=cfg["max_leaf_nodes"],
        bootstrap=cfg["do_bootstrapping"],
        random_state=seed)

    def rmse(y, y_pred):
        return np.sqrt(np.mean((y_pred - y)**2))

    # Create a root-mean-square-error scorer for sklearn's cross-validation
    rmse_scorer = make_scorer(rmse, greater_is_better=False)
    score = cross_val_score(rfr, boston.data, boston.target, cv=11, scoring=rmse_scorer)
    return -1 * np.mean(score)  # Because cross_val_score sign-flips the score
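Recent scikit-learn releases (0.22+) ship a built-in RMSE scorer, so the custom make_scorer wrapper is no longer needed; a sketch reusing the rfr and boston objects from the snippet above:

from sklearn.model_selection import cross_val_score

score = cross_val_score(rfr, boston.data, boston.target, cv=11,
                        scoring="neg_root_mean_squared_error")
mean_rmse = -score.mean()  # negate: sklearn reports losses as negative scores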
Source file: Models.py (project: Stock-Prediction-Time-Series-Analysis-Python, author: Nekooeimehr)
def Second_Model_KRR(Scaled_Input_Data, Output_Data):
    T0 = time.time()
    n = len(Scaled_Input_Data)
    Grid_Dict = {"alpha": [1e0, 1e-1, 1e-2], "gamma": np.logspace(-2, 1, 3)}
    krr_Tuned = GridSearchCV(KernelRidge(kernel='rbf', gamma=0.1), cv=5,
                             param_grid=Grid_Dict, scoring="mean_absolute_error")
    krr_Tuned.fit(Scaled_Input_Data, Output_Data)
    KRR_MSE = KernelRidge(kernel='rbf', alpha=krr_Tuned.best_params_['alpha'],
                          gamma=krr_Tuned.best_params_['gamma'])
    KRR_Time = time.time() - T0
    print('The computational time of Kernel Ridge Regression for ', n, ' examples is: ', KRR_Time)
    # Note: despite the variable names, the scorer below is mean *absolute* error.
    MSEs_KRR = cross_validation.cross_val_score(KRR_MSE, Scaled_Input_Data, Output_Data,
                                                cv=cross_validation.LeaveOneOut(n),
                                                scoring="mean_absolute_error")
    MeanMSE_KRR = np.mean(list(MSEs_KRR))
    print('The average MSE of Kernel Ridge Regression for ', n, ' examples is: ', (-1 * MeanMSE_KRR))
    return(MeanMSE_KRR, krr_Tuned)
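On scikit-learn 0.18+ the cross_validation module is gone, LeaveOneOut takes no size argument, and the scoring string becomes 'neg_mean_absolute_error'. A sketch of the equivalent scoring call, reusing the objects above:

from sklearn.model_selection import cross_val_score, LeaveOneOut

maes = cross_val_score(KRR_MSE, Scaled_Input_Data, Output_Data,
                       cv=LeaveOneOut(), scoring="neg_mean_absolute_error")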
def evaluate_model(model, X_train, y_train):
    """
    Args:
        model (sklearn classification model): the sklearn model that will be
            fit to the data and scored with 10-fold cross validation
        X_train (2d numpy array): this is the feature matrix
        y_train (1d numpy array): this is the array of targets
    Returns:
        prints information about the model's accuracy using 10
        fold cross validation
        model (sklearn classification model): the model that has already been
            fit to the data
    """
    print(np.mean(cross_val_score(model, X_train, y_train,
                                  cv=10, n_jobs=-1, verbose=10)))
    model.fit(X_train, y_train)
    return model
def clf_scores(clf, x_train, y_train, x_test, y_test):
    info = dict()
    # TODO: extend this to a confusion matrix per fold for more flexibility downstream (tuning)
    # TODO: calculate a set of ROC curves per fold instead of running it on test, currently introducing bias
    scores = cross_val_score(clf, x_train, y_train, cv=cv, n_jobs=-1)
    runtime = time()
    clf.fit(x_train, y_train)
    runtime = time() - runtime
    y_test_predicted = clf.predict(x_test)
    info['runtime'] = runtime
    info['accuracy'] = min(scores)  # worst fold: a pessimistic estimate
    info['accuracy_test'] = accuracy_score(y_test, y_test_predicted)
    info['accuracy_folds'] = scores
    info['confusion_matrix'] = confusion_matrix(y_test, y_test_predicted)
    clf.fit(x_train, y_train)
    fpr, tpr, _ = roc_curve(y_test, clf_predict_proba(clf, x_test))
    info['fpr'] = fpr
    info['tpr'] = tpr
    info['auc'] = auc(fpr, tpr)
    return info
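clf_predict_proba is a helper assumed by the snippet but not shown here. A hypothetical implementation that returns a positive-class score for binary classifiers, falling back to decision_function when predict_proba is unavailable:

def clf_predict_proba(clf, x):
    # Hypothetical helper: positive-class score for ROC computation (binary case).
    if hasattr(clf, 'predict_proba'):
        return clf.predict_proba(x)[:, 1]
    return clf.decision_function(x)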
def test_cross_val_score_mask():
    # test that cross_val_score works with boolean masks
    svm = SVC(kernel="linear")
    iris = load_iris()
    X, y = iris.data, iris.target
    cv_indices = cval.KFold(len(y), 5)
    scores_indices = cval.cross_val_score(svm, X, y, cv=cv_indices)
    cv_indices = cval.KFold(len(y), 5)
    cv_masks = []
    for train, test in cv_indices:
        mask_train = np.zeros(len(y), dtype=np.bool)
        mask_test = np.zeros(len(y), dtype=np.bool)
        mask_train[train] = 1
        mask_test[test] = 1
        cv_masks.append((mask_train, mask_test))  # append the masks, not the index arrays
    scores_masks = cval.cross_val_score(svm, X, y, cv=cv_masks)
    assert_array_equal(scores_indices, scores_masks)
def test_cross_val_score_precomputed():
    # test for svm with precomputed kernel
    svm = SVC(kernel="precomputed")
    iris = load_iris()
    X, y = iris.data, iris.target
    linear_kernel = np.dot(X, X.T)
    score_precomputed = cval.cross_val_score(svm, linear_kernel, y)
    svm = SVC(kernel="linear")
    score_linear = cval.cross_val_score(svm, X, y)
    assert_array_equal(score_precomputed, score_linear)
    # Error raised for non-square X
    svm = SVC(kernel="precomputed")
    assert_raises(ValueError, cval.cross_val_score, svm, X, y)
    # test error is raised when the precomputed kernel is not array-like
    # or sparse
    assert_raises(ValueError, cval.cross_val_score, svm,
                  linear_kernel.tolist(), y)
def test_cross_val_score_with_score_func_classification():
    iris = load_iris()
    clf = SVC(kernel='linear')
    # Default score (should be the accuracy score)
    scores = cval.cross_val_score(clf, iris.data, iris.target, cv=5)
    assert_array_almost_equal(scores, [0.97, 1., 0.97, 0.97, 1.], 2)
    # Correct classification score (aka. zero/one score) - should be the
    # same as the default estimator score
    zo_scores = cval.cross_val_score(clf, iris.data, iris.target,
                                     scoring="accuracy", cv=5)
    assert_array_almost_equal(zo_scores, [0.97, 1., 0.97, 0.97, 1.], 2)
    # F1 score (classes are balanced, so f1_score should equal the zero/one
    # score)
    f1_scores = cval.cross_val_score(clf, iris.data, iris.target,
                                     scoring="f1_weighted", cv=5)
    assert_array_almost_equal(f1_scores, [0.97, 1., 0.97, 0.97, 1.], 2)
def test_cross_val_score_with_score_func_regression():
    X, y = make_regression(n_samples=30, n_features=20, n_informative=5,
                           random_state=0)
    reg = Ridge()
    # Default score of the Ridge regression estimator
    scores = cval.cross_val_score(reg, X, y, cv=5)
    assert_array_almost_equal(scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)
    # R2 score (aka. determination coefficient) - should be the
    # same as the default estimator score
    r2_scores = cval.cross_val_score(reg, X, y, scoring="r2", cv=5)
    assert_array_almost_equal(r2_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)
    # Mean squared error; this is a loss function, so "scores" are negative
    mse_scores = cval.cross_val_score(reg, X, y, cv=5,
                                      scoring="mean_squared_error")
    expected_mse = np.array([-763.07, -553.16, -274.38, -273.26, -1681.99])
    assert_array_almost_equal(mse_scores, expected_mse, 2)
    # Explained variance
    scoring = make_scorer(explained_variance_score)
    ev_scores = cval.cross_val_score(reg, X, y, cv=5, scoring=scoring)
    assert_array_almost_equal(ev_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)
def test_cross_val_score_multilabel():
    X = np.array([[-3, 4], [2, 4], [3, 3], [0, 2], [-3, 1],
                  [-2, 1], [0, 0], [-2, -1], [-1, -2], [1, -2]])
    y = np.array([[1, 1], [0, 1], [0, 1], [0, 1], [1, 1],
                  [0, 1], [1, 0], [1, 1], [1, 0], [0, 0]])
    clf = KNeighborsClassifier(n_neighbors=1)
    scoring_micro = make_scorer(precision_score, average='micro')
    scoring_macro = make_scorer(precision_score, average='macro')
    scoring_samples = make_scorer(precision_score, average='samples')
    score_micro = cval.cross_val_score(clf, X, y, scoring=scoring_micro, cv=5)
    score_macro = cval.cross_val_score(clf, X, y, scoring=scoring_macro, cv=5)
    score_samples = cval.cross_val_score(clf, X, y,
                                         scoring=scoring_samples, cv=5)
    assert_almost_equal(score_micro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 3])
    assert_almost_equal(score_macro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4])
    assert_almost_equal(score_samples, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4])
def run():
    tr_data = np.loadtxt('../new/TRAIN_LRFORMAT.txt')
    te_data = np.loadtxt('../new/TEST_LRFORMAT.txt')
    tr_x = tr_data[:, 1:]
    tr_y = tr_data[:, 0]
    te_x = te_data[:, 1:]
    lr = LogisticRegression(
        solver='liblinear',
        multi_class='ovr',
        class_weight='balanced',
        penalty='l2',
        n_jobs=-1)
    #te_pred = lr.predict_proba(te_x)
    cv = 10
    scores = cross_val_score(lr, tr_x, tr_y, cv=cv, scoring='accuracy')
    print(str(scores))
    #np.savetxt('result/te_lr.txt',te_pred)
Source file: linearRegression_lassoRegularization.py (project: HousePricePredictionKaggle, author: Nuwantha)
def rmse_cv(model, X, y):
    return (cross_val_score(model, X, y, scoring=scorer)).mean()
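The module-level scorer is assumed by the snippet; given the function's name, a plausible (hypothetical) definition is an RMSE scorer built with make_scorer. Note that greater_is_better=False makes cross_val_score return negated values, so rmse_cv would report negative numbers under this definition:

import numpy as np
from sklearn.metrics import make_scorer, mean_squared_error

def _rmse(y_true, y_pred):
    # Hypothetical helper: root mean squared error.
    return np.sqrt(mean_squared_error(y_true, y_pred))

scorer = make_scorer(_rmse, greater_is_better=False)  # assumed definition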
def baseline_logisticRegression():
    train_data = pd.read_csv(r"data/train.csv")
    #print(u"Data info\n", train_data.info())
    #print(u'Data summary\n', train_data.describe())
    #display_data(train_data)  # visualize the raw data
    #display_with_process(train_data)  # visualize after preprocessing
    process_data = pre_processData(train_data, 'process_train_data')  # preprocess the training data
    train_data = process_data.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')  # keep the model features
    train_np = train_data.as_matrix()  # convert to a numpy array

    '''train the model'''
    X = train_np[:, 1:]
    y = train_np[:, 0]
    #=X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)
    #=model = linear_model.LogisticRegression(C=1.0,tol=1e-6).fit(X_train,y_train)
    model = linear_model.LogisticRegression(C=1.0, tol=1e-6).fit(X, y)
    print(pd.DataFrame({"columns": list(train_data.columns)[1:], "coef_": list(model.coef_.T)}))
    #=prediction = model.predict(X_test)
    #=cv_error = pd.DataFrame(data=list(X_test[np.where(prediction!=y_test)]),columns=list(train_data.columns)[1:])
    #=cv_error.to_csv(r'error.csv',index=True)
    #=print np.float32(np.sum(prediction == y_test))/np.float32(prediction.shape[0])

    '''predict on the test set'''
    test_data = pd.read_csv(r"data/test.csv")
    process_test_data = pre_processData(test_data, 'process_test_data')  # preprocess the test data
    test_data = process_test_data.filter(regex='Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    test_np = test_data.as_matrix()
    predict = model.predict(test_np)
    result = pd.DataFrame(data={'PassengerId': process_test_data['PassengerId'].as_matrix(),
                                'Survived': predict.astype(np.int32)})
    result.to_csv(r'baseline_logisticRegression_result/prediction.csv', index=False)
    #clf = linear_model.LogisticRegression(C=1.0,tol=1e-6)
    #print(cross_validation.cross_val_score(clf, X, y, cv=5))
# baseline SVM score: 0.78947
def baseline_logisticRegression_crossValidate():
    origin_train_data = pd.read_csv(r"data/train.csv")
    process_data = fe_preprocessData(origin_train_data, 'process_train_data')  # preprocess the training data
    process_data_train, process_data_cv = train_test_split(process_data, test_size=0.2)
    train_data = process_data_train.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')  # keep the model features
    train_np = train_data.as_matrix()  # convert to a numpy array

    '''train the model'''
    X_train = train_np[:, 1:]
    y_train = train_np[:, 0]
    model = linear_model.LogisticRegression(C=1.0, tol=1e-6).fit(X_train, y_train)
    print(pd.DataFrame({'columns': list(train_data.columns[1:]), 'coef_': list(model.coef_.T)}))

    cv_data = process_data_cv.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    cv_np = cv_data.as_matrix()
    X_cv = cv_np[:, 1:]
    y_cv = cv_np[:, 0]
    predictions = model.predict(X_cv)
    print(np.float32(np.sum(predictions == y_cv)) / np.float32(predictions.shape[0]))

    '''collect the misclassified validation rows for inspection'''
    error_items = origin_train_data.loc[origin_train_data['PassengerId'].isin(
        process_data_cv[predictions != y_cv]['PassengerId'].values)]
    predictions_item = pd.DataFrame(data=process_data_cv[predictions != y_cv]['PassengerId'])
    predictions_item.columns = ['error_PassengerId']
    error_result = pd.concat([error_items, predictions_item], axis=1)
    error_result.to_csv(r'error.csv', index=False)
    #=print pd.DataFrame({"columns":list(train_data.columns)[1:],"coef_":list(model.coef_.T)})
    #=prediction = model.predict(X_test)
    #=print np.float32(np.sum(prediction == y_test))/np.float32(prediction.shape[0])

    '''predict on the test set'''
    '''test_data = pd.read_csv(r"data/test.csv")
    process_test_data = fe_preprocessData(test_data,'process_test_data',optimize=True)  # preprocess the test data
    test_data = process_test_data.filter(regex='Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    test_np = test_data.as_matrix()
    predict = model.predict(test_np)
    result = pd.DataFrame(data={'PassengerId':process_test_data['PassengerId'].as_matrix(),'Survived':predict.astype(np.int32)})
    result.to_csv(r'logisticRegression_result/prediction.csv',index=False)'''
    #clf = linear_model.LogisticRegression(C=1.0,tol=1e-6)
    #print(cross_validation.cross_val_score(clf, X, y, cv=5))
def optimize_logisticRegression():
    train_data = pd.read_csv(r"data/train.csv")
    print(u"Data info\n", train_data.info())
    print(u'Data summary\n', train_data.describe())
    #display_data(train_data)  # visualize the raw data
    #display_with_process(train_data)  # visualize after preprocessing
    process_data = fe_preprocessData(train_data, 'process_train_data')  # preprocess the training data
    train_data = process_data.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')  # keep the model features
    train_np = train_data.as_matrix()  # convert to a numpy array

    '''train the model'''
    X = train_np[:, 1:]
    y = train_np[:, 0]
    #=X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)
    #=model = linear_model.LogisticRegression(C=1.0,tol=1e-6).fit(X_train,y_train)
    model = linear_model.LogisticRegression(C=1.0, tol=1e-6).fit(X, y)
    print(pd.DataFrame({"columns": list(train_data.columns)[1:], "coef_": list(model.coef_.T)}))

    '''predict on the test set'''
    test_data = pd.read_csv(r"data/test.csv")
    process_test_data = fe_preprocessData(test_data, 'process_test_data')  # preprocess the test data
    test_data = process_test_data.filter(regex='Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    test_np = test_data.as_matrix()
    predict = model.predict(test_np)
    result = pd.DataFrame(data={'PassengerId': process_test_data['PassengerId'].as_matrix(),
                                'Survived': predict.astype(np.int32)})
    result.to_csv(r'optimize_logisticRegression_result/prediction.csv', index=False)
    #clf = linear_model.LogisticRegression(C=1.0,tol=1e-6)
    #print(cross_validation.cross_val_score(clf, X, y, cv=5))
def stump(X, y):
    score = cross_val_score(LinearSVC(), X, y, cv=5, n_jobs=5, scoring='average_precision')
    clf = LinearSVC()
    clf.fit(X, y)
    coef = clf.coef_[0, 0]
    inter = clf.intercept_[0]
    # Normalize the intercept by the weight magnitude to get the decision threshold.
    return np.mean(score), np.sign(coef), inter / np.abs(coef)
def run_cross_validation(self):
    features, labels, cv = self.getFeaturesLabel()
    scores = cross_validation.cross_val_score(self.clf, features, labels, cv=cv,
                                              scoring=mean_absolute_percentage_error_scoring, n_jobs=-1)
    print("cross validation scores: means, {}, std, {}, details, {}".format(
        np.absolute(scores.mean()), scores.std(), np.absolute(scores)))
    return -np.absolute(scores.mean())
def build_random_forest_model(x_train, y_train):
    rf_model = RandomForestClassifier(random_state=42)
    rf_model.fit(x_train, y_train.ravel())
    print("10-fold cross validation score is:")
    print(np.mean(cross_val_score(rf_model, x_train, y_train, cv=10)))
    return rf_model
Source file: sklearn_data.py (project: -Classification-on-Chinese-Magazine-, author: lixiaosi33)
def evaluate_cross_validation(clf, X, y, K):
    # create a K-fold cross validation iterator
    cv = KFold(len(y), K, shuffle=True, random_state=0)
    # by default the score used is the one returned by the score method of the estimator (accuracy)
    scores = cross_val_score(clf, X, y, cv=cv)
    print(scores)
    print("Mean score: {0:.3f} (+/-{1:.3f})".format(np.mean(scores), sem(scores)))
def hackathon_GBC_model(clf, train, features):
    clf.fit(train[features], train["Class"])
    probab_of_predict = clf.predict_proba(train[features])[:, 1]
    predict_train = clf.predict(train[features])
    cv_score = cross_val_score(clf, train[features], train["Class"], cv=5, scoring="roc_auc")
    print("----------------------Model performance-----------------------")
    print("Accuracy score: ", accuracy_score(train["Class"].values, predict_train))
    print("AUC: ", roc_auc_score(train["Class"], probab_of_predict))
    print("CV score: Mean - {}, Max - {}, Min - {}, Std - {}".format(
        np.mean(cv_score), np.max(cv_score), np.min(cv_score), np.std(cv_score)))
    Relative_Feature_importance = pd.Series(clf.feature_importances_, features).sort_values(ascending=False)
    Relative_Feature_importance.plot(kind='bar', title='Order of Feature Importance')
    plt.ylabel('Feature Importance')
    plt.show()
def print_metrics(clf):
    #scores = cross_validation.cross_val_score(clf,features,labels,cv=5,scoring='accuracy')
    #print('Accuracy:', scores.mean())
    cv = cross_validation.StratifiedKFold(labels, n_folds=5)
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    all_tpr = []
    # Plot one ROC curve per fold and accumulate the interpolated TPRs.
    for i, (train, test) in enumerate(cv):
        probas_ = clf.fit(features[train], labels[train]).predict_proba(features[test])
        fpr, tpr, thresholds = metrics.roc_curve(labels[test], probas_[:, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = metrics.auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))
    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
    mean_tpr /= len(cv)
    mean_tpr[-1] = 1.0
    mean_auc = metrics.auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, 'k--',
             label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.savefig('auc_sent.png')