import numpy as np
from sklearn import cross_validation
from sklearn.base import clone

def train_and_calibrate_cv(model, X_tr, y_tr, cv=5):
    y_pred_xval = np.zeros(len(y_tr))
    skf = cross_validation.StratifiedKFold(y_tr, n_folds=cv, shuffle=True)
    i = 0
    for train, test in skf:
        i += 1
        print("training fold {} of {}".format(i, cv))
        X_train_xval = np.array(X_tr)[train, :]
        X_test_xval = np.array(X_tr)[test, :]
        y_train_xval = np.array(y_tr)[train]
        # Clone the model so each fold is fit from a fresh, unfitted state
        model_copy = clone(model)
        model_copy.fit(X_train_xval, y_train_xval)
        # Bug fix: predict with the fold's fitted copy, not the unfitted original
        y_pred_xval[test] = model_copy.predict_proba(X_test_xval)[:, 1]
    print("training full model")
    model_copy = clone(model)
    model_copy.fit(X_tr, y_tr)
    print("calibrating function")
    # prob_calibration_function is assumed to be defined elsewhere in this
    # module (e.g. the ml_insights package provides one)
    calib_func = prob_calibration_function(y_tr, y_pred_xval)
    return model_copy, calib_func
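
# Usage sketch (an assumption, not from the original source): exercising
# train_and_calibrate_cv on synthetic data with a random forest. Runnable
# only if prob_calibration_function is in scope (e.g. from ml_insights).
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X_demo, y_demo = make_classification(n_samples=500, random_state=0)
rf = RandomForestClassifier(n_estimators=100, random_state=0)
fitted_model, calib_func = train_and_calibrate_cv(rf, X_demo, y_demo, cv=5)
# calib_func maps raw predicted probabilities to calibrated probabilities
calibrated = calib_func(fitted_model.predict_proba(X_demo)[:, 1])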
def _cv_len(cv, X, y):
    """This method computes the length of a cross validation
    object, agnostic of whether sklearn-0.17 or sklearn-0.18
    is being used.

    Parameters
    ----------
    cv : `sklearn.cross_validation._PartitionIterator` or `sklearn.model_selection.BaseCrossValidator`
        The cv object from which to extract length. If using
        sklearn-0.17, this can be computed by calling `len` on
        ``cv``, else it's computed with ``cv.get_n_splits(X, y)``.

    X : pd.DataFrame or np.ndarray, shape(n_samples, n_features)
        The dataframe or np.ndarray being fit in the grid search.

    y : np.ndarray, shape(n_samples,)
        The target being fit in the grid search.

    Returns
    -------
    int
    """
    return len(cv) if not SK18 else cv.get_n_splits(X, y)
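
# SK18 above is referenced but never defined in this excerpt. A plausible
# definition (an assumption, not from the original source) is a module-level
# version flag computed once at import time:
import sklearn
from distutils.version import LooseVersion
SK18 = LooseVersion(sklearn.__version__) >= LooseVersion('0.18')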
def best_shape_clustering(mols, nb_layers, k_range=range(3, 20), train_ratio=0.8, cluster_key='shape_cid'):
    from sklearn.cross_validation import train_test_split
    from sklearn.metrics import silhouette_score

    shape_df = mols['dynamic'].apply(lambda x: temporal_shape(x, nb_layers))
    train_idx, test_idx = train_test_split(shape_df.index.values, train_size=train_ratio)
    train_mat = np.array(list(shape_df[shape_df.index.isin(train_idx)].values))
    full_mat = np.array(list(shape_df.values))

    centroids = None
    labels = None
    best_score = 0
    for k in k_range:
        # Fit centroids on the training subset, label the full set, and keep
        # the k with the best silhouette score
        res = cluster_shapes(train_mat, full_mat, k)
        score = silhouette_score(full_mat, res[1])
        if score > best_score:
            centroids = res[0]
            labels = res[1]
            best_score = score

    mols[cluster_key] = labels
    return mols, centroids
def splitValidateModel(self, visualizePredictions=False):
    (label_vector, input_vector) = loadData(self.featureFile)

    indexArray = range(0, len(input_vector))
    trainData, testData, trainLabels, expectedLabels, trainIndices, testIndices = \
        cross_validation.train_test_split(input_vector, label_vector, indexArray,
                                          test_size=(1.0 - self.percentSplit))

    kNNClassifier = neighbors.KNeighborsClassifier(self.n_neighbors, weights='distance')
    kNNClassifier.fit(trainData, trainLabels)
    predictedLabels = kNNClassifier.predict(testData)

    print("Classification report for classifier %s:\n%s\n"
          % ('k-NearestNeighbour', metrics.classification_report(expectedLabels, predictedLabels)))
    print("Confusion matrix:\n%s" % metrics.confusion_matrix(expectedLabels, predictedLabels))
    print('Split Validation training :: Done.\n')

    if visualizePredictions:
        self.__visualizePredictedDataset__(input_vector, testIndices, predictedLabels, expectedLabels)
def trainLimited(self, featureFile, n_datapoints):
    (label_vector, input_vector) = loadData(featureFile)
    # test_size=0: the split is used only to shuffle; all samples land in
    # the training arrays
    trainData, testData, trainLabels, testLabels = \
        cross_validation.train_test_split(input_vector, label_vector, test_size=0)

    n_totalrows = int(len(label_vector) / n_datapoints)
    for n in range(0, n_totalrows):
        # Train on an incrementally growing prefix of the shuffled data
        limited_label_vector = trainLabels[0:(n + 1) * n_datapoints]
        limited_input_vector = trainData[0:(n + 1) * n_datapoints]

        kNNClassifier = neighbors.KNeighborsClassifier(self.n_neighbors, weights='distance')
        kNNClassifier.fit(limited_input_vector, limited_label_vector)

        scores = cross_validation.cross_val_score(kNNClassifier, limited_input_vector,
                                                  limited_label_vector, cv=5)
        print('%f on %d datapoints' % (sum(scores) / len(scores), len(limited_label_vector)))
def _set_cv(cv, X, y, classifier):
    """This method returns either a `sklearn.cross_validation._PartitionIterator` or
    `sklearn.model_selection.BaseCrossValidator` depending on whether sklearn-0.17
    or sklearn-0.18 is being used.

    Parameters
    ----------
    cv : int, `_PartitionIterator` or `BaseCrossValidator`
        The CV object or int to check. If an int, will be converted
        into the appropriate class of cross-validator.

    X : pd.DataFrame or np.ndarray, shape(n_samples, n_features)
        The dataframe or np.ndarray being fit in the grid search.

    y : np.ndarray, shape(n_samples,)
        The target being fit in the grid search.

    classifier : bool
        Whether the estimator being fit is a classifier.

    Returns
    -------
    `_PartitionIterator` or `BaseCrossValidator`
    """
    # sklearn-0.17's check_cv takes X; sklearn-0.18's signature drops it
    return check_cv(cv, X, y, classifier) if not SK18 else check_cv(cv, y, classifier)
def shuffle_data(data, labels):
    # test_size=0.0 keeps all samples; train_test_split is used here only
    # for its deterministic shuffling
    data, _, labels, _ = sklearn.cross_validation.train_test_split(
        data, labels, test_size=0.0, random_state=42)
    return data, labels
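
# A clearer equivalent (a suggestion, not from the original source) that
# works on any sklearn version is sklearn.utils.shuffle:
from sklearn.utils import shuffle as sk_shuffle

def shuffle_data_alt(data, labels):
    # Shuffle both arrays in unison with a fixed seed
    return sk_shuffle(data, labels, random_state=42)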
def crossValidateModel(self):
    (label_vector, input_vector) = loadData(self.featureFile)
    kFold = 5
    kNNClassifier = neighbors.KNeighborsClassifier(self.n_neighbors, weights='distance')
    scores = cross_validation.cross_val_score(kNNClassifier, input_vector, label_vector, cv=kFold)

    print("\n----- k-fold Cross Validation -----")
    print(scores)
    print("Average: ", sum(scores) / len(scores))
def trainLimitedMLP(self, featureFile, n_datapoints):
    (label_vector, input_vector) = self.__loadData__(featureFile)
    n_totalrows = int(len(label_vector) / n_datapoints)
    k = []
    for n in range(0, n_totalrows):
        trainData, testData, trainLabels, testLabels = \
            cross_validation.train_test_split(input_vector, label_vector, test_size=0.2)
        limited_label_vector = trainLabels[0:(n + 1) * n_datapoints]
        limited_input_vector = trainData[0:(n + 1) * n_datapoints]

        # Average the best accuracy over five training runs
        average = []
        for a in range(0, 5):
            _, maxVal = self.trainMLPWithData(limited_input_vector, limited_label_vector, 1000)
            average.append(maxVal)
        averageMaxVal = sum(average) / len(average)
        print('Total Average Value: %s \n\n' % averageMaxVal)
        k.append(averageMaxVal)

    print('Limited MLP training result -------------')
    for i in range(0, len(k)):
        print('%f on %d datapoints' % (k[i], n_datapoints * (i + 1)))
    print('------------------------------------------')
def _cross_val(data, est, cv, n_jobs):
    """Helper to compute cross validation."""
    try:
        from sklearn.model_selection import cross_val_score
    except ImportError:
        # XXX support sklearn < 0.18
        from sklearn.cross_validation import cross_val_score
    return np.mean(cross_val_score(est, data, cv=cv, n_jobs=n_jobs,
                                   scoring=_gaussian_loglik_scorer))
def _set_cv(cv, estimator=None, X=None, y=None):
    """Set the default CV depending on whether clf
    is classifier/regressor."""
    # Detect whether classification or regression
    if estimator in ['classifier', 'regressor']:
        est_is_classifier = estimator == 'classifier'
    else:
        est_is_classifier = is_classifier(estimator)
    # Setup CV
    if check_version('sklearn', '0.18'):
        from sklearn import model_selection as models
        from sklearn.model_selection import (check_cv,
                                             StratifiedKFold, KFold)
        if isinstance(cv, (int, np.int)):
            XFold = StratifiedKFold if est_is_classifier else KFold
            cv = XFold(n_splits=cv)
        elif isinstance(cv, str):
            if not hasattr(models, cv):
                raise ValueError('Unknown cross-validation')
            cv = getattr(models, cv)
            cv = cv()
        cv = check_cv(cv=cv, y=y, classifier=est_is_classifier)
    else:
        from sklearn import cross_validation as models
        from sklearn.cross_validation import (check_cv,
                                              StratifiedKFold, KFold)
        if isinstance(cv, (int, np.int)):
            if est_is_classifier:
                cv = StratifiedKFold(y=y, n_folds=cv)
            else:
                cv = KFold(n=len(y), n_folds=cv)
        elif isinstance(cv, str):
            if not hasattr(models, cv):
                raise ValueError('Unknown cross-validation')
            cv = getattr(models, cv)
            if cv.__name__ not in ['KFold', 'LeaveOneOut']:
                raise NotImplementedError('CV cannot be defined with str'
                                          ' for sklearn < 0.17.')
            cv = cv(len(y))
        cv = check_cv(cv=cv, X=X, y=y, classifier=est_is_classifier)

    # Extract train and test set to retrieve them at predict time
    if hasattr(cv, 'split'):
        cv_splits = [(train, test) for train, test in
                     cv.split(X=np.zeros_like(y), y=y)]
    else:
        # XXX support sklearn.cross_validation cv
        cv_splits = [(train, test) for train, test in cv]

    if not np.all([len(train) for train, _ in cv_splits]):
        raise ValueError('Some folds do not have any train epochs.')
    return cv, cv_splits
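
# Usage sketch (an assumption, not from the original source): resolving an
# integer cv into concrete train/test index pairs for a classifier. Assumes
# np, check_version, and is_classifier are imported as in the module above.
y_demo = np.array([0, 1] * 10)
cv_obj, cv_splits_demo = _set_cv(5, estimator='classifier', X=None, y=y_demo)
train_idx, test_idx = cv_splits_demo[0]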
def trainMLPWithData(self, input_vector, label_vector, printSteps=250):
    percent_split = 0.7
    trX, teX, trY, teY = cross_validation.train_test_split(
        input_vector, label_vector, test_size=(1.0 - percent_split), random_state=0)

    n_inputs = 10
    n_outputs = 8
    X = tf.placeholder("float", [None, n_inputs])
    Y = tf.placeholder("float", [None, n_outputs])

    w_h = tf.Variable(tf.random_normal([n_inputs, 10], stddev=0.01))
    w_o = tf.Variable(tf.random_normal([10, n_outputs], stddev=0.01))

    p_keep_input = tf.placeholder("float")
    p_keep_hidden = tf.placeholder("float")
    # Bug fix: keep the placeholder X intact so feed_dict={X: ...} feeds the
    # network input; the original rebound X to the dropout op, which silently
    # bypassed input dropout
    X_drop = tf.nn.dropout(X, p_keep_input)
    h = tf.nn.relu(tf.matmul(X_drop, w_h))
    h = tf.nn.dropout(h, p_keep_hidden)
    py_x = tf.matmul(h, w_o)

    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(py_x, Y))
    train_step = tf.train.RMSPropOptimizer(0.001, 0.9).minimize(cost)
    # train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cost)

    # Add accuracy checking nodes (compare against the label placeholder Y
    # rather than embedding teY as a constant)
    tf_correct_prediction = tf.equal(tf.argmax(py_x, 1), tf.argmax(Y, 1))
    tf_accuracy = tf.reduce_mean(tf.cast(tf_correct_prediction, "float"))

    # Init variables
    init = tf.initialize_all_variables()
    sess = tf.Session()
    sess.run(init)

    k = []
    for i in range(10000):
        sess.run(train_step, feed_dict={X: trX, Y: trY, p_keep_input: 0.8, p_keep_hidden: 0.5})
        result = sess.run(tf_accuracy, feed_dict={X: teX, Y: teY, p_keep_input: 1.0, p_keep_hidden: 1.0})
        # Save data
        k.append(result)
        if i % printSteps == 0:
            print("Run {},{}".format(i, result))

    k = np.array(k)
    print("Max accuracy: {}".format(k.max()))
    print('MLP training with %s datapoints :: Done \n\n' % len(input_vector))
    self.trainedModel = sess
    return (self.trainedModel, k.max())
def rand_forest_train(self):
    # Read the user data
    users = pd.read_csv('names.csv')

    # Use similarity, platform, reputation, and entropy as training features
    X = users[['similarity', 'platform', 'reputation', 'entropy']]
    y = users['human_or_machine']

    # Split the data, holding out 25% as the test set
    from sklearn.cross_validation import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)

    # Vectorize the feature dictionaries
    from sklearn.feature_extraction import DictVectorizer
    vec = DictVectorizer(sparse=False)
    X_train = vec.fit_transform(X_train.to_dict(orient='records'))
    X_test = vec.transform(X_test.to_dict(orient='records'))

    # Train a single decision tree and predict on the test set
    from sklearn.tree import DecisionTreeClassifier
    dtc = DecisionTreeClassifier()
    dtc.fit(X_train, y_train)
    dtc_y_pred = dtc.predict(X_test)

    # Train a random forest and predict on the test set
    from sklearn.ensemble import RandomForestClassifier
    rfc = RandomForestClassifier()
    rfc.fit(X_train, y_train)
    rfc_y_pred = rfc.predict(X_test)

    # Train a gradient boosting classifier and predict on the test set
    from sklearn.ensemble import GradientBoostingClassifier
    gbc = GradientBoostingClassifier()
    gbc.fit(X_train, y_train)
    gbc_y_pred = gbc.predict(X_test)

    from sklearn.metrics import classification_report

    # Report each model's test accuracy along with precision, recall, and F1
    print("Decision tree accuracy:", dtc.score(X_test, y_test))
    print(classification_report(dtc_y_pred, y_test))

    print("Random forest accuracy:", rfc.score(X_test, y_test))
    print(classification_report(rfc_y_pred, y_test))

    print("Gradient boosting accuracy:", gbc.score(X_test, y_test))
    print(classification_report(gbc_y_pred, y_test))

    # Read the data to classify and predict with the random forest
    users = pd.read_csv('values.csv')
    X = users[['similarity', 'platform', 'reputation', 'entropy']]
    X = vec.transform(X.to_dict(orient='records'))
    print(rfc.predict(X))

    self.dtc = dtc
    self.rfc = rfc
    self.gbc = gbc
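
# Porting note (not from the original sources): the sklearn.cross_validation
# module used throughout these snippets was deprecated in scikit-learn 0.18
# and removed in 0.20; on modern versions the equivalents live in
# sklearn.model_selection:
from sklearn.model_selection import (train_test_split, cross_val_score,
                                     StratifiedKFold, KFold, check_cv)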