def _cv_len(cv, X, y):
    """Compute the length of a cross-validation object,
    agnostic of whether sklearn-0.17 or sklearn-0.18
    is being used.

    Parameters
    ----------
    cv : `sklearn.cross_validation._PartitionIterator` or `sklearn.model_selection.BaseCrossValidator`
        The cv object from which to extract length. If using
        sklearn-0.17, this can be computed by calling `len` on
        ``cv``, else it's computed with `cv.get_n_splits(X, y)`.

    X : pd.DataFrame or np.ndarray, shape(n_samples, n_features)
        The dataframe or np.ndarray being fit in the grid search.

    y : np.ndarray, shape(n_samples,)
        The target being fit in the grid search.

    Returns
    -------
    int
    """
    return len(cv) if not SK18 else cv.get_n_splits(X, y)
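# --- illustrative usage (not in the original source): a minimal sketch of
# _cv_len on sklearn >= 0.18; assumes the module-level SK18 flag has already
# been set by a version check, as it is elsewhere in this module.
def _demo_cv_len():
    import numpy as np
    from sklearn.model_selection import KFold

    X = np.random.rand(20, 3)
    y = np.random.randint(0, 2, 20)
    cv = KFold(n_splits=5)
    # On sklearn >= 0.18 this resolves to cv.get_n_splits(X, y) and returns 5.
    return _cv_len(cv, X, y)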
def _set_cv(cv, X, y, classifier):
    """Return either a `sklearn.cross_validation._PartitionIterator` or a
    `sklearn.model_selection.BaseCrossValidator`, depending on whether sklearn-0.17
    or sklearn-0.18 is being used.

    Parameters
    ----------
    cv : int, `_PartitionIterator` or `BaseCrossValidator`
        The CV object or int to check. If an int, it will be converted
        into the appropriate cross-validator class.

    X : pd.DataFrame or np.ndarray, shape(n_samples, n_features)
        The dataframe or np.ndarray being fit in the grid search.

    y : np.ndarray, shape(n_samples,)
        The target being fit in the grid search.

    classifier : bool
        Whether the estimator being fit is a classifier.

    Returns
    -------
    `_PartitionIterator` or `BaseCrossValidator`
    """
    return check_cv(cv, X, y, classifier) if not SK18 else check_cv(cv, y, classifier)
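# --- illustrative usage (not in the original source): a sketch of the
# _set_cv helper defined directly above, converting an int into a concrete
# cross-validator; assumes SK18 and check_cv are available as in this module.
def _demo_set_cv():
    import numpy as np

    X = np.random.rand(30, 4)
    y = np.random.randint(0, 2, 30)
    # check_cv expands the int into a (Stratified)KFold-style validator.
    cv = _set_cv(3, X, y, classifier=True)
    return _cv_len(cv, X, y)  # -> 3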
def _cross_val(data, est, cv, n_jobs):
    """Helper to compute cross validation."""
    try:
        from sklearn.model_selection import cross_val_score
    except ImportError:
        # XXX support sklearn < 0.18
        from sklearn.cross_validation import cross_val_score
    return np.mean(cross_val_score(est, data, cv=cv, n_jobs=n_jobs,
                                   scoring=_gaussian_loglik_scorer))
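# --- illustrative usage (not in the original source): the same import
# fallback applied with a generic estimator and the default scorer, instead of
# the module's _gaussian_loglik_scorer.
def _demo_cross_val_fallback():
    import numpy as np
    try:
        from sklearn.model_selection import cross_val_score
    except ImportError:  # sklearn < 0.18
        from sklearn.cross_validation import cross_val_score
    from sklearn.linear_model import LogisticRegression

    X = np.random.rand(40, 3)
    y = np.random.randint(0, 2, 40)
    return np.mean(cross_val_score(LogisticRegression(), X, y, cv=3))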
def __init__(self, clf=None, le=None):
    # type: (sklearn.model_selection.GridSearchCV, sklearn.preprocessing.LabelEncoder) -> None
    """Construct a new intent classifier using the sklearn framework."""
    from sklearn.preprocessing import LabelEncoder

    if le is not None:
        self.le = le
    else:
        self.le = LabelEncoder()
    self.clf = clf
def train(self, training_data, config, **kwargs):
    # type: (TrainingData, RasaNLUConfig, **Any) -> None
    """Train the intent classifier on a data set.

    :param num_threads: number of threads used during training time"""
    from sklearn.model_selection import GridSearchCV
    from sklearn.svm import SVC
    import numpy as np

    labels = [e.get("intent") for e in training_data.intent_examples]
    if len(set(labels)) < 2:
        logger.warning("Cannot train an intent classifier. Need at least 2 different classes. "
                       "Skipping training of intent classifier.")
    else:
        y = self.transform_labels_str2num(labels)
        X = np.stack([example.get("text_features") for example in training_data.intent_examples])
        sklearn_config = config.get("intent_classifier_sklearn")
        C = sklearn_config.get("C", [1, 2, 5, 10, 20, 100])
        kernel = sklearn_config.get("kernel", "linear")
        # cast kernel to str explicitly, since sklearn expects a plain str rather than a basestring instance
        tuned_parameters = [{"C": C, "kernel": [str(kernel)]}]
        cv_splits = max(2, min(MAX_CV_FOLDS, np.min(np.bincount(y)) // 5))  # aim for 5 examples in each fold
        self.clf = GridSearchCV(SVC(C=1, probability=True, class_weight='balanced'),
                                param_grid=tuned_parameters, n_jobs=config["num_threads"],
                                cv=cv_splits, scoring='f1_weighted', verbose=1)
        self.clf.fit(X, y)
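# --- illustrative helper (not in the original source): the fold-count
# heuristic used in train() above, spelled out; MAX_CV_FOLDS is assumed to be
# a small module-level constant (e.g. 5).
def _demo_cv_splits(y, max_cv_folds=5):
    import numpy as np
    # The rarest class limits how many folds can hold roughly 5 examples each,
    # clamped to the range [2, max_cv_folds].
    return max(2, min(max_cv_folds, np.min(np.bincount(y)) // 5))
# e.g. if the rarest intent has 12 examples, 12 // 5 == 2, so 2 folds are used.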
def _set_cv(cv, estimator=None, X=None, y=None):
    """Set the default CV depending on whether the estimator
    is a classifier or a regressor."""
    # Detect whether classification or regression
    if estimator in ['classifier', 'regressor']:
        est_is_classifier = estimator == 'classifier'
    else:
        est_is_classifier = is_classifier(estimator)
    # Setup CV
    if check_version('sklearn', '0.18'):
        from sklearn import model_selection as models
        from sklearn.model_selection import (check_cv,
                                             StratifiedKFold, KFold)
        if isinstance(cv, (int, np.integer)):
            XFold = StratifiedKFold if est_is_classifier else KFold
            cv = XFold(n_splits=cv)
        elif isinstance(cv, str):
            if not hasattr(models, cv):
                raise ValueError('Unknown cross-validation')
            cv = getattr(models, cv)
            cv = cv()
        cv = check_cv(cv=cv, y=y, classifier=est_is_classifier)
    else:
        from sklearn import cross_validation as models
        from sklearn.cross_validation import (check_cv,
                                              StratifiedKFold, KFold)
        if isinstance(cv, (int, np.integer)):
            if est_is_classifier:
                cv = StratifiedKFold(y=y, n_folds=cv)
            else:
                cv = KFold(n=len(y), n_folds=cv)
        elif isinstance(cv, str):
            if not hasattr(models, cv):
                raise ValueError('Unknown cross-validation')
            cv = getattr(models, cv)
            if cv.__name__ not in ['KFold', 'LeaveOneOut']:
                raise NotImplementedError('CV cannot be defined with str'
                                          ' for sklearn < 0.18.')
            cv = cv(len(y))
        cv = check_cv(cv=cv, X=X, y=y, classifier=est_is_classifier)
    # Extract train and test set to retrieve them at predict time
    if hasattr(cv, 'split'):
        cv_splits = [(train, test) for train, test in
                     cv.split(X=np.zeros_like(y), y=y)]
    else:
        # XXX support sklearn.cross_validation cv
        cv_splits = [(train, test) for train, test in cv]
    if not np.all([len(train) for train, _ in cv_splits]):
        raise ValueError('Some folds do not have any train epochs.')
    return cv, cv_splits
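# --- illustrative usage (not in the original source): a sketch of the
# (cv, cv_splits) pair returned by the _set_cv helper defined directly above,
# on a toy balanced binary problem; assumes the surrounding imports
# (numpy as np, check_version, is_classifier) are available.
def _demo_set_cv_splits():
    import numpy as np

    y = np.array([0, 1] * 10)
    cv, cv_splits = _set_cv(5, estimator='classifier', X=None, y=y)
    # cv_splits is a list of (train_indices, test_indices) pairs, one per fold.
    return len(cv_splits)  # -> 5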