sklearn_intent_classifier.py 文件源码-python代码片段

def train(self, training_data, config, **kwargs):
        # type: (TrainingData, RasaNLUConfig, **Any) -> None
        """Train the intent classifier on a data set.

        :param num_threads: number of threads used during training time"""
        from sklearn.model_selection import GridSearchCV
        from sklearn.svm import SVC
        import numpy as np

        labels = [e.get("intent") for e in training_data.intent_examples]

        if len(set(labels)) < 2:
            logger.warn("Can not train an intent classifier. Need at least 2 different classes. " +
                        "Skipping training of intent classifier.")
        else:
            y = self.transform_labels_str2num(labels)
            X = np.stack([example.get("text_features") for example in training_data.intent_examples])

            sklearn_config = config.get("intent_classifier_sklearn")
            C = sklearn_config.get("C", [1, 2, 5, 10, 20, 100])
            kernel = sklearn_config.get("kernel", "linear")
            # dirty str fix because sklearn is expecting str not instance of basestr...
            tuned_parameters = [{"C": C, "kernel": [str(kernel)]}]
            cv_splits = max(2, min(MAX_CV_FOLDS, np.min(np.bincount(y)) // 5))  # aim for 5 examples in each fold

            self.clf = GridSearchCV(SVC(C=1, probability=True, class_weight='balanced'),
                                    param_grid=tuned_parameters, n_jobs=config["num_threads"],
                                    cv=cv_splits, scoring='f1_weighted', verbose=1)

            self.clf.fit(X, y)