# ngram_featurizer.py - source file, Python code snippet

def _sort_applicable_ngrams(self, list_of_ngrams, examples, labels):
        """Given an intent classification problem and a list of ngrams,

        creates ordered list of most useful ngrams."""

        if list_of_ngrams:
            from sklearn import linear_model, preprocessing
            import numpy as np

            # filter examples where we do not have enough labeled instances for cv
            usable_labels = []
            for label in np.unique(labels):
                lab_sents = np.array(examples)[np.array(labels) == label]
                if len(lab_sents) < self.min_intent_examples_for_ngram_classification:
                    continue
                usable_labels.append(label)

            mask = [label in usable_labels for label in labels]
            if any(mask) and len(usable_labels) >= 2:
                try:
                    examples = np.array(examples)[mask]
                    labels = np.array(labels)[mask]

                    X = np.array(self._ngrams_in_sentences(examples, list_of_ngrams))
                    intent_encoder = preprocessing.LabelEncoder()
                    intent_encoder.fit(labels)
                    y = intent_encoder.transform(labels)

                    clf = linear_model.RandomizedLogisticRegression(C=1)
                    clf.fit(X, y)
                    scores = clf.scores_
                    sort_idx = [i[0] for i in sorted(enumerate(scores), key=lambda x: -1 * x[1])]

                    return np.array(list_of_ngrams)[sort_idx]
                except ValueError as e:
                    if "needs samples of at least 2 classes" in str(e):
                        # we got unlucky during the random sampling :( and selected a slice that only contains one class
                        return []
                    else:
                        raise e
            else:
                # there is no example we can use for the cross validation
                return []
        else:
            return []