def _sort_applicable_ngrams(self, list_of_ngrams, examples, labels):
    """Rank candidate ngrams by their usefulness for intent classification.

    Labels with fewer than
    ``self.min_intent_examples_for_ngram_classification`` examples are
    discarded first. The remaining examples are turned into features with
    ``self._ngrams_in_sentences`` and used to fit a randomized logistic
    regression; the ngrams are then ordered by the fitted ``scores_``,
    highest first.

    Args:
        list_of_ngrams: Candidate ngrams to rank.
        examples: Training examples, aligned index-by-index with ``labels``.
        labels: Intent label of each example.

    Returns:
        A numpy array containing ``list_of_ngrams`` reordered from highest
        to lowest score. An empty list is returned instead when there are
        no ngrams, when fewer than two labels have enough examples, or when
        the classifier reports that it only saw a single class.

    Raises:
        ValueError: Any ``ValueError`` raised while fitting other than the
            "needs samples of at least 2 classes" case.
    """
    if not list_of_ngrams:
        return []

    import numpy as np

    # Count the examples per label once instead of re-slicing the examples
    # array for every label.
    labels_arr = np.array(labels)
    unique_labels, counts = np.unique(labels_arr, return_counts=True)
    min_examples = self.min_intent_examples_for_ngram_classification
    usable_labels = unique_labels[counts >= min_examples]
    if len(usable_labels) < 2:
        # There is no example we can use for the cross validation.
        return []

    # Imported lazily so the early-return paths do not need scikit-learn.
    from sklearn import linear_model, preprocessing

    mask = np.isin(labels_arr, usable_labels)
    examples = np.array(examples)[mask]
    labels = labels_arr[mask]

    X = np.array(self._ngrams_in_sentences(examples, list_of_ngrams))
    intent_encoder = preprocessing.LabelEncoder()
    intent_encoder.fit(labels)
    y = intent_encoder.transform(labels)

    # NOTE(review): RandomizedLogisticRegression was deprecated in
    # scikit-learn 0.19 and removed in 0.21 - confirm the pinned version.
    clf = linear_model.RandomizedLogisticRegression(C=1)
    try:
        clf.fit(X, y)
    except ValueError as e:
        if "needs samples of at least 2 classes" in str(e):
            # We got unlucky during the random sampling and selected a
            # slice that only contains one class.
            return []
        raise

    scores = clf.scores_
    # sorted() is stable, so equally scored ngrams keep their input order.
    sort_idx = sorted(range(len(scores)), key=lambda i: -1 * scores[i])
    return np.array(list_of_ngrams)[sort_idx]
评论列表
文章目录