# Module-level imports assumed by this method (in older scikit-learn versions,
# joblib is available as sklearn.externals.joblib instead of the standalone package):
import os
import logging

import joblib
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline


def train_sentence_classifier(self, pairtype):
    # Character n-gram bag-of-words + Multinomial Naive Bayes; the commented-out
    # entries are alternative vectorizers/classifiers kept for reference.
    self.text_clf = Pipeline([
        ('vect', CountVectorizer(analyzer='char_wb', ngram_range=(7, 20), min_df=0.2, max_df=0.5)),
        #('vect', CountVectorizer(analyzer='word', ngram_range=(1,5), stop_words="english", min_df=0.1)),
        #('tfidf', TfidfTransformer(use_idf=True, norm="l2")),
        #('tfidf', TfidfVectorizer(analyzer='char_wb', ngram_range=(6,20))),
        #('clf', SGDClassifier(loss='hinge', penalty='l1', alpha=0.01, n_iter=5, random_state=42)),
        #('clf', SGDClassifier())
        #('clf', svm.SVC(kernel='rbf', C=10, verbose=True, tol=1e-5))
        #('clf', RandomForestClassifier(n_estimators=10))
        #('feature_selection', feature_selection.SelectFromModel(LinearSVC(penalty="l1"))),
        ('clf', MultinomialNB(alpha=0.1, fit_prior=False))
        #('clf', DummyClassifier(strategy="constant", constant=True))
    ])
    # Features, labels and sentence ids for this pair type; the first half is used
    # for training and the second half is held out for evaluation.
    f, labels, sids = self.get_features(pairtype)
    half_point = int(len(f) * 0.5)
    self.train_sentences = sids[:half_point]
"""ch2 = SelectKBest(chi2, k=20)
X_train = text_clf.named_steps["vect"].fit_transform(f[:half_point])
X_test = text_clf.named_steps["vect"].transform(f[half_point:])
X_train = ch2.fit_transform(X_train, labels[:half_point])
X_test = ch2.transform(X_test)
feature_names = text_clf.named_steps["vect"].get_feature_names()
feature_names = [feature_names[i] for i
in ch2.get_support(indices=True)]
# print feature_names"""
    # train
    text_clf = self.text_clf.fit(f[:half_point], labels[:half_point])
    # save model
    model_dir = "models/kernel_models/" + pairtype + "_sentence_classifier/"
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    model_path = "{}{}.pkl".format(model_dir, pairtype)
    logging.info("Training complete, saving to {}".format(model_path))
    joblib.dump(text_clf, model_path)
    # evaluate on the held-out second half
    pred = text_clf.predict(f[half_point:])
    # print len(pred), sum(pred)
    # keep the sentence ids predicted as positive for this pair type
    self.type_sentences[pairtype] = []
    for ip, p in enumerate(pred):
        if p:
            self.type_sentences[pairtype].append(sids[half_point + ip])
    res = metrics.confusion_matrix(labels[half_point:], pred)
    # return true positives, false positives and false negatives
    return res[1][1], res[0][1], res[1][0]
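
# Illustrative helper, not part of the original code: derive precision/recall/F1 from
# the (tp, fp, fn) counts that train_sentence_classifier returns for the held-out half.
def prf_from_counts(tp, fp, fn):
    precision = tp / float(tp + fp) if tp + fp else 0.0
    recall = tp / float(tp + fn) if tp + fn else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1

# Hypothetical call site, assuming `classifier` is the object owning
# train_sentence_classifier and that a "ddi" pair type exists:
# tp, fp, fn = classifier.train_sentence_classifier("ddi")
# logging.info("precision/recall/F1: %s", prf_from_counts(tp, fp, fn))
#
# The pickled pipeline can later be restored with joblib and applied to new features:
# clf = joblib.load("models/kernel_models/ddi_sentence_classifier/ddi.pkl")
# new_pred = clf.predict(new_sentence_features)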