def train(self):
parameters = {'vect__ngram_range': [(1, 1), (1, 2), (1,3), (2,3)],
#'vect__binary': (True, False),
'clf__alpha': (1e-2, 1e-3, 1e-1, 1e-4, 1e-5),
'clf__loss': ('hinge', 'log'),
'clf__penalty': ('l2', 'l1', 'elasticnet')
# 'clf__nu': (0.5,0.6),
#'clf__kernel': ('rbf', 'linear', 'poly'),
# 'clf__tol': (1e-3, 1e-4, 1e-2, 1e-4)
#'clf__n_estimators': (10, 50, 100, 500),
#'clf__criterion': ('gini', 'entropy'),
#'clf__max_features': ("auto", "log2", 100,)
#'clf__alpha': (0, 1e-2, 1e-3, 1e-1, 1e-4, 1e-5),
#'clf__fit_prior': (False, True),
}
# gs_clf = GridSearchCV(self.text_clf, parameters, n_jobs=-1, scoring=self.posfmeasure)
# gs_clf = gs_clf.fit(self.features, self.labels)
# print gs_clf.best_params_
logging.info("Traning with {}/{} true pairs".format(str(sum(self.labels)), str(len(self.labels))))
try:
self.text_clf = self.text_clf.fit(self.features, self.labels)
except ValueError:
print "error training {}".format(self.modelname)
return
if not os.path.exists(self.basedir + self.modelname):
os.makedirs(self.basedir + self.modelname)
logging.info("Training complete, saving to {}/{}/{}.pkl".format(self.basedir, self.modelname, self.modelname))
joblib.dump(self.text_clf, "{}/{}/{}.pkl".format(self.basedir, self.modelname, self.modelname))
ch2 = SelectKBest(chi2, k=20)
half_point = int(len(self.features)*0.5)
X_train = self.text_clf.named_steps["vect"].fit_transform(self.features[:half_point])
X_test = self.text_clf.named_steps["vect"].transform(self.features[half_point:])
X_train = ch2.fit_transform(X_train, self.labels[:half_point])
X_test = ch2.transform(X_test)
feature_names = self.text_clf.named_steps["vect"].get_feature_names()
feature_names = [feature_names[i] for i
in ch2.get_support(indices=True)]
print feature_names
# joblib.dump(gs_clf.best_estimator_, "{}/{}/{}.pkl".format(self.basedir, self.modelname, self.modelname))
# self.test()
# (removed stray page-scrape residue that was not part of the source)