def train_model(clf, X, Y, name="NB ngram", plot=False):
# create it again for plotting
cv = ShuffleSplit(
n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)
train_errors = []
test_errors = []
scores = []
pr_scores = []
precisions, recalls, thresholds = [], [], []
clfs = [] # just to later get the median
for train, test in cv:
X_train, y_train = X[train], Y[train]
X_test, y_test = X[test], Y[test]
clf.fit(X_train, y_train)
clfs.append(clf)
train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)
train_errors.append(1 - train_score)
test_errors.append(1 - test_score)
scores.append(test_score)
proba = clf.predict_proba(X_test)
fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
precision, recall, pr_thresholds = precision_recall_curve(
y_test, proba[:, 1])
pr_scores.append(auc(recall, precision))
precisions.append(precision)
recalls.append(recall)
thresholds.append(pr_thresholds)
if plot:
scores_to_sort = pr_scores
median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2]
plot_pr(pr_scores[median], name, phase, precisions[median],
recalls[median], label=name)
log_false_positives(clfs[median], X_test, y_test, name)
summary = (np.mean(scores), np.std(scores),
np.mean(pr_scores), np.std(pr_scores))
print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary
return np.mean(train_errors), np.mean(test_errors)
04_sent.py 文件源码
python
阅读 24
收藏 0
点赞 0
评论 0
评论列表
文章目录