def __grid_search_model(self, clf_factory, documents, labels, pos_label):
    """Exhaustively tune a vectorizer+classifier pipeline and return the best fit.

    Runs a grid search over vectorizer and classifier hyper-parameters,
    scoring with F1 for ``pos_label`` and weighting positive samples to
    compensate for class imbalance.

    Parameters
    ----------
    clf_factory : sklearn estimator/pipeline with steps named 'vect' and 'clf'
        (the ``vect__*`` / ``clf__*`` grid keys require those step names —
        presumably TfidfVectorizer + a Naive Bayes-style classifier with an
        ``alpha`` parameter; confirm against the caller).
    documents : array-like of text samples; must expose ``.size``.
    labels : pandas Series-like of class labels; must expose ``.values``.
    pos_label : the label value treated as the positive class.

    Returns
    -------
    The ``best_estimator_`` found, refit by GridSearchCV.

    Raises
    ------
    ZeroDivisionError if no sample carries ``pos_label`` (n_pos == 0).
    """
    # Boolean mask of the positive-class rows; reused for the sample weights.
    boolndarr = labels.values == pos_label
    n = documents.size
    n_pos = labels[boolndarr].size
    n_neg = n - n_pos
    param_grid = {
        'vect__binary'      : [False, True],
        'vect__min_df'      : [1, 2],
        'vect__ngram_range' : [(1, 1), (1, 2), (1, 3)],
        'vect__smooth_idf'  : [False, True],
        'vect__stop_words'  : [None, 'english'],
        'vect__sublinear_tf': [False, True],
        'vect__use_idf'     : [False, True],
        'clf__alpha'        : [0, 0.01, 0.05, 0.1, 0.5, 1]
    }
    k = 5
    # NOTE(review): ShuffleSplit(n, n_iter=...) is the pre-0.18 sklearn
    # signature (modern versions use n_splits and take no n) — this file
    # evidently targets that old version; confirm before upgrading sklearn.
    # BUGFIX: was `1 / k`, which is integer division (== 0) under Python 2,
    # the interpreter this sklearn API implies. Force float division.
    cv = ShuffleSplit(
        n,
        n_iter      = k,
        test_size   = 1.0 / k,
        random_state= 0
    )
    # Reweight positive samples so both classes contribute equally overall.
    # BUGFIX: was `n_neg / n_pos` — truncating division under Python 2.
    pos_weight = float(n_neg) / n_pos
    sample_weight = np.ones(n)
    sample_weight[boolndarr] *= pos_weight
    # NOTE(review): the fit_params= constructor argument was removed in
    # sklearn 0.24 (pass **fit_params to .fit() instead) — kept here for
    # the legacy version this code targets.
    fit_params = {'clf__sample_weight': sample_weight}
    f1_scorer = make_scorer(f1_score, pos_label=pos_label)
    grid_search = GridSearchCV(
        clf_factory,
        param_grid,
        cv         = cv,
        fit_params = fit_params,
        n_jobs     = -1,
        scoring    = f1_scorer
    )
    grid_search.fit(documents, labels)
    best_estimator = grid_search.best_estimator_
    best_score     = grid_search.best_score_
    best_params    = grid_search.best_params_
    print("Best F1 score: {0:04.3f}".format(best_score))
    print("Parameters: {0}".format(best_params))
    return best_estimator
# (scraped-page residue — "评论列表" [comment list] / "文章目录" [table of
# contents] — left over from the web page this code was copied from; not code)