def __grid_search_model(self, clf_factory, documents, labels, pos_label):
    """Exhaustively tune a vectorizer+classifier pipeline and return the best fit.

    Runs a grid search over vectorizer and classifier hyper-parameters,
    scoring with F1 for ``pos_label`` and weighting positive samples to
    compensate for class imbalance.

    Parameters
    ----------
    clf_factory : sklearn estimator/pipeline with steps named 'vect' and 'clf'
        (the ``vect__*`` / ``clf__*`` grid keys require those step names —
        presumably TfidfVectorizer + a Naive Bayes-style classifier with an
        ``alpha`` parameter; confirm against the caller).
    documents : array-like of text samples; must expose ``.size``.
    labels : pandas Series-like of class labels; must expose ``.values``.
    pos_label : the label value treated as the positive class.

    Returns
    -------
    The ``best_estimator_`` found, refit by GridSearchCV.

    Raises
    ------
    ZeroDivisionError if no sample carries ``pos_label`` (n_pos == 0).
    """
    # Boolean mask of the positive-class rows; reused for the sample weights.
    boolndarr = labels.values == pos_label
    n = documents.size
    n_pos = labels[boolndarr].size
    n_neg = n - n_pos
    param_grid = {
        'vect__binary'      : [False, True],
        'vect__min_df'      : [1, 2],
        'vect__ngram_range' : [(1, 1), (1, 2), (1, 3)],
        'vect__smooth_idf'  : [False, True],
        'vect__stop_words'  : [None, 'english'],
        'vect__sublinear_tf': [False, True],
        'vect__use_idf'     : [False, True],
        'clf__alpha'        : [0, 0.01, 0.05, 0.1, 0.5, 1]
    }
    k = 5
    # NOTE(review): ShuffleSplit(n, n_iter=...) is the pre-0.18 sklearn
    # signature (modern versions use n_splits and take no n) — this file
    # evidently targets that old version; confirm before upgrading sklearn.
    # BUGFIX: was `1 / k`, which is integer division (== 0) under Python 2,
    # the interpreter this sklearn API implies. Force float division.
    cv = ShuffleSplit(
        n,
        n_iter      = k,
        test_size   = 1.0 / k,
        random_state= 0
    )
    # Reweight positive samples so both classes contribute equally overall.
    # BUGFIX: was `n_neg / n_pos` — truncating division under Python 2.
    pos_weight = float(n_neg) / n_pos
    sample_weight = np.ones(n)
    sample_weight[boolndarr] *= pos_weight
    # NOTE(review): the fit_params= constructor argument was removed in
    # sklearn 0.24 (pass **fit_params to .fit() instead) — kept here for
    # the legacy version this code targets.
    fit_params = {'clf__sample_weight': sample_weight}
    f1_scorer = make_scorer(f1_score, pos_label=pos_label)
    grid_search = GridSearchCV(
        clf_factory,
        param_grid,
        cv         = cv,
        fit_params = fit_params,
        n_jobs     = -1,
        scoring    = f1_scorer
    )
    grid_search.fit(documents, labels)
    best_estimator = grid_search.best_estimator_
    best_score     = grid_search.best_score_
    best_params    = grid_search.best_params_
    print("Best F1 score: {0:04.3f}".format(best_score))
    print("Parameters: {0}".format(best_params))
    return best_estimator
# (scraped-page residue — "评论列表" [comment list] / "文章目录" [table of
# contents] — left over from the web page this code was copied from; not code)