# evaluate_features.py -- Python source snippet

def cross_predict(feat, f_name, X=X, y=y):
    """Cross-validate one feature step with a linear SVM and print the results.

    ``feat`` is chained in front of ``LinearSVC(C=0.02)`` in a two-step
    ``Pipeline``. Predictions are produced with 5-fold stratified
    cross-validation via ``cross_val_predict`` and then scored against ``y``.

    Args:
        feat: Object used as the ``'feat'`` step of the pipeline (presumably
            a scikit-learn transformer/vectorizer -- confirm against callers).
        f_name: Label printed in the banner line above the results.
        X: Samples handed to ``cross_val_predict``. Defaults to the
            module-level ``X`` as bound when this function was defined.
        y: Target labels, used both to build the folds and to score the
            predictions. Defaults to the module-level ``y`` as bound when
            this function was defined.

    Returns:
        None. Elapsed time, confusion matrix, accuracy and the
        classification report are printed.
    """
    # One worker when os.name is 'nt' (Windows); otherwise -1, which joblib
    # interprets as "use all CPUs".
    n_jobs = 1 if os.name == 'nt' else -1

    # Cross-validation scheme.
    #
    # StratifiedShuffleSplit (a merge of StratifiedKFold and ShuffleSplit that
    # returns stratified randomized folds) is deliberately NOT used: its
    # random splits are not guaranteed to be disjoint, so passing it to
    # cross_val_predict raises
    #   ValueError: cross_val_predict only works for partitions
    # Rejected alternative, kept for reference:
    # cv = cross_validation.StratifiedShuffleSplit(y, test_size=0.2, random_state=42)
    #
    # StratifiedKFold is a KFold variation whose folds preserve the percentage
    # of samples for each class.
    folds = cross_validation.StratifiedKFold(y, n_folds=5, random_state=42)

    # Feature step followed by the classifier.
    # Alternative classifier, kept for reference: MultinomialNB(alpha=5)
    pipeline = Pipeline([
        ('feat', feat),
        ('clf', LinearSVC(C=0.02)),
    ])

    # Time only the cross-validated prediction itself.
    started = time()
    y_pred = cross_validation.cross_val_predict(
        pipeline, X=X, y=y, n_jobs=n_jobs, cv=folds)
    elapsed = time() - started

    banner = "=" * 20
    print(banner, f_name, banner)
    print("time cost: {}".format(elapsed))
    # print("y_predict: {}".format(y_pred))
    print()
    print('confusion matrix:\n', confusion_matrix(y, y_pred))
    print()
    print('\t\taccuracy: {}'.format(accuracy_score(y, y_pred)))
    print()
    print("\t\tclassification report")
    print("-" * 52)
    print(classification_report(y, y_pred))


# ??
# Baseline feature: tfidf