def search_best_rf():
    """Randomized hyper-parameter search for a TF-IDF + RandomForest sentiment model.

    Loads pickled pre-tokenized training data, searches over vectorizer and
    random-forest hyper-parameters with a fixed train/validate split, persists
    the best pipeline to "pipeline_rf.pkl", and prints classification reports
    for both the training and test sets.

    Relies on module-level helpers defined elsewhere in this project:
    load_raw_data, print_label_frequency, do_nothing,
    make_train_validate_split, print_classification_report,
    and common.dump_predictor.
    """
    Xtrain_raw, ytrain_raw = load_raw_data("sentidata_train_raw.pkl")
    print("training data loaded")
    print_label_frequency(ytrain_raw)

    # ----- build the pipeline: tokens -> counts -> tf-idf -> random forest.
    # analyzer=do_nothing means the input documents are assumed to be
    # pre-tokenized lists, so CountVectorizer skips its own tokenization.
    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=do_nothing)),
        ('tfidf', TfidfTransformer()),
        ('rf', RandomForestClassifier(oob_score=True, verbose=1)),
    ])

    # ----- hyper-parameter distributions sampled by the randomized search
    parameters = {
        'vect__max_features': (2000, 3000, 4000),
        'rf__n_estimators': range(300, 1200, 100),
        'rf__criterion': ['gini', 'entropy'],
        'rf__max_depth': range(10, 100, 10),
        'rf__min_samples_split': range(10, 100, 10),
    }
    # Single fixed train/validate split (not k-fold): make_train_validate_split
    # assigns each sample to either the train fold (-1) or the validate fold.
    validate_split = PredefinedSplit(test_fold=make_train_validate_split(len(ytrain_raw)))
    scoring_method = "roc_auc"
    searchcv = RandomizedSearchCV(estimator=pipeline,
                                  param_distributions=parameters,
                                  n_iter=200,
                                  scoring=scoring_method,
                                  n_jobs=-1,
                                  verbose=1,
                                  cv=validate_split)

    # ----- run the search
    print("#################### search cv begins")
    searchcv.fit(Xtrain_raw, ytrain_raw)
    print("#################### search cv ends")
    print("best {}: {}".format(scoring_method, searchcv.best_score_))
    print("best parameters: {}".format(searchcv.best_params_))

    # ----- persist and inspect the best model
    bestpipeline = searchcv.best_estimator_
    common.dump_predictor("pipeline_rf.pkl", bestpipeline)
    # Look the forest up by step name rather than position — robust to
    # pipeline reordering, and oob_score_ exists because oob_score=True above.
    rf = bestpipeline.named_steps['rf']
    print("RF's OOB score: {}".format(rf.oob_score_))

    # ----- training error analysis
    ytrain_predict = bestpipeline.predict(Xtrain_raw)
    print_classification_report('Training Data', ytrain_raw, ytrain_predict)

    # ----- test error analysis
    Xtest_raw, ytest_raw = load_raw_data("sentidata_test_raw.pkl")
    ytest_predict = bestpipeline.predict(Xtest_raw)
    print_classification_report('Testing Data', ytest_raw, ytest_predict)
# (web-scrape residue, not part of the source: 评论列表 / 文章目录
#  — "comment list" / "article table of contents" page chrome)