def search_best_rf():
    """Randomized hyper-parameter search for a TF-IDF + RandomForest sentiment model.

    Loads pickled pre-tokenized training data, searches over vectorizer and
    random-forest hyper-parameters with a fixed train/validate split, persists
    the best pipeline to "pipeline_rf.pkl", and prints classification reports
    for both the training and test sets.

    Relies on module-level helpers defined elsewhere in this project:
    load_raw_data, print_label_frequency, do_nothing,
    make_train_validate_split, print_classification_report,
    and common.dump_predictor.
    """
    Xtrain_raw, ytrain_raw = load_raw_data("sentidata_train_raw.pkl")
    print("training data loaded")
    print_label_frequency(ytrain_raw)

    # ----- build the pipeline: tokens -> counts -> tf-idf -> random forest.
    # analyzer=do_nothing means the input documents are assumed to be
    # pre-tokenized lists, so CountVectorizer skips its own tokenization.
    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=do_nothing)),
        ('tfidf', TfidfTransformer()),
        ('rf', RandomForestClassifier(oob_score=True, verbose=1)),
    ])

    # ----- hyper-parameter distributions sampled by the randomized search
    parameters = {
        'vect__max_features': (2000, 3000, 4000),
        'rf__n_estimators': range(300, 1200, 100),
        'rf__criterion': ['gini', 'entropy'],
        'rf__max_depth': range(10, 100, 10),
        'rf__min_samples_split': range(10, 100, 10),
    }
    # Single fixed train/validate split (not k-fold): make_train_validate_split
    # assigns each sample to either the train fold (-1) or the validate fold.
    validate_split = PredefinedSplit(test_fold=make_train_validate_split(len(ytrain_raw)))
    scoring_method = "roc_auc"
    searchcv = RandomizedSearchCV(estimator=pipeline,
                                  param_distributions=parameters,
                                  n_iter=200,
                                  scoring=scoring_method,
                                  n_jobs=-1,
                                  verbose=1,
                                  cv=validate_split)

    # ----- run the search
    print("#################### search cv begins")
    searchcv.fit(Xtrain_raw, ytrain_raw)
    print("#################### search cv ends")
    print("best {}: {}".format(scoring_method, searchcv.best_score_))
    print("best parameters: {}".format(searchcv.best_params_))

    # ----- persist and inspect the best model
    bestpipeline = searchcv.best_estimator_
    common.dump_predictor("pipeline_rf.pkl", bestpipeline)
    # Look the forest up by step name rather than position — robust to
    # pipeline reordering, and oob_score_ exists because oob_score=True above.
    rf = bestpipeline.named_steps['rf']
    print("RF's OOB score: {}".format(rf.oob_score_))

    # ----- training error analysis
    ytrain_predict = bestpipeline.predict(Xtrain_raw)
    print_classification_report('Training Data', ytrain_raw, ytrain_predict)

    # ----- test error analysis
    Xtest_raw, ytest_raw = load_raw_data("sentidata_test_raw.pkl")
    ytest_predict = bestpipeline.predict(Xtest_raw)
    print_classification_report('Testing Data', ytest_raw, ytest_predict)
# (web-scrape residue, not part of the source: 评论列表 / 文章目录
#  — "comment list" / "article table of contents" page chrome)