def test_one_rf():
    """Train a TF-IDF + random forest pipeline and print its evaluation.

    Loads the training set from ``sentidata_train_raw.pkl``, fits a
    CountVectorizer -> TfidfTransformer -> RandomForestClassifier pipeline
    on it, and prints:

    * the forest's out-of-bag score,
    * the classification report and confusion matrix on the training set,
    * the accuracy and classification report on the test set loaded from
      ``sentidata_test_raw.pkl``.

    Takes no arguments and returns nothing; all results go to stdout.
    """
    # NOTE: every print below uses the single-argument ``print(x)`` form,
    # which produces the same output under the Python 2 print statement and
    # the Python 3 print function.
    Xtrain_raw, ytrain_raw = load_raw_data("sentidata_train_raw.pkl")
    print("training data loaded")
    print_label_frequency(ytrain_raw)

    # ------------------------------------------------------------------
    # Build the pipeline.
    # ------------------------------------------------------------------
    # The identity analyzer hands each sample to the vectorizer unchanged,
    # so each sample is presumably already a sequence of tokens -- TODO
    # confirm against load_raw_data.
    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=lambda x: x, max_features=3000)),
        ('tfidf', TfidfTransformer()),
        ('rf', RandomForestClassifier(n_estimators=500,
                                      max_depth=200,
                                      min_samples_split=10,
                                      oob_score=True,
                                      n_jobs=-1,
                                      verbose=1,
                                      class_weight='balanced')),
    ])

    # ------------------------------------------------------------------
    # Train.
    # ------------------------------------------------------------------
    pipeline.fit(Xtrain_raw, ytrain_raw)

    # ------------------------------------------------------------------
    # Out-of-bag estimate.
    # ------------------------------------------------------------------
    # Look the forest up by its step name rather than by position so this
    # keeps working if steps are added after it.
    rf = pipeline.named_steps['rf']
    # Previously this value was evaluated and discarded; report it.
    print("oob score: %s" % rf.oob_score_)

    # ------------------------------------------------------------------
    # Training error.
    # ------------------------------------------------------------------
    ytrain_predict = pipeline.predict(Xtrain_raw)
    print(classification_report(y_true=ytrain_raw, y_pred=ytrain_predict))
    print(confusion_matrix(y_true=ytrain_raw, y_pred=ytrain_predict))

    # ------------------------------------------------------------------
    # Testing error.
    # ------------------------------------------------------------------
    Xtest_raw, ytest_raw = load_raw_data("sentidata_test_raw.pkl")
    ytest_predict = pipeline.predict(Xtest_raw)
    # Previously the accuracy was computed and discarded; report it.
    test_accuracy = accuracy_score(y_true=ytest_raw, y_pred=ytest_predict)
    print("test accuracy: %s" % test_accuracy)
    print(classification_report(y_true=ytest_raw, y_pred=ytest_predict))
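# Comment list          (appears to be web-page navigation text, not code)
# Article contents      (appears to be web-page navigation text, not code)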