def rfFitScore(clf, dftrain, dftrain_y, dftest, dftest_y):
    """Fit a random forest classifier and report test-set metrics.

    Parameters
    ----------
    clf : RandomForestClassifier (or compatible sklearn estimator)
    dftrain : DataFrame of training features (X)
    dftrain_y : DataFrame holding the training labels in column 'Y'
    dftest : DataFrame of test features (X)
    dftest_y : DataFrame holding the test labels in column 'Y'

    Returns
    -------
    (test_score, feature_importances) normally, or
    (test_score, feature_importances, oob_score_) when the classifier
    was constructed with oob_score=True.
    """
    # fit() returns the estimator itself; separate name kept for readability
    clfit = clf.fit(dftrain, dftrain_y['Y'])
    imp = clfit.feature_importances_   # per-feature importance ndarray

    new_y = clfit.predict(dftest)      # predicted labels for the test set
    test_score = clfit.score(dftest, dftest_y['Y'])
    print("test score:", test_score)
    if clf.oob_score:                  # out-of-bag estimate, only if enabled
        print("oob score", clfit.oob_score_)

    # Cross-check the score by counting matches / mismatches directly.
    n_total = dftest_y.shape[0]
    n_right = sum(dftest_y['Y'] == new_y)
    n_wrong = sum(dftest_y['Y'] != new_y)
    print("predict True %.3f percent, %d out of %d" %
          ((100 * n_right / n_total), n_right, n_total))
    print("predict False %.3f percent, %d out of %d" %
          ((100 * n_wrong / n_total), n_wrong, n_total))

    # Confusion matrix: actual labels vs. predicted labels.
    ptab = pd.crosstab(dftest_y['Y'], new_y,
                       rownames=['actual'], colnames=['predicted'])
    print("cross table:\n", ptab)

    # accuracy : fraction of samples labeled correctly
    # precision: TP / (TP + FP)   (fixed: earlier comment wrongly said TP+TN)
    # recall   : TP / (TP + FN)
    precision, recall, fbeta, support = prfs(dftest_y['Y'], new_y)
    print("precision", precision, "\nrecall", recall,
          "\nfbeta", fbeta, "\nsupport", support)

    if clf.oob_score:
        return test_score, imp, clfit.oob_score_
    return test_score, imp