import numpy as np
from os.path import join
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from keras.utils import to_categorical

# CLASS_LABELS, dict_reverse and roc_auc are project-level helpers assumed to be in scope.


def random_forest(df_features, df_ground_truth, out_dir, n_splits=5):
    X = df_features.to_numpy()  # as_matrix() was removed from pandas; to_numpy() is the replacement
    y_true = np.argmax(df_ground_truth.to_numpy(), axis=1)  # one-hot ground truth -> integer class index

    print("~~ Class distribution ~~")
    for k, v in sorted(CLASS_LABELS.items(), key=lambda x: x[1]):
        print("{}: {:.2f}%".format(k.capitalize(), (len(y_true[y_true == v]) / float(len(y_true))) * 100))

    # Stratified shuffle-split cross-validation: each split holds out 20% of the
    # samples for testing while preserving the class distribution.
    sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=.2)
    auc_results = []
    for i, (train, test) in enumerate(sss.split(X, y_true)):
        X_train, y_train = X[train, :], y_true[train]
        rf = RandomForestClassifier(class_weight='balanced')
        rf.fit(X_train, y_train)

        X_test, y_test = X[test, :], y_true[test]
        y_pred_prob = rf.predict_proba(X_test)
        auc = roc_auc(y_pred_prob, to_categorical(y_test), dict_reverse(CLASS_LABELS),
                      join(out_dir, 'roc_auc_split_{}.svg'.format(i)))
        auc_results.append(auc['micro'])

    print("\n~~ Average AUC over {} splits ~~\n{}".format(n_splits, np.mean(auc_results)))
    # Alternative: evaluate on a single hold-out split instead of repeated shuffle splits
    # (kept for reference).
    # X_train, X_test, y_train, y_test = train_test_split(X, y_true, train_size=.7, test_size=.3, random_state=4)
    #
    # rf = RandomForestClassifier(class_weight='balanced')
    # rf.fit(X_train, y_train)
    # # joblib.dump(rf, join(out_dir, 'classifier.pkl'))
    #
    # y_pred = rf.predict(X_test)
    #
    # print(classification_report(y_test, y_pred))
    # print(confusion_matrix(y_test, y_pred))
    # print(accuracy_score(y_test, y_pred))
    #
    # y_pred_prob = rf.predict_proba(X_test)
    # roc_auc(y_pred_prob, to_categorical(y_test), CLASS_LABELS, join(out_dir, 'roc_auc.svg'))
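
# Usage sketch (illustrative, not from the original source): the CSV paths and
# column layout below are assumptions. df_features is expected to hold one
# numeric feature column per attribute, and df_ground_truth one one-hot column
# per class, which is why np.argmax(..., axis=1) above recovers the class index.
if __name__ == '__main__':
    import os
    import pandas as pd

    df_features = pd.read_csv('features.csv')           # hypothetical feature table
    df_ground_truth = pd.read_csv('labels_onehot.csv')  # hypothetical one-hot labels
    os.makedirs('results', exist_ok=True)                # output directory for the per-split ROC plots
    random_forest(df_features, df_ground_truth, out_dir='results', n_splits=5)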