def make_predictions_random_forest(df, features, target, split=0.70):
    """Fit a random-forest pipeline on a random split of ``df`` and evaluate it.

    Each row is assigned to the training set independently when a uniform
    draw from [0, 1) is ``<= split``, so the realised training fraction only
    approximates ``split``.  The remaining rows form the test set.

    The pipeline imputes missing values with the column mean, keeps at most
    200 features via ``SelectKBest`` and fits a ``RandomForestClassifier``.

    Args:
        df: DataFrame holding the feature columns and the target column.
            It is left unmodified (the train/test mask is kept local).
        features: Sequence of column names used as model inputs.
        target: Name of the column to predict.
        split: Probability that a row lands in the training set.

    Returns:
        A ``(score, cm)`` tuple: the pipeline's ``score`` on the test rows
        and the confusion matrix of true vs. predicted test labels.
    """
    # Parenthesised single-argument form prints identically on Python 2 and 3.
    print("using %d features (%d columns) on %d rows and target %s. Split %f." % (
        len(features), len(df.columns), len(df), target, split))

    # Keep the mask local instead of writing an 'is_train' column into the
    # caller's frame: that side effect changed the column count between calls
    # and could leak into feature lists derived from df.columns.
    is_train = np.random.uniform(0, 1, len(df)) <= split
    train, test = df[is_train], df[~is_train]

    # SelectKBest rejects k larger than the number of input features on the
    # scikit-learn versions that provide Imputer, so cap it for small inputs.
    n_selected = min(200, len(features))

    clf = Pipeline([
        ("imputer", Imputer(strategy="mean", axis=0)),
        ("feature_selection", SelectKBest(k=n_selected)),
        ("forest", RandomForestClassifier(
            min_samples_leaf=1,
            min_samples_split=10,
            n_estimators=60,
            max_depth=None,
            criterion="gini")),
    ])
    clf.fit(train[features], train[target])

    score = clf.score(test[features], test[target])
    predicted = clf.predict(test[features])
    cm = confusion_matrix(test[target], predicted)
    return score, cm
# Utility function to report best scores
# (web-page residue) Comment list
# (web-page residue) Article table of contents