def build_model_random_forest(df, features, categorical_features, target, split=0.70):
    """Train and evaluate a random-forest pipeline on a random split of ``df``.

    Each row is assigned to the training set when a uniform random draw
    between 0 and 1 is ``<= split``; the remaining rows form the test set.
    The pipeline mean-imputes missing values, keeps the 5 best-scoring
    features with ``SelectKBest`` and fits a ``RandomForestClassifier``.
    The pipeline's score on the test set and the confusion matrix are
    printed to stdout.

    Side effect: a boolean ``is_train`` column is written to ``df`` in place.

    Args:
        df: pandas DataFrame containing the feature and target columns.
        features: column labels used as model inputs.
        categorical_features: currently unused. One-hot encoding of these
            columns was disabled in the original code; the parameter is kept
            so existing callers keep working.
        target: column label of the value to predict.
        split: approximate fraction of rows used for training.

    Returns:
        The fitted ``Pipeline``.

    Raises:
        ValueError: if the random split leaves the training set or the test
            set without any rows.
    """
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("using %d features (%d columns) on %d rows and target %s. Split %f." % (
        len(features), len(df.columns), len(df), target, split))

    is_train = np.random.uniform(0, 1, len(df)) <= split
    # The column is still stored on the caller's frame: code outside this
    # function may rely on it to recover which rows were used for training.
    df['is_train'] = is_train
    train, test = df[is_train], df[~is_train]

    # Fail early with a clear message instead of an opaque error from deep
    # inside the estimator when one side of the split has no rows.
    if train.empty or test.empty:
        raise ValueError(
            "split=%r produced %d training and %d test rows; both must be non-empty"
            % (split, len(train), len(test)))

    # NOTE(review): `Imputer` was replaced by `sklearn.impute.SimpleImputer`
    # in newer scikit-learn releases; left unchanged because the file's
    # imports are not visible from here.
    clf = Pipeline([
        ("imputer", Imputer(strategy="mean", axis=0)),
        ("feature_selection", SelectKBest(k=5)),
        ("forest", RandomForestClassifier()),
    ])
    clf.fit(train[features], train[target])

    score = clf.score(test[features], test[target])
    predicted = clf.predict(test[features])
    cm = confusion_matrix(test[target], predicted)
    print("Random Forest score: %f" % score)
    print("confusion_matrix : \n%s" % cm)
    return clf
评论列表
文章目录