def train_model(split=.25):
    """Train a random forest classifier on the iris dataset.

    The iris dataset is split into a train and a test set, a
    RandomForestClassifier is fitted on the train set, and the fitted
    model is then used to predict the species of the test set.
    In addition the confusion matrix and the feature importances are
    calculated.

    Args:
        split (float): Fraction of observations in the test dataset.

    Returns:
        tuple: ``(clf, conf_matrix, f_importances, features)`` where

        * clf (RandomForestClassifier): Trained model.
        * conf_matrix (pandas.DataFrame): Confusion matrix with the
          actual species as rows and the predicted species as columns.
        * f_importances (list): ``(column name, importance)`` pairs, one
          per feature column of the training data.
        * features (pandas.Index): Feature names with every run of
          whitespace replaced by ``_`` and all remaining non-word
          characters removed.
    """
    iris = load_iris()
    all_data = pd.DataFrame(iris.data, columns=iris.feature_names)

    # The patterns are regular expressions, so say so explicitly: since
    # pandas 2.0 ``str.replace`` defaults to regex=False and would treat
    # them as literal text, leaving the names untouched. Raw strings
    # avoid invalid-escape warnings for ``\s`` and ``\W``.
    features = (all_data.columns
                .str.replace(r'\s+', '_', regex=True)
                .str.replace(r'\W+', '', regex=True))

    # Added after ``features`` is computed so the label column is not
    # part of the feature names.
    all_data['species'] = pd.Categorical.from_codes(iris.target,
                                                    iris.target_names)

    train, test = train_test_split(all_data, test_size=split)
    train_inputs = train.drop('species', axis=1)

    clf = RandomForestClassifier(n_jobs=1)
    clf.fit(train_inputs, train['species'])

    preds = clf.predict(test.drop('species', axis=1))
    conf_matrix = pd.crosstab(test['species'], preds,
                              rownames=['Actual Species'],
                              colnames=['Predicted Species'])

    # Pair each training column with the importance the model gave it.
    f_importances = list(zip(train_inputs.columns,
                             clf.feature_importances_))
    return clf, conf_matrix, f_importances, features
评论列表
文章目录