import sklearn.ensemble
import sklearn.linear_model
import sklearn.neighbors
import sklearn.svm


def get_model_class(method):
    """
    Returns the class associated with a method string.

    :param method: A string describing the method to use.
    :return: A class corresponding to the method.
    """
    if method == 'logistic':
        return sklearn.linear_model.LogisticRegression
    elif method == 'svm':
        return sklearn.svm.SVC
    elif method == 'mirowski-svm':
        return sklearn.svm.SVC
    elif method == 'sgd':
        return sklearn.linear_model.SGDClassifier
    elif method == 'random-forest':
        return sklearn.ensemble.RandomForestClassifier
    elif method == 'nearest-centroid':
        return sklearn.neighbors.NearestCentroid
    elif method == 'knn':
        return sklearn.neighbors.KNeighborsClassifier
    elif method == 'bagging':
        return sklearn.ensemble.BaggingClassifier
    else:
        raise NotImplementedError("Method {} is not supported".format(method))
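A short usage sketch (the data and hyperparameters below are illustrative, not from the original source): the factory returns a class object, which the caller instantiates and fits like any scikit-learn estimator.

from sklearn.datasets import make_classification

# Illustrative only: synthetic data and an arbitrary method string
X, y = make_classification(n_samples=200, n_features=10, random_state=0)
ModelClass = get_model_class('random-forest')
clf = ModelClass(n_estimators=50)  # hyperparameters are left to the caller
clf.fit(X, y)
print(clf.score(X, y))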
import numpy
from sklearn import ensemble
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import cross_val_predict


def predictKFoldRandomForest(X, y, estimators=10, criterion="gini", maxdepth=None, selectKBest=0, kfold=10):
    """
    Classifies the data using a random forest and k-fold CV

    :param X: The matrix of feature vectors
    :type X: list
    :param y: The vector containing labels corresponding to the feature vectors
    :type y: list
    :param estimators: The number of random trees to use in classification
    :type estimators: int
    :param criterion: The splitting criterion employed by the decision trees
    :type criterion: str
    :param maxdepth: The maximum depth each tree is allowed to grow
    :type maxdepth: int
    :param selectKBest: The number of best features to select
    :type selectKBest: int
    :param kfold: The number of folds to use in k-fold CV
    :type kfold: int
    :return: A list of predicted labels across the k folds
    """
    try:
        # Prepare data
        X, y = numpy.array(X), numpy.array(y)
        # Define classifier
        clf = ensemble.RandomForestClassifier(n_estimators=estimators, criterion=criterion, max_depth=maxdepth)
        # Select the K best features if requested
        X_new = SelectKBest(chi2, k=selectKBest).fit_transform(X, y) if selectKBest > 0 else X
        # Gather out-of-fold predictions via k-fold cross validation
        predicted = cross_val_predict(clf, X_new, y, cv=kfold).tolist()
    except Exception as e:
        # prettyPrintError is a project-specific logging helper
        prettyPrintError(e)
        return []
    return predicted
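A minimal usage sketch on synthetic data (illustrative only, and assuming the project's prettyPrintError helper is importable; chi2 feature selection requires non-negative values, hence the abs):

from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score

# Illustrative synthetic data, not from the original project
X, y = make_classification(n_samples=300, n_features=20, random_state=7)
preds = predictKFoldRandomForest(abs(X).tolist(), y.tolist(), estimators=50, selectKBest=10, kfold=5)
if preds:
    print("Out-of-fold accuracy:", accuracy_score(y, preds))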
def model_diagnostics(mod, X, y, type='single'):
    # 'ensemble' models in this codebase take the labels as a second
    # argument to predict; plain models take only X
    if type == 'ensemble':
        g = mod.predict(X, y)
    else:
        g = mod.predict(X)
    return diagnostics(g, y)
#functions for accuracy statistics
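The diagnostics helper itself is not shown in this snippet; a minimal stand-in (the name, signature, and return format here are assumptions, not the original implementation) might compute a few common classification statistics:

from sklearn.metrics import accuracy_score, precision_score, recall_score

def diagnostics(predicted, actual):
    # Hypothetical stand-in for the project's accuracy-statistics helper
    return {
        'accuracy': accuracy_score(actual, predicted),
        'precision': precision_score(actual, predicted, average='weighted'),
        'recall': recall_score(actual, predicted, average='weighted'),
    }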
Source: advanced_supvervised_model_trainer.py (project: healthcareai-py, author: HealthCatalyst)
def ensemble_regression(self, scoring_metric='neg_mean_squared_error', model_by_name=None):
    # TODO stub
    self.validate_regression('Ensemble Regression')
    raise HealthcareAIError('We apologize. An ensemble linear regression has not yet been implemented.')
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split


def train_model(target, features, model_params, test_split):
    """
    Train the Gradient Boosting model on the provided data and calculate the accuracy of the model

    Input:
    @param target: 1D array of the variable that the model is to be trained to predict
    @param features: 2D array (n_samples x n_features) to use in training the model
    @param model_params: A dictionary of model parameters; the full specification can be found on the
        scikit-learn page for [GradientBoostingRegressor](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html)
    @param test_split: The fraction of the data to be withheld for testing the model / calculating the accuracy
    """
    features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=test_split)
    model = GradientBoostingRegressor(**model_params)
    model.fit(features_train, target_train)
    # Evaluate accuracy on the held-out test split
    accuracy = calculate_model_accuracy(model, features_test, target_test)
    return model, accuracy
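calculate_model_accuracy is a project helper that is not shown here; a plausible minimal stand-in (an assumption, using the regressor's built-in R^2 score), followed by an illustrative run on synthetic data:

from sklearn.datasets import make_regression

def calculate_model_accuracy(model, features, target):
    # Hypothetical stand-in: GradientBoostingRegressor.score returns R^2
    return model.score(features, target)

# Illustrative synthetic data, not from the original project
features, target = make_regression(n_samples=500, n_features=8, noise=0.1, random_state=0)
model, accuracy = train_model(target, features, {'n_estimators': 100, 'max_depth': 3}, test_split=0.2)
print("Held-out R^2:", accuracy)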
import numpy
from sklearn import ensemble, neighbors, svm
from sklearn.ensemble import VotingClassifier
from sklearn.feature_selection import SelectKBest, chi2


def predictAndTestEnsemble(X, y, Xtest, ytest, classifiers=[], selectKBest=0):
    """
    Trains an ensemble of classifiers (with default params) on a training dataset,
    and returns the majority-vote predictions for the same training dataset and for
    an out-of-sample test dataset

    :param X: The matrix of training feature vectors
    :type X: list
    :param y: The labels corresponding to the training feature vectors
    :type y: list
    :param Xtest: The matrix of test feature vectors
    :type Xtest: list
    :param ytest: The labels corresponding to the test feature vectors
    :type ytest: list
    :param classifiers: A list of classifiers to use in the ensemble
    :type classifiers: list of str
    :param selectKBest: The number of best features to select
    :type selectKBest: int
    :return: Two lists of predicted labels: one for the training (validation) data and one for the test data
    """
    try:
        predicted, predicted_test = [], []
        # Prepare the data
        X, y, Xtest, ytest = numpy.array(X), numpy.array(y), numpy.array(Xtest), numpy.array(ytest)
        # Define classifiers
        ensembleClassifiers = []
        for c in classifiers:
            if c.lower().find("knn") != -1:
                K = int(c.split('-')[-1])
                clf = neighbors.KNeighborsClassifier(n_neighbors=K)
            elif c.lower().find("svm") != -1:
                clf = svm.SVC(kernel='linear', C=1)
            elif c.lower().find("forest") != -1:
                E = int(c.split('-')[-1])
                clf = ensemble.RandomForestClassifier(n_estimators=E)
            else:
                raise ValueError("Unknown classifier: {}".format(c))
            # Add to list
            ensembleClassifiers.append((c, clf))
        # Select K best features if applicable; fit the selector on the training
        # data only and reuse it for the test data to avoid leaking test labels
        if selectKBest > 0:
            selector = SelectKBest(chi2, k=selectKBest).fit(X, y)
            X_new, Xtest_new = selector.transform(X), selector.transform(Xtest)
        else:
            X_new, Xtest_new = X, Xtest
        # Train and fit the voting classifier
        voting = VotingClassifier(estimators=ensembleClassifiers, voting='hard')
        prettyPrint("Fitting ensemble model")
        voting = voting.fit(X_new, y)
        prettyPrint("Validating model")
        predicted = voting.predict(X_new)
        # Same for the test dataset
        prettyPrint("Testing the model")
        predicted_test = voting.predict(Xtest_new)
    except Exception as e:
        prettyPrintError(e)
        return [], []
    return predicted, predicted_test
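As the c.split('-')[-1] parsing implies, each classifier string carries a hyperparameter after the final dash, e.g. 'knn-5' for 5 neighbors or 'forest-100' for 100 trees. An illustrative run on synthetic data (assuming the project's prettyPrint/prettyPrintError helpers are available):

from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Illustrative only: synthetic data and an arbitrary ensemble specification
X_all, y_all = make_classification(n_samples=400, n_features=20, random_state=1)
X_all = abs(X_all)  # chi2 feature selection requires non-negative features
Xtr, Xte, ytr, yte = train_test_split(X_all, y_all, test_size=0.25, random_state=1)
pred_tr, pred_te = predictAndTestEnsemble(Xtr, ytr, Xte, yte,
                                          classifiers=["knn-5", "svm", "forest-100"],
                                          selectKBest=10)
print("Train acc:", accuracy_score(ytr, pred_tr), "test acc:", accuracy_score(yte, pred_te))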
def predictAndTestRandomForest(X, y, Xtest, ytest, estimators=10, criterion="gini", maxdepth=None, selectKBest=0):
    """
    Trains a random forest using the training data and tests it using the test data

    :param X: The matrix of training feature vectors
    :type X: list
    :param y: The labels corresponding to the training feature vectors
    :type y: list
    :param Xtest: The matrix of test feature vectors
    :type Xtest: list
    :param ytest: The labels corresponding to the test feature vectors
    :type ytest: list
    :param estimators: The number of random trees to use in classification
    :type estimators: int
    :param criterion: The splitting criterion employed by the decision trees
    :type criterion: str
    :param maxdepth: The maximum depth each tree is allowed to grow
    :type maxdepth: int
    :param selectKBest: The number of best features to select
    :type selectKBest: int
    :return: Two lists of predicted labels: one for the training (validation) data and one for the test data
    """
    try:
        predicted, predicted_test = [], []
        # Define the classifier
        clf = ensemble.RandomForestClassifier(n_estimators=estimators, criterion=criterion, max_depth=maxdepth)
        # Prepare the data
        X, y, Xtest, ytest = numpy.array(X), numpy.array(y), numpy.array(Xtest), numpy.array(ytest)
        # Select K best features if enabled; fit the selector on the training
        # data only and reuse it for the test data to avoid leaking test labels
        if selectKBest > 0:
            prettyPrint("Selecting %s best features from feature vectors" % selectKBest)
            selector = SelectKBest(chi2, k=selectKBest).fit(X, y)
            X_new, Xtest_new = selector.transform(X), selector.transform(Xtest)
        else:
            X_new, Xtest_new = X, Xtest
        # Fit model
        prettyPrint("Fitting model")
        clf.fit(X_new, y)
        # Validate and test model
        prettyPrint("Validating model using training data")
        predicted = clf.predict(X_new)
        prettyPrint("Testing model")
        predicted_test = clf.predict(Xtest_new)
    except Exception as e:
        prettyPrintError(e)
        return [], []
    return predicted, predicted_test
Source: advanced_supvervised_model_trainer.py (project: healthcareai-py, author: HealthCatalyst)
def ensemble_classification(self, scoring_metric='roc_auc', trained_model_by_name=None):
    """
    This provides a simple way to put data in and have healthcare.ai train a few models and pick the best one for
    your data.

    Args:
        scoring_metric (str): The metric used to rank the models. Defaults to 'roc_auc'
        trained_model_by_name (dict): A dictionary of trained models to compare for a custom ensemble

    Returns:
        TrainedSupervisedModel: The best TrainedSupervisedModel found.
    """
    self.validate_classification('Ensemble Classification')
    self.validate_score_metric_for_number_of_classes(scoring_metric)
    score_by_name = {}

    # Here is the default list of algorithms to try for the ensemble.
    # Adding an ensemble method is as easy as adding a new key:value pair to the `trained_model_by_name` dictionary
    if trained_model_by_name is None:
        # TODO because these now all return TSMs it will be additionally slow by all the factor models.
        # TODO Could these be trained separately then after the best is found, train the factor model and add to TSM?
        trained_model_by_name = {
            'KNN': self.knn(randomized_search=True, scoring_metric=scoring_metric),
            'Logistic Regression': self.logistic_regression(randomized_search=True),
            'Random Forest Classifier': self.random_forest_classifier(
                trees=200,
                randomized_search=True,
                scoring_metric=scoring_metric)}

    for name, model in trained_model_by_name.items():
        # Unroll estimator from trained supervised model
        estimator = hcai_tsm.get_estimator_from_trained_supervised_model(model)

        # Get the score objects for the estimator
        score = self.metrics(estimator)
        self._console_log('{} algorithm: score = {}'.format(name, score))

        # TODO this may need to ferret out each classification score separately
        score_by_name[name] = score[scoring_metric]

    sorted_names_and_scores = sorted(score_by_name.items(), key=lambda x: x[1])
    best_algorithm_name, best_score = sorted_names_and_scores[-1]
    best_model = trained_model_by_name[best_algorithm_name]

    self._console_log('Based on the scoring metric {}, the best algorithm found is: {}'.format(scoring_metric,
                                                                                               best_algorithm_name))
    self._console_log('{} {} = {}'.format(best_algorithm_name, scoring_metric, best_score))
    return best_model
def rand_forest_train(self):
    import pandas as pd

    # Load the labeled user data
    users = pd.read_csv('names.csv')

    # Use the similarity, platform, reputation, and entropy columns as features
    X = users[['similarity', 'platform', 'reputation', 'entropy']]
    y = users['human_or_machine']

    # Split the data, withholding 25% as a test set
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)

    # Vectorize the per-row feature dictionaries
    from sklearn.feature_extraction import DictVectorizer
    vec = DictVectorizer(sparse=False)
    X_train = vec.fit_transform(X_train.to_dict(orient='records'))
    X_test = vec.transform(X_test.to_dict(orient='records'))

    # Train a single decision tree and predict on the test set
    from sklearn.tree import DecisionTreeClassifier
    dtc = DecisionTreeClassifier()
    dtc.fit(X_train, y_train)
    dtc_y_pred = dtc.predict(X_test)

    # Train a random forest and predict on the test set
    from sklearn.ensemble import RandomForestClassifier
    rfc = RandomForestClassifier()
    rfc.fit(X_train, y_train)
    rfc_y_pred = rfc.predict(X_test)

    # Train a gradient boosting classifier and predict on the test set
    from sklearn.ensemble import GradientBoostingClassifier
    gbc = GradientBoostingClassifier()
    gbc.fit(X_train, y_train)
    gbc_y_pred = gbc.predict(X_test)

    from sklearn.metrics import classification_report

    # Report each classifier's accuracy, precision, recall, and F1 score on the test set
    print("Decision tree accuracy:", dtc.score(X_test, y_test))
    print(classification_report(dtc_y_pred, y_test))

    print("Random forest accuracy:", rfc.score(X_test, y_test))
    print(classification_report(rfc_y_pred, y_test))

    print("Gradient boosting accuracy:", gbc.score(X_test, y_test))
    print(classification_report(gbc_y_pred, y_test))

    # Predict labels for a new, unlabeled dataset with the random forest
    users = pd.read_csv('values.csv')
    X = users[['similarity', 'platform', 'reputation', 'entropy']]
    X = vec.transform(X.to_dict(orient='records'))
    print(rfc.predict(X))

    self.dtc = dtc
    self.rfc = rfc
    self.gbc = gbc
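For reference, DictVectorizer maps a list of feature dicts to a numeric matrix: numeric values pass through unchanged and string values are one-hot encoded. A tiny illustrative example (the rows below are made up, not from the original project):

from sklearn.feature_extraction import DictVectorizer

vec = DictVectorizer(sparse=False)
rows = [{'similarity': 0.9, 'platform': 1, 'reputation': 120, 'entropy': 0.4},
        {'similarity': 0.2, 'platform': 0, 'reputation': 5, 'entropy': 0.9}]
print(vec.fit_transform(rows))      # 2 x 4 numeric matrix
print(vec.get_feature_names_out())  # column order of the matrix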