def get_toy_config():
    config = {}
    ca_config = {}
    ca_config["random_state"] = 0
    ca_config["max_layers"] = 100
    ca_config["early_stopping_rounds"] = 3
    ca_config["n_classes"] = 10
    ca_config["estimators"] = []
    ca_config["estimators"].append(
        {"n_folds": 5, "type": "XGBClassifier", "n_estimators": 10, "max_depth": 5,
         "objective": "multi:softprob", "silent": True, "nthread": -1, "learning_rate": 0.1})
    ca_config["estimators"].append(
        {"n_folds": 5, "type": "RandomForestClassifier", "n_estimators": 10, "max_depth": None, "n_jobs": -1})
    ca_config["estimators"].append(
        {"n_folds": 5, "type": "ExtraTreesClassifier", "n_estimators": 10, "max_depth": None, "n_jobs": -1})
    ca_config["estimators"].append({"n_folds": 5, "type": "LogisticRegression"})
    config["cascade"] = ca_config
    return config
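This config layout matches what the gcForest reference implementation (https://github.com/kingfengji/gcforest) expects; a minimal usage sketch, assuming that package is installed and train/test arrays are available:

# Minimal usage sketch. GCForest and its fit_transform/predict API come from
# the gcforest package (https://github.com/kingfengji/gcforest); X_train,
# y_train and X_test are assumed to be available numpy arrays.
from gcforest.gcforest import GCForest

gc = GCForest(get_toy_config())     # the cascade config built above
gc.fit_transform(X_train, y_train)  # trains the cascade layer by layer
y_pred = gc.predict(X_test)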
Python RandomForestClassifier() example source code
Source file: ClassificationRandomForest.py (project: AirTicketPredicting, author: junlulocky)
def parameterChoosing(self):
    # Set the parameters by cross-validation
    tuned_parameters = [{'max_depth': range(20, 60),
                         'n_estimators': range(10, 40),
                         'max_features': ['sqrt', 'log2', None]
                         }]

    clf = GridSearchCV(RandomForestClassifier(n_estimators=30), tuned_parameters, cv=5, scoring='precision_weighted')
    clf.fit(self.X_train, self.y_train.ravel())

    print("Best parameters set found on development set:\n")
    print(clf.best_params_)

    print("Grid scores on development set:\n")
    # grid_scores_ was removed in scikit-learn 0.20; cv_results_ holds the same information.
    for mean, std, params in zip(clf.cv_results_['mean_test_score'],
                                 clf.cv_results_['std_test_score'],
                                 clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r\n" % (mean, std * 2, params))

    print("Detailed classification report:\n")
    y_true, y_pred = self.y_test, clf.predict(self.X_test)
    print(classification_report(y_true, y_pred))
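The grid above has 40 × 30 × 3 = 3600 candidates, so the exhaustive search is expensive; a cheaper sketch (a substitution, not the project's code) samples the same space with RandomizedSearchCV:

# Alternative sketch (not in the original project): sample 50 settings from
# the same parameter space instead of exhausting all 3600 grid points.
from sklearn.model_selection import RandomizedSearchCV

search = RandomizedSearchCV(RandomForestClassifier(), tuned_parameters[0],
                            n_iter=50, cv=5, scoring='precision_weighted',
                            random_state=0)
search.fit(X_train, y_train)  # same training data as self.X_train / self.y_train above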
def get_classifier(self):
    algo = self.algo
    if algo == "GBT":
        return GradientBoostingClassifier()
    elif algo == "RF":
        return RandomForestClassifier()
    elif algo == "ADB":
        return AdaBoostClassifier()
    elif algo == "DT":
        return DecisionTreeClassifier()
    elif algo == "NB":
        return BernoulliNB()
    elif algo == "SGD":
        return SGDClassifier()
    elif algo == "SVC":
        return LinearSVC()
    elif algo == "MLPC":
        return MLPClassifier(activation='logistic', batch_size='auto',
                             early_stopping=True, hidden_layer_sizes=(100,), learning_rate='adaptive',
                             learning_rate_init=0.1, max_iter=5000, random_state=1,
                             solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
                             warm_start=False)
    return 0
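The chain above can also be written as a lookup table; a hypothetical refactor (a sketch, not the original author's code):

# Hypothetical table-driven equivalent of get_classifier (sketch, not the source's API).
# MLPC is omitted here; it would be added with functools.partial to carry its kwargs.
CLASSIFIER_FACTORIES = {
    "GBT": GradientBoostingClassifier,
    "RF": RandomForestClassifier,
    "ADB": AdaBoostClassifier,
    "DT": DecisionTreeClassifier,
    "NB": BernoulliNB,
    "SGD": SGDClassifier,
    "SVC": LinearSVC,
}

def get_classifier_by_table(algo):
    factory = CLASSIFIER_FACTORIES.get(algo)
    return factory() if factory else 0  # mirrors the original's `return 0` fallback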
def performRFClass(X_train, y_train, X_test, y_test, fout, savemodel):
    """
    Random Forest Binary Classification
    """
    clf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    clf.fit(X_train, y_train)

    # if savemodel == True:
    #     fname_out = '{}-{}.pickle'.format(fout, datetime.now())
    #     with open(fname_out, 'wb') as f:
    #         cPickle.dump(clf, f, -1)

    accuracy = clf.score(X_test, y_test)

    return accuracy
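The commented-out saving code uses Python 2's cPickle; a Python 3 sketch of the same idea (pickle.HIGHEST_PROTOCOL matches the original's protocol -1):

# Python 3 sketch of the commented-out model saving above; fout and
# savemodel are the function's own arguments.
import pickle
from datetime import datetime

if savemodel:
    fname_out = '{}-{}.pickle'.format(fout, datetime.now())
    with open(fname_out, 'wb') as f:
        pickle.dump(clf, f, pickle.HIGHEST_PROTOCOL)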
def performRFClass(X_train, y_train, X_test, y_test, fout, savemodel):
    """
    Random Forest Binary Classification
    """
    clf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    clf.fit(X_train, y_train)

    # if savemodel == True:
    #     fname_out = '{}-{}.pickle'.format(fout, datetime.now())
    #     with open(fname_out, 'wb') as f:
    #         cPickle.dump(clf, f, -1)

    accuracy = clf.score(X_test, y_test)
    print("RF: ", accuracy)
def random_forest(self, sensors_set):
    features = list(self.dataset.get_sensors_set_features(sensors_set))
    print("RANDOM FOREST.....")
    print("CLASSIFICATION BASED ON THESE SENSORS: ", self.dataset.get_remained_sensors(sensors_set))
    print("NUMBER OF FEATURES: ", len(features))
    train_features, train_classes, test_features, test_classes = self.__get_sets_for_classification(
        self.dataset.get_train, self.dataset.get_test, features)

    classifier_forest = RandomForestClassifier(n_estimators=const.PAR_RF_ESTIMATOR)
    classifier_forest.fit(train_features, train_classes)
    test_prediction = classifier_forest.predict(test_features)
    acc = accuracy_score(test_classes, test_prediction)
    df_feature = pd.DataFrame(
        {'accuracy': acc, 'featureName': features, 'importance': classifier_forest.feature_importances_})
    df_feature = df_feature.sort_values(by='importance', ascending=False)

    print("ACCURACY : " + str(acc))
    print("END RANDOM FOREST")

    if not os.path.exists(const.DIR_RESULTS):
        os.makedirs(const.DIR_RESULTS)
    df_feature.to_csv(const.DIR_RESULTS + "/" + str(sensors_set) + const.FILE_RANDOM_FOREST_RESULTS, index=False)

# neural network algorithm: trains on the whole training set and tests on the whole test set
def setUpClass(self):
    """
    Set up the unit test by loading the dataset and training a model.
    """
    from sklearn.datasets import load_boston
    from sklearn.ensemble import RandomForestClassifier
    import numpy as np

    scikit_data = load_boston()
    scikit_model = RandomForestClassifier(random_state=1)
    t = scikit_data.target
    # Bin the continuous house-price target into histogram buckets so a
    # classifier can be trained on it.
    target = np.digitize(t, np.histogram(t)[1]) - 1
    scikit_model.fit(scikit_data.data, target)

    # Save the data and the model
    self.scikit_data = scikit_data
    self.target = target
    self.scikit_model = scikit_model
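load_boston was deprecated in scikit-learn 1.0 and removed in 1.2; on newer versions the same binning setup can be reproduced with another regression dataset, e.g.:

# Sketch for scikit-learn >= 1.2, where load_boston no longer exists:
# fetch_california_housing is a stand-in regression dataset, binned the same way.
from sklearn.datasets import fetch_california_housing
import numpy as np

scikit_data = fetch_california_housing()
t = scikit_data.target
target = np.digitize(t, np.histogram(t)[1]) - 1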
def test_random_forest_classifier(self):
    for dtype in self.number_data_type.keys():
        scikit_model = RandomForestClassifier(random_state=1)
        data = self.scikit_data['data'].astype(dtype)
        # Binarize the target: above-mean vs. below-mean.
        target = self.scikit_data['target'].astype(dtype) > self.scikit_data['target'].astype(dtype).mean()
        scikit_model, spec = self._sklearn_setup(scikit_model, dtype, data, target)
        test_data = data[0].reshape(1, -1)
        self._check_tree_model(spec, 'multiArrayType', 'int64Type', 2)
        coreml_model = create_model(spec)
        try:
            self.assertEqual(scikit_model.predict(test_data)[0],
                             bool(int(coreml_model.predict({'data': test_data})['target'])),
                             msg="{} != {} for Dtype: {}".format(
                                 scikit_model.predict(test_data)[0],
                                 bool(int(coreml_model.predict({'data': test_data})['target'])),
                                 dtype))
        except RuntimeError:
            print("{} not supported. ".format(dtype))
def __init__(self, outputs, inputs, k=None, hypers=None, params=None,
             distargs=None, rng=None):
    self.rng = gu.gen_rng() if rng is None else rng
    self.outputs = outputs
    self.inputs = inputs
    assert len(self.outputs) == 1
    assert len(self.inputs) >= 1
    assert self.outputs[0] not in self.inputs
    assert len(distargs['inputs']['stattypes']) == len(self.inputs)
    self.stattypes = distargs['inputs']['stattypes']
    # Number of output categories and input dimension.
    # XXX WHATTA HACK. BayesDB passes in top-level kwargs, not in distargs.
    self.k = k if k is not None else int(distargs['k'])
    self.p = len(distargs['inputs']['stattypes'])
    # Sufficient statistics.
    self.N = 0
    self.data = Data(x=OrderedDict(), Y=OrderedDict())
    self.counts = [0] * self.k
    # Outlier and random forest parameters.
    if params is None:
        params = {}
    self.alpha = params.get('alpha', .1)
    self.regressor = params.get('forest', None)
    if self.regressor is None:
        self.regressor = RandomForestClassifier(random_state=self.rng)
def rf_categorize(email):
    # get training corpus (assumes `utils` and a populated local MongoDB;
    # also needs TfidfVectorizer, RandomForestClassifier and math.sqrt imported)
    emails = []
    db = utils.get_local_db()
    for collection in db.collection_names():
        for record in db.get_collection(collection).find():
            emails.append([collection] + [record['Text']])

    # vectorize corpus
    labels = [row[0] for row in emails]
    data = [row[1] for row in emails]
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(data)
    X = X.toarray()

    # vectorize input
    email_vector = vectorizer.transform([email])

    # create random forest and return prediction
    forest = RandomForestClassifier(n_estimators=int(sqrt(len(X[0]))) + 1)
    forest.fit(X, labels)
    return forest.predict(email_vector)[0]
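A hypothetical call (the MongoDB collection names double as category labels):

label = rf_categorize("Your invoice for March is attached.")  # hypothetical input text

Note that every call re-reads the database and retrains both the vectorizer and the forest; repeated categorization would benefit from caching them.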
def get_classifier(method='logistic_regression'):
    if 'logistic_regression' == method:
        return LogisticRegression(C=1e3,
                                  tol=0.01,
                                  multi_class='ovr',
                                  solver='liblinear',
                                  n_jobs=-1,
                                  random_state=123)
    if 'random_forest' == method:
        return RandomForestClassifier(n_estimators=250,
                                      bootstrap=False,
                                      n_jobs=-1,
                                      random_state=123)
    if 'gradient_boosting' == method:
        return xgb.XGBClassifier(max_depth=10,
                                 subsample=0.7,
                                 n_estimators=500,
                                 min_child_weight=0.05,
                                 colsample_bytree=0.3,
                                 learning_rate=0.1)
def applyRandomForestClassifier(self, train, test):
    # init algorithm
    RFC = RandomForestClassifier()

    # training target
    y_train = train[["Survived"]]
    x_train = train[train.columns.difference(["PassengerId", "Survived"])]

    # fitting (ravel() flattens the single-column frame into the 1-D array sklearn expects)
    RFC.fit(x_train, y_train.values.ravel())
    result = RFC.predict(test[test.columns.difference(["PassengerId"])])

    self.writeMessage("current training score")
    print(RFC.score(x_train, y_train))

    test["Survived"] = result
    return test
def buildModel(dataset, method, parameters):
    """
    Build final model for predicting real testing data
    """
    features = dataset.columns[0:-1]

    if method == 'RNN':
        clf = performRNNlass(dataset[features], dataset['UpDown'])
        return clf
    elif method == 'RF':
        clf = RandomForestClassifier(n_estimators=1000, n_jobs=-1)
    elif method == 'KNN':
        clf = neighbors.KNeighborsClassifier()
    elif method == 'SVM':
        c = parameters[0]
        g = parameters[1]
        clf = SVC(C=c, gamma=g)
    elif method == 'ADA':
        clf = AdaBoostClassifier()

    return clf.fit(dataset[features], dataset['UpDown'])
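A hypothetical usage sketch: the frame's last column must be the binary label 'UpDown', and all preceding columns are treated as features.

model = buildModel(train_df, 'RF', parameters=None)    # train_df is an assumed DataFrame
preds = model.predict(test_df[test_df.columns[0:-1]])  # test_df assumed to share the feature columns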
Source file: Stock_Prediction_Model_Random_Forrest.py (project: StockRecommendSystem, author: doncat99)
def build_model(self, X_train, y_train):
    if self.paras.load:
        model = self.load_training_model(self.paras.window_len)
        if model is not None:
            return model

    print('build Random Forest model...')

    # `index` is assumed to be set by the surrounding class (not shown in this excerpt).
    # range of number of trees: 5*(1 -> 10) = 5, 10, ..., 50 trees
    t_min = self.paras.tree_min[index]
    t_max = self.paras.tree_max[index]

    # range of max of features: 1 -> 10 features
    f_min = self.paras.feature_min[index]
    f_max = self.paras.feature_max[index]

    # range of window: 1 -> 70 days
    w_min = self.paras.window_min
    w_max = self.paras.window_max

    w_opt, n_opt, m_opt = self.best_window(X_train, y_train, w_min, w_max, t_min, t_max, f_min, f_max)

    model = RandomForestClassifier(n_estimators=n_opt, max_features=m_opt, n_jobs=8, verbose=self.paras.verbose)
    return model
def __init__(self, info, verbose=True, debug_mode=False):
    self.label_num = info['label_num']
    self.target_num = info['target_num']
    self.task = info['task']
    self.metric = info['metric']
    self.postprocessor = None
    # self.postprocessor = MultiLabelEnsemble(LogisticRegression(), balance=True)  # To calibrate proba
    self.postprocessor = MultiLabelEnsemble(LogisticRegression(), balance=False)  # To calibrate proba
    if debug_mode >= 2:
        self.name = "RandomPredictor"
        self.model = RandomPredictor(self.target_num)
        self.predict_method = self.model.predict_proba
        return

    if info['task'] == 'regression':
        if info['is_sparse']:
            self.name = "BaggingRidgeRegressor"
            self.model = BaggingRegressor(base_estimator=Ridge(), n_estimators=1, verbose=verbose)  # unfortunately, no warm start...
        else:
            self.name = "GradientBoostingRegressor"
            self.model = GradientBoostingRegressor(n_estimators=1, max_depth=4, min_samples_split=14, verbose=verbose, warm_start=True)
        self.predict_method = self.model.predict  # Always predict probabilities
    else:
        if info['has_categorical']:  # Out of laziness, we do not convert categorical variables...
            self.name = "RandomForestClassifier"
            self.model = RandomForestClassifier(n_estimators=1, verbose=verbose)  # unfortunately, no warm start...
        elif info['is_sparse']:
            self.name = "BaggingNBClassifier"
            self.model = BaggingClassifier(base_estimator=BernoulliNB(), n_estimators=1, verbose=verbose)  # unfortunately, no warm start...
        else:
            self.name = "GradientBoostingClassifier"
            self.model = eval(self.name + "(n_estimators=1, verbose=" + str(verbose) + ", random_state=1, warm_start = True)")
        if info['task'] == 'multilabel.classification':
            self.model = MultiLabelEnsemble(self.model)
        self.predict_method = self.model.predict_proba
def try_params(n_iterations, params):
    # `trees_per_iteration`, `RF` (a RandomForestClassifier alias), `data` and
    # `train_and_eval_sklearn_classifier` are assumed to be defined at module level.
    n_estimators = int(round(n_iterations * trees_per_iteration))
    print("n_estimators:", n_estimators)
    pprint(params)

    clf = RF(n_estimators=n_estimators, verbose=0, n_jobs=-1, **params)
    return train_and_eval_sklearn_classifier(clf, data)
def run_predict_random_forest(X_train, Y_train, X_test, Y_test, n_estimators=30, max_features=500, show_mistakes=False):
    # Use the hyperparameters from the signature (the original hardcoded
    # n_estimators=10, max_features=20 and silently ignored the arguments).
    forest = RandomForestClassifier(n_estimators=n_estimators, max_features=max_features, max_depth=10)
    clf = SKClassifier(forest)
    forest_fit = clf.fit(X_train, Y_train)
    pred = forest_fit.predict(X_test)
    print('\n Random forest 0-1 error. \n Train: ', zero_one_score(Y_train, forest_fit.predict(X_train)),
          '\n Test: ', zero_one_score(Y_test, pred))
    met = clf.metrics(X_test, Y_test)
    if show_mistakes:
        mis = clf.show_mistakes(X_test, Y_test, 10)
    print('Metrics:', met)
    return clf