from sklearn import datasets
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

def trained_models():
    dataset = datasets.load_breast_cancer()
    X = dataset.data
    y = dataset.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=12345)
    rf = RandomForestClassifier()
    rf.fit(X_train, y_train)
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    svc_w_linear_kernel = SVC(kernel='linear')
    svc_w_linear_kernel.fit(X_train, y_train)
    svc_wo_linear_kernel = SVC()
    svc_wo_linear_kernel.fit(X_train, y_train)
    dummy = DummyClassifier()
    dummy.fit(X_train, y_train)
    return {'RF': rf, 'LR': lr, 'SVC_w_linear_kernel': svc_w_linear_kernel,
            'Dummy': dummy, 'SVC_wo_linear_kernel': svc_wo_linear_kernel}
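A quick usage sketch, not part of the original snippet: because the function fixes random_state=12345, the same split can be re-created outside it to score each returned model. The re-split below is an assumption made for illustration.

# Hypothetical usage: re-create the identical split and score every model.
from sklearn import datasets
from sklearn.model_selection import train_test_split

X, y = datasets.load_breast_cancer(return_X_y=True)
_, X_test, _, y_test = train_test_split(X, y, test_size=0.3, random_state=12345)
for name, model in trained_models().items():
    print(name, model.score(X_test, y_test))  # mean accuracy on the held-out split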
Example source code for the Python class LogisticRegression()
def get_feature_importance(self, clf, model_name):
    # Map each model name to the attribute that exposes its importances.
    clfs = {'RandomForestClassifier': 'feature_importances',
            'ExtraTreesClassifier': 'feature_importances',
            'AdaBoostClassifier': 'feature_importances',
            'LogisticRegression': 'coef',
            'svm.SVC': 'coef',
            'GradientBoostingClassifier': 'feature_importances',
            'GaussianNB': None,
            'DecisionTreeClassifier': 'feature_importances',
            'SGDClassifier': 'coef',
            'KNeighborsClassifier': None,
            'linear.SVC': 'coef'}
    if clfs[model_name] == 'feature_importances':
        return list(clf.feature_importances_)
    elif clfs[model_name] == 'coef':
        return clf.coef_.tolist()  # tolist() already returns a list; wrapping it in list() was redundant
    else:
        return None
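A short illustration of the shape difference this lookup papers over, assuming scikit-learn and the iris data: coef_ is 2-D (one weight row per class), so 'coef' models return a nested list, while feature_importances_ is 1-D and yields a flat list.

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

X, y = load_iris(return_X_y=True)
lr = LogisticRegression(max_iter=1000).fit(X, y)
rf = RandomForestClassifier().fit(X, y)
print(lr.coef_.shape)                 # (3, 4): one weight row per class
print(rf.feature_importances_.shape)  # (4,): one importance per feature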
def parameterChoosing(self):
    # Set the parameters by cross-validation
    tuned_parameters = [{'penalty': ['l1'],
                         'C': np.logspace(-5, 5)},
                        {'penalty': ['l2'],
                         'C': np.logspace(-5, 5)}]
    # liblinear (the old default solver) supports both penalties searched above.
    clf = GridSearchCV(linear_model.LogisticRegression(tol=1e-6, solver='liblinear'),
                       tuned_parameters, cv=5, scoring='precision_weighted')
    clf.fit(self.X_train, self.y_train.ravel())
    print("Best parameters set found on development set:\n")
    print(clf.best_params_)
    print("Grid scores on development set:\n")
    # grid_scores_ was removed in scikit-learn 0.20; cv_results_ is the replacement.
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r\n" % (mean, std * 2, params))
    print("Detailed classification report:\n")
    y_true, y_pred = self.y_test, clf.predict(self.X_test)
    print(classification_report(y_true, y_pred))
def define_model(self, model, parameters, n_cores=0):
    clfs = {'RandomForestClassifier': RandomForestClassifier(n_estimators=50, n_jobs=7),
            'ExtraTreesClassifier': ExtraTreesClassifier(n_estimators=10, n_jobs=7, criterion='entropy'),
            'AdaBoostClassifier': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                                                     algorithm="SAMME", n_estimators=200),
            'LogisticRegression': LogisticRegression(penalty='l1', C=1e5),
            'svm.SVC': svm.SVC(kernel='linear', probability=True, random_state=0),
            'GradientBoostingClassifier': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5,
                                                                     max_depth=6, n_estimators=10),
            'GaussianNB': GaussianNB(),
            'DecisionTreeClassifier': DecisionTreeClassifier(),
            'SGDClassifier': SGDClassifier(loss="hinge", penalty="l2", n_jobs=7),
            'KNeighborsClassifier': KNeighborsClassifier(n_neighbors=3),
            'linear.SVC': svm.LinearSVC()}
    if model not in clfs:
        raise ConfigError("Unsupported model {}".format(model))
    clf = clfs[model]
    # Override the defaults above with the caller-supplied parameters.
    clf.set_params(**parameters)
    return clf
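A hypothetical call, for illustration only; ConfigError and the containing class come from the original project, and `modeler` is an assumed instance of that class.

# Hypothetical usage: build a logistic regression with a custom C.
clf = modeler.define_model('LogisticRegression', {'C': 1.0})
clf.fit(X_train, y_train)  # X_train / y_train assumed to exist in the caller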
def test_homonym(H, sent, features, C=1.0):
    X_0 = features(matching(sent, H[0]))
    X_1 = features(matching(sent, H[1]))
    y_0 = numpy.zeros(len(X_0))
    y_1 = numpy.ones(len(X_1))
    X = normalize(numpy.vstack([X_0, X_1]), norm='l2')
    y = numpy.hstack([y_0, y_1])
    classifier = LogisticRegression(C=C)
    # StratifiedKFold(y, n_folds=10) is the pre-0.18 API; the current API
    # takes n_splits and yields indices from .split(X, y).
    fold = StratifiedKFold(n_splits=10)
    score = []
    count = []
    for tr, te in fold.split(X, y):
        X_tr, X_te = X[tr], X[te]
        y_tr, y_te = y[tr], y[te]
        classifier.fit(X_tr, y_tr)
        score.append(sum(classifier.predict(X_te) == y_te))
        count.append(len(y_te))
    score = numpy.array(score, dtype='float')
    count = numpy.array(count, dtype='float')
    result = {'word1_count': len(y_0),
              'word2_count': len(y_1),
              'majority': 1.0 * max(len(y_0), len(y_1)) / len(y),
              'kfold_acc': score / count}
    return result
resnet_regressor.py (project: Brain_Tumor_Segmentation, author: KarthikRevanuru)
def train_xgboost():
    df = pd.read_csv('survival_data.csv', index_col=0, encoding='UTF-7')
    p = np.array([np.mean(np.load('training/%s_flair.nii.gz.npy' % str(id)), axis=0) for id in folder_names_train])
    q = np.array([np.mean(np.load('training/%s_t1.nii.gz.npy' % str(id)), axis=0) for id in folder_names_train])
    r = np.array([np.mean(np.load('training/%s_t1ce.nii.gz.npy' % str(id)), axis=0) for id in folder_names_train])
    s = np.array([np.mean(np.load('training/%s_t2.nii.gz.npy' % str(id)), axis=0) for id in folder_names_train])
    y = np.array([])
    t = 0
    z = np.array([])
    for ind in range(len(folder_names_train)):
        try:
            # DataFrame.get_value was removed in pandas 1.0; .loc is the replacement.
            temp = df.loc[str(folder_names_train[ind]), 'Survival']
            y = np.append(y, temp)
            temp = df.loc[str(folder_names_train[ind]), 'Age']
            z = np.append(z, np.array([temp]))
        except Exception as e:
            t += 1
            print(t, str(e), "Label not found, deleting entry")
            y = np.append(y, 0)
    z = np.array([[v] for v in z])
    # Concatenate the four modality features, then append age as an extra column.
    t = np.concatenate((p, q), axis=1)
    u = np.concatenate((r, s), axis=1)
    x = np.concatenate((t, u), axis=1)
    x = np.concatenate((x, z), axis=1)
    # clf = linear_model.LogisticRegression(C=1e5)
    # clf = RandomForestRegressor()
    clf = xgb.XGBRegressor()
    clf.fit(x, y)
    return clf
def __init__(
        self, data_block, predictors=[], cv_folds=10,
        scoring_metric='accuracy', additional_display_metrics=[]):
    base_classification.__init__(
        self, alg=LogisticRegression(), data_block=data_block,
        predictors=predictors, cv_folds=cv_folds,
        scoring_metric=scoring_metric,
        additional_display_metrics=additional_display_metrics
    )
    self.model_output = pd.Series(self.default_parameters)
    self.model_output['Coefficients'] = "-"
    # Set parameters to default values:
    self.set_parameters(set_default=True)
def test_model_detection(self):
    sklearn_model = LogisticRegression()
    pipeline_model = Pipeline([('log', sklearn_model)])
    xgb_model = XGBClassifier()
    nn_model = NNModel(100, 10)
    sklearn_opt = Optimizer(sklearn_model, [], lambda x: x)
    pipeline_opt = Optimizer(pipeline_model, [], lambda x: x)
    xgb_opt = Optimizer(xgb_model, [], lambda x: x)
    nn_opt = Optimizer(nn_model, [], lambda x: x)
    self.assertEqual(sklearn_opt.model_module, 'sklearn')
    self.assertEqual(pipeline_opt.model_module, 'pipeline')
    self.assertEqual(xgb_opt.model_module, 'xgboost')
    self.assertEqual(nn_opt.model_module, 'keras')
sentiment.py (project: Twitter-and-IMDB-Sentimental-Analytics, author: abhinandanramesh)
def build_models_NLP(train_pos_vec, train_neg_vec):
    """
    Returns a BernoulliNB and a LogisticRegression model fit to the training data.
    """
    Y = ["pos"] * len(train_pos_vec) + ["neg"] * len(train_neg_vec)
    # Use sklearn's BernoulliNB and LogisticRegression to fit two models.
    # For BernoulliNB, use alpha=1.0 and binarize=None.
    # For LogisticRegression, pass no parameters.
    train_vec = []
    train_vec.extend(train_pos_vec)
    train_vec.extend(train_neg_vec)
    nb_model = BernoulliNB(alpha=1.0, binarize=None, class_prior=None, fit_prior=True)
    nb_model.fit(train_vec, Y)
    lr_model = LogisticRegression()
    lr_model.fit(train_vec, Y)
    return nb_model, lr_model
sentiment.py (project: Twitter-and-IMDB-Sentimental-Analytics, author: abhinandanramesh)
def build_models_DOC(train_pos_vec, train_neg_vec):
    """
    Returns a GaussianNB and a LogisticRegression model fit to the training data.
    """
    Y = ["pos"] * len(train_pos_vec) + ["neg"] * len(train_neg_vec)
    # Use sklearn's GaussianNB and LogisticRegression to fit two models.
    # For LogisticRegression, pass no parameters.
    train_vec = []
    train_vec.extend(train_pos_vec)
    train_vec.extend(train_neg_vec)
    nb_model = GaussianNB()
    nb_model.fit(train_vec, Y)
    lr_model = LogisticRegression()
    lr_model.fit(train_vec, Y)
    return nb_model, lr_model
def learns(tests, trains, indep=lambda x: x[:-1],
           dep=lambda x: x[-1],
           rf=Abcd(),
           lg=Abcd(),
           dt=Abcd(),
           nb=Abcd()):
    x1, y1, x2, y2 = trainTest(tests, trains, indep, dep)
    forest = RandomForestClassifier(n_estimators=50)
    forest = forest.fit(x1, y1)
    for n, got in enumerate(forest.predict(x2)):
        rf(predicted=got, actual=y2[n])
    logreg = linear_model.LogisticRegression(C=1e5)
    logreg.fit(x1, y1)
    for n, got in enumerate(logreg.predict(x2)):
        lg(predicted=got, actual=y2[n])
    bayes = GaussianNB()
    bayes.fit(x1, y1)
    for n, got in enumerate(bayes.predict(x2)):
        nb(predicted=got, actual=y2[n])
    dectree = DecisionTreeClassifier(criterion="entropy", random_state=1)
    dectree.fit(x1, y1)
    for n, got in enumerate(dectree.predict(x2)):
        dt(predicted=got, actual=y2[n])
def test_dsapp_lr(data):
    dsapp_lr = ScaledLogisticRegression()
    dsapp_lr.fit(data['X_train'], data['y_train'])
    minmax_scaler = preprocessing.MinMaxScaler()
    dsapp_cutoff = CutOff()
    lr = linear_model.LogisticRegression()
    pipeline = Pipeline([
        ('minmax_scaler', minmax_scaler),
        ('dsapp_cutoff', dsapp_cutoff),
        ('lr', lr)
    ])
    pipeline.fit(data['X_train'], data['y_train'])
    assert np.all(dsapp_lr.predict(data['X_test']) == pipeline.predict(data['X_test']))
def cv_reg_lr(trX, trY, vaX, vaY, Cs=[0.01, 0.05, 0.1, 0.5, 1., 5., 10., 50., 100.]):
    tr_accs = []
    va_accs = []
    models = []
    for C in Cs:
        model = LR(C=C)
        model.fit(trX, trY)
        tr_pred = model.predict(trX)
        va_pred = model.predict(vaX)
        tr_acc = metrics.accuracy_score(trY, tr_pred)
        va_acc = metrics.accuracy_score(vaY, va_pred)
        print('%.4f %.4f %.4f' % (C, tr_acc, va_acc))
        tr_accs.append(tr_acc)
        va_accs.append(va_acc)
        models.append(model)
    # Keep the model with the best validation accuracy.
    best = np.argmax(va_accs)
    print('best model C: %.4f tr_acc: %.4f va_acc: %.4f' % (Cs[best], tr_accs[best], va_accs[best]))
    return models[best]
logistic_regression.py (project: MultimodalAutoencoder, author: natashamjaques)
def train_and_predict(self, param_dict, predict_on='val'):
    """Initializes an LR classifier according to the desired parameter settings,
    trains it, and returns the predictions on the appropriate evaluation dataset.

    Args:
        param_dict: A dictionary with keys representing parameter names and
            values representing settings for those parameters.
        predict_on: The dataset used for evaluating the model. Can be set to
            'test' to get final results.

    Returns: The predicted Y labels.
    """
    if predict_on == 'test':
        predict_X = self.data_loader.test_X
    else:
        predict_X = self.data_loader.val_X
    self.model = linear_model.LogisticRegression(penalty=param_dict['penalty'],
                                                 C=param_dict['C'])
    self.model.fit(self.data_loader.train_X, self.data_loader.train_Y)
    preds = self.predict_on_data(predict_X)
    return preds
def test_build_param_grid_set_estimator():
    clf1 = SVC()
    clf2 = LogisticRegression()
    clf3 = SVC()
    clf4 = SGDClassifier()
    estimator = set_grid(Pipeline([('sel', set_grid(SelectKBest(), k=[2, 3])),
                                   ('clf', None)]),
                         clf=[set_grid(clf1, kernel=['linear']),
                              clf2,
                              set_grid(clf3, kernel=['poly'], degree=[2, 3]),
                              clf4])
    param_grid = [{'clf': [clf1], 'clf__kernel': ['linear'], 'sel__k': [2, 3]},
                  {'clf': [clf3], 'clf__kernel': ['poly'],
                   'clf__degree': [2, 3], 'sel__k': [2, 3]},
                  {'clf': [clf2, clf4], 'sel__k': [2, 3]}]
    assert build_param_grid(estimator) == param_grid
def test_make_grid_search():
    X, y = load_iris(return_X_y=True)
    lr = LogisticRegression()
    svc = set_grid(SVC(), kernel=['poly'], degree=[2, 3])
    gs1 = make_grid_search(lr, cv=5)   # empty grid
    gs2 = make_grid_search(svc, cv=5)
    gs3 = make_grid_search([lr, svc], cv=5)
    for gs, n_results in [(gs1, 1), (gs2, 2), (gs3, 3)]:
        gs.fit(X, y)
        assert gs.cv == 5
        assert len(gs.cv_results_['params']) == n_results
    svc_mask = gs3.cv_results_['param_root'] == svc
    assert svc_mask.sum() == 2
    assert gs3.cv_results_['param_root__degree'][svc_mask].tolist() == [2, 3]
    assert gs3.cv_results_['param_root'][~svc_mask].tolist() == [lr]
def convert(model, feature_names, target):
    """Convert a logistic regression model to the protobuf spec.

    Parameters
    ----------
    model: LogisticRegression
        A trained LogisticRegression model.
    feature_names: [str], optional (default=None)
        Name of the input columns.
    target: str, optional (default=None)
        Name of the output column.

    Returns
    -------
    model_spec: An object of type Model_pb.
        Protobuf representation of the model.
    """
    if not _HAS_SKLEARN:
        raise RuntimeError('scikit-learn not found. scikit-learn conversion API is disabled.')
    _sklearn_util.check_expected_type(model, LogisticRegression)
    _sklearn_util.check_fitted(model, lambda m: hasattr(m, 'coef_'))
    return _MLModel(_convert(model, feature_names, target))
def test_stacked_classfier_extkfold(self):
    bclf = LogisticRegression(random_state=1)
    clfs = [RandomForestClassifier(n_estimators=40, criterion='gini', random_state=1),
            RidgeClassifier(random_state=1)]
    # Note: Kfold uses the pre-0.18 StratifiedKFold(y, n_folds) call signature.
    sl = StackedClassifier(bclf,
                           clfs,
                           n_folds=3,
                           verbose=0,
                           Kfold=StratifiedKFold(self.iris.target, 3),
                           stack_by_proba=False,
                           oob_score_flag=True,
                           oob_metrics=log_loss)
    sl.fit(self.iris.data, self.iris.target)
    score = sl.score(self.iris.data, self.iris.target)
    self.assertGreater(score, 0.9, "Failed with score = {0}".format(score))
def test_fwls_classfier(self):
    feature_func = lambda x: np.ones(x.shape)
    bclf = LogisticRegression(random_state=1)
    clfs = [RandomForestClassifier(n_estimators=40, criterion='gini', random_state=1),
            RidgeClassifier(random_state=1)]
    sl = FWLSClassifier(bclf,
                        clfs,
                        feature_func=feature_func,
                        n_folds=3,
                        verbose=0,
                        Kfold=StratifiedKFold(self.iris.target, 3),
                        stack_by_proba=False)
    sl.fit(self.iris.data, self.iris.target)
    score = sl.score(self.iris.data, self.iris.target)
    self.assertGreater(score, 0.9, "Failed with score = {0}".format(score))
def prec_log(X_train, y_train, X_test, y_test):
    from sklearn.linear_model import LogisticRegression
    # Flatten dense inputs to 2-D; sparse inputs are left as-is. The original
    # repeated the reshape unconditionally afterwards, which would break on
    # sparse matrices, so the duplicate lines are dropped.
    if not issparse(X_train):
        X_train = X_train.reshape((X_train.shape[0], -1))
    if not issparse(X_test):
        X_test = X_test.reshape((X_test.shape[0], -1))
    LOGGER.info('start predict: X_train.shape={},y_train.shape={},X_test.shape={},y_test.shape={}'.format(
        X_train.shape, y_train.shape, X_test.shape, y_test.shape))
    clf = LogisticRegression(solver='sag', n_jobs=-1, verbose=1)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    prec = float(np.sum(y_pred == y_test)) / len(y_test)  # fraction correct (accuracy)
    LOGGER.info('prec_log={:.6f}%'.format(prec * 100.0))
    return clf, y_pred
def distance(self, d1, d2):
    # Extract summary statistics from the datasets
    s1 = self.statistics_calc.statistics(d1)
    s2 = self.statistics_calc.statistics(d2)
    # Compute the distance between the statistics by training a classifier
    # to separate them (a classifier two-sample test).
    training_set_features = np.concatenate((s1, s2), axis=0)
    label_s1 = np.zeros(shape=(len(s1), 1))
    label_s2 = np.ones(shape=(len(s2), 1))
    training_set_labels = np.concatenate((label_s1, label_s2), axis=0).ravel()
    reg_inv = 1e5
    # liblinear (the old default solver) is needed for the l1 penalty on recent scikit-learn.
    log_reg_model = linear_model.LogisticRegression(C=reg_inv, penalty='l1', solver='liblinear')
    log_reg_model.fit(training_set_features, training_set_labels)
    score = log_reg_model.score(training_set_features, training_set_labels)
    # Map training accuracy in [0.5, 1] onto a distance in [0, 1].
    distance = 2.0 * (score - 0.5)
    return distance
def cross_validation_accuracy(clf, X, labels, k):
    """
    Compute the average testing accuracy over k folds of cross-validation. You
    can use sklearn's KFold class here (no random seed, and no shuffling
    needed).

    Params:
        clf......A LogisticRegression classifier.
        X........A csr_matrix of features.
        labels...The true labels for each instance in X.
        k........The number of cross-validation folds.

    Returns:
        The average testing accuracy of the classifier
        over each fold of cross-validation.
    """
    ###TODO
    pass
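The body above is left as an exercise in the original. A minimal sketch of one way to fill it in, using the current sklearn KFold API (no seed, no shuffling, as the docstring asks); this is not the assignment's reference solution.

import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

def cross_validation_accuracy_sketch(clf, X, labels, k):
    labels = np.asarray(labels)
    accuracies = []
    for train_idx, test_idx in KFold(n_splits=k).split(X):
        clf.fit(X[train_idx], labels[train_idx])           # csr_matrix supports row indexing
        predictions = clf.predict(X[test_idx])
        accuracies.append(accuracy_score(labels[test_idx], predictions))
    return np.mean(accuracies)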
def fit_best_classifier(docs, labels, best_result):
    """
    Using the best setting from eval_all_combinations,
    re-vectorize all the training data and fit a
    LogisticRegression classifier to all training data.
    (i.e., no cross-validation done here)

    Params:
        docs..........List of training document strings.
        labels........The true labels for each training document (0 or 1).
        best_result...Element of eval_all_combinations
                      with highest accuracy.

    Returns:
        clf.....A LogisticRegression classifier fit to all
                training data.
        vocab...The dict from feature name to column index.
    """
    ###TODO
    pass
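Again the body is an exercise. A loosely hedged sketch follows; the vectorize() helper and the keys of best_result are not defined in this excerpt, so those names are hypothetical placeholders, not the assignment's API.

from sklearn.linear_model import LogisticRegression

def fit_best_classifier_sketch(docs, labels, best_result):
    # Hypothetical: re-vectorize with the winning settings. vectorize() and
    # the 'features'/'min_freq' keys are assumptions made for illustration.
    X, vocab = vectorize(docs, features=best_result['features'],
                         min_freq=best_result['min_freq'])
    clf = LogisticRegression()
    clf.fit(X, labels)  # fit to all training data; no cross-validation
    return clf, vocab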
def top_coefs(clf, label, n, vocab):
    """
    Find the n features with the highest coefficients in
    this classifier for this label.
    See the .coef_ attribute of LogisticRegression.

    Params:
        clf.....LogisticRegression classifier.
        label...1 or 0; if 1, return the top coefficients
                for the positive class; else for negative.
        n.......The number of coefficients to return.
        vocab...Dict from feature name to column index.

    Returns:
        List of (feature_name, coefficient) tuples, SORTED
        in descending order of the coefficient for the
        given class label.
    """
    ###TODO
    pass
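A minimal sketch under the assumption of a binary classifier with classes 0 and 1, where coef_[0] holds the weights for class 1; negating them ranks features for class 0.

import numpy as np

def top_coefs_sketch(clf, label, n, vocab):
    coefs = clf.coef_[0] if label == 1 else -clf.coef_[0]
    top_idx = np.argsort(coefs)[::-1][:n]               # indices of the n largest coefficients
    idx_to_name = {idx: name for name, idx in vocab.items()}
    return [(idx_to_name[i], coefs[i]) for i in top_idx]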
def print_top_misclassified(test_docs, test_labels, X_test, clf, n):
    """
    Print the n testing documents that are misclassified by the
    largest margin. By using the .predict_proba function of
    LogisticRegression <https://goo.gl/4WXbYA>, we can get the
    predicted probabilities of each class for each instance.
    We will first identify all incorrectly classified documents,
    then sort them in descending order of the predicted probability
    for the incorrect class.
    E.g., if document i is misclassified as positive, we will
    consider the probability of the positive class when sorting.

    Params:
        test_docs.....List of strings, one per test document.
        test_labels...Array of true testing labels.
        X_test........csr_matrix for test data.
        clf...........LogisticRegression classifier fit on all training
                      data.
        n.............The number of documents to print.

    Returns:
        Nothing; see Log.txt for example printed output.
    """
    ###TODO
    pass
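A minimal sketch of the procedure described above, assuming the integer class labels (0/1) match predict_proba's column order; not the reference solution.

import numpy as np

def print_top_misclassified_sketch(test_docs, test_labels, X_test, clf, n):
    predictions = clf.predict(X_test)
    probas = clf.predict_proba(X_test)
    wrong = np.where(predictions != test_labels)[0]
    # Probability assigned to the (incorrect) predicted class for each error.
    wrong_probas = probas[wrong, predictions[wrong].astype(int)]
    for rank in np.argsort(wrong_probas)[::-1][:n]:
        idx = wrong[rank]
        print('truth=%d predicted=%d proba=%.6f' %
              (test_labels[idx], predictions[idx], wrong_probas[rank]))
        print(test_docs[idx])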
def task73(features):
    features = numpy.array(features)
    words = list(set(features[:, 1]))
    pos_vec = numpy.zeros(len(words))
    neg_vec = numpy.zeros(len(words))
    for feature in features:
        index = words.index(feature[1])
        if feature[0] == '-1':
            pos_vec[index] += 1
        else:
            neg_vec[index] += 1
    model = linear_model.LogisticRegression()
    model.fit([pos_vec, neg_vec], [1, -1])
    return (words, model)
def test_lr_on_data(X_train, y_train, X_validate, y_validate, X_test, y_test):
    y_train_flatten = list(itertools.chain(*y_train))
    # Train the LR model
    lr = LogisticRegression(solver='lbfgs')
    lr.fit(X_train, y_train_flatten)
    # Evaluate the model on the validation set
    predictions_val = lr.predict_proba(X_validate)
    predictions_val = array([i[-1] for i in predictions_val])
    best_threshold_validate = find_threshold_logistic(y_validate, predictions_val, predictions_val)
    precision_val, recall_val, f1_val = evaluate_with_threshold(y_validate, predictions_val, predictions_val,
                                                                best_threshold_validate)
    globals.logger.info("Found threshold: %f. Precision/recall/f1 over validation set: %f/%f/%f" %
                        (best_threshold_validate, precision_val, recall_val, f1_val))
    # Evaluate the model on the test set
    predictions_test = lr.predict_proba(X_test)
    predictions_test = array([i[-1] for i in predictions_test])
    best_threshold_test = find_threshold_logistic(y_test, predictions_test, predictions_test, verbose=True)
    precision, recall, f1 = evaluate_with_threshold(y_test, predictions_test, predictions_test, best_threshold_test)
    globals.logger.info("Found threshold: %f. Precision/recall/f1 over test set: %f/%f/%f" %
                        (best_threshold_test, precision, recall, f1))
    return precision, recall, f1
def train_using_logistic(feat1, feat2):
    n_plus = len(feat1)
    n_minus = len(feat2)
    X = np.concatenate((feat1, feat2), axis=0)
    # Label feat1 rows as class 1 and feat2 rows as class 2.
    y = np.concatenate((np.zeros(n_plus), np.ones(n_minus)), axis=0)
    y = y + 1
    print(X.shape, y.shape, n_plus, n_minus, feat1.shape, feat2.shape)
    logreg = linear_model.LogisticRegression(C=1e5)
    logreg.fit(X, y)
    print("Score using logistic regression on training data is", logreg.score(X, y))
    return logreg