def get_feature_importance(self, clf, model_name):
clfs = {'RandomForestClassifier':'feature_importances',
'ExtraTreesClassifier': 'feature_importances',
'AdaBoostClassifier': 'feature_importances',
'LogisticRegression': 'coef',
'svm.SVC': 'coef',
'GradientBoostingClassifier': 'feature_importances',
'GaussianNB': None,
'DecisionTreeClassifier': 'feature_importances',
'SGDClassifier': 'coef',
'KNeighborsClassifier': None,
'linear.SVC': 'coef'}
if clfs[model_name] == 'feature_importances':
return list(clf.feature_importances_)
elif clfs[model_name] == 'coef':
return clf.coef_.tolist()
else:
return None
Example source code for the Python class SGDClassifier()
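The snippets below are collected from several different projects. For orientation, here is a minimal, self-contained sketch, not taken from any of them, of the basic SGDClassifier workflow: fit a linear model with SGD, predict on new points, and read back the learned coefficients.
# Minimal orientation sketch (toy data, not from the projects below).
import numpy as np
from sklearn.linear_model import SGDClassifier
X = np.array([[0., 0.], [1., 0.], [2., 1.], [3., 1.]])
y = np.array([0, 0, 1, 1])
clf = SGDClassifier(loss='hinge', penalty='l2', random_state=0)  # hinge loss = linear SVM
clf.fit(X, y)
print(clf.predict([[2.5, 1.]]))    # class prediction for a new point
print(clf.coef_, clf.intercept_)   # weights and bias of the linear model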
def __init__(self, filename, target_map, classifier='svm'):
self.seed_ = 0
self.filename_ = filename
self.target_map_ = target_map
self.target_ids_ = (np.unique(list(target_map.keys()))).astype(np.int32)
self.epoch_no_ = 0
self.st_time_ = time.time()
# Setup classifier
print('-------------------------------')
print('====> Building Classifier, setting class weights')
if classifier == 'svm':
self.clf_hyparams_ = {'C':[0.01, 0.1, 1.0, 10.0, 100.0], 'class_weight': ['balanced']}
self.clf_base_ = LinearSVC(random_state=self.seed_)
elif classifier == 'sgd':
self.clf_hyparams_ = {'alpha':[0.0001, 0.001, 0.01, 0.1, 1.0, 10.0], 'class_weight':['auto']} # 'loss':['hinge'],
self.clf_ = SGDClassifier(loss='log', penalty='l2', shuffle=False, random_state=self.seed_,
warm_start=True, n_jobs=-1, n_iter=1, verbose=4)
else:
raise Exception('Unknown classifier type %s. Choose from [svm, sgd]'
% classifier)
classify.py (project: oss-github-analysis-project, author: itu-oss-project-team)
def __create_classifiers(self):
classifiers = list()
classifiers.append({"func": linear_model.SGDClassifier(loss="log"),
"name": "sgd"})
classifiers.append({"func": neighbors.KNeighborsClassifier(1, weights='distance'),
"name": "knn1"})
classifiers.append({"func": neighbors.KNeighborsClassifier(3, weights='distance'),
"name": "knn3"})
classifiers.append({"func": neighbors.KNeighborsClassifier(5, weights='distance'),
"name": "knn5"})
classifiers.append({"func": GaussianNB(),
"name": "naive_bayes"})
# classifiers.append({"func": tree.DecisionTreeClassifier(), "name": "decision_tree"})
# classifiers.append({"func": MLPClassifier(max_iter=10000), "name": "mlp"})
# classifiers.append({"func": RandomForestClassifier(), "name": "random_forest"})
return classifiers
def define_model(self, model, parameters, n_cores=0):
clfs = {'RandomForestClassifier': RandomForestClassifier(n_estimators=50, n_jobs=7),
'ExtraTreesClassifier': ExtraTreesClassifier(n_estimators=10, n_jobs=7, criterion='entropy'),
'AdaBoostClassifier': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=200),
'LogisticRegression': LogisticRegression(penalty='l1', C=1e5),
'svm.SVC': svm.SVC(kernel='linear', probability=True, random_state=0),
'GradientBoostingClassifier': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=10),
'GaussianNB': GaussianNB(),
'DecisionTreeClassifier': DecisionTreeClassifier(),
'SGDClassifier': SGDClassifier(loss="hinge", penalty="l2", n_jobs=7),
'KNeighborsClassifier': KNeighborsClassifier(n_neighbors=3),
'linear.SVC': svm.LinearSVC() }
if model not in clfs:
raise ConfigError("Unsupported model {}".format(model))
clf = clfs[model]
clf.set_params(**parameters)
return clf
def try_params( n_iterations, params ):
n_iterations = int( round( n_iterations ))
print "n_iterations:", n_iterations
pprint( params )
if params['scaler']:
scaler = eval( "{}()".format( params['scaler'] ))
x_train_ = scaler.fit_transform( data['x_train'].astype( float ))
x_test_ = scaler.transform( data['x_test'].astype( float ))
local_data = { 'x_train': x_train_, 'y_train': data['y_train'],
'x_test': x_test_, 'y_test': data['y_test'] }
else:
local_data = data
# we need a copy because at the next small round the best params will be re-used
params_ = dict( params )
params_.pop( 'scaler' )
clf = SGD( n_iter = n_iterations, **params_ )
return train_and_eval_sklearn_classifier( clf, local_data )
def learn(self, features, labels):
""" Fits the classifier
If it's state is empty, the classifier is fitted, if not
the classifier is partially fitted.
See sklearn's SGDClassifier fit and partial_fit methods.
Args:
features (:obj:`list` of :obj:`list` of :obj:`float`)
labels (:obj:`list` of :obj:`str`): Labels for each set of features.
New features are learnt.
"""
labels = np.ravel(labels)
self.__learn_labels(labels)
if len(labels) == 0:
return
labels = self.labels.transform(labels)
if self.feature_length > 0 and hasattr(self.clf, 'partial_fit'):
# FIXME? check docs, may need to pass class=[...]
self.clf = self.clf.partial_fit(features, labels)
else:
self.clf = self.clf.fit(features, labels)
self.feature_length = len(features[0])
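A rough sketch of the fit/partial_fit fallback this method relies on, using a bare SGDClassifier on made-up data (the surrounding wrapper class is not shown here):
# Sketch of the incremental-learning pattern above, with assumed toy data.
# loss='log' follows the older scikit-learn name; newer releases call it 'log_loss'.
import numpy as np
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(loss='log')
X1 = np.array([[0., 0.], [1., 1.], [2., 0.], [3., 1.]])
y1 = np.array([0, 0, 1, 1])
X2 = np.array([[0., 1.], [3., 0.]])
y2 = np.array([0, 1])
clf.fit(X1, y1)                           # first batch: full fit
clf.partial_fit(X2, y2, classes=[0, 1])   # later batches: incremental updates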
def make_classifier(self, name, ids, labels):
"""Entrenar un clasificador SVM sobre los textos cargados.
Crea un clasificador que se guarda en el objeto bajo el nombre `name`.
Args:
name (str): Nombre para el clasidicador.
ids (list): Se espera una lista de N ids de textos ya almacenados
en el TextClassifier.
labels (list): Se espera una lista de N etiquetas. Una por cada id
de texto presente en ids.
Nota:
Usa el clasificador de `Scikit-learn <http://scikit-learn.org/>`_
"""
if not all(np.in1d(ids, self.ids)):
raise ValueError("Hay ids de textos que no se encuentran \
almacenados.")
setattr(self, name, SGDClassifier())
classifier = getattr(self, name)
indices = np.searchsorted(self.ids, ids)
classifier.fit(self.tfidf_mat[indices, :], labels)
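Stripped of the class bookkeeping, the method amounts to the following; stored_ids and tfidf_mat below are hypothetical stand-ins for the TextClassifier internals (self.ids, self.tfidf_mat), which are not shown here.
# Illustrative only: select the TF-IDF rows for the requested ids and fit.
import numpy as np
from sklearn.linear_model import SGDClassifier
stored_ids = np.array([10, 20, 30, 40])    # ids already loaded, kept sorted
tfidf_mat = np.random.rand(4, 8)           # stand-in for the TF-IDF matrix
ids, labels = [20, 40], ['spam', 'ham']
indices = np.searchsorted(stored_ids, ids)  # map ids to row positions
clf = SGDClassifier()                       # default hinge loss, i.e. a linear SVM
clf.fit(tfidf_mat[indices, :], labels)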
def test_build_param_grid_set_estimator():
clf1 = SVC()
clf2 = LogisticRegression()
clf3 = SVC()
clf4 = SGDClassifier()
estimator = set_grid(Pipeline([('sel', set_grid(SelectKBest(), k=[2, 3])),
('clf', None)]),
clf=[set_grid(clf1, kernel=['linear']),
clf2,
set_grid(clf3, kernel=['poly'], degree=[2, 3]),
clf4])
param_grid = [{'clf': [clf1], 'clf__kernel': ['linear'], 'sel__k': [2, 3]},
{'clf': [clf3], 'clf__kernel': ['poly'],
'clf__degree': [2, 3], 'sel__k': [2, 3]},
{'clf': [clf2, clf4], 'sel__k': [2, 3]}]
assert build_param_grid(estimator) == param_grid
def get_sgdc(self):
return Pipeline([
('tfidf', TfidfVectorizer(stop_words=sw.words('dutch'), norm='l2', use_idf=True)),
('feat_select', SelectPercentile(percentile=10)),
('clf', SGDClassifier(alpha=0.0001,
average=False,
class_weight=None,
epsilon=0.1,
eta0=0.0,
fit_intercept=True,
l1_ratio=0.15,
learning_rate='optimal',
loss='log',
n_iter=10,
n_jobs=1,
penalty='l2',
power_t=0.5,
random_state=None,
shuffle=True,
verbose=0,
warm_start=False))
])
def run(self):
training_x, training_y, training_ids = self.get_training_data()
test_x, test_y, test_ids = self.get_test_data()
clf = self.define_model(self.model_name, self.model_params)
clf.fit(training_x, training_y)
res_predict = clf.predict(test_x)
if (self.model_name == "SGDClassifier" and (clf.loss =="hinge" or clf.loss == "perceptron")) or self.model_name == "linear.SVC":
res = list(clf.decision_function(test_x))
else:
res = list(clf.predict_proba(test_x)[:,1])
#fp, fn, tp, tn = self.compute_confusion_matrix(res[:,0], test_y)
result_dictionary = {'training_ids': training_ids,
'predictions_test_y': list(res_predict),
'prob_prediction_test_y': res ,
'test_y': list(test_y),
'test_ids': list(test_ids),
'model_name': self.model_name,
'model_params': self.model_params,
'label': self.label,
'feature_columns_used': self.cols_to_use,
'config': self.config,
'feature_importance': self.get_feature_importance(clf, self.model_name),
'columned_used_for_feat_importance': list(training_x.columns.values)}
return result_dictionary, clf
def demo():
import sys
sys.path.append( '../core' )
from tools import make_XOR_dataset
X,Y = make_XOR_dataset()
N,L = Y.shape
from sklearn import linear_model
h_ = linear_model.SGDClassifier(n_iter=100)
from CC import RCC
cc = RCC(h=h_)
e = Ensemble(n_estimators=10,base_estimator=cc)
e.fit(X, Y)
# test it
print(e.predict(X))
print("vs")
print(Y)
def get_classifier(self):
algo=self.algo
if algo=="GBT":
return GradientBoostingClassifier()
elif algo=="RF":
return RandomForestClassifier()
elif algo=="ADB":
return AdaBoostClassifier()
elif algo =="DT":
return DecisionTreeClassifier()
elif algo=="NB":
return BernoulliNB()
elif algo=="SGD":
return SGDClassifier()
elif algo=="SVC":
return LinearSVC()
elif algo=="MLPC":
return MLPClassifier(activation='logistic', batch_size='auto',
early_stopping=True, hidden_layer_sizes=(100,), learning_rate='adaptive',
learning_rate_init=0.1, max_iter=5000, random_state=1,
solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
warm_start=False)
return 0
def run_regression(train_embeds, train_labels, test_embeds, test_labels):
np.random.seed(1)
from sklearn.linear_model import SGDClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score
from sklearn.multioutput import MultiOutputClassifier
dummy = MultiOutputClassifier(DummyClassifier())
dummy.fit(train_embeds, train_labels)
log = MultiOutputClassifier(SGDClassifier(loss="log"), n_jobs=10)
log.fit(train_embeds, train_labels)
f1 = 0
for i in range(test_labels.shape[1]):
print("F1 score", f1_score(test_labels[:,i], log.predict(test_embeds)[:,i], average="micro"))
for i in range(test_labels.shape[1]):
print("Random baseline F1 score", f1_score(test_labels[:,i], dummy.predict(test_embeds)[:,i], average="micro"))
def test_cat():
print('Testing categorization...')
filein = 'test_lookup.csv'
fileout = 'test_cat.csv'
df = pd.read_csv(filein)
model = linear_model.SGDClassifier(loss='log')
catData = df[~df.category.isnull()]
uncatData = df[df.category.isnull()]
print(str(float(len(catData)) / float(len(df)) * 100.) + "% of transactions categorized with lookup.")
ts.train_model(catData,model,embeddings,model_type='logreg',new_run=True)
ts.use_model(uncatData,model,embeddings,0.0,model_type='logreg')
df = pd.concat([catData, uncatData])
df.sort_index(inplace=True)
df.to_csv(fileout,index=False)
def train_and_pickle_classifier():
import numpy as np
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
csv_filename = os.path.join('datasets', 'movie_data.csv')
doc_stream = stream_docs(path=csv_filename)
classes = np.array([0, 1])
for _ in range(45):
X_train, y_train = get_minibatch(doc_stream, size=1000)
if X_train is None:
break
else:
X_train = vect.transform(X_train)
clf.partial_fit(X_train, y_train, classes=classes)
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
print("Test accuracy: %.3f" % clf.score(X_test, y_test))
clf = clf.partial_fit(X_test, y_test)
pickle.dump(clf, open(CLF_FILENAME, 'wb'), protocol=4)
def __init__(self, path, etype, **kwargs):
super(EnsembleModel, self).__init__(path, etype=etype, **kwargs)
self.basedir = "models/ensemble/"
self.goldstd = kwargs.get("goldstd")
self.data = {}
self.offsets = []
self.pipeline = Pipeline(
[
#('clf', SGDClassifier(loss='hinge', penalty='l1', alpha=0.0001, n_iter=5, random_state=42)),
#('clf', SGDClassifier())
# ('clf', svm.NuSVC(nu=0.01 ))
('clf', RandomForestClassifier(class_weight={False:1, True:1}, n_jobs=-1, criterion="entropy", warm_start=True))
# ('clf', tree.DecisionTreeClassifier(criterion="entropy")),
# ('clf', MultinomialNB())
# ('clf', GaussianNB())
#('clf', svm.SVC(kernel="rbf", degree=2, C=1)),
#('clf', svm.SVC(kernel="linear", C=2))
#('clf', DummyClassifier(strategy="constant", constant=True))
])
def __init__(self, corpus, relationtype, modelname="scikit_classifier"):
super(ScikitRE, self).__init__()
self.modelname = relationtype + "_" + modelname
self.relationtype = relationtype
self.pairtype = relationtype
self.corpus = corpus
self.pairs = []
self.features = []
self.labels = []
self.pred = []
self.clusters = word2vec.load_clusters("corpora/Thaliana/documents-processed-clusters.txt")
self.posfmeasure = make_scorer(f1_score, average='binary', pos_label=True)
self.generate_data(corpus, modelname, relationtype)
self.text_clf = Pipeline([('vect', CountVectorizer(analyzer='char_wb', ngram_range=(3,20), min_df=0.0, max_df=0.7)),
#('vect', CountVectorizer(ngram_range=(1,3), binary=False, max_features=None)),
#('tfidf', TfidfTransformer(use_idf=True, norm="l2")),
#('clf', SGDClassifier(loss='hinge', penalty='l1', alpha=0.0001, n_iter=5, random_state=42)),
#('clf', SGDClassifier())
#('clf', svm.NuSVC(nu=0.01 ))
#('clf', RandomForestClassifier(class_weight={False:1, True:2}, n_jobs=-1))
('clf', MultinomialNB(alpha=0.01, fit_prior=False))
#('clf', DummyClassifier(strategy="constant", constant=True))
])
def test_transform_linear_model():
for clf in (LogisticRegression(C=0.1),
LinearSVC(C=0.01, dual=False),
SGDClassifier(alpha=0.001, n_iter=50, shuffle=True,
random_state=0)):
for thresh in (None, ".09*mean", "1e-5 * median"):
for func in (np.array, sp.csr_matrix):
X = func(data)
clf.set_params(penalty="l1")
clf.fit(X, y)
X_new = assert_warns(
DeprecationWarning, clf.transform, X, thresh)
if isinstance(clf, SGDClassifier):
assert_true(X_new.shape[1] <= X.shape[1])
else:
assert_less(X_new.shape[1], X.shape[1])
clf.set_params(penalty="l2")
clf.fit(X_new, y)
pred = clf.predict(X_new)
assert_greater(np.mean(pred == y), 0.7)
def test_prefit():
"""
Test all possible combinations of the prefit parameter.
"""
# Passing a prefit parameter with the selected model
# and fitting an unfit model with prefit=False should give the same results.
clf = SGDClassifier(alpha=0.1, n_iter=10, shuffle=True, random_state=0)
model = SelectFromModel(clf)
model.fit(data, y)
X_transform = model.transform(data)
clf.fit(data, y)
model = SelectFromModel(clf, prefit=True)
assert_array_equal(model.transform(data), X_transform)
# Check that the model is rewritten if prefit=False and a fitted model is
# passed
model = SelectFromModel(clf, prefit=False)
model.fit(data, y)
assert_array_equal(model.transform(data), X_transform)
# Check that prefit=True and calling fit raises a ValueError
model = SelectFromModel(clf, prefit=True)
assert_raises(ValueError, model.fit, data, y)
def score_function(field):
stats = field.get_stats()
if "Creature" not in stats:
return 0
else:
return stats["Creature"]
# res = modelling.run_simulation(universe, check_stop_function, score_function, verbose=True, times=30)
# print res
# print np.asarray(res).mean()
# random 1000 10 [193, 37, 97, 224, 349, 165, 251, 130, 184, 335]
# SGDClassifier 1000 10 [9, 106, 127, 11, 187, 38, 193, 114, 236, 27]
# random 500 20 [63, 24, 38, 14, 30, 65, 29, 60, 28, 25, 93, 44, 51, 26, 104, 56, 53, 38, 23, 42] mean 45.299999999999997
# SGDClassifier 500 20 [116, 52, 50, 82, 109, 49, 109, 37, 25, 115, 130, 180, 52, 52, 113, 46, 34, 135, 26, 33] mean 77.25
# random 500 20 [71, 24, 57, 56, 34, 14, 75, 66, 41, 56, 29, 69, 30, 72, 40, 57, 49, 24, 41, 48] mean 47.65
# SGDClassifier 500 20 [175, 40, 117, 96, 119, 116, 58, 134, 67, 87, 73, 147, 124, 125, 82, 139, 78, 110, 74, 100] mean 103.05
# random 500 30 [42, 32, 62, 34, 30, 44, 51, 35, 63, 59, 50, 40, 75, 59, 50, 33, 45, 95, 82, 41, 43, 89, 94, 66, 64, 46, 34, 82, 66, 76]
# 56.0666666667
# SGDClassifier 500 30 [62, 85, 72, 42, 17, 48, 74, 53, 42, 73, 57, 29, 82, 51, 80, 84, 86, 73, 51, 36, 85, 85, 46, 59, 68, 33, 44, 38, 62, 26]
# 58.1
def compute_sgd(data):
logging.info('Computing SGD')
n_splits = 10
folder = StratifiedKFold(n_splits=n_splits, shuffle=True)
for ix_first, ix_second in tqdm_notebook(folder.split(np.zeros(data['y_train'].shape[0]), data['y_train']),
total=n_splits):
# {'en__l1_ratio': 0.0001, 'en__alpha': 1e-05}
model = SGDClassifier(
loss='log',
penalty='elasticnet',
fit_intercept=True,
n_iter=100,
shuffle=True,
n_jobs=-1,
l1_ratio=0.0001,
alpha=1e-05,
class_weight=None)
model = model.fit(data['X_train'][ix_first, :], data['y_train'][ix_first])
data['y_train_pred'][ix_second] = logit(model.predict_proba(data['X_train'][ix_second, :])[:, 1])
data['y_test_pred'].append(logit(model.predict_proba(data['X_test'])[:, 1]))
data['y_test_pred'] = np.array(data['y_test_pred']).T.mean(axis=1)
return data
def classify(n = 50):
#clf = MultinomialNB(fit_prior=False)
#clf = SVC(gamma=2, C=1, class_weight = {0.0:0.063829777, 1.0:1.0})
clf = SGDClassifier(loss="log", penalty="l1", class_weight = {0.0:0.022, 1.0:1.0})
clf.fit(mat[:n], rel[:n])
return clf
def initialModeling(data):
X, y = processData(data)
global n
n = X.shape[1]
print "I'm training the model using ", X.shape[0], " samples and ", n, " features.\n"
global model
model = SGDClassifier(loss="log", alpha=100, verbose=1)
model.fit(X, y)
# 6th: update model
def __init__(self, classifier=None):
if classifier:
self.clf = classifier
else:
self.clf = SGDClassifier(loss="log", penalty="l2", shuffle=True, n_iter=2500)
self.labels = preprocessing.LabelEncoder()
self.feature_length = -1
def predict(self, features, verbose=False):
""" Probability estimates of each feature
See sklearn's SGDClassifier predict and predict_proba methods.
Args:
features (:obj:`list` of :obj:`list` of :obj:`float`)
verbose: Boolean, optional. If true returns an array where each
element is a dictionary, where keys are labels and values are
the respective probabilities. Defaults to False.
Returns:
Array of arrays of numbers, or an array of dictionaries if verbose is
True.
"""
probs = self.clf.predict_proba(features)
if verbose:
labels = self.labels.classes_
res = []
for prob in probs:
vals = {}
for i, val in enumerate(prob):
label = labels[i]
vals[label] = val
res.append(vals)
return res
else:
return probs
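The verbose branch above boils down to pairing each predict_proba column with the corresponding label from the encoder; a stripped-down sketch on toy data, again using the older loss='log' spelling:
# Sketch of the verbose output: one {label: probability} dict per sample.
import numpy as np
from sklearn import preprocessing
from sklearn.linear_model import SGDClassifier
le = preprocessing.LabelEncoder()
y = le.fit_transform(['cat', 'dog', 'cat', 'dog'])
X = np.array([[0., 0.], [1., 1.], [0.1, 0.2], [0.9, 0.8]])
clf = SGDClassifier(loss='log').fit(X, y)        # 'log_loss' on newer scikit-learn
probs = clf.predict_proba([[0.2, 0.1], [0.9, 0.8]])
verbose_output = [dict(zip(le.classes_, prob)) for prob in probs]
print(verbose_output)    # e.g. [{'cat': 0.7, 'dog': 0.3}, ...]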
load_feature.py (project: EmotiW-2017-Audio-video-Emotion-Recognition, author: xujinchang)
def do_l2norm(X_data):
x_normalized=preprocessing.normalize(X_data,norm='l2')
return x_normalized
#svm = SGDClassifier(loss = 'hinge')
#https://ljalphabeta.gitbooks.io/python-/content/kernelsvm.html
def use_SGD(X_data,y_data):
clf = SGDClassifier(loss="hinge", penalty="l2")
clf.fit(X_data, y_data)
return clf
# def use_KNN(X_data,y_data):
# def use_RandomForest(X_data,y_data):
def test_basic(self, single_chunk_classification):
X, y = single_chunk_classification
a = lm.PartialSGDClassifier(classes=[0, 1], random_state=0,
max_iter=1000, tol=1e-3)
b = lm_.SGDClassifier(random_state=0, max_iter=1000, tol=1e-3)
a.fit(X, y)
b.partial_fit(X, y, classes=[0, 1])
assert_estimator_equal(a, b, exclude='loss_function_')
def test_init_no_file():
mm = sgdc_modelmanager.SGDCModelManager()
assert isinstance(mm, sgdc_modelmanager.SGDCModelManager)
assert isinstance(mm.clf, Pipeline)
assert isinstance(mm.clf.named_steps['clf'], SGDClassifier)
def test_init():
mm = sgdc_modelmanager.SGDCModelManager('sgdcmodel.pickle')
assert isinstance(mm, modelmanager.ModelManager)
assert isinstance(mm.clf, Pipeline)
assert isinstance(mm.clf.named_steps['clf'], SGDClassifier)