def test_init():
mm = mnb_modelmanager.MNBModelManager('sgdcmodel.pickle')
assert isinstance(mm, modelmanager.ModelManager)
assert isinstance(mm.clf, Pipeline)
assert isinstance(mm.clf.named_steps['clf'], SGDClassifier)
def test_init():
ct = classifytext.ClassifyText()
assert isinstance(ct.mm, sgdc_modelmanager.SGDCModelManager)
assert isinstance(ct.mm.clf, Pipeline)
assert isinstance(ct.mm.clf.named_steps['clf'], SGDClassifier)
def test_init_sgdc():
ct = classifytext.ClassifyText(type=classifytext.SGDC)
assert isinstance(ct.mm, sgdc_modelmanager.SGDCModelManager)
assert isinstance(ct.mm.clf, Pipeline)
assert isinstance(ct.mm.clf.named_steps['clf'], SGDClassifier)
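# All three tests above assert the same pipeline shape. A minimal sketch of a
# pipeline that would satisfy them, assuming the text-classification setup
# used by the fit() snippets further down; the manager classes themselves
# (MNBModelManager, SGDCModelManager) come from the project under test and
# are not reproduced here.
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier

def build_sgdc_pipeline():
    # The tests only check that the step named 'clf' is an SGDClassifier,
    # so any vectoriser front end works.
    return Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier())])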
def train_clf(self, X, idxss, rs):
N = sum(len(idx) for idx in idxss)
n_epochs = self.compute_epochs(N)
if self.optimization == 'fastxml':
penalty = 'l1'
else:
penalty = 'l2'
X_train, y_train = self.build_XY(X, idxss, rs)
in_liblinear = X_train.shape[0] > (self.auto_weight * self.max_leaf_size)
if self.engine == 'liblinear' or (self.engine == 'auto' and in_liblinear):
if self.loss == 'log':
# No control over penalty
clf = LogisticRegression(solver='liblinear', random_state=rs, tol=1,
C=self.C, penalty=penalty)
else:
clf = LinearSVC(C=self.C, fit_intercept=self.bias,
max_iter=n_epochs, class_weight='balanced',
penalty=penalty, random_state=rs)
else:
clf = SGDClassifier(loss=self.loss, penalty=penalty, n_iter=n_epochs,
alpha=self.alpha, fit_intercept=self.bias, class_weight='balanced',
random_state=rs)
clf.fit(X_train, y_train)
# Halves the memory requirement
clf.coef_ = sparsify(clf.coef_, self.eps)
if self.bias:
clf.intercept_ = clf.intercept_.astype('float32')
return clf, CLF(clf.coef_, clf.intercept_)
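# `sparsify` and `CLF` come from the surrounding module and are not shown in
# this snippet. A plausible sketch of sparsify, assuming it zeroes
# coefficients with magnitude below `eps` and stores the rest as float32 CSR,
# which is what the "halves the memory requirement" comment suggests:
import numpy as np
from scipy import sparse

def sparsify(coef, eps):
    coef = np.where(np.abs(coef) < eps, 0, coef)  # drop near-zero weights
    return sparse.csr_matrix(coef.astype('float32'))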
def demo():
import sys
sys.path.append( '../core' )
from tools import make_XOR_dataset
X,Y = make_XOR_dataset()
N,L = Y.shape
br = BR(L, linear_model.SGDClassifier(n_iter=100))
br.fit(X, Y)
# test it
print(br.predict(X))
print("vs")
print(Y)
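# Note: this demo, like several snippets below, uses the pre-0.19
# scikit-learn constructor. On current releases n_iter is gone (max_iter is
# the replacement, an upper bound paired with tol-based early stopping) and
# loss='log' was renamed to 'log_loss' in 1.1. A hedged modern equivalent of
# the classifier built above:
from sklearn import linear_model
clf_modern = linear_model.SGDClassifier(max_iter=100, tol=None)  # hinge loss by default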
def demo():
import sys
sys.path.append( '../core' )
from tools import make_XOR_dataset
from BR import BR
set_printoptions(precision=3, suppress=True)
X,Y = make_XOR_dataset()
N,L = Y.shape
print("CLASSIFICATION")
h = linear_model.SGDClassifier(n_iter=100)
nn = ELM(8,f=tanh,h=BR(-1,h))
nn.fit(X, Y)
# test it
print(nn.predict(X))
print("vs")
print(Y)
print("REGRESSION")
r = ELM(100,h=linear_model.LinearRegression())
r.fit(X,Y)
print(Y)
print(r.predict(X))
print("REGRESSION OI")
r = ELM_OI(100,h=BR(-1,h=linear_model.SGDRegressor()))
r.fit(X,Y)
print(Y)
print(r.predict(X))
def demo():
import sys
from molearn.core.tools import make_XOR_dataset
X,Y = make_XOR_dataset()
N,L = Y.shape
print(Y)
print("vs")
print("RCC")
cc = RCC(SGDClassifier(n_iter=100,loss='log'))
cc.fit(X, Y)
print(cc.predict(X))
print("MCC")
mcc = MCC(SGDClassifier(n_iter=100,loss='log'),M=1000)
mcc.fit(X, Y)
Yp = mcc.predict(X, M=50)
print("with 50 iterations ...")
print(Yp)
Yp = mcc.predict(X, 'default')
print("with default (%d) iterations ..." % 1000)
print(Yp)
print("PCC")
pcc = PCC(SGDClassifier(n_iter=100,loss='log'))
pcc.fit(X, Y)
print(pcc.predict(X))
def fit(self, dataset, filename):
self.logger.debug("fit")
self.clf = Pipeline([('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)),
])
self.clf.fit(dataset.get_dataset()['data'], dataset.get_dataset()['target'])
joblib.dump(self.clf, filename + ".pkl", compress=9)
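# Hypothetical counterpart to fit() above: reload the persisted pipeline and
# classify new documents. Assumes the same ".pkl" naming scheme and that
# joblib is importable as a standalone package.
import joblib

def load_and_predict(filename, texts):
    clf = joblib.load(filename + ".pkl")
    return clf.predict(texts)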
def model_trainer(model_dict, X_train, y_train, adv=None, rd=None, rev=None):
"""Trains and returns SVM. Also save SVM to file."""
print('Training model...')
start_time = time.time()
abs_path_m = resolve_path_m(model_dict)
svm_model = model_dict['svm_type']
C = model_dict['penconst']
penalty = model_dict['penalty']
if adv is None:
adv_mag = None
# Create model based on parameters
if svm_model == 'linear':
dual = True
if penalty == 'l1':
dual = False
clf = svm.LinearSVC(C=C, penalty=penalty, dual=dual)
# clf = linear_model.SGDClassifier(alpha=C,l1_ratio=0)
elif svm_model != 'linear':
clf = svm.SVC(C=C, kernel=svm_model)
# Train model
clf.fit(X_train, y_train)
print('Finish training in {:d}s'.format(int(time.time() - start_time)))
# Save model
joblib.dump(clf, abs_path_m +
get_svm_model_name(model_dict, rd, rev) + '.pkl')
return clf
#------------------------------------------------------------------------------#
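# The commented-out line in model_trainer hints at an SGD-based alternative
# to LinearSVC. A rough sketch of that substitution with placeholder values:
# with hinge loss, SGDClassifier optimises a linear SVM, and the scikit-learn
# docs suggest alpha = 1 / (C * n_samples) for comparable regularisation.
from sklearn.linear_model import SGDClassifier

C, n_samples = 1.0, 10000  # placeholders, not the project's settings
sgd_svm = SGDClassifier(loss='hinge', penalty='l2',
                        alpha=1.0 / (C * n_samples))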
def __init__(self,name,kwargs):
from sklearn.linear_model import SGDClassifier
super(GCSGDClassifier,self).__init__(name,SGDClassifier,kwargs)
def get_model_score(training, validation):
model = linear_model.SGDClassifier(loss='log', n_iter=5)
model.fit(get_input_data(training), get_output_data(training))
curr_score = model.score(get_input_data(validation), get_output_data(validation))
return curr_score
def __init__(self):
# loss="log" makes it use logistic regression
self.model = linear_model.SGDClassifier(loss="log", n_iter=5)
# Source: imdb_success_predictor.py, project Movie-Success-Predictor (author: Blueteak)
def main():
#before_release
movie_info_before_release = load_movie_info_before_release()
    print('***Before release***')
X = create_input(movie_info_before_release)
Y = create_output_before_release(movie_info_before_release)
clf = linear_model.SGDClassifier(loss='log')
test_classifier(clf, X, Y, 'before_release')
clf = GaussianNB()
test_classifier(clf, X, Y, 'before_release')
clf = RandomForestClassifier(n_estimators=10, max_depth=10)
test_classifier(clf, X, Y, 'before_release')
#After release
movie_info = load_movie_info()
    print('***After release***')
X = create_input(movie_info)
Y = create_output(movie_info)
clf = linear_model.SGDClassifier(loss='log')
test_classifier(clf, X, Y, 'after_release')
clf = GaussianNB()
test_classifier(clf, X, Y, 'after_release')
clf = RandomForestClassifier(n_estimators=10, max_depth=10)
test_classifier(clf, X, Y, 'after_release')
def run_regression(train_embeds, train_labels, test_embeds, test_labels):
np.random.seed(1)
from sklearn.linear_model import SGDClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score
dummy = DummyClassifier()
dummy.fit(train_embeds, train_labels)
log = SGDClassifier(loss="log", n_jobs=10)
log.fit(train_embeds, train_labels)
print("F1 score:", f1_score(test_labels, log.predict(test_embeds), average="micro"))
print("Random baseline f1 score:", f1_score(test_labels, dummy.predict(test_embeds), average="micro"))
def run_regression(train_embeds, train_labels, test_embeds, test_labels):
np.random.seed(1)
from sklearn.linear_model import SGDClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score
dummy = DummyClassifier()
dummy.fit(train_embeds, train_labels)
log = SGDClassifier(loss="log", n_jobs=55)
log.fit(train_embeds, train_labels)
print("Test scores")
print(f1_score(test_labels, log.predict(test_embeds), average="micro"))
print("Train scores")
print(f1_score(train_labels, log.predict(train_embeds), average="micro"))
print("Random baseline")
print(f1_score(test_labels, dummy.predict(test_embeds), average="micro"))
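# Both run_regression variants expect dense embedding matrices and integer
# labels. A hypothetical smoke test with random data (assumes the older
# scikit-learn these snippets target, where loss="log" is still accepted):
import numpy as np

rng = np.random.RandomState(0)
train_embeds, train_labels = rng.rand(100, 16), rng.randint(0, 4, 100)
test_embeds, test_labels = rng.rand(30, 16), rng.randint(0, 4, 30)
run_regression(train_embeds, train_labels, test_embeds, test_labels)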
def get_data_preprocessor_balancing(params, y):
d_balancing = params['layer_dict_list'][1]
if params['balancing'] == str(d_balancing['None']) or params['balancing'] == 'None':
# for fp: ['ExtraTreesClassifier', 'LinearSVC'] + clf: ['DecisionTreeClassifier', 'ExtraTreesClassifier', 'LinearSVC', 'SVC', 'RandomForestClassifier', 'SGDClassifier']
params['class_weight'] = None
        # for clf: ['AdaBoostClassifier', 'GradientBoostingClassifier']
params['sample_weight'] = None
elif params['balancing'] == str(d_balancing['weighting']) or params['balancing'] == 'weighting':
# for fp: ['ExtraTreesClassifier', 'LinearSVC'] + clf: ['DecisionTreeClassifier', 'ExtraTreesClassifier', 'LinearSVC', 'SVC', 'RandomForestClassifier', 'SGDClassifier']
params['class_weight'] = 'auto'
# for clf: ['AdaBoostClassifier', 'GradientBoostingClassifier']
if len(y.shape) > 1:
offsets = [2 ** i for i in range(y.shape[1])]
y_ = np.sum(y * offsets, axis=1)
else:
y_ = y
unique, counts = np.unique(y_, return_counts=True)
cw = 1. / counts
cw = cw / np.mean(cw)
sample_weight = np.ones(y_.shape)
for i, ue in enumerate(unique):
mask = y_ == ue
sample_weight[mask] *= cw[i]
params['sample_weight'] = sample_weight
return params
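# The 'weighting' branch above hand-rolls balanced sample weights;
# scikit-learn ships an equivalent helper, and the two schemes agree up to a
# constant factor (the multilabel bit-packing via `offsets` is specific to
# this code). Note also that class_weight='auto' is spelled 'balanced' on
# current releases. A minimal sketch:
import numpy as np
from sklearn.utils.class_weight import compute_sample_weight

y_demo = np.array([0, 0, 0, 1, 1, 2])
sw = compute_sample_weight('balanced', y_demo)  # ~ n_samples / (n_classes * counts)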
def run_cat(filename,modelname,fileout,embeddings,new_run=True,run_parse=True,
model_type='logreg',C=10.0,
alpha=1.0, cutoff=0.50, n_iter=1):
# pull relevant data and run parsing and classification
df = pd.read_csv(filename)
if (len(df.columns)==2): # make sure columns have the right names
df.columns = ['raw','amount']
if new_run: # initialize the model;
if model_type=='logreg':
model = linear_model.SGDClassifier(loss='log',warm_start=True,
n_iter=n_iter,alpha=alpha)
elif model_type=='passive-aggressive':
model = linear_model.PassiveAggressiveClassifier(C=C,warm_start=True)
elif model_type=='naive-bayes':
model = naive_bayes.GaussianNB()
else:
raise NameError('model_type must be logreg, passive-aggressive, or naive-bayes')
else: # load a saved, pre-trained model
modelFileLoad = open(modelname, 'rb')
model = pickle.load(modelFileLoad)
fileCities = dirs.data_dir + 'cities_by_state.pickle'
us_cities = pd.read_pickle(fileCities)
df = cat_df(df,model,us_cities,embeddings,new_run,run_parse,cutoff=cutoff,
model_type=model_type)
df.to_csv(fileout,index=False)
# Saving logistic regression model from training set 1
modelFileSave = open(modelname, 'wb')
pickle.dump(model, modelFileSave)
modelFileSave.close()
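# run_cat relies on warm_start=True so a reloaded model keeps training across
# batches. A sketch of the same idea with partial_fit, the API scikit-learn
# documents for incremental learning (toy data, recent sklearn spelling):
import numpy as np
from sklearn.linear_model import SGDClassifier

inc = SGDClassifier(loss='log_loss', alpha=1.0)
X1, y1 = np.random.rand(10, 4), np.random.randint(0, 3, 10)
inc.partial_fit(X1, y1, classes=np.arange(3))  # class set must be fixed up front
X2, y2 = np.random.rand(10, 4), np.random.randint(0, 3, 10)
inc.partial_fit(X2, y2)                        # later batches just update weights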
# ------ testing functions
def model_fitting(train_set, train_labels, classifier_name, n_jobs=cpu_count()):
"""
The fitting process with sklearn algorithms.
:param train_set: numpy array, required
:param train_labels: list, required
:param classifier_name: string, required
:param n_jobs: integer, required
:return: object
- Fit classifier model according to the given training data
"""
classifier_list = {"svm_linear": SVC(probability=True, kernel='linear', C=1.0),
"svm_poly": SVC(probability=True, kernel='poly', C=1.0),
"svm_rbf": SVC(probability=True, kernel='rbf', C=1.0, gamma=0.01),
"linear_svc": LinearSVC(penalty='l2', loss='squared_hinge', dual=True, tol=0.1, C=1.0, multi_class='ovr', fit_intercept=True,
intercept_scaling=1, random_state=None, max_iter=3000),
"knn": KNeighborsClassifier(n_neighbors=100, weights='distance', leaf_size=30, n_jobs=n_jobs),
"random_forests": RandomForestClassifier(n_estimators=350, criterion='entropy', min_samples_split=2,
min_samples_leaf=1, max_leaf_nodes=600, n_jobs=n_jobs),
"logistic_regression": LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=2.4, fit_intercept=True, intercept_scaling=1,
random_state=None, solver='liblinear', max_iter=1000, multi_class='ovr',
warm_start=False, n_jobs=n_jobs),
"decision_trees": DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None, min_samples_split=2,
min_samples_leaf=100, min_weight_fraction_leaf=0.0, max_features=None,
random_state=None, max_leaf_nodes=None, presort=False),
"sgd": SGDClassifier(alpha=.0001, n_iter=500, penalty="elasticnet", n_jobs=n_jobs),
"neural_network": Classifier(layers=[Layer("Sigmoid", units=14), Layer("Sigmoid", units=13), Layer("Sigmoid", units=12),
Layer("Sigmoid", units=10), Layer("Softmax")], learning_rate=0.01, n_iter=200,
batch_size=10, regularize='L1', n_stable=50, dropout_rate=0, verbose=True),
"GBC": GradientBoostingClassifier(max_depth=10, max_leaf_nodes=850, min_samples_leaf=15, learning_rate=0.1),
"XGB": XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
max_depth=10, min_child_weight=2, missing=None, n_estimators=100, nthread=n_jobs, reg_alpha=0,
objective='binary:logistic', reg_lambda=1, scale_pos_weight=1, seed=0, silent=True, subsample=1)}
return classifier_list[classifier_name].fit(train_set, train_labels)
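# Hypothetical call with random data, assuming the scikit-learn version these
# snippets target; real inputs come from the surrounding project.
import numpy as np

X_demo = np.random.rand(20, 5)
y_demo = np.random.randint(0, 2, 20)
clf = model_fitting(X_demo, y_demo, "sgd", n_jobs=1)
print(clf.predict(X_demo[:3]))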
def fit(self, dataset, filename):
self.logger.debug("fit")
self.clf = Pipeline([('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('clf', SGDClassifier(loss='log', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)),
])
self.clf.fit(dataset.get_dataset()['data'], dataset.get_dataset()['target'])
joblib.dump(self.clf, filename + ".pkl", compress=9)
def train_classifier(download=True, parameters=None, ngram_range=(1, 1)):
"""Train the intent classifier."""
if download:
download_wiki()
path = os.path.join(l.TOPDIR, 'train.json')
training_set = json.load(open(path))
path = os.path.join(l.TOPDIR, 'wiki.json')
wiki_set = json.load(open(path))
target_names = list(set([i['unit'] for i in training_set + wiki_set]))
train_data, train_target = [], []
for example in training_set + wiki_set:
train_data.append(clean_text(example['text']))
train_target.append(target_names.index(example['unit']))
tfidf_model = TfidfVectorizer(sublinear_tf=True,
ngram_range=ngram_range,
stop_words='english')
matrix = tfidf_model.fit_transform(train_data)
if parameters is None:
parameters = {'loss': 'log', 'penalty': 'l2', 'n_iter': 50,
'alpha': 0.00001, 'fit_intercept': True}
clf = SGDClassifier(**parameters).fit(matrix, train_target)
obj = {'tfidf_model': tfidf_model,
'clf': clf,
'target_names': target_names}
    path = os.path.join(l.TOPDIR, 'clf.pickle')
    with open(path, 'wb') as f:  # pickle requires binary mode
        pickle.dump(obj, f)
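# Hypothetical loader matching the dump above; pickle requires binary mode on
# read as well.
import pickle

def load_classifier(path):
    with open(path, 'rb') as f:
        obj = pickle.load(f)
    return obj['tfidf_model'], obj['clf'], obj['target_names']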
###############################################################################