def trainModel(featureCount, imageCount, save):
    clf = RandomForestRegressor(n_estimators=1, n_jobs=-1)
    features = generateFeatures(featureCount)
    for image in range(imageCount):
        print("Image " + str(image))
        train(clf, features, image)
    # X and Y are assumed to be module-level arrays populated by train()
    clf = clf.fit(X, Y)
    model = (clf, features)
    if save:
        joblib.dump(model, "model.pkl")
    return model
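As a minimal sketch (not part of the original), the saved model.pkl can be restored with joblib.load, assuming the same (classifier, features) tuple layout written above:

import joblib  # older scikit-learn code imports this as sklearn.externals.joblib

def loadModel(path="model.pkl"):
    # joblib.load returns exactly the object that was dumped, here a (clf, features) tuple
    clf, features = joblib.load(path)
    return clf, features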
def load_trained_model(self, classifier):
filename = '{}.pkl'.format(classifier.__name__.lower())
path = os.path.join(self.data_path, filename)
# palliative: this outputs a model too large for joblib
if classifier.__name__ == 'MonthlySubquotaLimitClassifier':
model = classifier()
model.fit(self.dataset)
else:
if os.path.isfile(path):
model = joblib.load(path)
else:
model = classifier()
model.fit(self.dataset)
joblib.dump(model, path)
return model
def make_check_point(self):
num, last_checkpoints = self.load_current_checkpoints()
if self.best_val_acc > last_checkpoints['best_val_acc']:
best_val_acc = self.best_val_acc
best_params = self.best_params
else:
best_val_acc = last_checkpoints['best_val_acc']
best_params = last_checkpoints['best_params']
checkpoints = {
'model': self.model,
'epoch': self.epoch,
'best_params': best_params,
'best_val_acc': best_val_acc,
'loss_history': self.loss_history,
'train_acc_history': self.train_acc_history,
'val_acc_history': self.val_acc_history}
name = 'check_' + str(num + 1)
os.mkdir(os.path.join(self.path_checkpoints, name))
joblib.dump(checkpoints, os.path.join(
self.path_checkpoints, name, name + '.pkl'))
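make_check_point relies on self.load_current_checkpoints(), which is not shown in this snippet. The following is a hypothetical sketch of that counterpart, assuming the check_<n>/check_<n>.pkl layout written above and that os and joblib are imported in the surrounding module:

def load_current_checkpoints(self):
    # hypothetical helper: return (latest checkpoint number, its contents)
    names = [d for d in os.listdir(self.path_checkpoints) if d.startswith('check_')]
    if not names:
        # no checkpoint yet: start from 0 with a sentinel accuracy
        return 0, {'best_val_acc': float('-inf'), 'best_params': None}
    num = max(int(d.split('_')[1]) for d in names)
    name = 'check_%d' % num
    return num, joblib.load(os.path.join(self.path_checkpoints, name, name + '.pkl'))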
def __init__(self, clf, scaler, pf_df, data_folder=""):
model_file_name = "banana.pkl"
scaler_file_name = "banana_scaler.pkl"
list_file_name = "banana_list.txt"
def_file_path = "../../models/"
self.data_folder = data_folder
if not data_folder:
model_file = os.path.join(os.path.dirname(__file__), def_file_path) + model_file_name
scaler_file = os.path.join(os.path.dirname(__file__), def_file_path) + scaler_file_name
list_file = os.path.join(os.path.dirname(__file__), def_file_path) + list_file_name
else:
model_file = self.data_folder + model_file_name
scaler_file = self.data_folder + scaler_file_name
list_file = self.data_folder + list_file_name
joblib.dump(clf, model_file)
joblib.dump(scaler, scaler_file)
with open(list_file, "w") as f:
f.write(" ".join(pf_df.columns.tolist()))
def get_cache_file(model_id, index, cache_dir='', suffix='csv'):
    # Index fingerprint trick: the sum of (at most) the first 20 index values is
    # used as a cheap identifier, so inputs with the same leading index values
    # map to the same cache file.
if index is None:
raise IOError
if len(index) < 20:
sum_index = sum(index)
else:
sum_index = sum(index[:20])
return "{0}{1}_{2}.{3}".format(cache_dir,
model_id,
sum_index,
suffix)
##def saving_fit(learner, X, y, index):
## import os
## pkl_file = "{0}_{1}_{2}.pkl".format(learner.id, min(index), max(index))
## try:
## learner = joblib.load(pkl_file)
## print("**** learner is loaded from {0} ****".format(pkl_file))
## except IOError:
## learner.fit(X, y)
## joblib.dump(learner, pkl_file)
## return learner
def KmeansWrapper(true_k, data, load=False):
from sklearn.externals import joblib
    modelName = 'doc_cluster.%s.pkl' % true_k
if load:
km = joblib.load(modelName)
labels = km.labels_
else:
km = KMeans(n_clusters=true_k,
init='k-means++',
# max_iter=1000,
n_init=10,
n_jobs=-1,
random_state=0,
verbose=0)
km.fit_predict(data)
labels = km.labels_
joblib.dump(km, modelName)
return labels, km.cluster_centers_
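A usage sketch; the random data is only a stand-in for the real document vectors. The first call fits and persists the model, a later call with load=True reuses the pickled estimator:

import numpy as np

data = np.random.rand(100, 8)                                    # stand-in for real feature vectors
labels, centers = KmeansWrapper(true_k=5, data=data)             # fits and dumps doc_cluster.5.pkl
labels, centers = KmeansWrapper(true_k=5, data=data, load=True)  # reloads the persisted model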
def init_state(indata, test=False):
close = indata['close'].values
diff = np.diff(close)
diff = np.insert(diff, 0, 0)
sma15 = SMA(indata, timeperiod=15)
sma60 = SMA(indata, timeperiod=60)
rsi = RSI(indata, timeperiod=14)
atr = ATR(indata, timeperiod=14)
#--- Preprocess data
xdata = np.column_stack((close, diff, sma15, close-sma15, sma15-sma60, rsi, atr))
xdata = np.nan_to_num(xdata)
    if not test:
        scaler = preprocessing.StandardScaler()
        xdata = np.expand_dims(scaler.fit_transform(xdata), axis=1)
        joblib.dump(scaler, 'data/scaler.pkl')
    else:
        scaler = joblib.load('data/scaler.pkl')
        # use transform (not fit_transform) so the scaling fitted on the training data is reused
        xdata = np.expand_dims(scaler.transform(xdata), axis=1)
state = xdata[0:1, 0:1, :]
return state, xdata, close
#Take Action
def persist_pipelines(pipelines):
Path('models').mkdir(exist_ok=True)
fp_fmt = 'models/{}-{:%y-%m-%d}.pkl'
now = dt.datetime.now()
for pipe in pipelines:
print(utils.pipeline_name(pipe))
fp_name = fp_fmt.format(utils.pipeline_name(pipe), now)
joblib.dump(pipe, fp_name)
# Pickle fails to work on RandomForestRegressor
# with open(fp_name, 'wb') as fp:
# pickle.dump(pipe, fp)
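A minimal loader counterpart; the function name is an assumption, and it only relies on the models/<name>-<date>.pkl layout written above:

from pathlib import Path
import joblib

def load_pipelines(models_dir='models'):
    # map each persisted file name to its unpickled pipeline
    return {fp.name: joblib.load(str(fp)) for fp in Path(models_dir).glob('*.pkl')}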
def _vectorize_chunk(dsid_dir, k, pars, pretend=False):
""" Extract features on a chunk of files """
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.externals import joblib
filenames = pars['filenames_abs']
chunk_size = pars['chunk_size']
n_samples = pars['n_samples']
mslice = slice(k*chunk_size, min((k+1)*chunk_size, n_samples))
hash_opts = {key: vals for key, vals in pars.items()
if key in ['stop_words', 'n_features',
'analyser', 'ngram_range']}
hash_opts['alternate_sign'] = False
fe = HashingVectorizer(input='content', norm=None, **hash_opts)
if pretend:
return fe
fset_new = fe.transform(_read_file(fname) for fname in filenames[mslice])
fset_new.eliminate_zeros()
joblib.dump(fset_new, str(dsid_dir / 'features-{:05}'.format(k)))
def dump_classifier(self):
"""
This function ...
:return:
"""
# Determine the path to the pickle file
classifier_path = os.path.join(self.classification_mode_path, "classifier.pkl")
# Inform the user
self.log.info("Writing the classifier to " + classifier_path)
# Serialize and dump the classifier
joblib.dump(self.vector_classifier, classifier_path)
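A hypothetical load_classifier counterpart, assuming the same attributes (classification_mode_path, log, vector_classifier) and the same os/joblib imports as the class above:

def load_classifier(self):
    # Determine the path to the pickle file written by dump_classifier
    classifier_path = os.path.join(self.classification_mode_path, "classifier.pkl")
    # Inform the user
    self.log.info("Reading the classifier from " + classifier_path)
    # Deserialize the classifier
    self.vector_classifier = joblib.load(classifier_path)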
# -----------------------------------------------------------------
def generate_LR_model(file_name):
train_df = read_from_file(file_name)
selected_train_df = train_df.filter(regex='label|connectionType_.*|telecomsOperator_.*|sitesetID_.*|positionType_.*|gender_.*|haveBaby_.*|age_scaled')
    train_np = selected_train_df.values  # as_matrix() is deprecated in newer pandas
    y = train_np[:, 0]
    X = train_np[:, 1:]
    print('Train Logistic Regression Model...')
    start_time = datetime.datetime.now()
    clf = linear_model.LogisticRegression(penalty='l2', C=1.0, solver='sag', n_jobs=-1, tol=1e-6, max_iter=200)  # class_weight='balanced'
    clf.fit(X, y)
    end_time = datetime.datetime.now()
    print('Training Done..., Time Cost: %d seconds' % (end_time - start_time).seconds)
    print('Save Model...')
    joblib.dump(clf, 'LR.model')
    return clf
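A hedged sketch of reusing the persisted model; the zero row is only a placeholder with the trained feature width, not real input data:

import numpy as np
import joblib

lr = joblib.load('LR.model')
X_new = np.zeros((1, lr.coef_.shape[1]))  # placeholder row; real input needs the same columns as training
print(lr.predict_proba(X_new)[:, 1])      # predicted conversion probability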
def test():
    iris = load_iris()
    # print(iris)
    # print(iris['target'].shape)
    gbdt = GradientBoostingRegressor(n_estimators=1000, max_depth=4)
    gbdt.fit(iris.data[:120], iris.target[:120])
    # Save the GBDT model
    joblib.dump(gbdt, 'GBDT.model')
    predict = gbdt.predict(iris.data[:120])
    total_err = 0
    for i in range(len(predict)):
        print(predict[i], iris.target[i])
        err = predict[i] - iris.target[i]
        total_err += err * err
    print('Training Error: %f' % (total_err / len(predict)))
    pred = gbdt.predict(iris.data[120:])
    error = 0
    for i in range(len(pred)):
        print(pred[i], iris.target[i + 120])
        err = pred[i] - iris.target[i + 120]
        error += err * err
    print('Test Error: %f' % (error / len(pred)))
def generate_GBDT_model(file_name):
train_df = read_from_file(file_name)
    # feature 18
    selected_train_df = train_df.filter(regex='label|creativeID|positionID|connectionType|telecomsOperator|adID|camgaignID|advertiserID|appID|appPlatform|sitesetID|positionType|age|gender|education|marriageStatus|haveBaby|hometown|residence')
    train_np = selected_train_df.values  # as_matrix() is deprecated in newer pandas
    y = train_np[:, 0]
    X = train_np[:, 1:]
    print('Train Gradient Boosting Regression Model...')
    start_time = datetime.datetime.now()
    gbdt = GradientBoostingRegressor(n_estimators=120, max_depth=10)  # class_weight='balanced'
    gbdt.fit(X, y)
    end_time = datetime.datetime.now()
    print('Training Done..., Time Cost: %d seconds' % (end_time - start_time).seconds)
    print('Save Model...')
    joblib.dump(gbdt, 'GBDT.model')
    return gbdt
def generate_XGB_model(train_df):
train_df.drop(['conversionTime'], axis=1, inplace=True)
    print('Train And Fix Missing App Count Value...')
    train_df, xgb_appcount = train_model_for_appcounts(train_df)
    joblib.dump(xgb_appcount, 'XGB_missing.model')
    # print('Train And Fix Missing Age Value...')
    # train_df, xgb_age = train_model_for_age(train_df)
    # joblib.dump(xgb_age, 'XGB_age.model')
    train_df.drop(['marriageStatus', 'haveBaby', 'sitesetID', 'positionType'], axis=1, inplace=True)
    print('Done')
    print(train_df.info())
    print(train_df.describe())
    print(train_df.isnull().sum())
    train_np = train_df.values  # as_matrix() is deprecated in newer pandas
    y = train_np[:, 0]
    X = train_np[:, 1:]
    print('Train Xgboost Model...')
    start_time = datetime.datetime.now()
    xgb_clf = XGBRegressor(n_estimators=100, max_depth=6, objective="binary:logistic", silent=False)
    xgb_clf.fit(X, y)
    end_time = datetime.datetime.now()
    print('Training Done..., Time Cost: %d' % (end_time - start_time).seconds)
    model_df = pd.DataFrame({'columns': list(train_df.columns)[1:], 'values': xgb_clf.feature_importances_})
    print(model_df)
    return xgb_clf
def xgb_model_select(train_file_name):
train_df = merge_features_to_use(train_file_name)
train_df.drop(['conversionTime'], axis=1, inplace=True)
    print('Train And Fix Missing App Count Value...')
    train_df, xgb_appcount = train_model_for_appcounts(train_df)
    joblib.dump(xgb_appcount, 'XGB_missing.model')
    print(train_df.info())
    print(train_df.describe())
    print(train_df.isnull().sum())
    train_np = train_df.values  # as_matrix() is deprecated in newer pandas
    y = train_np[:, 0]
    X = train_np[:, 1:]
    print('Select Model...')
    start_time = datetime.datetime.now()
    xgb_clf = xgb.XGBRegressor()
    parameters = {'n_estimators': [120, 100, 140], 'max_depth': [3, 5, 7, 9], 'gamma': [0.1, 0.3, 0.5, 0.7], 'min_child_weight': [1, 3, 5, 7]}
    grid_search = GridSearchCV(estimator=xgb_clf, param_grid=parameters, cv=10, n_jobs=-1)
    print("parameters:")
    pprint.pprint(parameters)
    grid_search.fit(X, y)
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    end_time = datetime.datetime.now()
    print('Select Done..., Time Cost: %d' % (end_time - start_time).seconds)
def generate_RF_model(file_name):
train_df = read_from_file(file_name)
selected_train_df = train_df.filter(regex='label|creativeID|positionID|connectionType|telecomsOperator|adID|camgaignID|advertiserID|appID|appPlatform|sitesetID|positionType|age|gender|education|marriageStatus|haveBaby|hometown|residence')
    train_np = selected_train_df.values  # as_matrix() is deprecated in newer pandas
    y = train_np[:, 0]
    X = train_np[:, 1:]
    print('Train Random Forest Regression Model...')
    start_time = datetime.datetime.now()
    rf = RandomForestRegressor(n_estimators=25, n_jobs=-1)  # class_weight='balanced'
    rf.fit(X, y)
    end_time = datetime.datetime.now()
    print('Training Done..., Time Cost: %d seconds' % (end_time - start_time).seconds)
    print('Save Model...')
    joblib.dump(rf, 'RF.model')
    return rf
def load_data(test=False):
fname = FTEST if test else FTRAIN
df = pd.read_csv(fname)
cols = df.columns[:-1]
df['Image'] = df['Image'].apply(lambda im: np.fromstring(im, sep=' ') / 255.0)
df = df.dropna()
X = np.vstack(df['Image'])
X = X.reshape(-1, IMAGE_SIZE, IMAGE_SIZE, 1)
if not test:
# y = (df[cols].values -48) / 48.0
y = df[cols].values / 96.0
X, y = shuffle(X, y)
joblib.dump(cols, 'data/cols.pkl', compress=3)
else:
y = None
return X, y
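The saved column names can be reloaded to label a prediction row; the helper below is a sketch (not in the original) and assumes the /96.0 target scaling used above:

import joblib

cols = joblib.load('data/cols.pkl')

def prediction_to_dict(pred_row):
    # undo the /96.0 scaling applied to the targets during training
    return dict(zip(cols, pred_row * 96.0))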
def Dump(model,fnameMODEL,fnameWeight):
if str(type(model)).find("sklearn.")==-1:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import SGD
json_string = model.to_json()
fm = open(fnameMODEL+".json","w")
fm.write(json_string)
fm.close()
model.save_weights(fnameWeight+".hdf5",overwrite=True)
else:
from sklearn.externals import joblib
def ensure_dir(f):
d = os.path.dirname(f)
if not os.path.exists(d):
os.makedirs(d)
ensure_dir('./skmodel/')
joblib.dump(model, "./skmodel/"+fnameMODEL+".pkl",compress=3)
def train_svms():
if not os.path.isfile('models/fine_tune.model.index'):
print('models/fine_tune.model doesn\'t exist.')
return
net = create_alexnet()
model = tflearn.DNN(net)
model.load('models/fine_tune.model')
train_file_dir = 'svm_train/'
flist = os.listdir(train_file_dir)
svms = []
for train_file in flist:
if "pkl" in train_file:
continue
X, Y = generate_single_svm_train_data(train_file_dir + train_file)
train_features = []
for i in X:
feats = model.predict([i])
train_features.append(feats[0])
print("feature dimension of fitting: {}".format(np.shape(train_features)))
clf = svm.LinearSVC()
clf.fit(train_features, Y)
svms.append(clf)
joblib.dump(svms, 'models/train_svm.model')
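The list of per-class SVMs can be restored in one call; a brief sketch, assuming the same models/train_svm.model path:

import joblib

svms = joblib.load('models/train_svm.model')  # list of fitted LinearSVC objects
# each element scores a CNN feature vector, e.g. svms[0].decision_function([feat])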
def train(self, training_data, trees=100,rf_out=None):
# Use CNN to extract features
self.cnn.set_intermediate(self.feature_layer)
features = self.extract_features(training_data)
# Create random forest
self.rf = RandomForestClassifier(n_estimators=trees, class_weight='balanced_subsample')
X_train = features['y_pred'] # inputs to train the random forest
y_train = np.asarray(features['y_true']) # ground truth for random forest
print "Training RF..."
self.rf.fit(X_train, y_train)
if rf_out:
joblib.dump(self.rf, rf_out)
return self.rf, X_train, y_train
def train_model(data, with_mac=True):
global without_mac_clf, mac_clf
df = pd.DataFrame.from_dict(data)
y = df.pop("location")
    features = [f for f in df.columns if f != 'mac']  # 'is not' compared object identity, not string equality
df = df.rename(columns=dict(zip(features, [POWER_SLAVE_PREFIX + f for f in features])))
model_name = MODEL_MAC_NAME if with_mac else MODEL_NAME
if with_mac:
df = df.apply(LabelEncoder().fit_transform)
else:
df.drop("mac", axis=1, inplace=True)
clf = DecisionTreeClassifier()
clf.fit(df, y)
joblib.dump(clf, model_name)
if with_mac and mac_clf is None:
mac_clf = clf
if not with_mac and without_mac_clf is None:
without_mac_clf = clf
export_graphviz(clf, feature_names=list(df.columns), class_names=y.unique(), filled=True, rounded=True, out_file='model.dot')
os.system("dot -Tpng model.dot -o model.png")
def trainClassifier(foldername,classifierName):
model = cv2.ml.KNearest_create()
features = []
labels = []
os.chdir(foldername)
for filename in glob.iglob('*.png'):
features.append(cv2.imread((filename),-1))
labels.append(filename[0])
list_hog_fd = []
for feature in features:
fd = hog(feature.reshape((27, 35)), orientations=9, pixels_per_cell=(9, 7), cells_per_block=(1, 1), visualise=False)
list_hog_fd.append(fd)
hog_features = np.array(list_hog_fd, 'float64')
os.chdir("..")
clf = LinearSVC()
clf.fit(hog_features, labels)
joblib.dump(clf,classifierName, compress=3)
os.chdir("..")
def learn(fName, features, nRows=-1):
    with open('bin/train.bin', 'rb') as f:  # np.load needs a binary-mode file handle
        train = np.load(f)
    x = np.mat(train[:nRows, timbreVector[features[0]]]).reshape(nRows, 1)
    y = np.mat(train[:nRows, timbreVector[features[1]]]).reshape(nRows, 1)
    z = np.mat(train[:nRows, timbreVector[features[2]]]).reshape(nRows, 1)
    X = np.concatenate((x, y, z), axis=1)
    Y = train[:nRows, 0] % minYear
    clf = svm.SVC(verbose=3)
    clf.fit(X, Y)
    print("[SUCCESS] Fitted training data to SVM (kernel: rbf).")
    print("[STARTED] Dumping classifier.")
    joblib.dump(clf, 'bin/%s' % fName)
    print("[SUCCESS] Dumped to", fName)
def train(self, training_set, training_target, fea_index):
clf = tree.DecisionTreeClassifier(criterion="entropy", min_samples_split=30, class_weight="balanced")
clf = clf.fit(training_set, training_target)
class_names = np.unique([str(i) for i in training_target])
feature_names = [attr_list[i] for i in fea_index]
dot_data = tree.export_graphviz(clf, out_file=None,
feature_names=feature_names,
class_names=class_names,
filled=True, rounded=True,
special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf("output/tree-vis.pdf")
joblib.dump(clf, 'output/CART.pkl')
def saveDebugStateAtBatch(self, name, batchID, LPchunk=None, SS=None,
SSchunk=None, hmodel=None,
Dchunk=None):
if self.outputParams['debugBatch'] == batchID:
debugLap = self.outputParams['debugLap']
debugLapBuffer = self.outputParams['debugLapBuffer']
if self.lapFrac < 1:
joblib.dump(dict(Dchunk=Dchunk),
os.path.join(self.task_output_path, 'Debug-Data.dump'))
belowWindow = self.lapFrac < debugLap - debugLapBuffer
aboveWindow = self.lapFrac > debugLap + debugLapBuffer
if belowWindow or aboveWindow:
return
filename = 'DebugLap%04.0f-%s.dump' % (np.ceil(self.lapFrac), name)
SaveVars = dict(LP=LPchunk, SS=SS, hmodel=hmodel,
SSchunk=SSchunk,
lapFrac=self.lapFrac)
joblib.dump(SaveVars, os.path.join(self.task_output_path, filename))
if self.lapFrac < 1:
joblib.dump(dict(Dchunk=Dchunk),
os.path.join(self.task_output_path, 'Debug-Data.dump'))
Source file: 6_PSO+PCA.py, project: SVM-classification-localization, author: HandsomeHans
def pca(dataMat,n):
print "Start to do PCA..."
newData,meanVal=zeroMean(dataMat)
# covMat=np.cov(newData,rowvar=0)
# eigVals,eigVects=np.linalg.eig(np.mat(covMat))
# joblib.dump(eigVals,'./features/PCA/eigVals_train_%s.eig' %m,compress=3)
# joblib.dump(eigVects,'./features/PCA/eigVects_train_%s.eig' %m,compress=3)
eigVals = joblib.load('./features/PCA/eigVals_train_%s.eig' %m)
eigVects = joblib.load('./features/PCA/eigVects_train_%s.eig' %m)
eigValIndice=np.argsort(eigVals)
n_eigValIndice=eigValIndice[-1:-(n+1):-1]
n_eigVect=eigVects[:,n_eigValIndice]
# joblib.dump(n_eigVect,'./features/PCA/n_eigVects_train_%s_%s.eig' %(m,n))
lowDDataMat=newData*n_eigVect
return lowDDataMat
Source file: 4_Train_PCA+SVM.py, project: SVM-classification-localization, author: HandsomeHans
def pca(dataMat,n):
print "Start to do PCA..."
t1 = time.time()
newData,meanVal=zeroMean(dataMat)
covMat=np.cov(newData,rowvar=0)
eigVals,eigVects=np.linalg.eig(np.mat(covMat)) # calculate feature value and feature vector
joblib.dump(eigVals,'./features/PCA/%s/eigVals_train_%s.eig' %(m,m),compress=3)
joblib.dump(eigVects,'./features/PCA/%s/eigVects_train_%s.eig' %(m,m),compress=3)
# eigVals = joblib.load('./features/PCA/%s/eigVals_train_%s.eig' %(m,m))
# eigVects = joblib.load('./features/PCA/%s/eigVects_train_%s.eig' %(m,m))
eigValIndice=np.argsort(eigVals) # sort feature value
n_eigValIndice=eigValIndice[-1:-(n+1):-1] # take n feature value
n_eigVect=eigVects[:,n_eigValIndice] # take n feature vector
joblib.dump(n_eigVect,'./features/PCA/%s/n_eigVects_train_%s_%s.eig' %(m,m,n))
lowDDataMat=newData*n_eigVect # calculate low dimention data
# reconMat=(lowDDataMat*n_eigVect.T)+meanVal
t2 = time.time()
print "PCA takes %f seconds" %(t2-t1)
return lowDDataMat
Source file: 1_HoG_extract_feature.py, project: SVM-classification-localization, author: HandsomeHans
def getFeat(Data, mode):  # extract and save the HoG feature values
    num = 0
    for data in Data:
        image = np.reshape(data[0], (200, 200, 3))
        gray = rgb2gray(image) / 255.0  # convert the image to grayscale
        fd = hog(gray, orientations, pixels_per_cell, cells_per_block, block_norm, visualize, normalize)
        fd = np.concatenate((fd, data[1]))  # append the label at the end of the array
        filename = list(data[2])
        fd_name = filename[0].split('.')[0] + '.feat'  # set the file name
        if mode == 'train':
            fd_path = os.path.join('./features/train/', fd_name)
        else:
            fd_path = os.path.join('./features/test/', fd_name)
        joblib.dump(fd, fd_path, compress=3)  # save the feature vector locally
        num += 1
        print("%d saving: %s." % (num, fd_name))
Source file: train_novelty_detection.py, project: keras-transfer-learning-for-oxford102, author: Arsey
def train_logistic():
df = pd.read_csv(config.activations_path)
df, y, classes = encode(df)
X_train, X_test, y_train, y_test = train_test_split(df.values, y, test_size=0.2, random_state=17)
params = {'C': [10, 2, .9, .4, .1], 'tol': [0.0001, 0.001, 0.0005]}
log_reg = LogisticRegression(solver='lbfgs', multi_class='multinomial', class_weight='balanced')
clf = GridSearchCV(log_reg, params, scoring='neg_log_loss', refit=True, cv=3, n_jobs=-1)
clf.fit(X_train, y_train)
print("best params: " + str(clf.best_params_))
print("Accuracy: ", accuracy_score(y_test, clf.predict(X_test)))
setattr(clf, '__classes', classes)
# save results for further using
joblib.dump(clf, config.get_novelty_detection_model_path())
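A short, hedged sketch of reusing the dumped grid search; activations is a placeholder for a new feature matrix with the same columns as the training activations, and config is the same module referenced above:

import joblib

clf = joblib.load(config.get_novelty_detection_model_path())
classes = getattr(clf, '__classes')             # class labels attached before dumping
probabilities = clf.predict_proba(activations)  # activations: placeholder feature matrix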