def get_feature_importance(self, clf, model_name):
    """Return the fitted classifier's learned feature weights.

    :param clf: a fitted scikit-learn estimator
    :param model_name: key identifying the estimator type (see table below)
    :return: list of importances (tree models), nested list of coefficients
             (linear models), or None for models that expose neither.
    :raises KeyError: if model_name is not a supported model (unchanged
             behavior from the original).
    """
    clfs = {'RandomForestClassifier': 'feature_importances',
            'ExtraTreesClassifier': 'feature_importances',
            'AdaBoostClassifier': 'feature_importances',
            'LogisticRegression': 'coef',
            'svm.SVC': 'coef',
            'GradientBoostingClassifier': 'feature_importances',
            'GaussianNB': None,
            'DecisionTreeClassifier': 'feature_importances',
            'SGDClassifier': 'coef',
            'KNeighborsClassifier': None,
            'linear.SVC': 'coef'}
    # Look the attribute kind up once instead of twice per call.
    attr = clfs[model_name]
    if attr == 'feature_importances':
        return list(clf.feature_importances_)
    elif attr == 'coef':
        # tolist() already returns a plain list; the original wrapped it in
        # a redundant list() call.
        return clf.coef_.tolist()
    else:
        return None
Python GaussianNB() usage examples
def get_classifier_class(class_name):
    """Map a short algorithm name to its scikit-learn classifier class.

    :param class_name: one of the keys in the table below (e.g. 'svm')
    :return: the classifier class (not an instance)
    :raises ValueError: if class_name is not a supported algorithm.

    Fix: the error message now reports the offending name so callers can
    see what they passed instead of a bare 'No such classifier'.
    """
    name_table = {
        'svm': SVC,
        'k_neighbors': KNeighborsClassifier,
        'gaussian_process': GaussianProcessClassifier,
        'decision_tree': DecisionTreeClassifier,
        'random_forest': RandomForestClassifier,
        'ada_boost': AdaBoostClassifier,
        'mlp': MLPClassifier,
        'gaussian_naive_bayes': GaussianNB,
        'quadratic_discriminant_analysis': QuadraticDiscriminantAnalysis
    }
    if class_name not in name_table:
        raise ValueError('No such classifier: {0!r}'.format(class_name))
    return name_table[class_name]
classify.py source file
Project: oss-github-analysis-project
Author: itu-oss-project-team
Project source code
File source code
Views: 23
Favorites: 0
Likes: 0
Comments: 0
def __create_classifiers(self):
    """Build the list of candidate classifiers to compare.

    :return: list of {"func": estimator, "name": short_name} dicts,
             in the order they should be evaluated.
    """
    knn = neighbors.KNeighborsClassifier
    specs = [
        (linear_model.SGDClassifier(loss="log"), "sgd"),
        (knn(1, weights='distance'), "knn1"),
        (knn(3, weights='distance'), "knn3"),
        (knn(5, weights='distance'), "knn5"),
        (GaussianNB(), "naive_bayes"),
    ]
    # Disabled alternatives kept from the original for reference:
    # tree.DecisionTreeClassifier() ("decision_tree"),
    # MLPClassifier(max_iter=10000) ("mlp"),
    # RandomForestClassifier() ("random_forest")
    return [{"func": estimator, "name": name} for estimator, name in specs]
def define_model(self, model, parameters, n_cores=0):
    """Instantiate one of the supported classifiers and apply `parameters`.

    :param model: key naming the classifier (e.g. 'RandomForestClassifier')
    :param parameters: dict of hyper-parameters forwarded to set_params
    :param n_cores: worker count for parallel-capable models; 0 (the
        default) keeps the original hard-coded value of 7, so existing
        callers see identical behavior.
    :return: the configured (unfitted) estimator
    :raises ConfigError: if `model` is not a supported key.

    Fix: `n_cores` was accepted but silently ignored; it is now honored
    when a positive value is passed.
    """
    n_jobs = n_cores if n_cores > 0 else 7
    clfs = {'RandomForestClassifier': RandomForestClassifier(n_estimators=50, n_jobs=n_jobs),
            'ExtraTreesClassifier': ExtraTreesClassifier(n_estimators=10, n_jobs=n_jobs, criterion='entropy'),
            'AdaBoostClassifier': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=200),
            'LogisticRegression': LogisticRegression(penalty='l1', C=1e5),
            'svm.SVC': svm.SVC(kernel='linear', probability=True, random_state=0),
            'GradientBoostingClassifier': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=10),
            'GaussianNB': GaussianNB(),
            'DecisionTreeClassifier': DecisionTreeClassifier(),
            'SGDClassifier': SGDClassifier(loss="hinge", penalty="l2", n_jobs=n_jobs),
            'KNeighborsClassifier': KNeighborsClassifier(n_neighbors=3),
            'linear.SVC': svm.LinearSVC()}
    if model not in clfs:
        raise ConfigError("Unsupported model {}".format(model))
    # Caller-supplied hyper-parameters override the defaults above.
    clf = clfs[model]
    clf.set_params(**parameters)
    return clf
sentiment.py source file
Project: Twitter-and-IMDB-Sentimental-Analytics
Author: abhinandanramesh
Project source code
File source code
Views: 28
Favorites: 0
Likes: 0
Comments: 0
def build_models_DOC(train_pos_vec, train_neg_vec):
    """
    Returns a GaussianNB and LogisticRegression model that are fit to the
    training data (positive vectors first, then negative vectors).
    """
    # Labels mirror the concatenation order of the feature vectors.
    labels = ["pos"] * len(train_pos_vec) + ["neg"] * len(train_neg_vec)
    features = list(train_pos_vec) + list(train_neg_vec)
    # For LogisticRegression, pass no parameters (per the assignment spec).
    nb_model = GaussianNB().fit(features, labels)
    lr_model = LogisticRegression().fit(features, labels)
    return nb_model, lr_model
def learns(tests, trains, indep=lambda x: x[:-1],
           dep=lambda x: x[-1],
           rf=None, lg=None, dt=None, nb=None):
    """Train four classifiers (random forest, logistic regression,
    Gaussian NB, decision tree) and feed each test prediction into the
    corresponding Abcd scorer.

    :param tests, trains: datasets passed through to trainTest
    :param indep, dep: row -> features / row -> label extractors
    :param rf, lg, dt, nb: optional pre-built Abcd scorers; a fresh one is
        created per call when omitted.

    Bug fix: the scorers previously defaulted to `Abcd()` instances
    evaluated once at definition time (mutable default arguments), so
    repeated calls silently accumulated counts into the same objects.
    """
    rf = Abcd() if rf is None else rf
    lg = Abcd() if lg is None else lg
    dt = Abcd() if dt is None else dt
    nb = Abcd() if nb is None else nb
    x1, y1, x2, y2 = trainTest(tests, trains, indep, dep)
    forest = RandomForestClassifier(n_estimators=50)
    forest = forest.fit(x1, y1)
    for n, got in enumerate(forest.predict(x2)):
        rf(predicted=got, actual=y2[n])
    logreg = linear_model.LogisticRegression(C=1e5)
    logreg.fit(x1, y1)
    for n, got in enumerate(logreg.predict(x2)):
        lg(predicted=got, actual=y2[n])
    bayes = GaussianNB()
    bayes.fit(x1, y1)
    for n, got in enumerate(bayes.predict(x2)):
        nb(predicted=got, actual=y2[n])
    dectree = DecisionTreeClassifier(criterion="entropy",
                                     random_state=1)
    dectree.fit(x1, y1)
    for n, got in enumerate(dectree.predict(x2)):
        dt(predicted=got, actual=y2[n])
def main():
    """Script entry point: train a hand-rolled naive Bayes model
    (per-class summaries) and compare its predictions against
    scikit-learn's GaussianNB on the same train/test split."""
    args = get_args()
    # load and split data
    dataset, target = load_dataset(args.file)
    train_x, train_y, test_x, actual = split_dataset(
        dataset, target, args.split)
    print("Training set size: %d, Testing set size: %d" %
          (len(train_x), len(test_x)))
    # prepare model -- presumably per-class mean/stdev summaries; confirm
    # against summarize_by_class, which is defined elsewhere.
    summaries = summarize_by_class(train_x, train_y)
    # test model
    predictions = get_predictions(summaries, test_x)
    display(actual, predictions)
    # using scikit: same data, library implementation, for comparison
    gnb = GaussianNB()
    y_pred = gnb.fit(train_x, train_y).predict(test_x)
    display(actual, y_pred)
def classification_gaussian_nb(self):
    """Classify hashed text features with Gaussian NB and write the
    predictions and probabilities to <output_dir>/gaussian_nb_out/."""
    self.signals.PrintInfo.emit("Gaussian NB")
    output_dir = self.output_dir + 'gaussian_nb_out/'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    # Hash the raw documents into a sparse feature matrix.
    vectorizer = HashingVectorizer()
    fdata = vectorizer.fit_transform(self.fdata)
    # self.split is the index separating training rows from test rows.
    trainingSet = fdata[:self.split]
    testSet = fdata[self.split:]
    classificator = GaussianNB()
    # GaussianNB requires dense input, hence toarray() on the sparse slices.
    classificator.fit(trainingSet.toarray(), self.trainingClass)
    results = classificator.predict(testSet.toarray())
    proba = classificator.predict_proba(testSet.toarray())
    self.write_results_to_file(output_dir + 'results.csv', results, proba, classificator.classes_, self.test_filenames)
    out_text = self.compile_result_string(results, proba, classificator.classes_, self.test_filenames)
    self.signals.PrintInfo.emit(out_text)
def train():
with open('./bin/train.bin', 'rb') as f:
ds = pickle.load(f)
XTrain, yTrain = ds['X'], ds['y']
del ds
with open('./bin/validation.bin', 'rb') as f:
ds = pickle.load(f)
XValidation, yValidation = ds['X'], ds['y']
del ds
clf = GaussianNB()
clf.fit(XTrain, yTrain)
print "Training Set Length:", XTrain.shape
print "Test Set Length:", XValidation.shape
print "Test Scores:", clf.score(XValidation, yValidation)
with open('./bin/gnbClf.bin', 'wb') as f:
pickle.dump(clf, f)
print "[SUCCESS] Saved classifier as `gnbClf.bin`"
def Fit(self, bags, bagData):
    """Train 10 (BernoulliNB, GaussianNB) pairs, one per held-out fold.

    bagData appears to be 10 folds of (instances, labels) pairs; fold i's
    models are trained on the other nine folds -- TODO confirm against the
    caller. NOTE(review): the `bags` parameter is unused here.
    Results are stored in self.Bayes / self.GBayes (index = held-out fold).
    """
    self.Bayes, self.GBayes = [], []
    for i in xrange(10):  # i = fold held out of training
        bnb = BernoulliNB()
        gnb = GaussianNB()
        # x: Bernoulli-encoded features, xg: Gaussian-encoded features,
        # y: labels shared by both models.
        x, y, xg = [], [], []
        for j in xrange(10):
            if i != j:
                for vv in xrange(len(bagData[j][0])):
                    x.append(self.Convert(bagData[j][0][vv]))
                    xg.append(self.ConvertGauss(bagData[j][0][vv]))
                y.extend(bagData[j][1])
        bnb.fit(x, y)
        gnb.fit(xg, y)
        self.Bayes.append(bnb)
        self.GBayes.append(gnb)
def test_GaussianNB(*data):
    '''
    Fit scikit-learn's Gaussian naive Bayes and print train/test accuracy.
    :param data: train_data, test_data, train_value, test_value
    :return: None
    '''
    X_train, X_test, y_train, y_test = data
    model = naive_bayes.GaussianNB()
    model.fit(X_train, y_train)
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)
    print('Training Score: {0}'.format(train_score))
    print('Testing Score: {0}'.format(test_score))
def GaussianNBPredictModel(localTrainLabel, config):
train = pd.read_csv('../feature/trainQlist.csv', header = 0, sep = ",")
test = pd.read_csv('../feature/testQlist.csv', header = 0, sep = ",")
print "Train tf-idf vector Model..."
encode = TfidfVectorizer(decode_error = 'ignore', norm = "l2", binary = False, sublinear_tf = True, min_df = 50)
localTrainFeature = encode.fit_transform(train['qlist'].values)
localTestFeature = encode.transform(train['qlist'].values)
print localTrainFeature.shape, localTestFeature.shape
print 'train...'
model = GaussianNB()
model.fit(X = localTrainFeature.toarray(), y = localTrainLabel)
print 'predict...'
if config['prob'] == False:
return model.predict(localTestFeature.toarray()), test['uid'].values
else:
return model.predict_log_proba(localTestFeature.toarray()), test['uid'].values
#-- Multinomial Navie Bayes corss validation model frame
def test_discretenb_pickle():
    # Test picklability of discrete naive Bayes classifiers
    for cls in [BernoulliNB, MultinomialNB, GaussianNB]:
        clf = cls().fit(X2, y2)
        y_pred = clf.predict(X2)
        # Round-trip the fitted estimator through pickle and verify the
        # restored model predicts identically.
        store = BytesIO()
        pickle.dump(clf, store)
        clf = pickle.load(BytesIO(store.getvalue()))
        assert_array_equal(y_pred, clf.predict(X2))
        if cls is not GaussianNB:
            # TODO re-enable me when partial_fit is implemented for GaussianNB
            # Test pickling of estimator trained with partial_fit
            # (two incremental batches must match the full-fit predictions).
            clf2 = cls().partial_fit(X2[:3], y2[:3], classes=np.unique(y2))
            clf2.partial_fit(X2[3:], y2[3:])
            store = BytesIO()
            pickle.dump(clf2, store)
            clf2 = pickle.load(BytesIO(store.getvalue()))
            assert_array_equal(y_pred, clf2.predict(X2))
def train_classifier_listing(self):
    """Train the listing-price classifier from JSON files stored in B2,
    upload the serialized model back to B2, and return its accuracy on
    the training set itself (resubstitution score)."""
    self.clfListing = GaussianNB()
    files = self.b2s.ls('data/training')
    X = np.zeros((len(files), self.numFeat))
    Y = np.zeros(len(files))
    for i, file in enumerate(files):
        f = file['fileName']
        # read json into feature vector
        if not f.endswith('.json'):
            # NOTE(review): skipped files leave an all-zero row in X/Y that
            # still participates in training -- confirm this is intended.
            continue
        textJson = self.b2s.download(f)
        listing = json.loads(textJson)
        X[i] = self.bundle_json_obj(listing)
        # Price bucketed in $50 steps; max(..., 10) FLOORS every label at 10,
        # so all prices under $500 collapse to the same class.
        # NOTE(review): looks like min() may have been intended -- confirm.
        Y[i] = max(int(listing['price'] / 50), 10)
    self.clfListing.fit(X, Y)
    # Serialize via joblib to a temp file, then upload the bytes to B2.
    temp = tempfile.NamedTemporaryFile()
    joblib.dump(self.clfListing, temp.name)
    self.b2s.upload('classifiers/nb_listing.pkl',
                    temp.read(), 'application/octet-stream')
    return self.clfListing.score(X, Y)
# train a classifier on description
def classify(train=None, test=None, data=None, res_dir="res/", disp=True, outfilename=None):
    """Description of compare
    compare multiple classifier and display the best one

    Features/groundtruths come either from `data` (a dict with
    train/test features and groundtruths) or from the `train`/`test`
    file paths. Only RandomForest is currently enabled; the other
    candidates are kept commented out below.
    NOTE(review): `disp` and `outfilename` are unused in this version,
    and `res_dir` is prepared but never written to here.
    """
    utils.print_success("Comparison of differents classifiers")
    if data is not None:
        # Pre-split data supplied directly by the caller.
        train_features = data["train_features"]
        train_groundtruths = data["train_groundtruths"]
        test_features = data["test_features"]
        test_groundtruths = data["test_groundtruths"]
    else:
        # Otherwise read features/groundtruths from the given file paths.
        train = utils.abs_path_file(train)
        test = utils.abs_path_file(test)
        train_features, train_groundtruths = read_file(train)
        test_features, test_groundtruths = read_file(test)
    if not utils.create_dir(res_dir):
        res_dir = utils.abs_path_dir(res_dir)
    classifiers = {
        "RandomForest": RandomForestClassifier(n_jobs=-1)
        # "RandomForest": RandomForestClassifier(n_estimators=5),
        # "KNeighbors":KNeighborsClassifier(3),
        # "GaussianProcess":GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
        # "DecisionTree":DecisionTreeClassifier(max_depth=5),
        # "MLP":MLPClassifier(),
        # "AdaBoost":AdaBoostClassifier(),
        # "GaussianNB":GaussianNB(),
        # "QDA":QuadraticDiscriminantAnalysis(),
        # "SVM":SVC(kernel="linear", C=0.025),
        # "GradientBoosting":GradientBoostingClassifier(),
        # "ExtraTrees":ExtraTreesClassifier(),
        # "LogisticRegression":LogisticRegression(),
        # "LinearDiscriminantAnalysis":LinearDiscriminantAnalysis()
    }
    for key in classifiers:
        utils.print_success(key)
        clf = classifiers[key]
        utils.print_info("\tFit")
        clf.fit(train_features, train_groundtruths)
        utils.print_info("\tPredict")
        predictions = clf.predict(test_features)
    # Returns the predictions of the last classifier in the dict
    # (currently the only one).
    return predictions
def getEstimator(es):
    """Return a scikit-learn estimator selected by es.ml_algorithm
    (case-insensitive).

    :param es: settings object with ml_algorithm, random_seed and (for
        SVM) svmKernel attributes
    :return: the configured estimator, or None (after printing a
        warning) for unrecognized algorithm names -- unchanged behavior.

    Fix: the warning message omitted the supported 'Random' option.
    """
    estimator = None
    algo = es.ml_algorithm.upper()
    if algo == 'NAIVEBAYESGAUSSIAN':
        estimator = naive_bayes.GaussianNB()
    elif algo == 'SVM':
        estimator = svm.SVC(kernel=es.svmKernel, degree=3, C=0.1, random_state=es.random_seed)
    elif algo == 'RF':
        estimator = RandomForestClassifier(n_estimators=100, random_state=es.random_seed)
    elif algo == 'DECISIONTREE':
        estimator = DecisionTreeClassifier(random_state=es.random_seed)
    elif algo == 'RANDOM':
        # Baseline: random guessing via a dummy classifier.
        estimator = DummyClassifier(random_state=es.random_seed)
    else:
        print("Please enter correct estimator (NaiveBayesGaussian/SVM/RF/DecisionTree/Random)")
        # TODO: add regression?
    return estimator
def test_smoke():
    """Smoke test: the lazy (dask-style) GaussianNB `a` must agree with
    the eager reference implementation `b` on all fitted statistics and
    all prediction outputs. `X`/`y` are lazy collections; `X_`/`y2`-style
    materialized counterparts come from module scope."""
    a = nb.GaussianNB()
    b = nb_.GaussianNB()
    a.fit(X, y)
    # The reference model trains on materialized (computed) arrays.
    b.fit(X.compute(), y.compute())
    # Fitted statistics must match element-wise.
    assert_eq(a.class_prior_.compute(), b.class_prior_)
    assert_eq(a.class_count_.compute(), b.class_count_)
    assert_eq(a.theta_.compute(), b.theta_)
    assert_eq(a.sigma_.compute(), b.sigma_)
    # All three prediction surfaces must match as well.
    assert_eq(a.predict_proba(X).compute(), b.predict_proba(X_))
    assert_eq(a.predict(X).compute(), b.predict(X_))
    assert_eq(a.predict_log_proba(X).compute(), b.predict_log_proba(X_))
def build_naive_bayes_model(x_train, y_train):
    """Fit and return a Gaussian naive Bayes classifier.

    y_train is ravel()ed because scikit-learn expects a 1-D label array.
    """
    model = GaussianNB()
    model.fit(x_train, y_train.ravel())
    return model
def createPipeline(self):
    """Assemble the standardize-then-GaussianNB pipeline and store it on
    self.pipeline."""
    steps = [
        ('scaler', StandardScaler()),
        ('model', naive_bayes.GaussianNB()),
    ]
    self.pipeline = Pipeline(steps)
test.py source file
Project: Audio-classification-using-Bag-of-Frames-approach
Author: amogh3892
Project source code
File source code
Views: 19
Favorites: 0
Likes: 0
Comments: 0
def naive_bayes_predict(training_samples, training_labels, test_samples, test_lables):
    """Train GaussianNB, time fit/predict, and write a metrics report plus
    the predicted labels under Temp\\.

    (The parameter name `test_lables` [sic] is kept for interface
    compatibility with existing callers.)

    Fix: the report previously wrote the misspelled header "Alogirthm";
    corrected to "Algorithm".
    """
    from sklearn.naive_bayes import GaussianNB
    clf = GaussianNB()
    t0 = time()
    clf.fit(training_samples, training_labels)
    training_time = round(time() - t0, 3)
    t0 = time()
    pred = clf.predict(test_samples)
    test_time = round(time() - t0, 3)
    from sklearn.metrics import accuracy_score
    acc = accuracy_score(pred, test_lables)
    no_features = np.array(training_samples).shape[1]
    # The sample containers are rebound to their row counts for reporting.
    training_samples = np.array(training_samples).shape[0]
    test_samples = np.array(test_samples).shape[0]
    with open("Temp\\results.txt", "w") as outfile:
        outfile.write("Algorithm : {}\n".format("Naive Bayes"))
        outfile.write("No of features : {}\n".format(no_features))
        outfile.write("No of training samples : {}\n".format(training_samples))
        outfile.write("No of test samples : {}\n".format(test_samples))
        outfile.write("Training time : {}\n".format(training_time))
        outfile.write("Test time : {}\n".format(test_time))
        outfile.write("Accuracy : {}\n".format(acc))
    with open("Temp\\result_labels.csv", "wb") as outfile:
        np.savetxt(outfile, pred)
imdb_success_predictor.py source file
Project: Movie-Success-Predictor
Author: Blueteak
Project source code
File source code
Views: 27
Favorites: 0
Likes: 0
Comments: 0
def main():
    """Compare SGD (log loss), GaussianNB and a small RandomForest on
    movie-success prediction, using pre-release features first and then
    post-release features (Python 2 module)."""
    # before_release
    movie_info_before_release = load_movie_info_before_release()
    print '***Before release***'
    X = create_input(movie_info_before_release)
    Y = create_output_before_release(movie_info_before_release)
    clf = linear_model.SGDClassifier(loss='log')
    test_classifier(clf, X, Y, 'before_release')
    clf = GaussianNB()
    test_classifier(clf, X, Y, 'before_release')
    clf = RandomForestClassifier(n_estimators=10, max_depth=10)
    test_classifier(clf, X, Y, 'before_release')
    # After release: same three models, richer post-release features.
    movie_info = load_movie_info()
    print '***After release***'
    X = create_input(movie_info)
    Y = create_output(movie_info)
    clf = linear_model.SGDClassifier(loss='log')
    test_classifier(clf, X, Y, 'after_release')
    clf = GaussianNB()
    test_classifier(clf, X, Y, 'after_release')
    clf = RandomForestClassifier(n_estimators=10, max_depth=10)
    test_classifier(clf, X, Y, 'after_release')
model.py source file
Project: 5th_place_solution_facebook_check_ins
Author: aikinogard
Project source code
File source code
Views: 22
Favorites: 0
Likes: 0
Comments: 0
def nb_xyat_weight1(df_cell_train_feats, y_train, df_cell_test_feats):
    """Gaussian NB over (x, y, hour, weekday, log10 accuracy), with each
    training sample weighted by time squared. Returns the test-set class
    probabilities."""
    def prepare_feats(df):
        # Select the model's feature columns in a fixed order; accuracy is
        # log-scaled.
        feats = pd.DataFrame()
        feats["x"] = df["x"]
        feats["y"] = df["y"]
        feats["hour"] = df["hour"]
        feats["weekday"] = df["weekday"]
        feats["accuracy"] = df["accuracy"].apply(np.log10)
        return feats
    logging.info("train nb_xyat_weight1 model")
    model = GaussianNB()
    sample_weight = df_cell_train_feats["time"] ** 2
    model.fit(prepare_feats(df_cell_train_feats), y_train, sample_weight)
    return model.predict_proba(prepare_feats(df_cell_test_feats))
def run_cat(filename, modelname, fileout, embeddings, new_run=True, run_parse=True,
            model_type='logreg', C=10.0,
            alpha=1.0, cutoff=0.50, n_iter=1):
    """Load transactions from `filename`, parse/classify them with a new or
    saved model, write the categorized rows to `fileout`, and persist the
    (possibly updated) model back to `modelname`.

    :raises NameError: for an unrecognized model_type on a new run.

    Fix: the saved-model file handle was opened for loading and never
    closed; both the load and save paths now use context managers.
    """
    # pull relevant data and run parsing and classification
    df = pd.read_csv(filename)
    if (len(df.columns) == 2):  # make sure columns have the right names
        df.columns = ['raw', 'amount']
    if new_run:  # initialize the model;
        if model_type == 'logreg':
            model = linear_model.SGDClassifier(loss='log', warm_start=True,
                                               n_iter=n_iter, alpha=alpha)
        elif model_type == 'passive-aggressive':
            model = linear_model.PassiveAggressiveClassifier(C=C, warm_start=True)
        elif model_type == 'naive-bayes':
            model = naive_bayes.GaussianNB()
        else:
            raise NameError('model_type must be logreg, passive-aggressive, or naive-bayes')
    else:  # load a saved, pre-trained model
        # NOTE: pickle.load is only safe on trusted model files.
        with open(modelname, 'rb') as modelFileLoad:
            model = pickle.load(modelFileLoad)
    fileCities = dirs.data_dir + 'cities_by_state.pickle'
    us_cities = pd.read_pickle(fileCities)
    df = cat_df(df, model, us_cities, embeddings, new_run, run_parse, cutoff=cutoff,
                model_type=model_type)
    df.to_csv(fileout, index=False)
    # Saving logistic regression model from training set 1
    with open(modelname, 'wb') as modelFileSave:
        pickle.dump(model, modelFileSave)
# ------ testing functions
def solve():
    """Train GaussianNB on the adult dataset via partial_fit and print the
    concatenated predictions for the test file.

    (The original inline comments were mojibake; rewritten in English.
    The runtime print string below is left byte-identical.)
    """
    # Load the training and test data (comma-separated, '#' = comment).
    training_arr = numpy.loadtxt('adult.txt', dtype=bytes, comments='#', delimiter=',')
    test_data = numpy.loadtxt('adult_test.txt', dtype=bytes, comments='#', delimiter=',')
    # Split rows into a feature list x and a label list y.
    # x_list = numpy.ndarray(len(training_arr))
    # y_list = numpy.ndarray(len(training_arr))
    # Column 12 holds the integer class label.
    y_list = [int(element[12]) for element in training_arr]
    # The first 12 columns are features; transform() converts each raw value.
    x_list = [[transform(x) for x in element[0:12]] for element in training_arr]
    # Apply the same per-value transformation to the test rows.
    test_data = [[transform(x) for x in element] for element in test_data]
    assert isinstance(y_list[0], int)  # labels are expected to be 0/1 ints
    assert len(x_list[0]) == 12  # each sample must carry 12 features
    assert len(test_data[0]) == 12  # test rows must match the feature count
    clf = GaussianNB()
    clf.partial_fit(x_list, y_list, numpy.unique(y_list))  # clf.fit(x_list, y_list) would work equally here
    res_arr = clf.predict(test_data)
    partial_fit_result = "".join([str(x) for x in res_arr])
    print("[*] ??????: {}".format(partial_fit_result))
def train(self, pd):
model = naive_bayes.GaussianNB()
model.fit(pd.data, pd.target)
print model
return model
def nb_experiment(scope_name, X, y):
    """Average GaussianNB accuracy over 50 random train/test splits read
    from pickled label files.

    NOTE(review): the source's indentation was lost; `return` is placed at
    function level here, which means only the results of the LAST `lp`
    iteration are averaged -- confirm the intended placement against the
    original project. `lp_cand` and `pk` come from module scope.
    """
    for lp in lp_cand:
        results = []
        for r in range(50):
            # Pickled dicts mapping sample index -> label for this split.
            with open('data/local/split/' + scope_name + '/lb' + str(lp).zfill(3) + '_' + str(r).zfill(
                    3) + '_train') as f:
                trainLabel = pk.load(f)
            with open('data/local/split/' + scope_name + '/lb' + str(lp).zfill(3) + '_' + str(r).zfill(
                    3) + '_test') as f:
                testLabel = pk.load(f)
            # Row-select by the split's sample indices (Python 2: .keys()
            # returns a list usable as an index array).
            XTrain = X[trainLabel.keys()]
            XTest = X[testLabel.keys()]
            # GaussianNB needs dense arrays; densify sparse matrices.
            if not isinstance(XTrain, np.ndarray):
                XTrain = XTrain.toarray()
                XTest = XTest.toarray()
            yTrain = y[trainLabel.keys()]
            yTest = y[testLabel.keys()]
            # train
            #clf = MultinomialNB()
            clf = GaussianNB()
            #clf = BernoulliNB()
            clf.fit(XTrain, yTrain)
            # test: fraction of correct predictions on this split
            pred = clf.predict(XTest)
            results.append(sum(pred == yTest) / float(yTest.shape[0]))
    return np.mean(results)
def MakeClassification(index, instancesData, classesData, instancesTest, type="proba", classifiersType="normal"):
    """Fit the index-th classifier on (instancesData, classesData) and
    return its predictions for instancesTest (Python 2 module).

    type: "proba" returns class probabilities, anything else hard labels.
    classifiersType: "ova" swaps in one-vs-all wrapped classifiers.
    Returns None (after printing an error) for an out-of-range index.
    NOTE(review): `type` shadows the builtin; kept for compatibility.
    """
    classifiers = [
        OneVsRestClassifier(sklearn.svm.SVC(probability=1), 4),
        DecisionTreeClassifier(random_state=0),
        KNeighborsClassifier(n_jobs=4),
        MLPClassifier(),
        sklearn.svm.SVC(probability=1, decision_function_shape="ovo"),
        OutputCodeClassifier(LinearSVC(random_state=0), code_size=2, random_state=0)
    ]
    if (classifiersType == "ova"):
        # One-vs-all variants (note: this list has 5 entries, one fewer
        # than the default list, so valid `index` values differ).
        classifiers = [
            OneVsRestClassifier(sklearn.svm.SVC(probability=1), 4),
            OneVsRestClassifier(DecisionTreeClassifier(random_state=0), 4),
            OneVsRestClassifier(KNeighborsClassifier(), 4),
            OneVsRestClassifier(MLPClassifier(), 4),
            OneVsRestClassifier(GaussianNB(), 4)
        ]
    if (index >= len(classifiers)):
        print "ERROR. The index is not valid."
        return None
    else:
        #print "Performing classification"
        if type == "proba":
            return classifiers[index].fit(instancesData, classesData).predict_proba(instancesTest)
        else:
            return classifiers[index].fit(instancesData, classesData).predict(instancesTest)
def __init__(self, training_data, training_target):
    """Store the training set and create the (unfitted) GaussianNB model;
    fitting happens elsewhere."""
    self.training_data = training_data
    self.training_target = training_target
    self.clf = GaussianNB()
def bayes_train(train_data, train_target):
model = GaussianNB()
model.fit(train_data, train_target)
expected = train_target
predicted = model.predict(train_data)
# summarize the fit of the model
print metrics.classification_report(expected, predicted)
print metrics.confusion_matrix(expected, predicted)
def NB(train_x, train_y, test_x, test_y):
    """Gaussian naive Bayes: fit on the training split, evaluate on the
    test split, and return the AUC. (Original docstring was mojibake.)"""
    model = GaussianNB()
    model.fit(train_x, train_y)
    # Probability of the positive class (column 1) drives the AUC.
    positive_probs = [row[1] for row in model.predict_proba(test_x)]
    predictions = model.predict(test_x)
    auc = evaluate_auc(positive_probs, test_y)
    evaluate(predictions, test_y)
    return auc