def generate_LR_model(file_name, model_path='LR.model'):
    """Train a logistic-regression model from a feature file and save it.

    The frame returned by ``read_from_file(file_name)`` is narrowed to the
    columns matched by the regex below (``label`` plus the encoded feature
    columns). Column 0 of the resulting matrix is used as the target and the
    remaining columns as features.

    Args:
        file_name: Passed straight to ``read_from_file``.
        model_path: Destination handed to ``joblib.dump``. Defaults to
            ``'LR.model'``, the path that was previously hard-coded.

    Returns:
        The fitted ``linear_model.LogisticRegression`` instance.
    """
    train_df = read_from_file(file_name)
    selected_train_df = train_df.filter(regex='label|connectionType_.*|telecomsOperator_.*|sitesetID_.*|positionType_.*|gender_.*|haveBaby_.*|age_scaled')
    # DataFrame.as_matrix() was deprecated and then removed from pandas;
    # .values returns the same ndarray on both old and new releases.
    train_np = selected_train_df.values
    # NOTE(review): relies on ``label`` being the first selected column -
    # confirm against the layout produced by read_from_file.
    y = train_np[:, 0]
    X = train_np[:, 1:]
    print('Train Logistic Regression Model...')
    start_time = datetime.datetime.now()
    clf = linear_model.LogisticRegression(penalty='l2', C=1.0, solver='sag', n_jobs=-1, tol=1e-6, max_iter=200)  # , class_weight='balanced')
    clf.fit(X, y)
    end_time = datetime.datetime.now()
    print('Training Done..., Time Cost: ')
    # timedelta.seconds silently discards whole days; total_seconds() does not.
    print(int((end_time - start_time).total_seconds()))
    print('Save Model...')
    joblib.dump(clf, model_path)
    return clf
python类LogisticRegression()的实例源码
def get_classifier(method='logistic_regression'):
    """Build an unfitted classifier for the requested method name.

    Args:
        method: One of ``'logistic_regression'``, ``'random_forest'`` or
            ``'gradient_boosting'``.

    Returns:
        A freshly constructed estimator, or ``None`` when ``method`` is not
        one of the recognised names.
    """
    if method == 'logistic_regression':
        lr_params = dict(C=1e3, tol=0.01, multi_class='ovr',
                         solver='liblinear', n_jobs=-1, random_state=123)
        return LogisticRegression(**lr_params)
    elif method == 'random_forest':
        rf_params = dict(n_estimators=250, bootstrap=False,
                         n_jobs=-1, random_state=123)
        return RandomForestClassifier(**rf_params)
    elif method == 'gradient_boosting':
        xgb_params = dict(max_depth=10, subsample=0.7, n_estimators=500,
                          min_child_weight=0.05, colsample_bytree=0.3,
                          learning_rate=0.1)
        return xgb.XGBClassifier(**xgb_params)
    # Unrecognised names fall through without raising.
    return None
AIserver.py 文件源码
项目:Using-machine-learning-to-detect-malicious-URLs
作者: faizann24
项目源码
文件源码
阅读 22
收藏 0
点赞 0
评论 0
def TL():
    """Train a TF-IDF + logistic-regression classifier on ``./data/data.csv``.

    Column 0 of each CSV row is used as the URL text and column 1 as its
    label. Rows are shuffled, vectorised with ``TfidfVectorizer`` using the
    custom ``getTokens`` tokenizer, split 80/20, and a ``LogisticRegression``
    is fitted on the training part. The held-out accuracy is printed.

    Returns:
        tuple: ``(vectorizer, lgs)`` - the fitted vectorizer and classifier.
    """
    allurls = './data/data.csv'  # path to the file with all labelled URLs
    try:
        # Newer pandas: malformed rows are skipped via ``on_bad_lines``.
        allurlscsv = pd.read_csv(allurls, sep=',', on_bad_lines='skip')
    except TypeError:
        # Older pandas does not know ``on_bad_lines``; use the old keyword.
        allurlscsv = pd.read_csv(allurls, sep=',', error_bad_lines=False)
    # read_csv already returns a DataFrame, so convert straight to an array.
    allurlsdata = np.array(allurlscsv)
    # random.shuffle() swaps rows through ndarray views, which duplicates
    # some rows and loses others on a 2-D array; np.random.shuffle permutes
    # the rows correctly in place.
    np.random.shuffle(allurlsdata)
    y = [row[1] for row in allurlsdata]  # all labels
    corpus = [row[0] for row in allurlsdata]  # URL text matching each label
    vectorizer = TfidfVectorizer(tokenizer=getTokens)  # custom URL tokenizer
    X = vectorizer.fit_transform(corpus)
    # 80/20 train/test split with a fixed seed
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    lgs = LogisticRegression()
    lgs.fit(X_train, y_train)
    # Accuracy on the held-out 20% (the original author reported about 98%).
    print(lgs.score(X_test, y_test))
    return vectorizer, lgs
def test_mdr_sklearn_pipeline():
    """Check that MDR works as a transformer step in a scikit-learn pipeline."""
    # 15 two-feature samples; the first ten belong to class 1, the rest to 0.
    rows = [
        [2, 0], [0, 0], [0, 1], [0, 0], [0, 0],
        [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
        [0, 0], [0, 0], [0, 0], [1, 1], [1, 1],
    ]
    features = np.array(rows)
    classes = np.array([1] * 10 + [0] * 5)

    pipeline = make_pipeline(MDR(), LogisticRegression())
    splitter = StratifiedKFold(n_splits=5, shuffle=True)
    cv_scores = cross_val_score(pipeline, features, classes, cv=splitter)

    # Only requires that cross-validation ran and scored above zero on average.
    assert np.mean(cv_scores) > 0.
def test_mdr_sklearn_pipeline_parallel():
    """Check that MDR works as a transformer when cross-validation runs with n_jobs=-1."""
    # 15 two-feature samples; the first ten belong to class 1, the rest to 0.
    rows = [
        [2, 0], [0, 0], [0, 1], [0, 0], [0, 0],
        [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
        [0, 0], [0, 0], [0, 0], [1, 1], [1, 1],
    ]
    features = np.array(rows)
    classes = np.array([1] * 10 + [0] * 5)

    pipeline = make_pipeline(MDR(), LogisticRegression())
    splitter = StratifiedKFold(n_splits=5, shuffle=True)
    cv_scores = cross_val_score(pipeline, features, classes, cv=splitter, n_jobs=-1)

    # Only requires that cross-validation ran and scored above zero on average.
    assert np.mean(cv_scores) > 0.
def fitAndPredict(self):
    """Fit an SVC on the training data and predict once per test user.

    The fitted classifier is stored on ``self.classifier``.

    Returns:
        dict: for every key ``user`` of ``self.testDict``, the result of
        ``predict`` on the single-feature sample ``[[self.BDS[user]]]``.
    """
    # The logistic-regression and decision-tree experiments (with their
    # classification_report printouts) were disabled in the original code;
    # only the SVM path is active.
    self.classifier = SVC()
    self.classifier.fit(self.trainingSet, self.trainingLabel)
    return {
        user: self.classifier.predict([[self.BDS[user]]])
        for user in self.testDict
    }
def fitAndPredict(self):
    """Classify test documents from bag-of-words features and print reports.

    A gensim dictionary is built over the training and test documents
    together, each document is turned into a bag-of-words vector, and the
    vectors are densified into one matrix (optionally reduced with PCA when
    the module-level ``PCA_Applied`` flag is set). A LogisticRegression and
    then an SVC are fitted on the training rows, and a classification report
    for the test rows is printed for each. Nothing is returned.
    """
    n_train = len(self.trainingSet)
    documents = self.trainingSet + self.testSet
    dictionary = corpora.Dictionary(documents)
    bows = [dictionary.doc2bow(doc) for doc in documents]
    # corpus2dense yields terms x documents; transpose to documents x terms.
    text_matrix = gensim.matutils.corpus2dense(bows, num_terms=len(dictionary.token2id)).T
    if PCA_Applied:
        # NOTE(review): PCA is fitted on training and test rows together.
        text_matrix = PCA(n_components=PCA_nComponents).fit_transform(text_matrix)

    def report(title, classifier):
        # Train on the leading rows, evaluate on the trailing (test) rows.
        classifier.fit(text_matrix[0:n_train], self.trainingLabel)
        pred_labels = classifier.predict(text_matrix[n_train:])
        print(title)
        print(classification_report(self.testLabel, pred_labels))

    report('Logistic:', LogisticRegression())
    report('SVM:', SVC())
def fitAndPredict(self):
    """Classify test documents from TF-IDF features and print reports.

    A gensim dictionary and TF-IDF model are built over the training and
    test documents together; the weighted vectors are densified into one
    matrix (optionally reduced with PCA when the module-level ``PCA_Applied``
    flag is set). A LogisticRegression and then an SVC are fitted on the
    training rows, and a classification report for the test rows is printed
    for each. Nothing is returned.
    """
    n_train = len(self.trainingSet)
    documents = self.trainingSet + self.testSet
    dictionary = corpora.Dictionary(documents)
    bows = [dictionary.doc2bow(doc) for doc in documents]
    model = models.TfidfModel(bows)
    weighted = list(model[bows])
    # corpus2dense yields terms x documents; transpose to documents x terms.
    text_matrix = gensim.matutils.corpus2dense(weighted, num_terms=len(dictionary.token2id)).T
    if PCA_Applied:
        # NOTE(review): PCA is fitted on training and test rows together.
        text_matrix = PCA(n_components=PCA_nComponents).fit_transform(text_matrix)

    def report(title, classifier):
        # Train on the leading rows, evaluate on the trailing (test) rows.
        classifier.fit(text_matrix[0:n_train], self.trainingLabel)
        pred_labels = classifier.predict(text_matrix[n_train:])
        print(title)
        print(classification_report(self.testLabel, pred_labels))

    report('Logistic:', LogisticRegression())
    report('SVM:', SVC())
def fitAndPredict(self):
    """Fit an SVC on the training data and predict once per test user.

    Returns:
        dict: for every key ``user`` of ``self.testDict``, the result of
        ``predict`` on the three-feature sample built from ``self.MUD``,
        ``self.RUD`` and ``self.QUD`` for that user.
    """
    # The logistic-regression and decision-tree experiments (with their
    # classification_report printouts) were disabled in the original code;
    # only the SVM path is active.
    svm = SVC()
    svm.fit(self.trainingSet, self.trainingLabel)
    return {
        user: svm.predict([[self.MUD[user], self.RUD[user], self.QUD[user]]])
        for user in self.testDict
    }
def cv_reg_lr(trX, trY, vaX, vaY, Cs=(0.01, 0.05, 0.1, 0.5, 1., 5., 10., 50., 100.)):
    """Pick the regularisation strength with the best validation accuracy.

    One ``LR(C=C)`` model is fitted on the training data for every value in
    ``Cs``; training and validation accuracies are printed per value, then
    a summary line for the winner.

    Args:
        trX, trY: Training features and labels.
        vaX, vaY: Validation features and labels.
        Cs: Iterable of ``C`` values to try. The default is an immutable
            tuple holding the same grid as before.

    Returns:
        The fitted model with the highest validation accuracy (the first
        one on ties, as ``np.argmax`` returns the first maximum).

    Raises:
        ValueError: If ``Cs`` is empty.
    """
    # Materialise once so one-shot iterables can still be indexed below.
    Cs = list(Cs)
    if not Cs:
        raise ValueError('Cs must contain at least one value')

    tr_accs = []
    va_accs = []
    models = []
    for C in Cs:
        model = LR(C=C)
        model.fit(trX, trY)
        tr_acc = metrics.accuracy_score(trY, model.predict(trX))
        va_acc = metrics.accuracy_score(vaY, model.predict(vaX))
        print('%.4f %.4f %.4f' % (C, tr_acc, va_acc))
        tr_accs.append(tr_acc)
        va_accs.append(va_acc)
        models.append(model)

    best = int(np.argmax(va_accs))
    print('best model C: %.4f tr_acc: %.4f va_acc: %.4f' % (Cs[best], tr_accs[best], va_accs[best]))
    return models[best]
def prepare_fit_model_for_factors(model_type, x_train, y_train):
    """
    Fit a default linear model of the requested type on the training data.

    Args:
        model_type (str): 'classification' (LogisticRegression) or
            'regression' (LinearRegression).
        x_train: Training features passed to ``fit``.
        y_train: Training targets passed to ``fit``.

    Returns:
        The fitted estimator, or None when ``model_type`` is neither of the
        two supported values.
    """
    if model_type == 'classification':
        estimator = LogisticRegression()
    elif model_type == 'regression':
        estimator = LinearRegression()
    else:
        # Unknown type: nothing to fit.
        return None

    estimator.fit(x_train, y_train)
    return estimator
def __init__(self, info, verbose=True, debug_mode=False):
    """Select and configure the underlying model from a dataset description.

    Args:
        info (dict): Dataset description. Keys read here: ``label_num``,
            ``target_num``, ``task``, ``metric``, ``is_sparse`` and, for
            non-regression tasks, ``has_categorical``.
        verbose: Forwarded as ``verbose`` to the scikit-learn estimators.
        debug_mode: When ``>= 2`` a ``RandomPredictor`` is installed and no
            real model is built.

    Sets ``self.name``, ``self.model`` and ``self.predict_method`` (plus
    ``self.postprocessor`` and the values copied from ``info``).
    """
    self.label_num = info['label_num']
    self.target_num = info['target_num']
    self.task = info['task']
    self.metric = info['metric']
    # To calibrate proba. The balance=True variant was disabled in the original:
    # self.postprocessor = MultiLabelEnsemble(LogisticRegression(), balance=True)
    self.postprocessor = MultiLabelEnsemble(LogisticRegression(), balance=False)
    if debug_mode >= 2:
        self.name = "RandomPredictor"
        self.model = RandomPredictor(self.target_num)
        self.predict_method = self.model.predict_proba
        return
    if info['task'] == 'regression':
        # The exact ``== True`` comparison is kept from the original.
        if info['is_sparse'] == True:  # noqa: E712
            self.name = "BaggingRidgeRegressor"
            self.model = BaggingRegressor(base_estimator=Ridge(), n_estimators=1, verbose=verbose)  # unfortunately, no warm start...
        else:
            self.name = "GradientBoostingRegressor"
            self.model = GradientBoostingRegressor(n_estimators=1, max_depth=4, min_samples_split=14, verbose=verbose, warm_start=True)
        # Regressors expose plain predictions, not probabilities.
        self.predict_method = self.model.predict
    else:
        if info['has_categorical']:  # Out of laziness, categorical variables are not converted...
            self.name = "RandomForestClassifier"
            self.model = RandomForestClassifier(n_estimators=1, verbose=verbose)  # unfortunately, no warm start...
        elif info['is_sparse']:
            self.name = "BaggingNBClassifier"
            self.model = BaggingClassifier(base_estimator=BernoulliNB(), n_estimators=1, verbose=verbose)  # unfortunately, no warm start...
        else:
            self.name = "GradientBoostingClassifier"
            # Direct construction replaces eval() of a string built from
            # self.name and str(verbose); same estimator, no code evaluation.
            self.model = GradientBoostingClassifier(n_estimators=1, verbose=verbose, random_state=1, warm_start=True)
        if info['task'] == 'multilabel.classification':
            self.model = MultiLabelEnsemble(self.model)
        # Classifiers always predict probabilities.
        self.predict_method = self.model.predict_proba
def run_predict_logistic_regression(X_train, Y_train, X_test, Y_test):
    """Fit a default LogisticRegression, print its train/test scores, return it.

    The printed values are whatever ``zero_one_score`` returns for the
    training and test predictions.

    Returns:
        The object returned by ``LogisticRegression().fit(X_train, Y_train)``.
    """
    model = LogisticRegression().fit(X_train, Y_train)
    # Same evaluation order as before: test prediction, train score, test score.
    test_pred = model.predict(X_test)
    train_score = zero_one_score(Y_train, model.predict(X_train))
    test_score = zero_one_score(Y_test, test_pred)
    print('Logistic 0-1 error. \n Training: ', train_score,
          '\n Test:', test_score)
    return model
def run_predict_logistic_regression(X_train, Y_train, X_test, Y_test):
    """Fit a default LogisticRegression, print its train/test scores, return it.

    The printed values are whatever ``zero_one_score`` returns for the
    training and test predictions.

    Returns:
        The object returned by ``LogisticRegression().fit(X_train, Y_train)``.
    """
    model = LogisticRegression().fit(X_train, Y_train)
    # Same evaluation order as before: test prediction, train score, test score.
    test_pred = model.predict(X_test)
    train_score = zero_one_score(Y_train, model.predict(X_train))
    test_score = zero_one_score(Y_test, test_pred)
    print('Logistic 0-1 error. \n Training: ', train_score,
          '\n Test:', test_score)
    return model
def run_predict_logistic_regression(X_train, Y_train, X_test, Y_test):
    """Fit a default LogisticRegression, print its train/test scores, return it.

    The printed values are whatever ``zero_one_score`` returns for the
    training and test predictions.

    Returns:
        The object returned by ``LogisticRegression().fit(X_train, Y_train)``.
    """
    model = LogisticRegression().fit(X_train, Y_train)
    # Same evaluation order as before: test prediction, train score, test score.
    test_pred = model.predict(X_test)
    train_score = zero_one_score(Y_train, model.predict(X_train))
    test_score = zero_one_score(Y_test, test_pred)
    print('Logistic 0-1 error. \n Training: ', train_score,
          '\n Test:', test_score)
    return model
def run_predict_logistic_regression(X_train, Y_train, X_test, Y_test):
    """Fit a default LogisticRegression, print its train/test scores, return it.

    The printed values are whatever ``zero_one_score`` returns for the
    training and test predictions.

    Returns:
        The object returned by ``LogisticRegression().fit(X_train, Y_train)``.
    """
    model = LogisticRegression().fit(X_train, Y_train)
    # Same evaluation order as before: test prediction, train score, test score.
    test_pred = model.predict(X_test)
    train_score = zero_one_score(Y_train, model.predict(X_train))
    test_score = zero_one_score(Y_test, test_pred)
    print('Logistic 0-1 error. \n Training: ', train_score,
          '\n Test:', test_score)
    return model
def run_predict_logistic_regression(X_train, Y_train, X_test, Y_test):
    """Fit a default LogisticRegression, print its train/test scores, return it.

    The printed values are whatever ``zero_one_score`` returns for the
    training and test predictions.

    Returns:
        The object returned by ``LogisticRegression().fit(X_train, Y_train)``.
    """
    model = LogisticRegression().fit(X_train, Y_train)
    # Same evaluation order as before: test prediction, train score, test score.
    test_pred = model.predict(X_test)
    train_score = zero_one_score(Y_train, model.predict(X_train))
    test_score = zero_one_score(Y_test, test_pred)
    print('Logistic 0-1 error. \n Training: ', train_score,
          '\n Test:', test_score)
    return model
def train_clf(self, trainfiles):
    """Train the token classifier on the given files and store it on ``self.clf``.

    Args:
        trainfiles: Iterable of training files; each is read through
            ``yield_tokens_labels``, which yields ``(tokens, labels)`` pairs
            (a list of words and the list of corresponding labels).
    """
    all_labels = []
    doc_matrices = []
    # Features are built document by document because they use local context.
    for path in trainfiles:
        for tokens, labels in yield_tokens_labels(path):
            all_labels.extend(labels)
            doc_matrices.append(self.make_featmat_rep(tokens))
    # Stack the per-document feature matrices into one training matrix.
    featmat = np.vstack(doc_matrices)
    print("training classifier")
    model = logreg(class_weight='balanced', random_state=1)
    model.fit(featmat, all_labels)
    self.clf = model
def classify(train=None, test=None, data=None, res_dir="res/", disp=True, outfilename=None):
    """Fit the configured classifiers and return predictions for the test set.

    Features and ground truths come from ``data`` when it is given (keys
    ``train_features``, ``train_groundtruths``, ``test_features`` and
    ``test_groundtruths``); otherwise they are read from the ``train`` and
    ``test`` files with ``read_file``.

    ``disp`` and ``outfilename`` are accepted but not used here.

    Returns:
        The predictions of the last classifier in the ``classifiers`` dict
        (currently only a RandomForest is enabled).
    """
    utils.print_success("Comparison of differents classifiers")
    if data is None:
        train = utils.abs_path_file(train)
        test = utils.abs_path_file(test)
        train_features, train_groundtruths = read_file(train)
        test_features, test_groundtruths = read_file(test)
    else:
        train_features = data["train_features"]
        train_groundtruths = data["train_groundtruths"]
        test_features = data["test_features"]
        test_groundtruths = data["test_groundtruths"]

    created = utils.create_dir(res_dir)
    if not created:
        res_dir = utils.abs_path_dir(res_dir)

    classifiers = {
        "RandomForest": RandomForestClassifier(n_jobs=-1)
        # Alternatives that were disabled in the original:
        # "RandomForest": RandomForestClassifier(n_estimators=5),
        # "KNeighbors":KNeighborsClassifier(3),
        # "GaussianProcess":GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
        # "DecisionTree":DecisionTreeClassifier(max_depth=5),
        # "MLP":MLPClassifier(),
        # "AdaBoost":AdaBoostClassifier(),
        # "GaussianNB":GaussianNB(),
        # "QDA":QuadraticDiscriminantAnalysis(),
        # "SVM":SVC(kernel="linear", C=0.025),
        # "GradientBoosting":GradientBoostingClassifier(),
        # "ExtraTrees":ExtraTreesClassifier(),
        # "LogisticRegression":LogisticRegression(),
        # "LinearDiscriminantAnalysis":LinearDiscriminantAnalysis()
    }
    for name, clf in classifiers.items():
        utils.print_success(name)
        utils.print_info("\tFit")
        clf.fit(train_features, train_groundtruths)
        utils.print_info("\tPredict")
        predictions = clf.predict(test_features)
    return predictions
def reloadHelper():
    """Reload the module bound to the name ``helper``."""
    try:
        # Python 3.4+: reload() lives in importlib, not in builtins.
        from importlib import reload as reload_module
    except ImportError:
        # Python 2: fall back to the builtin reload().
        reload_module = reload
    reload_module(helper)
# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html