def run_statsmodels_models(train, test, model_description):
    """
    Run logistic regression model to predict whether a signed up driver ever actually drove.

    The model is fitted on ``train`` and scored on ``test``. As a side effect the
    value counts of the ``city_name`` and ``signup_channel`` training columns, the
    model summary and the resulting AUC are printed to stdout.

    :param train: Observations used to fit the model; must contain the columns
        referenced by ``model_description`` as well as ``city_name`` and
        ``signup_channel``
    :type train: pd.DataFrame
    :param test: Held-out observations used to compute the AUC
    :type test: pd.DataFrame
    :param model_description: Model formula handed to ``dmatrices`` to build the
        outcome and design matrices
    :return: AUC of the fitted model on ``test``
    :rtype: float
    """
    # Pass values as logging arguments so the message is only formatted when the
    # corresponding level is actually enabled.
    logging.info('Running model w/ description: %s', model_description)
    logging.debug('Train df: \n%s', train.describe())
    logging.debug('Test df: \n%s', test.describe())

    # Use dmatrices to format data; rows with missing values are dropped.
    # NOTE(review): the train and test design matrices are built independently,
    # so a categorical level present in only one of the two frames would yield
    # different columns -- confirm both frames share the same levels.
    y_train, X_train = dmatrices(model_description, data=train, return_type='dataframe', NA_action='drop')
    y_test, X_test = dmatrices(model_description, data=test, return_type='dataframe', NA_action='drop')

    # Create, fit model
    mod = sm.Logit(endog=y_train, exog=X_train)
    res = mod.fit(method='bfgs', maxiter=100)

    # Output model summary (single-argument print works on Python 2 and 3)
    print(train['city_name'].value_counts())
    print(train['signup_channel'].value_counts())
    print(res.summary())

    # Create, output AUC
    predicted = res.predict(X_test)
    auc = roc_auc_score(y_true=y_test, y_score=predicted)
    print('AUC for 20%% holdout: %s' % auc)

    # Return AUC for model generated
    return auc
# Main section
python类Logit()的实例源码
def compute(self, method='logistic'):
    """
    Estimate propensity scores from ``self.covariates`` and ``self.treatment``.

    Parameters
    ----------
    method : str
        Propensity score estimation method. Either 'logistic' or 'probit'

    Returns
    -------
    The value of ``predict()`` called without arguments on the fitted model.

    Raises
    ------
    ValueError
        If ``method`` is neither 'logistic' nor 'probit'.
    """
    # The constant column is appended after the covariates (prepend=False).
    design = sm.add_constant(self.covariates, prepend=False)

    # Pick the estimator class first so the fit call is written only once.
    if method == 'logistic':
        estimator_cls = sm.Logit
    elif method == 'probit':
        estimator_cls = sm.Probit
    else:
        raise ValueError('Unrecognized method')

    fitted = estimator_cls(self.treatment, design).fit(disp=False, warn_convergence=True)
    return fitted.predict()
business_case_solver_without_classes.py 文件源码
项目:themarketingtechnologist
作者: thomhopmans
项目源码
文件源码
阅读 25
收藏 0
点赞 0
评论 0
def run_logistic_regression(df):
    """Fit a logit of ``is_conversion`` on ``pageviews_cumsum`` plus a constant.

    The fitted model's summary is printed, and the fitted results object is
    returned to the caller.
    """
    # add_constant supplies the intercept column for the design matrix.
    design = sm.add_constant(df['pageviews_cumsum'])
    outcome = df['is_conversion']
    fitted = sm.Logit(outcome, design).fit()
    print(fitted.summary())
    return fitted
business_case_solver.py 文件源码
项目:themarketingtechnologist
作者: thomhopmans
项目源码
文件源码
阅读 22
收藏 0
点赞 0
评论 0
def run_logistic_regression(self):
    """Fit a logit of ``is_conversion`` on ``pageviews_cumsum`` plus a constant.

    Both columns are read from ``self.df``. The fitted results are stored on
    ``self.logistic_regression_results`` and their summary is printed; nothing
    is returned.
    """
    # add_constant supplies the intercept column for the design matrix.
    X = sm.add_constant(self.df['pageviews_cumsum'])
    y = self.df['is_conversion']
    logit = sm.Logit(y, X)
    self.logistic_regression_results = logit.fit()
    # Single-argument print call behaves the same on Python 2 and Python 3.
    print(self.logistic_regression_results.summary())
def fit(self):
    """Fit a statsmodels logit of ``self.y`` on ``self.X`` plus an intercept.

    The fitted results are stored on ``self.model`` and their summary is
    printed; nothing is returned. ``self.X`` itself is left unmodified.
    """
    # Work on a copy so the added column does not leak into self.X.
    X = self.X.copy()
    # The intercept is supplied explicitly as a column of ones.
    X['intercept'] = 1
    logit = sm.Logit(self.y, X)
    self.model = logit.fit()
    # Single-argument print call behaves the same on Python 2 and Python 3.
    print(self.model.summary())
def sm_logit(self, Xtrain, ytrain, Xtest, ytest):
    """Fit a regularized statsmodels logit and report train/test performance.

    The model is fitted on ``Xtrain``/``ytrain`` with ``fit_regularized``
    (``alpha=10``). For the train split and then the test split, the method
    prints the accuracy returned by ``self.calculate_accuracy`` (scaled by 100)
    and the confusion matrix of the thresholded predictions, with labels
    ordered ``[1.0, 0.0]``. Nothing is returned.

    :param Xtrain: Training design matrix
    :param ytrain: Training labels
    :param Xtest: Test design matrix
    :param ytest: Test labels
    """
    # Local import keeps this block self-contained.
    import numpy as np

    sm_results = sm.Logit(ytrain, Xtrain).fit_regularized(alpha=10, disp=False)
    print(sm_results.summary())

    for split_name, features, labels in (('train', Xtrain, ytrain), ('test', Xtest, ytest)):
        probabilities = sm_results.predict(features)

        # calculate_accuracy receives the raw predictions, exactly as before.
        accuracy = self.calculate_accuracy(probabilities, labels)
        print('%s accuracy:  %s' % (split_name, accuracy * 100))

        # Threshold positionally on an array: indexing a pandas Series with
        # range(len(...)) is label-based and breaks on a non-default index.
        # ">= 0.5" keeps the round-half-up result the original round() gave.
        predicted_labels = np.where(np.asarray(probabilities) >= 0.5, 1.0, 0.0)
        conf_matrix = confusion_matrix(labels, predicted_labels, labels=[1.0, 0.0])
        print('%s confusion matrix: %s' % (split_name, conf_matrix))
def get_data(f_path="../dataset/logistic_regression/UCLA_dataset.csv"):
    """Load the admissions CSV, print summaries and evaluate two classifiers.

    The function prints the head, descriptive statistics, standard deviations
    and an ``admit`` x ``rank`` cross-tabulation of the data. It then holds out
    20% of the rows and prints the test accuracy of a ``LogisticRegression``
    followed by the confusion matrix, accuracy and recall of a
    ``RandomForestClassifier``. Nothing is returned.

    :param f_path: Path of the CSV file to read; defaults to the location that
        was previously hard-coded
    """
    df = pd.read_csv(f_path)
    print(df.head())
    print(df.describe())
    print(df.std())
    print(pd.crosstab(df['admit'], df['rank'], rownames=['admit']))

    # The split is not seeded, so results vary from run to run.
    train, test = train_test_split(df, test_size=0.2)

    # Every column except the first is used as a feature.
    # NOTE(review): presumably 'admit' is the first column, otherwise the
    # target would leak into the features -- TODO confirm against the CSV.
    feature_cols = df.columns[1:]
    train_x, train_y = train[feature_cols], train['admit']
    test_x, test_y = test[feature_cols], test['admit']

    lr = LogisticRegression()
    lr.fit(train_x, train_y)
    y_pred = lr.predict(test_x)
    print(accuracy_score(test_y, y_pred))

    rf = RandomForestClassifier(n_jobs=4)
    rf.fit(train_x, train_y)
    rf_pred = rf.predict(test_x)
    cnf_matrix = confusion_matrix(test_y, rf_pred)
    print(cnf_matrix)

    # accuracy_score and recall_score return fractions in [0, 1]; scale them
    # so the value matches the '%' suffix that is printed after it.
    accuracy_percent = accuracy_score(test_y, rf_pred) * 100
    print("accuracy is: %s%s" % (accuracy_percent, '%'))
    recall_percent = recall_score(test_y, rf_pred) * 100
    print("recall is: %s%s" % (recall_percent, '%'))