def run_statsmodels_models(train, test, model_description):
    """
    Fit a logistic regression on ``train`` and score it on ``test``.

    The model is intended to predict whether a signed up driver ever actually
    drove. Design matrices are built from ``model_description`` with patsy's
    ``dmatrices``; rows containing missing values are dropped
    (``NA_action='drop'``). The fitted model summary and the value counts of
    the ``city_name`` and ``signup_channel`` columns of ``train`` are printed
    to stdout, followed by the AUC on ``test``.

    :param train: Data used to fit the model. Must contain every column
        referenced by ``model_description`` as well as ``city_name`` and
        ``signup_channel`` (both are printed for diagnostics).
    :type train: pd.DataFrame
    :param test: Hold-out data used to compute the AUC. Must contain every
        column referenced by ``model_description``.
    :type test: pd.DataFrame
    :param model_description: Formula handed to ``dmatrices`` describing the
        response and the regressors.
    :type model_description: str
    :return: AUC of the fitted model on ``test``
    :rtype: float
    """
    # Lazy %-style arguments: the message is only rendered if the record is
    # actually emitted, and a tuple-valued description no longer breaks the
    # formatting.
    logging.info('Running model w/ description: %s', model_description)
    # describe() scans the whole frame, so only pay for it when DEBUG is on.
    if logging.getLogger().isEnabledFor(logging.DEBUG):
        logging.debug('Train df: \n%s', train.describe())
        logging.debug('Test df: \n%s', test.describe())

    # Use dmatrices to format data (response first, design matrix second).
    # NOTE(review): train and test matrices are built independently, so a
    # categorical level present in only one frame could yield mismatched
    # columns -- confirm against the data, or reuse the training design info.
    y_train, X_train = dmatrices(model_description, data=train,
                                 return_type='dataframe', NA_action='drop')
    y_test, X_test = dmatrices(model_description, data=test,
                               return_type='dataframe', NA_action='drop')

    # Create, fit model
    mod = sm.Logit(endog=y_train, exog=X_train)
    res = mod.fit(method='bfgs', maxiter=100)

    # Output model summary. Single-argument print(...) behaves identically
    # under Python 2 and Python 3.
    print(train['city_name'].value_counts())
    print(train['signup_channel'].value_counts())
    print(res.summary())

    # Create, output AUC
    predicted = res.predict(X_test)
    auc = roc_auc_score(y_true=y_test, y_score=predicted)
    # The "20%" in the message presumably reflects the caller's split size;
    # this function does not perform the split itself.
    print('AUC for 20%% holdout: %s' % auc)

    # Return AUC for model generated
    return auc
# Main section
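# NOTE: web-page navigation text left over from extraction, not part of the
# script: "评论列表" (comment list), "文章目录" (table of contents).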