def _get_intercept_stats(self, add_slopes=True):
# start with mean and variance of Y on the link scale
mod = sm.GLM(endog=self.model.y.data,
exog=np.repeat(1, len(self.model.y.data)),
family=self.model.family.smfamily(),
missing='drop' if self.model.dropna else 'none').fit()
mu = mod.params
# multiply SE by sqrt(N) to turn it into (approx.) SD(Y) on link scale
sd = (mod.cov_params()[0] * len(mod.mu))**.5
# modify mu and sd based on means and SDs of slope priors.
if len(self.model.fixed_terms) > 1 and add_slopes:
means = np.array([x['mu'] for x in self.priors.values()])
sds = np.array([x['sd'] for x in self.priors.values()])
# add to intercept prior
index = list(self.priors.keys())
mu -= np.dot(means, self.stats['mean_x'][index])
sd = (sd**2 + np.dot(sds**2, self.stats['mean_x'][index]**2))**.5
return mu, sd
python类GLM的实例源码
def __init__(self, model, taylor):
self.model = model
self.stats = model.dm_statistics if hasattr(model, 'dm_statistics') \
else None
self.dm = pd.DataFrame({lev: t.data[:, i]
for t in model.fixed_terms.values()
for i, lev in enumerate(t.levels)})
self.priors = {}
missing = 'drop' if self.model.dropna else 'none'
self.mle = sm.GLM(endog=self.model.y.data, exog=self.dm,
family=self.model.family.smfamily(),
missing=missing).fit()
self.taylor = taylor
with open(join(dirname(__file__), 'config', 'derivs.txt'), 'r') as file:
self.deriv = [next(file).strip('\n') for x in range(taylor+1)]
def run_glm(X, y, model_name):
""" Train the binomial/negative binomial GLM
Args:
X (np.array): scaled X.
y (pd.df): four columns response table.
Returns:
sm.model: trained GLM models.
"""
# Add const manually. sm.add_constant cannot add 1 for shape (1, n)
X = np.c_[X, np.ones(X.shape[0])]
if model_name == 'Binomial':
logger.info('Building binomial GLM')
# make two columns response (# success, # failure)
y_binom = np.zeros((y.shape[0], 2), dtype=np.int_)
y_binom[:, 0] = y.nMut
y_binom[:, 1] = y.length * y.N - y.nMut
glm = sm.GLM(y_binom, X, family=sm.families.Binomial())
elif model_name == 'NegativeBinomial':
logger.info('Building negative binomial GLM')
# use nMut as response and length*N as exposure
glm = sm.GLM(y.nMut.values, X, family=sm.families.NegativeBinomial(), exposure=y.length.values*y.N.values+1)
else:
sys.stderr.write('Unknown GLM name {}. Must be Binomial or NegativeBinomial'.format(model_name))
sys.exit(1)
model = glm.fit()
return model
def glm(data, xseq, **params):
"""
Fit GLM
"""
X = sm.add_constant(data['x'])
Xseq = sm.add_constant(xseq)
results = sm.GLM(data['y'], X).fit(**params['method_args'])
data = pd.DataFrame({'x': xseq})
data['y'] = results.predict(Xseq)
if params['se']:
# TODO: Depends on statsmodel > 0.7
# https://github.com/statsmodels/statsmodels/pull/2151
# https://github.com/statsmodels/statsmodels/pull/3406
# Remove the try/except when a compatible version is released
try:
prediction = results.get_prediction(Xseq)
ci = prediction.conf_int(1 - params['level'])
data['ymin'] = ci[:, 0]
data['ymax'] = ci[:, 1]
except (AttributeError, TypeError):
warnings.warn(
"Cannot compute confidence intervals."
"Install latest/development version of statmodels.")
return data
def glmfit(X,Y):
'''
Wrapper for statsmodels glmfit that prepares a constant
parameter and configuration options for poisson-GLM fitting.
Please see the documentation for glmfit in statsmodels for
more details.
This method will automatically add a constant colum to the feature
matrix Y
Parameters
----------
X : array-like
A nobs x k array where `nobs` is the number of observations and `k`
is the number of regressors. An intercept is not included by default
and should be added by the user (models specified using a formula
include an intercept by default). See `statsmodels.tools.add_constant`.
Y : array-like
1d array of poisson counts. This array can be 1d or 2d.
'''
# check for and maybe add constant value to X
if not all(X[:,0]==X[0,0]):
X = hstack([ ones((shape(X)[0],1),dtype=X.dtype), X])
poisson_model = sm.GLM(Y,X,family=sm.families.Poisson())
poisson_results = poisson_model.fit()
M = poisson_results.params
return M
def _scale_random(self, term):
# these default priors are only defined for HalfNormal priors
if term.prior.args['sd'].name != 'HalfNormal':
return
sd_corr = term.prior.scale
# recreate the corresponding fixed effect data
fix_data = term.data.sum(axis=1)
# handle intercepts and cell means
if term.constant:
mu, sd = self._get_intercept_stats()
sd *= sd_corr
# handle slopes
else:
exists = [x for x in self.dm.columns
if np.array_equal(fix_data, self.dm[x].values)]
# handle case where there IS a corresponding fixed effect
if exists and exists[0] in self.priors.keys():
sd = self.priors[exists[0]]['sd']
# handle case where there IS NOT a corresponding fixed effect
else:
# the usual case: add the random effect data as a fixed effect
# in the design matrix
if not exists:
fix_dataframe = pd.DataFrame(fix_data)
# things break if column names are integers (the default)
fix_dataframe.rename(
columns={c: '_'+str(c) for c in fix_dataframe.columns},
inplace=True)
exog = self.dm.join(fix_dataframe)
# this handles the corner case where there technically is the
# corresponding fixed effect, but the parameterization differs
# between the fixed- and random-effect specification. usually
# this means the fixed effects use cell-means coding but the
# random effects use k-1 coding
else:
group = term.name.split('|')[1]
exog = self.model.random_terms.values()
exog = [v.data.sum(1) for v in exog
if v.name.split('|')[-1] == group]
index = ['_'+str(i) for i in range(len(exog))]
exog = pd.DataFrame(exog, index=index).T
# this will replace self.mle (which is missing predictors)
missing = 'drop' if self.model.dropna else 'none'
full_mod = sm.GLM(endog=self.model.y.data, exog=exog,
family=self.model.family.smfamily(),
missing=missing).fit()
sd = self._get_slope_stats(exog=exog, predictor=fix_data,
full_mod=full_mod, sd_corr=sd_corr)
# set the prior SD.
term.prior.args['sd'].update(sd=np.squeeze(np.atleast_1d(sd)))
def fit(self, y, X):
"""Fit the model.
Args:
y (pandas.DataFrame): The vector of endogenous variable.
X (pandas.DataFrame): The matrix of exogenous variables.
"""
# Creating the GLM model from StatsModels and fitting it
model = sm.GLM(y, X, family=sm.families.Binomial())
try:
fitted = model.fit()
except PerfectSeparationError as e:
raise StatsError(str(e))
out = {}
parameters = fitted.params.index
# Results about the model fit.
out = {
"MODEL": {
"log_likelihood": fitted.llf,
"nobs": fitted.nobs
},
}
# Getting the confidence intervals
conf_ints = fitted.conf_int()
for param in parameters:
# If GWAS, check that inference could be done on the SNP.
if param == "SNPs" and np.isnan(fitted.pvalues[param]):
raise StatsError("Inference did not converge.")
out[param] = {
"coef": fitted.params[param],
"std_err": fitted.bse[param],
"lower_ci": conf_ints.loc[param, 0],
"upper_ci": conf_ints.loc[param, 1],
"t_value": fitted.tvalues[param],
"p_value": fitted.pvalues[param],
}
return out