def lm(data, xseq, **params):
"""
Fit OLS / WLS if data has weight
"""
X = sm.add_constant(data['x'])
Xseq = sm.add_constant(xseq)
try:
model = sm.WLS(data['y'], X, weights=data['weight'])
except KeyError:
model = sm.OLS(data['y'], X)
results = model.fit(**params['method_args'])
data = pd.DataFrame({'x': xseq})
data['y'] = results.predict(Xseq)
if params['se']:
alpha = 1 - params['level']
prstd, iv_l, iv_u = wls_prediction_std(
results, Xseq, alpha=alpha)
data['se'] = prstd
data['ymin'] = iv_l
data['ymax'] = iv_u
return data
python类WLS的实例源码
def __init__(self, x, y, w):
self.x = x
self.y = y
self.w = w
WLS = sm.WLS(y,x, w)
OLS = sm.OLS(WLS.wendog,WLS.wexog).fit()
self.coefficient = OLS.params[0]
self.WSSResidual = OLS.ssr
self.fittedvalues = OLS.predict(x)
self.residual = OLS.resid
if len(x) == 1:
self.mean_square_error = np.nan
self.standard_error = np.nan
self.sigma = np.nan
self.std_resid = np.nan
else:
self.mean_square_error = OLS.mse_resid
self.standard_error = OLS.params[0]/OLS.tvalues[0]
self.sigma = np.sqrt(self.mean_square_error)
self.std_resid = OLSInfluence(OLS).resid_studentized_internal
test_ols.py 文件源码
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
作者: SignalMedia
项目源码
文件源码
阅读 22
收藏 0
点赞 0
评论 0
def testWLS(self):
# WLS centered SS changed (fixed) in 0.5.0
sm_version = sm.version.version
if sm_version < LooseVersion('0.5.0'):
raise nose.SkipTest("WLS centered SS not fixed in statsmodels"
" version {0}".format(sm_version))
X = DataFrame(np.random.randn(30, 4), columns=['A', 'B', 'C', 'D'])
Y = Series(np.random.randn(30))
weights = X.std(1)
self._check_wls(X, Y, weights)
weights.ix[[5, 15]] = np.nan
Y[[2, 21]] = np.nan
self._check_wls(X, Y, weights)
test_ols.py 文件源码
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
作者: SignalMedia
项目源码
文件源码
阅读 24
收藏 0
点赞 0
评论 0
def _check_wls(self, x, y, weights):
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
result = ols(y=y, x=x, weights=1 / weights)
combined = x.copy()
combined['__y__'] = y
combined['__weights__'] = weights
combined = combined.dropna()
endog = combined.pop('__y__').values
aweights = combined.pop('__weights__').values
exog = sm.add_constant(combined.values, prepend=False)
sm_result = sm.WLS(endog, exog, weights=1 / aweights).fit()
assert_almost_equal(sm_result.params, result._beta_raw)
assert_almost_equal(sm_result.resid, result._resid_raw)
self.checkMovingOLS('rolling', x, y, weights=weights)
self.checkMovingOLS('expanding', x, y, weights=weights)
ols.py 文件源码
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
作者: SignalMedia
项目源码
文件源码
阅读 23
收藏 0
点赞 0
评论 0
def __init__(self, y, x, intercept=True, weights=None, nw_lags=None,
nw_overlap=False):
import warnings
warnings.warn("The pandas.stats.ols module is deprecated and will be "
"removed in a future version. We refer to external packages "
"like statsmodels, see some examples here: http://statsmodels.sourceforge.net/stable/regression.html",
FutureWarning, stacklevel=4)
try:
import statsmodels.api as sm
except ImportError:
import scikits.statsmodels.api as sm
self._x_orig = x
self._y_orig = y
self._weights_orig = weights
self._intercept = intercept
self._nw_lags = nw_lags
self._nw_overlap = nw_overlap
(self._y, self._x, self._weights, self._x_filtered,
self._index, self._time_has_obs) = self._prepare_data()
if self._weights is not None:
self._x_trans = self._x.mul(np.sqrt(self._weights), axis=0)
self._y_trans = self._y * np.sqrt(self._weights)
self.sm_ols = sm.WLS(self._y.get_values(),
self._x.get_values(),
weights=self._weights.values).fit()
else:
self._x_trans = self._x
self._y_trans = self._y
self.sm_ols = sm.OLS(self._y.get_values(),
self._x.get_values()).fit()
def regression_apply(row, timepoints, weighted):
"""
:py:meth:`pandas.DataFrame.apply` apply function for calculating
enrichment using linear regression. If *weighted* is ``True`` perform
weighted least squares; else perform ordinary least squares.
Weights for weighted least squares are included in *row*.
Returns a :py:class:`pandas.Series` containing regression coefficients,
residuals, and statistics.
"""
# retrieve log ratios from the row
y = row[['L_{}'.format(t) for t in timepoints]]
# re-scale the x's to fall within [0, 1]
xvalues = [x / float(max(timepoints)) for x in timepoints]
# perform the fit
X = sm.add_constant(xvalues) # fit intercept
if weighted:
W = row[['W_{}'.format(t) for t in timepoints]]
fit = sm.WLS(y, X, weights=W).fit()
else:
fit = sm.OLS(y, X).fit()
# re-format as a data frame row
values = np.concatenate([fit.params, [fit.bse['x1'], fit.tvalues['x1'],
fit.pvalues['x1']], fit.resid])
index = ['intercept', 'slope', 'SE_slope', 't', 'pvalue_raw'] + \
['e_{}'.format(t) for t in timepoints]
return pd.Series(data=values, index=index)
def validate(self):
"""
Raises an informative ``ValueError`` if the time points in the analysis are not suitable.
Calls validate method on all child SeqLibs.
"""
# check the time points
if 0 not in self.timepoints:
raise ValueError("Missing timepoint 0 [{}]".format(self.name))
if self.timepoints[0] != 0:
raise ValueError("Invalid negative timepoint [{}]".format(self.name))
if len(self.timepoints) < 2:
raise ValueError("Multiple timepoints required [{}]".format(self.name))
elif len(self.timepoints) < 3 and self.scoring_method in ("WLS", "OLS"):
raise ValueError("Insufficient number of timepoints for regression scoring [{}]".format(self.name))
# check the wild type sequences
if self.has_wt_sequence():
for child in self.children[1:]:
if self.children[0].wt != child.wt:
logging.warning("Inconsistent wild type sequences", extra={'oname' : self.name})
break
# check that we're not doing wild type normalization on something with no wild type
#if not self.has_wt_sequence() and self.logr_method == "wt":
# raise ValueError("No wild type sequence for wild type normalization [{}]".format(self.name))
# validate children
for child in self.children:
child.validate()
def calculate(self):
"""
Wrapper method to calculate counts and enrichment scores
for all data in the :py:class:`~selection.Selection`.
"""
if len(self.labels) == 0:
raise ValueError("No data present across all sequencing libraries [{}]".format(self.name))
for label in self.labels:
self.merge_counts_unfiltered(label)
self.filter_counts(label)
if self.is_barcodevariant() or self.is_barcodeid():
self.combine_barcode_maps()
if self.scoring_method == "counts":
pass
elif self.scoring_method == "ratios":
for label in self.labels:
self.calc_ratios(label)
elif self.scoring_method == "simple":
for label in self.labels:
self.calc_simple_ratios(label)
elif self.scoring_method in ("WLS", "OLS"):
if len(self.timepoints) <= 2:
raise ValueError("Regression-based scoring requires three or more time points.")
for label in self.labels:
self.calc_log_ratios(label)
if self.scoring_method == "WLS":
self.calc_weights(label)
self.calc_regression(label)
else:
raise ValueError('Invalid scoring method "{}" [{}]'.format(self.scoring_method, self.name))
if self.scoring_method in ("ratios" , "WLS", "OLS") and self.component_outliers:
if self.is_barcodevariant() or self.is_barcodeid():
self.calc_outliers("barcodes")
if self.is_coding():
self.calc_outliers("variants")
def calc_regression(self, label):
"""
Calculate least squares regression for *label*. If *weighted* is ``True``, calculates weighted least squares; else ordinary least squares.
Regression results are stored in ``'/main/label/scores'``
"""
if self.check_store("/main/{}/scores".format(label)):
return
elif "/main/{}/scores".format(label) in self.store.keys():
# need to remove the current keys because we are using append
self.store.remove("/main/{}/scores".format(label))
logging.info("Calculating {} regression coefficients ({})".format(self.scoring_method, label), extra={'oname' : self.name})
# append is required because it takes the "min_itemsize" argument, and put doesn't
longest = self.store.select("/main/{}/log_ratios".format(label), "columns='index'").index.map(len).max()
chunk = 1
if self.scoring_method == "WLS":
for data in self.store.select_as_multiple(["/main/{}/log_ratios".format(label), "/main/{}/weights".format(label)], chunksize=self.chunksize):
logging.info("Calculating weighted least squares for chunk {} ({} rows)".format(chunk, len(data.index)), extra={'oname' : self.name})
result = data.apply(regression_apply, args=[self.timepoints, True], axis="columns")
self.store.append("/main/{}/scores".format(label), result, min_itemsize={"index" : longest})
chunk += 1
elif self.scoring_method == "OLS":
for data in self.store.select("/main/{}/log_ratios".format(label), chunksize=self.chunksize):
logging.info("Calculating ordinary least squares for chunk {} ({} rows)".format(chunk, len(data.index)), extra={'oname' : self.name})
result = data.apply(regression_apply, args=[self.timepoints, False], axis="columns")
self.store.append("/main/{}/scores".format(label), result, min_itemsize={"index" : longest})
chunk += 1
else:
raise ValueError('Invalid regression scoring method "{}" [{}]'.format(self.scoring_method, self.name))
# need to read from the file, calculate percentiles, and rewrite it
logging.info("Calculating slope standard error percentiles ({})".format(label), extra={'oname' : self.name})
data = self.store['/main/{}/scores'.format(label)]
data['score'] = data['slope']
data['SE'] = data['SE_slope']
data['SE_pctile'] = [stats.percentileofscore(data['SE'], x, "weak") for x in data['SE']]
data = data[['score', 'SE', 'SE_pctile', 'slope', 'intercept', 'SE_slope', 't', 'pvalue_raw']] # reorder columns
self.store.put("/main/{}/scores".format(label), data, format="table", data_columns=data.columns)
def wt_plot(self, pdf):
"""
Create a plot of the linear fit of the wild type variant.
*pdf* is an open PdfPages instance.
Only created for selections that use WLS or OLS scoring and have a wild type specified.
Uses :py:func:`~plots.fit_axes` for the plotting.
"""
logging.info("Creating wild type fit plot", extra={'oname' : self.name})
# get the data and calculate log ratios
if "variants" in self.labels:
wt_label = "variants"
elif "identifiers" in self.labels:
wt_label = "identifiers"
data = self.store.select("/main/{}/counts".format(wt_label), where='index = "{}"'.format(WILD_TYPE_VARIANT)).ix[0]
sums = self.store['/main/{}/counts'.format(wt_label)].sum(axis="index") # sum of complete cases (N')
yvalues = np.log(data + 0.5) - np.log(sums + 0.5)
xvalues = [tp / float(max(self.timepoints)) for tp in self.timepoints]
# fit the line
X = sm.add_constant(xvalues) # fit intercept
if self.scoring_method == "WLS":
W = 1 / (1 / (data + 0.5) + 1 / (sums + 0.5))
fit = sm.WLS(yvalues, X, weights=W).fit()
elif self.scoring_method == "OLS":
fit = sm.OLS(yvalues, X).fit()
else:
raise ValueError('Invalid regression scoring method "{}" [{}]'.format(self.scoring_method, self.name))
intercept, slope = fit.params
slope_se = fit.bse['x1']
# make the plot
fig, ax = plt.subplots()
fig.set_tight_layout(True)
fit_axes(ax, xvalues, yvalues, slope, intercept, xlabels=self.timepoints)
fit_axes_text(ax, cornertext="Slope {:3.2f}\nSE {:.1f}".format(slope, slope_se))
ax.set_title("Wild Type Shape\n{}".format(self.name))
ax.set_ylabel("Log Ratio (Complete Cases)")
pdf.savefig(fig)
plt.close(fig)
def scatterColor(x0, y, w):
"""Creates scatter plot with points colored by variable.
All input arrays must have matching lengths
:param x0: x values to plot
:type x0: list
:param y: y values to plot
:type y: list
:param w: z values to plot
:returns: plot; slope and intercept of the RLM best fit line shown on the plot
.. warning:: all input arrays must have matching lengths and scalar values
.. note:: See documentation at http://statsmodels.sourceforge.net/0.6.0/generated/statsmodels.robust.robust_linear_model.RLM.html
for the RLM line
"""
import matplotlib as mpl
import matplotlib.cm as cm
import statsmodels.api as sm
from scipy.stats import linregress
cmap = plt.cm.get_cmap('RdYlBu')
norm = mpl.colors.Normalize(vmin=w.min(), vmax=w.max())
m = cm.ScalarMappable(norm=norm, cmap=cmap)
m.set_array(w)
sc = plt.scatter(x0, y, label='', color=m.to_rgba(w))
xa = sm.add_constant(x0)
est = sm.RLM(y, xa).fit()
r2 = sm.WLS(y, xa, weights=est.weights).fit().rsquared
slope = est.params[1]
x_prime = np.linspace(np.min(x0), np.max(x0), 100)[:, np.newaxis]
x_prime = sm.add_constant(x_prime)
y_hat = est.predict(x_prime)
const = est.params[0]
y2 = [i * slope + const for i in x0]
lin = linregress(x0, y)
x1 = np.arange(np.min(x0), np.max(x0), 0.1)
y1 = [i * lin[0] + lin[1] for i in x1]
y2 = [i * slope + const for i in x1]
plt.plot(x1, y1, c='g',
label='simple linear regression m = {:.2f} b = {:.0f}, r^2 = {:.2f}'.format(lin[0], lin[1], lin[2] ** 2))
plt.plot(x1, y2, c='r', label='rlm regression m = {:.2f} b = {:.0f}, r2 = {:.2f}'.format(slope, const, r2))
plt.legend()
cbar = plt.colorbar(m)
cbar.set_label('Julian Date')
return slope, const