python类WLS的实例源码-面圈网

smoothers.py 文件源码项目：plotnine 作者: has2k1 项目源码文件源码阅读 19 收藏 0 点赞 0 评论 0

def lm(data, xseq, **params):
    """
    Fit OLS / WLS if data has weight
    """
    X = sm.add_constant(data['x'])
    Xseq = sm.add_constant(xseq)

    try:
        model = sm.WLS(data['y'], X, weights=data['weight'])
    except KeyError:
        model = sm.OLS(data['y'], X)

    results = model.fit(**params['method_args'])
    data = pd.DataFrame({'x': xseq})
    data['y'] = results.predict(Xseq)

    if params['se']:
        alpha = 1 - params['level']
        prstd, iv_l, iv_u = wls_prediction_std(
            results, Xseq, alpha=alpha)
        data['se'] = prstd
        data['ymin'] = iv_l
        data['ymax'] = iv_u

    return data

Chainladder.py 文件源码项目：chainladder-python 作者: jbogaardt 项目源码文件源码阅读 16 收藏 0 点赞 0 评论 0

def __init__(self, x, y, w):
        self.x = x
        self.y = y
        self.w = w
        WLS = sm.WLS(y,x, w)
        OLS = sm.OLS(WLS.wendog,WLS.wexog).fit()
        self.coefficient = OLS.params[0]
        self.WSSResidual = OLS.ssr
        self.fittedvalues = OLS.predict(x)
        self.residual = OLS.resid
        if len(x) == 1:
            self.mean_square_error = np.nan 
            self.standard_error = np.nan
            self.sigma = np.nan
            self.std_resid = np.nan
        else:
            self.mean_square_error = OLS.mse_resid 
            self.standard_error = OLS.params[0]/OLS.tvalues[0]
            self.sigma = np.sqrt(self.mean_square_error)
            self.std_resid = OLSInfluence(OLS).resid_studentized_internal

test_ols.py 文件源码项目：PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda 作者: SignalMedia 项目源码文件源码阅读 22 收藏 0 点赞 0 评论 0

def testWLS(self):
        # WLS centered SS changed (fixed) in 0.5.0
        sm_version = sm.version.version
        if sm_version < LooseVersion('0.5.0'):
            raise nose.SkipTest("WLS centered SS not fixed in statsmodels"
                                " version {0}".format(sm_version))

        X = DataFrame(np.random.randn(30, 4), columns=['A', 'B', 'C', 'D'])
        Y = Series(np.random.randn(30))
        weights = X.std(1)

        self._check_wls(X, Y, weights)

        weights.ix[[5, 15]] = np.nan
        Y[[2, 21]] = np.nan
        self._check_wls(X, Y, weights)

test_ols.py 文件源码项目：PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda 作者: SignalMedia 项目源码文件源码阅读 24 收藏 0 点赞 0 评论 0

def _check_wls(self, x, y, weights):
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = ols(y=y, x=x, weights=1 / weights)

        combined = x.copy()
        combined['__y__'] = y
        combined['__weights__'] = weights
        combined = combined.dropna()

        endog = combined.pop('__y__').values
        aweights = combined.pop('__weights__').values
        exog = sm.add_constant(combined.values, prepend=False)

        sm_result = sm.WLS(endog, exog, weights=1 / aweights).fit()

        assert_almost_equal(sm_result.params, result._beta_raw)
        assert_almost_equal(sm_result.resid, result._resid_raw)

        self.checkMovingOLS('rolling', x, y, weights=weights)
        self.checkMovingOLS('expanding', x, y, weights=weights)

ols.py 文件源码项目：PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda 作者: SignalMedia 项目源码文件源码阅读 23 收藏 0 点赞 0 评论 0

def __init__(self, y, x, intercept=True, weights=None, nw_lags=None,
                 nw_overlap=False):
        import warnings
        warnings.warn("The pandas.stats.ols module is deprecated and will be "
                      "removed in a future version. We refer to external packages "
                      "like statsmodels, see some examples here: http://statsmodels.sourceforge.net/stable/regression.html",
                      FutureWarning, stacklevel=4)

        try:
            import statsmodels.api as sm
        except ImportError:
            import scikits.statsmodels.api as sm

        self._x_orig = x
        self._y_orig = y
        self._weights_orig = weights
        self._intercept = intercept
        self._nw_lags = nw_lags
        self._nw_overlap = nw_overlap

        (self._y, self._x, self._weights, self._x_filtered,
         self._index, self._time_has_obs) = self._prepare_data()

        if self._weights is not None:
            self._x_trans = self._x.mul(np.sqrt(self._weights), axis=0)
            self._y_trans = self._y * np.sqrt(self._weights)
            self.sm_ols = sm.WLS(self._y.get_values(),
                                 self._x.get_values(),
                                 weights=self._weights.values).fit()
        else:
            self._x_trans = self._x
            self._y_trans = self._y
            self.sm_ols = sm.OLS(self._y.get_values(),
                                 self._x.get_values()).fit()

selection.py 文件源码项目：Enrich2 作者: FowlerLab 项目源码文件源码阅读 25 收藏 0 点赞 0 评论 0

def regression_apply(row, timepoints, weighted):
    """
    :py:meth:`pandas.DataFrame.apply` apply function for calculating 
    enrichment using linear regression. If *weighted* is ``True`` perform
    weighted least squares; else perform ordinary least squares.

    Weights for weighted least squares are included in *row*.

    Returns a :py:class:`pandas.Series` containing regression coefficients,
    residuals, and statistics.
    """
    # retrieve log ratios from the row
    y = row[['L_{}'.format(t) for t in timepoints]]

    # re-scale the x's to fall within [0, 1]
    xvalues = [x / float(max(timepoints)) for x in timepoints]

    # perform the fit
    X = sm.add_constant(xvalues) # fit intercept
    if weighted:
        W = row[['W_{}'.format(t) for t in timepoints]]
        fit = sm.WLS(y, X, weights=W).fit()
    else:
        fit = sm.OLS(y, X).fit()

    # re-format as a data frame row
    values = np.concatenate([fit.params,  [fit.bse['x1'], fit.tvalues['x1'], 
                            fit.pvalues['x1']], fit.resid])
    index = ['intercept', 'slope', 'SE_slope', 't', 'pvalue_raw'] + \
            ['e_{}'.format(t) for t in timepoints]
    return pd.Series(data=values, index=index)

selection.py 文件源码项目：Enrich2 作者: FowlerLab 项目源码文件源码阅读 21 收藏 0 点赞 0 评论 0

def validate(self):
        """
        Raises an informative ``ValueError`` if the time points in the analysis are not suitable.

        Calls validate method on all child SeqLibs.
        """
        # check the time points
        if 0 not in self.timepoints:
            raise ValueError("Missing timepoint 0 [{}]".format(self.name))
        if self.timepoints[0] != 0:
            raise ValueError("Invalid negative timepoint [{}]".format(self.name))
        if len(self.timepoints) < 2:
            raise ValueError("Multiple timepoints required [{}]".format(self.name))
        elif len(self.timepoints) < 3 and self.scoring_method in ("WLS", "OLS"):
            raise ValueError("Insufficient number of timepoints for regression scoring [{}]".format(self.name))

        # check the wild type sequences
        if self.has_wt_sequence():
            for child in self.children[1:]:
                if self.children[0].wt != child.wt:
                    logging.warning("Inconsistent wild type sequences", extra={'oname' : self.name})
                    break

        # check that we're not doing wild type normalization on something with no wild type
        #if not self.has_wt_sequence() and self.logr_method == "wt":
        #    raise ValueError("No wild type sequence for wild type normalization [{}]".format(self.name))

        # validate children
        for child in self.children:
            child.validate()

selection.py 文件源码项目：Enrich2 作者: FowlerLab 项目源码文件源码阅读 24 收藏 0 点赞 0 评论 0

def calculate(self):
        """
        Wrapper method to calculate counts and enrichment scores 
        for all data in the :py:class:`~selection.Selection`.
        """
        if len(self.labels) == 0:
            raise ValueError("No data present across all sequencing libraries [{}]".format(self.name))
        for label in self.labels:
            self.merge_counts_unfiltered(label)
            self.filter_counts(label)
        if self.is_barcodevariant() or self.is_barcodeid():
            self.combine_barcode_maps()
        if self.scoring_method == "counts":
            pass
        elif self.scoring_method == "ratios":
            for label in self.labels:
                self.calc_ratios(label)
        elif self.scoring_method == "simple":
            for label in self.labels:
                self.calc_simple_ratios(label)
        elif self.scoring_method in ("WLS", "OLS"):
            if len(self.timepoints) <= 2:
                raise ValueError("Regression-based scoring requires three or more time points.")
            for label in self.labels:
                self.calc_log_ratios(label)
                if self.scoring_method == "WLS":
                    self.calc_weights(label)
                self.calc_regression(label)
        else:
            raise ValueError('Invalid scoring method "{}" [{}]'.format(self.scoring_method, self.name))

        if self.scoring_method in ("ratios" , "WLS", "OLS") and self.component_outliers:
            if self.is_barcodevariant() or self.is_barcodeid():
                self.calc_outliers("barcodes")
            if self.is_coding():
                self.calc_outliers("variants")

selection.py 文件源码项目：Enrich2 作者: FowlerLab 项目源码文件源码阅读 25 收藏 0 点赞 0 评论 0

def calc_regression(self, label):
        """
        Calculate least squares regression for *label*. If *weighted* is ``True``, calculates weighted least squares; else ordinary least squares.

        Regression results are stored in ``'/main/label/scores'``

        """
        if self.check_store("/main/{}/scores".format(label)):
            return
        elif "/main/{}/scores".format(label) in self.store.keys():
            # need to remove the current keys because we are using append
            self.store.remove("/main/{}/scores".format(label))

        logging.info("Calculating {} regression coefficients ({})".format(self.scoring_method, label), extra={'oname' : self.name})
        # append is required because it takes the "min_itemsize" argument, and put doesn't
        longest = self.store.select("/main/{}/log_ratios".format(label), "columns='index'").index.map(len).max()
        chunk = 1
        if self.scoring_method == "WLS":
            for data in self.store.select_as_multiple(["/main/{}/log_ratios".format(label), "/main/{}/weights".format(label)], chunksize=self.chunksize):
                logging.info("Calculating weighted least squares for chunk {} ({} rows)".format(chunk, len(data.index)), extra={'oname' : self.name})
                result = data.apply(regression_apply, args=[self.timepoints, True], axis="columns")
                self.store.append("/main/{}/scores".format(label), result, min_itemsize={"index" : longest})
                chunk += 1
        elif self.scoring_method == "OLS":
            for data in self.store.select("/main/{}/log_ratios".format(label), chunksize=self.chunksize):
                logging.info("Calculating ordinary least squares for chunk {} ({} rows)".format(chunk, len(data.index)), extra={'oname' : self.name})
                result = data.apply(regression_apply, args=[self.timepoints, False], axis="columns")
                self.store.append("/main/{}/scores".format(label), result, min_itemsize={"index" : longest})
                chunk += 1
        else:
            raise ValueError('Invalid regression scoring method "{}" [{}]'.format(self.scoring_method, self.name))

        # need to read from the file, calculate percentiles, and rewrite it
        logging.info("Calculating slope standard error percentiles ({})".format(label), extra={'oname' : self.name})
        data = self.store['/main/{}/scores'.format(label)]
        data['score'] = data['slope']
        data['SE'] = data['SE_slope']
        data['SE_pctile'] = [stats.percentileofscore(data['SE'], x, "weak") for x in data['SE']]
        data = data[['score', 'SE', 'SE_pctile', 'slope', 'intercept', 'SE_slope', 't', 'pvalue_raw']] # reorder columns
        self.store.put("/main/{}/scores".format(label), data, format="table", data_columns=data.columns)

selection.py 文件源码项目：Enrich2 作者: FowlerLab 项目源码文件源码阅读 26 收藏 0 点赞 0 评论 0

def wt_plot(self, pdf):
        """
        Create a plot of the linear fit of the wild type variant.

        *pdf* is an open PdfPages instance.

        Only created for selections that use WLS or OLS scoring and have a wild type specified. 
        Uses :py:func:`~plots.fit_axes` for the plotting.
        """
        logging.info("Creating wild type fit plot", extra={'oname' : self.name})

        # get the data and calculate log ratios
        if "variants" in self.labels:
            wt_label = "variants"
        elif "identifiers" in self.labels:
            wt_label = "identifiers"
        data = self.store.select("/main/{}/counts".format(wt_label), where='index = "{}"'.format(WILD_TYPE_VARIANT)).ix[0]
        sums = self.store['/main/{}/counts'.format(wt_label)].sum(axis="index")  # sum of complete cases (N')
        yvalues = np.log(data + 0.5) - np.log(sums + 0.5)
        xvalues = [tp / float(max(self.timepoints)) for tp in self.timepoints]

        # fit the line
        X = sm.add_constant(xvalues) # fit intercept
        if self.scoring_method == "WLS":
            W =  1 / (1 / (data + 0.5) + 1 / (sums + 0.5))
            fit = sm.WLS(yvalues, X, weights=W).fit()
        elif self.scoring_method == "OLS":
            fit = sm.OLS(yvalues, X).fit()
        else:
            raise ValueError('Invalid regression scoring method "{}" [{}]'.format(self.scoring_method, self.name))
        intercept, slope = fit.params
        slope_se = fit.bse['x1']

        # make the plot
        fig, ax = plt.subplots()
        fig.set_tight_layout(True)
        fit_axes(ax, xvalues, yvalues, slope, intercept, xlabels=self.timepoints)
        fit_axes_text(ax, cornertext="Slope {:3.2f}\nSE {:.1f}".format(slope, slope_se))
        ax.set_title("Wild Type Shape\n{}".format(self.name))
        ax.set_ylabel("Log Ratio (Complete Cases)")

        pdf.savefig(fig)
        plt.close(fig)

graphs.py 文件源码项目：WellApplication 作者: inkenbrandt 项目源码文件源码阅读 31 收藏 0 点赞 0 评论 0

def scatterColor(x0, y, w):
    """Creates scatter plot with points colored by variable.
    All input arrays must have matching lengths

    :param x0: x values to plot
    :type x0: list
    :param y: y values to plot
    :type y: list
    :param w: z values to plot

    :returns: plot; slope and intercept of the RLM best fit line shown on the plot
    .. warning:: all input arrays must have matching lengths and scalar values
    .. note:: See documentation at http://statsmodels.sourceforge.net/0.6.0/generated/statsmodels.robust.robust_linear_model.RLM.html
    for the RLM line
    """
    import matplotlib as mpl
    import matplotlib.cm as cm
    import statsmodels.api as sm
    from scipy.stats import linregress
    cmap = plt.cm.get_cmap('RdYlBu')
    norm = mpl.colors.Normalize(vmin=w.min(), vmax=w.max())
    m = cm.ScalarMappable(norm=norm, cmap=cmap)
    m.set_array(w)
    sc = plt.scatter(x0, y, label='', color=m.to_rgba(w))

    xa = sm.add_constant(x0)

    est = sm.RLM(y, xa).fit()
    r2 = sm.WLS(y, xa, weights=est.weights).fit().rsquared
    slope = est.params[1]

    x_prime = np.linspace(np.min(x0), np.max(x0), 100)[:, np.newaxis]
    x_prime = sm.add_constant(x_prime)
    y_hat = est.predict(x_prime)

    const = est.params[0]
    y2 = [i * slope + const for i in x0]

    lin = linregress(x0, y)
    x1 = np.arange(np.min(x0), np.max(x0), 0.1)
    y1 = [i * lin[0] + lin[1] for i in x1]
    y2 = [i * slope + const for i in x1]
    plt.plot(x1, y1, c='g',
             label='simple linear regression m = {:.2f} b = {:.0f}, r^2 = {:.2f}'.format(lin[0], lin[1], lin[2] ** 2))
    plt.plot(x1, y2, c='r', label='rlm regression m = {:.2f} b = {:.0f}, r2 = {:.2f}'.format(slope, const, r2))
    plt.legend()
    cbar = plt.colorbar(m)

    cbar.set_label('Julian Date')

    return slope, const