def regression_apply(row, timepoints, weighted):
"""
:py:meth:`pandas.DataFrame.apply` apply function for calculating
enrichment using linear regression. If *weighted* is ``True`` perform
weighted least squares; else perform ordinary least squares.
Weights for weighted least squares are included in *row*.
Returns a :py:class:`pandas.Series` containing regression coefficients,
residuals, and statistics.
"""
# retrieve log ratios from the row
y = row[['L_{}'.format(t) for t in timepoints]]
# re-scale the x's to fall within [0, 1]
xvalues = [x / float(max(timepoints)) for x in timepoints]
# perform the fit
X = sm.add_constant(xvalues) # fit intercept
if weighted:
W = row[['W_{}'.format(t) for t in timepoints]]
fit = sm.WLS(y, X, weights=W).fit()
else:
fit = sm.OLS(y, X).fit()
# re-format as a data frame row
values = np.concatenate([fit.params, [fit.bse['x1'], fit.tvalues['x1'],
fit.pvalues['x1']], fit.resid])
index = ['intercept', 'slope', 'SE_slope', 't', 'pvalue_raw'] + \
['e_{}'.format(t) for t in timepoints]
return pd.Series(data=values, index=index)
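A minimal usage sketch (the frame below is made up, but follows the L_{t}/W_{t} column convention the function expects; it also assumes regression_apply lives in a module that imports numpy, pandas, and statsmodels.api as np, pd, and sm):
import numpy as np
import pandas as pd
import statsmodels.api as sm

timepoints = [0, 1, 2]  # hypothetical time points
frame = pd.DataFrame({
    'L_0': [0.00, 0.05],  # log ratios per time point
    'L_1': [0.40, 0.10],
    'L_2': [0.90, 0.30],
    'W_0': [1.00, 1.00],  # regression weights per time point
    'W_1': [0.80, 0.90],
    'W_2': [0.70, 0.85],
})
# weighted fit; pass False as the second argument for ordinary least squares
scores = frame.apply(regression_apply, args=[timepoints, True], axis='columns')
print(scores[['intercept', 'slope', 'SE_slope', 'pvalue_raw']])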
def validate(self):
"""
Raises an informative ``ValueError`` if the time points in the analysis are not suitable.
Calls validate method on all child SeqLibs.
"""
# check the time points
if 0 not in self.timepoints:
raise ValueError("Missing timepoint 0 [{}]".format(self.name))
if self.timepoints[0] != 0:
raise ValueError("Invalid negative timepoint [{}]".format(self.name))
if len(self.timepoints) < 2:
raise ValueError("Multiple timepoints required [{}]".format(self.name))
elif len(self.timepoints) < 3 and self.scoring_method in ("WLS", "OLS"):
raise ValueError("Insufficient number of timepoints for regression scoring [{}]".format(self.name))
# check the wild type sequences
if self.has_wt_sequence():
for child in self.children[1:]:
if self.children[0].wt != child.wt:
logging.warning("Inconsistent wild type sequences", extra={'oname' : self.name})
break
# check that we're not doing wild type normalization on something with no wild type
#if not self.has_wt_sequence() and self.logr_method == "wt":
# raise ValueError("No wild type sequence for wild type normalization [{}]".format(self.name))
# validate children
for child in self.children:
child.validate()
def calculate(self):
"""
Wrapper method to calculate counts and enrichment scores
for all data in the :py:class:`~selection.Selection`.
"""
if len(self.labels) == 0:
raise ValueError("No data present across all sequencing libraries [{}]".format(self.name))
for label in self.labels:
self.merge_counts_unfiltered(label)
self.filter_counts(label)
if self.is_barcodevariant() or self.is_barcodeid():
self.combine_barcode_maps()
if self.scoring_method == "counts":
pass
elif self.scoring_method == "ratios":
for label in self.labels:
self.calc_ratios(label)
elif self.scoring_method == "simple":
for label in self.labels:
self.calc_simple_ratios(label)
elif self.scoring_method in ("WLS", "OLS"):
if len(self.timepoints) <= 2:
raise ValueError("Regression-based scoring requires three or more time points.")
for label in self.labels:
self.calc_log_ratios(label)
if self.scoring_method == "WLS":
self.calc_weights(label)
self.calc_regression(label)
else:
raise ValueError('Invalid scoring method "{}" [{}]'.format(self.scoring_method, self.name))
if self.scoring_method in ("ratios" , "WLS", "OLS") and self.component_outliers:
if self.is_barcodevariant() or self.is_barcodeid():
self.calc_outliers("barcodes")
if self.is_coding():
self.calc_outliers("variants")
def calc_regression(self, label):
"""
Calculate least squares regression for *label*. If the scoring method is ``'WLS'``, performs weighted least squares; otherwise performs ordinary least squares.
Regression results are stored in ``'/main/<label>/scores'``.
"""
if self.check_store("/main/{}/scores".format(label)):
return
elif "/main/{}/scores".format(label) in self.store.keys():
# need to remove the current keys because we are using append
self.store.remove("/main/{}/scores".format(label))
logging.info("Calculating {} regression coefficients ({})".format(self.scoring_method, label), extra={'oname' : self.name})
# append is required because it takes the "min_itemsize" argument, and put doesn't
longest = self.store.select("/main/{}/log_ratios".format(label), "columns='index'").index.map(len).max()
chunk = 1
if self.scoring_method == "WLS":
for data in self.store.select_as_multiple(["/main/{}/log_ratios".format(label), "/main/{}/weights".format(label)], chunksize=self.chunksize):
logging.info("Calculating weighted least squares for chunk {} ({} rows)".format(chunk, len(data.index)), extra={'oname' : self.name})
result = data.apply(regression_apply, args=[self.timepoints, True], axis="columns")
self.store.append("/main/{}/scores".format(label), result, min_itemsize={"index" : longest})
chunk += 1
elif self.scoring_method == "OLS":
for data in self.store.select("/main/{}/log_ratios".format(label), chunksize=self.chunksize):
logging.info("Calculating ordinary least squares for chunk {} ({} rows)".format(chunk, len(data.index)), extra={'oname' : self.name})
result = data.apply(regression_apply, args=[self.timepoints, False], axis="columns")
self.store.append("/main/{}/scores".format(label), result, min_itemsize={"index" : longest})
chunk += 1
else:
raise ValueError('Invalid regression scoring method "{}" [{}]'.format(self.scoring_method, self.name))
# need to read from the file, calculate percentiles, and rewrite it
logging.info("Calculating slope standard error percentiles ({})".format(label), extra={'oname' : self.name})
data = self.store['/main/{}/scores'.format(label)]
data['score'] = data['slope']
data['SE'] = data['SE_slope']
data['SE_pctile'] = [stats.percentileofscore(data['SE'], x, "weak") for x in data['SE']]
data = data[['score', 'SE', 'SE_pctile', 'slope', 'intercept', 'SE_slope', 't', 'pvalue_raw']] # reorder columns
self.store.put("/main/{}/scores".format(label), data, format="table", data_columns=data.columns)
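For reference, a standalone sketch of the percentile step above: scipy.stats.percentileofscore with kind='weak' reports, for each standard error, the percentage of values less than or equal to it (the numbers are made up):
from scipy import stats

se = [0.1, 0.2, 0.2, 0.5]
pctiles = [stats.percentileofscore(se, x, 'weak') for x in se]
print(pctiles)  # [25.0, 75.0, 75.0, 100.0]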
# c15_08.py, from the Python-for-Finance-Second-Edition repository (PacktPublishing)
def breusch_pagan_test(y, x):
    results = sm.OLS(y, x).fit()
    resid = results.resid
    n = len(resid)
    sigma2 = sum(resid ** 2) / n
    f = resid ** 2 / sigma2 - 1
    results2 = sm.OLS(f, x).fit()
    fv = results2.fittedvalues
    bp = 0.5 * sum(fv ** 2)
    df = results2.df_model
    # the test statistic is asymptotically chi-squared with df = number of regressors
    p_value = 1 - sp.stats.chi2.cdf(bp, df)
    return round(bp, 6), df, round(p_value, 7)
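A hedged usage sketch on simulated data whose error variance grows with the regressor (all values below are made up); the imports mirror the module-level names (sm, sp) that breusch_pagan_test relies on:
import numpy as np
import scipy as sp
import scipy.stats
import statsmodels.api as sm

np.random.seed(0)
n = 200
x1 = np.random.rand(n)
x = sm.add_constant(x1)  # keep a constant so the auxiliary regression is well specified
y = 1.0 + 2.0 * x1 + np.random.randn(n) * (0.5 + 2.0 * x1)  # heteroscedastic errors
bp, df, p = breusch_pagan_test(y, x)
print(bp, df, p)  # a small p-value points to heteroscedasticity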
#
def sm_multireg(self, Xtrain, ytrain, Xtest, ytest):
    self.normalize(Xtrain)
    results = sm.OLS(ytrain, Xtrain).fit_regularized()
    # print("coefficient:", results.params)
    # train accuracy
    predictions = results.predict(Xtrain)
    correct = 0
    for i in range(len(predictions)):
        if round(predictions[i], 1) == ytrain[i]:
            correct += 1
    accuracy = correct * 1.0 / len(ytrain)
    print("train accuracy:", accuracy * 100, "%")
    # calculate SSE, SSM & SST
    SSE = 0
    for i in range(len(predictions)):
        SSE += (predictions[i] - ytrain[i]) ** 2
    yAverage = np.mean(ytrain)
    SSM = 0
    for pred in predictions:
        SSM += (pred - yAverage) ** 2
    print("SSM:", SSM)
    SST = SSE + SSM
    print("SST:", SST)
    # calculate PVE = SSM / SST
    PVE = SSM / SST
    print("PVE:", PVE)
    # test accuracy
    self.normalize(Xtest)
    predictions = results.predict(Xtest)
    correct = 0
    for i in range(len(predictions)):
        print(round(predictions[i], 1), ytest[i])
        if round(predictions[i], 1) == ytest[i]:
            correct += 1
    accuracy = correct * 1.0 / len(ytest)
    print("test accuracy:", accuracy * 100, "%")
    return results
def fit_ols(y, x, idx=-1):
ols = OLS(y, add_constant(x))
results = ols.fit()
return results.params.values[idx], results.cov_params().values[idx, idx]
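A small hedged usage sketch with made-up data; y is passed as a pandas Series so that the .params.values and .cov_params().values lookups inside fit_ols resolve as written, and the import line mirrors the names the function expects to find in its module:
import numpy as np
import pandas as pd
from statsmodels.api import OLS, add_constant

np.random.seed(0)
x = np.random.randn(100, 2)
y = pd.Series(x @ np.array([0.5, 2.0]) + np.random.randn(100))
beta, var_beta = fit_ols(y, x)      # coefficient on the last column (idx=-1), roughly 2.0
print(beta, np.sqrt(var_beta))      # point estimate and its standard error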
def weak_instruments(self, n_sims=20):
np.random.seed(1692)
model = feedforward.FeedForwardModel(19, 1, dense_size=60, n_dense_layers=2)
treatment_effects = []
ols_betas, ols_ses = [], []
old_corrs, new_corrs = [], []
for _ in range(n_sims):
df = self.treatment_gen.simulate_data(False)
X = np.hstack((self.x, df['new_treat'].values[:, None]))
Z = np.hstack((self.x, df['instrument'].values[:, None]))
ols_beta, ols_se = self.fit_ols(df['treatment_effect'], X)
ols_betas.append(ols_beta)
ols_ses.append(ols_se)
old_corr = df[['instrument', 'new_treat']].corr().values[0, 1]
new_instrument, new_corr = model.fit_instruments(X, Z, df['treatment_effect'].values, batchsize=128)
new_corrs.append(new_corr)
old_corrs.append(old_corr)
Z2 = Z.copy()
Z2[:, -1] = new_instrument[:, 0]
iv = IV2SLS(df['treatment_effect'].values.flatten(), add_constant(X), add_constant(Z2))
model.reset_params()
if new_corr:
logger.info("Old corr: %.2f, New corr: %.2f", np.mean(old_corrs), np.mean(new_corrs))
logger.info("Treatment effect (OLS): %.3f (%.4f)", np.mean(ols_betas), np.mean(ols_ses))
logger.info("Treatment effect: %.3f (%.4f)", np.mean(treatment_effects), np.std(treatment_effects))
def estimate(self, n_sims=20):
np.random.seed(1692)
model = feedforward.FeedForwardModel(19, 1, dense_size=10, n_dense_layers=2)
treatment_effects = []
ols_betas, ols_ses = [], []
old_corrs, new_corrs = [], []
for _ in range(n_sims):
df = self.treatment_gen.simulate_data(True)
X = np.hstack((self.x, df['new_treat'].values[:, None]))
Z = np.hstack((self.x, df['instrument'].values[:, None]))
ols_beta, ols_se = self.fit_ols(df['treatment_effect'], X)
ols_betas.append(ols_beta)
ols_ses.append(ols_se)
new_corr = model.fit(X, Z, df['treatment_effect'].values, instrument=True, batchsize=64)
if new_corr:
old_corr = df[['instrument', 'new_treat']].corr().values[0, 1]
new_corrs.append(new_corr)
old_corrs.append(old_corr)
treatment_effects.append(model.get_treatment_effect(X))
model.reset_params()
if new_corr:
logger.info("Old corr: %.2f, New corr: %.2f", np.mean(old_corrs), np.mean(new_corrs))
logger.info("Treatment effect (OLS): %.3f (%.4f)", np.mean(ols_betas), np.mean(ols_ses))
logger.info("Treatment effect: %.3f (%.4f)", np.mean(treatment_effects), np.std(treatment_effects))
def solution3():
print("Start problem 3")
print("Start data preparation for problem 3")
# firstly prepare data for problem 3, all text files will be saved at current path
data_preparation()
print("Results: ")
tags = ['tweets_#gohawks', 'tweets_#gopatriots', 'tweets_#nfl', \
'tweets_#patriots', 'tweets_#sb49', 'tweets_#superbowl']
# idx for tag index in tags
idx=0
for tag in tags:
tweets, parameters = [], []
f = open('problem 3 ' + tag + ' data.txt')
line = f.readline()
while len(line):
p = line.split()
tweets.append(float(p[0]))
parameters.append([float(p[i]) for i in range(len(p))])
line = f.readline()
del(tweets[0])
next_hour_tweets = np.array(tweets)
parameters.pop()
X = np.matrix(parameters)
res = sm.OLS(next_hour_tweets, X).fit()
print("Result of " + tag)
print(res.summary())
if tag == 'tweets_#gohawks' or tag=='tweets_#nfl' or tag=='tweets_#patriots':
scatterPlot(idx,0,1,2,X,next_hour_tweets,tag)
elif tag == 'tweets_#gopatriots':
scatterPlot(idx,1,3,4,X,next_hour_tweets, tag)
elif tag == 'tweets_#sb49':
scatterPlot(idx,0,1,5,X,next_hour_tweets,tag)
elif tag == 'tweets_#superbowl':
scatterPlot(idx,0,2,5,X, next_hour_tweets,tag)
idx += 1
f.close()
def macro_regression(ret, macro_index, macro_index_list):
    '''
    Regress each asset's return series on each macro index separately (one univariate
    OLS per asset/factor pair) and count, per factor, how many assets have a loading
    that is significant at the 10% level.
    :param DataFrame ret: asset return series, one column per asset
    :param DataFrame macro_index: macro index series, one column per factor
    :param list macro_index_list: macro factor names, used as keys of the count dict
    :return: [DataFrame, dict] [loading, significant_list]: factor loadings (assets x factors);
             number of assets with a significant loading for each factor
    '''
    loading = pd.DataFrame(np.zeros([ret.shape[1], macro_index.shape[1]]), columns=macro_index.columns)
    # count of significant loadings per macro factor
    significant_list = dict()
    for i in range(len(macro_index_list)):
        significant_list[macro_index_list[i]] = 0
    # p-value threshold for significance
    pvalue_threshold = 0.1
    # loop over assets
    for i in range(ret.shape[1]):
        y = ret.values[:, i]
        # loop over macro factors, one univariate regression each
        for j in range(len(macro_index_list)):
            x = macro_index.values[:, j]
            model = sm.OLS(y, x).fit()
            loading.iloc[i, j] = model.params[0]
            # single regressor, so index its p-value directly
            if model.pvalues[0] < pvalue_threshold:
                significant_list[macro_index_list[j]] += 1
    return [loading, significant_list]
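A hedged usage sketch with made-up return and macro series, just to show the calling convention and the shape of the outputs:
import numpy as np
import pandas as pd
import statsmodels.api as sm

np.random.seed(0)
ret = pd.DataFrame(np.random.randn(60, 4), columns=['s1', 's2', 's3', 's4'])
macro = pd.DataFrame(np.random.randn(60, 2), columns=['cpi', 'gdp'])
loading, sig = macro_regression(ret, macro, ['cpi', 'gdp'])
print(loading)  # one loading per (asset, factor) pair
print(sig)      # assets significant at the 10% level, per factor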
def rm_reg_ri(rm, ret):
    '''
    Regress each asset's return series on the market return (no intercept) and
    collect the resulting market loadings.
    '''
    # one loading per asset
    X = np.array(np.zeros([ret.shape[1], 1]))
    # univariate regression of each asset's return on the market return
    for i in range(ret.shape[1]):
        model = sm.OLS(ret.values[:, i], rm.values).fit()
        X[i, :] = model.params[0]
    X = pd.DataFrame(X)
    return X
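A short hedged sketch with synthetic data; the three return columns are built with loadings of roughly 1.2, 0.8, and 1.0 on the market series:
import numpy as np
import pandas as pd
import statsmodels.api as sm

np.random.seed(1)
rm = pd.DataFrame(np.random.randn(120, 1), columns=['rm'])
ret = pd.DataFrame(rm.values * np.array([1.2, 0.8, 1.0]) + 0.1 * np.random.randn(120, 3),
                   columns=['a', 'b', 'c'])
print(rm_reg_ri(rm, ret))  # estimated loadings, close to [1.2, 0.8, 1.0]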
def fb_reg_over_time(ret, data, interval):
    '''
    First-pass (Fama-MacBeth style) regression over a given time window: for each stock,
    regress its return on each factor's exposure series and collect the slope coefficients.
    :param DataFrame ret: stock returns, one column per stock
    :param {string: DataFrame} data: factor exposures, one DataFrame per factor
    :param [int] interval: row positions defining the estimation window
    :return: DataFrame X: factor loadings, one row per stock and one column per factor
    '''
    # X[i, j] holds the loading of stock (i+1) on factor (j+1) (rows: stocks, cols: factors)
    X = np.zeros([ret.shape[1], len(data)])
    # num_of_factor indexes the factor currently being processed
    num_of_factor = 0
    # name of factors, prepared for converting X to DataFrame, with columns=factor_name
    factor_name = []
    # iterate over factors; i is a (name, DataFrame) tuple
    for i in data.items():
        factor_name = factor_name + [i[0]]
        # restrict the factor exposures to the estimation window
        interval_data = i[1].iloc[interval]
        # loop over stocks
        for j in range(i[1].shape[1]):
            # regress stock j's return on its exposure to the current factor
            model = sm.OLS(ret[j].values, interval_data[j].values).fit()
            # store the slope coefficient
            X[j, num_of_factor] = model.params[0]
        # move on to the next factor
        num_of_factor += 1
    # convert X to a DataFrame
    X = pd.DataFrame(X)
    X.fillna(0, inplace=True)
    X.columns = factor_name
    # return the loadings as a DataFrame
    return X
def fb_reg_over_all_time(ret, data):
    '''
    First-pass (Fama-MacBeth style) regression over the full sample: for each stock,
    regress its return on each factor's exposure series and collect the slope coefficients.
    :param DataFrame ret: stock returns, one column per stock
    :param {string: DataFrame} data: factor exposures, one DataFrame per factor
    :return: DataFrame X: factor loadings, one row per stock and one column per factor
    '''
    # X[i, j] holds the loading of stock (i+1) on factor (j+1) (rows: stocks, cols: factors)
    X = np.zeros([ret.shape[1], len(data)])
    # num_of_factor indexes the factor currently being processed
    num_of_factor = 0
    # name of factors, prepared for converting X to DataFrame, with columns=factor_name
    factor_name = []
    # iterate over factors; i is a (name, DataFrame) tuple
    for i in data.items():
        factor_name = factor_name + [i[0]]
        # loop over stocks
        for j in range(i[1].shape[1]):
            # regress stock j's return on its exposure to the current factor
            model = sm.OLS(ret[j].values, i[1][j].values).fit()
            # store the slope coefficient
            X[j, num_of_factor] = model.params[0]
        # move on to the next factor
        num_of_factor += 1
    # convert X to a DataFrame
    X = pd.DataFrame(X)
    X.fillna(0, inplace=True)
    X.columns = factor_name
    # return the loadings as a DataFrame
    return X
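A hedged usage sketch for the full-sample variant with made-up exposures; the frames keep pandas' default integer column labels because the function indexes columns as ret[j] and i[1][j]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

np.random.seed(2)
n_periods, n_stocks = 50, 3
exposures = {'value': pd.DataFrame(np.random.randn(n_periods, n_stocks)),
             'size': pd.DataFrame(np.random.randn(n_periods, n_stocks))}
ret = pd.DataFrame(np.random.randn(n_periods, n_stocks))
print(fb_reg_over_all_time(ret, exposures))  # one row per stock, one column per factor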
def fit(self, values, targets, penalty=0.0):
self._knots = _get_percentiles(values, num_percentiles=self.num_percentiles)
self.basis_matrix = _get_basis_vector(values, self._knots)
X = np.vstack((self.basis_matrix, np.sqrt(penalty) * self._penalty_matrix()))
y = np.asarray(targets + np.zeros((self.num_percentiles + 2, 1)).flatten().tolist())
self.spline = sm.OLS(y, X).fit()
return self
# ols.py, from the PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda repository (SignalMedia)
def _filter_data(lhs, rhs, weights=None):
"""
Cleans the input for single OLS.
Parameters
----------
lhs : Series
Dependent variable in the regression.
rhs : dict, whose values are Series, DataFrame, or dict
Explanatory variables of the regression.
weights : array-like, optional
1d array of weights. If None, equivalent to an unweighted OLS.
Returns
-------
Series, DataFrame
Cleaned lhs and rhs
"""
if not isinstance(lhs, Series):
if len(lhs) != len(rhs):
raise AssertionError("length of lhs must equal length of rhs")
lhs = Series(lhs, index=rhs.index)
rhs = _combine_rhs(rhs)
lhs = DataFrame({'__y__': lhs}, dtype=float)
pre_filt_rhs = rhs.dropna(how='any')
combined = rhs.join(lhs, how='outer')
if weights is not None:
combined['__weights__'] = weights
valid = (combined.count(1) == len(combined.columns)).values
index = combined.index
combined = combined[valid]
if weights is not None:
filt_weights = combined.pop('__weights__')
else:
filt_weights = None
filt_lhs = combined.pop('__y__')
filt_rhs = combined
if hasattr(filt_weights, 'to_dense'):
filt_weights = filt_weights.to_dense()
return (filt_lhs.to_dense(), filt_rhs.to_dense(), filt_weights,
pre_filt_rhs.to_dense(), index, valid)
def _residualNet(data,uselog=True,c=None,p=None,x=None,useaggregate=True,numericalControls=[],categoricalControls=[]):
'''
Given the data on a bipartite network of the form source,target,flow
Parameters
----------
data : pandas.DataFrame
Raw data. It has source,target,volume (trade, number of people etc.).
c,p,x : str (optional)
Labels of the columns in data used for source,target,volume.
If not provided it will use the first, second, and third.
numericalControls : list
List of columns to use as numerical controls.
categoricalControls : list
List of columns to use as categorical controls.
uselog : boolean (True)
If True it will use the logarithm of the provided weight.
useaggregate : boolean (True)
If True it will calculate the aggregate volume on both sides (c and p) and use them as numerical controls.
Returns
-------
net : pandas.DataFrame
Table with c,p,x,x_res, where x_res is the residual of regressing x on the given control variables.
'''
c = data.columns.values[0] if c is None else c
p = data.columns.values[1] if p is None else p
x = data.columns.values[2] if x is None else x
data_ = data[[c,p,x]+numericalControls+categoricalControls]
if useaggregate:
data_ = merge(data_,data.groupby(c).sum()[[x]].reset_index().rename(columns={x:x+'_'+c}))
data_ = merge(data_,data.groupby(p).sum()[[x]].reset_index().rename(columns={x:x+'_'+p}))
numericalControls = numericalControls + [x+'_'+c, x+'_'+p]  # add the aggregate-volume columns as numerical controls
if uselog:
data_ = data_[data_[x]!=0]
data_[x] = np.log10(data_[x])
if useaggregate:
data_[x+'_'+c] = np.log10(data_[x+'_'+c])
data_[x+'_'+p] = np.log10(data_[x+'_'+p])
_categoricalControls = []
for var in set(categoricalControls):
vals = list(set(data_[var]))
for v in vals[1:]:
_categoricalControls.append(var+'_'+str(v))
data_[var+'_'+str(v)]=0
data_.loc[data_[var]==v,var+'_'+str(v)]=1
Y = data_[x].values
X = data_[list(set(numericalControls))+list(set(_categoricalControls))].values
X = sm.add_constant(X)
model = sm.OLS(Y,X).fit()
data_[x+'_res'] = Y-model.predict(X)
return data_[[c,p,x,x+'_res']]
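A hedged usage sketch with a made-up bipartite flow table; it assumes the module that defines _residualNet imports pandas' merge unqualified (the function calls merge() directly) along with the usual np/pd/sm names:
import numpy as np
import pandas as pd
from pandas import merge  # _residualNet calls merge() without a module prefix
import statsmodels.api as sm

flows = pd.DataFrame({
    'exporter': ['A', 'A', 'B', 'B', 'C', 'C'],
    'importer': ['X', 'Y', 'X', 'Y', 'X', 'Y'],
    'value':    [10.0, 5.0, 8.0, 2.0, 20.0, 1.0],
})
net = _residualNet(flows)
print(net)  # exporter, importer, value, value_res (residual after the exporter/importer totals)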
def wt_plot(self, pdf):
"""
Create a plot of the linear fit of the wild type variant.
*pdf* is an open PdfPages instance.
Only created for selections that use WLS or OLS scoring and have a wild type specified.
Uses :py:func:`~plots.fit_axes` for the plotting.
"""
logging.info("Creating wild type fit plot", extra={'oname' : self.name})
# get the data and calculate log ratios
if "variants" in self.labels:
wt_label = "variants"
elif "identifiers" in self.labels:
wt_label = "identifiers"
data = self.store.select("/main/{}/counts".format(wt_label), where='index = "{}"'.format(WILD_TYPE_VARIANT)).iloc[0]
sums = self.store['/main/{}/counts'.format(wt_label)].sum(axis="index") # sum of complete cases (N')
yvalues = np.log(data + 0.5) - np.log(sums + 0.5)
xvalues = [tp / float(max(self.timepoints)) for tp in self.timepoints]
# fit the line
X = sm.add_constant(xvalues) # fit intercept
if self.scoring_method == "WLS":
W = 1 / (1 / (data + 0.5) + 1 / (sums + 0.5))
fit = sm.WLS(yvalues, X, weights=W).fit()
elif self.scoring_method == "OLS":
fit = sm.OLS(yvalues, X).fit()
else:
raise ValueError('Invalid regression scoring method "{}" [{}]'.format(self.scoring_method, self.name))
intercept, slope = fit.params
slope_se = fit.bse['x1']
# make the plot
fig, ax = plt.subplots()
fig.set_tight_layout(True)
fit_axes(ax, xvalues, yvalues, slope, intercept, xlabels=self.timepoints)
fit_axes_text(ax, cornertext="Slope {:3.2f}\nSE {:.1f}".format(slope, slope_se))
ax.set_title("Wild Type Shape\n{}".format(self.name))
ax.set_ylabel("Log Ratio (Complete Cases)")
pdf.savefig(fig)
plt.close(fig)