def build_model(y, x, add_constant=True):
'''
Build a linear regression model from the provided data
Provided:
y: a series or single column dataframe holding our solution vector for linear regression
x: a dataframe that runs parallel to y, with all the features for our linear regression
add_constant: a boolean, if true it will add a constant row to our provided x data. Otherwise
this method assumes you've done-so already, or do not want one for some good reason
Return: a linear regression model from StatsModels and the data which was used to train the model.
If add_constant was true this will be a new dataframe, otherwise it will be x
'''
if add_constant:
x = sm.add_constant(x)
model = sm.OLS(y, x).fit()
return (model, x)
python类OLS的实例源码
def dispersion_test(yhat, y, k=100):
""" Implement the regression based dispersion test with k re-sampling.
Args:
yhat (np.array): predicted mutation count
y (np.array): observed mutation count
k (int):
Returns:
float, float: p-value, theta
"""
theta = 0
pval = 0
for i in range(k):
y_sub, yhat_sub = resample(y, yhat, random_state=i)
# (np.power((y - yhat), 2) - y) / yhat for Poisson regression
aux = (np.power((y_sub - yhat_sub), 2) - yhat_sub) / yhat_sub
mod = sm.OLS(aux, yhat_sub)
res = mod.fit()
theta += res.params[0]
pval += res.pvalues[0]
theta = theta/k
pval = pval/k
return pval, theta
def lm(data, xseq, **params):
"""
Fit OLS / WLS if data has weight
"""
X = sm.add_constant(data['x'])
Xseq = sm.add_constant(xseq)
try:
model = sm.WLS(data['y'], X, weights=data['weight'])
except KeyError:
model = sm.OLS(data['y'], X)
results = model.fit(**params['method_args'])
data = pd.DataFrame({'x': xseq})
data['y'] = results.predict(Xseq)
if params['se']:
alpha = 1 - params['level']
prstd, iv_l, iv_u = wls_prediction_std(
results, Xseq, alpha=alpha)
data['se'] = prstd
data['ymin'] = iv_l
data['ymax'] = iv_u
return data
def _degradation_CI(results):
'''
Description
-----------
Monte Carlo estimation of uncertainty in degradation rate from OLS results
Parameters
----------
results: OLSResults object from fitting a model of the form:
results = sm.OLS(endog = df.energy_ma, exog = df.loc[:,['const','years']]).fit()
Returns
-------
68.2% confidence interval for degradation rate
'''
sampled_normal = np.random.multivariate_normal(results.params, results.cov_params(), 10000)
dist = sampled_normal[:, 1] / sampled_normal[:, 0]
Rd_CI = np.percentile(dist, [50 - 34.1, 50 + 34.1]) * 100.0
return Rd_CI
def __get_tail_weighted_time_period(self):
""" Method to approximate the weighted-average development age assuming
exponential tail fit.
Returns: float32
"""
#n = self.triangle.ncol-1
#y = self.f[:n]
#x = np.array([i + 1 for i in range(len(y))])
#ldf_reg = stats.linregress(x, np.log(y - 1))
#time_pd = (np.log(self.f[n] - 1) - ldf_reg[1]) / ldf_reg[0]
n = self.triangle.ncol-1
y = Series(self.f[:n])
x = [num+1 for num, item in enumerate(y)]
y.index = x
x = sm.add_constant((y.index)[y>1])
y = y[y>1]
ldf_reg = sm.OLS(np.log(y-1),x).fit()
time_pd = (np.log(self.f[n] - 1) - ldf_reg.params[0]) / ldf_reg.params[1]
#tail_factor = np.exp(tail_model.params[0] + np.array([i+2 for i in range(n,n+100)]) * tail_model.params[1]).astype(float) + 1)
return time_pd
def __init__(self, x, y, w):
self.x = x
self.y = y
self.w = w
WLS = sm.WLS(y,x, w)
OLS = sm.OLS(WLS.wendog,WLS.wexog).fit()
self.coefficient = OLS.params[0]
self.WSSResidual = OLS.ssr
self.fittedvalues = OLS.predict(x)
self.residual = OLS.resid
if len(x) == 1:
self.mean_square_error = np.nan
self.standard_error = np.nan
self.sigma = np.nan
self.std_resid = np.nan
else:
self.mean_square_error = OLS.mse_resid
self.standard_error = OLS.params[0]/OLS.tvalues[0]
self.sigma = np.sqrt(self.mean_square_error)
self.std_resid = OLSInfluence(OLS).resid_studentized_internal
def __get_MCL_model(self):
modelsI=[]
modelsP=[]
for i in range(len(self.Incurred.data.columns)):
modelsI.append(cl.WRTO(self.Incurred.data.iloc[:,i].dropna(), self.Paid.data.iloc[:,i].dropna(), w=1/self.Incurred.data.iloc[:,i].dropna()))
modelsP.append(cl.WRTO(self.Paid.data.iloc[:,i].dropna(), self.Incurred.data.iloc[:,i].dropna(), w=1/self.Paid.data.iloc[:,i].dropna()))
q_f = np.array([item.coefficient for item in modelsI])
qinverse_f = np.array([item.coefficient for item in modelsP])
rhoI_sigma = np.array([item.sigma for item in modelsI])
rhoP_sigma = np.array([item.sigma for item in modelsP])
#y = np.log(rhoI_sigma[:-1])
#x = np.array([i + 1 for i in range(len(y))])
#x = sm.add_constant(x)
#OLS = sm.OLS(y,x).fit()
#tailsigma = np.exp((x[:,1][-1]+ 1) * OLS.params[1] + OLS.params[0])
return rhoI_sigma, rhoP_sigma, q_f, qinverse_f
def model_fit_to_dataframe(fit):
"""
Take an object containing a statsmodels OLS model fit and extact
the main model fit metrics into a data frame.
Parameters
----------
fit : a statsmodels fit object
Model fit object obtained from a linear model trained using
`statsmodels.OLS`.
Returns
-------
df_fit : pandas DataFrame
Data frame with the main model fit metrics.
"""
df_fit = pd.DataFrame({"N responses": [int(fit.nobs)]})
df_fit['N features'] = int(fit.df_model)
df_fit['R2'] = fit.rsquared
df_fit['R2_adjusted'] = fit.rsquared_adj
return df_fit
test_ols.py 文件源码
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
作者: SignalMedia
项目源码
文件源码
阅读 19
收藏 0
点赞 0
评论 0
def checkOLS(self, exog, endog, x, y):
reference = sm.OLS(endog, sm.add_constant(exog, prepend=False)).fit()
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
result = ols(y=y, x=x)
# check that sparse version is the same
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
sparse_result = ols(y=y.to_sparse(), x=x.to_sparse())
_compare_ols_results(result, sparse_result)
assert_almost_equal(reference.params, result._beta_raw)
assert_almost_equal(reference.df_model, result._df_model_raw)
assert_almost_equal(reference.df_resid, result._df_resid_raw)
assert_almost_equal(reference.fvalue, result._f_stat_raw[0])
assert_almost_equal(reference.pvalues, result._p_value_raw)
assert_almost_equal(reference.rsquared, result._r2_raw)
assert_almost_equal(reference.rsquared_adj, result._r2_adj_raw)
assert_almost_equal(reference.resid, result._resid_raw)
assert_almost_equal(reference.bse, result._std_err_raw)
assert_almost_equal(reference.tvalues, result._t_stat_raw)
assert_almost_equal(reference.cov_params(), result._var_beta_raw)
assert_almost_equal(reference.fittedvalues, result._y_fitted_raw)
_check_non_raw_results(result)
def ols(data,iv,dv1,dv2,dv3,title='OLS Summary',table_title=''):
if len(table_title) == 0:
table_title = "Independent Variable : " + str(iv)
features = str(iv + ' ~ ' + dv1 + ' + ' + dv2 + ' + ' + dv3)
y, X = dmatrices(features, data=data, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit()
result = pd.DataFrame.transpose(pd.DataFrame([res.params,res.tvalues,res.pvalues,res.bse]))
result.columns = ['coef','t','p_t','std_error']
data = result.round(decimals=4)
html = display(HTML("<style type=\"text/css\"> .tg {border-collapse:collapse;border-spacing:0;border:none;} .tg td{font-family:Arial, sans-serif;font-size:14px;padding:10px 5px;border-style:solid;border-width:0px;overflow:hidden;word-break:normal;} .tg th{font-family:Arial, sans-serif;font-size:14px;font-weight:normal;padding:10px 5px;border-style:solid;border-width:0px;overflow:hidden;word-break:normal;} .tg .tg-ejgj{font-family:Verdana, Geneva, sans-serif !important;;vertical-align:top} .tg .tg-anay{font-family:Verdana, Geneva, sans-serif !important;;text-align:right;vertical-align:top} .tg .tg-jua3{font-weight:bold;font-family:Verdana, Geneva, sans-serif !important;;text-align:right;vertical-align:top} h5{font-family:Verdana;} h4{font-family:Verdana;} hr{height: 3px; background-color: #333;} .hr2{height: 1px; background-color: #333;} </style> <table class=\"tg\" style=\"undefined;table-layout: fixed; width: 500px; border-style: hidden; border-collapse: collapse;\"> <colgroup> <col style=\"width: 150px\"> <col style=\"width: 120px\"> <col style=\"width: 120px\"> <col style=\"width: 120px\"> <col style=\"width: 120px\"> </colgroup> <h5>" + str(table_title) + "</h5> <h4><i>" + str(title) + "</i></h4> <hr align=\"left\", width=\"630\"> <tr> <th class=\"tg-ejgj\"></th> <th class=\"tg-anay\">" + str(data.keys()[0]) + "</th> <th class=\"tg-anay\">" + str(data.keys()[1]) + "</th> <th class=\"tg-anay\">" + str(data.keys()[2]) + "</th> <th class=\"tg-anay\">" + str(data.keys()[3]) + "</th> </tr> <tr> <td class=\"tg-ejgj\">" + str(data.index[0]) + "</td> <td class=\"tg-jua3\">" + str(data[data.keys()[0]][0]) + "</td> <td class=\"tg-jua3\">" + str(data[data.keys()[1]][0]) + "</td> <td class=\"tg-jua3\">" + str(data[data.keys()[2]][0]) + "</td> <td class=\"tg-jua3\">" + str(data[data.keys()[3]][0]) + "</td> </tr> <tr> <td class=\"tg-ejgj\">" + str(data.index[1]) + "</td> <td class=\"tg-jua3\">" + str(data[data.keys()[0]][1]) + "</td> <td class=\"tg-jua3\">" + str(data[data.keys()[1]][1]) + "</td> <td class=\"tg-jua3\">" + str(data[data.keys()[2]][1]) + "</td> <td class=\"tg-jua3\">" + str(data[data.keys()[3]][1]) + "</td> </tr> <tr> <td class=\"tg-ejgj\">" + str(data.index[2]) + "</td> <td class=\"tg-jua3\">" + str(data[data.keys()[0]][2]) + "</td> <td class=\"tg-jua3\">" + str(data[data.keys()[1]][2]) + "</td> <td class=\"tg-jua3\">" + str(data[data.keys()[2]][2]) + "</td> <td class=\"tg-jua3\">" + str(data[data.keys()[3]][2]) + "</td> </tr> <tr> <td class=\"tg-ejgj\">" + str(data.index[3]) + "</td> <td class=\"tg-jua3\">" + str(data[data.keys()[0]][3]) + "</td> <td class=\"tg-jua3\">" + str(data[data.keys()[1]][3]) + "</td> <td class=\"tg-jua3\">" + str(data[data.keys()[2]][3]) + "</td> <td class=\"tg-jua3\">" + str(data[data.keys()[3]][3]) + "</td> </tr>"))
return html
def estimate(self):
"""
Estimate the FE model with OLS
:return: results
"""
# demean
self.__demean()
# first, make a dataframe out of the panel of indvars
x_dataframe = self.x_demeaned.transpose(2,0,1).to_frame(False) # set to False to not drop NaNs
# unstack the depvar dataframe into a series
y_series = self.y_demeaned.unstack()
# fit regression model with statsmodels
results = sm.OLS(y_series.values, x_dataframe.values, missing='drop').fit()
print(results.summary(self.depvar, self.indvars))
self.result = results
def estimate(self):
"""
Estimate the first differenced OLS
:return: results
"""
# first difference data
self.__first_diff()
# first, make a dataframe out of the panel of indvars
x_dataframe = self.fd_x.transpose(2,0,1).to_frame(False) # set to False to not drop NaNs
# unstack the depvar dataframe into a series
y_series = self.fd_y.unstack()
# fit regression model with statsmodels
results = sm.OLS(y_series.values, x_dataframe.values, missing='drop').fit()
print(results.summary(self.depvar, self.indvars))
self.result = results
def solution2():
print("Start problem 2")
print("Start data preparation for problem 2")
dataPreparation()
print("Results: ")
tags = ['tweets_#gohawks', 'tweets_#gopatriots', 'tweets_#nfl', \
'tweets_#patriots', 'tweets_#sb49', 'tweets_#superbowl']
#tweets, retweets, sum of followers, maximum followers, time of the day
for tag in tags:
tweets, parameters = [], []
f = open('problem 2 ' + tag + ' data.txt')
line = f.readline()
while len(line):
p = line.split()
tweets.append(float(p[0]))
parameters.append([float(p[i]) for i in range(len(p))])
line = f.readline()
del(tweets[0])
next_hour_tweets = tweets
parameters.pop()
res = sm.OLS(next_hour_tweets, parameters).fit()
print("Result of " + tag)
print(res.summary())
f.close()
def macro_reg_ret(ret, macro_data):
'''
?????????????????????????????????????????????????
??????????????????????????*???????????????-??????
:param DataFrame ret: ??????????
:param DataFrame macro_index: ??????????
:return: [DataFrame,dict] [loading,significant_list]: ???????????????;??????????
'''
macro_loading = pd.DataFrame(np.zeros([ret.shape[1], macro_data.shape[1]]))
# ??????????
for i in range(ret.shape[1]):
y = ret.values[:, i]
# ??????????????
for j in range(macro_data.shape[1]):
x = macro_data.values[:, j]
model = sm.OLS(y, x).fit()
macro_loading.iloc[i, j] = model.params[0]
return macro_loading
def ret_reg_loading(tech_loading,ret,dummy):
'''
???????111????Loading??????111??????????????????????????
:param tech_loading:
:param ret:
:return:
'''
# ???????
significant_days=dict()
for tech in tech_loading.columns:
significant_days[tech]=0
# ??????111?????loading????
for tech in tech_loading.columns:
# ?????111???????????
for i in range(ret.shape[0]):
model = sm.OLS(ret.iloc[i,:].values, pd.concat([tech_loading[tech],dummy],axis=1).values).fit()
pvalue=model.pvalues[0]
if pvalue<0.1:
significant_days[tech]+=1
return significant_days
def ind_reg_ret(ind_ret,ret):
'''
???????????????????????????????????????????????
:param DataFrame ind_ret: ????????*????
:param DataFrame ret: ????????*????
:return:
'''
#???pvalue????0.1
pvalue_threshold=0.1
ind_loading=np.zeros([ret.shape[1],ind_ret.shape[1]])
# ?????????????????????????
for i in range(ret.shape[1]):
# ?????????????
for j in range(ind_ret.shape[1]):
model=sm.OLS(ret.values[:,i],ind_ret.values[:,j]).fit()
#???????????
ind_loading[i,j]=model.params[0]
#??pvalue?????????????????significant_stocks_list?1
ind_loading=pd.DataFrame(ind_loading,columns=ind_ret.columns)
return ind_loading
def Regress(self, print_data):
a = np.ndarray(shape=(len(self._rows), len(self._REGRESS_COLUMNS)))
b = np.zeros(shape=(len(self._rows)))
rn = 0
if print_data:
print('Found %d rows' % (len(self._rows)))
print('Columns')
print('\t'.join(sorted(self._REGRESS_COLUMNS)))
for juris,cols in self._rows.iteritems():
if not self._IsValidInputData(cols):
continue
b[rn] = self._values[juris]
cn = 0
for c in sorted(self._REGRESS_COLUMNS):
a[rn,cn] = cols[c]
cn += 1
if print_data:
print('\t'.join(str(x) for x in a[rn:rn+1,][0].tolist()))
rn += 1
a.resize((rn, len(self._REGRESS_COLUMNS)))
b.resize(rn)
if print_data:
print('LogValue')
print('\n'.join(str(x) for x in b.tolist()))
results = sm.OLS(b, a).fit()
print(results.summary(yname='log(Votes)',
xname=sorted(self._REGRESS_COLUMNS), alpha=0.01))
i = 0
for juris,cols in sorted(self._rows.iteritems()):
p = []
for c in sorted(self._REGRESS_COLUMNS):
p.append(cols[c])
pred = results.predict(p)
diff = math.e**self._values[juris] - math.e**pred
print('%-20s\t%7d\t%7d\t%7.4f\t%7d' % (juris, math.e**pred,
math.e**self._values[juris], diff / math.e**pred, diff))
return results
def get_tail_factor(self, colname_sep='-'):
"""Estimate tail factor, idea from Thomas Mack:
Returns a tail factor based off of an exponential fit to the LDFs. This will
fail if the product of 2nd and 3rd to last LDF < 1.0001. This also fails if
the estimated tail is larger than 2.0. In other areas, exponential fit is
rejected if the slope parameter p-value >0.5. This is currently representative
of the R implementation of this package, but may be enhanced in the future to be
p-value based.
Parameters:
colname_sep : str
text to join the names of two adjacent columns representing the
age-to-age factor column name.
Returns:
Pandas.Series of the tail factor.
"""
LDF = np.array(self.get_LDF()[:self.triangle.ncol-1])
if self.tail==False:
tail_factor=1
elif len(LDF[LDF>1]) < 2:
warn("Not enough factors larger than 1.0 to fit an exponential regression.")
tail_factor = 1
elif (LDF[-3] * LDF[-2] > 1.0001):
y = Series(LDF)
x = sm.add_constant((y.index+1)[y>1])
y = LDF[LDF>1]
n, = np.where(LDF==y[-1])[0]
tail_model = sm.OLS(np.log(y-1),x).fit()
tail_factor = np.product(np.exp(tail_model.params[0] + np.array([i+2 for i in range(n,n+100)]) * tail_model.params[1]).astype(float) + 1)
if tail_factor > 2:
warn("The estimate tail factor was bigger than 2 and has been reset to 1.")
tail_factor = 1
if tail_model.f_pvalue > 0.05:
warn("The p-value of the exponential tail fit is insignificant and tail has been set to 1.")
tail_factor = 1
else:
warn("LDF[-2] * LDF[-1] is not greater than 1.0001 and tail has been set to 1.")
tail_factor = 1
return Series(tail_factor, index = [self.triangle.data.columns[-1] + colname_sep + 'Ult'])
def apply_half_life(self, time_series):
lag = np.roll(time_series, 1)
lag[0] = 0
ret = time_series - lag
ret[0] = 0
# adds intercept terms to X variable for regression
lag2 = sm.add_constant(lag)
model = sm.OLS(ret, lag2)
res = model.fit()
self.half_life = -np.log(2) / res.params[1]
def hedge_ratio(Y, X):
# Look into using Kalman Filter to calculate the hedge ratio
X = sm.add_constant(X)
model = sm.OLS(Y, X).fit()
return model.params[1]
def ols_coefficients_to_dataframe(coefs):
"""
Take a series containing OLS coefficients and convert it
to a data frame.
Parameters
----------
coefs : pandas Series
Series with feature names in the index and the coefficient
values as the data, obtained from a linear model trained
using `statsmodels.OLS`.
Returns
-------
df_coef : pandas DataFrame
Data frame with two columns, the first being the feature name
and the second being the coefficient value.
Note
----
The first row in the output data frame is always for the intercept
and the rest are sorted by feature name.
"""
# first create a sorted data frame for all the non-intercept features
non_intercept_columns = [c for c in coefs.index if c != 'const']
df_non_intercept = pd.DataFrame(coefs.filter(non_intercept_columns), columns=['coefficient'])
df_non_intercept.index.name = 'feature'
df_non_intercept = df_non_intercept.sort_index()
df_non_intercept.reset_index(inplace=True)
# now create a data frame that just has the intercept
df_intercept = pd.DataFrame([{'feature': 'Intercept',
'coefficient': coefs['const']}])
# append the non-intercept frame to the intercept one
df_coef = df_intercept.append(df_non_intercept, ignore_index=True)
# we always want to have the feature column first
df_coef = df_coef[['feature', 'coefficient']]
return df_coef
test_ols.py 文件源码
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
作者: SignalMedia
项目源码
文件源码
阅读 19
收藏 0
点赞 0
评论 0
def checkForSeries(self, x, y, series_x, series_y, **kwds):
# Consistency check with simple OLS.
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
result = ols(y=y, x=x, **kwds)
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
reference = ols(y=series_y, x=series_x, **kwds)
self.compare(reference, result)
ols.py 文件源码
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
作者: SignalMedia
项目源码
文件源码
阅读 28
收藏 0
点赞 0
评论 0
def __init__(self, y, x, intercept=True, weights=None, nw_lags=None,
nw_overlap=False):
import warnings
warnings.warn("The pandas.stats.ols module is deprecated and will be "
"removed in a future version. We refer to external packages "
"like statsmodels, see some examples here: http://statsmodels.sourceforge.net/stable/regression.html",
FutureWarning, stacklevel=4)
try:
import statsmodels.api as sm
except ImportError:
import scikits.statsmodels.api as sm
self._x_orig = x
self._y_orig = y
self._weights_orig = weights
self._intercept = intercept
self._nw_lags = nw_lags
self._nw_overlap = nw_overlap
(self._y, self._x, self._weights, self._x_filtered,
self._index, self._time_has_obs) = self._prepare_data()
if self._weights is not None:
self._x_trans = self._x.mul(np.sqrt(self._weights), axis=0)
self._y_trans = self._y * np.sqrt(self._weights)
self.sm_ols = sm.WLS(self._y.get_values(),
self._x.get_values(),
weights=self._weights.values).fit()
else:
self._x_trans = self._x
self._y_trans = self._y
self.sm_ols = sm.OLS(self._y.get_values(),
self._x.get_values()).fit()
ols.py 文件源码
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
作者: SignalMedia
项目源码
文件源码
阅读 33
收藏 0
点赞 0
评论 0
def _prepare_data(self):
"""
Cleans the input for single OLS.
Parameters
----------
lhs: Series
Dependent variable in the regression.
rhs: dict, whose values are Series, DataFrame, or dict
Explanatory variables of the regression.
Returns
-------
Series, DataFrame
Cleaned lhs and rhs
"""
(filt_lhs, filt_rhs, filt_weights,
pre_filt_rhs, index, valid) = _filter_data(self._y_orig, self._x_orig,
self._weights_orig)
if self._intercept:
filt_rhs['intercept'] = 1.
pre_filt_rhs['intercept'] = 1.
if hasattr(filt_weights, 'to_dense'):
filt_weights = filt_weights.to_dense()
return (filt_lhs, filt_rhs, filt_weights,
pre_filt_rhs, index, valid)
ols.py 文件源码
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
作者: SignalMedia
项目源码
文件源码
阅读 24
收藏 0
点赞 0
评论 0
def summary_as_matrix(self):
"""Returns the formatted results of the OLS as a DataFrame."""
results = self._results
beta = results['beta']
data = {'beta': results['beta'],
't-stat': results['t_stat'],
'p-value': results['p_value'],
'std err': results['std_err']}
return DataFrame(data, beta.index).T
def Orthogonalize(left_data, right_data, left_name, right_name):
"""?????
?????????[date,IDs,factor1,factor2...]
??
--------
left_name: str
??1?????
right_name: str or list
??2??????????????????
industry: str
??????????None?????????
"""
def OLS(data, left_name):
tempData = data.copy()
yData = np.array(tempData.pop(left_name))
xData = np.array(tempData)
NaNInd = pd.notnull(yData) & pd.notnull(xData).all(axis=1)
model = sm.OLS(yData, xData, missing='drop')
res = model.fit()
data[left_name+'_orthogonalized'] = np.nan
data.ix[NaNInd, left_name+'_orthogonalized'] = res.resid
return data
factor_1 = left_data[[left_name]]
if not isinstance(right_name, list):
right_name = [right_name]
factor_2 = right_data[right_name]
factor = pd.concat([factor_1, factor_2], axis=1)
factor['alpha'] = 1 # ?????
factor = factor.groupby(level=0).apply(OLS, left_name=left_name)
return factor[[left_name+'_orthogonalized']]
def Calculate_Liquidity_Coeff(df_list):
#list of lists of liq coeff per bond
liq_arr_list = []
#print 'df_list size: ', len(df_list)
for df in df_list:
if df.empty:
continue
# A temporary array for holding liquidity beta for each month
liq_arr = [np.nan] * num_months_CONST
#print df['cusip_id'][0]
# Group dataframe on index by month
for date, df_group in df.groupby(pd.TimeGrouper("M")):
month = ''.join([str(date.month),str(date.year)])
month_key = month_keys[month]
# When there are some data in current month,
if df_group.shape[0] > 0:
# Run regression (as equation (2)) to get liquidity measure
y,X = dmatrices('excess_return_1 ~ yld_pt + volume_and_sign',
data=df_group, return_type='dataframe')
#print date, X.shape
mod = sm.OLS(y,X)
res = mod.fit()
#set specific months with liquidity factors
#res.params(2) = liquidity coefficient
liq_arr[month_key] = res.params[2]
liq_arr_list.append(liq_arr) #store all liq coeff for each month per bond
return liq_arr_list
def Run_Regression(liq_month_list):
df = pd.DataFrame(index = month_list) #set dates as index
# Liquidity pi_t
df['liq_month_list_1'] = liq_month_list
# Liquidity pi_t-1
df['liq_month_list'] = df['liq_month_list_1'].shift(1)
# After shift, the first row is not longer valid data
df = df.drop(df.index[0], inplace = False)
## FIX A scaling factor to delta_pi_t is dropped here because we do not
## having historical amount outstanding of the bonds in the portfolio.
## The scaling factor is (M_t-1 - M_1), where M_t is total dollar value at
## end of month t-1, representing the total dollar value of the bonds
## in month t
# delta_pi_t = pi_t - pi_t-1
df['liq_delta_1'] = df['liq_month_list_1'] - df['liq_month_list']
# delta_pi_t-1
df['liq_delta'] = df['liq_delta_1'].shift(1)
# After shift, the first row is not longer valid data
df = df.drop(df.index[0], inplace = False)
# Run linear regression using equation (4)
y,X = dmatrices('liq_delta_1 ~ liq_delta + liq_month_list',
data = df, return_type = 'dataframe')
mod = sm.OLS(y,X)
res = mod.fit()
# Calculate the predicted change in liquidty values
df['liq_proxy_values'] = \
res.params[0] + res.params[1] * df['liq_delta'] + \
res.params[2] * df['liq_month_list']
# Calculate the actual - predicted change in liquidity --> the residual term
# --> the liquidity risk
df['residual_term'] = df['liq_delta_1'] - df['liq_proxy_values']
# Scale the magnitude of the liquidity risk for convenient use in later steps
df['residual_term'] = df['residual_term'] * 10000
return df
def fit(self, y, X, handler=None):
"""Fit the model.
Args:
y (pandas.DataFrame): The vector of endogenous variable.
X (pandas.DataFrame): The matrix of exogenous variables.
"""
# Creating the OLS model from StatsModels and fitting it
model = sm.OLS(y, X)
handler = self._results_handler if handler is None else handler
return handler(model.fit())
def ols(self, x, y):
xflatten = np.delete(x, [i*(x.shape[0]+1)for i in range(x.shape[0])])
yflatten = np.delete(y, [i*(y.shape[0]+1)for i in range(y.shape[0])])
xflatten = sm.add_constant(xflatten)
model = sm.OLS(yflatten,xflatten)
results = model.fit()
print results.summary()