def test_rank_methods_frame(self):
    tm.skip_if_no_package('scipy', '0.13', 'scipy.stats.rankdata')
    import scipy
    from scipy.stats import rankdata

    xs = np.random.randint(0, 21, (100, 26))
    xs = (xs - 10.0) / 10.0
    cols = [chr(ord('z') - i) for i in range(xs.shape[1])]

    for vals in [xs, xs + 1e6, xs * 1e-6]:
        df = DataFrame(vals, columns=cols)
        for ax in [0, 1]:
            for m in ['average', 'min', 'max', 'first', 'dense']:
                result = df.rank(axis=ax, method=m)
                sprank = np.apply_along_axis(
                    rankdata, ax, vals,
                    m if m != 'first' else 'ordinal')
                sprank = sprank.astype(np.float64)
                expected = DataFrame(sprank, columns=cols)
                if LooseVersion(scipy.__version__) >= '0.17.0':
                    expected = expected.astype('float64')
                tm.assert_frame_equal(result, expected)
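# Added aside (not from the original test suite): a minimal standalone sketch of
# the five tie-breaking methods the test exercises, on made-up data. Note that
# pandas' 'first' maps to scipy's 'ordinal', as the test's substitution shows.
from scipy.stats import rankdata
data = [1.0, 2.0, 2.0, 3.0]
for method in ['average', 'min', 'max', 'ordinal', 'dense']:
    print(method, rankdata(data, method=method))
# average [1. 2.5 2.5 4.] | min [1 2 2 4] | max [1 3 3 4]
# ordinal [1 2 3 4] | dense [1 2 2 3]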
# Python rankdata() usage examples
# Source: test_stats.py — project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda (author: SignalMedia)
def score_candidates(reactants, candidate_list, xs):
    # `model`, `ss`, `FROOT`, and RDKit's `Chem` are module-level context in the source file
    pred = model.predict(xs, batch_size=20)[0]
    rank = ss.rankdata(pred)

    fname = input('Enter file name to save to: ') + '.dat'  # raw_input in the Python 2 original
    with open(os.path.join(FROOT, fname), 'w') as fid:
        fid.write('FOR REACTANTS {}\n'.format(Chem.MolToSmiles(reactants)))
        fid.write('Candidate product\tCandidate edit\tProbability\tRank\n')
        for (c, candidate) in enumerate(candidate_list):
            candidate_smile = candidate[0]
            candidate_edit = candidate[1]
            # rankdata ranks ascending, so convert to a descending rank (1 = best)
            fid.write('{}\t{}\t{}\t{}\n'.format(
                candidate_smile, candidate_edit, pred[c], 1 + len(pred) - rank[c]
            ))
    print('Wrote to file {}'.format(os.path.join(FROOT, fname)))
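# Added aside: a toy illustration (made-up scores) of the ascending-to-descending
# rank conversion used above.
import scipy.stats as ss
pred = [0.1, 0.9, 0.5]
rank = ss.rankdata(pred)                        # array([1., 3., 2.])
descending = [1 + len(pred) - r for r in rank]  # [3.0, 1.0, 2.0] -> best gets 1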
def precision_n(test_scores, num_true, n):
    """
    Precision at n: the fraction of instances where any of the crowd's answers
    occurs within the ranker's first n choices.
    For more details see: http://www.aclweb.org/anthology/D13-1030
    The first num_true[i] entries of test_scores[i] are the predicted scores
    for the true antecedents of the i-th sentence w/ PA.
    :param test_scores: shape [batch_size, num_candidates]; for every sentence
        w/ PA, the predicted scores for its candidates
    :param num_true: shape [batch_size]; for every sentence w/ PA, the number
        of true antecedents
    :return: list of size n
    """
    precisions = []
    for i in range(n):
        precision = 0
        for k, item in enumerate(test_scores):
            # ordinal ranks converted so that 0 = highest-scoring candidate
            ranks = len(item) - rankdata(item, method='ordinal').astype(int)
            # note: range(i) here (not i + 1) makes precision@1 always 0;
            # the two variants below use range(i + 1)
            precision += min(1, len(set(ranks[:num_true[k]]) & set(range(i))))
            print(num_true[k])  # debug output left from the original (Python 2 print)
            print(ranks)
        precision /= float(len(test_scores))
        precision *= 100
        precisions.append(precision)
    return precisions
def precision_n(test_scores, num_true, n):
    """
    Precision at n: the fraction of instances where any of the crowd's answers
    occurs within the ranker's first n choices.
    For more details see: http://www.aclweb.org/anthology/D13-1030
    The first num_true[i] entries of test_scores[i] are the predicted scores
    for the true antecedents of the i-th sentence w/ PA.
    :param test_scores: shape [batch_size, num_candidates]; for every sentence
        w/ PA, the predicted scores for its candidates
    :param num_true: shape [batch_size]; for every sentence w/ PA, the number
        of true antecedents
    :return: list of size n
    """
    precisions = []
    for i in range(n):
        precision = 0
        for k, item in enumerate(test_scores):
            # ordinal ranks converted so that 0 = highest-scoring candidate
            ranks = len(item) - rankdata(item, method='ordinal').astype(int)
            precision += min(1, len(set(ranks[:i + 1]) & set(range(num_true[k]))))
        precision /= float(len(test_scores))
        precision *= 100
        precisions.append(precision)
    return precisions
def precision_n(test_scores, num_true, n):
    """
    Precision at n: the fraction of instances where any of the crowd's answers
    occurs within the ranker's first n choices.
    For more details see: http://www.aclweb.org/anthology/D13-1030
    The first num_true[i] entries of test_scores[i] are the predicted scores
    for the true antecedents of the i-th sentence w/ PA.
    :param test_scores: shape [batch_size, num_candidates]; for every sentence
        w/ PA, the predicted scores for its candidates
    :param num_true: shape [batch_size]; for every sentence w/ PA, the number
        of true antecedents
    :return: list of size n
    """
    precisions = []
    for i in range(n):
        precision = 0
        for k, item in enumerate(test_scores):
            # ordinal ranks converted so that 0 = highest-scoring candidate
            ranks = len(item) - rankdata(item, method='ordinal').astype(int)
            precision += min(1, len(set(ranks[:num_true[k]]) & set(range(i + 1))))
        precision /= float(len(test_scores))
        precision *= 100
        precisions.append(precision)
    return precisions
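# Added usage sketch for the precision_n variants above; the scores are made up.
# Two test items, with each item's true-antecedent scores listed first:
test_scores = [[0.9, 0.1, 0.4], [0.2, 0.8, 0.3]]  # item 0's true antecedent is ranked 1st
num_true = [1, 1]                                  # one true antecedent per item
print(precision_n(test_scores, num_true, n=3))     # precision@1..3 in percent -> [50.0, 50.0, 100.0]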
def compute(self, today, assets, out, close, returns):
    # rank(ts_argmax(SignedPower(((returns < 0) ? stddev(returns, 20) : close), 2.), 5)) - 0.5
    # (formula comment reconstructed from the generated code below)
    v000 = np.empty((5, out.shape[0]))
    for i0 in range(1, 6):
        v000000 = returns[-i0]
        v000001 = np.full(out.shape[0], 0.0)
        v00000 = v000000 < v000001
        v000010 = np.empty((20, out.shape[0]))
        for i1 in range(1, 21):
            v000010[-i1] = returns[-i0 - i1]
        v00001 = np.std(v000010, axis=0)
        v00002 = close[-i0]
        v0000lgcl = np.empty(out.shape[0])
        v0000lgcl[v00000] = v00001[v00000]
        v0000lgcl[~v00000] = v00002[~v00000]
        v0000 = v0000lgcl
        v0001 = np.full(out.shape[0], 2.0)
        v000[-i0] = np.power(v0000, v0001)
    v00 = np.argmax(v000, axis=0)
    v0 = stats.rankdata(v00)
    v1 = np.full(out.shape[0], 0.5)
    out[:] = v0 - v1
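# Note (added; an inference from the code shape, not stated in the source): the
# compute() snippets below appear machine-generated from formulaic-alpha
# expression strings in the style of "101 Formulaic Alphas". Each temporary
# vXY... names a node of the expression tree (v0 = the root's first operand,
# v00 = its first child, and so on), which is why the bodies are long but
# mechanical, and why each snippet carries the next alpha's formula as a
# trailing comment.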
# (-1 * correlation(rank(delta(log(volume), 2)), rank(((close - open) / open)), 6))
def compute(self, today, assets, out, volume, close, open):
    v0 = np.full(out.shape[0], -1.0)
    v10 = np.empty((6, out.shape[0]))
    for i0 in range(1, 7):
        v1000 = np.empty((3, out.shape[0]))
        for i1 in range(1, 4):
            v10000 = volume[-i0 - i1]
            v1000[-i1] = np.log(v10000)
        v100 = v1000[-1] - v1000[-3]
        v10[-i0] = stats.rankdata(v100)
    v11 = np.empty((6, out.shape[0]))
    for i0 in range(1, 7):
        v11000 = close[-i0]
        v11001 = open[-i0]
        v1100 = v11000 - v11001
        v1101 = open[-i0]
        v110 = v1100 / v1101
        v11[-i0] = stats.rankdata(v110)
    v1 = pd.DataFrame(v10).rolling(window=6).corr(pd.DataFrame(v11)).tail(1).as_matrix()[-1]
    out[:] = v0 * v1
# (-1 * correlation(rank(open), rank(volume), 10))
def compute(self, today, assets, out, close, open, vwap):
    v000 = open[-1]
    v00100 = np.empty((10, out.shape[0]))
    for i0 in range(1, 11):
        v00100[-i0] = vwap[-i0]
    v0010 = v00100.sum(axis=0)
    v0011 = np.full(out.shape[0], 10.0)
    v001 = v0010 / v0011
    v00 = v000 - v001
    v0 = stats.rankdata(v00)
    v10 = np.full(out.shape[0], -1.0)
    v11000 = close[-1]
    v11001 = vwap[-1]
    v1100 = v11000 - v11001
    v110 = stats.rankdata(v1100)
    v11 = np.abs(v110)
    v1 = v10 * v11
    out[:] = v0 * v1
# (-1 * correlation(open, volume, 10))
def compute(self, today, assets, out, volume, close, vwap):
    v0000 = np.empty((3, out.shape[0]))
    for i0 in range(1, 4):
        v00000 = vwap[-i0]
        v00001 = close[-i0]
        v0000[-i0] = v00000 - v00001
    v000 = np.max(v0000, axis=0)
    v00 = stats.rankdata(v000)
    v0100 = np.empty((3, out.shape[0]))
    for i0 in range(1, 4):
        v01000 = vwap[-i0]
        v01001 = close[-i0]
        v0100[-i0] = v01000 - v01001
    v010 = np.min(v0100, axis=0)
    v01 = stats.rankdata(v010)
    v0 = v00 + v01
    v100 = np.empty((4, out.shape[0]))
    for i0 in range(1, 5):
        v100[-i0] = volume[-i0]
    v10 = v100[-1] - v100[-4]
    v1 = stats.rankdata(v10)
    out[:] = v0 * v1
# (sign(delta(volume, 1)) * (-1 * delta(close, 1)))
def compute(self, today, assets, out, volume, returns, open):
    v00 = np.full(out.shape[0], -1.0)
    v0100 = np.empty((4, out.shape[0]))
    for i0 in range(1, 5):
        v0100[-i0] = returns[-i0]
    v010 = v0100[-1] - v0100[-4]
    v01 = stats.rankdata(v010)
    v0 = v00 * v01
    v10 = np.empty((10, out.shape[0]))
    for i0 in range(1, 11):
        v10[-i0] = open[-i0]
    v11 = np.empty((10, out.shape[0]))
    for i0 in range(1, 11):
        v11[-i0] = volume[-i0]
    v1 = pd.DataFrame(v10).rolling(window=10).corr(pd.DataFrame(v11)).tail(1).as_matrix()[-1]
    out[:] = v0 * v1
# (-1 * sum(rank(correlation(rank(high), rank(volume), 3)), 3))
def compute(self, today, assets, out, high, volume):
    v0 = np.full(out.shape[0], -1.0)
    v10 = np.empty((3, out.shape[0]))
    for i0 in range(1, 4):
        v1000 = np.empty((3, out.shape[0]))
        for i1 in range(1, 4):
            v10000 = high[-i0 - i1]
            v1000[-i1] = stats.rankdata(v10000)
        v1001 = np.empty((3, out.shape[0]))
        for i1 in range(1, 4):
            v10010 = volume[-i0 - i1]
            v1001[-i1] = stats.rankdata(v10010)
        v100 = pd.DataFrame(v1000).rolling(window=3).corr(pd.DataFrame(v1001)).tail(1).as_matrix()[-1]
        v10[-i0] = stats.rankdata(v100)
    v1 = v10.sum(axis=0)
    out[:] = v0 * v1
# (-1 * rank(covariance(rank(high), rank(volume), 5)))
def compute(self, today, assets, out, close, open):
    v0 = np.full(out.shape[0], -1.0)
    v10000 = np.empty((5, out.shape[0]))
    for i0 in range(1, 6):
        v1000000 = close[-i0]
        v1000001 = open[-i0]
        v100000 = v1000000 - v1000001
        v10000[-i0] = np.abs(v100000)
    v1000 = np.std(v10000, axis=0)
    v10010 = close[-1]
    v10011 = open[-1]
    v1001 = v10010 - v10011
    v100 = v1000 + v1001
    v1010 = np.empty((10, out.shape[0]))
    for i0 in range(1, 11):
        v1010[-i0] = close[-i0]
    v1011 = np.empty((10, out.shape[0]))
    for i0 in range(1, 11):
        v1011[-i0] = open[-i0]
    v101 = pd.DataFrame(v1010).rolling(window=10).corr(pd.DataFrame(v1011)).tail(1).as_matrix()[-1]
    v10 = v100 + v101
    v1 = stats.rankdata(v10)
    out[:] = v0 * v1
# ((-1 * sign(((close - delay(close, 7)) + delta(close, 7)))) * (1 + rank((1 + sum(returns, 250)))))
def compute(self, today, assets, out, close, returns):
    v00 = np.full(out.shape[0], -1.0)
    v01000 = close[-1]
    v010010 = close[-8]
    v01001 = v010010  # delay
    v0100 = v01000 - v01001
    v01010 = np.empty((8, out.shape[0]))
    for i0 in range(1, 9):
        v01010[-i0] = close[-i0]
    v0101 = v01010[-1] - v01010[-8]
    v010 = v0100 + v0101
    v01 = np.sign(v010)
    v0 = v00 * v01
    v10 = np.full(out.shape[0], 1.0)
    v1100 = np.full(out.shape[0], 1.0)
    v11010 = np.empty((250, out.shape[0]))
    for i0 in range(1, 251):
        v11010[-i0] = returns[-i0]
    v1101 = v11010.sum(axis=0)
    v110 = v1100 + v1101
    v11 = stats.rankdata(v110)
    v1 = v10 + v11
    out[:] = v0 * v1
# (((-1 * rank((open - delay(high, 1)))) * rank((open - delay(close, 1)))) * rank((open - delay(low, 1))))
def compute(self, today, assets, out, high, close, open, low):
    v000 = np.full(out.shape[0], -1.0)
    v00100 = open[-1]
    v001010 = high[-2]
    v00101 = v001010  # delay
    v0010 = v00100 - v00101
    v001 = stats.rankdata(v0010)
    v00 = v000 * v001
    v0100 = open[-1]
    v01010 = close[-2]
    v0101 = v01010  # delay
    v010 = v0100 - v0101
    v01 = stats.rankdata(v010)
    v0 = v00 * v01
    v100 = open[-1]
    v1010 = low[-2]
    v101 = v1010  # delay
    v10 = v100 - v101
    v1 = stats.rankdata(v10)
    out[:] = v0 * v1
# ((((sum(close, 8) / 8) + stddev(close, 8)) < (sum(close, 2) / 2)) ? (-1 * 1) : (((sum(close, 2) / 2) < ((sum(close, 8) / 8) - stddev(close, 8))) ? 1 : (((1 < (volume / adv20)) || ((volume / adv20) == 1)) ? 1 : (-1 * 1))))
def compute(self, today, assets, out, high, volume, close):
    v0 = np.full(out.shape[0], -1.0)
    v100 = np.empty((6, out.shape[0]))
    for i0 in range(1, 7):
        v1000 = np.empty((5, out.shape[0]))
        for i1 in range(1, 6):
            v1000[-i1] = high[-i0 - i1]
        v1001 = np.empty((5, out.shape[0]))
        for i1 in range(1, 6):
            v1001[-i1] = volume[-i0 - i1]
        v100[-i0] = pd.DataFrame(v1000).rolling(window=5).corr(pd.DataFrame(v1001)).tail(1).as_matrix()[-1]
    v10 = v100[-1] - v100[-6]
    v1100 = np.empty((20, out.shape[0]))
    for i0 in range(1, 21):
        v1100[-i0] = close[-i0]
    v110 = np.std(v1100, axis=0)
    v11 = stats.rankdata(v110)
    v1 = v10 * v11
    out[:] = v0 * v1
# (((sum(high, 20) / 20) < high) ? (-1 * delta(high, 2)) : 0)
def compute(self, today, assets, out, close, open):
    v000 = np.empty((200, out.shape[0]))
    for i0 in range(1, 201):
        # delay((open - close), 1): shift by one day within the window.
        # The generated original used open[-2] and close[-2] for every row,
        # which makes the series constant and the correlation undefined.
        v00000 = open[-i0 - 1]
        v00001 = close[-i0 - 1]
        v0000 = v00000 - v00001
        v000[-i0] = v0000  # delay
    v001 = np.empty((200, out.shape[0]))
    for i0 in range(1, 201):
        v001[-i0] = close[-i0]
    v00 = pd.DataFrame(v000).rolling(window=200).corr(pd.DataFrame(v001)).tail(1).as_matrix()[-1]
    v0 = stats.rankdata(v00)
    v100 = open[-1]
    v101 = close[-1]
    v10 = v100 - v101
    v1 = stats.rankdata(v10)
    out[:] = v0 + v1
# ((-1 * rank(Ts_Rank(close, 10))) * rank((close / open)))
def compute(self, today, assets, out, high, volume):
    v00 = np.full(out.shape[0], -1.0)
    v0100 = np.empty((10, out.shape[0]))
    for i0 in range(1, 11):
        v0100[-i0] = high[-i0]
    v010 = np.std(v0100, axis=0)
    v01 = stats.rankdata(v010)
    v0 = v00 * v01
    v10 = np.empty((10, out.shape[0]))
    for i0 in range(1, 11):
        v10[-i0] = high[-i0]
    v11 = np.empty((10, out.shape[0]))
    for i0 in range(1, 11):
        v11[-i0] = volume[-i0]
    v1 = pd.DataFrame(v10).rolling(window=10).corr(pd.DataFrame(v11)).tail(1).as_matrix()[-1]
    out[:] = v0 * v1
# (((high * low)^0.5) - vwap)
def compute(self, today, assets, out, volume, vwap):
    v0 = np.full(out.shape[0], -1.0)
    v10 = np.empty((5, out.shape[0]))
    for i0 in range(1, 6):
        v1000 = np.empty((5, out.shape[0]))
        for i1 in range(1, 6):
            v10000 = volume[-i0 - i1]
            v1000[-i1] = stats.rankdata(v10000)
        v1001 = np.empty((5, out.shape[0]))
        for i1 in range(1, 6):
            v10010 = vwap[-i0 - i1]
            v1001[-i1] = stats.rankdata(v10010)
        v100 = pd.DataFrame(v1000).rolling(window=5).corr(pd.DataFrame(v1001)).tail(1).as_matrix()[-1]
        v10[-i0] = stats.rankdata(v100)
    v1 = np.max(v10, axis=0)
    out[:] = v0 * v1
# (((((delay(close, 20) - delay(close, 10)) / 10) - ((delay(close, 10) - close) / 10)) < (-1 * 0.05)) ? 1 : ((-1 * 1) * (close - delay(close, 1))))
def compute(self, today, assets, out, returns, cap):
    # `cap` (market cap) added to the inputs: the body below uses cap[-1], but
    # the original snippet omitted it from the signature (a NameError as written)
    v0 = np.full(out.shape[0], 0.0)
    v10 = np.full(out.shape[0], 1.0)
    v110000 = np.empty((10, out.shape[0]))
    for i0 in range(1, 11):
        v110000[-i0] = returns[-i0]
    v11000 = v110000.sum(axis=0)
    v110010 = np.empty((3, out.shape[0]))
    for i0 in range(1, 4):
        v1100100 = np.empty((2, out.shape[0]))
        for i1 in range(1, 3):
            v1100100[-i1] = returns[-i0 - i1]
        v110010[-i0] = v1100100.sum(axis=0)
    v11001 = v110010.sum(axis=0)
    v1100 = v11000 / v11001
    v110 = stats.rankdata(v1100)
    v11100 = returns[-1]
    v11101 = cap[-1]
    v1110 = v11100 * v11101
    v111 = stats.rankdata(v1110)
    v11 = v110 * v111
    v1 = v10 * v11
    out[:] = v0 - v1
# (0 - (1 * ((close - vwap) / decay_linear(rank(ts_argmax(close, 30)), 2))))
def compute(self, today, assets, out, close, vwap):
    v0 = np.full(out.shape[0], 0.0)
    v10 = np.full(out.shape[0], 1.0)
    v1100 = close[-1]
    v1101 = vwap[-1]
    v110 = v1100 - v1101
    v1110 = np.empty((2, out.shape[0]))
    for i0 in range(1, 3):
        v111000 = np.empty((30, out.shape[0]))
        for i1 in range(1, 31):
            v111000[-i1] = close[-i0 - i1]
        v11100 = np.argmax(v111000, axis=0)
        v1110[-i0] = stats.rankdata(v11100)
    v111 = (v1110 * (np.arange(1.0, 3, 1.0) / 3)[:, np.newaxis]).sum(axis=0)  # decay_linear
    v11 = v110 / v111
    v1 = v10 * v11
    out[:] = v0 - v1
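# Added sketch of the decay_linear(x, d) pattern inlined above: linearly
# decaying weights over the window, normalized to sum to 1, with the heaviest
# weight on the most recent row. A hypothetical standalone helper:
import numpy as np
def decay_linear(window_values, d):
    # window_values: shape (d, n_assets); row -1 is the most recent day
    w = np.arange(1.0, d + 1.0)  # 1, 2, ..., d
    w /= w.sum()                 # normalize so the weights sum to 1
    return (window_values * w[:, np.newaxis]).sum(axis=0)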
def compute(self, today, assets, out, adv180, vwap):
    v000 = vwap[-1]
    v0010 = np.empty((16, out.shape[0]))
    for i0 in range(1, 17):
        v0010[-i0] = vwap[-i0]
    v001 = np.min(v0010, axis=0)
    v00 = v000 - v001
    v0 = stats.rankdata(v00)
    v100 = np.empty((18, out.shape[0]))
    for i0 in range(1, 19):
        v100[-i0] = vwap[-i0]
    v101 = np.empty((18, out.shape[0]))
    for i0 in range(1, 19):
        v101[-i0] = adv180[-i0]
    v10 = pd.DataFrame(v100).rolling(window=18).corr(pd.DataFrame(v101)).tail(1).as_matrix()[-1]
    v1 = stats.rankdata(v10)
    out[:] = v0 < v1
# ((rank(correlation(vwap, sum(adv20, 22.4101), 9.91009)) < rank(((rank(open) + rank(open)) < (rank(((high + low) / 2)) + rank(high))))) * -1)
def compute(self, today, assets, out, volume, adv50, low, vwap):
    v000 = np.empty((4, out.shape[0]))
    for i0 in range(1, 5):
        v000[-i0] = vwap[-i0]
    v001 = np.empty((4, out.shape[0]))
    for i0 in range(1, 5):
        v001[-i0] = volume[-i0]
    v00 = pd.DataFrame(v000).rolling(window=4).corr(pd.DataFrame(v001)).tail(1).as_matrix()[-1]
    v0 = stats.rankdata(v00)
    v100 = np.empty((12, out.shape[0]))
    for i0 in range(1, 13):
        v1000 = low[-i0]
        v100[-i0] = stats.rankdata(v1000)
    v101 = np.empty((12, out.shape[0]))
    for i0 in range(1, 13):
        v1010 = adv50[-i0]
        v101[-i0] = stats.rankdata(v1010)
    v10 = pd.DataFrame(v100).rolling(window=12).corr(pd.DataFrame(v101)).tail(1).as_matrix()[-1]
    v1 = stats.rankdata(v10)
    out[:] = v0 < v1
def compute(self, today, assets, out, close, adv20, vwap, open):
    v000 = np.empty((20, out.shape[0]))
    for i0 in range(1, 21):
        v0000 = np.empty((6, out.shape[0]))
        for i1 in range(1, 7):
            v0000[-i1] = close[-i0 - i1]
        v0001 = np.empty((6, out.shape[0]))
        for i1 in range(1, 7):
            v00010 = np.empty((15, out.shape[0]))
            for i2 in range(1, 16):
                v00010[-i2] = adv20[-i0 - i1 - i2]
            v0001[-i1] = v00010.sum(axis=0)
        v000[-i0] = pd.DataFrame(v0000).rolling(window=6).corr(pd.DataFrame(v0001)).tail(1).as_matrix()[-1]
    v00 = pd.DataFrame(v000).rank().tail(1).as_matrix()[-1]
    v01000 = open[-1]
    v01001 = close[-1]
    v0100 = v01000 + v01001
    v01010 = vwap[-1]
    v01011 = open[-1]
    v0101 = v01010 + v01011
    v010 = v0100 - v0101
    v01 = stats.rankdata(v010)
    v0 = v00 < v01
    v1 = np.full(out.shape[0], -1.0)
    out[:] = v0 * v1
# Source: test_stats.py — project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda (author: SignalMedia)
def test_rank_methods_series(self):
    tm.skip_if_no_package('scipy', '0.13', 'scipy.stats.rankdata')
    import scipy
    from scipy.stats import rankdata

    xs = np.random.randn(9)
    xs = np.concatenate([xs[i:] for i in range(0, 9, 2)])  # add duplicates
    np.random.shuffle(xs)

    index = [chr(ord('a') + i) for i in range(len(xs))]

    for vals in [xs, xs + 1e6, xs * 1e-6]:
        ts = Series(vals, index=index)
        for m in ['average', 'min', 'max', 'first', 'dense']:
            result = ts.rank(method=m)
            sprank = rankdata(vals, m if m != 'first' else 'ordinal')
            expected = Series(sprank, index=index)
            if LooseVersion(scipy.__version__) >= '0.17.0':
                expected = expected.astype('float64')
            tm.assert_series_equal(result, expected)
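# Added aside: the equivalence this test checks, in miniature (made-up data).
import numpy as np
import pandas as pd
from scipy.stats import rankdata
s = pd.Series([3.0, 1.0, 2.0, 2.0])
assert (s.rank(method='average').values == rankdata(s.values)).all()
assert (s.rank(method='first').values == rankdata(s.values, method='ordinal')).all()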
def make_submit(self, model, submit_file):
    data = list(self.eval_sets().values())[0]  # .values()[0] in the Python 2 original
    target_lines = list()
    answers = np.asarray([[idx] for idx in self.entity.keys()])
    for i, d in enumerate(data):
        num_candidate = len(self.entity)
        index_entities = range(num_candidate)  # xrange in the Python 2 original
        terms = d.split('\t')
        subjects = np.asarray([[terms[0]]] * num_candidate)
        relations = np.asarray([[terms[1]]] * num_candidate)
        sims = model.predict([subjects, relations, answers], batch_size=num_candidate).flatten()
        print(i)
        r = rankdata(sims, method='ordinal')
        index_candidates = nlargest(200, index_entities, key=lambda j: r[j])
        one_line = ' '.join([str(index_candidate) for index_candidate in index_candidates])
        target_lines.append(one_line + '\n')
    submit_file.writelines(target_lines)
def make_submit_rt(self, model, submit_file):
    data = list(self.eval_sets_rt().values())[0]  # .values()[0] in the Python 2 original
    target_lines = list()
    answers = np.asarray([[idx] for idx in self.entity.keys()])
    for i, d in enumerate(data):
        num_candidate = len(self.entity)
        index_entities = range(num_candidate)  # xrange in the Python 2 original
        terms = d.split('\t')
        relations = np.asarray([[terms[0]]] * num_candidate)
        objects = np.asarray([[terms[1]]] * num_candidate)
        sims = model.predict_rt([answers, relations, objects], batch_size=num_candidate).flatten()
        print(i)
        r = rankdata(sims, method='ordinal')
        index_candidates = nlargest(200, index_entities, key=lambda j: r[j])
        one_line = ' '.join([str(index_candidate) for index_candidate in index_candidates])
        target_lines.append(one_line + '\n')
    submit_file.writelines(target_lines)
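# Added aside: the rankdata + heapq.nlargest combination above selects the
# top-200 candidates by similarity; a roughly equivalent NumPy form (tie order
# may differ) would be:
import numpy as np
def top_k_indices(sims, k=200):
    return np.argsort(-np.asarray(sims), kind='stable')[:k]  # descending by score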
def arrau_precision_n(test_scores, num_true, all_candidates, punctuation_ids, ns):
    """
    Precision at n: the fraction of instances where any of the crowd's answers
    occurs within the ranker's first n choices.
    For more details see: http://www.aclweb.org/anthology/D13-1030
    The first num_true[i] entries of test_scores[i] are the predicted scores
    for the true antecedents of the i-th sentence w/ PA.
    :param test_scores: shape [batch_size, num_candidates]; for every sentence
        w/ PA, the predicted scores for its candidates
    :param num_true: shape [batch_size]; for every sentence w/ PA, the number
        of true antecedents
    :return: list of size ns
    """
    precisions = []
    for i in range(ns):
        precision = 0
        for k, item in enumerate(test_scores):
            # ordinal ranks converted so that 0 = highest-scoring candidate
            ranks = len(item) - rankdata(item, method='ordinal').astype(int)
            # precision += min(1, len(set(ranks[:i+1]) & set(range(num_true))))
            counter = 0
            for pred in ranks[:i + 1]:
                for gold in range(num_true[k]):
                    sym_difference = list(set(all_candidates[k][pred]) ^ set(all_candidates[k][gold]))
                    intersection_strip = [s for s in sym_difference if s not in punctuation_ids]
                    if len(intersection_strip) <= 1:
                        counter += 1
            precision += min(1, counter)
        precision /= float(len(test_scores))
        precision *= 100
        precisions.append(precision)
    return precisions
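# Added toy check of the matching rule above: a predicted candidate counts as a
# hit if it differs from some gold candidate by at most one non-punctuation
# token. The token ids below are made up.
pred_tokens = {1, 2, 3}
gold_tokens = {1, 2, 4}
punctuation_ids = {4}
sym_difference = pred_tokens ^ gold_tokens                          # {3, 4}
stripped = [t for t in sym_difference if t not in punctuation_ids]  # [3]
assert len(stripped) <= 1  # counts as a hit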
def preprocess(self, x, fit=False):
    """Transform each marginal to be as close to a standard Gaussian as possible.
    'standard' (default) just subtracts the mean and scales by the std.
    'empirical' does an empirical gaussianization (but this cannot be inverted).
    'outliers' tries to squeeze in the outliers.
    Any other choice will skip the transformation."""
    if self.missing_values is not None:
        x, self.n_obs = mean_impute(x, self.missing_values)  # Creates a copy
    else:
        self.n_obs = len(x)
    if self.gaussianize == 'none':
        pass
    elif self.gaussianize == 'standard':
        if fit:
            mean = np.mean(x, axis=0)
            # std = np.std(x, axis=0, ddof=0).clip(1e-10)
            std = np.sqrt(np.sum((x - mean)**2, axis=0) / self.n_obs).clip(1e-10)
            self.theta = (mean, std)
        x = ((x - self.theta[0]) / self.theta[1])
        if np.max(np.abs(x)) > 6 and self.verbose:
            print("Warning: outliers more than 6 stds away from mean. "
                  "Consider using gaussianize='outliers'")
    elif self.gaussianize == 'outliers':
        if fit:
            mean = np.mean(x, axis=0)
            std = np.std(x, axis=0, ddof=0).clip(1e-10)
            self.theta = (mean, std)
        x = g((x - self.theta[0]) / self.theta[1])  # g truncates long tails
    elif self.gaussianize == 'empirical':
        print("Warning: correct inversion/transform of empirical gauss transform not implemented.")
        x = np.array([norm.ppf((rankdata(x_i) - 0.5) / len(x_i)) for x_i in x.T]).T
    if self.gpu and fit:  # Don't return GPU matrices when only transforming
        x = cm.CUDAMatrix(x)
    return x
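# Added sketch of the 'empirical' branch above in isolation: a rank-based
# inverse-normal transform of a single column.
import numpy as np
from scipy.stats import rankdata, norm
def gaussianize_column(col):
    u = (rankdata(col) - 0.5) / len(col)  # ranks mapped into (0, 1)
    return norm.ppf(u)                    # standard-normal quantiles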
def compute(self, today, assets, out, volume, open):
    v0 = np.full(out.shape[0], -1.0)
    v10 = np.empty((10, out.shape[0]))
    for i0 in range(1, 11):
        v100 = open[-i0]
        v10[-i0] = stats.rankdata(v100)
    v11 = np.empty((10, out.shape[0]))
    for i0 in range(1, 11):
        v110 = volume[-i0]
        v11[-i0] = stats.rankdata(v110)
    v1 = pd.DataFrame(v10).rolling(window=10).corr(pd.DataFrame(v11)).tail(1).as_matrix()[-1]
    out[:] = v0 * v1
# (-1 * Ts_Rank(rank(low), 9))
def compute(self, today, assets, out, low):
    v0 = np.full(out.shape[0], -1.0)
    v10 = np.empty((9, out.shape[0]))
    for i0 in range(1, 10):
        v100 = low[-i0]
        v10[-i0] = stats.rankdata(v100)
    v1 = pd.DataFrame(v10).rank().tail(1).as_matrix()[-1]
    out[:] = v0 * v1
# (rank((open - (sum(vwap, 10) / 10))) * (-1 * abs(rank((close - vwap)))))
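# Added sketch of the Ts_Rank building block used above
# (pd.DataFrame(window).rank().tail(1)): the rank of today's value within its
# own trailing window, per column/asset. Ties aside, it is equivalent to:
import numpy as np
def ts_rank_last(window_values):
    # window_values: shape (window, n_assets); row -1 is today
    return (window_values <= window_values[-1]).sum(axis=0).astype(float)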