def evaluate(model, dev_data):
    pred = model.predict_proba(dev_data.data, batch_size=32)
    # spearmanr returns (correlation, p-value); keep only the correlation.
    corr, _ = spearmanr(pred, dev_data.labels)
    print("Spearman's R: {0}".format(corr))
Python spearmanr() usage examples (source code)
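The snippets on this page come from different projects and assume `scipy.stats.spearmanr` has already been imported (variously as `spearmanr`, `stats.spearmanr`, or `sps.spearmanr`). A minimal standalone sketch of the call they all share:

# Minimal example: spearmanr returns a (correlation, p-value) pair.
from scipy.stats import spearmanr

rho, pval = spearmanr([1, 2, 3, 4, 5], [5, 6, 7, 8, 7])
print(rho, pval)  # rho ~ 0.82: mostly monotone, with one inversion at the end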
def ma_ribbon(df, ma_series):
    # Collect the latest EMA value for each window length in the ribbon.
    # (The original allocated np.zeros([len(df)]), which mismatches the
    # length of ma_series passed to spearmanr below.)
    ma_array = np.zeros(len(ma_series))
    for idx, ma_len in enumerate(ma_series):
        key = 'EMA_CLOSE_' + str(ma_len)
        ema(df, ma_len, field='close')
        ma_array[idx] = df[key][-1]
    # Rank-correlate the EMAs against the descending reference sequence:
    # +1 means a perfectly ordered (trending) ribbon.
    corr, pval = stats.spearmanr(ma_array, range(len(ma_series), 0, -1))
    dist = max(ma_array) - min(ma_array)
    df["MARIBBON_CORR"][-1] = corr * 100
    df["MARIBBON_PVAL"][-1] = pval * 100
    df["MARIBBON_DIST"][-1] = dist
def getSpearmanr(infile):
    # Read two score columns (2 and 3) from a tab-separated file,
    # keeping the row index alongside each score.
    x_list = list()
    y_list = list()
    for i, line in enumerate(open(infile, 'r')):
        words = line.strip('\n').split('\t')
        x_list.append((i, float(words[2])))
        y_list.append((i, float(words[3])))
    # Convert each score column to ranks: sort by score, record the sort
    # position as the rank, then restore the original row order.
    x_list = sorted(x_list, key=lambda x: x[1])
    y_list = sorted(y_list, key=lambda x: x[1])
    x_list = sorted([(x, i) for i, (x, score) in enumerate(x_list)], key=lambda x: x[0])
    y_list = sorted([(y, i) for i, (y, score) in enumerate(y_list)], key=lambda x: x[0])
    x_list, y_list = np.array(x_list), np.array(y_list)
    rho, pval = spearmanr(x_list[:, 1], y_list[:, 1])
    return rho, pval
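Note that `spearmanr` ranks its inputs internally, so the manual rank construction above is equivalent to passing the raw score columns directly; a quick sketch of that equivalence:

# Ranking by hand changes nothing: Spearman's rho is the Pearson correlation
# of the ranks, and ranking is idempotent.
import numpy as np
from scipy.stats import rankdata, spearmanr

x = np.array([3.1, 0.2, 5.4, 1.7])
y = np.array([2.9, 0.1, 4.8, 2.0])
rho_raw, _ = spearmanr(x, y)
rho_ranked, _ = spearmanr(rankdata(x), rankdata(y))
assert np.isclose(rho_raw, rho_ranked)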
def calc_correl(self, dev_pred, test_pred):
    dev_prs, _ = pearsonr(dev_pred, self.dev_y_org)
    test_prs, _ = pearsonr(test_pred, self.test_y_org)
    dev_spr, _ = spearmanr(dev_pred, self.dev_y_org)
    test_spr, _ = spearmanr(test_pred, self.test_y_org)
    dev_tau, _ = kendalltau(dev_pred, self.dev_y_org)
    test_tau, _ = kendalltau(test_pred, self.test_y_org)
    return dev_prs, test_prs, dev_spr, test_spr, dev_tau, test_tau
def check_similarity_match(X_embed, S):
    """
    Since SimEcs are supposed to project the data into an embedding space where the target
    similarities can be linearly approximated, check whether X_embed * X_embed^T = S
    (via the mean squared error and the Spearman and Pearson correlation coefficients).

    Inputs:
        - X_embed: Nxd matrix with coordinates in the embedding space
        - S: NxN matrix with target similarities (apply whatever transformations were done
          before using this as input to the SimEc, e.g. centering)

    Returns:
        - msqe, rho, r: mean squared error, Spearman and Pearson correlation coefficients
          between the linear kernel of the embedding and the target similarities (the mean
          squared error is the stricter measure; the correlations are more relaxed)
    """
    # compute the linear kernel as the approximated similarities
    S_approx = X_embed.dot(X_embed.T)
    # to get results that are comparable across similarity measures, normalize both
    # matrices by the absolute max value of the target similarity matrix
    n = np.max(np.abs(S))
    S_norm = S / n
    S_approx /= n
    # compute the mean squared error
    msqe = np.mean((S_norm - S_approx) ** 2)
    # compute the Spearman correlation coefficient
    rho = spearmanr(S_norm.flatten(), S_approx.flatten())[0]
    # compute the Pearson correlation coefficient
    r = pearsonr(S_norm.flatten(), S_approx.flatten())[0]
    return msqe, rho, r
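A hypothetical usage sketch: for an embedding whose linear kernel reproduces the targets, msqe should be near 0 and both correlations near 1 (the data here is made up):

# Build targets from the embedding itself, plus a little noise.
import numpy as np

rng = np.random.RandomState(0)
X_embed = rng.randn(50, 2)                             # N x d embedding coordinates
S = X_embed.dot(X_embed.T) + 0.01 * rng.randn(50, 50)  # noisy N x N targets
msqe, rho, r = check_similarity_match(X_embed, S)
print(msqe, rho, r)  # msqe near 0, rho and r near 1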
def compute_score(self, conf, hy):
    conf['_r2'] = r2_score(self.test_y, hy)
    conf['_spearmanr'] = spearmanr(self.test_y, hy)[0]
    conf['_pearsonr'] = pearsonr(self.test_y, hy)[0]
    conf['_score'] = conf['_' + self.score]
def profile(filepath, n, exact=True, save=False, verbose=True, use_gpu=False,
            report=open('temp.txt', 'w')):
    # drop_tol=0 keeps the decomposition exact; None allows approximation.
    if exact:
        tol = 0
    else:
        tol = None
    # Compute (or load) the reference solution for this graph.
    solpath = 'data/{}_sol.dat'.format(filepath2name(filepath))
    if not os.path.isfile(solpath):
        solve(filepath, n, seed=0, verbose=verbose)
    q, r, ranks = pickle.load(open(solpath, 'rb'))
    if use_gpu:
        model_classes = [PPRIterativeTF, PPRLUDecompositionTF, PPRBearTF]
    else:
        model_classes = [PPRIterative, PPRLUDecomposition, PPRBear]
    for model_class in model_classes:
        with tf.Session() as sess:
            # Time preprocessing.
            start = time.time()
            if use_gpu:
                model = model_class(sess, n, filepath, drop_tol=tol, verbose=verbose)
            else:
                model = model_class(drop_tol=tol, verbose=verbose)
                model.preprocess(filepath)
            end = time.time()
            if use_gpu:
                sess.run(tf.global_variables_initializer())
            elapsed = end - start
            if save:
                model.save('models/{}.ppr'.format(model.alias))
            print("[{}]({},{},n={})".format(model.alias, 'gpu' if use_gpu else 'cpu',
                                            'exact' if exact else 'apprx', n), file=report)
            print("preprocess\t{}".format(elapsed), file=report)
            # Time the query itself.
            start = time.time()
            r_ = model.query(q)
            end = time.time()
            elapsed = end - start
            print("query time\t{}".format(elapsed), file=report)
            # Compare the computed ranking against the reference solution.
            ranks_ = pr2ranks(r_)
            spearman = spearmanr(ranks, ranks_)
            r_ = r_ / r_.sum()
            print("diff norm\t{}".format(norm(r - r_)), file=report)
            print("cosine sim\t{}".format(r.dot(r_) / norm(r) / norm(r_)), file=report)
            print("spearman corr\t{}".format(spearman.correlation), file=report)
            print("", file=report)
test_analytics.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def test_corr_rank(self):
    tm._skip_if_no_scipy()
    import scipy
    import scipy.stats as stats

    # kendall and spearman
    A = tm.makeTimeSeries()
    B = tm.makeTimeSeries()
    A[-5:] = A[:5]
    result = A.corr(B, method='kendall')
    expected = stats.kendalltau(A, B)[0]
    self.assertAlmostEqual(result, expected)

    result = A.corr(B, method='spearman')
    expected = stats.spearmanr(A, B)[0]
    self.assertAlmostEqual(result, expected)

    # these methods got rewritten in 0.8
    if scipy.__version__ < LooseVersion('0.9'):
        raise nose.SkipTest("skipping corr rank because of scipy version "
                            "{0}".format(scipy.__version__))

    # results from R
    A = Series(
        [-0.89926396, 0.94209606, -1.03289164, -0.95445587, 0.76910310,
         -0.06430576, -2.09704447, 0.40660407, -0.89926396, 0.94209606])
    B = Series(
        [-1.01270225, -0.62210117, -1.56895827, 0.59592943, -0.01680292,
         1.17258718, -1.06009347, -0.10222060, -0.89076239, 0.89372375])
    kexp = 0.4319297
    sexp = 0.5853767
    self.assertAlmostEqual(A.corr(B, method='kendall'), kexp)
    self.assertAlmostEqual(A.corr(B, method='spearman'), sexp)
test_nanops.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def test_nancorr_spearman(self):
    tm.skip_if_no_package('scipy.stats')
    from scipy.stats import spearmanr
    targ0 = spearmanr(self.arr_float_2d, self.arr_float1_2d)[0]
    targ1 = spearmanr(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0]
    self.check_nancorr_nancov_2d(nanops.nancorr, targ0, targ1,
                                 method='spearman')
    targ0 = spearmanr(self.arr_float_1d, self.arr_float1_1d)[0]
    targ1 = spearmanr(self.arr_float_1d.flat, self.arr_float1_1d.flat)[0]
    self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1,
                                 method='spearman')
def spearman(x, y):
    return stats.spearmanr(x, y)[0]

###########################################
# Start
###########################################
def forward(self, bottom, top):
    """Compute the SROCC and LCC and output them to top."""
    # Flatten predictions and labels from the bottom blobs to 1-D arrays.
    testPreds = bottom[0].data
    testPreds = np.reshape(testPreds, testPreds.shape[0])
    testLabels = bottom[1].data
    testLabels = np.reshape(testLabels, testLabels.shape[0])
    top[0].data[...] = stats.spearmanr(testPreds, testLabels)[0]
    top[1].data[...] = stats.pearsonr(testPreds, testLabels)[0]
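Outside of the Caffe layer machinery, the same two metrics reduce to a couple of SciPy calls; a standalone sketch with made-up scores:

# SROCC (rank correlation) and LCC (linear correlation) on plain arrays.
import numpy as np
from scipy import stats

preds = np.array([3.2, 1.1, 4.5, 2.8])
labels = np.array([3.0, 1.5, 4.8, 2.5])
srocc = stats.spearmanr(preds, labels)[0]
lcc = stats.pearsonr(preds, labels)[0]
print(srocc, lcc)  # srocc is exactly 1.0 (same ranking); lcc is close to 1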
def comp_corr(df, ptype):
    # Keep only rows where the relevant run length exceeds 2, then correlate
    # the run lengths against the span lengths.
    if ptype == 'begin':
        valid_df = df[(df.p00 > 2)]
        plengths = valid_df.p00.values
    else:
        valid_df = df[(df.p11 > 2)]
        plengths = valid_df.p11.values
    lengths = valid_df.span_length.values
    print(float(len(valid_df)) / len(df), '\t', stats.spearmanr(plengths, lengths)[0])
def _correlation(self, output, score):
    return [spearmanr(output, score), pearsonr(output, score)]
def do_spearmanr(list1, list2, alpha=0.05):
    # Return the correlation only when it is significant at the given alpha;
    # otherwise return the string 'n.s.' (not significant).
    c, p = spearmanr(list1, list2)
    if p < alpha:
        return c
    return 'n.s.'
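A usage sketch; note the mixed return type (a float or the string 'n.s.'), which callers have to handle:

# A clearly monotone relationship is significant and returns rho; three
# noisy points give a large p-value and return 'n.s.'.
print(do_spearmanr([1, 2, 3, 4, 5], [1.1, 2.0, 2.9, 4.2, 5.1]))  # 1.0
print(do_spearmanr([1, 2, 3], [2, 1, 3]))                        # 'n.s.'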
def calcroh(file_name):
    # Spearman's rho between human scores (column 2) and predictions (column 3).
    # The original appended raw strings; convert to float so the values are
    # ranked numerically rather than lexicographically.
    human_list = list()
    pred_list = list()
    with open(file_name) as i_f:
        for line in i_f:
            human_list.append(float(line.strip().split()[2]))
            pred_list.append(float(line.strip().split()[3]))
    return spearmanr(human_list, pred_list)
def cal_spear(text):
    # Same computation as calcroh above, with the same float conversion fix.
    list_1 = []
    list_2 = []
    with open(text) as i_f:
        for line in i_f:
            list_1.append(float(line.strip().split()[2]))
            list_2.append(float(line.strip().split()[3]))
    return spearmanr(list_1, list_2)
generate_ngram_indicator.py (project: kaggle-quora-solution-8th, author: qqgeogor)
def generate_indicator_(gram_q1, gram_q2, N):
    # Binary matrices marking, for each question pair, which n-grams of one
    # question also appear in the other.
    len_gram_q1 = list(map(len, gram_q1))
    len_gram_q2 = list(map(len, gram_q2))
    max_len = max(max(len_gram_q1), max(len_gram_q2))
    q1_indicator = np.zeros((N, max_len))
    q2_indicator = np.zeros((N, max_len))
    for i in tqdm(np.arange(N)):
        for j, w in enumerate(gram_q1[i]):
            if w in gram_q2[i]:
                q1_indicator[i, j] = 1
        for j, w in enumerate(gram_q2[i]):
            if w in gram_q1[i]:
                q2_indicator[i, j] = 1
    return q1_indicator, q2_indicator
# sps.spearmanr(q1_indicator[:,1],y_train)[0]
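The commented line above screens one indicator column against the target; a self-contained sketch of both the generator and the screen (the n-gram inputs and `y_train` here are made up):

# Hypothetical two-pair input: n-grams of question 1 and question 2 per row.
import numpy as np
import scipy.stats as sps

gram_q1 = [['a', 'b'], ['c']]
gram_q2 = [['b', 'd'], ['c']]
q1_ind, q2_ind = generate_indicator_(gram_q1, gram_q2, N=2)
y_train = np.array([0, 1])  # hypothetical duplicate labels
print(sps.spearmanr(q1_ind[:, 0], y_train)[0])  # rank correlation of one feature column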
generate_neighbor_dis.py (project: kaggle-quora-solution-8th, author: qqgeogor)
def calc_dis_jarccard2(neighs, neighs2):
    # Jaccard similarity between the question strings of every neighbor pair.
    sim_fea = []
    for i in neighs:
        for j in neighs2:
            if i == j:
                continue
            if (j in index_q) and (i in index_q):
                q_str = index_q[i]
                nei_str = index_q[j]
                s1 = set(q_str.lower().split())
                s2 = set(nei_str.lower().split())
                sim_fea.append(dist_utils._jaccard_coef(s1, s2))
    # Aggregate the pairwise similarities into summary statistics,
    # falling back to -1 when there are no pairs or the aggregate fails.
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    aggregator = [getattr(np, m) for m in aggregation_mode]
    score = []
    for n, agg in enumerate(aggregator):
        if len(sim_fea) == 0:
            s = -1
        else:
            try:
                s = agg(sim_fea)
            except Exception:
                s = -1
        score.append(s)
    return score
# sps.spearmanr(train_fea,train['is_duplicate'])[0]
generate_ngram_bleu.py (project: kaggle-quora-solution-8th, author: qqgeogor)
def drop_feature(data):
    # Flag the second of any pair of columns whose rank correlation exceeds 0.8.
    # (The original inner loop started at i, so every column was compared with
    # itself, got rho = 1, and was flagged; start at i + 1 instead.)
    drop_list = []
    for i in range(data.shape[1]):
        for j in range(i + 1, data.shape[1]):
            s = sps.spearmanr(data[:, i], data[:, j])[0]
            if abs(s) > 0.8:
                drop_list.append(j)
    drop_list = set(drop_list)
    return drop_list
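A hypothetical usage sketch, screening out a near-duplicate column before training:

# Column 3 is a monotone transform of column 0, so its Spearman's rho is
# exactly 1 and drop_feature flags it for removal.
import numpy as np
import scipy.stats as sps

X = np.random.RandomState(0).randn(200, 5)
X[:, 3] = 2 * X[:, 0] + 1          # near-duplicate feature
keep = [c for c in range(X.shape[1]) if c not in drop_feature(X)]
X_reduced = X[:, keep]             # 4 columns remain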
#select imp feature