def calc_word_sim(model, eval_file):
    """Score ``model`` on a word-similarity dataset using Spearman's rho.

    ``eval_file`` is read as a comma-separated file with one header row and
    exactly three columns: first word, second word, gold score.  Rows where
    either word is absent from ``model.vocab`` are skipped.

    Returns the ``(correlation, p_value)`` pair computed by
    ``stats.spearmanr`` between ``model.similarity`` values and gold scores.
    """
    df = pd.read_csv(eval_file, sep=',', header=0)  # eval dataset
    col1, col2, score = df.columns.values
    # Build the vocabulary once as a set so every membership test is O(1);
    # on Python 2 ``dict.keys()`` is a list and each lookup would be linear.
    model_vocab = set(model.vocab.keys())
    ground = []
    # Previously named ``sys``, which shadowed the standard-library module name.
    predicted = []
    for _, row in df.iterrows():
        if row[col1] in model_vocab and row[col2] in model_vocab:
            ground.append(float(row[score]))
            predicted.append(model.similarity(row[col1], row[col2]))
    # compute Spearman's rank correlation coefficient (https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient)
    # The raw scores used to be dumped with a Python-2-only ``print`` statement;
    # they are now emitted at debug level instead.
    logger.debug("system similarities: %s", predicted)
    corr, p_val = stats.spearmanr(predicted, ground)
    logger.info("# of pairs found: %s / %s" % (len(ground), len(df)))
    logger.info("correlation: %s" % corr)
    return corr, p_val
python类spearmanr()的实例源码
def word_sim_test(filename, pos_vectors):
    """Correlate inner-product similarities with gold scores from a pairs file.

    Each non-blank line of ``filename`` must be ``word1,word2,score``.  A pair
    is skipped (and counted in ``missed``) when ``create_word_vector`` raises
    ``KeyError`` for either word.

    Returns ``(spearman, pearson, missed)``.
    """
    delim = ','
    actual_sim_list, pred_sim_list = [], []
    missed = 0
    with open(filename, 'r') as pairs:
        for pair in pairs:
            pair = pair.strip()
            if not pair:
                # A blank (e.g. trailing) line holds no pair; unpacking it
                # below would raise ValueError.
                continue
            w1, w2, actual_sim = pair.split(delim)
            # Only the vector lookups are expected to raise KeyError, so the
            # try block is limited to them.
            try:
                w1_vec = create_word_vector(w1, pos_vectors)
                w2_vec = create_word_vector(w2, pos_vectors)
            except KeyError:
                missed += 1
                continue
            pred = float(np.inner(w1_vec, w2_vec))
            actual_sim_list.append(float(actual_sim))
            pred_sim_list.append(pred)
    spearman, _ = st.spearmanr(actual_sim_list, pred_sim_list)
    pearson, _ = st.pearsonr(actual_sim_list, pred_sim_list)
    return spearman, pearson, missed
def sim_getCorrelation(We,words,f, weight4ind, scoring_function, params):
    """Score the sentence pairs in file ``f`` and correlate with gold scores.

    Each line of ``f`` is tab-separated: first text, second text, gold score.
    Both texts are converted with ``data_io.getSeqs``, batched with
    ``data_io.prepare_data``, and re-weighted with ``data_io.seq2weight``
    using ``weight4ind`` before ``scoring_function`` is applied.

    Returns ``(pearson, spearman)`` between the predictions and gold scores.
    """
    golds = []
    seq1 = []
    seq2 = []
    # The context manager closes the file; the original handle was leaked.
    with open(f, 'r') as fh:
        for line in fh:
            fields = line.split("\t")
            p1 = fields[0]
            p2 = fields[1]
            score = float(fields[2])
            X1, X2 = data_io.getSeqs(p1, p2, words)
            seq1.append(X1)
            seq2.append(X2)
            golds.append(score)
    x1, m1 = data_io.prepare_data(seq1)
    x2, m2 = data_io.prepare_data(seq2)
    m1 = data_io.seq2weight(x1, m1, weight4ind)
    m2 = data_io.seq2weight(x2, m2, weight4ind)
    scores = scoring_function(We, x1, x2, m1, m2, params)
    preds = np.squeeze(scores)
    return pearsonr(preds, golds)[0], spearmanr(preds, golds)[0]
def getCorrelation(model,words,f, params=None):
    """Score the sentence pairs in file ``f`` with ``model`` and correlate with gold.

    Each line of ``f`` is tab-separated: first text, second text, gold score.
    When ``params`` is truthy and ``params.weightfile`` is set, the masks are
    re-weighted with ``data_io.seq2weight`` using ``params.weight4ind``.

    ``params`` defaults to ``None`` (previously a shared mutable ``[]``); both
    are falsy, so callers relying on the default see the same behaviour.

    Returns ``(pearson, spearman)`` between ``model.scoring_function`` output
    and the gold scores.
    """
    golds = []
    seq1 = []
    seq2 = []
    # The context manager closes the file; the original handle was leaked.
    with open(f, 'r') as fh:
        for line in fh:
            fields = line.split("\t")
            p1 = fields[0]
            p2 = fields[1]
            score = float(fields[2])
            X1, X2 = data_io.getSeqs(p1, p2, words)
            seq1.append(X1)
            seq2.append(X2)
            golds.append(score)
    x1, m1 = data_io.prepare_data(seq1)
    x2, m2 = data_io.prepare_data(seq2)
    if params and params.weightfile:
        m1 = data_io.seq2weight(x1, m1, params.weight4ind)
        m2 = data_io.seq2weight(x2, m2, params.weight4ind)
    scores = model.scoring_function(x1, x2, m1, m2)
    preds = np.squeeze(scores)
    return pearsonr(preds, golds)[0], spearmanr(preds, golds)[0]
def correlations(A,B,pc_n=100):
    """Compare two 2-D arrays ``A`` and ``B`` with several correlation measures.

    Returns a 5-tuple ``(p, spear, pg, ps, pc_dist)``:

    * ``p``     -- ``1 - distance.correlation`` of the flattened arrays.
    * ``spear`` -- Spearman's rho of the flattened arrays.
    * ``pg``    -- mean over rows of ``1 - distance.correlation(A[i], B[i])``,
      ignoring non-finite values.
    * ``ps``    -- the same, computed over columns.
    * ``pc_dist`` -- array of ``|1 - cosine distance|`` between matching left
      singular vectors of ``A`` and ``B``.  At most ``pc_n`` vectors are
      compared; the count is clamped to the number of available singular
      vectors (the unclamped loop raised ``IndexError`` when ``pc_n`` exceeded
      it).  Empty when ``pc_n <= 0``.
    """
    flat_a, flat_b = A.flatten(), B.flatten()
    p = 1 - distance.correlation(flat_a, flat_b)
    spear = spearmanr(flat_a, flat_b)

    dist_genes = np.zeros(A.shape[0])
    for i in range(A.shape[0]):
        dist_genes[i] = 1 - distance.correlation(A[i], B[i])
    pg = np.average(dist_genes[np.isfinite(dist_genes)])

    dist_sample = np.zeros(A.shape[1])
    for i in range(A.shape[1]):
        dist_sample[i] = 1 - distance.correlation(A[:, i], B[:, i])
    ps = np.average(dist_sample[np.isfinite(dist_sample)])

    pc_dist = []
    if pc_n > 0:
        u0, _, _ = np.linalg.svd(A)
        u, _, _ = np.linalg.svd(B)
        # Never index past the last available singular vector.
        n_pc = min(pc_n, u0.shape[1], u.shape[1])
        pc_dist = [abs(1 - distance.cosine(u0[:, i], u[:, i]))
                   for i in range(n_pc)]
    # Always hand back an ndarray so the return type does not depend on pc_n.
    return p, spear[0], pg, ps, np.array(pc_dist)
def white4D_functional():
    """Check that two randomly offset 4-D noise grids are not strongly rank-correlated.

    Builds two ``N**4`` nested lists of ``combined(white, ...)`` samples at
    random integer offsets and asserts that the absolute Spearman rho of the
    flattened grids is below 0.5.
    """
    print("Testing correlation for 4D white noise")
    N = 20
    # Draw order is preserved: the four base offsets first, then the four
    # relative shifts of the second grid.
    x1, y1, z1, w1 = [randrange(-1000, 1000, 1) for _ in range(4)]
    x2, y2, z2, w2 = [base + randrange(-1000, 1000, 1)
                      for base in (x1, y1, z1, w1)]

    def _grid(x0, y0, z0, w0):
        # NOTE(review): only the x and y coordinates are passed to ``combined``;
        # the z and w loops merely repeat the same 2-D slice -- confirm intended.
        return [[[[combined(white, col/N, row/N)
                   for col in range(x0, x0 + N)]
                  for row in range(y0, y0 + N)]
                 for _z in range(z0, z0 + N)]
                for _w in range(w0, w0 + N)]

    values1 = _grid(x1, y1, z1, w1)
    values2 = _grid(x2, y2, z2, w2)
    rho = spearmanr(values1, values2, axis = None)
    assert abs(rho[0]) < 0.5
    print("rho = %s" % rho[0])
    print("\tNot signifying correlation found")
def run(self):
    """Evaluate every embedding model on every similarity dataset and log the result.

    ``self.sim_datasets`` maps a dataset type to ``(data, filename)`` pairs and
    ``self.e_models`` maps an embedding type to ``(model, filename)`` pairs.
    For each combination, ``model.get_sim`` is called on every word pair in
    ``data.pairs``; pairs with a falsy similarity are counted as skipped.
    The Spearman correlation and the skip count are logged; nothing is returned.
    """
    # ``.items()`` replaces the Python-2-only ``.iteritems()``; it behaves the
    # same for iteration on both Python 2 and 3.
    for d_type, datasets in self.sim_datasets.items():
        for data, fn in datasets:
            logging.info(
                'testing on data {0} of type {1} ({2} pairs)'.format(
                    fn, d_type, len(data.pairs)))
            for e_type, models in self.e_models.items():
                for model, fn in models:
                    logging.info(
                        '\ttesting embedding {0} of type {1}'.format(
                            fn, e_type))
                    answers, gold_sims, oovs = [], [], 0
                    for (w1, w2), gold in data.pairs.items():
                        sim = model.get_sim(w1, w2)
                        # NOTE(review): a similarity of exactly 0 is also
                        # treated as out-of-vocabulary here -- confirm that
                        # ``get_sim`` signals OOV with a falsy value.
                        if sim:
                            answers.append(sim)
                            gold_sims.append(gold)
                        else:
                            oovs += 1
                    corr = spearmanr(answers, gold_sims)
                    logging.info('Spearman correlation: {0}'.format(corr))
                    logging.info('pairs skipped (OOVs): {0}'.format(oovs))
def MA_RIBBON(df, ma_series):
    """Build a moving-average ribbon and per-row rank statistics for ``df``.

    For each length in ``ma_series`` an ``EMA`` of the ``'close'`` field is
    computed.  For every row from index ``max(ma_series) - 1`` onwards, the
    row of EMA values is Spearman-correlated against the descending sequence
    ``len(ma_series), ..., 1`` and the spread (max minus min) is recorded;
    earlier rows stay NaN.

    Returns a DataFrame concatenating ``MARIBBON_CORR`` and ``MARIBBON_PVAL``
    (both scaled by 100), ``MARIBBON_DIST`` and the individual EMA series.
    """
    n_rows = len(df)
    ma_array = np.zeros([n_rows, len(ma_series)])
    ema_list = []
    for idx, ma_len in enumerate(ma_series):
        ema_i = EMA(df, n = ma_len, field = 'close')
        ma_array[:, idx] = ema_i
        ema_list.append(ema_i)
    corr = np.empty([n_rows])
    pval = np.empty([n_rows])
    dist = np.empty([n_rows])
    # ``np.nan`` is the canonical spelling; the ``np.NAN`` alias was removed
    # in NumPy 2.0.
    corr[:] = np.nan
    pval[:] = np.nan
    dist[:] = np.nan
    max_n = max(ma_series)
    # The reference ordering is loop-invariant, so build it once.
    ref_order = list(range(len(ma_series), 0, -1))
    for idy in range(n_rows):
        if idy >= max_n - 1:
            row = ma_array[idy, :]
            corr[idy], pval[idy] = stats.spearmanr(row, ref_order)
            dist[idy] = max(row) - min(row)
    corr_ts = pd.Series(corr*100, index = df.index, name = "MARIBBON_CORR")
    pval_ts = pd.Series(pval*100, index = df.index, name = "MARIBBON_PVAL")
    dist_ts = pd.Series(dist, index = df.index, name = "MARIBBON_DIST")
    return pd.concat([corr_ts, pval_ts, dist_ts] + ema_list, join='outer', axis=1)
def eval_sts(ycat, y, name, quiet=False):
    """Score STS predictions against gold labels and optionally print the metrics.

    1-D inputs are used as-is; anything else is first converted with
    ``loader.sts_categorical2labels``.  Returns ``STSRes(pearson, spearman, mse)``.
    """
    ypred = ycat if ycat.ndim == 1 else loader.sts_categorical2labels(ycat)
    ygold = y if y.ndim == 1 else loader.sts_categorical2labels(y)

    pr = pearsonr(ypred, ygold)[0]
    sr = spearmanr(ypred, ygold)[0]
    e = mse(ypred, ygold)

    if quiet:
        return STSRes(pr, sr, e)

    print('%s Pearson: %f' % (name, pr,))
    print('%s Spearman: %f' % (name, sr,))
    print('%s MSE: %f' % (name, e,))
    return STSRes(pr, sr, e)
def eval_sts(ycat, y, name, quiet=False):
    """Compute Pearson, Spearman and MSE for STS predictions versus gold labels.

    Inputs whose ``ndim`` is not 1 are converted to labels via
    ``loader.sts_categorical2labels``.  Unless ``quiet`` is set the three
    metrics are printed, prefixed with ``name``.  Returns
    ``STSRes(pearson, spearman, mse)``.
    """
    def _as_labels(values):
        # Already a label vector when 1-D; otherwise decode it.
        if values.ndim == 1:
            return values
        return loader.sts_categorical2labels(values)

    ypred = _as_labels(ycat)
    ygold = _as_labels(y)
    pearson_score = pearsonr(ypred, ygold)[0]
    spearman_score = spearmanr(ypred, ygold)[0]
    error = mse(ypred, ygold)
    if not quiet:
        print('%s Pearson: %f' % (name, pearson_score,))
        print('%s Spearman: %f' % (name, spearman_score,))
        print('%s MSE: %f' % (name, error,))
    return STSRes(pearson_score, spearman_score, error)
evaluate.py 文件源码
项目:Learning-sentence-representation-with-guidance-of-human-attention
作者: wangshaonan
项目源码
文件源码
阅读 30
收藏 0
点赞 0
评论 0
def getCorrelation(model,words,f):
    """Score the sentence pairs in file ``f`` with ``model`` and correlate with gold.

    Each line of ``f`` is tab-separated: gold score, first text, second text.
    When the first whitespace-separated token of the first text splits into
    exactly two parts on ``'_'``, ``getSeqs2`` is used (its two extra return
    values are discarded); otherwise ``getSeqs`` is used.

    Returns ``(pearson, spearman)`` between ``model.scoring_function`` output
    and the gold scores.
    """
    golds = []
    seq1 = []
    seq2 = []
    # The context manager closes the file; the original handle was leaked.
    with open(f, 'r') as fh:
        for line in fh:
            fields = line.split("\t")
            p1 = fields[1]
            p2 = fields[2]
            score = float(fields[0])
            if len(p1.split()[0].split('_')) == 2:
                X1, X2, _, _ = getSeqs2(p1, p2, words)
            else:
                X1, X2 = getSeqs(p1, p2, words)
            seq1.append(X1)
            seq2.append(X2)
            golds.append(score)
    x1, m1 = utils.prepare_data(seq1)
    x2, m2 = utils.prepare_data(seq2)
    scores = model.scoring_function(x1, x2, m1, m2)
    preds = np.squeeze(scores)
    return pearsonr(preds, golds)[0], spearmanr(preds, golds)[0]
evaluate.py 文件源码
项目:Learning-sentence-representation-with-guidance-of-human-attention
作者: wangshaonan
项目源码
文件源码
阅读 32
收藏 0
点赞 0
评论 0
def getCorrelation2(model,words,f):
    """Score the sentence pairs in file ``f`` with ``model.scoring_function2``.

    Each line of ``f`` is tab-separated: gold score, first text, second text.
    ``getSeqs2`` yields two sequences plus two auxiliary sequences per pair;
    all four are batched with ``utils.prepare_data2`` and passed to the model.

    Returns ``(pearson, spearman)`` between the predictions and gold scores.
    """
    golds = []
    seq1 = []
    seq2 = []
    sseq1 = []
    sseq2 = []
    # The context manager closes the file; the original handle was leaked.
    with open(f, 'r') as fh:
        for line in fh:
            fields = line.split("\t")
            p1 = fields[1]
            p2 = fields[2]
            score = float(fields[0])
            X1, X2, SX1, SX2 = getSeqs2(p1, p2, words)
            seq1.append(X1)
            seq2.append(X2)
            sseq1.append(SX1)
            sseq2.append(SX2)
            golds.append(score)
    x1, m1, s1 = utils.prepare_data2(seq1, sseq1)
    x2, m2, s2 = utils.prepare_data2(seq2, sseq2)
    scores = model.scoring_function2(x1, x2, m1, m2, s1, s2)
    preds = np.squeeze(scores)
    return pearsonr(preds, golds)[0], spearmanr(preds, golds)[0]
def spearman(self, dataset):
    """Return ``spearmanr`` of gold scores versus ``self.sim`` over ``dataset``.

    ``dataset`` must be a non-empty list whose first entry has three items
    with a float in the last position; otherwise ``TypeError`` is raised.
    Pairs for which ``self.sim`` raises ``KeyError`` are skipped, with a
    warning printed when ``self.reportMissing`` is truthy.
    """
    well_formed = (
        isinstance(dataset, list)
        and len(dataset) != 0
        and len(dataset[0]) == 3
        and isinstance(dataset[0][2], float)
    )
    if not well_formed:
        raise TypeError('Dataset is not of correct type, list of [str, str, float] triples expected.')

    gs_scores = []
    sys_scores = []
    for first, second, gold in dataset:
        try:
            predicted = self.sim(first, second)
        except KeyError:
            if self.reportMissing:
                print('Warning: Missing pair %s-%s - skipping' % (first, second))
        else:
            gs_scores.append(gold)
            sys_scores.append(predicted)
    return spearmanr(gs_scores, sys_scores)
nanops.py 文件源码
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
作者: SignalMedia
项目源码
文件源码
阅读 33
收藏 0
点赞 0
评论 0
def get_corr_func(method):
    """Return a two-argument correlation function for ``method``.

    ``method`` must be ``'pearson'``, ``'kendall'`` or ``'spearman'``; any
    other value raises ``KeyError``.  SciPy is imported only for the two
    rank-based methods.
    """
    needs_scipy = method in ['kendall', 'spearman']
    if needs_scipy:
        from scipy.stats import kendalltau, spearmanr

    def _pearson(a, b):
        return np.corrcoef(a, b)[0, 1]

    def _kendall(a, b):
        rs = kendalltau(a, b)
        # Take the statistic when a tuple is returned; otherwise pass the
        # value through unchanged.
        return rs[0] if isinstance(rs, tuple) else rs

    def _spearman(a, b):
        result = spearmanr(a, b)
        return result[0]

    return {
        'pearson': _pearson,
        'kendall': _kendall,
        'spearman': _spearman,
    }[method]
def evaluate1Word(wv, reference):
    """Evaluate wv against reference, return (rho, count) where rho is
    Spearman's rho and count is the number of reference word pairs
    that could be evaluated against.

    Only pairs in which neither entry contains a space are considered.
    Pairs with a word missing from ``wv`` are skipped, and the number of such
    pairs is printed before returning.
    """
    count = 0  # single-word pairs that had a word missing from ``wv``
    gold, predicted = [], []
    for words, sim in sorted(reference, key=lambda ws: ws[1]):
        if " " not in words[0] and " " not in words[1]:
            try:
                v1, v2 = wv[words[0]], wv[words[1]]
            except KeyError:
                count += 1
                continue
            gold.append((words, sim))
            predicted.append((words, cosine(v1, v2)))
    simlist = lambda ws: [s for w, s in ws]
    rho, p = spearmanr(simlist(gold), simlist(predicted))
    # Formatted via ``%`` so the output is identical on Python 2 and 3
    # (this used to be a Python-2-only ``print`` statement).
    print("Word not found in WordVector %s" % count)
    return (rho, len(gold))
def _corrfunc(x, y, **kws):
    """ Annotate grid with correlation coefficient.
    Solution from http://stackoverflow.com/a/30942817

    The statistic is selected by ``args.c`` (``'spearman'`` or ``'pearson'``);
    the coefficient is also appended to the ``correlations`` list.
    """
    method = args.c
    if method not in ('spearman', 'pearson'):
        raise Exception('Invalid correlation statistic.')

    if method == 'spearman':
        corr_fn, corr_type = stats.spearmanr, 'Rho'
    else:
        corr_fn, corr_type = stats.pearsonr, 'r'
    r, _ = corr_fn(x, y)

    correlations.append(r)
    label = "{} = {:.2f}".format(corr_type, r)
    ax = plotter.plt.gca()
    ax.annotate(label, xy=(.1, .9), xycoords=ax.transAxes)
def get_feature_importance(feature, train_path='../data/train.csv'):
    """Return Spearman's rho between ``feature`` and the ``is_duplicate`` labels.

    The labels are read from the ``is_duplicate`` column of the CSV file at
    ``train_path``.  The path was previously hard-coded; it is now a parameter
    whose default is the original location, so existing callers are unaffected.
    """
    import scipy.stats as sps
    import pandas as pd
    y_train = pd.read_csv(train_path)['is_duplicate']
    return sps.spearmanr(feature, y_train)[0]
# import pickle
# pickle.dump(X_train,open("data_train.pkl", 'wb'), protocol=2)
#
# data_file=['test_deptree','test_glove_sim_dist','test_pca_glove',
# 'test_pca_pattern','test_w2w','test_pos','test_pca_char']
#
# path='../test/'
# for it in range(6):
# tmp=[]
# flist=[item+str(it) for item in data_file]
# test=np.empty((400000,0))
# if it==5:
# test=np.empty((345796,0))
# for f in flist:
# test=np.hstack([test,pd.read_pickle(path+f+'.pkl')])
# pickle.dump(test,open('data_test{0}.pkl'.format(it),'wb'),protocol=2)
def getCorrelation(model,words,f):
    """Score the sentence pairs in file ``f`` with ``model`` and correlate with gold.

    Each line of ``f`` is tab-separated: first text, second text, gold score.
    Texts are converted with ``getSeqs`` and batched with
    ``utils.prepare_data`` before ``model.scoring_function`` is applied.

    Returns ``(pearson, spearman)`` between the predictions and gold scores.
    """
    golds = []
    seq1 = []
    seq2 = []
    # The context manager closes the file; the original handle was leaked.
    with open(f, 'r') as fh:
        for line in fh:
            fields = line.split("\t")
            p1 = fields[0]
            p2 = fields[1]
            score = float(fields[2])
            X1, X2 = getSeqs(p1, p2, words)
            seq1.append(X1)
            seq2.append(X2)
            golds.append(score)
    x1, m1 = utils.prepare_data(seq1)
    x2, m2 = utils.prepare_data(seq2)
    scores = model.scoring_function(x1, x2, m1, m2)
    preds = np.squeeze(scores)
    return pearsonr(preds, golds)[0], spearmanr(preds, golds)[0]
def validation_check():
    """Run the latest checkpoint on the validation split and log its Spearman score.

    Loads the inference graph and the ``"val"`` data, restores the newest
    checkpoint from ``hp.logdir``, predicts in batches of ``hp.batch_size``
    and appends ``"<model name>\\t<spearman>"`` to
    ``validation_results.txt`` inside ``hp.results``.
    """
    # Load graph
    g = Graph(is_training=False)
    print("Graph loaded")

    # Load data
    X, Y = load_data(mode="val")

    with g.graph.as_default():
        sv = tf.train.Supervisor()
        with sv.managed_session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
            # Restore parameters
            sv.saver.restore(sess, tf.train.latest_checkpoint(hp.logdir))
            print("Restored!")

            # Get model name; read through a context manager so the
            # checkpoint index file is closed (the handle used to leak).
            with open(hp.logdir + '/checkpoint', 'r') as ckpt_index:
                mname = ckpt_index.read().split('"')[1]  # model name

            # Inference
            if not os.path.exists(hp.results): os.mkdir(hp.results)
            with open(os.path.join(hp.results, "validation_results.txt"), 'a') as fout:
                expected, predicted = [], []
                # Only whole batches are evaluated; a trailing partial batch
                # is not run.
                for step in range(len(X) // hp.batch_size):
                    x = X[step * hp.batch_size: (step + 1) * hp.batch_size]
                    y = Y[step * hp.batch_size: (step + 1) * hp.batch_size]

                    # predict intensities
                    logits = sess.run(g.logits, {g.x: x})

                    expected.extend(list(y))
                    predicted.extend(list(logits))

                # Get spearman coefficients
                score, _ = spearmanr(expected, predicted)
                fout.write("{}\t{}\n".format(mname, score))
def _calculate(self, input):
input = input[~np.isnan(input).any(axis=1)]
return spearmanr(input[:,0], input[:,1])[0]
def series_corr(word_year_series_1, word_year_series_2, i_year_words, start_year=1900, end_year=2000, series_1_norms=None, series_2_norms=None):
    """
    Gets the per-year correlation between the two word time series.
    Words are included even if they have values missing for a year, but their missing values are excluded from the year in question.

    ``i_year_words`` is either a mapping from year to the words to use for
    that year, or (when ``start_year`` is not one of its members) a single
    collection of words that is applied to every year.

    ``series_*_norms`` are ``(offsets, scales)`` pairs indexed by year
    position; each value is transformed as ``(value - offset) / scale``.
    The default is offset 0 and scale 1 for every year.

    Returns ``(year_corrs, year_ps)``: Spearman's rho and its p-value for each
    year from ``start_year`` to ``end_year`` inclusive.
    """
    year_corrs = []
    year_ps = []
    years = range(start_year, end_year + 1)
    if start_year not in i_year_words:
        i_year_words = {year: i_year_words for year in years}
    # Identity test: ``== None`` would be evaluated element-wise if a caller
    # passed NumPy arrays.
    if series_1_norms is None:
        series_1_norms = ([0 for year in years], [1 for year in years])
    if series_2_norms is None:
        series_2_norms = ([0 for year in years], [1 for year in years])
    # ``enumerate`` replaces the Python-2-only ``xrange`` index loop.
    for i, year in enumerate(years):
        s1 = []
        s2 = []
        for word in i_year_words[year]:
            if word not in word_year_series_1 or word not in word_year_series_2:
                continue
            value_1 = word_year_series_1[word][year]
            if np.isnan(value_1):
                continue
            value_2 = word_year_series_2[word][year]
            if np.isnan(value_2):
                continue
            s1.append((value_1 - series_1_norms[0][i]) / series_1_norms[1][i])
            s2.append((value_2 - series_2_norms[0][i]) / series_2_norms[1][i])
        corr, p = spearmanr(s1, s2)
        year_corrs.append(corr)
        year_ps.append(p)
    return year_corrs, year_ps
def get_scores(self):
    """Evaluate ``self.model`` over ``self.data_loader``.

    For every batch the summed KL divergence between the model output and
    ``batch.label`` is accumulated, and both the label distribution and the
    exponentiated model output are reduced to a class-index-weighted sum.

    Returns ``([pearson_r, spearman_r, kl_loss], [names...])`` where the KL
    loss is divided by the number of dataset examples.
    """
    self.model.eval()
    num_classes = self.dataset_cls.NUM_CLASSES
    # Row vector 1..num_classes repeated once per example in a batch.
    predict_classes = torch.arange(1, num_classes + 1).expand(self.batch_size, num_classes)
    test_kl_div_loss = 0
    predictions = []
    true_labels = []

    for batch in self.data_loader:
        output = self.model(batch.sentence_1, batch.sentence_2, batch.ext_feats)
        test_kl_div_loss += F.kl_div(output, batch.label, size_average=False).data[0]

        # handle last batch which might have smaller size
        batch_len = len(batch.sentence_1)
        if len(predict_classes) != batch_len:
            predict_classes = torch.arange(1, num_classes + 1).expand(batch_len, num_classes)

        if self.data_loader.device != -1:
            with torch.cuda.device(self.device):
                predict_classes = predict_classes.cuda()

        gold_scores = (predict_classes * batch.label.data).sum(dim=1)
        predicted_scores = (predict_classes * output.data.exp()).sum(dim=1)
        true_labels.append(gold_scores)
        predictions.append(predicted_scores)

        del output

    predictions = torch.cat(predictions).cpu().numpy()
    true_labels = torch.cat(true_labels).cpu().numpy()
    # Uses the last batch yielded by the loader to reach the dataset.
    test_kl_div_loss /= len(batch.dataset.examples)
    pearson_r = pearsonr(predictions, true_labels)[0]
    spearman_r = spearmanr(predictions, true_labels)[0]

    metric_values = [pearson_r, spearman_r, test_kl_div_loss]
    metric_names = ['pearson_r', 'spearman_r', 'KL-divergence loss']
    return metric_values, metric_names
def spearman(y_true, y_pred):
    """
    Compute Spearman's rank correlation coefficient for ``y_true`` versus
    ``y_pred``.

    :param y_true: The true/actual/gold labels for the data.
    :type y_true: array-like of float
    :param y_pred: The predicted/observed labels for the data.
    :type y_pred: array-like of float

    :returns: The coefficient, or 0.0 when it evaluates to NaN
    """
    coefficient = spearmanr(y_true, y_pred)[0]
    if np.isnan(coefficient):
        return 0.0
    return coefficient
def compare_distances(A,B,random_samples=None,s=200,pvalues=False):
    """Correlate pairwise Euclidean distances between columns of ``A`` and ``B``.

    ``random_samples`` is a boolean mask over columns.  When it is ``None`` or
    empty, a random mask selecting ``min(s, A.shape[1])`` columns is drawn
    with ``np.random.shuffle``.

    Returns ``(pearson, spearman)`` coefficients, or the full result objects
    of ``pearsonr`` / ``spearmanr`` when ``pvalues`` is true.
    """
    # ``None`` replaces the former mutable ``[]`` default; an explicitly
    # passed empty sequence still triggers random sampling as before.
    if random_samples is None or len(random_samples) == 0:
        # Builtin ``bool`` as dtype: the ``np.bool`` alias was removed in
        # NumPy 1.24.
        random_samples = np.zeros(A.shape[1], dtype=bool)
        random_samples[:min(s, A.shape[1])] = True
        np.random.shuffle(random_samples)
    dist_x = distance.pdist(A[:, random_samples].T, 'euclidean')
    dist_y = distance.pdist(B[:, random_samples].T, 'euclidean')
    pear = pearsonr(dist_x, dist_y)
    spear = spearmanr(dist_x, dist_y)
    if pvalues:
        return pear, spear
    return pear[0], spear[0]
def calAvgSimC(test_score, senseVec1, senseScore1,senseVec2, senseScore2):
    """Spearman's rho between ``test_score`` and score-weighted sense similarity.

    For each item ``t`` the similarity is the sum, over every pair of sense
    vectors ``(senseVec1[t][i], senseVec2[t][j])``, of
    ``(1 - cosine(...)) * senseScore1[t][i] * senseScore2[t][j]``.
    """
    assert(len(senseVec1)==len(senseVec2))
    avgCos = []
    # ``range`` replaces the Python-2-only ``xrange``.
    for t in range(len(senseVec1)):
        thisCos = []
        p1 = senseScore1[t]
        p2 = senseScore2[t]
        for i in range(len(senseVec1[t])):
            for j in range(len(senseVec2[t])):
                thisCos.append((1 - cosine(senseVec1[t][i], senseVec2[t][j])) * p1[i] * p2[j])
        avgCos.append(np.sum(thisCos))
    return spearmanr(test_score, avgCos)[0]
def calMaxSimC(test_score, senseVec1, senseScore1,senseVec2, senseScore2):
    """Spearman's rho between ``test_score`` and top-sense cosine similarity.

    For each item ``t`` the highest-scoring sense on each side is selected
    with ``np.argmax`` over ``senseScore1[t]`` / ``senseScore2[t]`` and the
    similarity is ``1 - cosine`` of the two selected vectors.
    """
    assert(len(senseVec1)==len(senseVec2))
    avgCos = []
    # ``range`` replaces the Python-2-only ``xrange``.
    for t in range(len(senseVec1)):
        i = np.argmax(senseScore1[t])
        j = np.argmax(senseScore2[t])
        avgCos.append(1 - cosine(senseVec1[t][i], senseVec2[t][j]))
    return spearmanr(test_score, avgCos)[0]
def white2D_functional():
    """Check that two randomly offset 2-D noise patches are not strongly rank-correlated.

    Builds two ``N x N`` nested lists of ``combined(white, x/N, y/N)`` samples
    at random integer offsets and asserts that the absolute Spearman rho of
    the flattened patches is below 0.5.
    """
    print("Testing correlation for 2D white noise")
    N = 100
    # Draw order is preserved: both base offsets first, then both shifts.
    x1, y1 = [randrange(-1000, 1000, 1) for _ in range(2)]
    x2, y2 = [base + randrange(-1000, 1000, 1) for base in (x1, y1)]

    def _patch(x0, y0):
        return [[combined(white, col/N, row/N)
                 for col in range(x0, x0 + N)]
                for row in range(y0, y0 + N)]

    values1 = _patch(x1, y1)
    values2 = _patch(x2, y2)
    rho = spearmanr(values1, values2, axis = None)
    assert abs(rho[0]) < 0.5
    print("rho = %s" % rho[0])
    print("\tNot signifying correlation found")
def white3D_functional():
    """Check that two randomly offset 3-D noise grids are not strongly rank-correlated.

    Builds two ``N**3`` nested lists of ``combined(white, ...)`` samples at
    random integer offsets and asserts that the absolute Spearman rho of the
    flattened grids is below 0.5.
    """
    print("Testing correlation for 3D white noise")
    N = 100
    # Draw order is preserved: the three base offsets first, then the shifts.
    x1, y1, z1 = [randrange(-1000, 1000, 1) for _ in range(3)]
    x2, y2, z2 = [base + randrange(-1000, 1000, 1) for base in (x1, y1, z1)]

    def _grid(x0, y0, z0):
        # NOTE(review): only the x and y coordinates are passed to ``combined``;
        # the z loop merely repeats the same 2-D slice -- confirm intended.
        return [[[combined(white, col/N, row/N)
                  for col in range(x0, x0 + N)]
                 for row in range(y0, y0 + N)]
                for _z in range(z0, z0 + N)]

    values1 = _grid(x1, y1, z1)
    values2 = _grid(x2, y2, z2)
    rho = spearmanr(values1, values2, axis = None)
    assert abs(rho[0]) < 0.5
    print("rho = %s" % rho[0])
    print("\tNot signifying correlation found")
def spearman_scorer(estimator, X, y):
    """Return ``spearmanr`` between ``estimator``'s predictions for ``X`` and ``y``.

    The features ``X`` are what is fed to ``estimator.predict``; the targets
    ``y`` were previously passed by mistake, leaving ``X`` unused.
    """
    logging.info('predicting ...')
    predicted = estimator.predict(X)
    # NOTE(review): the full ``spearmanr`` result (statistic and p-value) is
    # returned, as before -- confirm callers do not expect a bare float.
    return spearmanr(list(predicted), y)
def test():
    """Train a similarity regressor on GloVe features and report Spearman's R.

    Command-line arguments (``sys.argv``):
      1. path to the data file; each line is ``a|||b|||score``
      2. argument passed to ``GloveEmbedding``
      3. integer dimension passed to ``featurize``
      4. integer number of training epochs

    The data is split with ``cut``; a model from ``create_model`` is fit on
    the training part and the Spearman result on the development part is
    printed.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s : " +
        "%(module)s (%(lineno)s) - %(levelname)s - %(message)s")
    # Read through a context manager so the data file is closed afterwards.
    with open(sys.argv[1]) as data_file:
        data = [((f[0], f[1]), float(f[2]))
                for f in [line.strip().split("|||")
                          for line in data_file]]
    # ``print`` calls are written with a single pre-formatted argument so the
    # output is identical on Python 2 and 3.
    print("sample data: %s" % (data[:3],))
    train_data, devel_data, test_data = cut(data)
    logging.info('loading model...')
    glove_embedding = GloveEmbedding(sys.argv[2])
    logging.info('done!')
    dim = int(sys.argv[3])
    X_train = featurize(train_data, glove_embedding, dim)
    Y_train = np.array([e[1] for e in train_data])
    logging.info("Input shape: {0}".format(X_train.shape))
    print(X_train[:3])
    logging.info("Label shape: {0}".format(Y_train.shape))
    print(Y_train[:3])
    input_dim = X_train.shape[1]
    output_dim = 1
    model = create_model(input_dim, output_dim)
    model.fit(X_train, Y_train, nb_epoch=int(sys.argv[4]), batch_size=32)
    X_devel = featurize(devel_data, glove_embedding, dim)
    Y_devel = np.array([e[1] for e in devel_data])
    pred = model.predict_proba(X_devel, batch_size=32)
    corr = spearmanr(pred, Y_devel)
    print("Spearman's R: {0}".format(corr))