def corr_fea(df, cols, de=None, bar=0.9):
    """For every pair of columns whose correlation exceeds +bar, add their
    difference as a new feature (the pair is nearly redundant); for pairs
    below -bar, add their sum. Mirror the new columns onto the optional
    second frame `de` and return the names of the redundant columns."""
    xcols = []
    for c, i in enumerate(cols[:-1]):
        for j in cols[c + 1:]:
            if i == j:
                continue
            # equivalently: scipy.stats.pearsonr(df[i], df[j])[0]
            score = df[i].corr(df[j])
            if score > bar:
                df["%s-%s" % (i, j)] = df[i] - df[j]
                if de is not None:
                    de["%s-%s" % (i, j)] = de[i] - de[j]
                xcols.append(j)
            if score < -bar:
                df["%s+%s" % (i, j)] = df[i] + df[j]
                if de is not None:
                    de["%s+%s" % (i, j)] = de[i] + de[j]
                xcols.append(j)
    return xcols
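A minimal usage sketch (the frame and column names here are hypothetical, not from the original project):

import pandas as pd

train = pd.DataFrame({"a": [1.0, 2.0, 3.0, 4.0],
                      "b": [1.1, 2.1, 2.9, 4.2],   # tracks "a" almost exactly
                      "c": [4.0, 1.0, 3.0, 2.0]})
dev = train.copy()
dropped = corr_fea(train, ["a", "b", "c"], de=dev, bar=0.9)
# "a" and "b" correlate above 0.9, so both frames gain an "a-b" column
# and dropped == ["b"], marking "b" as redundant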
import numpy as np
from scipy import stats as st

def word_sim_test(filename, pos_vectors):
    """Score a word-similarity file (comma-separated lines: word1,word2,gold_score)
    by the inner product of the two word vectors; count pairs with missing words."""
    delim = ','
    actual_sim_list, pred_sim_list = [], []
    missed = 0
    with open(filename, 'r') as pairs:
        for pair in pairs:
            w1, w2, actual_sim = pair.strip().split(delim)
            try:
                # create_word_vector is defined elsewhere in the source project
                w1_vec = create_word_vector(w1, pos_vectors)
                w2_vec = create_word_vector(w2, pos_vectors)
                pred = float(np.inner(w1_vec, w2_vec))
                actual_sim_list.append(float(actual_sim))
                pred_sim_list.append(pred)
            except KeyError:
                missed += 1
    spearman, _ = st.spearmanr(actual_sim_list, pred_sim_list)
    pearson, _ = st.pearsonr(actual_sim_list, pred_sim_list)
    return spearman, pearson, missed
def sim_getCorrelation(We, words, f, weight4ind, scoring_function, params):
    golds = []
    seq1 = []
    seq2 = []
    with open(f, 'r') as fin:   # avoid shadowing the filename argument
        lines = fin.readlines()
    for i in lines:
        i = i.split("\t")
        p1 = i[0]; p2 = i[1]; score = float(i[2])
        X1, X2 = data_io.getSeqs(p1, p2, words)
        seq1.append(X1)
        seq2.append(X2)
        golds.append(score)
    x1, m1 = data_io.prepare_data(seq1)
    x2, m2 = data_io.prepare_data(seq2)
    # turn the padding masks into per-word weights
    m1 = data_io.seq2weight(x1, m1, weight4ind)
    m2 = data_io.seq2weight(x2, m2, weight4ind)
    scores = scoring_function(We, x1, x2, m1, m2, params)
    preds = np.squeeze(scores)
    return pearsonr(preds, golds)[0], spearmanr(preds, golds)[0]
def getCorrelation(model, words, f, params=[]):
    golds = []
    seq1 = []
    seq2 = []
    with open(f, 'r') as fin:
        lines = fin.readlines()
    for i in lines:
        i = i.split("\t")
        p1 = i[0]; p2 = i[1]; score = float(i[2])
        X1, X2 = data_io.getSeqs(p1, p2, words)
        seq1.append(X1)
        seq2.append(X2)
        golds.append(score)
    x1, m1 = data_io.prepare_data(seq1)
    x2, m2 = data_io.prepare_data(seq2)
    if params and params.weightfile:
        m1 = data_io.seq2weight(x1, m1, params.weight4ind)
        m2 = data_io.seq2weight(x2, m2, params.weight4ind)
    scores = model.scoring_function(x1, x2, m1, m2)
    preds = np.squeeze(scores)
    return pearsonr(preds, golds)[0], spearmanr(preds, golds)[0]
def get_pearson_coeff(similar_stroke):
    stroke1 = similar_stroke[0]
    stroke2 = similar_stroke[1]
    min_len = min(len(stroke1), len(stroke2))
    sx1 = [stroke1[i][0] for i in range(min_len)]
    sx2 = [stroke2[i][0] for i in range(min_len)]
    sy1 = [stroke1[i][1] for i in range(min_len)]
    sy2 = [stroke2[i][1] for i in range(min_len)]
    # correlate like coordinates across the two strokes
    # (the original compared sx1 with sy1, mixing x and y of stroke 1)
    x_pearson = pearsonr(sx1, sx2)[0]
    y_pearson = pearsonr(sy1, sy2)[0]
    if x_pearson > 0.5 or y_pearson > 0.5:
        print(similar_stroke[2], similar_stroke[3])
        print(x_pearson, y_pearson)
        plt.plot(sx1, label="Stroke 1 X Co-ordinate")
        plt.plot(sx2, label="Stroke 2 X Co-ordinate")
        plt.plot(sy1, label="Stroke 1 Y Co-ordinate")
        plt.plot(sy2, label="Stroke 2 Y Co-ordinate")
        plt.legend(ncol=2, fancybox=True)
        plt.show()
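A hypothetical call: strokes are point lists, and the last two entries of similar_stroke are identifier fields that get printed.

stroke_a = [(0, 0), (1, 1), (2, 2), (3, 3)]
stroke_b = [(0, 1), (1, 2), (2, 3), (3, 4)]
get_pearson_coeff([stroke_a, stroke_b, "stroke_a_id", "stroke_b_id"])
# both coordinate correlations are 1.0 > 0.5, so the strokes are printed and plotted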
def eval_sts(ycat, y, name, quiet=False):
""" Evaluate given STS regression-classification predictions and print results. """
if ycat.ndim == 1:
ypred = ycat
else:
ypred = loader.sts_categorical2labels(ycat)
if y.ndim == 1:
ygold = y
else:
ygold = loader.sts_categorical2labels(y)
pr = pearsonr(ypred, ygold)[0]
sr = spearmanr(ypred, ygold)[0]
e = mse(ypred, ygold)
if not quiet:
print('%s Pearson: %f' % (name, pr,))
print('%s Spearman: %f' % (name, sr,))
print('%s MSE: %f' % (name, e,))
return STSRes(pr, sr, e)
# evaluate.py, from project Learning-sentence-representation-with-guidance-of-human-attention (author: wangshaonan)
def getCorrelation(model, words, f):
    golds = []
    seq1 = []
    seq2 = []
    with open(f, 'r') as fin:
        lines = fin.readlines()
    for i in lines:
        i = i.split("\t")
        p1 = i[1]; p2 = i[2]; score = float(i[0])
        # inputs whose tokens look like "word_x" carry a second annotation sequence
        if len(p1.split()[0].split('_')) == 2:
            X1, X2, SX1, SX2 = getSeqs2(p1, p2, words)
        else:
            X1, X2 = getSeqs(p1, p2, words)
        seq1.append(X1)
        seq2.append(X2)
        golds.append(score)
    x1, m1 = utils.prepare_data(seq1)
    x2, m2 = utils.prepare_data(seq2)
    scores = model.scoring_function(x1, x2, m1, m2)
    preds = np.squeeze(scores)
    return pearsonr(preds, golds)[0], spearmanr(preds, golds)[0]
def getCorrelation2(model, words, f):
    golds = []
    seq1 = []
    seq2 = []
    sseq1 = []
    sseq2 = []
    with open(f, 'r') as fin:
        lines = fin.readlines()
    for i in lines:
        i = i.split("\t")
        p1 = i[1]; p2 = i[2]; score = float(i[0])
        X1, X2, SX1, SX2 = getSeqs2(p1, p2, words)
        seq1.append(X1)
        seq2.append(X2)
        sseq1.append(SX1)
        sseq2.append(SX2)
        golds.append(score)
    x1, m1, s1 = utils.prepare_data2(seq1, sseq1)
    x2, m2, s2 = utils.prepare_data2(seq2, sseq2)
    scores = model.scoring_function2(x1, x2, m1, m2, s1, s2)
    preds = np.squeeze(scores)
    return pearsonr(preds, golds)[0], spearmanr(preds, golds)[0]
def test_compute_correlations_between_versions_default_columns():
    df_old = pd.DataFrame({'spkitemid': ['a', 'b', 'c'],
                           'feature1': [1.3, 1.5, 2.1],
                           'feature2': [1.1, 6.2, 2.1],
                           'sc1': [2, 3, 4]})
    df_new = pd.DataFrame({'spkitemid': ['a', 'b', 'c'],
                           'feature1': [-1.3, -1.5, -2.1],
                           'feature2': [1.1, 6.2, 2.1],
                           'sc1': [2, 3, 4]})
    df_cors = compute_correlations_between_versions(df_old, df_new)
    # DataFrame.get_value() was removed in pandas 1.0; .loc is the equivalent
    assert_equal(df_cors.loc['feature1', 'old_new'], -1.0)
    assert_equal(df_cors.loc['feature2', 'old_new'], 1.0)
    assert_equal(df_cors.loc['feature1', 'human_old'],
                 pearsonr(df_old['feature1'], df_old['sc1'])[0])
    assert_equal(df_cors.loc['feature1', 'human_new'],
                 pearsonr(df_new['feature1'], df_new['sc1'])[0])
    assert_equal(df_cors.loc['feature1', 'N'], 3)
def test_compute_correlations_between_versions_custom_columns():
    df_old = pd.DataFrame({'id': ['a', 'b', 'c'],
                           'feature1': [1.3, 1.5, 2.1],
                           'feature2': [1.1, 6.2, 2.1],
                           'r1': [2, 3, 4]})
    df_new = pd.DataFrame({'id': ['a', 'b', 'c'],
                           'feature1': [-1.3, -1.5, -2.1],
                           'feature2': [1.1, 6.2, 2.1],
                           'r1': [2, 3, 4]})
    df_cors = compute_correlations_between_versions(df_old,
                                                    df_new,
                                                    human_score='r1',
                                                    id_column='id')
    assert_equal(df_cors.loc['feature1', 'old_new'], -1.0)
    assert_equal(df_cors.loc['feature2', 'old_new'], 1.0)
    assert_equal(df_cors.loc['feature1', 'human_old'],
                 pearsonr(df_old['feature1'], df_old['r1'])[0])
    assert_equal(df_cors.loc['feature1', 'human_new'],
                 pearsonr(df_new['feature1'], df_new['r1'])[0])
    assert_equal(df_cors.loc['feature1', 'N'], 3)
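The two tests above pin down the contract of compute_correlations_between_versions: a correlation table indexed by feature, with old_new, human_old, human_new, and N columns. A minimal sketch consistent with those assertions might look like the following; it is inferred from the tests, not the project's actual implementation.

import pandas as pd
from scipy.stats import pearsonr

def compute_correlations_between_versions_sketch(df_old, df_new,
                                                 human_score='sc1',
                                                 id_column='spkitemid'):
    """Hypothetical re-implementation, inferred from the tests above."""
    merged = pd.merge(df_old, df_new, on=id_column, suffixes=('_old', '_new'))
    features = [c for c in df_old.columns if c not in (id_column, human_score)]
    rows = {}
    for feat in features:
        old_vals, new_vals = merged[feat + '_old'], merged[feat + '_new']
        rows[feat] = {'N': len(merged),
                      'old_new': pearsonr(old_vals, new_vals)[0],
                      'human_old': pearsonr(old_vals, merged[human_score + '_old'])[0],
                      'human_new': pearsonr(new_vals, merged[human_score + '_new'])[0]}
    return pd.DataFrame.from_dict(rows, orient='index')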
def plot_correlation(X, Y, title, corr=None):
    if corr is None:   # identity check; the original used "corr == None"
        corr, _ = stats.pearsonr(X, Y)
    # keep the points where X exceeds the 99th percentile of Y
    thresh = np.percentile(Y, 99)
    X90 = X[X > thresh]
    Y90 = Y[X > thresh]
    # plot a random sample of 100 of the surviving points
    sample = np.random.choice(X90.shape[0], size=100, replace=False)
    Xsample = X90[sample]
    Ysample = Y90[sample]
    plt.scatter(Xsample, Ysample, color="red")
    plt.xlim([np.min(Xsample), np.max(Xsample)])
    plt.ylim([np.min(Ysample), np.max(Ysample)])
    plt.title("{:s} (corr: {:.3f})".format(title, corr))
    plt.xlabel("X")
    plt.ylabel("Y")
def _corrfunc(x, y, **kws):
    """ Annotate grid with correlation coefficient.
    Solution from http://stackoverflow.com/a/30942817
    """
if args.c == 'spearman':
r, _ = stats.spearmanr(x, y)
corr_type = 'Rho'
elif args.c == 'pearson':
r, _ = stats.pearsonr(x, y)
corr_type = 'r'
else:
raise Exception('Invalid correlation statistic.')
correlations.append(r)
ax = plotter.plt.gca()
ax.annotate("{} = {:.2f}".format(corr_type, r),
xy=(.1, .9), xycoords=ax.transAxes)
def train_model(lrmodel, X, Y, devX, devY, devscores):
    """
    Train model, using pearsonr on dev for early stopping
    """
    done = False
    best = -1.0
    r = np.arange(1, 6)   # the five ordinal similarity classes
    while not done:
        # Every 100 epochs, check Pearson on development set
        lrmodel.fit(X, Y, verbose=2, shuffle=False, validation_data=(devX, devY))
        # expected score = class probabilities dotted with class values
        yhat = np.dot(lrmodel.predict_proba(devX, verbose=2), r)
        score = pearsonr(yhat, devscores)[0]
        if score > best:
            print(score)
            best = score
            bestlrmodel = copy.deepcopy(lrmodel)
        else:
            done = True
    yhat = np.dot(bestlrmodel.predict_proba(devX, verbose=2), r)
    score = pearsonr(yhat, devscores)[0]
    print('Dev Pearson: ' + str(score))
    return bestlrmodel
def test_partial_fit():
data = load_diabetes()
clf = MLPRegressor(n_epochs=1)
X, y = data['data'], data['target']
for _ in range(30):
clf.partial_fit(X, y)
y_pred = clf.predict(X)
assert pearsonr(y_pred, y)[0] > 0.5
def joint_plot(x, y, xlabel=None,
ylabel=None, xlim=None, ylim=None,
loc="best", color='#0485d1',
size=8, markersize=50, kind="kde",
scatter_color="r"):
with sns.axes_style("darkgrid"):
if xlabel and ylabel:
g = SubsampleJointGrid(xlabel, ylabel,
data=DataFrame(data={xlabel: x, ylabel: y}),
space=0.1, ratio=2, size=size, xlim=xlim, ylim=ylim)
else:
g = SubsampleJointGrid(x, y, size=size,
space=0.1, ratio=2, xlim=xlim, ylim=ylim)
g.plot_joint(sns.kdeplot, shade=True, cmap="Blues")
g.plot_sub_joint(plt.scatter, 1000, s=20, c=scatter_color, alpha=0.3)
g.plot_marginals(sns.distplot, kde=False, rug=False)
g.annotate(ss.pearsonr, fontsize=25, template="{stat} = {val:.2g}\np = {p:.2g}")
g.ax_joint.set_yticklabels(g.ax_joint.get_yticks())
g.ax_joint.set_xticklabels(g.ax_joint.get_xticks())
return g
def plotCorrelation(stats):
    #columnsToDrop = ['sleep_interval_max_len', 'sleep_interval_min_len',
    #                 'sleep_interval_avg_len', 'sleep_inefficiency',
    #                 'sleep_hours', 'total_hours']
    #stats = stats.drop(columnsToDrop, axis=1)
    g = sns.PairGrid(stats)
    def corrfunc(x, y, **kws):
        r, p = scipystats.pearsonr(x, y)
        ax = plt.gca()
        ax.annotate("r = {:.2f}".format(r), xy=(.1, .9), xycoords=ax.transAxes)
        ax.annotate("p = {:.2f}".format(p), xy=(.2, .8), xycoords=ax.transAxes)
        # fade out panels whose correlation is not significant
        if p > 0.04:
            ax.patch.set_alpha(0.1)
    g.map_upper(plt.scatter)
    g.map_diag(plt.hist)
    g.map_lower(sns.kdeplot, cmap="Blues_d")
    g.map_upper(corrfunc)
    plt.show()  # sns.plt was removed from seaborn; call matplotlib directly
# feature_selection.py, from project Default-Credit-Card-Prediction (author: AlexPnt)
def pearson_correlation_matrix(X):
    """
    Computes the Pearson Correlation matrix

    Keyword arguments:
    X -- The feature vectors
    """
    n_features = len(X[0])
    correlation_matrix = np.zeros(shape=(n_features, n_features))
    for i in range(n_features):   # xrange in the Python 2 original
        for j in range(n_features):
            pearson_corr = stats.pearsonr(X[:, i], X[:, j])[0]
            correlation_matrix[i][j] = pearson_corr
    return correlation_matrix
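A quick usage sketch on toy data (the variables here are hypothetical):

import numpy as np
from scipy import stats

rng = np.random.RandomState(0)
X = rng.rand(100, 4)                            # 100 samples, 4 features
X[:, 1] = 2 * X[:, 0] + 0.01 * rng.rand(100)    # feature 1 tracks feature 0

C = pearson_correlation_matrix(X)
print(np.round(C, 2))   # diagonal is 1.0 and C[0, 1] is close to 1.0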
def pearson_between_feature_class(X, y, threshold):
    """
    Computes the Pearson Correlation between each feature and the target class
    and keeps the highly correlated feature-class pairs

    Keyword arguments:
    X -- The feature vectors
    y -- The target vector
    threshold -- Threshold value used to decide which features to keep (above the threshold)
    """
    if verbose:
        print('\nPerforming Feature Selection based on the correlation between each feature and class ...')
    feature_indexes = []
    for i in range(len(X[0])):
        if abs(stats.pearsonr(X[:, i], y)[0]) > threshold:
            feature_indexes += [i]
    if len(feature_indexes) != 0:
        # return the selected features and their original indexes
        return X[:, feature_indexes], feature_indexes
    return X, feature_indexes
def getCorrelation(model, words, f):
    golds = []
    seq1 = []
    seq2 = []
    with open(f, 'r') as fin:
        lines = fin.readlines()
    for i in lines:
        i = i.split("\t")
        p1 = i[0]; p2 = i[1]; score = float(i[2])
        X1, X2 = getSeqs(p1, p2, words)
        seq1.append(X1)
        seq2.append(X2)
        golds.append(score)
    x1, m1 = utils.prepare_data(seq1)
    x2, m2 = utils.prepare_data(seq2)
    scores = model.scoring_function(x1, x2, m1, m2)
    preds = np.squeeze(scores)
    return pearsonr(preds, golds)[0], spearmanr(preds, golds)[0]
def pearson(X, y):
r = []
p = []
for c in X.columns:
r_, p_ = pearsonr(X[c], y)
r.append(r_)
p.append(p_)
dfr = pd.DataFrame(index=range(1, 1+len(X.columns)))
dfr['pearson'] = r
dfr['pearson_p'] = p
return dfr
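A toy call for pearson (hypothetical data):

import numpy as np
import pandas as pd
from scipy.stats import pearsonr

X = pd.DataFrame({"f1": [1.0, 2.0, 3.0, 4.0], "f2": [4.0, 3.0, 2.0, 1.0]})
y = np.array([1.1, 2.0, 2.9, 4.2])
print(pearson(X, y))   # row 1: f1, strongly positive; row 2: f2, strongly negative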
def _calculate(self, input):
input = input[~np.isnan(input).any(axis=1)]
return pearsonr(input[:,0], input[:,1])
def calcCorrelation(df,col1,col2):
x,y,n = discardNans(df,col1,col2)
return stats.pearsonr(x, y)
def run(self):
self.nepoch = 0
bestpr = -1
early_stop_count = 0
r = np.arange(1, 6)
stop_train = False
# Preparing data
trainX, trainy, devX, devy, testX, testy = self.prepare_data(
self.train['X'], self.train['y'],
self.valid['X'], self.valid['y'],
self.test['X'], self.test['y'])
# Training
while not stop_train and self.nepoch <= self.maxepoch:
self.trainepoch(trainX, trainy, nepoches=50)
yhat = np.dot(self.predict_proba(devX), r)
pr = pearsonr(yhat, self.devscores)[0]
# early stop on Pearson
if pr > bestpr:
bestpr = pr
bestmodel = copy.deepcopy(self.model)
elif self.early_stop:
if early_stop_count >= 3:
stop_train = True
early_stop_count += 1
self.model = bestmodel
yhat = np.dot(self.predict_proba(testX), r)
return bestpr, yhat
def report_metrics(yhat, y):
    # report metrics of the training set
    r2 = r2_score(y, yhat)
    var_exp = explained_variance_score(y, yhat)
    r = stats.pearsonr(yhat, y)[0]
    logger.info("Model metrics for training set: r2={:.2f}, Variance explained={:.2f}, Pearson's r={:.2f}".format(r2, var_exp, r))
def return_correlations(instances, labels):
    feature_correlation = {}
    nplabels = numpy.array(labels)
    for i in range(instances.shape[1]):
        # flatten the sparse column to 1-D for pearsonr
        feature_vals = instances[:, i].toarray().ravel()
        corr, p = stats.pearsonr(feature_vals, nplabels)
        feature_correlation[i] = [corr, p]
    # the original returned feature_correlation[i], losing all but the last entry
    return feature_correlation
def calculate_ordinal_correlation_feature_labels(instances, labels):
    # calculate correlation by feature
    feature_correlation = []
    for i in range(instances.shape[1]):
        feature_vals = instances[:, i].transpose().toarray()[0]
        try:
            corr, p = stats.pearsonr(feature_vals, labels)
            if math.isnan(corr):
                corr = 0
        except Exception:
            corr, p = 0, 1.0   # p was left undefined here in the original
        feature_correlation.append([i, abs(corr), corr, p])
    sorted_feature_correlation = sorted(feature_correlation, key=lambda k: k[1], reverse=True)
    return sorted_feature_correlation
def calculate_feature_correlation(instances):
    # calculate pairwise correlation between features
    feature_correlation = []
    for i in range(instances.shape[1]):
        feature_vals_i = instances[:, i].transpose().toarray()[0]
        for j in range(i + 1, instances.shape[1]):
            feature_vals_j = instances[:, j].transpose().toarray()[0]
            try:
                corr, p = stats.pearsonr(feature_vals_i, feature_vals_j)
                if math.isnan(corr):
                    corr = 0
            except Exception:
                corr, p = 0, 1.0   # p was left undefined here in the original
            feature_correlation.append([i, j, abs(corr), corr, p])
    return feature_correlation
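A toy call, assuming instances is a scipy sparse matrix as the column slicing implies:

import math
import numpy
from scipy import sparse, stats

instances = sparse.csr_matrix(numpy.array([[1.0, 2.0, 0.0],
                                           [2.0, 4.1, 1.0],
                                           [3.0, 5.9, 0.5]]))
for i, j, abs_corr, corr, p in calculate_feature_correlation(instances):
    print(i, j, round(abs_corr, 2), round(p, 3))
# columns 0 and 1 are nearly collinear, so their |corr| prints close to 1.0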
def get_scores(self):
self.model.eval()
num_classes = self.dataset_cls.NUM_CLASSES
predict_classes = torch.arange(1, num_classes + 1).expand(self.batch_size, num_classes)
test_kl_div_loss = 0
predictions = []
true_labels = []
for batch in self.data_loader:
output = self.model(batch.sentence_1, batch.sentence_2, batch.ext_feats)
test_kl_div_loss += F.kl_div(output, batch.label, size_average=False).data[0]
# handle last batch which might have smaller size
if len(predict_classes) != len(batch.sentence_1):
predict_classes = torch.arange(1, num_classes + 1).expand(len(batch.sentence_1), num_classes)
if self.data_loader.device != -1:
with torch.cuda.device(self.device):
predict_classes = predict_classes.cuda()
true_labels.append((predict_classes * batch.label.data).sum(dim=1))
predictions.append((predict_classes * output.data.exp()).sum(dim=1))
del output
predictions = torch.cat(predictions).cpu().numpy()
true_labels = torch.cat(true_labels).cpu().numpy()
test_kl_div_loss /= len(batch.dataset.examples)
pearson_r = pearsonr(predictions, true_labels)[0]
spearman_r = spearmanr(predictions, true_labels)[0]
return [pearson_r, spearman_r, test_kl_div_loss], ['pearson_r', 'spearman_r', 'KL-divergence loss']
def get_scores(self):
self.model.eval()
num_classes = self.dataset_cls.NUM_CLASSES
predict_classes = torch.arange(0, num_classes).expand(self.batch_size, num_classes)
test_kl_div_loss = 0
predictions = []
true_labels = []
for batch in self.data_loader:
output = self.model(batch.sentence_1, batch.sentence_2, batch.ext_feats)
test_kl_div_loss += F.kl_div(output, batch.label, size_average=False).data[0]
# handle last batch which might have smaller size
if len(predict_classes) != len(batch.sentence_1):
predict_classes = torch.arange(0, num_classes).expand(len(batch.sentence_1), num_classes)
if self.data_loader.device != -1:
with torch.cuda.device(self.device):
predict_classes = predict_classes.cuda()
true_labels.append((predict_classes * batch.label.data).sum(dim=1))
predictions.append((predict_classes * output.data.exp()).sum(dim=1))
del output
predictions = torch.cat(predictions).cpu().numpy()
true_labels = torch.cat(true_labels).cpu().numpy()
test_kl_div_loss /= len(batch.dataset.examples)
pearson_r = pearsonr(predictions, true_labels)[0]
return [pearson_r, test_kl_div_loss], ['pearson_r', 'KL-divergence loss']