def norm_EDA(vectors, lg, embedding):
    """EDA on the norm of the vectors.

    Computes the L2 norm of every vector, standardizes the resulting
    distribution of norms (zero mean, unit standard deviation), saves a
    histogram of it overlaid with the standard normal pdf, and runs three
    normality tests on it.

    vectors = word embedding vectors; the norm is taken along axis 1, so
        a 2-D array with one vector per row is expected
    lg = language (string, used in the image file name)
    embedding = gensim, polyglot, etc. (string, used in the image file name)

    Returns a 7-tuple:
        (ad_test_stat, ad_crit_val_1, ad_result,
         ks_p_val, ks_result,
         sh_p_val, sh_result)
    where each *_result is 'Reject' or 'Fail to Reject' for the null
    hypothesis of normality at the 1% level.

    Side effects: writes '../images/<lg>_<embedding>_norm.png' and closes
    all open pyplot figures.
    """
    # L2 norm of vectors, then normalize distribution of L2 norms
    vectors_norm = np.linalg.norm(vectors, axis=1)
    vectors_norm_normalized = (
        (vectors_norm - vectors_norm.mean()) / vectors_norm.std()
    )

    # Histogram compared to normal dist
    plt.figure(figsize=(10, 6))
    plt.xlim((-3, 5))
    # `density=True` replaces the `normed=True` keyword, which was removed
    # from Matplotlib; it scales the histogram so it integrates to 1 and is
    # therefore comparable with the pdf curve drawn below.
    plt.hist(vectors_norm_normalized, bins=100, density=True)
    x = np.linspace(-3, 3, 100)
    plt.plot(x, norm.pdf(x, 0, 1), color='r', linewidth=3)
    plt.savefig('../images/' + lg + '_' + embedding + '_norm.png')
    plt.close('all')

    # Anderson Darling
    # If test stat is greater than crit val, reject ho=normal
    # crit_val_1 is the critical value at the 1% significance level: the
    # last entry of critical_values (scipy lists the levels for 'norm' as
    # 15%, 10%, 5%, 2.5%, 1%).
    ad = anderson(vectors_norm_normalized, 'norm')
    ad_test_stat = ad.statistic
    ad_crit_val_1 = ad.critical_values[-1]
    ad_result = 'Reject' if ad_test_stat > ad_crit_val_1 else 'Fail to Reject'

    # Kolmogorov-Smirnov against the standard normal; index 1 is the p-value
    ks_p_val = kstest(vectors_norm_normalized, 'norm')[1]
    ks_result = 'Reject' if ks_p_val < .01 else 'Fail to Reject'

    # Shapiro; index 1 is the p-value
    # NOTE: scipy documents that the Shapiro p-value may not be accurate
    # for samples larger than 5000 observations.
    sh_p_val = shapiro(vectors_norm_normalized)[1]
    sh_result = 'Reject' if sh_p_val < .01 else 'Fail to Reject'

    result = (ad_test_stat, ad_crit_val_1, ad_result,
              ks_p_val, ks_result,
              sh_p_val, sh_result)
    return result
# Comment list
# Table of contents