# eda.py -- Python source code snippet

def norm_EDA(vectors, lg, embedding):
    """EDA on the norm of the vectors

    Computes the L2 norm of every row of `vectors`, standardizes those
    norms (zero mean, unit standard deviation), plots their histogram
    against the standard normal pdf and runs three normality tests on them
    (Anderson-Darling, Kolmogorov-Smirnov, Shapiro).

    vectors = word embedding vectors (2-D array, one vector per row: the
              norm is taken along axis=1)
    lg = language (used in the name of the saved image)
    embedding = gensim, polyglot, etc. (used in the name of the saved image)

    Side effects: saves the plot to
    '../images/<lg>_<embedding>_norm.png' and closes all open matplotlib
    figures.

    Returns the tuple
        (ad_test_stat, ad_crit_val_1, ad_result,
         ks_p_val, ks_result,
         sh_p_val, sh_result)
    where each *_result is 'Reject' or 'Fail to Reject' (H0 = the
    standardized norms are normally distributed), judged at the 1% level.
    """
    # L2 norm of vectors, then normalize distribution of L2 norms
    vectors_norm = np.linalg.norm(vectors, axis=1)
    vectors_norm_normalized = (vectors_norm - vectors_norm.mean()) \
        / vectors_norm.std()

    # Histogram compared to normal dist
    plt.figure(figsize=(10, 6))
    plt.xlim((-3, 5))
    # `density=True` replaces the `normed=True` keyword, which was removed
    # from matplotlib; it scales the histogram to integrate to 1 so it is
    # comparable with the pdf drawn below.
    plt.hist(vectors_norm_normalized, bins=100, density=True)
    x = np.linspace(-3, 3, 100)
    plt.plot(x, norm.pdf(x, 0, 1), color='r', linewidth=3)
    plt.savefig('../images/' + lg + '_' + embedding + '_norm.png')
    plt.close('all')

    # Anderson Darling
    # If test stat is greater than crit val, reject H0 = normal.
    # For dist='norm' the last critical value returned by scipy corresponds
    # to the 1% significance level.
    ad = anderson(vectors_norm_normalized, 'norm')
    ad_test_stat = ad.statistic
    ad_crit_val_1 = ad.critical_values[-1]
    ad_result = 'Reject' if ad_test_stat > ad_crit_val_1 else 'Fail to Reject'

    # Kolmogorov-Smirnov against the standard normal; index 1 is the p-value
    ks_p_val = kstest(vectors_norm_normalized, 'norm')[1]
    ks_result = 'Reject' if ks_p_val < .01 else 'Fail to Reject'

    # Shapiro; index 1 is the p-value
    sh_p_val = shapiro(vectors_norm_normalized)[1]
    sh_result = 'Reject' if sh_p_val < .01 else 'Fail to Reject'

    result = (ad_test_stat, ad_crit_val_1, ad_result,
              ks_p_val, ks_result,
              sh_p_val, sh_result)
    return result