sequence.py 文件源码-python代码片段

sequence.py 文件源码

python

阅读 28 收藏 0 点赞 0 评论 0

def make_sampling_table(size, sampling_factor=1e-5):
    '''This generates an array where the ith element
    is the probability that a word of rank i would be sampled,
    according to the sampling distribution used in word2vec.

    The word2vec formula is:
        p(word) = min(1, sqrt(word.frequency/sampling_factor) / (word.frequency/sampling_factor))

    We assume that the word frequencies follow Zipf's law (s=1) to derive
    a numerical approximation of frequency(rank):
       frequency(rank) ~ 1/(rank * (log(rank) + gamma) + 1/2 - 1/(12*rank))
        where gamma is the Euler-Mascheroni constant.

    # Arguments
        size: int, number of possible words to sample.
    '''
    gamma = 0.577
    rank = np.array(list(range(size)))
    rank[0] = 1
    inv_fq = rank * (np.log(rank) + gamma) + 0.5 - 1./(12.*rank)
    f = sampling_factor * inv_fq

    return np.minimum(1., f / np.sqrt(f))