def make_sampling_table(size, sampling_factor=1e-5):
'''This generates an array where the ith element
is the probability that a word of rank i would be sampled,
according to the sampling distribution used in word2vec.
The word2vec formula is:
p(word) = min(1, sqrt(word.frequency/sampling_factor) / (word.frequency/sampling_factor))
We assume that the word frequencies follow Zipf's law (s=1) to derive
a numerical approximation of frequency(rank):
frequency(rank) ~ 1/(rank * (log(rank) + gamma) + 1/2 - 1/(12*rank))
where gamma is the Euler-Mascheroni constant.
# Arguments
size: int, number of possible words to sample.
'''
gamma = 0.577
rank = np.array(list(range(size)))
rank[0] = 1
inv_fq = rank * (np.log(rank) + gamma) + 0.5 - 1./(12.*rank)
f = sampling_factor * inv_fq
return np.minimum(1., f / np.sqrt(f))
评论列表
文章目录