def sub_sampling(data, word_counter, word_dict, sampling_rate):
    """Randomly drop frequent tokens from ``data`` (frequency subsampling).

    For every ``(word, count)`` entry the relative frequency is
    ``f = count / total_words`` and the discard probability is
    ``max(0, 1 - sqrt(sampling_rate / f))``.  The probability is stored under
    ``word_dict[word]``, which is the value looked up for each token of
    ``data``.

    Args:
        data: Sequence of sentences, each a sequence of tokens.  It is
            traversed twice (once to count, once to filter), so it must be
            re-iterable.
        word_counter: Either an iterable of ``(word, count)`` pairs or a
            mapping ``word -> count`` such as ``collections.Counter``.
        word_dict: Mapping from each word of ``word_counter`` to the token
            representation used inside ``data``.
        sampling_rate: Threshold in the discard formula above; smaller values
            discard more aggressively, and ``0`` discards every counted token.

    Returns:
        A new list with one list per input sentence, containing the tokens
        that survived, in their original order.  ``data`` is not modified.

    Raises:
        KeyError: If a word of ``word_counter`` is missing from ``word_dict``.
    """
    total_words = sum(len(sentence) for sentence in data)

    # Accept a mapping (dict / Counter) as well as an iterable of pairs;
    # iterating a mapping directly would yield keys only.
    if hasattr(word_counter, "items"):
        counted = word_counter.items()
    else:
        counted = word_counter

    drop_prob = {}
    for word, count in counted:
        if total_words > 0 and count > 0:
            freq = count / total_words
            p = max(0.0, 1 - math.sqrt(sampling_rate / freq))
        else:
            # No usable frequency (empty corpus or zero count): never discard,
            # instead of dividing by zero.
            p = 0.0
        drop_prob[word_dict[word]] = p

    # random.random() lies in [0.0, 1.0), so ``>=`` keeps a token with
    # probability exactly 1 - p (p == 0 always keeps, p == 1 never keeps).
    # Tokens without an entry in the table are always kept.
    return [
        [token for token in sentence
         if random.random() >= drop_prob.get(token, 0.0)]
        for sentence in data
    ]
评论列表
文章目录