def worker(proc_num, queue, out_dir, in_dir, count_dir, words, dim, num_words, min_count=100):
    """Run a truncated SVD on each queued year's embedding and save the factors.

    Years are taken from ``queue`` until it reports empty. For every year the
    vocabulary is restricted to the words returned by ``words_above_count``
    that also appear in the first ``num_words`` entries of ``words[year]``;
    the embedding loaded for that year is cut down to this vocabulary,
    factorized with ``randomized_svd``, and the three factors plus the
    sub-embedding's ``iw`` attribute are written to disk.

    Args:
        proc_num: Identifier of this worker; not used inside the function.
        queue: Source of years to process; must provide ``empty()`` and
            ``get()``.
        out_dir: Prefix concatenated with ``OUT_FORMAT`` to build output paths.
        in_dir: Prefix concatenated with ``INPUT_FORMAT`` to build the input
            path.
        count_dir: Forwarded to ``words_above_count``.
        words: Container indexed by year whose values can be sliced; only the
            first ``num_words`` entries for a year are considered.
        dim: Number of SVD components to compute.
        num_words: Upper bound on how many entries of ``words[year]`` are used.
        min_count: Forwarded to ``words_above_count`` as the count threshold.
    """
    while True:
        # NOTE(review): empty() followed by get() is not atomic. If several
        # workers share this queue, one can block forever in get() after a
        # sibling took the last item -- confirm against the caller.
        if queue.empty():
            break
        year = queue.get()
        print("Loading embeddings for year %s" % (year,))
        # Random pause of up to two minutes; presumably staggers concurrent
        # workers so they do not all load at once -- TODO confirm.
        time.sleep(random.random() * 120)
        valid_words = set(words_above_count(count_dir, year, min_count))
        print(len(valid_words))
        # Keep the per-year list in its own local. Rebinding ``words`` here
        # would replace the year-indexed container and break ``words[year]``
        # on the next loop iteration.
        year_words = list(valid_words.intersection(words[year][:num_words]))
        print(len(year_words))
        base_embed = Explicit.load((in_dir + INPUT_FORMAT).format(year=year), normalize=False)
        base_embed = base_embed.get_subembed(year_words, restrict_context=True)
        print("SVD for year %s" % (year,))
        u, s, v = randomized_svd(base_embed.m, n_components=dim, n_iter=5)
        print("Saving year %s" % (year,))
        # All four output files share the same path prefix; build it once.
        out_prefix = (out_dir + OUT_FORMAT).format(year=year, dim=dim)
        np.save(out_prefix + "-u.npy", u)
        np.save(out_prefix + "-v.npy", v)
        np.save(out_prefix + "-s.npy", s)
        write_pickle(base_embed.iw, out_prefix + "-vocab.pkl")
# (Scraped web-page navigation text, not part of the program: "Comment list", "Article contents".)