import json
import os
from os import path

import numpy as np
from tqdm import tqdm

# `Vocabulary`, `hash_word`, and `glorot_uniform` are assumed to be supplied
# by the surrounding project (`glorot_uniform` matching the old Keras-style
# initializer, whose result exposes `.eval()`).


def make_hash_embeddings(igor, vocab):
    assert os.path.exists(igor.target_glove), "You need to specify a real file"
    fileiter = open(igor.target_glove).readlines()

    # Build a vocabulary over hash tokens rather than words.
    hash_vocab = Vocabulary()
    hash_vocab.use_mask = True
    hash_vocab.add(hash_vocab.mask_symbol)
    hash_vocab.add(hash_vocab.unk_symbol)

    # Map each word id to the ids of that word's hash tokens.
    word2hash = {}
    for word, v_id in vocab.items():
        ids = hash_vocab.add_many(hash_word(word))
        word2hash[v_id] = ids

    embeddings = np.zeros((len(hash_vocab), igor.embedding_size))
    remaining_vocab = set(vocab.keys())
    remaining_hashes = set(hash_vocab.values())
    # Distribute each pretrained GloVe vector evenly over the word's hash buckets.
    for line in tqdm(fileiter):
        line = line.rstrip("\n").split(" ")
        word, nums = line[0], [float(x) for x in line[1:]]
        if word in remaining_vocab:
            hash_ids = word2hash[vocab[word]]
            remaining_vocab.remove(word)
            remaining_hashes.difference_update(hash_ids)
            embeddings[hash_ids] += np.array(nums) / len(hash_ids)
print("{} words were not seen. {} hashes were not seen".format(len(remaining_vocab),
len(remaining_hashes)))
for hash_id in remaining_hashes:
embeddings[hash_id] = np.asarray(glorot_uniform((igor.embedding_size,)).eval())
    # Derive an artifact suffix from the GloVe file path.
    glove_name = igor.target_glove[igor.target_glove.find("glove"):].replace("/", "")
    hash_vocab.save('hash_embedding_{}.vocab'.format(glove_name))
    with open(path.join(igor.save_dir, "hash_embedding_{}.npy".format(glove_name)), "wb") as fp:
        np.save(fp, embeddings)
    # Note: JSON serialization turns the integer word-id keys into strings.
    with open(path.join(igor.save_dir, "word2hash_{}.json".format(glove_name)), "w") as fp:
        json.dump(word2hash, fp)
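
# --------------------------------------------------------------------------
# Hedged sketch: the project's `hash_word` is not shown here. One common
# choice (fastText-style) hashes character n-grams into a fixed number of
# buckets; the stand-in below is illustrative only, and `num_buckets` /
# `ngram` are made-up parameters, not the project's.
import zlib


def hash_word_sketch(word, num_buckets=10000, ngram=3):
    # Pad with boundary markers so prefixes/suffixes yield distinct grams.
    padded = "<{}>".format(word)
    grams = [padded[i:i + ngram] for i in range(max(1, len(padded) - ngram + 1))]
    # zlib.crc32 is deterministic across runs, unlike Python's builtin hash().
    return ["#{}".format(zlib.crc32(g.encode("utf-8")) % num_buckets) for g in grams]


# Hypothetical invocation, assuming `igor` carries the fields used above:
#   igor.target_glove = "data/glove.6B.100d.txt"   # made-up path
#   igor.embedding_size = 100
#   igor.save_dir = "saves"
#   make_hash_embeddings(igor, vocab)              # vocab: word -> int id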