import gzip
import sys

import numpy


def get_embedding_matrix(self, embedding_file, onto_aware):
    # embedding_file is a gzipped tsv with words in the first column and the
    # vector components in the remaining columns. This populates the synset
    # embedding if onto_aware is True, and the word embedding otherwise.
    # For elements that do not have pretrained vectors, we sample from a
    # uniform distribution over the range [min, max] of all pretrained
    # embedding values.
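    # A sample line of such a file (hypothetical word and values), as parsed
    # by the whitespace split below:
    #   dog    0.418    0.24968    -0.41242    ...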
    embedding_map = {}
    rep_max = -float("inf")
    rep_min = float("inf")
    for line in gzip.open(embedding_file):
        ln_parts = line.strip().split()
        if len(ln_parts) == 2:
            # Skip a possible header line (e.g. word2vec's "<vocab_size> <dim>").
            continue
        element = ln_parts[0]
        vec = numpy.asarray([float(f) for f in ln_parts[1:]])
        # Track the global min and max over all vector components; they bound
        # the uniform distribution used for out-of-vocabulary elements below.
        vec_max, vec_min = vec.max(), vec.min()
        if vec_max > rep_max:
            rep_max = vec_max
        if vec_min < rep_min:
            rep_min = vec_min
        embedding_map[element] = vec
    # All pretrained vectors share one dimensionality; read it off the last
    # vector seen.
    embedding_dim = len(vec)
    target_index = self.synset_index if onto_aware else self.word_index
    # Initialize the whole target embedding with random vectors drawn from
    # the same range as the pretrained values, then overwrite the rows for
    # which pretrained vectors were found.
    target_vocab_size = self.get_vocab_size(onto_aware=onto_aware)
    target_embedding = self.numpy_rng.uniform(low=rep_min, high=rep_max,
                                              size=(target_vocab_size, embedding_dim))
    num_found_elements = 0
    num_all_elements = 0
    for element in target_index:
        num_all_elements += 1
        if element in embedding_map:
            vec = embedding_map[element]
            target_embedding[target_index[element]] = vec
            num_found_elements += 1
    print >>sys.stderr, "Found vectors for %.4f of the words" % (
        float(num_found_elements) / num_all_elements)
    return target_embedding
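
# A minimal usage sketch, assuming this method lives on a data-processor class
# that defines word_index, synset_index, get_vocab_size, and numpy_rng (the
# class name and the embedding file name below are hypothetical):
#
#     processor = DataProcessor()
#     # ... index the training data so word_index / synset_index are built ...
#     word_matrix = processor.get_embedding_matrix("glove.txt.gz",
#                                                  onto_aware=False)
#     # word_matrix has shape (vocab_size, embedding_dim); rows for words
#     # without pretrained vectors keep their uniform random initialization.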