def load_embeddings(filename):
    """Load word embeddings from a UTF-8 text file.

    Each non-blank line is expected to hold a word, a single space, and then
    the whitespace-separated components of that word's vector.

    Args:
        filename: Path of the embeddings file to read.

    Returns:
        A tuple ``(weights, word_idx)``. ``weights`` is a float32 array with
        one row per embedding line, in file order, plus one extra trailing
        row drawn uniformly from [-0.05, 0.05) that serves as the vector for
        unknown words. ``word_idx`` maps each word to its row index; the
        trailing unknown-word row has no entry in ``word_idx``.

    Raises:
        ValueError: If the file contains no embedding lines.
    """
    weight_vectors = []
    word_idx = {}
    with codecs.open(filename, encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            word, vec = line.split(u' ', 1)
            word_idx[word] = len(weight_vectors)
            weight_vectors.append(np.array(vec.split(), dtype=np.float32))
    if not weight_vectors:
        raise ValueError(
            'No embeddings found in {!r}'.format(filename))
    # Annoying implementation detail; '(' and ')' are replaced by '-LRB-' and
    # '-RRB-' respectively in the parse-trees. Only remap the brackets that
    # the vocabulary actually contains, so a file without them still loads.
    for bracket, token in ((u'(', u'-LRB-'), (u')', u'-RRB-')):
        if bracket in word_idx:
            word_idx[token] = word_idx.pop(bracket)
    # Random embedding vector for unknown words, appended as the last row.
    weight_vectors.append(np.random.uniform(
        -0.05, 0.05, weight_vectors[0].shape).astype(np.float32))
    return np.stack(weight_vectors), word_idx
# Comment list
# Article table of contents