import os
import pickle

from gensim.models import Word2Vec, KeyedVectors


def get_global_embeddings(self, filenames, embedding_size, embedding_dir):
""" Construct the Embedding Matrix for the sentences in filenames.
Args:
filenames: File names of the training files: Based on
which the vocab will be built. This is used when there
are no pretrained embeddings present. Then instead of
using random embeddings, Word2Vec algorithm is used
to train the embeddings on the dataset avaliable.
embedding_size: Dimensions for the embedding to be used.
Returns
Embedding matrix.
"""
    sentences = []
    if os.path.exists(embedding_dir + 'vocab_len.pkl'):
        # Pickle files must be opened in binary mode.
        with open(embedding_dir + 'vocab_len.pkl', 'rb') as f:
            vocab_len_stored = pickle.load(f)
    else:
        vocab_len_stored = 0
    if vocab_len_stored == self.len_vocab and os.path.exists(embedding_dir + 'embeddings.pkl'):
        # The vocabulary is unchanged, so the cached embeddings are still valid.
        print("Loading cached embeddings")
        with open(embedding_dir + 'embeddings.pkl', 'rb') as f:
            self.embeddings = pickle.load(f)
        return None
    if os.path.exists(embedding_dir + 'embeddings'):
        print("Loading pretrained embeddings")
        model = KeyedVectors.load_word2vec_format(embedding_dir + 'embeddings', binary=False)
    else:
        # Build the training corpus: one token list per line across all files.
        for file in filenames:
            with open(file, 'r') as f:
                for line in f:
                    sentences.append(line.split())
        # Note: gensim >= 4.0 renames the `size` parameter to `vector_size`.
        model = Word2Vec(sentences, size=embedding_size, min_count=0)
        # Save in word2vec text format so KeyedVectors.load_word2vec_format
        # above can read it back on the next run (model.save would write
        # gensim's native format instead).
        model.wv.save_word2vec_format(embedding_dir + 'embeddings', binary=False)
self.embeddings_model = model
return model
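

# --- Usage sketch (not from the original source) ---------------------------
# A minimal, hedged example of calling the method above. `SeqModel` is a
# hypothetical class standing in for whatever class defines
# get_global_embeddings and self.len_vocab; the file names and directory are
# placeholders. Note that embedding_dir is concatenated as a raw string
# prefix, so it must end with a path separator (e.g. './embeddings/').
if __name__ == '__main__':
    model_obj = SeqModel()                    # hypothetical owner class
    w2v = model_obj.get_global_embeddings(
        filenames=['train.txt'],              # placeholder corpus file
        embedding_size=128,
        embedding_dir='./embeddings/')
    if w2v is not None:
        # When a new Word2Vec model was trained, inspect the learned
        # vector for a token that appears in the corpus.
        print(w2v.wv['the'].shape)            # -> (128,)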