vocab.py source code

python

Project: diversity_based_attention    Author: PrekshaNema25
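The method relies on os, pickle, and gensim's Word2Vec / KeyedVectors. A likely set of module-level imports for vocab.py (not shown in this excerpt) would be:

import os
import pickle

from gensim.models import Word2Vec
from gensim.models import KeyedVectors
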
def get_global_embeddings(self, filenames, embedding_size, embedding_dir):
    """Construct the embedding matrix for the sentences in filenames.

    Args:
        filenames: Names of the training files from which the vocab is
            built. They are used when no pretrained embeddings are
            present: instead of falling back to random embeddings,
            Word2Vec is trained on the available dataset.
        embedding_size: Dimensionality of the embeddings.
        embedding_dir: Directory holding cached/pretrained embeddings.

    Returns:
        The embedding model, or None if cached embeddings were loaded
        into self.embeddings.
    """
    sentences = []

    # Reuse cached embeddings only if the stored vocabulary size matches.
    if os.path.exists(embedding_dir + 'vocab_len.pkl'):
        vocab_len_stored = pickle.load(open(embedding_dir + 'vocab_len.pkl', 'rb'))
    else:
        vocab_len_stored = 0

    if vocab_len_stored == self.len_vocab and os.path.exists(embedding_dir + 'embeddings.pkl'):
        print("Load file")
        self.embeddings = pickle.load(open(embedding_dir + 'embeddings.pkl', 'rb'))
        return None

    if os.path.exists(embedding_dir + 'embeddings'):
        # Reuse embeddings previously saved in word2vec text format.
        print("Loading pretrained embeddings")
        model = KeyedVectors.load_word2vec_format(embedding_dir + 'embeddings', binary=False)
    else:
        # No pretrained embeddings: tokenize the training files and train Word2Vec.
        for file in filenames:
            with open(file, 'r') as f:
                for line in f:
                    sentences.append(line.split())

        # `size` is the gensim < 4.0 keyword; newer releases call it `vector_size`.
        model = Word2Vec(sentences, size=embedding_size, min_count=0)
        # Save in word2vec text format so the load branch above can read it back in.
        model.wv.save_word2vec_format(embedding_dir + 'embeddings', binary=False)

    self.embeddings_model = model
    return model
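
A minimal usage sketch, assuming a hypothetical Vocab class that exposes this method together with the len_vocab, embeddings, and embeddings_model attributes it reads and writes; the file names and sizes below are placeholders, not values from the project:

# Hypothetical driver; only get_global_embeddings and its arguments come from the snippet above.
vocab = Vocab()
model = vocab.get_global_embeddings(
    filenames=['train_content.txt', 'train_summary.txt'],  # placeholder paths
    embedding_size=100,
    embedding_dir='embeddings/',
)
if model is None:
    print('Reused cached embeddings from embeddings.pkl')  # vocab.embeddings is populated
else:
    print('Trained or loaded a gensim model')              # also stored in vocab.embeddings_model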