import gzip
import sys

import numpy


def get_embedding_matrix(self, embedding_file, onto_aware):
    # embedding_file is a gzipped tsv with words in the first column and the
    # vector components in the remaining columns. This populates the synset
    # embedding if onto_aware is True, and the word embedding otherwise.
    # For elements that do not have pretrained vectors, we sample from a
    # uniform distribution over the range [min, max] of all pretrained
    # embedding values.
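    # A sample line of such a file (hypothetical word and values), as parsed
    # by the whitespace split below:
    #   dog    0.418    0.24968    -0.41242    ...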
    embedding_map = {}
    rep_max = -float("inf")
    rep_min = float("inf")
    for line in gzip.open(embedding_file):
        ln_parts = line.strip().split()
        if len(ln_parts) == 2:
            # Skip a possible header line (e.g. word2vec's "<vocab_size> <dim>").
            continue
        element = ln_parts[0]
        vec = numpy.asarray([float(f) for f in ln_parts[1:]])
        # Track the global min and max over all vector components; they bound
        # the uniform distribution used for out-of-vocabulary elements below.
        vec_max, vec_min = vec.max(), vec.min()
        if vec_max > rep_max:
            rep_max = vec_max
        if vec_min < rep_min:
            rep_min = vec_min
        embedding_map[element] = vec
    # All pretrained vectors share one dimensionality; read it off the last
    # vector seen.
    embedding_dim = len(vec)
    target_index = self.synset_index if onto_aware else self.word_index
    # Initialize the whole target embedding with random vectors drawn from
    # the same range as the pretrained values, then overwrite the rows for
    # which pretrained vectors were found.
    target_vocab_size = self.get_vocab_size(onto_aware=onto_aware)
    target_embedding = self.numpy_rng.uniform(low=rep_min, high=rep_max,
                                              size=(target_vocab_size, embedding_dim))
    num_found_elements = 0
    num_all_elements = 0
    for element in target_index:
        num_all_elements += 1
        if element in embedding_map:
            vec = embedding_map[element]
            target_embedding[target_index[element]] = vec
            num_found_elements += 1
    print >>sys.stderr, "Found vectors for %.4f of the words" % (
        float(num_found_elements) / num_all_elements)
    return target_embedding
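
# A minimal usage sketch, assuming this method lives on a data-processor class
# that defines word_index, synset_index, get_vocab_size, and numpy_rng (the
# class name and the embedding file name below are hypothetical):
#
#     processor = DataProcessor()
#     # ... index the training data so word_index / synset_index are built ...
#     word_matrix = processor.get_embedding_matrix("glove.txt.gz",
#                                                  onto_aware=False)
#     # word_matrix has shape (vocab_size, embedding_dim); rows for words
#     # without pretrained vectors keep their uniform random initialization.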