word2vec.py 文件源码

python
阅读 23 收藏 0 点赞 0 评论 0

项目:conec 作者: cod3licious 项目源码 文件源码
def _create_binary_tree(self):
        """
        Create a binary Huffman tree using stored vocabulary word counts. Frequent words
        will have shorter binary codes. Called internally from `build_vocab()`.
        """
        vocab_size = len(self.vocab)
        logger.info("constructing a huffman tree from %i words" % vocab_size)
        # build the huffman tree
        heap = list(self.vocab.values())
        heapq.heapify(heap)
        for i in range(vocab_size - 1):
            min1, min2 = heapq.heappop(heap), heapq.heappop(heap)
            heapq.heappush(heap, Vocab(count=min1.count + min2.count, index=i + vocab_size, left=min1, right=min2))
        # recurse over the tree, assigning a binary code to each vocabulary word
        if heap:
            max_depth, stack = 0, [(heap[0], [], [])]
            while stack:
                node, codes, points = stack.pop()
                if node.index < vocab_size:
                    # leaf node => store its path from the root
                    node.code, node.point = codes, points
                    max_depth = max(len(codes), max_depth)
                else:
                    # inner node => continue recursion
                    points = np.array(list(points) + [node.index - vocab_size], dtype=int)
                    stack.append((node.left, np.array(list(codes) + [0], dtype=int), points))
                    stack.append((node.right, np.array(list(codes) + [1], dtype=int), points))
            logger.info("built huffman tree with maximum node depth %i" % max_depth)
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号