def _create_binary_tree(self):
"""
Create a binary Huffman tree using stored vocabulary word counts. Frequent words
will have shorter binary codes. Called internally from `build_vocab()`.
"""
vocab_size = len(self.vocab)
logger.info("constructing a huffman tree from %i words" % vocab_size)
# build the huffman tree
heap = list(self.vocab.values())
heapq.heapify(heap)
for i in range(vocab_size - 1):
min1, min2 = heapq.heappop(heap), heapq.heappop(heap)
heapq.heappush(heap, Vocab(count=min1.count + min2.count, index=i + vocab_size, left=min1, right=min2))
# recurse over the tree, assigning a binary code to each vocabulary word
if heap:
max_depth, stack = 0, [(heap[0], [], [])]
while stack:
node, codes, points = stack.pop()
if node.index < vocab_size:
# leaf node => store its path from the root
node.code, node.point = codes, points
max_depth = max(len(codes), max_depth)
else:
# inner node => continue recursion
points = np.array(list(points) + [node.index - vocab_size], dtype=int)
stack.append((node.left, np.array(list(codes) + [0], dtype=int), points))
stack.append((node.right, np.array(list(codes) + [1], dtype=int), points))
logger.info("built huffman tree with maximum node depth %i" % max_depth)
评论列表
文章目录