def create_binary_tree(self):
    """
    Create a binary Huffman tree using stored vocabulary word counts. Frequent words
    will have shorter binary codes. Called internally from `build_vocab()`.

    The values of `self.vocab` become the leaves of the tree. Every leaf gets
    two attributes assigned:

    * `code`  -- the branch taken at each step from the root down to the leaf
      (0 for a left child, 1 for a right child);
    * `point` -- for each step of that path, the index of the inner node that
      was passed, counted from 0 (inner node index minus vocabulary size).

    Returns nothing. With an empty vocabulary no tree is built and only the
    first log message is emitted.
    """
    # The vocabulary is not modified below, so its size is read once.
    vocab_size = len(self.vocab)
    # Pass arguments to the logger instead of pre-formatting, so the message is
    # only rendered when the INFO level is actually enabled.
    logger.info("constructing a huffman tree from %i words", vocab_size)

    # build the huffman tree: repeatedly pop the two smallest heap entries and
    # push a new inner node that owns them (ordering comes from Vocab's own
    # comparison -- presumably by count; confirm against the Vocab class)
    heap = list(itervalues(self.vocab))
    heapq.heapify(heap)
    for i in xrange(vocab_size - 1):
        min1, min2 = heapq.heappop(heap), heapq.heappop(heap)
        # inner nodes are numbered from vocab_size upwards, which is what lets
        # the traversal below tell them apart from leaves
        # NOTE(review): this assumes every vocabulary entry has index < vocab_size
        heapq.heappush(
            heap,
            Vocab(count=min1.count + min2.count, index=i + vocab_size, left=min1, right=min2)
        )

    if not heap:
        # empty vocabulary => nothing to encode
        return

    # walk the tree with an explicit stack (no recursion), assigning a binary
    # code to each vocabulary word
    max_depth, stack = 0, [(heap[0], [], [])]
    while stack:
        node, codes, points = stack.pop()
        if node.index < vocab_size:
            # leaf node => store its path from the root
            # (with a single-word vocabulary the root itself is the leaf and
            # keeps the empty lists the stack was seeded with)
            node.code, node.point = codes, points
            max_depth = max(len(codes), max_depth)
        else:
            # inner node => continue the walk; both children share the same
            # extended `points`, only the last code bit differs
            points = array(list(points) + [node.index - vocab_size], dtype=uint32)
            stack.append((node.left, array(list(codes) + [0], dtype=uint8), points))
            stack.append((node.right, array(list(codes) + [1], dtype=uint8), points))

    logger.info("built huffman tree with maximum node depth %i", max_depth)
# Comment list
# Table of contents