def layout_seg(image, page_text):
    """Segment a page image into regions, one text line at a time.

    ``page_text`` is split into lines on ``\\r\\n`` when it ends with that
    sequence, otherwise on ``\\n``. The first line, stripped, is used as
    ``page_bar_no``; every following line is paired, in order, with the next
    entry of the line regions returned by ``get_line_region_lst``.

    * A non-blank line is handed to ``region_seg`` together with its line
      region and ``region_lst`` (presumably ``region_seg`` appends the
      regions it finds to that list -- confirm against its definition).
    * A blank line consumes no line region. It is recorded as a single
      full-height region with empty text that spans the gap between the
      neighbouring line regions.

    Args:
        image: Page image; passed through to ``binarisation`` and
            ``region_seg``.
        page_text: Text of the page, first line being the page identifier.

    Returns:
        The list of region dicts collected for the page.

    Raises:
        IndexError: If there are more non-blank text lines than detected
            line regions.
    """
    separator = u'\r\n' if page_text.endswith(u'\r\n') else u'\n'
    texts = page_text.rstrip(separator).split(separator)

    bw = binarisation(image)
    image_height, image_width = bw.shape
    # Invert before labelling (assumes ``binarisation`` returns a 0/1 array
    # -- TODO confirm).
    bw = (1 - bw).astype('ubyte')
    label_image = label(bw, connectivity=2)
    line_region_lst = get_line_region_lst(label_image)
    line_count = len(line_region_lst)

    region_lst = []
    line_idx = 0  # index of the next unconsumed entry in line_region_lst
    page_bar_no = texts[0].strip()
    for line_no, raw_text in enumerate(texts[1:], 1):
        text = raw_text.rstrip()
        if text:
            region_seg(image, bw, image_height, page_bar_no, line_no,
                       line_region_lst[line_idx], text, region_lst)
            line_idx += 1
            continue

        # Blank line: it occupies the gap between the previously consumed
        # line region (its ``left`` edge bounds the gap on the right) and
        # the next one (its ``right`` edge bounds the gap on the left).
        # When a neighbour is missing, fall back to the page edge instead of
        # letting index -1 wrap around to the last region or indexing past
        # the end of the list.
        if line_idx < line_count:
            left = line_region_lst[line_idx].right
        else:
            left = 0
        if line_idx > 0:
            right = line_region_lst[line_idx - 1].left
        else:
            right = image_width

        region_lst.append({
            u'text': text,
            u'left': left,
            u'right': right,
            u'top': 0,
            u'bottom': image_height,
            u'line_no': line_no,
            u'region_no': 1,
            u'page_bar_no': page_bar_no,
        })
    return region_lst
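# 评论列表 (comment list) -- web-page navigation residue, not part of the code
# 文章目录 (table of contents) -- web-page navigation residue, not part of the code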