def get_ocr_from_hocr(hocr_file, out_dir):
    """Extract OCR from the Hocr data

    Reads the hOCR file, passes its contents through the module-level
    ``htmlmatch`` and ``blanklines`` patterns, unescapes HTML entities and
    writes the result to ``OCR.txt`` inside ``out_dir``.

    If ``OCR.txt`` already exists it is left untouched, unless
    ``options.overwrite`` is set, in which case an existing regular file is
    removed and regenerated.

    Keyword arguments
    hocr_file -- The HOCR file
    out_dir -- Directory to write OCR file to.
    """
    output_file = os.path.join(out_dir, 'OCR.txt')

    # isfile() already implies the path exists.
    if os.path.isfile(output_file) and options.overwrite:
        os.remove(output_file)
        logger.debug("{} exists and we are deleting it.".format(output_file))

    # Anything still present at this path is kept as-is.
    if os.path.exists(output_file):
        return

    logger.debug("Generating OCR.")
    # Read and write with an explicit encoding so the result does not depend
    # on the platform's locale default.
    # NOTE(review): assumes the hOCR input is UTF-8 -- confirm against the
    # tool that produces it.
    with open(hocr_file, 'r', encoding='utf-8') as fpr:
        data = fpr.read()

    # NOTE(review): '\1' in a non-raw literal is the control character U+0001,
    # not a regex backreference. Whether that is intended depends on how
    # htmlmatch/blanklines are defined elsewhere in the module -- confirm
    # before changing it to r'\1'.
    data = html.unescape(blanklines.sub('', htmlmatch.sub('\1', data)))

    with open(output_file, 'w', encoding='utf-8') as fpw:
        fpw.write(data)
multipage2book.py 文件源码
python
阅读 25
收藏 0
点赞 0
评论 0
评论列表
文章目录