def overlay_hocr_pages(self, dpi, hocr_filenames, orig_pdf_filename):
    """Merge OCR text layers onto the pages of a PDF and write ``<name>_ocr.pdf``.

    A text-only PDF is generated for every ``(image, hocr)`` pair via
    ``self.overlay_hocr_page``. Each one is then merged onto the page of the
    original PDF at the same position (pages and text PDFs are paired with
    ``zip``, so pairing stops at whichever runs out first). The temporary
    text-only PDFs are deleted once every page has been merged successfully.

    Args:
        dpi: Passed through unchanged to ``self.overlay_hocr_page``.
        hocr_filenames: List of ``(img_filename, hocr_filename)`` pairs.
            NOTE: sorted IN PLACE, keyed on ``self.natural_keys`` of the
            image filename.
        orig_pdf_filename: Path of the PDF the text layers are merged onto.

    Returns:
        Path of the written PDF: the directory and base name of
        ``orig_pdf_filename`` with ``_ocr.pdf`` in place of the extension.
    """
    logging.debug("Going to overlay following files onto %s", orig_pdf_filename)

    # Sort the (image, hocr) pairs into natural-key order of the image name.
    hocr_filenames.sort(key=lambda pair: self.natural_keys(pair[0]))
    logging.debug(hocr_filenames)

    pdf_dir, pdf_basename = os.path.split(orig_pdf_filename)
    basename = os.path.splitext(pdf_basename)[0]
    pdf_filename = os.path.join(pdf_dir, "%s_ocr.pdf" % (basename))

    # Step 1: one temporary text-only PDF per (image, hocr) pair.
    text_pdf_filenames = []
    for img_filename, hocr_filename in hocr_filenames:
        text_pdf_filename = self.overlay_hocr_page(dpi, hocr_filename, img_filename)
        logging.info("Created temp OCR'ed pdf containing only the text as %s",
                     text_pdf_filename)
        text_pdf_filenames.append(text_pdf_filename)

    # Step 2: merge each text page onto the matching original page.
    # ``with`` guarantees both input files are closed even if a merge or
    # write raises part-way through.
    writer = PdfFileWriter()
    with open(orig_pdf_filename, 'rb') as orig:
        for orig_pg, text_pg_filename in zip(self.iter_pdf_page(orig), text_pdf_filenames):
            with open(text_pg_filename, 'rb') as text_file:
                # Builtin next() instead of the Python-2-only ``.next()`` method.
                text_pg = next(self.iter_pdf_page(text_file))

                raw_rotation = orig_pg.get('/Rotate', 0)
                orig_rotation_angle = int(raw_rotation)

                if orig_rotation_angle != 0:
                    logging.info("Original Rotation: %s", raw_rotation)
                    # The original author noted that rotateCounterClockwise()
                    # and mergeRotatedPage() did not work here, hence the
                    # custom merge helper.
                    # NOTE(review): both pivot coordinates use the page WIDTH;
                    # confirm this is intended rather than height for the second.
                    pivot = text_pg.mediaBox.getWidth() / 2
                    self.mergeRotateAroundPointPage(orig_pg, text_pg,
                                                    orig_rotation_angle,
                                                    pivot, pivot)
                else:
                    orig_pg.mergePage(text_pg)

                orig_pg.compressContentStreams()
                writer.addPage(orig_pg)

                # The whole document is rewritten after every page on purpose:
                # per the original author's note this flushes the page merge
                # so the text file can be closed (presumably page data is read
                # lazily from the open file -- confirm against the PDF library).
                with open(pdf_filename, 'wb') as f:
                    writer.write(f)

    # Step 3: remove the temporary text-only PDFs (only reached on success).
    for fn in text_pdf_filenames:
        os.remove(fn)

    logging.info("Created OCR'ed pdf as %s", pdf_filename)
    return pdf_filename
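# 评论列表 (comment list) -- web-page navigation text, not part of the code
# 文章目录 (table of contents) -- web-page navigation text, not part of the code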