pypdfocr_pdf.py 文件源码

python
阅读 28 收藏 0 点赞 0 评论 0

项目:pdf_liberty 作者: mplitnikas 项目源码 文件源码
def overlay_hocr_pages(self, dpi, hocr_filenames, orig_pdf_filename):
    """Overlay OCR text pages onto the pages of a PDF and save the result.

    For every (image, hocr) pair a temporary text-only PDF is produced via
    ``self.overlay_hocr_page``. Each page of ``orig_pdf_filename`` is then
    merged with the matching text page and the merged document is written
    to ``<basename>_ocr.pdf`` in the same directory as the original.

    Args:
        dpi: Passed through unchanged to ``self.overlay_hocr_page``.
        hocr_filenames: List of ``(img_filename, hocr_filename)`` pairs.
            NOTE: the list is sorted in place (natural order of the image
            filename), so the caller's list is reordered.
        orig_pdf_filename: Path of the PDF that receives the text overlay.

    Returns:
        Path of the written ``*_ocr.pdf`` file.
    """
    logging.debug("Going to overlay following files onto %s", orig_pdf_filename)
    # Sort the hocr_filenames into natural keys!
    hocr_filenames.sort(key=lambda pair: self.natural_keys(pair[0]))
    logging.debug(hocr_filenames)

    pdf_dir, pdf_basename = os.path.split(orig_pdf_filename)
    basename = os.path.splitext(pdf_basename)[0]
    pdf_filename = os.path.join(pdf_dir, "%s_ocr.pdf" % (basename))

    # One temporary text-only PDF per (image, hocr) pair.
    text_pdf_filenames = []
    for img_filename, hocr_filename in hocr_filenames:
        text_pdf_filename = self.overlay_hocr_page(dpi, hocr_filename, img_filename)
        logging.info("Created temp OCR'ed pdf containing only the text as %s", text_pdf_filename)
        text_pdf_filenames.append(text_pdf_filename)

    writer = PdfFileWriter()
    # Context managers guarantee the input files are closed even when a
    # merge or write step raises.
    with open(orig_pdf_filename, 'rb') as orig:
        for orig_pg, text_pg_filename in zip(self.iter_pdf_page(orig), text_pdf_filenames):
            with open(text_pg_filename, 'rb') as text_file:
                # Only the first page of the text PDF is used. The builtin
                # next() works on both Python 2.6+ and Python 3 iterators.
                text_pg = next(self.iter_pdf_page(text_file))
                orig_rotation_angle = int(orig_pg.get('/Rotate', 0))

                if orig_rotation_angle != 0:
                    logging.info("Original Rotation: %s", orig_pg.get("/Rotate", 0))
                    # NOTE(review): both pivot coordinates are half the page
                    # *width*, exactly as in the original code -- confirm
                    # whether the second one was meant to be half the height.
                    half_width = text_pg.mediaBox.getWidth() / 2
                    self.mergeRotateAroundPointPage(
                        orig_pg, text_pg, orig_rotation_angle, half_width, half_width)
                    # The original author noted that rotateCounterClockwise()
                    # and mergeRotatedPage() did not work here.
                else:
                    orig_pg.mergePage(text_pg)
                orig_pg.compressContentStreams()
                writer.addPage(orig_pg)

                # Flush out this page merge so we can close the text_file.
                # The whole output is rewritten on every iteration on purpose.
                with open(pdf_filename, 'wb') as f:
                    writer.write(f)

    # Temporary text PDFs are only removed once everything succeeded.
    for fn in text_pdf_filenames:
        os.remove(fn)

    logging.info("Created OCR'ed pdf as %s", pdf_filename)
    return pdf_filename
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号