def render_pdf(self):
    """Assemble every entry of ``self.pages`` into one PDF and return its bytes.

    An entry whose ``extension`` equals ``"pdf"`` contributes its first page
    as-is. Any other entry is opened as an image, converted to RGB, saved
    as an in-memory PDF, and the first page of that PDF is appended.
    """
    writer = PdfFileWriter()
    for entry in self.pages:
        if entry.extension != "pdf":
            # Images have to be rendered into an in-memory PDF first.
            source = BytesIO()
            picture = Image.open(BytesIO(entry.binary))
            picture.convert("RGB").save(source, format="pdf")
        else:
            # Already a PDF: read it straight from the stored bytes.
            source = BytesIO(entry.binary)
        writer.addPage(PdfFileReader(source).getPage(0))
    result = BytesIO()
    writer.write(result)
    return result.getvalue()
# Example source code using the Python class PdfFileWriter()
def pdf_page_to_png(src_pdf, pagenum=0, resolution=72):
    '''
    Return the specified PDF page as a wand.image.Image in PNG format.

    :param PyPDF2.PdfFileReader src_pdf: PDF from which to take pages.
    :param int pagenum: Index passed to ``src_pdf.getPage``.
    :param int resolution: Resolution for resulting png in DPI.
    :return: The image returned by ``Image.convert("png")``.
    '''
    # Copy the requested page into a stand-alone, single-page PDF in memory.
    dst_pdf = PyPDF2.PdfFileWriter()
    dst_pdf.addPage(src_pdf.getPage(pagenum))

    pdf_bytes = io.BytesIO()
    dst_pdf.write(pdf_bytes)
    pdf_bytes.seek(0)

    # convert() returns a new image and leaves the one it is called on
    # untouched, so its result has to be returned. The intermediate
    # PDF-format image is released by the ``with`` block.
    with Image(file=pdf_bytes, resolution=resolution) as pdf_img:
        return pdf_img.convert("png")
# Example of converting exam.pdf located in the same directory
# convert('exam') # NOTE: the default resolution is 72 dpi
def outputpapertemplate(self, dest, listchar, output=None):
    """Render template pages for ``listchar`` and write or return the PDF.

    :param dest: Destination. A ``str`` is treated as a file path (missing
        parent directories are created); any other non-None value is
        passed to ``output.write`` as a writable stream; ``None`` means
        nothing is written and the writer is returned instead.
    :param listchar: Items still to be rendered. The loop runs while this
        is truthy.
    :param output: Existing writer to append to. A new
        ``PyPDF2.PdfFileWriter`` is created when this is ``None``.
    :return: The writer when ``dest`` is ``None``, otherwise ``None``.
    """
    # Identity checks: a custom __eq__ on the argument must not be able to
    # make a real object look like "no value given".
    if output is None:
        output = PyPDF2.PdfFileWriter()

    # NOTE(review): presumably outputtemplateonepage() consumes entries from
    # listchar, otherwise this loop would not terminate - confirm.
    while listchar:
        iopage = self.outputtemplateonepage(listchar)
        page = PyPDF2.PdfFileReader(iopage)
        output.addPage(page.getPage(0))

    if dest is None:
        return output

    if isinstance(dest, str):  # dest is a file path
        destdir = os.path.dirname(dest)
        if destdir != '' and not os.path.isdir(destdir):
            os.makedirs(destdir)
        with open(dest, "wb") as w:
            output.write(w)
    else:  # dest is a writable stream
        output.write(dest)
    return None
def pdf_splitter(self):
    """Split ``self.pdf_file`` into one PDF per page and convert each one.

    Page ``n`` (1-based) is written next to the source as
    ``<source without extension>_<n>.pdf`` and then passed to
    ``self.pdf_to_image``. ``self.total_pages`` is set to the page count.
    """
    # Local import so this block does not rely on a module-level ``os``.
    import os

    self.log.info('Called pdf_splitter')
    # Only the real extension is replaced. str.replace('.pdf', ...) would
    # also rewrite '.pdf' inside directory names, and would leave the name
    # unchanged (overwriting the source) for e.g. an upper-case '.PDF'.
    base_name = os.path.splitext(self.pdf_file)[0]
    # open() instead of the Python-2-only file(). The source stays open
    # while its pages are being copied.
    with open(self.pdf_file, 'rb') as source:
        input_pdf = PdfFileReader(source)
        self.total_pages = input_pdf.numPages
        for page_number in range(self.total_pages):
            output = PdfFileWriter()
            output.addPage(input_pdf.getPage(page_number))
            new_pdf = '%s_%s.pdf' % (base_name, page_number + 1)
            with open(new_pdf, 'wb') as file_stream:
                output.write(file_stream)
            # The page file is closed before the image conversion runs.
            self.pdf_to_image(new_pdf)
def pdf_page_to_png(src_pdf, pagenum=0, resolution=154):
    """
    Return the specified PDF page as a wand.image.Image in PNG format.

    :param PyPDF2.PdfFileReader src_pdf: PDF from which to take pages.
    :param int pagenum: Index passed to ``src_pdf.getPage``.
    :param int resolution: Resolution for resulting png in DPI.
    :return: The image returned by ``Image.convert("png")``.
    """
    check_dependencies(__optional_dependencies__['pdf'])
    # Import libraries within this function so as to avoid import-time dependence
    import PyPDF2
    from wand.image import Image  # TODO: When we start using this again, document which system-level libraries are required.

    # Copy the requested page into a stand-alone, single-page PDF in memory.
    dst_pdf = PyPDF2.PdfFileWriter()
    dst_pdf.addPage(src_pdf.getPage(pagenum))

    pdf_bytes = io.BytesIO()
    dst_pdf.write(pdf_bytes)
    pdf_bytes.seek(0)

    # convert() returns a new image and leaves the one it is called on
    # untouched, so its result has to be returned. The intermediate
    # PDF-format image is released by the ``with`` block.
    with Image(file=pdf_bytes, resolution=resolution) as pdf_img:
        return pdf_img.convert("png")
def get_pages(pdf_filename, from_, to):
    """Return pages ``from_``..``to`` of a PDF file as the bytes of a new PDF.

    :param pdf_filename: Path of the source PDF.
    :param from_: First page to include (1-based).
    :param to: Last page to include (1-based, inclusive). When it is
        smaller than ``from_``, only page ``from_`` is returned.
    :return: Bytes of a PDF containing the selected pages.
    """
    if to < from_:
        to = from_
    # The context manager closes the source even when a page lookup or the
    # write raises. Pages must be written while the source is still open.
    with open(pdf_filename, 'rb') as source:
        pdf = PdfFileReader(source)
        output = PdfFileWriter()
        for i in range(from_ - 1, to):
            output.addPage(pdf.getPage(i))
        stream = BytesIO()
        output.write(stream)
        return stream.getvalue()
def slice(self, pdf_file_path, page_actions, final_pdf_path):
    """
    Create new pdf from a slice of pages of a PDF
    :param pdf_file_path: path of the source PDF document, from which a new PDF file will be created.
    :param page_actions: list of tuples, each tuple containing the page number and the counterclockwise
                         rotation to be applied (a falsy rotation leaves the page untouched). The page
                         number is non-zero indexed (first is page 1, and so on).
    :param final_pdf_path: path the resulting PDF is written to.
    :return: None. Writes the resulting PDF file into the provided path.
    :raises Exception: when no actions are given or a page number falls outside the source document.
    """
    output = PdfFileWriter()
    # Materialise once so the actions can be validated and then applied.
    page_actions = list(page_actions)
    with open(pdf_file_path, 'rb') as file_input:
        reader = PdfFileReader(file_input, strict=False)

        # Check page actions correspond to valid input PDF pages
        input_num_pages = reader.getNumPages()
        # A list works on both Python 2 and 3 (zip(...)[0] fails on Python 3).
        actions_page_numbers = [action[0] for action in page_actions]
        if not actions_page_numbers:
            raise Exception(u"Invalid page actions: at least one page action is required.")

        if min(actions_page_numbers) < 1:
            raise Exception(u"Invalid page numbers range in actions: page numbers cannot be lower than 1.")

        # Pages are 1-based, so the largest valid number equals the page count.
        if max(actions_page_numbers) > input_num_pages:
            raise Exception(u"Invalid page numbers range in actions: page numbers cannot exceed the maximum numbers "
                            u"of pages of the source PDF document.")

        # Perform actual slicing + rotation
        for num_page, rotation in page_actions:
            page = reader.getPage(num_page - 1)
            if rotation:
                page = page.rotateCounterClockwise(rotation)
            output.addPage(page)

        # Written while the source file is still open, in case page
        # content is only read from it at write time.
        self._write_to_pdf(output, final_pdf_path)
def save(self, filename=None, overwrite=False):
    """
    Stamp every page and write the resulting PDF to disk.

    Args:
        filename (str): Target path. When falsy, ``"<self.begin>.pdf"``
            is used.
        overwrite (bool): Allow replacing a file that already exists.

    Returns:
        str: The path that was written.

    Raises:
        FileExistsError: The target exists and ``overwrite`` is false.
    """
    filename = filename or "{begin}.pdf".format(begin=self.begin)
    already_present = os.path.exists(filename)
    if already_present and not overwrite:
        message = "PDF file {} already exists and overwrite is disabled.".format(filename)
        raise FileExistsError(message)

    with open(filename, "wb") as handle:
        pdf_writer = PdfFileWriter()
        for numbered_page in self:
            # apply() is called on each page before it is added.
            numbered_page.apply()
            pdf_writer.addPage(numbered_page.page)
        pdf_writer.write(handle)
    return filename
def __init__(self, path):
    """Load the PDF at ``path`` into a fresh writer.

    All pages of the source are appended to ``self.writer``. Document-info
    entries are copied over only when their value is an instance of
    ``utils.string_type`` or ``utils.bytes_type``.
    """
    self.path = path
    # NOTE(review): the file handle is intentionally not closed here;
    # presumably the writer still reads from it later - confirm.
    source = PdfFileReader(open(path, "rb"))
    self.writer = PdfFileWriter()
    self.writer.appendPagesFromReader(source)

    accepted_types = (utils.string_type, utils.bytes_type)
    metadata = {}
    for key, value in source.getDocumentInfo().items():
        if isinstance(value, accepted_types):
            metadata[key] = value
    self.writer.addMetadata(metadata)
def generate_document(self, data):
    """Render one document from ``data`` and save it as a PDF file.

    Each class in ``self.fields`` is instantiated with the matching entry
    of ``data`` (same index) and rendered onto a canvas of size
    ``self.width`` x ``self.height``. When ``self.template_file`` is set,
    the rendered page is merged onto the first page of that template.
    The result is written to
    ``<self.output_dir>/<self.generate_filename(data)>.pdf``.

    :param data: Indexable sequence with one value per field.
    :raises IndexError: when ``data`` has fewer entries than there are fields.
    """
    # NOTE(review): the canvas output is PDF data; on Python 3 this buffer
    # presumably needs to be a bytes buffer - confirm what StringIO is here.
    packet = StringIO()

    template_stream = None
    if self.template_file is not None:
        template_stream = open(self.template_file, 'rb')
    try:
        template = None
        if template_stream is not None:
            template = PdfFileReader(template_stream)

        c = canvas.Canvas(packet, pagesize=(self.width, self.height))
        for i, field_cls in enumerate(self.fields):
            # TODO: Catch exception if there is less columns than fields
            field = field_cls(self, c, data[i])
            field.render()

        # Save canvas
        c.save()
        packet.seek(0)
        text = PdfFileReader(packet)
        output = PdfFileWriter()
        if template is not None:
            # Merge text with base
            page = template.getPage(0)
            page.mergePage(text.getPage(0))
        else:
            page = text.getPage(0)
        output.addPage(page)

        # Save file
        filename = "%s/%s.pdf" % (self.output_dir, self.generate_filename(data))
        with open(filename, 'wb') as outputStream:
            output.write(outputStream)
    finally:
        # The template is closed only after the output has been written.
        if template_stream is not None:
            template_stream.close()
def add_outlines(toc, filename, output):
    """Copy a PDF and attach an outline (bookmark) tree built from ``toc``.

    :param toc: Sequence of dicts. Each entry is read for ``"title"``,
        ``"page"`` and ``"node"``; ``build_outlines_btree`` is called on the
        whole sequence first.
    :param filename: Path of the source PDF.
    :param output: Path the resulting PDF is written to.
    """
    build_outlines_btree(toc)
    pdf_out = PdfFileWriter()
    # Maps accessor names on a toc node to the outline-item key that links
    # to the node the accessor returns.
    opt_keys = {"real_parent": "/Parent", "prev": "/Prev", "next": "/Next", "first": "/First", "last": "/Last"}

    # The source is kept open until the output has been written.
    with open(filename, 'rb') as input_file:
        pdf_in = PdfFileReader(input_file)
        for p in pdf_in.pages:
            pdf_out.addPage(p)

        toc_num = len(toc)
        # References are created for objects that are only added further
        # down: index 0 is the outline root, 1..toc_num the items.
        # NOTE(review): relies on _addObject handing out consecutive ids
        # starting at len(_objects) + 1 - confirm against the PyPDF2 version.
        idoix = len(pdf_out._objects) + 1
        idorefs = [PDF.IndirectObject(x + idoix, 0, pdf_out) for x in range(toc_num + 1)]

        ol = PDF.DictionaryObject()
        ol.update({
            PDF.NameObject("/Type"): PDF.NameObject("/Outlines"),
            PDF.NameObject("/First"): idorefs[1],
            PDF.NameObject("/Last"): idorefs[-1],
            PDF.NameObject("/Count"): PDF.NumberObject(toc_num)
        })

        olitems = []
        for t in toc:
            oli = PDF.DictionaryObject()
            oli.update({
                # NOTE(review): .decode implies the title is a byte string - confirm.
                PDF.NameObject("/Title"): PDF.TextStringObject(t["title"].decode("utf-8")),
                PDF.NameObject("/Dest"): make_dest(pdf_out, t["page"])
            })
            for k, v in opt_keys.items():
                n = getattr(t["node"], k)()
                if n is not None:
                    oli.update({
                        PDF.NameObject(v): idorefs[n.index]
                    })
            olitems.append(oli)

        # Root first, then the items, matching the order of idorefs.
        pdf_out._addObject(ol)
        for i in olitems:
            pdf_out._addObject(i)

        pdf_out._root_object.update({
            PDF.NameObject("/Outlines"): idorefs[0]
        })

        with open(output, "wb") as outputFile:
            pdf_out.write(outputFile)
def attach(self, binary):
    """Add an attachment given as raw bytes, one stored page per PDF page.

    The format is detected with ``puremagic.from_string``. A PDF with more
    than one page is split and every page is added separately; a PDF with
    at most one page and the listed image formats are added unchanged.
    Any other detected format is ignored.

    Returns the value of the last ``self.add_page`` call when it is truthy,
    otherwise ``None``.
    """
    image_extensions = [".png", ".jfif", ".gif", ".jpeg", ".jpg"]
    detected = puremagic.from_string(binary)
    added = None

    if detected == ".pdf":
        document = PdfFileReader(BytesIO(binary))
        if document.getNumPages() <= 1:
            # Nothing to split: store the original byte stream.
            added = self.add_page(binary)
        else:
            # Re-serialise each page as its own single-page PDF.
            for single_page in document.pages:
                writer = PdfFileWriter()
                writer.addPage(single_page)
                chunk = BytesIO()
                writer.write(chunk)
                added = self.add_page(chunk.getvalue())
    elif detected in image_extensions:
        added = self.add_page(binary)

    if added:
        return added
    return None
def BurnSudoOnPdf(path, numpage, diffarray):
    """Lay out images on one A4 page and save it as ``output<numpage>.pdf``.

    Images are placed in a grid of two columns (x = 55 / 345) and three
    rows (y = 590 / 320 / 50). Above each image the string returned by
    ``getStrDiff(diffarray[i])`` is drawn. A header, a footer and the page
    number complete the page.

    :param path: Sequence of images accepted by ``Canvas.drawImage``.
    :param numpage: Page number; printed on the page and used in the
        output file name.
    :param diffarray: Indexable with one entry per image, passed to
        ``getStrDiff``.
    """
    pdf = PdfFileWriter()

    # Using ReportLab Canvas to insert image into PDF
    imgTemp = BytesIO()
    imgDoc = canvas.Canvas(imgTemp, pagesize=A4)

    # Register every font once, before anything is drawn.
    pdfmetrics.registerFont(TTFont('VeraIt', 'VeraIt.ttf'))
    pdfmetrics.registerFont(TTFont('Vera', 'Vera.ttf'))
    pdfmetrics.registerFont(TTFont('VeraBd', 'VeraBd.ttf'))

    # Draw image on Canvas and save PDF in buffer
    for i, image in enumerate(path):
        # Odd positions go to the right column, even ones to the left.
        x = 345 if (i + 1) % 2 == 0 else 55
        if i < 2:
            y = 590
        elif i < 4:
            y = 320
        else:
            y = 50
        imgDoc.drawImage(image, x, y, 200, 200)
        imgDoc.setFont('VeraIt', 9)
        imgDoc.drawString(x + 2, y + 203, getStrDiff(diffarray[i]))

    imgDoc.setFont('Vera', 13)
    imgDoc.drawString(30, 820, "BurnYourPc Organization/")
    imgDoc.setFont('VeraBd', 9)
    imgDoc.drawString(197, 820, "Sudoku Project")
    imgDoc.setFont('VeraIt', 8)
    imgDoc.drawString(430, 20, "By PantelisPanka, nikfot, TolisChal")
    imgDoc.setFont('Vera', 8)
    imgDoc.drawString(550, 820, str(numpage))
    imgDoc.save()

    # Wrap the rendered canvas page in the output PDF. The context manager
    # guarantees the file is flushed and closed.
    pdf.addPage(PdfFileReader(BytesIO(imgTemp.getvalue())).getPage(0))
    with open("output" + str(numpage) + ".pdf", "wb") as out_file:
        pdf.write(out_file)
def split_pdf(fp, pagenos=()):
    """Return a ``PdfFileWriter`` holding the selected pages of a PDF.

    :param fp: Source accepted by ``PdfFileReader``.
    :param pagenos: Iterable of page indexes passed to ``getPage``.
        Duplicates are dropped and the pages are added in ascending order.
    :return: The writer containing the selected pages (not yet written).
    """
    # De-duplicate deterministically: iterating a bare set has no
    # guaranteed order, so the page order could otherwise vary.
    unique_pagenos = sorted(set(pagenos))
    # Create the pdf reader
    inputpdf = PdfFileReader(fp)
    # Create the pdf writer
    output = PdfFileWriter()
    # Add each requested page to the writer
    for i in unique_pagenos:
        output.addPage(inputpdf.getPage(i))
    return output
# Gets all top level sections from the PDF
def postprocess_pdf(input_pdf, qr_data, qr_x=545, qr_y=20, version=None):
    """ PDF post-processor. Merges a QR code onto each PDF page.

    :param input_pdf: PDF byte content
    :param qr_data: QR code data
    :param qr_x: X position of QR image
    :param qr_y: Y position of QR image
    :param version: passed through to ``pyqrcode.create``
    :return: byte content of the resulting PDF
    """
    # Render the QR code as EPS text, then convert it into a one-page PDF.
    qr = pyqrcode.create(qr_data, version=version)
    eps = StringIO()
    qr.eps(eps)

    qr_pdf = BytesIO()
    # The context manager releases the image once the PDF has been saved.
    with Image(file=BytesIO(bytes(eps.getvalue(), 'utf-8'))) as qr_img:
        qr_img.format = 'pdf'
        qr_img.save(qr_pdf)

    qr_page = PdfFileReader(qr_pdf).getPage(0)

    output_writer = PdfFileWriter()
    for page in PdfFileReader(BytesIO(input_pdf)).pages:
        page.mergeTranslatedPage(qr_page, qr_x, qr_y)
        output_writer.addPage(page)

    output_pdf = BytesIO()
    output_writer.write(output_pdf)
    return output_pdf.getvalue()
def merge_pdfs(self, final_pdf_path, actions, append_blank_page=True):
    """
    Merge several PDF files into a single PDF.
    :param final_pdf_path: file path to save pdf
    :param actions: list of tuples, each tuple containing a PDF file path and the degrees of counterclockwise
                    rotation to perform on the PDF document. Entries with an empty path, or whose path equals
                    final_pdf_path, are skipped.
    :param append_blank_page: append a blank page after each merged document if True.
    :return: None. Writes the resulting PDF file into the provided path.
    :raises DocumentClipperError: when a source document cannot be opened or parsed.
    """
    output = PdfFileWriter()
    docs_to_close = []

    try:
        for pdf_file_path, rotation in actions:
            if not pdf_file_path or pdf_file_path == final_pdf_path:
                continue

            logging.info(u"Parse '%s'" % pdf_file_path)

            try:
                document_file = open(pdf_file_path, 'rb')
                # Registered right away so the handle is closed even when
                # parsing this document fails.
                docs_to_close.append(document_file)
                document = PdfFileReader(document_file, strict=False)
                num_pages = document.getNumPages()
            except Exception as exc:
                logging.exception("Error merging pdf %s: %s" % (pdf_file_path, str(exc)))
                raise DocumentClipperError

            # Rotation must be performed per page, not per document
            for num_page in range(num_pages):
                page = document.getPage(num_page)
                page = page.rotateCounterClockwise(rotation)
                output.addPage(page)

            if append_blank_page:
                output.addBlankPage()

        # The sources stay open until the merged document has been written.
        self._write_to_pdf(output, final_pdf_path)
    finally:
        # Runs on success and on failure, so no source handle is leaked.
        self._close_files(docs_to_close)
def main():
    """Generate a certificate PDF by printing text over ``template.pdf``.

    The text rendered from ``TEXT_PATH`` with the instructor, course,
    student and hours is laid out as justified paragraphs on a landscape
    A4 page, merged onto the first page of ``template.pdf`` and saved to
    ``SAVE_AS``.
    """
    context = {
        'instructor': INSTRUCTOR,
        'course': COURSE,
        'student': STUDENT,
        'hours': HOURS,
    }

    styles = getSampleStyleSheet()
    styles.add(
        ParagraphStyle(
            name='Justify', alignment=TA_JUSTIFY, fontSize=16, leading=22
        )
    )

    elements = []
    certificate_txt = render(TEXT_PATH, context)
    paragraphs = certificate_txt.split(os.linesep)
    elements.append(Spacer(1, 50))
    for p in paragraphs:
        elements.append(Paragraph(p, styles['Justify']))
        elements.append(Spacer(1, 16))

    # Context managers make sure the temporary file, the template and the
    # output are closed (and the output flushed) before success is reported.
    with tempfile.NamedTemporaryFile() as certificate:
        doc = SimpleDocTemplate(certificate, topMargin=3 * cm, bottomMargin=0)
        doc.pagesize = landscape(A4)
        doc.build(elements)

        output = PdfFileWriter()
        with open('template.pdf', 'rb') as template_file:
            input1 = PdfFileReader(template_file)
            page1 = input1.getPage(0)

            content = PdfFileReader(certificate)
            page1.mergePage(content.getPage(0))
            output.addPage(page1)

            # Written while both sources are still open.
            with open(SAVE_AS, 'wb') as save_file:
                output.write(save_file)

    print('Certificado gerado com sucesso em %s' % SAVE_AS)
def read(self, path):
    """Load a PDF or image file and append its pages to ``self.pages``.

    A PDF is split page by page; each page is written to ``temp.pdf``,
    rasterised at density 300 to ``temp.png`` and read back in grayscale
    with OpenCV. An image whose MIME type is in ``acceptable_mime`` is
    handled as a single page.

    :param path: Path of the file to read.
    :raises FileNotAcceptedException: when the MIME type is neither PDF nor
        an accepted image type.
    """
    self.filename = os.path.basename(path)
    self.file_basename, self.file_extension = os.path.splitext(self.filename)
    self.path = path
    self.mime_type = mimetypes.guess_type(path)
    self.file_basepath = os.path.dirname(path)

    # If the file is a pdf, split the pdf and prep the pages.
    if self.mime_type[0] == "application/pdf":
        tmp_pdf = 'temp.pdf'
        tmp_png = 'temp.png'
        # The source is closed once every page has been processed, or as
        # soon as an error occurs.
        with open(self.path, 'rb') as file_temp:
            pdf_reader = pyPdf.PdfFileReader(file_temp)
            self.num_pages = pdf_reader.numPages
            try:
                # range() works on Python 2 and 3 (xrange is Python 2 only).
                for i in range(self.num_pages):
                    try:
                        output = pyPdf.PdfFileWriter()
                        output.addPage(pdf_reader.getPage(i))
                        with open(tmp_pdf, 'wb') as f:
                            output.write(f)
                        im = PythonMagick.Image()
                        im.density("300")
                        im.read(tmp_pdf)
                        im.write(tmp_png)

                        orig_im = cv2.imread(tmp_png, 0)
                        page = Page(orig_im, i, self.lang)
                        self.pages.append(page)
                    finally:
                        # Remove the scratch files even when a step failed.
                        for leftover in (tmp_pdf, tmp_png):
                            if os.path.exists(leftover):
                                os.remove(leftover)
                self.prepared = True
            except Exception as e:
                # Remember the failure on the instance, then re-raise.
                self.error = e
                raise

    # If the file is an image, think of it as a 1-page pdf.
    elif self.mime_type[0] in acceptable_mime:
        self.num_pages = 1
        im = PythonMagick.Image()
        im.density("300")
        im.read(path)
        temp_path = os.path.normpath(os.path.join(
            self.file_basepath, self.file_basename + '_temp.png'
        ))
        im.write(temp_path)
        orig_im = cv2.imread(temp_path, 0)
        os.remove(temp_path)
        # NOTE(review): unlike the PDF branch, no language is passed here - confirm.
        page = Page(orig_im, 0)
        self.pages.append(page)

    # Otherwise, out of luck.
    else:
        print(self.mime_type[0])
        raise FileNotAcceptedException
def overlay_hocr_pages(self, dpi, hocr_filenames, orig_pdf_filename):
    """Overlay OCR text pages onto the original PDF and save ``<name>_ocr.pdf``.

    :param dpi: Passed through to ``self.overlay_hocr_page``.
    :param hocr_filenames: List of ``(image filename, hocr filename)``
        pairs. It is sorted in place by the natural key of the image
        filename.
    :param orig_pdf_filename: Path of the original PDF.
    :return: Path of the written PDF, located next to the original.
    """
    logging.debug("Going to overlay following files onto %s" % orig_pdf_filename)
    # Sort the hocr_filenames into natural keys!
    hocr_filenames.sort(key=lambda x: self.natural_keys(x[0]))
    logging.debug(hocr_filenames)

    pdf_dir, pdf_basename = os.path.split(orig_pdf_filename)
    basename = os.path.splitext(pdf_basename)[0]
    pdf_filename = os.path.join(pdf_dir, "%s_ocr.pdf" % (basename))

    text_pdf_filenames = []
    for img_filename, hocr_filename in hocr_filenames:
        text_pdf_filename = self.overlay_hocr_page(dpi, hocr_filename, img_filename)
        logging.info("Created temp OCR'ed pdf containing only the text as %s" % (text_pdf_filename))
        text_pdf_filenames.append(text_pdf_filename)

    writer = PdfFileWriter()
    # Context managers close the handles even when a merge step raises.
    with open(orig_pdf_filename, 'rb') as orig:
        for orig_pg, text_pg_filename in zip(self.iter_pdf_page(orig), text_pdf_filenames):
            with open(text_pg_filename, 'rb') as text_file:
                # next() works on Python 2 and 3 (.next() is Python 2 only).
                text_pg = next(self.iter_pdf_page(text_file))
                orig_rotation_angle = int(orig_pg.get('/Rotate', 0))

                if orig_rotation_angle != 0:
                    logging.info("Original Rotation: %s" % orig_pg.get("/Rotate", 0))
                    # NOTE(review): both pivot coordinates use the page width - confirm this is intended.
                    self.mergeRotateAroundPointPage(orig_pg, text_pg, orig_rotation_angle, text_pg.mediaBox.getWidth()/2, text_pg.mediaBox.getWidth()/2)
                else:
                    orig_pg.mergePage(text_pg)
                orig_pg.compressContentStreams()

                writer.addPage(orig_pg)
                with open(pdf_filename, 'wb') as f:
                    # Flush out this page merge so we can close the text_file
                    writer.write(f)

    for fn in text_pdf_filenames:
        os.remove(fn)

    logging.info("Created OCR'ed pdf as %s" % (pdf_filename))
    return pdf_filename
def overlay_hocr_pages(self, dpi, hocr_filenames, orig_pdf_filename):
    """Overlay OCR text pages onto the original PDF and save ``<name>_ocr.pdf``.

    :param dpi: Passed through to ``self.overlay_hocr_page``.
    :param hocr_filenames: List of ``(image filename, hocr filename)``
        pairs. It is sorted in place by the natural key of the image
        filename.
    :param orig_pdf_filename: Path of the original PDF.
    :return: Path of the written PDF, located next to the original.
    """
    logging.debug("Going to overlay following files onto %s" % orig_pdf_filename)
    # Sort the hocr_filenames into natural keys!
    hocr_filenames.sort(key=lambda x: self.natural_keys(x[0]))
    logging.debug(hocr_filenames)

    pdf_dir, pdf_basename = os.path.split(orig_pdf_filename)
    basename = os.path.splitext(pdf_basename)[0]
    pdf_filename = os.path.join(pdf_dir, "%s_ocr.pdf" % (basename))

    text_pdf_filenames = []
    for img_filename, hocr_filename in hocr_filenames:
        text_pdf_filename = self.overlay_hocr_page(dpi, hocr_filename, img_filename)
        logging.info("Created temp OCR'ed pdf containing only the text as %s" % (text_pdf_filename))
        text_pdf_filenames.append(text_pdf_filename)

    writer = PdfFileWriter()
    # Context managers close the handles even when a merge step raises.
    with open(orig_pdf_filename, 'rb') as orig:
        for orig_pg, text_pg_filename in zip(self.iter_pdf_page(orig), text_pdf_filenames):
            with open(text_pg_filename, 'rb') as text_file:
                # next() works on Python 2 and 3 (.next() is Python 2 only).
                text_pg = next(self.iter_pdf_page(text_file))
                orig_rotation_angle = int(orig_pg.get('/Rotate', 0))

                if orig_rotation_angle != 0:
                    logging.info("Original Rotation: %s" % orig_pg.get("/Rotate", 0))
                    # NOTE(review): both pivot coordinates use the page width - confirm this is intended.
                    self.mergeRotateAroundPointPage(orig_pg, text_pg, orig_rotation_angle, text_pg.mediaBox.getWidth()/2, text_pg.mediaBox.getWidth()/2)
                else:
                    orig_pg.mergePage(text_pg)
                orig_pg.compressContentStreams()

                writer.addPage(orig_pg)
                with open(pdf_filename, 'wb') as f:
                    # Flush out this page merge so we can close the text_file
                    writer.write(f)

    for fn in text_pdf_filenames:
        os.remove(fn)

    logging.info("Created OCR'ed pdf as %s" % (pdf_filename))
    return pdf_filename
def overlay_hocr_pages(self, dpi, hocr_filenames, orig_pdf_filename):
    """Overlay OCR text pages onto the original PDF and save ``<name>_ocr.pdf``.

    :param dpi: Passed through to ``self.overlay_hocr_page``.
    :param hocr_filenames: List of ``(image filename, hocr filename)``
        pairs. It is sorted in place by the natural key of the image
        filename.
    :param orig_pdf_filename: Path of the original PDF.
    :return: Path of the written PDF, located next to the original.
    """
    logging.debug("Going to overlay following files onto %s" % orig_pdf_filename)
    # Sort the hocr_filenames into natural keys!
    hocr_filenames.sort(key=lambda x: self.natural_keys(x[0]))
    logging.debug(hocr_filenames)

    pdf_dir, pdf_basename = os.path.split(orig_pdf_filename)
    basename = os.path.splitext(pdf_basename)[0]
    pdf_filename = os.path.join(pdf_dir, "%s_ocr.pdf" % (basename))

    text_pdf_filenames = []
    for img_filename, hocr_filename in hocr_filenames:
        text_pdf_filename = self.overlay_hocr_page(dpi, hocr_filename, img_filename)
        logging.info("Created temp OCR'ed pdf containing only the text as %s" % (text_pdf_filename))
        text_pdf_filenames.append(text_pdf_filename)

    writer = PdfFileWriter()
    # Context managers close the handles even when a merge step raises.
    with open(orig_pdf_filename, 'rb') as orig:
        for orig_pg, text_pg_filename in zip(self.iter_pdf_page(orig), text_pdf_filenames):
            with open(text_pg_filename, 'rb') as text_file:
                # next() works on Python 2 and 3 (.next() is Python 2 only).
                text_pg = next(self.iter_pdf_page(text_file))
                orig_rotation_angle = int(orig_pg.get('/Rotate', 0))

                if orig_rotation_angle != 0:
                    logging.info("Original Rotation: %s" % orig_pg.get("/Rotate", 0))
                    # NOTE(review): both pivot coordinates use the page width - confirm this is intended.
                    self.mergeRotateAroundPointPage(orig_pg, text_pg, orig_rotation_angle, text_pg.mediaBox.getWidth()/2, text_pg.mediaBox.getWidth()/2)
                else:
                    orig_pg.mergePage(text_pg)
                orig_pg.compressContentStreams()

                writer.addPage(orig_pg)
                with open(pdf_filename, 'wb') as f:
                    # Flush out this page merge so we can close the text_file
                    writer.write(f)

    for fn in text_pdf_filenames:
        os.remove(fn)

    logging.info("Created OCR'ed pdf as %s" % (pdf_filename))
    return pdf_filename
def overlay_hocr_pages(self, dpi, hocr_filenames, orig_pdf_filename):
    """Overlay OCR text pages onto the original PDF and save ``<name>_ocr.pdf``.

    :param dpi: Passed through to ``self.overlay_hocr_page``.
    :param hocr_filenames: List of ``(image filename, hocr filename)``
        pairs. It is sorted in place by the natural key of the image
        filename.
    :param orig_pdf_filename: Path of the original PDF.
    :return: Path of the written PDF, located next to the original.
    """
    logging.debug("Going to overlay following files onto %s" % orig_pdf_filename)
    # Sort the hocr_filenames into natural keys!
    hocr_filenames.sort(key=lambda x: self.natural_keys(x[0]))
    logging.debug(hocr_filenames)

    pdf_dir, pdf_basename = os.path.split(orig_pdf_filename)
    basename = os.path.splitext(pdf_basename)[0]
    pdf_filename = os.path.join(pdf_dir, "%s_ocr.pdf" % (basename))

    text_pdf_filenames = []
    for img_filename, hocr_filename in hocr_filenames:
        text_pdf_filename = self.overlay_hocr_page(dpi, hocr_filename, img_filename)
        logging.info("Created temp OCR'ed pdf containing only the text as %s" % (text_pdf_filename))
        text_pdf_filenames.append(text_pdf_filename)

    writer = PdfFileWriter()
    # Context managers close the handles even when a merge step raises.
    with open(orig_pdf_filename, 'rb') as orig:
        for orig_pg, text_pg_filename in zip(self.iter_pdf_page(orig), text_pdf_filenames):
            with open(text_pg_filename, 'rb') as text_file:
                # next() works on Python 2 and 3 (.next() is Python 2 only).
                text_pg = next(self.iter_pdf_page(text_file))
                orig_rotation_angle = int(orig_pg.get('/Rotate', 0))

                if orig_rotation_angle != 0:
                    logging.info("Original Rotation: %s" % orig_pg.get("/Rotate", 0))
                    # NOTE(review): both pivot coordinates use the page width - confirm this is intended.
                    self.mergeRotateAroundPointPage(orig_pg, text_pg, orig_rotation_angle, text_pg.mediaBox.getWidth()/2, text_pg.mediaBox.getWidth()/2)
                else:
                    orig_pg.mergePage(text_pg)
                orig_pg.compressContentStreams()

                writer.addPage(orig_pg)
                with open(pdf_filename, 'wb') as f:
                    # Flush out this page merge so we can close the text_file
                    writer.write(f)

    for fn in text_pdf_filenames:
        os.remove(fn)

    logging.info("Created OCR'ed pdf as %s" % (pdf_filename))
    return pdf_filename
def overlay_hocr_pages(self, dpi, hocr_filenames, orig_pdf_filename):
    """Overlay OCR text pages onto the original PDF and save ``<name>_ocr.pdf``.

    The per-page text PDFs are first concatenated into ``<name>_text.pdf``,
    which is then merged page by page with the original document. All
    temporary files are removed afterwards.

    :param dpi: Passed through to ``self.overlay_hocr_page``.
    :param hocr_filenames: List of ``(image filename, hocr filename)``
        pairs. It is sorted in place by the natural key of the image
        filename.
    :param orig_pdf_filename: Path of the original PDF.
    :return: Path of the written PDF, located next to the original.
    """
    logging.debug("Going to overlay following files onto %s" % orig_pdf_filename)
    # Sort the hocr_filenames into natural keys!
    hocr_filenames.sort(key=lambda x: self.natural_keys(x[0]))
    logging.debug(hocr_filenames)

    pdf_dir, pdf_basename = os.path.split(orig_pdf_filename)
    basename = os.path.splitext(pdf_basename)[0]
    pdf_filename = os.path.join(pdf_dir, "%s_ocr.pdf" % (basename))

    text_pdf_filenames = []
    for img_filename, hocr_filename in hocr_filenames:
        text_pdf_filename = self.overlay_hocr_page(dpi, hocr_filename, img_filename)
        logging.info("Created temp OCR'ed pdf containing only the text as %s" % (text_pdf_filename))
        text_pdf_filenames.append(text_pdf_filename)

    # Now, concatenate this text_pdfs into one single file.
    # This is a hack to save memory/running time when we have to do the actual merge with a writer
    all_text_filename = os.path.join(pdf_dir, "%s_text.pdf" % (basename))
    merger = PdfFileMerger()
    text_handles = []
    try:
        for text_pdf_filename in text_pdf_filenames:
            # open() instead of the Python-2-only file(); the handles are
            # tracked so they can be closed once the merge is written.
            handle = open(text_pdf_filename, 'rb')
            text_handles.append(handle)
            merger.append(PdfFileReader(handle))
        merger.write(all_text_filename)
        merger.close()
    finally:
        # Handles left open here would keep the temp files locked on
        # Windows and make the removal below fail.
        for handle in text_handles:
            handle.close()
    del merger

    writer = PdfFileWriter()
    with open(orig_pdf_filename, 'rb') as orig:
        with open(all_text_filename, 'rb') as text_file:
            for orig_pg, text_pg in zip(self.iter_pdf_page(orig), self.iter_pdf_page(text_file)):
                orig_pg = self._get_merged_single_page(orig_pg, text_pg)
                writer.addPage(orig_pg)

            # Written while both source files are still open.
            with open(pdf_filename, 'wb') as f:
                writer.write(f)

    # Retry is kept as a safety net in case the file is still locked.
    for fn in text_pdf_filenames:
        Retry(partial(os.remove, fn), tries=10, pause=3).call_with_retry()

    os.remove(all_text_filename)
    logging.info("Created OCR'ed pdf as %s" % (pdf_filename))
    return pdf_filename