def get_pages(pdf_filename, from_, to):
    """Return the bytes of a new PDF made of pages ``from_`` to ``to``.

    Page numbers are 1-based and the range is inclusive. When ``to`` is
    smaller than ``from_`` only the single page ``from_`` is copied.

    :param pdf_filename: path of the source PDF document.
    :param from_: number of the first page to copy.
    :param to: number of the last page to copy.
    :return: the content of the in-memory output buffer after the writer
        has been serialized into it.
    """
    if to < from_:
        to = from_
    # The context manager closes the source file even if reading a page or
    # writing the output raises. The output is written while the source is
    # still open, matching the original statement order.
    with open(pdf_filename, 'rb') as file:
        pdf = PdfFileReader(file)
        output = PdfFileWriter()
        for i in range(from_ - 1, to):
            output.addPage(pdf.getPage(i))
        stream = BytesIO()
        output.write(stream)
    return stream.getvalue()
# python类PdfFileReader()的实例源码 (example source code for the Python class PdfFileReader())
def pdfMetaData(file_path, save=True):
    '''Print the metadata of a PDF document and optionally save it.

    The report combines the PDF document info entries with the file system
    timestamps and owner id returned by ``os.stat``.

    Arguments:
        file_path: path of the PDF document to inspect.
        save: when true (the default) the report is also passed to
            ``saveResult`` under the name ``<file name>.txt``.

    The process exits through ``sys.exit`` when the document cannot be
    decrypted with an empty password or when its document info cannot be
    iterated.
    '''
    # The context manager closes the document even when sys.exit() is called.
    with open(file_path, "rb") as pdf_file:
        pdf_doc = PdfFileReader(pdf_file)
        if pdf_doc.isEncrypted:
            # Only the decrypt call is guarded. The former bare ``except:``
            # also caught the SystemExit raised for a wrong password, so the
            # "unsupported algorithm" message was always the one reported.
            try:
                decrypt_result = pdf_doc.decrypt("")
            except Exception:
                sys.exit("target pdf document is encrypted with an unsupported algorithm... exiting...")
            if decrypt_result != 1:
                sys.exit("target pdf document is encrypted... exiting...")
        doc_info = pdf_doc.getDocumentInfo()
        stats = os.stat(file_path)
        now = dt.now()
        file_name = getFileName(file_path)
        metadata = "Time: %d/%d/%d %d : %d : %d. Found the following metadata for file %s:\n\n" % (
            now.year, now.month, now.day, now.hour, now.minute, now.second, file_name[:-4])
        try:
            for md in doc_info:
                # Drop the first character of the key for display.
                key = str(md[1:])
                metadata += key + " : " + pretifyPyPDF2Time(key, str(doc_info[md])) + "\n"
        except TypeError:
            sys.exit("Couldn't read document info! Make sure target is a valid pdf document...")
    metadata += "Last metadata mod Date: %s\nLast Mod Date: %s\nLast Access Date: %s\nOwner User ID: %s" % (
        dt.fromtimestamp(stats.st_ctime),
        dt.fromtimestamp(stats.st_mtime),
        dt.fromtimestamp(stats.st_atime),
        stats.st_uid)
    try:
        print(metadata)
    except UnicodeEncodeError:
        print("Console encoding can't decode the result. Enter chcp 65001 in the console and rerun the script.")
    if save:
        file_name = getFileName(file_path)
        tgt = file_name + ".txt"
        saveResult(tgt, metadata)
def validate_pdf(value):
    """
    Check that an uploaded file can be opened as a PDF document.

    The whole content of ``value`` is read into memory and handed to
    ``PyPDF2.PdfFileReader``.

    :param value: The file object; only its ``read()`` method is used.
    :raises ValidationError: if PyPDF2 reports a ``PdfReadError``.
    """
    try:
        buffered_upload = io.BytesIO(value.read())
        PyPDF2.PdfFileReader(buffered_upload)
    except PyPDF2.utils.PdfReadError:
        raise ValidationError('Tried to upload not PDF as a book!')
def slice(self, pdf_file_path, page_actions, final_pdf_path):
    """
    Create a new PDF from a selection of pages of an existing PDF.

    :param pdf_file_path: path of the source PDF document, from which a new PDF file will be created.
    :param page_actions: list of tuples, each tuple containing the page number and the rotation to be
                         applied. The page number is 1-based (first is page 1, and so on). A falsy
                         rotation leaves the page untouched; any other value is passed to the page's
                         ``rotateCounterClockwise`` method.
                         NOTE(review): earlier documentation described the rotation as clockwise,
                         which does not match the method called here - confirm the intended direction.
    :param final_pdf_path: path handed to ``self._write_to_pdf`` together with the assembled document.
    :return: None. Writes the resulting PDF file into the provided path.
    :raises Exception: if a page number is lower than 1 or greater than the number of pages of the
                       source document.
    """
    output = PdfFileWriter()
    with open(pdf_file_path, 'rb') as file_input:
        reader = PdfFileReader(file_input, strict=False)
        # Check page actions correspond to valid input PDF pages
        input_num_pages = reader.getNumPages()
        # A comprehension instead of ``zip(*page_actions)[0]``, which fails
        # where zip() returns an iterator (Python 3).
        actions_page_numbers = [action[0] for action in page_actions]
        largest_page_num = max(actions_page_numbers)
        lowest_page_num = min(actions_page_numbers)
        if lowest_page_num < 1:
            raise Exception(u"Invalid page numbers range in actions: page numbers cannot be lower than 1.")
        # Page numbers are 1-based, so the last valid one equals the page
        # count. The former ``largest - 1 > count`` test let ``count + 1``
        # through and failed later in getPage().
        if largest_page_num > input_num_pages:
            raise Exception(u"Invalid page numbers range in actions: page numbers cannot exceed the maximum numbers "
                            u"of pages of the source PDF document.")
        # Perform actual slicing + rotation
        for num_page, rotation in page_actions:
            page = reader.getPage(num_page - 1)
            output.addPage(page.rotateCounterClockwise(rotation) if rotation else page)
        # Written while the source file is still open.
        self._write_to_pdf(output, final_pdf_path)
def parse_pdf_doc(self):
    """
    Open the PDF document at ``self.file`` and return its text content
    for matching comparison.

    :return: the concatenated text of every page with surrounding
        whitespace stripped, or None when that text is empty.
    """
    # Binary read mode; the context manager closes the file even when
    # text extraction raises.
    with open(self.file, 'rb') as f:
        pdf = PyPDF2.PdfFileReader(f)
        docText = ''.join(page.extractText() for page in pdf.pages)
    return docText.strip() or None
def __init__(self, file, prefix, fill, start, area):
    """
    Build a document whose pages are going to be numbered.

    Args:
        file: Either an object with a ``read()`` method or a path that can
            be opened in binary mode; its content is buffered in memory.
        prefix (str): Bates number prefix.
        fill (int): Length to zero-pad number to.
        start (int): Number given to the first page; each following page
            gets the next number.
        area (Area): Area on the document where the number should be drawn.
    """
    try:
        buffered = io.BytesIO(file.read())
    except AttributeError:
        # No read() method: treat the argument as a path.
        with open(file, "rb") as handle:
            buffered = io.BytesIO(handle.read())
    self.file = buffered
    self.reader = PdfFileReader(self.file)
    self.prefix = prefix
    self.fill = fill
    self.start = copy.copy(start)
    self.area = area
    # One slot per Area member, only the requested area gets an overlay.
    self.overlays = dict.fromkeys(Area)
    self.overlays[area] = BatesOverlay(None, self.area)
    self.index = 0
    self.pages = []
    for offset, pdf_page in enumerate(self.reader.pages):
        numbered = Page(self, pdf_page, self.prefix, self.fill, self.start + offset)
        self.pages.append(numbered)
def __init__(self, path):
    """
    Load the PDF at ``path`` into a fresh writer.

    All pages of the source are appended to ``self.writer``, together with
    those document info entries whose value is a string or bytes object.

    :param path: path of the PDF document to load.
    """
    self.path = path
    # The file object is intentionally not closed here.
    # NOTE(review): presumably the writer still needs it when the document
    # is written later - confirm.
    source = PdfFileReader(open(path, "rb"))
    self.writer = PdfFileWriter()
    self.writer.appendPagesFromReader(source)
    textual_info = {}
    for key, value in source.getDocumentInfo().items():
        if isinstance(value, (utils.string_type, utils.bytes_type)):
            textual_info[key] = value
    self.writer.addMetadata(textual_info)
def reader(title):
    """Read the PDF and convert it to text.

    :param title: path of the PDF file to read.
    :return: the concatenated text of the pages. Extraction is best-effort:
        it stops at the first page that cannot be read and returns the text
        collected so far.
    """
    # The context manager makes sure the file is closed once the text has
    # been extracted.
    with open(title, 'rb') as pdfFileObj:
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        chunks = []
        # Iterate over the actual page count instead of over-running the
        # range and relying on an exception to stop.
        for x in range(int(pdfReader.numPages)):
            try:
                pageObj = pdfReader.getPage(x)
                chunks.append(str(pageObj.extractText()))
            except Exception:
                break
    return "".join(chunks)
def generate_document(self, data):
    """Render one document from ``data`` and write it to the output directory.

    Each class in ``self.fields`` is instantiated with the value of ``data``
    at the same position and rendered on a canvas of size
    ``(self.width, self.height)``. When ``self.template_file`` is set, the
    rendered page is merged onto the first page of that template. The
    result is written to ``<self.output_dir>/<generated file name>.pdf``.

    :param data: indexable sequence with at least one value per field.
    :raises IndexError: if ``data`` is a list or tuple with fewer values
        than there are fields.
    """
    # NOTE(review): PDF content is binary; depending on which StringIO this
    # file imports a bytes buffer may be required on Python 3 - confirm.
    packet = StringIO()
    template_stream = None
    if self.template_file is not None:
        template_stream = open(self.template_file, 'rb')
    try:
        if template_stream is not None:
            template = PdfFileReader(template_stream)
        c = canvas.Canvas(packet, pagesize=(self.width, self.height))
        for i, field_cls in enumerate(self.fields):
            # TODO: Catch exception if there is less columns than fields
            field = field_cls(self, c, data[i])
            field.render()
        # Save canvas
        c.save()
        packet.seek(0)
        text = PdfFileReader(packet)
        output = PdfFileWriter()
        if self.template_file is not None:
            # Merge text with base
            page = template.getPage(0)
            page.mergePage(text.getPage(0))
        else:
            page = text.getPage(0)
        output.addPage(page)
        # Save file
        filename = "%s/%s.pdf" % (self.output_dir, self.generate_filename(data))
        with open(filename, 'wb') as outputStream:
            output.write(outputStream)
    finally:
        # The template stays open until the output has been written and is
        # closed even when rendering fails.
        if template_stream is not None:
            template_stream.close()
def getPDFContent(path):
    """Return the text of the PDF at ``path`` with whitespace normalized.

    The text of all pages is joined and every run of whitespace (including
    line breaks) is collapsed into a single space.

    :param path: value handed to ``PyPDF2.PdfFileReader`` as the document
        source.
    :return: the normalized text, an empty string when nothing is extracted.
    """
    # The former second argument "rb" is not a file mode here: the second
    # positional parameter of PdfFileReader is ``strict``. A non-empty
    # string is truthy, which matches the default, so it is simply dropped.
    pdf = PyPDF2.PdfFileReader(path)
    pages = [pdf.getPage(i).extractText() for i in range(pdf.getNumPages())]
    return " ".join("\n".join(pages).split())
def add_outlines(toc, filename, output):
    """Copy a PDF and attach an outline (bookmark) tree built from ``toc``.

    :param toc: list of dicts. Each entry provides ``"title"`` (decoded as
        UTF-8), ``"page"`` (passed to ``make_dest``) and ``"node"``, an
        object whose ``real_parent()``, ``prev()``, ``next()``, ``first()``
        and ``last()`` methods return either None or a node with an
        ``index`` attribute.
        NOTE(review): ``"node"`` is presumably attached by
        ``build_outlines_btree`` - confirm.
    :param filename: path of the source PDF document.
    :param output: path of the PDF file to write.
    """
    build_outlines_btree(toc)
    # Optional links of an outline item: node method name -> PDF key.
    opt_keys = {"real_parent": "/Parent", "prev": "/Prev", "next": "/Next", "first": "/First", "last": "/Last"}
    pdf_out = PdfFileWriter()
    # The source stays open until the output has been written and is closed
    # even when building the outline fails.
    with open(filename, 'rb') as inputFile:
        pdf_in = PdfFileReader(inputFile)
        for p in pdf_in.pages:
            pdf_out.addPage(p)
        toc_num = len(toc)
        # References are created before the objects they point to: slot 0
        # is the outline root and the following slots are the items, in the
        # order they are added below.
        # NOTE(review): relies on the writer numbering added objects
        # sequentially - confirm against the PyPDF2 version in use.
        idoix = len(pdf_out._objects) + 1
        idorefs = [PDF.IndirectObject(x + idoix, 0, pdf_out) for x in range(toc_num + 1)]
        ol = PDF.DictionaryObject()
        ol.update({
            PDF.NameObject("/Type"): PDF.NameObject("/Outlines"),
            PDF.NameObject("/First"): idorefs[1],
            PDF.NameObject("/Last"): idorefs[-1],
            PDF.NameObject("/Count"): PDF.NumberObject(toc_num)
        })
        olitems = []
        for t in toc:
            oli = PDF.DictionaryObject()
            oli.update({
                PDF.NameObject("/Title"): PDF.TextStringObject(t["title"].decode("utf-8")),
                PDF.NameObject("/Dest"): make_dest(pdf_out, t["page"])
            })
            for k, v in opt_keys.items():
                n = getattr(t["node"], k)()
                if n is not None:
                    oli.update({
                        PDF.NameObject(v): idorefs[n.index]
                    })
            olitems.append(oli)
        pdf_out._addObject(ol)
        for i in olitems:
            pdf_out._addObject(i)
        pdf_out._root_object.update({
            PDF.NameObject("/Outlines"): idorefs[0]
        })
        with open(output, "wb") as outputFile:
            pdf_out.write(outputFile)
def iter_pdf_page(self, f):
    """Yield every page of the PDF ``f``, in document order.

    :param f: value handed to ``PdfFileReader`` as the document source.
    """
    pdf = PdfFileReader(f)
    page_count = pdf.getNumPages()
    position = 0
    while position < page_count:
        yield pdf.getPage(position)
        position += 1
def iter_pdf_page_text(self, filename):
    """Yield the ASCII-encoded text of every page of a PDF, in document order.

    Line breaks in the extracted text are replaced by spaces and characters
    that cannot be encoded as ASCII are dropped. ``self.filename`` is set
    to ``filename`` and the page count is logged when iteration starts.

    :param filename: value handed to ``PdfFileReader`` as the document source.
    """
    self.filename = filename
    reader = PdfFileReader(filename)
    num_pages = reader.getNumPages()
    logging.info("pdf scanner found %d pages in %s", num_pages, filename)
    for pgnum in range(num_pages):
        text = reader.getPage(pgnum).extractText()
        # Replace line breaks before encoding: calling replace() with str
        # arguments on the encoded bytes raises TypeError on Python 3. Both
        # characters are ASCII, so the result is the same either way.
        yield text.replace('\n', ' ').encode('ascii', 'ignore')
def iter_pdf_page(self, f):
    """Yield every page of the PDF ``f``, in document order.

    :param f: value handed to ``PdfFileReader`` as the document source.
    """
    pdf = PdfFileReader(f)
    page_count = pdf.getNumPages()
    position = 0
    while position < page_count:
        yield pdf.getPage(position)
        position += 1
def iter_pdf_page_text(self, filename):
    """Yield the ASCII-encoded text of every page of a PDF, in document order.

    Line breaks in the extracted text are replaced by spaces and characters
    that cannot be encoded as ASCII are dropped. ``self.filename`` is set
    to ``filename`` and the page count is logged when iteration starts.

    :param filename: value handed to ``PdfFileReader`` as the document source.
    """
    self.filename = filename
    reader = PdfFileReader(filename)
    num_pages = reader.getNumPages()
    logging.info("pdf scanner found %d pages in %s", num_pages, filename)
    for pgnum in range(num_pages):
        text = reader.getPage(pgnum).extractText()
        # Replace line breaks before encoding: calling replace() with str
        # arguments on the encoded bytes raises TypeError on Python 3. Both
        # characters are ASCII, so the result is the same either way.
        yield text.replace('\n', ' ').encode('ascii', 'ignore')
def iter_pdf_page_text(self, filename):
    """Yield the ASCII-encoded text of every page of a PDF, in document order.

    Line breaks in the extracted text are replaced by spaces and characters
    that cannot be encoded as ASCII are dropped. ``self.filename`` is set
    to ``filename`` and the page count is logged when iteration starts.

    :param filename: value handed to ``PdfFileReader`` as the document source.
    """
    self.filename = filename
    reader = PdfFileReader(filename)
    num_pages = reader.getNumPages()
    logging.info("pdf scanner found %d pages in %s", num_pages, filename)
    for pgnum in range(num_pages):
        text = reader.getPage(pgnum).extractText()
        # Replace line breaks before encoding: calling replace() with str
        # arguments on the encoded bytes raises TypeError on Python 3. Both
        # characters are ASCII, so the result is the same either way.
        yield text.replace('\n', ' ').encode('ascii', 'ignore')
def iter_pdf_page(self, f):
    """Yield every page of the PDF ``f``, in document order.

    :param f: value handed to ``PdfFileReader`` as the document source.
    """
    pdf = PdfFileReader(f)
    page_count = pdf.getNumPages()
    position = 0
    while position < page_count:
        yield pdf.getPage(position)
        position += 1
def iter_pdf_page_text(self, filename):
    """Yield the ASCII-encoded text of every page of a PDF, in document order.

    Line breaks in the extracted text are replaced by spaces and characters
    that cannot be encoded as ASCII are dropped. ``self.filename`` is set
    to ``filename`` and the page count is logged when iteration starts.

    :param filename: value handed to ``PdfFileReader`` as the document source.
    """
    self.filename = filename
    reader = PdfFileReader(filename)
    num_pages = reader.getNumPages()
    logging.info("pdf scanner found %d pages in %s", num_pages, filename)
    for pgnum in range(num_pages):
        text = reader.getPage(pgnum).extractText()
        # Replace line breaks before encoding: calling replace() with str
        # arguments on the encoded bytes raises TypeError on Python 3. Both
        # characters are ASCII, so the result is the same either way.
        yield text.replace('\n', ' ').encode('ascii', 'ignore')
def check_nb_pages(self, data):
    """
    Tell whether the PDF held in ``data`` has more than two pages.

    The page count is printed as a side effect.

    :param data: raw content of the PDF, wrapped in a ``StringIO`` buffer.
    :return: True when the document has more than two pages, False when it
        has fewer or when reading it raises ``PyPdfError``.
    """
    try:
        page_total = PyPDF2.PdfFileReader(StringIO(data)).getNumPages()
    except PyPdfError:
        return False
    print("num pages: %d" % page_total)
    return page_total > 2
def loadFromStream(self, stream):
self.reader = PdfFileReader(stream, strict=False)