def Analyze_Metadata_pdf(filename):
    """Print a PDF's document-information entries and record new values.

    Every key/value pair of the document information dictionary is printed.
    Values of ``/Author``, ``/Producer`` and ``/Creator`` that are not yet
    present are appended to the module-level lists ``meta_author_array``,
    ``meta_producer_array`` and ``meta_creator_array``; those three lists
    are then appended to the module-level ``metadata_files`` list.

    :param filename: path of the PDF file to inspect.
    """
    # Map each tracked metadata key to the list collecting its values.
    tracked = {
        "/Author": meta_author_array,
        "/Producer": meta_producer_array,
        "/Creator": meta_creator_array,
    }
    # A context manager guarantees the handle is closed.  The values are read
    # inside the block because the reader may resolve them from the stream
    # only when they are accessed.
    with open(filename, 'rb') as pdf_stream:
        metadata = PdfFileReader(pdf_stream).getDocumentInfo()
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(' - Document: ' + str(filename))
        # getDocumentInfo() returns None when the PDF has no info dictionary.
        for meta in metadata or {}:
            value = metadata[meta]
            print(' - ' + meta + ':' + value)
            seen = tracked.get(meta)
            if seen is not None and value not in seen:
                seen.append(value)
    # Group the different arrays in one with all metadata.
    # NOTE(review): this runs on every call, so metadata_files accumulates
    # repeated references to the same three lists - confirm this is intended.
    metadata_files.append(meta_author_array)
    metadata_files.append(meta_producer_array)
    metadata_files.append(meta_creator_array)
####### FUNCTION AnalyzeMetadata doc ######
python类PdfFileReader()的实例源码
def render_pdf(self):
    """Assemble ``self.pages`` into a single PDF and return its bytes.

    Pages whose ``extension`` is ``"pdf"`` contribute their first page
    directly; any other page is opened as an image, converted to RGB and
    saved as a one-page PDF before being added.
    """
    writer = PdfFileWriter()
    for item in self.pages:
        if item.extension != "pdf":
            # Not a PDF yet: render the image into an in-memory PDF first.
            source = BytesIO()
            picture = Image.open(BytesIO(item.binary))
            picture.convert("RGB").save(source, format="pdf")
        else:
            # Already a PDF, so its bytes can be read as they are.
            source = BytesIO(item.binary)
        writer.addPage(PdfFileReader(source).getPage(0))

    assembled = BytesIO()
    writer.write(assembled)
    return assembled.getvalue()
def pdf_page_to_png(src_pdf, pagenum=0, resolution=72):
    '''
    Returns specified PDF page as wand.image.Image png.

    :param PyPDF2.PdfFileReader src_pdf: PDF from which to take pages.
    :param int pagenum: Page number to take.
    :param int resolution: Resolution for resulting png in DPI.
    :returns: a new wand image of the page, converted to PNG format.
    '''
    # Copy the requested page into a standalone one-page PDF held in memory.
    dst_pdf = PyPDF2.PdfFileWriter()
    dst_pdf.addPage(src_pdf.getPage(pagenum))

    pdf_bytes = io.BytesIO()
    dst_pdf.write(pdf_bytes)
    pdf_bytes.seek(0)

    # Image.convert() leaves the receiver untouched and returns a *new*
    # image, so the converted copy is what has to be returned.  The
    # intermediate PDF-format image is released when the block exits.
    with Image(file=pdf_bytes, resolution=resolution) as rendered:
        return rendered.convert("png")
# Example of converting exam.pdf located in the same directory
# convert('exam') # NOTE : default resolution is 72 dpi
def outputpapertemplate(self, dest, listchar, output=None):
    """Render template pages for ``listchar`` and write or return the PDF.

    While ``listchar`` is truthy, ``self.outputtemplateonepage(listchar)``
    is called and the first page of the PDF it returns is added to the
    writer (presumably that helper consumes entries from ``listchar`` -
    otherwise the loop would not end; verify against the helper).

    :param dest: ``None``, a file path (``str``) or a writable binary
        stream.  Missing parent directories of a path are created.
    :param listchar: collection of characters still to be laid out.
    :param output: existing writer to add pages to; a new
        ``PyPDF2.PdfFileWriter`` is created when omitted.
    :returns: the writer when ``dest`` is ``None``, otherwise ``None``.
    """
    # Identity checks: "== None" would invoke a custom __eq__ on the argument.
    if output is None:
        output = PyPDF2.PdfFileWriter()

    while listchar:
        iopage = self.outputtemplateonepage(listchar)
        page = PyPDF2.PdfFileReader(iopage)
        output.addPage(page.getPage(0))

    if dest is None:
        return output

    if isinstance(dest, str):  # when dest is a file path
        destdir = os.path.dirname(dest)
        if destdir != '' and not os.path.isdir(destdir):
            os.makedirs(destdir)
        with open(dest, "wb") as w:
            output.write(w)
    else:  # when dest is io.IOBase
        output.write(dest)
def apply(self):
    """
    Draw every overlay and redaction on the canvas and merge it into the page.

    Bates overlays receive ``self.number`` as their text before being
    drawn; generic text overlays are drawn as they are; other overlay
    types are skipped.  The finished canvas is saved, re-read as a PDF and
    its first page is merged onto ``self.page``.

    Returns:
        bool: always ``True``.
    """
    canvas = self.canvas

    for layer in self.document.overlays.values():
        is_bates = isinstance(layer, BatesOverlay)
        if is_bates:
            layer.text = self.number
        if is_bates or isinstance(layer, GenericTextOverlay):
            layer.apply(canvas)

    for block in self.redactions:
        block.apply(canvas)

    # Finalise the canvas, then rewind its backing file so it can be re-read.
    canvas.showPage()
    canvas.save()
    self.canvas_file.seek(0)

    stamped = PdfFileReader(self.canvas_file).getPage(0)
    self.page.mergePage(stamped)
    return True
def pdf_parser(s):
    """Extract the author from the metadata of a PDF given as a string.

    :param s: raw PDF content; surrounding whitespace is stripped.
    :returns: ``{'author': <value>}`` (the value is ``None`` when the PDF
        has no document information), or ``{}`` when the PDF is encrypted
        and decryption is not implemented for it.
    """
    s = s.strip()
    # required to suppress warning messages
    # The sink stays open for the whole parse so that a warning emitted
    # after construction is never written to a closed file.
    with open(os.devnull, 'w') as fp:
        pdf = PdfFileReader(StringIO(s), strict=False, warndest=fp)
        if pdf.isEncrypted:
            try:
                pdf.decrypt('')
            except NotImplementedError:
                return {}
        meta = pdf.getDocumentInfo()
        # getDocumentInfo() returns None when the PDF has no info dictionary.
        author = meta.author if meta is not None else None
    return {
        'author': author,
    }
def test_report_pdf(self):
    """The generated report PDF shows the reporter and the submitted answers."""
    self.client_post_report_creation()
    rendered = report_delivery.report_as_pdf(
        report=self.report,
        data=mock_report_data,
        recipient=None,
    )
    first_page = PyPDF2.PdfFileReader(BytesIO(rendered)).getPage(0)

    expected_fragments = (
        "Reported by: testing_12",
        'food options',
        'vegetables',
        'apples: red',
        'eat it now???',
    )
    for fragment in expected_fragments:
        self.assertIn(fragment, first_page.extractText())
def create(self, cr, uid, ids, data, context=None):
    """Render the selected records through Scribus and return the report.

    For each record read from ``self.model`` a template is rendered with
    ``self.render``.  When ``self.report_type`` is ``'scribus_sla'`` the
    rendered template of the first record is returned directly; otherwise
    Scribus is run to turn the template into a PDF and all PDFs are merged.

    :returns: ``(content, 'sla')`` or ``(content, 'pdf')``.
    :raises MissingError: when Scribus produced no output, or an empty
        file, for a record.
    """
    pool = registry(cr.dbname)
    merger = PdfFileMerger()
    outfiles = []
    # Input handles must stay open until the merged file has been written;
    # they are tracked so they can be closed afterwards instead of leaking.
    handles = []
    try:
        for p in pool.get(self.model).read(cr, uid, ids):
            outfiles.append(self.newfilename())
            sla = self.render(cr, uid, p, data.get('template') or self.template)
            if self.report_type == 'scribus_sla':
                os.unlink(outfiles[-1])
                return (sla.read(), 'sla')
            command = "xvfb-run -a scribus-ng -ns -g %s -py %s -pa -o %s" % (sla.name,os.path.join(get_module_path('report_scribus'), 'scribus.py'),outfiles[-1])
            _logger.info(command)
            # The exit status is not inspected; success is judged from the
            # output file below.
            os.system(command)
            sla.close()
            if not os.path.exists(outfiles[-1]) or os.stat(outfiles[-1]).st_size == 0:
                raise MissingError('There are something wrong with the template or scribus installation')
            handle = open(outfiles[-1], 'rb')
            handles.append(handle)
            merger.append(PdfFileReader(handle))
        outfile = tempfile.NamedTemporaryFile(mode='w+b', suffix='.pdf')
        merger.write(outfile.name)
    finally:
        for handle in handles:
            handle.close()
    for filename in outfiles:
        os.unlink(filename)
    outfile.seek(0)
    pdf = outfile.read()
    outfile.close()
    return (pdf, 'pdf')
def run(self):
    """Extract the text of every page and hand it to the next pipeline step.

    The PDF at ``self.caminho_arquivo`` is read page by page; each page's
    text is UTF-8 encoded, the pieces are joined with single spaces, and a
    ``GravarConvertido`` instance built from the path and the joined text
    is stored in ``self.next_pipe`` and run.
    """
    conteudos = []
    # The context manager closes the PDF once all pages have been extracted.
    with open(self.caminho_arquivo, 'rb') as objeto_pdf:
        reader = PyPDF2.PdfFileReader(objeto_pdf)
        for num in range(reader.numPages):
            texto = reader.getPage(num).extractText()
            conteudos.append(texto.encode("utf-8"))
    # The instance is allocated and initialised in two explicit steps, as in
    # the original code.
    instancia = object.__new__(GravarConvertido)
    instancia.__init__(self.caminho_arquivo, " ".join(conteudos))
    self.next_pipe = instancia
    self.next_pipe.run()
def _destinations_in_two_columns(pdf, destinations, cutoff=3):
"""
Check if the named destinations are organized along two columns (heuristic)
@param pdf: a PdfFileReader object
@param destinations:
'cutoff' is used to tune the heuristic: if 'cutoff' destinations in the
would-be second column start at the same position, return True
"""
# iterator for the x coordinates of refs in the would-be second column
xpositions = (_destination_position(pdf, dest)[3] for (_, dest)
in destinations
if _destination_position(pdf, dest)[1] == 1)
xpos_count = {}
for xpos in xpositions:
xpos_count[xpos] = xpos_count.get(xpos, 0) + 1
if xpos_count[xpos] >= cutoff:
return True
return False
def extract_links(self, response):
    """Yield ``(uri, uri)`` for each URI action found in the PDF's annotations.

    The PDF is read from ``response.body``.  URIs stored as byte strings
    are decoded as UTF-8 with NUL characters removed.
    """
    document = pyPdf.PdfFileReader(BytesIO(response.body))
    page_total = document.getNumPages()
    for index in range(page_total):
        for annot_ref in document.getPage(index).get('/Annots', []):
            action = annot_ref.getObject().get('/A')
            if not action or '/URI' not in action:
                continue
            target = action['/URI']
            if isinstance(target, pyPdf.generic.ByteStringObject):
                target = target.decode("utf-8").replace("\x00", "")
            yield (target, target)
def extract_metadata(self, file_path):
    """Copy document-info and XMP metadata of a PDF into the result.

    Fields are passed to ``self.update``; the subject is appended to
    ``self.result.keywords`` and languages to ``self.result.languages``.
    """
    info_fields = (
        ('title', 'title'),
        ('author', 'author'),
        ('generator', 'creator'),
        ('generator', 'producer'),
    )
    with open(file_path, 'rb') as stream:
        document = PdfFileReader(stream, strict=False)

        info = document.getDocumentInfo()
        if info is not None:
            for field, attribute in info_fields:
                self.update(field, getattr(info, attribute))
            if info.subject:
                self.result.keywords.append(info.subject)

        xmp_data = document.getXmpMetadata()
        if xmp_data is None:
            return
        self.update('id', xmp_data.xmpmm_documentId)
        for language, localized_title in xmp_data.dc_title.items():
            self.update('title', localized_title)
            self.result.languages.append(language)
        self.update('generator', xmp_data.pdf_producer)
        self.update('created_at', xmp_data.xmp_createDate)
        self.update('modified_at', xmp_data.xmp_modifyDate)
        self.result.languages.extend(xmp_data.dc_language)
def __loadSATPDF(self, filename):
    """
    Loads the SAT PDF file and collects the table contents of every page.

    Each page's extracted text is passed to ``self.__getTableContent``
    together with the table header and footer markers; the results are
    concatenated.  When the summary labels "Number", "Mean" and "S.D." are
    all present, the list is cut two entries before "S.D.".

    Return
    ------
    All numbers from the SAT table in a string array
    """
    print("loading SAT score pdf")
    tableHeader = "Total \nMale Female \nScore \nNumber Percentile Number Percentile Number Percentile "
    tableFooter = "De˜nitions of statistical terms are provided online at research."
    tableContents = []
    # The context manager closes the file once every page has been read.
    with open(filename, "rb") as pdfStream:
        pdf = pypdf.PdfFileReader(pdfStream)
        for page in pdf.pages:
            content = page.extractText()
            tableContents += self.__getTableContent(content, tableHeader, tableFooter)
    # The previous `"Number" and "Mean" and "S.D." in tableContents` only
    # tested "S.D."; each label is now checked individually.
    # NOTE(review): the check is applied once after all pages are collected -
    # confirm it was not meant to run per page.
    if all(label in tableContents for label in ("Number", "Mean", "S.D.")):
        tableContents = tableContents[:tableContents.index("S.D.") - 2]
    return tableContents
merged_pdf.py 文件源码
项目:python_for_linux_system_administration
作者: lalor
项目源码
文件源码
阅读 21
收藏 0
点赞 0
评论 0
def main():
    """Merge the PDF files found under ``~lmx/`` into ``merge-pdfs.pdf``.

    The first file is appended whole; for every later file the page range
    passed to the merger starts at index 1.  Exits with a message when no
    PDF file is found.
    """
    pdf_paths = get_all_pdf_files(os.path.expanduser('~lmx/'))
    if not pdf_paths:
        raise SystemExit('No pdf file found!')

    combined = PyPDF2.PdfFileMerger()
    head, tail = pdf_paths[0], pdf_paths[1:]

    with open(head, 'rb') as stream:
        combined.append(stream)

    for path in tail:
        with open(path, 'rb') as stream:
            page_total = PyPDF2.PdfFileReader(stream).getNumPages()
            combined.append(fileobj=stream, pages=(1, page_total))

    with open('merge-pdfs.pdf', 'wb') as target:
        combined.write(target)
def Analyze_Metadata_pdf(filename):
    """Print a PDF's document-information entries and record new values.

    Every key/value pair of the document information dictionary is printed.
    Values of ``/Author``, ``/Producer`` and ``/Creator`` that are not yet
    present are appended to the module-level lists ``meta_author_array``,
    ``meta_producer_array`` and ``meta_creator_array``; those three lists
    are then appended to the module-level ``metadata_files`` list.

    :param filename: path of the PDF file to inspect.
    """
    # Map each tracked metadata key to the list collecting its values.
    tracked = {
        "/Author": meta_author_array,
        "/Producer": meta_producer_array,
        "/Creator": meta_creator_array,
    }
    # A context manager guarantees the handle is closed.  The values are read
    # inside the block because the reader may resolve them from the stream
    # only when they are accessed.
    with open(filename, 'rb') as pdf_stream:
        metadata = PdfFileReader(pdf_stream).getDocumentInfo()
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(' - Document: ' + str(filename))
        # getDocumentInfo() returns None when the PDF has no info dictionary.
        for meta in metadata or {}:
            value = metadata[meta]
            print(' - ' + meta + ':' + value)
            seen = tracked.get(meta)
            if seen is not None and value not in seen:
                seen.append(value)
    # Group the different arrays in one with all metadata.
    # NOTE(review): this runs on every call, so metadata_files accumulates
    # repeated references to the same three lists - confirm this is intended.
    metadata_files.append(meta_author_array)
    metadata_files.append(meta_producer_array)
    metadata_files.append(meta_creator_array)
####### FUNCTION AnalyzeMetadata doc ######
def Analyze_Metadata_pdf(filename):
    """Print a PDF's document-information entries and record new values.

    Every key/value pair of the document information dictionary is printed.
    Values of ``/Author``, ``/Producer`` and ``/Creator`` that are not yet
    present are appended to the module-level lists ``meta_author_array``,
    ``meta_producer_array`` and ``meta_creator_array``; those three lists
    are then appended to the module-level ``metadata_files`` list.

    :param filename: path of the PDF file to inspect.
    """
    # Map each tracked metadata key to the list collecting its values.
    tracked = {
        "/Author": meta_author_array,
        "/Producer": meta_producer_array,
        "/Creator": meta_creator_array,
    }
    # A context manager guarantees the handle is closed.  The values are read
    # inside the block because the reader may resolve them from the stream
    # only when they are accessed.
    with open(filename, 'rb') as pdf_stream:
        metadata = PdfFileReader(pdf_stream).getDocumentInfo()
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(' - Document: ' + str(filename))
        # getDocumentInfo() returns None when the PDF has no info dictionary.
        for meta in metadata or {}:
            value = metadata[meta]
            print(' - ' + meta + ':' + value)
            seen = tracked.get(meta)
            if seen is not None and value not in seen:
                seen.append(value)
    # Group the different arrays in one with all metadata.
    # NOTE(review): this runs on every call, so metadata_files accumulates
    # repeated references to the same three lists - confirm this is intended.
    metadata_files.append(meta_author_array)
    metadata_files.append(meta_producer_array)
    metadata_files.append(meta_creator_array)
####### FUNCTION AnalyzeMetadata doc ######
def Analyze_Metadata_pdf(filename):
    """Print a PDF's document-information entries and record new values.

    Every key/value pair of the document information dictionary is printed.
    Values of ``/Author``, ``/Producer`` and ``/Creator`` that are not yet
    present are appended to the module-level lists ``meta_author_array``,
    ``meta_producer_array`` and ``meta_creator_array``; those three lists
    are then appended to the module-level ``metadata_files`` list.

    :param filename: path of the PDF file to inspect.
    """
    # Map each tracked metadata key to the list collecting its values.
    tracked = {
        "/Author": meta_author_array,
        "/Producer": meta_producer_array,
        "/Creator": meta_creator_array,
    }
    # A context manager guarantees the handle is closed.  The values are read
    # inside the block because the reader may resolve them from the stream
    # only when they are accessed.
    with open(filename, 'rb') as pdf_stream:
        metadata = PdfFileReader(pdf_stream).getDocumentInfo()
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(' - Document: ' + str(filename))
        # getDocumentInfo() returns None when the PDF has no info dictionary.
        for meta in metadata or {}:
            value = metadata[meta]
            print(' - ' + meta + ':' + value)
            seen = tracked.get(meta)
            if seen is not None and value not in seen:
                seen.append(value)
    # Group the different arrays in one with all metadata.
    # NOTE(review): this runs on every call, so metadata_files accumulates
    # repeated references to the same three lists - confirm this is intended.
    metadata_files.append(meta_author_array)
    metadata_files.append(meta_producer_array)
    metadata_files.append(meta_creator_array)
####### FUNCTION AnalyzeMetadata doc ######
def Analyze_Metadata_pdf(filename):
    """Print a PDF's document-information entries and record new values.

    Every key/value pair of the document information dictionary is printed.
    Values of ``/Author``, ``/Producer`` and ``/Creator`` that are not yet
    present are appended to the module-level lists ``meta_author_array``,
    ``meta_producer_array`` and ``meta_creator_array``; those three lists
    are then appended to the module-level ``metadata_files`` list.

    :param filename: path of the PDF file to inspect.
    """
    # Map each tracked metadata key to the list collecting its values.
    tracked = {
        "/Author": meta_author_array,
        "/Producer": meta_producer_array,
        "/Creator": meta_creator_array,
    }
    # A context manager guarantees the handle is closed.  The values are read
    # inside the block because the reader may resolve them from the stream
    # only when they are accessed.
    with open(filename, 'rb') as pdf_stream:
        metadata = PdfFileReader(pdf_stream).getDocumentInfo()
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(' - Document: ' + str(filename))
        # getDocumentInfo() returns None when the PDF has no info dictionary.
        for meta in metadata or {}:
            value = metadata[meta]
            print(' - ' + meta + ':' + value)
            seen = tracked.get(meta)
            if seen is not None and value not in seen:
                seen.append(value)
    # Group the different arrays in one with all metadata.
    # NOTE(review): this runs on every call, so metadata_files accumulates
    # repeated references to the same three lists - confirm this is intended.
    metadata_files.append(meta_author_array)
    metadata_files.append(meta_producer_array)
    metadata_files.append(meta_creator_array)
####### FUNCTION AnalyzeMetadata doc ######
def Analyze_Metadata_pdf(filename):
    """Print a PDF's document-information entries and record new values.

    Every key/value pair of the document information dictionary is printed.
    Values of ``/Author``, ``/Producer`` and ``/Creator`` that are not yet
    present are appended to the module-level lists ``meta_author_array``,
    ``meta_producer_array`` and ``meta_creator_array``; those three lists
    are then appended to the module-level ``metadata_files`` list.

    :param filename: path of the PDF file to inspect.
    """
    # Map each tracked metadata key to the list collecting its values.
    tracked = {
        "/Author": meta_author_array,
        "/Producer": meta_producer_array,
        "/Creator": meta_creator_array,
    }
    # A context manager guarantees the handle is closed.  The values are read
    # inside the block because the reader may resolve them from the stream
    # only when they are accessed.
    with open(filename, 'rb') as pdf_stream:
        metadata = PdfFileReader(pdf_stream).getDocumentInfo()
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(' - Document: ' + str(filename))
        # getDocumentInfo() returns None when the PDF has no info dictionary.
        for meta in metadata or {}:
            value = metadata[meta]
            print(' - ' + meta + ':' + value)
            seen = tracked.get(meta)
            if seen is not None and value not in seen:
                seen.append(value)
    # Group the different arrays in one with all metadata.
    # NOTE(review): this runs on every call, so metadata_files accumulates
    # repeated references to the same three lists - confirm this is intended.
    metadata_files.append(meta_author_array)
    metadata_files.append(meta_producer_array)
    metadata_files.append(meta_creator_array)
####### FUNCTION AnalyzeMetadata doc ######
def Analyze_Metadata_pdf(filename):
    """Print a PDF's document-information entries and record new values.

    Every key/value pair of the document information dictionary is printed.
    Values of ``/Author``, ``/Producer`` and ``/Creator`` that are not yet
    present are appended to the module-level lists ``meta_author_array``,
    ``meta_producer_array`` and ``meta_creator_array``; those three lists
    are then appended to the module-level ``metadata_files`` list.

    :param filename: path of the PDF file to inspect.
    """
    # Map each tracked metadata key to the list collecting its values.
    tracked = {
        "/Author": meta_author_array,
        "/Producer": meta_producer_array,
        "/Creator": meta_creator_array,
    }
    # A context manager guarantees the handle is closed.  The values are read
    # inside the block because the reader may resolve them from the stream
    # only when they are accessed.
    with open(filename, 'rb') as pdf_stream:
        metadata = PdfFileReader(pdf_stream).getDocumentInfo()
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(' - Document: ' + str(filename))
        # getDocumentInfo() returns None when the PDF has no info dictionary.
        for meta in metadata or {}:
            value = metadata[meta]
            print(' - ' + meta + ':' + value)
            seen = tracked.get(meta)
            if seen is not None and value not in seen:
                seen.append(value)
    # Group the different arrays in one with all metadata.
    # NOTE(review): this runs on every call, so metadata_files accumulates
    # repeated references to the same three lists - confirm this is intended.
    metadata_files.append(meta_author_array)
    metadata_files.append(meta_producer_array)
    metadata_files.append(meta_creator_array)
####### FUNCTION AnalyzeMetadata doc ######
def Analyze_Metadata_pdf(filename):
    """Print a PDF's document-information entries and record new values.

    Every key/value pair of the document information dictionary is printed.
    Values of ``/Author``, ``/Producer`` and ``/Creator`` that are not yet
    present are appended to the module-level lists ``meta_author_array``,
    ``meta_producer_array`` and ``meta_creator_array``; those three lists
    are then appended to the module-level ``metadata_files`` list.

    :param filename: path of the PDF file to inspect.
    """
    # Map each tracked metadata key to the list collecting its values.
    tracked = {
        "/Author": meta_author_array,
        "/Producer": meta_producer_array,
        "/Creator": meta_creator_array,
    }
    # A context manager guarantees the handle is closed.  The values are read
    # inside the block because the reader may resolve them from the stream
    # only when they are accessed.
    with open(filename, 'rb') as pdf_stream:
        metadata = PdfFileReader(pdf_stream).getDocumentInfo()
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(' - Document: ' + str(filename))
        # getDocumentInfo() returns None when the PDF has no info dictionary.
        for meta in metadata or {}:
            value = metadata[meta]
            print(' - ' + meta + ':' + value)
            seen = tracked.get(meta)
            if seen is not None and value not in seen:
                seen.append(value)
    # Group the different arrays in one with all metadata.
    # NOTE(review): this runs on every call, so metadata_files accumulates
    # repeated references to the same three lists - confirm this is intended.
    metadata_files.append(meta_author_array)
    metadata_files.append(meta_producer_array)
    metadata_files.append(meta_creator_array)
####### FUNCTION AnalyzeMetadata doc ######
def pdf_splitter(self):
    """Split ``self.pdf_file`` into one PDF per page and convert each one.

    The page count is stored in ``self.total_pages``.  Page ``n`` (1-based)
    is written next to the source as ``<name>_<n>.pdf`` and then passed to
    ``self.pdf_to_image``.
    """
    # Imported locally because the module's import block is not visible here.
    import os

    self.log.info('Called pdf_splitter')
    # Derive output names from the extension-less path.  str.replace('.pdf')
    # would rewrite every occurrence in the path, and would leave the name
    # unchanged (overwriting the input) when the suffix is absent or spelled
    # differently.
    base_name = os.path.splitext(self.pdf_file)[0]
    # The context manager closes the source once all pages are written.
    with open(self.pdf_file, 'rb') as source:
        input_pdf = PdfFileReader(source)
        self.total_pages = input_pdf.numPages
        for page_number in range(self.total_pages):
            output = PdfFileWriter()
            output.addPage(input_pdf.getPage(page_number))
            # new filename
            new_pdf = '%s_%s%s' % (base_name, str(page_number + 1), '.pdf')
            with open(new_pdf, 'wb') as file_stream:
                output.write(file_stream)
            # calling pdf to image conversion
            self.pdf_to_image(new_pdf)
def pdf_parser(s):
    """Return the document-information entries of a PDF given as a string.

    :param s: raw PDF content; surrounding whitespace is stripped.
    :returns: dict mapping each metadata key, without its leading
        character (the ``/`` of PDF names), to its value.  Empty when the
        PDF has no document information, or when it is encrypted and
        decryption is not implemented for it.
    """
    s = s.strip()
    # required to suppress warning messages
    # The sink stays open for the whole parse so that a warning emitted
    # after construction is never written to a closed file.
    with open(os.devnull, 'w') as fp:
        pdf = PdfFileReader(StringIO(s), strict=False, warndest=fp)
        if pdf.isEncrypted:
            try:
                pdf.decrypt('')
            except NotImplementedError:
                return {}
        meta = pdf.getDocumentInfo()
        # getDocumentInfo() returns None when the PDF has no info dictionary.
        if meta is None:
            return {}
        return {key[1:]: meta.get(key) for key in meta.keys()}
def pdf_page_to_png(src_pdf, pagenum=0, resolution=154):
    """
    Returns specified PDF page as wand.image.Image png.

    :param PyPDF2.PdfFileReader src_pdf: PDF from which to take pages.
    :param int pagenum: Page number to take.
    :param int resolution: Resolution for resulting png in DPI.
    :returns: a new wand image of the page, converted to PNG format.
    """
    check_dependencies(__optional_dependencies__['pdf'])
    # Import libraries within this function so as to avoid import-time dependence
    import PyPDF2
    from wand.image import Image  # TODO: When we start using this again, document which system-level libraries are required.

    # Copy the requested page into a standalone one-page PDF held in memory.
    dst_pdf = PyPDF2.PdfFileWriter()
    dst_pdf.addPage(src_pdf.getPage(pagenum))

    pdf_bytes = io.BytesIO()
    dst_pdf.write(pdf_bytes)
    pdf_bytes.seek(0)

    # Image.convert() leaves the receiver untouched and returns a *new*
    # image, so the converted copy is what has to be returned.  The
    # intermediate PDF-format image is released when the block exits.
    with Image(file=pdf_bytes, resolution=resolution) as rendered:
        return rendered.convert("png")
multipage2book.py 文件源码
项目:multipage_to_book_batch_converter
作者: uml-digitalinitiatives
项目源码
文件源码
阅读 25
收藏 0
点赞 0
评论 0
def count_pages(input_file):
    """Return the number of pages in a file.

    For paths matching ``is_pdf`` the raw bytes are scanned with
    ``rxcountpages``; when that finds nothing, PyPDF2 is asked for the page
    count.  Any other file is measured through an ``identify`` system call.

    Keyword arguments
    input_file -- the full path to the input file
    """
    if not is_pdf.match(input_file):
        identify_cmd = [
            'identify', '-ping', '-format', "%n\\n", input_file
        ]
        output = do_system_call(identify_cmd, return_result=True)
        # The count is taken from the last line of the command output.
        return int(output.rstrip().split('\n')[-1])

    with open(input_file, 'rb') as handle:
        total = len(rxcountpages.findall(handle.read()))
    if total == 0:
        # The pattern scan found no pages; fall back to parsing the PDF.
        total = PyPDF2.PdfFileReader(input_file).getNumPages()
    return total
def pdf_parser(s):
    """Return the document-information entries of a PDF given as a string.

    :param s: raw PDF content; surrounding whitespace is stripped.
    :returns: dict mapping each metadata key, without its leading
        character (the ``/`` of PDF names), to its value.  Empty when the
        PDF has no document information, or when it is encrypted and
        decryption is not implemented for it.
    """
    s = s.strip()
    # required to suppress warning messages
    # The sink stays open for the whole parse so that a warning emitted
    # after construction is never written to a closed file.
    with open(os.devnull, 'w') as fp:
        pdf = PdfFileReader(StringIO(s), strict=False, warndest=fp)
        if pdf.isEncrypted:
            try:
                pdf.decrypt('')
            except NotImplementedError:
                return {}
        meta = pdf.getDocumentInfo()
        # getDocumentInfo() returns None when the PDF has no info dictionary.
        if meta is None:
            return {}
        return {key[1:]: meta.get(key) for key in meta.keys()}
def pdf_date(path):
    """
    Extract a date from PDF file metadata.

    Args:
        path (str): The file path.

    Returns:
        datetime: The created date.

    Raises:
        KeyError: If the metadata has no ``/CreationDate`` entry.
        ValueError: If the creation date contains no digits or its first
            run of digits is not in ``YYYYMMDDHHMMSS`` form.
    """
    reader = PdfFileReader(path)

    # Get rid of `D:` prefix and timezone.
    stamp = reader.documentInfo['/CreationDate']
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    match = re.search(r'\d+', stamp)
    if match is None:
        raise ValueError('No digits found in creation date: %r' % (stamp,))

    return datetime.strptime(
        match.group(),
        '%Y%m%d%H%M%S'
    )
def extract(text, paper=None, logger=logger):
    """Return the author stored in the paper's PDF metadata, if there is one.

    :param text: unused by this extractor.
    :param paper: object whose ``_read_document()`` result is handed to
        ``PdfFileReader``.
    :param logger: unused by this extractor.
    :returns: ``(author, author, "extracted", "pdf metadata")`` when an
        author is present, otherwise ``None`` (also when reading fails).
    """
    # try using pypdf2/pdfminer
    try:
        pdf = paper._read_document()
        author = PdfFileReader(pdf).getDocumentInfo().author
    except Exception:
        # Best effort: any read/parse failure means "no author found".
        # `Exception` rather than a bare except, so KeyboardInterrupt and
        # SystemExit still propagate.
        # search for author or return None
        # Though currently there is no search function
        return None
    if not author:
        return None
    return (author, author, "extracted", "pdf metadata")
def extract(text, paper=None, logger=logger):
    """Return the title stored in the paper's PDF metadata, if there is one.

    :param text: unused by this extractor.
    :param paper: object whose ``_read_document()`` result is handed to
        ``PdfFileReader``.
    :param logger: unused by this extractor.
    :returns: ``(title, title, "extracted", "pdf metadata")`` when a title
        is present, otherwise ``None`` (also when reading fails).
    """
    # try using pypdf2/pdfminer
    try:
        pdf = paper._read_document()
        title = PdfFileReader(pdf).getDocumentInfo().title
    except Exception:
        # Best effort: any read/parse failure means "no title found".
        # `Exception` rather than a bare except, so KeyboardInterrupt and
        # SystemExit still propagate.
        # search for title or return None
        # Though currently there is no search function
        return None
    if not title:
        return None
    return (title, title, "extracted", "pdf metadata")
def test_split():
    """Splitting test.pdf by "1-2,5-8" yields a 2-page and a 4-page PDF."""
    i = ILovePdf(config.PUBLIC_KEY, config.SECRET_KEY)
    i.new_task("split")
    i.add_file("test.pdf")
    i.execute(ranges="1-2,5-8")
    i.download()

    # Context managers close the archive and the PDFs deterministically, so
    # no handle is still open when the extracted tree is removed below.
    with zipfile.ZipFile("out.zip", "r") as zip_ref:
        zip_ref.extractall("test_split")

    assert len(glob.glob("test_split/*.pdf")) == 2

    with open("test_split/test-1-2.pdf", "rb") as first, \
            open("test_split/test-5-8.pdf", "rb") as second:
        output_file1 = PdfFileReader(first)
        output_file2 = PdfFileReader(second)
        assert output_file1.getNumPages() == 2
        assert output_file2.getNumPages() == 4

    os.remove("out.zip")
    shutil.rmtree("test_split")