如何获取书签的页码
from typing import List
from PyPDF2 import PdfFileReader
from PyPDF2.generic import Destination
def get_outlines(pdf_filepath: str) -> List[Destination]:
    """Return the outline (bookmark) entries of the PDF at ``pdf_filepath``.

    The file is opened in binary mode, handed to ``PdfFileReader``, and
    closed again before this function returns. The return value is whatever
    ``PdfFileReader.getOutlines()`` produced for that file.
    """
    with open(pdf_filepath, "rb") as stream:
        return PdfFileReader(stream).getOutlines()
# Demo: print the outline entries of the sample PDF.
print(get_outlines("PDF-export-example.pdf"))
pyPdf.pdf.Destination 对象有许多属性,但我找不到任何表示该书签所指向页码的属性。如何获得书签的页码?
例如,outlines[1].page.idnum 返回的数字大约是 PDF 文档中实际页码的 3 倍。我猜它引用的是某种比页面更小的对象,因为对整个 PDF 文档的大纲逐项读取 .page.idnum 时,得到的一组数字与文档中“真实”的目标页码甚至不成线性关系,只是大致为页码的 3 倍左右。
-
# As @theta pointed out, "split a PDF based on outline" contains the code
# needed to extract the page numbers. If that looks complicated: the part of
# that code which maps page object IDs to page numbers is copied below and
# turned into a function, followed by a working example that prints the page
# number of bookmark o[0].


def _setup_page_id_to_num(pdf, pages=None, _result=None, _num_pages=None):
    """Map page-tree object IDs to 0-based page indices.

    Walks the page tree depth-first. Every kid reference gets an entry
    ``idnum -> number of /Page nodes visited before it``; for a ``/Page``
    kid that is its 0-based page index.

    Args:
        pdf: Reader object. Only ``pdf.trailer["/Root"]`` is read, and only
            when ``pages`` is None.
        pages: Page-tree node to start from. When None, the ``/Pages`` entry
            of the document root is used.
        _result: Internal accumulator shared by the recursive calls.
        _num_pages: Internal list used as a running counter; its length is
            the number of ``/Page`` nodes visited so far.

    Returns:
        The dict mapping ``idnum`` to the 0-based index described above.
    """
    if _result is None:
        _result = {}
    if pages is None:
        # Top-level call: start at the document's root page-tree node.
        pages = pdf.trailer["/Root"].getObject()["/Pages"].getObject()
        _num_pages = []
    elif _num_pages is None:
        # Explicit subtree given without a counter: start counting at zero
        # instead of failing on len(None) below.
        _num_pages = []

    node_type = pages["/Type"]
    if node_type == "/Pages":
        for kid in pages["/Kids"]:
            # Record the index before descending so a /Page kid maps to its
            # own position in document order.
            _result[kid.idnum] = len(_num_pages)
            _setup_page_id_to_num(pdf, kid.getObject(), _result, _num_pages)
    elif node_type == "/Page":
        _num_pages.append(1)
    return _result


def main(pdf_path="document.pdf"):
    """Print the 1-based page number of the first bookmark in ``pdf_path``."""
    # Imported here so the mapping helper above stays importable (and
    # testable) on its own, without PyPDF2 installed.
    from PyPDF2 import PdfFileReader

    # Keep the file open while the reader is in use; close it afterwards.
    with open(pdf_path, "rb") as f:
        p = PdfFileReader(f)
        # map page ids to page numbers
        pg_id_num_map = _setup_page_id_to_num(p)
        o = p.getOutlines()
        pg_num = pg_id_num_map[o[0].page.idnum] + 1
    print(pg_num)


if __name__ == "__main__":
    main()
@theta 这个回答可能来得太晚了,但也许能帮到其他人 :) 顺便说一下,这是我在 Stack Overflow 上的第一个回答,如果没有遵循通常的格式,请见谅。
进一步扩展此功能: 如果您希望在页面上获得书签的确切位置,这将使您的工作更加轻松:
def _setup_page_id_to_num(pdf, pages=None, _result=None, _num_pages=None):
    """Map page-tree object IDs to 0-based page indices.

    Walks the page tree depth-first. Every kid reference gets an entry
    ``idnum -> number of /Page nodes visited before it``; for a ``/Page``
    kid that is its 0-based page index.

    Args:
        pdf: Reader object. Only ``pdf.trailer["/Root"]`` is read, and only
            when ``pages`` is None.
        pages: Page-tree node to start from. When None, the ``/Pages`` entry
            of the document root is used.
        _result: Internal accumulator shared by the recursive calls.
        _num_pages: Internal list used as a running counter; its length is
            the number of ``/Page`` nodes visited so far.

    Returns:
        The dict mapping ``idnum`` to the 0-based index described above.
    """
    if _result is None:
        _result = {}
    if pages is None:
        # Top-level call: start at the document's root page-tree node.
        pages = pdf.trailer["/Root"].getObject()["/Pages"].getObject()
        _num_pages = []
    elif _num_pages is None:
        # Explicit subtree given without a counter: start counting at zero
        # instead of failing on len(None) below.
        _num_pages = []

    node_type = pages["/Type"]
    if node_type == "/Pages":
        for kid in pages["/Kids"]:
            # Record the index before descending so a /Page kid maps to its
            # own position in document order.
            _result[kid.idnum] = len(_num_pages)
            _setup_page_id_to_num(pdf, kid.getObject(), _result, _num_pages)
    elif node_type == "/Page":
        _num_pages.append(1)
    return _result


def _get_or_none(mapping, key):
    """Return ``mapping[key]``, or None when ``key`` is not present."""
    return mapping[key] if key in mapping else None


def outlines_pg_zoom_info(outlines, pg_id_num_map, result=None):
    """Collect title, position and page number for each bookmark.

    Args:
        outlines: A single bookmark (``Destination``) or a list of
            bookmarks / nested lists; lists are walked recursively.
        pg_id_num_map: Mapping from page object ID to 0-based page index,
            as built by ``_setup_page_id_to_num``.
        result: Dict to add entries to; a new dict is created when omitted.

    Returns:
        ``result``, keyed by the first whitespace-separated token of each
        bookmark title (the whole title when it has no such token). Each
        value is a dict with ``title``, ``top``, ``left`` and the 1-based
        ``page``. ``top`` / ``left`` are None when the bookmark does not
        carry a ``/Top`` / ``/Left`` entry.
    """
    if result is None:
        result = {}

    if isinstance(outlines, list):
        for outline in outlines:
            # The recursive call fills ``result`` in place.
            outlines_pg_zoom_info(outline, pg_id_num_map, result)
        return result

    # Imported lazily so the helpers in this snippet can be imported (and
    # tested) without PyPDF2 installed; only leaf nodes need the class.
    from PyPDF2.generic import Destination

    if isinstance(outlines, Destination):
        title = outlines['/Title']
        words = title.split()
        # A blank title has no first token; fall back to the title itself.
        key = words[0] if words else title
        result[key] = dict(
            title=title,
            # Not every bookmark carries /Top and /Left; report None
            # instead of raising KeyError.
            top=_get_or_none(outlines, '/Top'),
            left=_get_or_none(outlines, '/Left'),
            page=(pg_id_num_map[outlines.page.idnum] + 1),
        )
    return result


def main(pdf_name='document.pdf'):
    """Print the bookmark info dict for every bookmark in ``pdf_name``."""
    from PyPDF2 import PdfFileReader

    # Keep the file open while the reader is in use; close it afterwards.
    with open(pdf_name, 'rb') as f:
        pdf = PdfFileReader(f)
        # map page ids to page numbers
        pg_id_num_map = _setup_page_id_to_num(pdf)
        outlines = pdf.getOutlines()
        bookmarks_info = outlines_pg_zoom_info(outlines, pg_id_num_map)
        print(bookmarks_info)


if __name__ == "__main__":
    main()
注意:我的书签标题以章节编号开头(例如:1.1 Introduction),因此我用章节编号作为键来映射书签信息。如果您的书签格式不同,请修改这部分代码:
# Leaf case: ``outlines`` is a single bookmark (Destination object).
elif type(outlines) == pyPdf.pdf.Destination:
    title = outlines['/Title']
    # Key the entry by the first whitespace-separated token of the title.
    # The entry stores the title, the /Top and /Left values, and the mapped
    # page index plus one.
    result[title.split()[0]] = dict(title=outlines['/Title'], top=outlines['/Top'], \
        left=outlines['/Left'], page=(pg_id_num_map[outlines.page.idnum]+1))