from pyPdf import PdfFileReader
# Open the PDF in a context manager so the handle is always closed; the
# reader needs the underlying stream while outline objects are resolved.
with open('document.pdf', 'rb') as f:
    p = PdfFileReader(f)
    # Document outline: a (possibly nested) list of bookmark destinations.
    o = p.getOutlines()
Run Code Online (Sandbox Code Playgroud)
列表对象 `o` 由 `pyPdf.pdf.Destination` 字典对象(即书签)组成。这些对象有许多属性,但我找不到书签所指向页面的页码。
如何获取某个书签(比如 `o[1]`)对应的页码?
例如,`o[1].page.idnum` 返回的数字大约是该书签在 PDF 文档中实际所指页码的 3 倍。我猜 `idnum` 引用的是比页面更小的某种对象,因为对整个大纲逐一读取 `.page.idnum` 得到的一组数字,与 PDF 中"真实"的目标页码甚至不成线性关系,只是大致相差约 3 倍。
更新:这个问题与"基于大纲拆分 PDF"是同一个问题,不过我看不懂作者在自己的回答里做了什么,对我来说似乎太复杂了。
小智 10
正如 @theta 所指出的,"基于大纲拆分 PDF"中已有提取页码所需的代码。如果您觉得它太复杂,我把其中将页面 ID 映射到页码的那部分代码单独提取出来,封装成了一个函数。下面是打印书签 `o[0]` 所在页码的可运行示例:
from PyPDF2 import PdfFileReader
def _setup_page_id_to_num(pdf, pages=None, _result=None, _num_pages=None):
if _result is None:
_result = {}
if pages is None:
_num_pages = []
pages = pdf.trailer["/Root"].getObject()["/Pages"].getObject()
t = pages["/Type"]
if t == "/Pages":
for page in pages["/Kids"]:
_result[page.idnum] = len(_num_pages)
_setup_page_id_to_num(pdf, page.getObject(), _result, _num_pages)
elif t == "/Page":
_num_pages.append(1)
return _result
# main
# Use a context manager so the PDF handle is closed even on error; all
# reader work stays inside the block because it needs the open stream.
with open('document.pdf', 'rb') as f:
    p = PdfFileReader(f)
    # map page ids to page numbers
    pg_id_num_map = _setup_page_id_to_num(p)
    o = p.getOutlines()
    # The map is 0-based; add 1 for the human-readable page number.
    pg_num = pg_id_num_map[o[0].page.idnum] + 1
    print(pg_num)
Run Code Online (Sandbox Code Playgroud)
@theta 这个回答可能来得太晚了,但也许能帮到其他人 :) 顺便说一下,这是我在 Stack Overflow 上的第一个回答,如果没有遵循通常的格式,还请见谅。
要进一步扩展这一点: 如果您希望获得书签页面上的确切位置,这将使您的工作更轻松:
from PyPDF2 import PdfFileReader
import PyPDF2 as pyPdf
def _setup_page_id_to_num(pdf, pages=None, _result=None, _num_pages=None):
if _result is None:
_result = {}
if pages is None:
_num_pages = []
pages = pdf.trailer["/Root"].getObject()["/Pages"].getObject()
t = pages["/Type"]
if t == "/Pages":
for page in pages["/Kids"]:
_result[page.idnum] = len(_num_pages)
_setup_page_id_to_num(pdf, page.getObject(), _result, _num_pages)
elif t == "/Page":
_num_pages.append(1)
return _result
def outlines_pg_zoom_info(outlines, pg_id_num_map, result=None):
    """Collect title, position and page number for every bookmark.

    Recurses through the nested outline structure and records one entry
    per ``Destination``, keyed by the first whitespace-separated token of
    its title (e.g. the section number "1.1" of "1.1 Introduction").
    Bookmarks sharing the same first token overwrite one another.

    Args:
        outlines: A ``Destination``, or an arbitrarily nested list of them.
            Anything else is ignored.
        pg_id_num_map: Mapping from page object ``idnum`` to 0-based page
            index.
        result: Accumulator dict; a new one is created when omitted.

    Returns:
        dict mapping key -> ``{"title", "top", "left", "page"}`` where
        ``page`` is 1-based. ``top``/``left`` are None when the
        destination does not define them.

    Raises:
        KeyError: if a bookmark's page id is missing from
            ``pg_id_num_map``.
    """
    if result is None:
        result = {}
    if isinstance(outlines, list):
        for outline in outlines:
            # The same accumulator is threaded through every level.
            outlines_pg_zoom_info(outline, pg_id_num_map, result)
    elif isinstance(outlines, pyPdf.pdf.Destination):
        title = outlines['/Title']
        words = title.split()
        # Fall back to the raw title when it is empty or whitespace only,
        # instead of raising IndexError.
        key = words[0] if words else title
        result[key] = dict(
            title=title,
            # Only some destination types carry coordinates; use .get so
            # the others do not raise KeyError.
            top=outlines.get('/Top'),
            left=outlines.get('/Left'),
            page=pg_id_num_map[outlines.page.idnum] + 1,
        )
    return result
# main
pdf_name = 'document.pdf'
# Use a context manager so the PDF handle is closed even on error; all
# reader work stays inside the block because it needs the open stream.
with open(pdf_name, 'rb') as f:
    pdf = PdfFileReader(f)
    # map page ids to page numbers
    pg_id_num_map = _setup_page_id_to_num(pdf)
    outlines = pdf.getOutlines()
    bookmarks_info = outlines_pg_zoom_info(outlines, pg_id_num_map)
    print(bookmarks_info)
Run Code Online (Sandbox Code Playgroud)
注意:我的书签是章节编号(例如:1.1简介),我将书签信息映射到章节编号.如果您的书签不同,请修改以下代码:
elif type(outlines) == pyPdf.pdf.Destination:
title = outlines['/Title']
result[title.split()[0]] = dict(title=outlines['/Title'], top=outlines['/Top'], \
left=outlines['/Left'], page=(pg_id_num_map[outlines.page.idnum]+1))
Run Code Online (Sandbox Code Playgroud)
采用 vjayky 和 Giulio D 的建议,以递归方式处理(可能嵌套的)书签。
PyPDF2 >= v1.25
from PyPDF2 import PdfFileReader
def printBookmarksPageNumbers(pdf):
    """Print every bookmark of *pdf* with its 1-based page number.

    Nested bookmark lists are printed recursively, indented by four
    extra spaces per nesting level.

    Args:
        pdf: Reader object providing ``getOutlines()`` and
            ``getDestinationPageNumber(destination)``.
    """
    def reviewAndPrintBookmarks(bookmarks, indent=0):
        for b in bookmarks:
            # A list (or list subclass) holds the children of the
            # preceding bookmark; descend one level.
            if isinstance(b, list):
                reviewAndPrintBookmarks(b, indent + 4)
                continue
            pg_num = pdf.getDestinationPageNumber(b) + 1  # page count starts from 0
            print("%s%s: Page %s" % (" " * indent, b.title, pg_num))

    reviewAndPrintBookmarks(pdf.getOutlines())
# Keep all reader work inside the with-block: the reader needs the open
# stream, and the file is closed automatically on exit.
with open('document.pdf', "rb") as f:
    pdf = PdfFileReader(f)
    printBookmarksPageNumbers(pdf)
Run Code Online (Sandbox Code Playgroud)
PyPDF2 < v1.25
from PyPDF2 import PdfFileReader
def printBookmarksPageNumbers(pdf):
    """Print every bookmark of *pdf* with its 1-based page number.

    Builds a lookup from page object id to page index, then prints the
    outline recursively, indenting nested bookmarks by four extra spaces
    per nesting level.

    Args:
        pdf: Reader object providing ``getNumPages()``, ``getPage(i)``
            (each page exposing ``indirectRef.idnum``) and
            ``getOutlines()``.

    Raises:
        KeyError: if a bookmark refers to a page id that is not one of
            the document's pages.
    """
    # Map page ids to page numbers
    pg_id_to_num = {
        pdf.getPage(pg_num).indirectRef.idnum: pg_num
        for pg_num in range(pdf.getNumPages())
    }

    def reviewAndPrintBookmarks(bookmarks, indent=0):
        for b in bookmarks:
            # A list (or list subclass) holds the children of the
            # preceding bookmark; descend one level.
            if isinstance(b, list):
                reviewAndPrintBookmarks(b, indent + 4)
                continue
            pg_num = pg_id_to_num[b.page.idnum] + 1  # page count starts from 0
            print("%s%s: Page %s" % (" " * indent, b.title, pg_num))

    reviewAndPrintBookmarks(pdf.getOutlines())
# Keep all reader work inside the with-block: the reader needs the open
# stream, and the file is closed automatically on exit.
with open('document.pdf', "rb") as f:
    pdf = PdfFileReader(f)
    printBookmarksPageNumbers(pdf)
Run Code Online (Sandbox Code Playgroud)
输出示例(两种方法):
Bookmark 1: Page 1
    Bookmark 1.1: Page 2
    Bookmark 1.2: Page 3
Bookmark 2: Page 4
Bookmark 3: Page 5
    Bookmark 3.1: Page 6
Run Code Online (Sandbox Code Playgroud)