Skip to content

Commit

Permalink
Merge pull request #1094 from dhdaines/structure_fixes
Browse files: browse the repository at this point in the history
Handle missing ParentTree
  • Loading branch information
jsvine authored Feb 16, 2024
2 parents 1ad3905 + 3e74fb1 commit 8912931
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 15 deletions.
36 changes: 21 additions & 15 deletions pdfplumber/structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,24 +86,30 @@ def __init__(self, doc: "PDF", page: Optional["Page"] = None):
# If we have a specific page then we will work backwards from
# its ParentTree - this is because structure elements could
# span multiple pages, and the "Pg" attribute is *optional*,
# so this is the approved way to get a page's structure
# so this is the approved way to get a page's structure...
if page is not None:
self.page = page.page_obj
self.page_dict = None
parent_tree = NumberTree(self.root["ParentTree"])
# If there is no marked content in the structure tree for
# this page (which can happen even when there is a
# structure tree) then there is no `StructParents`.
# Note however that if there are XObjects in a page,
# *they* may have `StructParent` (not `StructParents`)
if "StructParents" not in self.page.attrs:
return
parent_id = self.page.attrs["StructParents"]
# NumberTree should have a `get` method like it does in pdf.js...
parent_array = resolve1(
next(array for num, array in parent_tree.values if num == parent_id)
)
self._parse_parent_tree(parent_array)
# ...EXCEPT that the ParentTree is sometimes missing, in which
# case we fall back to the non-approved way.
parent_tree_obj = self.root.get("ParentTree")
if parent_tree_obj is None:
self._parse_struct_tree()
else:
parent_tree = NumberTree(parent_tree_obj)
# If there is no marked content in the structure tree for
# this page (which can happen even when there is a
# structure tree) then there is no `StructParents`.
# Note however that if there are XObjects in a page,
# *they* may have `StructParent` (not `StructParents`)
if "StructParents" not in self.page.attrs:
return
parent_id = self.page.attrs["StructParents"]
# NumberTree should have a `get` method like it does in pdf.js...
parent_array = resolve1(
next(array for num, array in parent_tree.values if num == parent_id)
)
self._parse_parent_tree(parent_array)
else:
self.page = None
# The overhead of creating pages shouldn't be too bad, we hope!
Expand Down
11 changes: 11 additions & 0 deletions tests/test_structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import unittest
from collections import deque

from pdfminer.pdftypes import resolve1

import pdfplumber
from pdfplumber.structure import PDFStructTree

Expand Down Expand Up @@ -593,6 +595,7 @@ def test_structure_tree(self):
}
]


IMAGESTRUCT = [
{
"type": "Document",
Expand Down Expand Up @@ -894,6 +897,14 @@ def test_proces_verbal(self):
page = pdf.pages[1]
assert page.structure_tree == PVSTRUCT1

def test_missing_parenttree(self):
    """Verify we can get structure without a ParentTree.

    Deletes the ``ParentTree`` entry from the document's resolved
    ``StructTreeRoot`` and checks that the structure tree of
    ``pdf.pages[1]`` still equals ``PVSTRUCT1``.
    """
    path = os.path.join(HERE, "pdfs/2023-06-20-PV.pdf")
    # Use the context manager so the underlying file is closed even if
    # the assertion below fails.
    with pdfplumber.open(path) as pdf:
        root = resolve1(pdf.doc.catalog["StructTreeRoot"])
        # Remove the entry on the resolved object to simulate a PDF that
        # lacks a ParentTree; a KeyError here would mean the fixture no
        # longer has one to remove.
        del root["ParentTree"]
        assert pdf.pages[1].structure_tree == PVSTRUCT1

def test_image_structure(self):
path = os.path.join(HERE, "pdfs/image_structure.pdf")

Expand Down

0 comments on commit 8912931

Please sign in to comment.