Merge pull request #1094 from dhdaines/structure_fixes

Handle missing ParentTree
jsvine · Feb 16, 2024 · 8912931
2 parents 1ad3905 + 3e74fb1
commit 8912931
Show file tree

Hide file tree

Showing 2 changed files with 32 additions and 15 deletions.
diff --git a/pdfplumber/structure.py b/pdfplumber/structure.py
@@ -86,24 +86,30 @@ def __init__(self, doc: "PDF", page: Optional["Page"] = None):
         # If we have a specific page then we will work backwards from
         # its ParentTree - this is because structure elements could
         # span multiple pages, and the "Pg" attribute is *optional*,
-        # so this is the approved way to get a page's structure
+        # so this is the approved way to get a page's structure...
         if page is not None:
             self.page = page.page_obj
             self.page_dict = None
-            parent_tree = NumberTree(self.root["ParentTree"])
-            # If there is no marked content in the structure tree for
-            # this page (which can happen even when there is a
-            # structure tree) then there is no `StructParents`.
-            # Note however that if there are XObjects in a page,
-            # *they* may have `StructParent` (not `StructParents`)
-            if "StructParents" not in self.page.attrs:
-                return
-            parent_id = self.page.attrs["StructParents"]
-            # NumberTree should have a `get` method like it does in pdf.js...
-            parent_array = resolve1(
-                next(array for num, array in parent_tree.values if num == parent_id)
-            )
-            self._parse_parent_tree(parent_array)
+            # ...EXCEPT that the ParentTree is sometimes missing, in which
+            # case we fall back to the non-approved way.
+            parent_tree_obj = self.root.get("ParentTree")
+            if parent_tree_obj is None:
+                self._parse_struct_tree()
+            else:
+                parent_tree = NumberTree(parent_tree_obj)
+                # If there is no marked content in the structure tree for
+                # this page (which can happen even when there is a
+                # structure tree) then there is no `StructParents`.
+                # Note however that if there are XObjects in a page,
+                # *they* may have `StructParent` (not `StructParents`)
+                if "StructParents" not in self.page.attrs:
+                    return
+                parent_id = self.page.attrs["StructParents"]
+                # NumberTree should have a `get` method like it does in pdf.js...
+                parent_array = resolve1(
+                    next(array for num, array in parent_tree.values if num == parent_id)
+                )
+                self._parse_parent_tree(parent_array)
         else:
             self.page = None
             # Overhead of creating pages shouldn't be too bad we hope!

diff --git a/tests/test_structure.py b/tests/test_structure.py
@@ -4,6 +4,8 @@
 import unittest
 from collections import deque
 
+from pdfminer.pdftypes import resolve1
+
 import pdfplumber
 from pdfplumber.structure import PDFStructTree
 
@@ -593,6 +595,7 @@ def test_structure_tree(self):
     }
 ]
 
+
 IMAGESTRUCT = [
     {
         "type": "Document",
@@ -894,6 +897,14 @@ def test_proces_verbal(self):
         page = pdf.pages[1]
         assert page.structure_tree == PVSTRUCT1
 
+    def test_missing_parenttree(self):
+        """Verify we can get structure without a ParentTree."""
+        path = os.path.join(HERE, "pdfs/2023-06-20-PV.pdf")
+        pdf = pdfplumber.open(path)
+        root = resolve1(pdf.doc.catalog["StructTreeRoot"])
+        del root["ParentTree"]
+        assert pdf.pages[1].structure_tree == PVSTRUCT1
+
     def test_image_structure(self):
         path = os.path.join(HERE, "pdfs/image_structure.pdf")