Skip to content

Commit

Permalink
Fix layout-caching issue (#1097) caused by 0bfffc2
Browse files Browse the repository at this point in the history
Layout wasn't actually getting cached, now it is.
  • Loading branch information
jsvine committed Mar 3, 2024
1 parent 8912931 commit efca277
Showing 1 changed file with 12 additions and 22 deletions.
34 changes: 12 additions & 22 deletions pdfplumber/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,13 @@ def normalize_color(
return separate_pattern(tuplefied)


def tuplify_list_kwargs(kwargs: Dict[str, Any]) -> Dict[str, Any]:
return {
key: (tuple(value) if isinstance(value, list) else value)
for key, value in kwargs.items()
}


class PDFPageAggregatorWithMarkedContent(PDFPageAggregator):
"""Extract layout from a specific page, adding marked-content IDs to
objects where found."""
Expand Down Expand Up @@ -169,23 +176,6 @@ def paint_path(self, *args, **kwargs) -> None: # type: ignore
self.tag_cur_item()


def textmap_cacher(func: Callable[..., TextMap]) -> Callable[..., TextMap]:
"""
Caches the kwargs to Page._get_textmap
and converts list kwargs (which would be unhashable) into tuples.
"""

def new_func(**kwargs: Any) -> TextMap:
return lru_cache()(func)(
**{
key: (tuple(value) if isinstance(value, list) else value)
for key, value in kwargs.items()
}
)

return new_func


def _normalize_box(box_raw: T_bbox, rotation: T_num = 0) -> T_bbox:
# Per PDF Reference 3.8.4: "Note: Although rectangles are
# conventionally specified by their lower-left and upperright
Expand Down Expand Up @@ -251,7 +241,7 @@ def get_attr(key: str, default: Any = None) -> Any:
self.bbox = self.mediabox

# See https://rednafi.com/python/lru_cache_on_methods/
self.get_textmap = textmap_cacher(self._get_textmap)
self.get_textmap = lru_cache()(self._get_textmap)

def close(self) -> None:
self.flush_cache()
Expand Down Expand Up @@ -493,7 +483,7 @@ def search(
return_groups: bool = True,
**kwargs: Any,
) -> List[Dict[str, Any]]:
textmap = self.get_textmap(**kwargs)
textmap = self.get_textmap(**tuplify_list_kwargs(kwargs))
return textmap.search(
pattern,
regex=regex,
Expand All @@ -504,7 +494,7 @@ def search(
)

def extract_text(self, **kwargs: Any) -> str:
return self.get_textmap(**kwargs).as_string
return self.get_textmap(**tuplify_list_kwargs(kwargs)).as_string

def extract_text_simple(self, **kwargs: Any) -> str:
return utils.extract_text_simple(self.chars, **kwargs)
Expand All @@ -515,7 +505,7 @@ def extract_words(self, **kwargs: Any) -> T_obj_list:
def extract_text_lines(
self, strip: bool = True, return_chars: bool = True, **kwargs: Any
) -> T_obj_list:
return self.get_textmap(**kwargs).extract_text_lines(
return self.get_textmap(**tuplify_list_kwargs(kwargs)).extract_text_lines(
strip=strip, return_chars=return_chars
)

Expand Down Expand Up @@ -625,7 +615,7 @@ def __init__(self, parent_page: Page):
self.mediabox = parent_page.mediabox
self.cropbox = parent_page.cropbox
self.flush_cache(Container.cached_properties)
self.get_textmap = textmap_cacher(self._get_textmap)
self.get_textmap = lru_cache()(self._get_textmap)


def test_proposed_bbox(bbox: T_bbox, parent_bbox: T_bbox) -> None:
Expand Down

2 comments on commit efca277

@indigoviolet
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jsvine is it expected that this or Fix Page.get_textmap caching for extra_attrs=[...] · jsvine/pdfplumber@0bfffc2 caused extra_attrs to be respected when it wasn't previously?

@jsvine
Copy link
Owner Author

@jsvine jsvine commented on efca277 Mar 25, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't believe so, otherwise the relevant unit tests should/would have failed. But let me know if you see evidence otherwise, and I can look into it.

Please sign in to comment.