From 4b36de2ee17156657805d70cc1d3d130b76ac3d8 Mon Sep 17 00:00:00 2001 From: "Richard Smith (smir)" Date: Mon, 19 Aug 2024 15:18:05 +0100 Subject: [PATCH] Send fetch requests for all page dict lookups in parallel - When adding page dict candidates to the lookup tree, also initiate fetching them from xref, so if they are not yet loaded at all, the XHR will be sent - We can then await the cached Promise without forcing the requests to be sent one after another - This gives a significant performance improvement for load-on-demand (i.e. with auto-fetch turned off) when a PDF has a large number of pages in the top-level /Pages collection, and those pages are spread throughout the file, so every candidate needs to be fetched separately - PDFs with many pages, where each page is a big image and all the pages are at the top level, are quite a common output for digitisation programmes - I would have liked to do something like "if it's the top-level collection and page count = number of kids, then just fetch that page without traversing the tree", but unfortunately I agree with the comments on #8088 that there is no good general solution to allow for /Pages nodes with empty /Kids arrays - The other alternative for fixing this use case is to simply not validate the last page at all, so pages can be loaded on demand. But that validation was added for good reasons, and this would also result in a bad experience if you didn't read the document from the front. 
Or assume in certain conditions that the top level /Pages contains only pages (see https://github.com/mozilla/pdf.js/compare/master...richard-smith-preservica:pdf.js:rcs/assume-all-pages-in-top-level-when-likely?expand=1), but that allows for particular edge case 'bad' PDFs to render incorrectly eslint Review - Fix new promise side of fetch; local cache variable; validation on when to prefetch --- src/core/catalog.js | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/core/catalog.js b/src/core/catalog.js index 2c4551d3ab2c7a..74d199e54d2046 100644 --- a/src/core/catalog.js +++ b/src/core/catalog.js @@ -143,6 +143,7 @@ class Catalog { this.globalImageCache = new GlobalImageCache(); this.pageKidsCountCache = new RefSetCache(); this.pageIndexCache = new RefSetCache(); + this.pageDictCache = new RefSetCache(); this.nonBlendModesSet = new RefSet(); this.systemFontCache = new Map(); } @@ -1161,6 +1162,7 @@ class Catalog { this.globalImageCache.clear(/* onlyData = */ manuallyTriggered); this.pageKidsCountCache.clear(); this.pageIndexCache.clear(); + this.pageDictCache.clear(); this.nonBlendModesSet.clear(); const translatedFonts = await Promise.all(this.fontCache); @@ -1184,7 +1186,8 @@ class Catalog { } const xref = this.xref, pageKidsCountCache = this.pageKidsCountCache, - pageIndexCache = this.pageIndexCache; + pageIndexCache = this.pageIndexCache, + pageDictCache = this.pageDictCache; let currentPageIndex = 0; while (nodesToVisit.length) { @@ -1203,7 +1206,8 @@ class Catalog { } visitedNodes.put(currentNode); - const obj = await xref.fetchAsync(currentNode); + const obj = await (pageDictCache.get(currentNode) || + xref.fetchAsync(currentNode)); if (obj instanceof Dict) { let type = obj.getRaw("Type"); if (type instanceof Ref) { @@ -1285,7 +1289,14 @@ class Catalog { // node further down in the tree (see issue5644.pdf, issue8088.pdf), // and to ensure that we actually find the correct `Page` dict. 
for (let last = kids.length - 1; last >= 0; last--) { - nodesToVisit.push(kids[last]); + const lastKid = kids[last]; + nodesToVisit.push(lastKid); + + // Launch all requests in parallel so we don't wait for each one in turn + // when looking for a page near the end + if (lastKid instanceof Ref && !pageDictCache.has(lastKid)) { + pageDictCache.put(lastKid, xref.fetchAsync(lastKid)); + } } }