From 4b36de2ee17156657805d70cc1d3d130b76ac3d8 Mon Sep 17 00:00:00 2001
From: "Richard Smith (smir)" <smir@preservica.com>
Date: Mon, 19 Aug 2024 15:18:05 +0100
Subject: [PATCH] Send fetch requests for all page dict lookups in parallel -
 When adding page dict candidates to the lookup tree, also initiate fetching
 them from xref, so that the XHR is sent right away if they are not yet loaded
 - We can then await the cached Promise without the requests being serialized
 - This is a significant performance improvement for load-on-demand (i.e.
 with auto-fetch turned off) when a PDF has a large number of pages in the top
 level /Pages collection, and those pages are spread through a file, so every
 candidate needs to be fetched separately  - PDFs with many pages where each
 page is a big image and all the pages are at the top level are quite a common
 output for digitisation programmes - I would have liked to do something like
 "if it's the top level collection and page count = number of kids, then just
 fetch that page without traversing the tree" but unfortunately I agree with
 comments on #8088 that there is no good general solution to allow for /Pages
 nodes with empty /Kids arrays - The other alternative for fixing this use
 case is to simply not validate the last page at all, so pages can be loaded
 on demand. But that validation was added for good reasons, and this would
 also result in a bad experience if you didn't read the document from the
 front. Another option would be to assume, under certain conditions, that the
 top level /Pages contains only pages (see
 https://github.com/mozilla/pdf.js/compare/master...richard-smith-preservica:pdf.js:rcs/assume-all-pages-in-top-level-when-likely?expand=1),
 but that would allow particular edge-case 'bad' PDFs to render incorrectly

eslint

Review - Fix new promise side of fetch; local cache variable; validation on when to prefetch
---
 src/core/catalog.js | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/src/core/catalog.js b/src/core/catalog.js
index 2c4551d3ab2c7a..74d199e54d2046 100644
--- a/src/core/catalog.js
+++ b/src/core/catalog.js
@@ -143,6 +143,7 @@ class Catalog {
     this.globalImageCache = new GlobalImageCache();
     this.pageKidsCountCache = new RefSetCache();
     this.pageIndexCache = new RefSetCache();
+    this.pageDictCache = new RefSetCache();
     this.nonBlendModesSet = new RefSet();
     this.systemFontCache = new Map();
   }
@@ -1161,6 +1162,7 @@ class Catalog {
     this.globalImageCache.clear(/* onlyData = */ manuallyTriggered);
     this.pageKidsCountCache.clear();
     this.pageIndexCache.clear();
+    this.pageDictCache.clear();
     this.nonBlendModesSet.clear();
 
     const translatedFonts = await Promise.all(this.fontCache);
@@ -1184,7 +1186,8 @@ class Catalog {
     }
     const xref = this.xref,
       pageKidsCountCache = this.pageKidsCountCache,
-      pageIndexCache = this.pageIndexCache;
+      pageIndexCache = this.pageIndexCache,
+      pageDictCache = this.pageDictCache;
     let currentPageIndex = 0;
 
     while (nodesToVisit.length) {
@@ -1203,7 +1206,8 @@ class Catalog {
         }
         visitedNodes.put(currentNode);
 
-        const obj = await xref.fetchAsync(currentNode);
+        const obj = await (pageDictCache.get(currentNode) ||
+          xref.fetchAsync(currentNode));
         if (obj instanceof Dict) {
           let type = obj.getRaw("Type");
           if (type instanceof Ref) {
@@ -1285,7 +1289,14 @@ class Catalog {
       // node further down in the tree (see issue5644.pdf, issue8088.pdf),
       // and to ensure that we actually find the correct `Page` dict.
       for (let last = kids.length - 1; last >= 0; last--) {
-        nodesToVisit.push(kids[last]);
+        const lastKid = kids[last];
+        nodesToVisit.push(lastKid);
+
+        // Launch all requests in parallel so we don't wait for each one in turn
+        // when looking for a page near the end
+        if (lastKid instanceof Ref && !pageDictCache.has(lastKid)) {
+          pageDictCache.put(lastKid, xref.fetchAsync(lastKid));
+        }
       }
     }