Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Regression] Eagerly fetch/parse the entire /Pages-tree in corrupt documents (issue 14303, PR 14311 follow-up) #14335

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 87 additions & 4 deletions src/core/catalog.js
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,16 @@ import {
isRefsEqual,
isStream,
Name,
Ref,
RefSet,
RefSetCache,
} from "./primitives.js";
import {
collectActions,
MissingDataException,
PageDictMissingException,
recoverJsURL,
toRomanNumerals,
XRefEntryException,
} from "./core_utils.js";
import {
createPromiseCapability,
Expand Down Expand Up @@ -1212,14 +1213,96 @@ class Catalog {
nodesToVisit.push(kids[last]);
}
}
capability.reject(
new PageDictMissingException(`Page index ${pageIndex} not found.`)
);
capability.reject(new Error(`Page index ${pageIndex} not found.`));
}
next();
return capability.promise;
}

/**
* Eagerly fetches the entire /Pages-tree; should ONLY be used as a fallback.
* @returns {Map}
*/
getAllPageDicts() {
const queue = [{ currentNode: this.toplevelPagesDict, posInKids: 0 }];
const visitedNodes = new RefSet();
const map = new Map();
let pageIndex = 0;

function addPageDict(pageDict, pageRef) {
map.set(pageIndex++, [pageDict, pageRef]);
}
function addPageError(msg) {
map.set(pageIndex++, [new FormatError(msg), null]);
}

while (queue.length > 0) {
const queueItem = queue[queue.length - 1];
const { currentNode, posInKids } = queueItem;

let kids;
try {
kids = currentNode.get("Kids");
} catch (ex) {
if (ex instanceof MissingDataException) {
throw ex;
}
if (ex instanceof XRefEntryException) {
throw ex;
}
}
if (!Array.isArray(kids)) {
addPageError("Page dictionary kids object is not an array.");
break;
}

if (posInKids >= kids.length) {
queue.pop();
continue;
}

const kidObj = kids[posInKids];
let obj;
if (kidObj instanceof Ref) {
try {
obj = this.xref.fetch(kidObj);
} catch (ex) {
if (ex instanceof MissingDataException) {
throw ex;
}
if (ex instanceof XRefEntryException) {
throw ex;
}
}
// Prevent circular references in the /Pages tree.
if (visitedNodes.has(kidObj)) {
addPageError("Pages tree contains circular reference.");
break;
}
visitedNodes.put(kidObj);
} else {
// Prevent errors in corrupt PDF documents that violate the
// specification by *inlining* Page dicts directly in the Kids
// array, rather than using indirect objects (see issue9540.pdf).
obj = kidObj;
}
if (!(obj instanceof Dict)) {
addPageError(
"Page dictionary kid reference points to wrong type of object."
);
break;
}

if (isDict(obj, "Page") || !obj.has("Kids")) {
addPageDict(obj, kidObj instanceof Ref ? kidObj : null);
} else {
queue.push({ currentNode: obj, posInKids: 0 });
}
queueItem.posInKids++;
}
return map;
}

getPageIndex(pageRef) {
const cachedPageIndex = this.pageIndexCache.get(pageRef);
if (cachedPageIndex !== undefined) {
Expand Down
7 changes: 0 additions & 7 deletions src/core/core_utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -60,12 +60,6 @@ class MissingDataException extends BaseException {
}
}

// Exception subclass that forwards its message, together with the string
// "PageDictMissingException", to BaseException (presumably the second
// argument is used as the exception name -- confirm against BaseException).
// NOTE(review): the diff header for this file reads "0 additions & 7
// deletions", so this class appears to be on the removed side of the
// change -- confirm before relying on it.
class PageDictMissingException extends BaseException {
constructor(msg) {
super(msg, "PageDictMissingException");
}
}

class ParserEOFException extends BaseException {
constructor(msg) {
super(msg, "ParserEOFException");
Expand Down Expand Up @@ -547,7 +541,6 @@ export {
isWhiteSpace,
log2,
MissingDataException,
PageDictMissingException,
ParserEOFException,
parseXFAPath,
readInt8,
Expand Down
73 changes: 49 additions & 24 deletions src/core/document.js
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@ import {
getInheritableProperty,
isWhiteSpace,
MissingDataException,
PageDictMissingException,
validateCSSFont,
XRefEntryException,
XRefParseException,
Expand Down Expand Up @@ -1354,28 +1353,30 @@ class PDFDocument {
}

async checkLastPage(recoveryMode = false) {
this.catalog.setActualNumPages(); // Ensure that it's always reset.
const { catalog, pdfManager } = this;

catalog.setActualNumPages(); // Ensure that it's always reset.
let numPages;

try {
await Promise.all([
this.pdfManager.ensureDoc("xfaFactory"),
this.pdfManager.ensureDoc("linearization"),
this.pdfManager.ensureCatalog("numPages"),
pdfManager.ensureDoc("xfaFactory"),
pdfManager.ensureDoc("linearization"),
pdfManager.ensureCatalog("numPages"),
]);

if (this.xfaFactory) {
return; // The Page count is always calculated for XFA-documents.
} else if (this.linearization) {
numPages = this.linearization.numPages;
} else {
numPages = this.catalog.numPages;
numPages = catalog.numPages;
}

if (numPages === 1) {
return;
} else if (!Number.isInteger(numPages)) {
if (!Number.isInteger(numPages)) {
throw new FormatError("Page count is not an integer.");
} else if (numPages <= 1) {
return;
}
await this.getPage(numPages - 1);
} catch (reason) {
Expand All @@ -1385,24 +1386,48 @@ class PDFDocument {
// subsequent `this.getPage` calls.
await this.cleanup();

let pageIndex = 1; // The first page was already loaded.
while (true) {
try {
await this.getPage(pageIndex);
} catch (reasonLoop) {
if (reasonLoop instanceof PageDictMissingException) {
break;
}
if (reasonLoop instanceof XRefEntryException) {
if (!recoveryMode) {
throw new XRefParseException();
}
break;
let pagesTree;
try {
pagesTree = await pdfManager.ensureCatalog("getAllPageDicts");
} catch (reasonAll) {
if (reasonAll instanceof XRefEntryException) {
if (!recoveryMode) {
throw new XRefParseException();
}
}
pageIndex++;
catalog.setActualNumPages(1);
return;
}

for (const [pageIndex, [pageDict, ref]] of pagesTree) {
let promise;
if (pageDict instanceof Error) {
promise = Promise.reject(pageDict);

// Prevent "uncaught exception: Object"-messages in the console.
promise.catch(() => {});
} else {
promise = Promise.resolve(
new Page({
pdfManager,
xref: this.xref,
pageIndex,
pageDict,
ref,
globalIdFactory: this._globalIdFactory,
fontCache: catalog.fontCache,
builtInCMapCache: catalog.builtInCMapCache,
standardFontDataCache: catalog.standardFontDataCache,
globalImageCache: catalog.globalImageCache,
nonBlendModesSet: catalog.nonBlendModesSet,
xfaFactory: null,
})
);
}

this._pagePromises.set(pageIndex, promise);
}
this.catalog.setActualNumPages(pageIndex);
catalog.setActualNumPages(pagesTree.size);
}
}

Expand Down
2 changes: 2 additions & 0 deletions test/pdfs/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -492,6 +492,8 @@
!xfa_issue14315.pdf
!poppler-67295-0.pdf
!poppler-85140-0.pdf
!poppler-395-0-fuzzed.pdf
!GHOSTSCRIPT-698804-1-fuzzed.pdf
!poppler-91414-0-53.pdf
!poppler-91414-0-54.pdf
!poppler-742-0-fuzzed.pdf
Expand Down
Binary file added test/pdfs/GHOSTSCRIPT-698804-1-fuzzed.pdf
Binary file not shown.
Binary file added test/pdfs/poppler-395-0-fuzzed.pdf
Binary file not shown.
35 changes: 35 additions & 0 deletions test/unit/api_spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -495,14 +495,27 @@ describe("api", function () {
const loadingTask2 = getDocument(
buildGetDocumentParams("poppler-85140-0.pdf")
);
const loadingTask3 = getDocument(
buildGetDocumentParams("poppler-395-0-fuzzed.pdf")
);
const loadingTask4 = getDocument(
buildGetDocumentParams("GHOSTSCRIPT-698804-1-fuzzed.pdf")
);

expect(loadingTask1 instanceof PDFDocumentLoadingTask).toEqual(true);
expect(loadingTask2 instanceof PDFDocumentLoadingTask).toEqual(true);
expect(loadingTask3 instanceof PDFDocumentLoadingTask).toEqual(true);
expect(loadingTask4 instanceof PDFDocumentLoadingTask).toEqual(true);

const pdfDocument1 = await loadingTask1.promise;
const pdfDocument2 = await loadingTask2.promise;
const pdfDocument3 = await loadingTask3.promise;
const pdfDocument4 = await loadingTask4.promise;

expect(pdfDocument1.numPages).toEqual(1);
expect(pdfDocument2.numPages).toEqual(1);
expect(pdfDocument3.numPages).toEqual(1);
expect(pdfDocument4.numPages).toEqual(1);

const pageA = await pdfDocument1.getPage(1);
expect(pageA instanceof PDFPageProxy).toEqual(true);
Expand All @@ -516,6 +529,28 @@ describe("api", function () {
expect(reason instanceof UnknownErrorException).toEqual(true);
expect(reason.message).toEqual("Bad (uncompressed) XRef entry: 3R");
}
try {
await pdfDocument3.getPage(1);

// Shouldn't get here.
expect(false).toEqual(true);
} catch (reason) {
expect(reason instanceof UnknownErrorException).toEqual(true);
expect(reason.message).toEqual(
"Page dictionary kid reference points to wrong type of object."
);
}
try {
await pdfDocument4.getPage(1);

// Shouldn't get here.
expect(false).toEqual(true);
} catch (reason) {
expect(reason instanceof UnknownErrorException).toEqual(true);
expect(reason.message).toEqual(
"Page dictionary kid reference points to wrong type of object."
);
}

await Promise.all([loadingTask1.destroy(), loadingTask2.destroy()]);
});
Expand Down