Skip to content

Commit

Permalink
Allow specifying custom match logic in PDFFindController
Browse files Browse the repository at this point in the history
This patch allows embedders of PDF.js to provide custom match
logic for searching in PDFs. This is done by subclassing the
PDFFindController class and overriding the `match` method.

`match` is called once per PDF page, receives as parameters the
search query, the page contents, and the page index, and returns
an array of { index, length } objects representing the search
results.
  • Loading branch information
nicolo-ribaudo committed Aug 4, 2024
1 parent b80e552 commit 42b2f48
Show file tree
Hide file tree
Showing 2 changed files with 160 additions and 63 deletions.
87 changes: 85 additions & 2 deletions test/unit/pdf_find_controller_spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@ class MockLinkService extends SimpleLinkService {

async function initPdfFindController(
filename,
updateMatchesCountOnProgress = true
updateMatchesCountOnProgress = true,
matcher = undefined
) {
const loadingTask = getDocument(
buildGetDocumentParams(filename || tracemonkeyFileName, {
Expand All @@ -65,7 +66,13 @@ async function initPdfFindController(
const linkService = new MockLinkService();
linkService.setDocument(pdfDocument);

const pdfFindController = new PDFFindController({
let FindControllerClass = PDFFindController;
if (matcher !== undefined) {
FindControllerClass = class extends PDFFindController {};
FindControllerClass.prototype.match = matcher;
}

const pdfFindController = new FindControllerClass({
linkService,
eventBus,
updateMatchesCountOnProgress,
Expand Down Expand Up @@ -1054,4 +1061,80 @@ describe("pdf_find_controller", function () {
const { eventBus } = await initPdfFindController();
await testOnFind({ eventBus });
});

describe("custom matcher", () => {
  // Both specs search for the same text; the results are decided by the
  // custom matcher that is installed, not by the query itself.
  const QUERY = "Foo bar";

  it("calls to the matcher with the right arguments", async () => {
    const PAGES_COUNT = 14;

    // Report exactly one match per call, regardless of the arguments.
    const matcherSpy = jasmine.createSpy("custom find matcher");
    matcherSpy.and.callFake(() => [{ index: 0, length: 1 }]);

    // `null` selects the default test document; `false` is passed as
    // `updateMatchesCountOnProgress`.
    const { eventBus, pdfFindController } = await initPdfFindController(
      null,
      false,
      matcherSpy
    );

    await testSearch({
      eventBus,
      pdfFindController,
      state: { query: QUERY },
      selectedMatch: { pageIndex: 0, matchIndex: 0 },
      matchesPerPage: new Array(PAGES_COUNT).fill(1),
    });

    expect(matcherSpy).toHaveBeenCalledTimes(PAGES_COUNT);

    // The first argument is always the query and the third one is the index
    // of the page being searched, in page order.
    for (const pageIndex of new Array(PAGES_COUNT).keys()) {
      const [calledQuery, , calledPageIndex] =
        matcherSpy.calls.argsFor(pageIndex);
      const context = `page ${pageIndex}`;
      expect(calledQuery).withContext(context).toBe(QUERY);
      expect(calledPageIndex).withContext(context).toBe(pageIndex);
    }

    // The second argument is the text content of the page; spot-check a few.
    const expectedPageContents = [
      [0, /^Trace-based /],
      [1, /^Hence, recording and /],
      [12, /Figure 12. Fraction of time /],
      [13, /^not be interpreted as /],
    ];
    for (const [pageIndex, pattern] of expectedPageContents) {
      expect(matcherSpy.calls.argsFor(pageIndex)[1]).toMatch(pattern);
    }
  });

  it("uses the results returned by the custom matcher", async () => {
    // Canned results, keyed by the page index the matcher is called with.
    const resultsByPage = [
      [
        0,
        [
          { index: 20, length: 3 },
          { index: 50, length: 8 },
        ],
      ],
      [2, [{ index: 7, length: 19 }]],
      [
        13,
        [
          { index: 50, length: 2 },
          { index: 54, length: 9 },
          { index: 80, length: 4 },
        ],
      ],
    ];

    // Pages without canned results fall back to `undefined` (no matches).
    const matcherSpy = jasmine.createSpy("custom find matcher");
    matcherSpy.and.returnValue(undefined);
    for (const [pageIndex, results] of resultsByPage) {
      matcherSpy
        .withArgs(QUERY, jasmine.anything(), pageIndex)
        .and.returnValue(results);
    }

    const { eventBus, pdfFindController } = await initPdfFindController(
      null,
      false,
      matcherSpy
    );

    await testSearch({
      eventBus,
      pdfFindController,
      state: { query: QUERY },
      selectedMatch: { pageIndex: 0, matchIndex: 0 },
      matchesPerPage: [2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3],
    });
  });
});
});
136 changes: 75 additions & 61 deletions web/pdf_find_controller.js
Original file line number Diff line number Diff line change
Expand Up @@ -670,37 +670,6 @@ class PDFFindController {
return true;
}

#calculateRegExpMatch(query, entireWord, pageIndex, pageContent) {
const matches = (this._pageMatches[pageIndex] = []);
const matchesLength = (this._pageMatchesLength[pageIndex] = []);
if (!query) {
// The query can be empty because some chars like diacritics could have
// been stripped out.
return;
}
const diffs = this._pageDiffs[pageIndex];
let match;
while ((match = query.exec(pageContent)) !== null) {
if (
entireWord &&
!this.#isEntireWord(pageContent, match.index, match[0].length)
) {
continue;
}

const [matchPos, matchLen] = getOriginalIndex(
diffs,
match.index,
match[0].length
);

if (matchLen) {
matches.push(matchPos);
matchesLength.push(matchLen);
}
}
}

#convertToRegExpString(query, hasDiacritics) {
const { matchDiacritics } = this.#state;
let isUnicode = false;
Expand Down Expand Up @@ -771,13 +740,65 @@ class PDFFindController {
return [isUnicode, query];
}

#calculateMatch(pageIndex) {
let query = this.#query;
/**
 * Computes the matches for one page: asks `this.match` for the results,
 * stores their converted positions/lengths for the page, and then refreshes
 * highlighting, pending navigation and the match count.
 *
 * @param {number} pageIndex - The index of the page to search.
 * @returns {Promise<undefined>} Settles once the page has been processed.
 */
async #calculateMatch(pageIndex) {
const query = this.#query;
if (query.length === 0) {
return; // Do nothing: the matches should be wiped out already.
}
const { caseSensitive, entireWord } = this.#state;
const pageContent = this._pageContents[pageIndex];
// `match` may return its results directly or as a promise, hence the `await`.
const matcherResult = await this.match(query, pageContent, pageIndex);

// Reset the stored results for this page before filling them in again.
const matches = (this._pageMatches[pageIndex] = []);
const matchesLength = (this._pageMatchesLength[pageIndex] = []);
const diffs = this._pageDiffs[pageIndex];

// `match` is allowed to return `undefined`, in which case the page simply
// ends up with no matches.
matcherResult?.forEach(({ index, length }) => {
// Convert each result through the page's diffs; results whose converted
// length is falsy are dropped.
const [matchPos, matchLen] = getOriginalIndex(diffs, index, length);
if (matchLen) {
matches.push(matchPos);
matchesLength.push(matchLen);
}
});

// When `highlightAll` is set, ensure that the matches on previously
// rendered (and still active) pages are correctly highlighted.
if (this.#state.highlightAll) {
this.#updatePage(pageIndex);
}
// NOTE(review): presumably `_resumePageIdx` marks a page whose results a
// pending navigation was waiting for; confirm against `#nextPageMatch`.
if (this._resumePageIdx === pageIndex) {
this._resumePageIdx = null;
this.#nextPageMatch();
}

// Update the match count.
const pageMatchesCount = this._pageMatches[pageIndex].length;
this._matchesCountTotal += pageMatchesCount;
if (this.#updateMatchesCountOnProgress) {
if (pageMatchesCount > 0) {
this.#updateUIResultsCount();
}
// Otherwise count this page as visited, and only update the UI once every
// page of the document has been visited.
} else if (++this.#visitedPagesCount === this._linkService.pagesCount) {
// For example, in GeckoView we want to have only the final update because
// the Java side provides only one object to update the counts.
this.#updateUIResultsCount();
}
}

/**
* @typedef {Object} SingleFindMatch
* @property {number} index - The start of the matched text in the page's
* string contents.
* @property {number} length - The length of the matched text.
*/

/**
* @param {string | string[]} query - The search query.
* @param {string} pageContent - The text content of the page to search in.
* @param {number} pageIndex - The index of the page that is being processed.
* @returns {Promise<SingleFindMatch[]> | SingleFindMatch[] | undefined} An
* array of matches in the provided page.
*/
match(query, pageContent, pageIndex) {
const hasDiacritics = this._hasDiacritics[pageIndex];

let isUnicode = false;
Expand All @@ -799,34 +820,28 @@ class PDFFindController {
})
.join("|");
}
if (!query) {
// The query can be empty because some chars like diacritics could have
// been stripped out.
return undefined;
}

const { caseSensitive, entireWord } = this.#state;
const flags = `g${isUnicode ? "u" : ""}${caseSensitive ? "" : "i"}`;
query = query ? new RegExp(query, flags) : null;

this.#calculateRegExpMatch(query, entireWord, pageIndex, pageContent);

// When `highlightAll` is set, ensure that the matches on previously
// rendered (and still active) pages are correctly highlighted.
if (this.#state.highlightAll) {
this.#updatePage(pageIndex);
}
if (this._resumePageIdx === pageIndex) {
this._resumePageIdx = null;
this.#nextPageMatch();
}
query = new RegExp(query, flags);

// Update the match count.
const pageMatchesCount = this._pageMatches[pageIndex].length;
this._matchesCountTotal += pageMatchesCount;
if (this.#updateMatchesCountOnProgress) {
if (pageMatchesCount > 0) {
this.#updateUIResultsCount();
const matches = [];
let match;
while ((match = query.exec(pageContent)) !== null) {
if (
entireWord &&
!this.#isEntireWord(pageContent, match.index, match[0].length)
) {
continue;
}
} else if (++this.#visitedPagesCount === this._linkService.pagesCount) {
// For example, in GeckoView we want to have only the final update because
// the Java side provides only one object to update the counts.
this.#updateUIResultsCount();
matches.push({ index: match.index, length: match[0].length });
}
return matches;
}

#extractText() {
Expand Down Expand Up @@ -930,10 +945,9 @@ class PDFFindController {
continue;
}
this._pendingFindMatches.add(i);
this._extractTextPromises[i].then(() => {
this._pendingFindMatches.delete(i);
this.#calculateMatch(i);
});
this._extractTextPromises[i]
.then(() => this.#calculateMatch(i))
.finally(() => this._pendingFindMatches.delete(i));
}
}

Expand Down

0 comments on commit 42b2f48

Please sign in to comment.