Skip to content

Commit

Permalink
[api-minor] Include the document /Lang attribute in the textContent-data
Browse files Browse the repository at this point in the history
 - These changes will allow a simpler way of implementing PR 17770.

 - The /Lang attribute is fetched lazily, on the first `getTextContent` invocation. Given the existing worker-thread caching, this will thus only need to be done *once* per PDF document (and most PDFs don't include this data).

 - This makes the /Lang attribute *directly available* in the `textLayer`, which has the following advantages:
    - We don't need to block, and thus delay, overall viewer initialization on fetching it (nor pass it around throughout the viewer).

    - Third-party users of the `textLayer` will automatically benefit from this, once we start actually using the /Lang attribute in PR 17770.
      *Please note:* This also, importantly, means that the `text` reference-tests will then cover this code (which wouldn't otherwise have been the case).
  • Loading branch information
Snuffleupagus committed May 3, 2024
1 parent 1b811ac commit 0ac822e
Show file tree
Hide file tree
Showing 6 changed files with 60 additions and 45 deletions.
2 changes: 1 addition & 1 deletion src/core/catalog.js
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ class Catalog {
return shadow(
this,
"lang",
typeof lang === "string" ? stringToPDFString(lang) : null
lang && typeof lang === "string" ? stringToPDFString(lang) : null
);
}

Expand Down
59 changes: 31 additions & 28 deletions src/core/document.js
Original file line number Diff line number Diff line change
Expand Up @@ -392,10 +392,9 @@ class Page {
}

loadResources(keys) {
if (!this.resourcesPromise) {
// TODO: add async `_getInheritableProperty` and remove this.
this.resourcesPromise = this.pdfManager.ensure(this, "resources");
}
// TODO: add async `_getInheritableProperty` and remove this.
this.resourcesPromise ||= this.pdfManager.ensure(this, "resources");

return this.resourcesPromise.then(() => {
const objectLoader = new ObjectLoader(this.resources, keys, this.xref);
return objectLoader.load();
Expand Down Expand Up @@ -622,7 +621,7 @@ class Page {
});
}

extractTextContent({
async extractTextContent({
handler,
task,
includeMarkedContent,
Expand All @@ -636,31 +635,35 @@ class Page {
"Properties",
"XObject",
]);
const langPromise = this.pdfManager.ensureCatalog("lang");

const dataPromises = Promise.all([contentStreamPromise, resourcesPromise]);
return dataPromises.then(([contentStream]) => {
const partialEvaluator = new PartialEvaluator({
xref: this.xref,
handler,
pageIndex: this.pageIndex,
idFactory: this._localIdFactory,
fontCache: this.fontCache,
builtInCMapCache: this.builtInCMapCache,
standardFontDataCache: this.standardFontDataCache,
globalImageCache: this.globalImageCache,
systemFontCache: this.systemFontCache,
options: this.evaluatorOptions,
});
const [contentStream, , lang] = await Promise.all([
contentStreamPromise,
resourcesPromise,
langPromise,
]);
const partialEvaluator = new PartialEvaluator({
xref: this.xref,
handler,
pageIndex: this.pageIndex,
idFactory: this._localIdFactory,
fontCache: this.fontCache,
builtInCMapCache: this.builtInCMapCache,
standardFontDataCache: this.standardFontDataCache,
globalImageCache: this.globalImageCache,
systemFontCache: this.systemFontCache,
options: this.evaluatorOptions,
});

return partialEvaluator.getTextContent({
stream: contentStream,
task,
resources: this.resources,
includeMarkedContent,
disableNormalization,
sink,
viewBox: this.view,
});
return partialEvaluator.getTextContent({
stream: contentStream,
task,
resources: this.resources,
includeMarkedContent,
disableNormalization,
sink,
viewBox: this.view,
lang,
});
}

Expand Down
3 changes: 3 additions & 0 deletions src/core/evaluator.js
Original file line number Diff line number Diff line change
Expand Up @@ -2311,6 +2311,7 @@ class PartialEvaluator {
sink,
seenStyles = new Set(),
viewBox,
lang = null,
markedContentData = null,
disableNormalization = false,
keepWhiteSpace = false,
Expand All @@ -2327,6 +2328,7 @@ class PartialEvaluator {
const textContent = {
items: [],
styles: Object.create(null),
lang,
};
const textContentItem = {
initialized: false,
Expand Down Expand Up @@ -3300,6 +3302,7 @@ class PartialEvaluator {
sink: sinkWrapper,
seenStyles,
viewBox,
lang,
markedContentData,
disableNormalization,
keepWhiteSpace,
Expand Down
3 changes: 3 additions & 0 deletions src/display/api.js
Original file line number Diff line number Diff line change
Expand Up @@ -1160,6 +1160,7 @@ class PDFDocumentProxy {
* items are included when includeMarkedContent is true.
* @property {Object<string, TextStyle>} styles - {@link TextStyle} objects,
* indexed by font name.
* @property {string | null} lang - The document /Lang attribute.
*/

/**
Expand Down Expand Up @@ -1671,6 +1672,7 @@ class PDFPageProxy {
resolve(textContent);
return;
}
textContent.lang ??= value.lang;
Object.assign(textContent.styles, value.styles);
textContent.items.push(...value.items);
pump();
Expand All @@ -1681,6 +1683,7 @@ class PDFPageProxy {
const textContent = {
items: [],
styles: Object.create(null),
lang: undefined,
};
pump();
});
Expand Down
32 changes: 18 additions & 14 deletions src/display/text_layer.js
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ const DEFAULT_FONT_ASCENT = 0.8;
const ascentCache = new Map();
let _canvasContext = null;

function getCtx() {
function getCtx(lang = null) {
if (!_canvasContext) {
// We don't use an OffscreenCanvas here because we use serif/sans serif
// fonts with it and they depends on the locale.
Expand All @@ -89,13 +89,13 @@ function cleanupTextLayer() {
_canvasContext = null;
}

function getAscent(fontFamily) {
function getAscent(fontFamily, lang) {
const cachedAscent = ascentCache.get(fontFamily);
if (cachedAscent) {
return cachedAscent;
}

const ctx = getCtx();
const ctx = getCtx(lang);

const savedFont = ctx.font;
ctx.canvas.width = ctx.canvas.height = DEFAULT_FONT_SIZE;
Expand Down Expand Up @@ -162,7 +162,7 @@ function getAscent(fontFamily) {
return DEFAULT_FONT_ASCENT;
}

function appendText(task, geom, styles) {
function appendText(task, geom, styles, lang) {
// Initialize all used properties to keep the caches monomorphic.
const textDiv = document.createElement("span");
const textDivProperties = {
Expand All @@ -171,6 +171,7 @@ function appendText(task, geom, styles) {
hasText: geom.str !== "",
hasEOL: geom.hasEOL,
fontSize: 0,
lang,
};
task._textDivs.push(textDiv);

Expand All @@ -184,7 +185,7 @@ function appendText(task, geom, styles) {
const fontFamily =
(task._fontInspectorEnabled && style.fontSubstitution) || style.fontFamily;
const fontHeight = Math.hypot(tx[2], tx[3]);
const fontAscent = fontHeight * getAscent(fontFamily);
const fontAscent = fontHeight * getAscent(fontFamily, lang);

let left, top;
if (angle === 0) {
Expand Down Expand Up @@ -333,7 +334,7 @@ class TextLayerRenderTask {
div: null,
scale: viewport.scale * (globalThis.devicePixelRatio || 1),
properties: null,
ctx: getCtx(),
ctx: null,
};
const { pageWidth, pageHeight, pageX, pageY } = viewport.rawDims;
this._transform = [1, 0, 0, -1, -pageX, pageY + pageHeight];
Expand Down Expand Up @@ -379,7 +380,9 @@ class TextLayerRenderTask {
/**
* @private
*/
_processItems(items, styleCache) {
_processItems(items, styleCache, lang) {
this._layoutTextParams.ctx ||= getCtx(lang);

for (const item of items) {
if (item.str === undefined) {
if (
Expand All @@ -399,7 +402,7 @@ class TextLayerRenderTask {
continue;
}
this._textContentItemsStr.push(item.str);
appendText(this, item, styleCache);
appendText(this, item, styleCache, lang);
}
}

Expand Down Expand Up @@ -438,16 +441,16 @@ class TextLayerRenderTask {
}

Object.assign(styleCache, value.styles);
this._processItems(value.items, styleCache);
this._processItems(value.items, styleCache, value.lang);
pump();
}, reject);
};

this._reader = this._textContentSource.getReader();
pump();
} else if (this._textContentSource) {
const { items, styles } = this._textContentSource;
this._processItems(items, styles);
const { items, styles, lang } = this._textContentSource;
this._processItems(items, styles, lang);
resolve();
} else {
throw new Error('No "textContentSource" parameter specified.');
Expand Down Expand Up @@ -487,19 +490,20 @@ function updateTextLayer({
}

if (mustRescale) {
const ctx = getCtx();
const scale = viewport.scale * (globalThis.devicePixelRatio || 1);
const params = {
prevFontSize: null,
prevFontFamily: null,
div: null,
scale,
properties: null,
ctx,
ctx: null,
};
for (const div of textDivs) {
params.properties = textDivProperties.get(div);
const props = textDivProperties.get(div);
params.properties = props;
params.div = div;
params.ctx ||= getCtx(props.lang);
layout(params);
}
}
Expand Down
6 changes: 4 additions & 2 deletions test/unit/api_spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -3128,10 +3128,11 @@ describe("api", function () {
});

it("gets text content", async function () {
const { items, styles } = await page.getTextContent();
const { items, styles, lang } = await page.getTextContent();

expect(items.length).toEqual(15);
expect(objectSize(styles)).toEqual(5);
expect(lang).toEqual("en");

const text = mergeText(items);
expect(text).toEqual(`Table Of Content
Expand All @@ -3146,13 +3147,14 @@ page 1 / 3`);
);
const pdfDoc = await loadingTask.promise;
const pdfPage = await pdfDoc.getPage(1);
const { items, styles } = await pdfPage.getTextContent({
const { items, styles, lang } = await pdfPage.getTextContent({
disableNormalization: true,
});
expect(items.length).toEqual(1);
// Font name will be a random object id.
const fontName = items[0].fontName;
expect(Object.keys(styles)).toEqual([fontName]);
expect(lang).toEqual(null);

expect(items[0]).toEqual({
dir: "ltr",
Expand Down

0 comments on commit 0ac822e

Please sign in to comment.