Skip to content

Commit

Permalink
Implement new text analyzer features
Browse files Browse the repository at this point in the history
  • Loading branch information
mrtcode committed Jan 13, 2025
1 parent 2586e97 commit bed392a
Show file tree
Hide file tree
Showing 28 changed files with 4,854 additions and 2 deletions.
2 changes: 2 additions & 0 deletions src/core/document.js
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ import { StreamsSequenceStream } from "./decode_stream.js";
import { StructTreePage } from "./struct_tree.js";
import { XFAFactory } from "./xfa/factory.js";
import { XRef } from "./xref.js";
import { Module } from "./module/module.js";

const LETTER_SIZE_MEDIABOX = [0, 0, 612, 792];

Expand Down Expand Up @@ -941,6 +942,7 @@ class PDFDocument {
this.pdfManager = pdfManager;
this.stream = stream;
this.xref = new XRef(stream, pdfManager);
this.module = new Module(this);
this._pagePromises = new Map();
this._version = null;

Expand Down
173 changes: 173 additions & 0 deletions src/core/evaluator.js
Original file line number Diff line number Diff line change
Expand Up @@ -2383,6 +2383,7 @@ class PartialEvaluator {
transform: null,
fontName: null,
hasEOL: false,
chars: [],
};

// Use a circular buffer (length === 2) to save the last chars in the
Expand Down Expand Up @@ -2622,6 +2623,7 @@ class PartialEvaluator {
transform: textChunk.transform,
fontName: textChunk.fontName,
hasEOL: textChunk.hasEOL,
chars: textChunk.chars,
};
}

Expand Down Expand Up @@ -2943,6 +2945,9 @@ class PartialEvaluator {
scaledDim = 0;
}

let prevWidth = textChunk.width;
let m = Util.transform(textState.ctm, textState.textMatrix);

if (!font.vertical) {
scaledDim *= textState.textHScale;
textState.translateTextMatrix(scaledDim, 0);
Expand All @@ -2969,6 +2974,173 @@ class PartialEvaluator {
}
textChunk.str.push(glyphUnicode);

/**
 * Snaps an angle to the nearest axis-aligned rotation.
 *
 * Distance is measured around the circle, so angles just below 360 (for
 * example 350) snap to 0 rather than to 270.
 *
 * @param {number} degrees - Angle in degrees. Values outside [0, 360) are
 *   wrapped into that range first.
 * @returns {number} One of 0, 90, 180 or 270. Exact midpoints (45, 135, 225,
 *   315) resolve to the lower angle; non-finite input yields 0.
 */
function closestStandardAngle(degrees) {
  // 360 is listed as a candidate because it is the same direction as 0; it
  // lets a plain linear distance handle the wrap-around near a full turn.
  const candidateAngles = [0, 90, 180, 270, 360];

  // Wrap into [0, 360) so negative or over-rotated inputs compare correctly.
  const normalized = ((degrees % 360) + 360) % 360;

  let closestAngle = candidateAngles[0];
  let minDifference = Math.abs(normalized - closestAngle);

  for (let i = 1; i < candidateAngles.length; i++) {
    const difference = Math.abs(normalized - candidateAngles[i]);
    // Strict comparison keeps the earlier (lower) candidate on exact ties.
    if (difference < minDifference) {
      minDifference = difference;
      closestAngle = candidateAngles[i];
    }
  }

  // Fold the 360 alias back onto 0.
  return closestAngle % 360;
}

/**
 * Derives a rotation, in degrees, from the first two entries of a matrix and
 * snaps it with `closestStandardAngle`.
 *
 * The angle is that of the vector (matrix[0], matrix[1]) as computed by
 * `Math.atan2(matrix[1], matrix[0])`, shifted into the non-negative range and
 * rounded to a whole degree before snapping.
 *
 * @param {Array<number>} matrix - Only indices 0 and 1 are read.
 *   NOTE(review): presumably a PDF-style [a, b, c, d, e, f] transform; confirm
 *   against the caller.
 * @returns {number} Whatever `closestStandardAngle` returns for the rounded
 *   angle.
 */
function matrixToDegrees(matrix) {
  const rawAngle = Math.atan2(matrix[1], matrix[0]);
  // atan2 yields values in (-PI, PI]; move negative results up by a full turn.
  const positiveAngle = rawAngle < 0 ? rawAngle + (2 * Math.PI) : rawAngle;

  const wholeDegrees = Math.round(positiveAngle * (180 / Math.PI)) % 360;
  // Defensive wrap for a negative remainder, kept from the original logic.
  const wrappedDegrees = wholeDegrees < 0 ? wholeDegrees + 360 : wholeDegrees;

  return closestStandardAngle(wrappedDegrees);
}

let rotation = matrixToDegrees(m);

let ascent = font.ascent;
let descent = font.descent;
if (descent > 0) {
descent = -descent;
}
if (ascent && descent) {
if (ascent > 1) {
ascent = 0.75;
}
if (descent < -0.5) {
descent = -0.25;
}
}
else {
ascent = 0.75;
descent = -0.25;
}

if (font.capHeight && font.capHeight < ascent && font.capHeight > 0) {
ascent = font.capHeight;
}

let charWidth = textChunk.width - prevWidth;
let rect = [0, textState.fontSize * descent, charWidth, textState.fontSize * ascent]

if (
font.isType3Font &&
textState.fontSize <= 1 &&
!isArrayEqual(textState.fontMatrix, FONT_IDENTITY_MATRIX)
) {
const glyphHeight = font.bbox[3] - font.bbox[1];
if (glyphHeight > 0) {
rect[1] = font.bbox[1] * textState.fontMatrix[3];
rect[3] = font.bbox[3] * textState.fontMatrix[3];
}
}

rect = Util.getAxialAlignedBoundingBox(rect, m);

let baselineRect = Util.getAxialAlignedBoundingBox([0, 0, 0, 0], m);
let baseline = 0;
if (rotation === 0 || rotation === 180) {
baseline = baselineRect[1];
}
else if (rotation === 90 || rotation === 270) {
baseline = baselineRect[0];
}

let p1 = [0, 0];
let p2 = [0, 1];

let [x1, y1] = Util.applyTransform(p1, getCurrentTextTransform());
let [x2, y2] = Util.applyTransform(p2, getCurrentTextTransform());
let fontSize = Math.hypot(x1 - x2, y1 - y2);

let diagonal = rotation % 90 !== 0;

/**
 * Normalizes a glyph's text to NFKD (decomposing ligatures and combined
 * characters) and then recomposes a fixed set of common accented lowercase
 * letters back into their single-code-point form.
 *
 * @param {string} char - Text of a single glyph; may be more than one UTF-16
 *   unit.
 * @returns {string} The recomposed letter if the NFKD form matches an entry
 *   in the special-case table, otherwise the NFKD form itself.
 */
function normalizeChar(char) {
  // Normalize the character to NFKD form to decompose ligatures and combined characters
  const normalizedChar = char.normalize('NFKD');

  // Handling known special cases where combining characters may still be decomposed.
  // A Map is used instead of a plain object so that input such as
  // "constructor", "toString" or "__proto__" cannot match inherited
  // Object.prototype members and return a non-string value.
  // Values are written as escapes so the precomposed code points cannot be
  // silently decomposed by an editor or normalizing tool.
  const specialCases = new Map([
    ['e\u0301', '\u00e9'], // e + ´ -> é
    ['a\u0301', '\u00e1'], // a + ´ -> á
    ['i\u0301', '\u00ed'], // i + ´ -> í
    ['o\u0301', '\u00f3'], // o + ´ -> ó
    ['u\u0301', '\u00fa'], // u + ´ -> ú
    ['e\u0300', '\u00e8'], // e + ` -> è
    ['a\u0300', '\u00e0'], // a + ` -> à
    ['i\u0300', '\u00ec'], // i + ` -> ì
    ['o\u0300', '\u00f2'], // o + ` -> ò
    ['u\u0300', '\u00f9'], // u + ` -> ù
    ['e\u0302', '\u00ea'], // e + ^ -> ê
    ['a\u0302', '\u00e2'], // a + ^ -> â
    ['i\u0302', '\u00ee'], // i + ^ -> î
    ['o\u0302', '\u00f4'], // o + ^ -> ô
    ['u\u0302', '\u00fb'], // u + ^ -> û
    ['e\u0308', '\u00eb'], // e + ¨ -> ë
    ['a\u0308', '\u00e4'], // a + ¨ -> ä
    ['i\u0308', '\u00ef'], // i + ¨ -> ï
    ['o\u0308', '\u00f6'], // o + ¨ -> ö
    ['u\u0308', '\u00fc'], // u + ¨ -> ü
    ['c\u0327', '\u00e7'], // c + ¸ -> ç
    ['n\u0303', '\u00f1'], // n + ˜ -> ñ
    // Add other special cases here
  ]);

  // Check if the normalized character sequence matches a special case
  const recomposed = specialCases.get(normalizedChar);
  return recomposed === undefined ? normalizedChar : recomposed;
}

let charCode = glyph.unicode.charCodeAt(0);

if (
glyph.unicode !== ' ' &&
fontSize !== 0 &&
// Skip null and other control characters to avoid breaking strings, DOM, and even browsers…
// TODO: Consider skipping other non-printable characters as well
// TODO: Determine whether it's better to skip or replace these characters
// since we may need to keep PDF.js text layer character offsets aligned with
// Zotero reader text layer character offsets
!(
// ASCII control characters
(charCode >= 0x00 && charCode <= 0x1F) ||
// Extended control characters
(charCode >= 0x7F && charCode <= 0x9F)
)
) {
textChunk.chars.push({
// Decomposed ligatures, normalized Arabic characters
c: normalizeChar(glyphUnicode),
// Normalizes Arabic characters and other characters where length remains 1, but preserves
// ligatures and more importantly avoids 'e\u00be' being converted into 'e \u0301'
// which is quite common in Spanish author names and because of the space prevents
// author name recognition
// NOTE: THIS CAN STILL HAVE DECOMPOSED LIGATURES IF THE FONT HAS ITS OWN CHARACTER MAPPING,
// THEREFORE CONSIDER DITCHING THIS PROPERTY
u: glyphUnicode.length === 1 ? glyphUnicode : glyph.unicode,
rect,
fontSize,
fontName: textState.font.name,
bold: textState.font.bold,
italic: textState.font.italic,
glyphWidth,
baseline,
rotation,
diagonal,
});
}

if (charSpacing) {
if (!font.vertical) {
textState.translateTextMatrix(
Expand Down Expand Up @@ -3049,6 +3221,7 @@ class PartialEvaluator {
textContent.items.push(runBidiTransform(textContentItem));
textContentItem.initialized = false;
textContentItem.str.length = 0;
textContentItem.chars = [];
}

function enqueueChunk(batch = false) {
Expand Down
Loading

0 comments on commit bed392a

Please sign in to comment.