Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(web): optimize the wordbreaker data table for filesize and ease of first-load parsing ⚡ #10692

Merged
merged 10 commits into from
Aug 27, 2024
39 changes: 5 additions & 34 deletions common/models/wordbreakers/src/main/default/index.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import { WordBreakProperty, WORD_BREAK_PROPERTY, I, propertyMap } from "./data.inc.js";
import { WordBreakProperty, propertyMap } from "./data.inc.js";

import { searchForProperty } from "./searchForProperty.js";

/**
* A set of options used to customize and extend the behavior of the default
Expand Down Expand Up @@ -566,43 +568,12 @@ function property(character: string, options?: DefaultWordBreakerOptions): WordB
// TODO: remove dependence on character.codepointAt()?
let codepoint = character.codePointAt(0) as number;

return searchForProperty(codepoint, 0, WORD_BREAK_PROPERTY.length - 1);
return searchForProperty(codepoint);
}

/**
 * Resolves a word-break property name (compared case-insensitively) to its
 * numeric identifier.
 *
 * Names listed in `options.customProperties` take precedence: the entry at
 * index `i` maps to `-(i + 1)`. Any other name maps to its index within
 * `propertyMap`, which is `-1` when the name is not listed there either.
 *
 * @param propName  The property name to resolve.
 * @param options   Optional word-breaker options that may supply custom property names.
 * @returns The numeric identifier described above.
 */
function propertyVal(propName: string, options?: DefaultWordBreakerOptions): number {
  // Lower-case the target once rather than once per candidate comparison.
  const target = propName.toLowerCase();
  const matcher = (name: string) => name.toLowerCase() === target;

  const customIndex = options?.customProperties?.findIndex(matcher) ?? -1;
  if (customIndex !== -1) {
    return -customIndex - 1;
  }

  return propertyMap.findIndex(matcher);
}

/**
 * Finds the word break property of a CODE POINT via binary search over
 * the `WORD_BREAK_PROPERTY` range table.
 *
 * Each table entry marks the start of a **character range**; the range
 * extends up to, but not including, the start value of the following entry
 * (the final entry's range is unbounded). The code point is classified with
 * the value of whichever entry's range contains it.
 *
 * Only the entries with indices in `[left, right]` are considered. If none of
 * them contains the code point, the 'Other' property is returned.
 */
function searchForProperty(codePoint: number, left: number, right: number): WordBreakProperty {
  let lo = left;
  let hi = right;

  while (lo <= hi) {
    const mid = lo + Math.trunc((hi - lo) / 2);
    const entry = WORD_BREAK_PROPERTY[mid];

    // The entry after `mid` (if any) supplies the exclusive upper bound.
    const following = WORD_BREAK_PROPERTY[mid + 1];
    const upperBound = following ? following[I.Start] : Infinity;

    if (codePoint < entry[I.Start]) {
      hi = mid - 1;
    } else if (codePoint >= upperBound) {
      lo = mid + 1;
    } else {
      // The code point lies within this entry's range.
      return entry[I.Value];
    }
  }

  // Nothing in the searched span contains the code point.
  return WordBreakProperty.Other;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import { WordBreakProperty, WORD_BREAK_PROPERTY_BMP, WORD_BREAK_PROPERTY_NON_BMP } from "./data.inc.js";

/**
 * Offset applied to every property value when the lookup tables are encoded,
 * so that the encoded property characters start at U+0020.
 */
const PROPERTY_CODE_OFFSET = 0x20;

/**
 * Looks up the word break property assigned to a code point.
 *
 * Code points up to U+FFFF are resolved against `WORD_BREAK_PROPERTY_BMP`,
 * whose entries are two UTF-16 code units wide (range start + property code).
 * All other code points are resolved against `WORD_BREAK_PROPERTY_NON_BMP`,
 * whose entries are three code units wide because each range start is stored
 * as a surrogate pair.
 *
 * @param codePoint  The Unicode code point to classify.
 * @returns The property of the range containing `codePoint`, or
 *          `WordBreakProperty.Other` if no range contains it.
 */
export function searchForProperty(codePoint: number): WordBreakProperty {
  const isBmp = codePoint <= 0xFFFF;

  // Non-BMP range starts take an extra code unit to encode.
  const bucketSize = isBmp ? 2 : 3;
  const encodedArray = isBmp ? WORD_BREAK_PROPERTY_BMP : WORD_BREAK_PROPERTY_NON_BMP;
  const lastBucket = Math.floor(encodedArray.length / bucketSize) - 1;

  return _searchForProperty(encodedArray, codePoint, bucketSize, 0, lastBucket);
}

/**
 * Binary search for the word break property of a given CODE POINT.
 *
 * The encoded string is a **character range** lookup table made of
 * fixed-width buckets: the start-of-range character followed by one
 * character holding the property value plus `PROPERTY_CODE_OFFSET`. If a
 * code point is equal to or greater than a bucket's start-of-range and
 * exclusively less than the next bucket's start-of-range (the final bucket is
 * unbounded), it is classified with that bucket's property.
 *
 * @param encodedArray  The encoded lookup table.
 * @param codePoint     The code point to classify.
 * @param bucketSize    Width of each bucket, in UTF-16 code units.
 * @param left          Index of the first bucket to consider.
 * @param right         Index of the last bucket to consider.
 * @returns The decoded property value, or `WordBreakProperty.Other` when no
 *          bucket within `[left, right]` contains the code point.
 */
function _searchForProperty(encodedArray: string, codePoint: number, bucketSize: number, left: number, right: number): WordBreakProperty {
  let low = left;
  let high = right;

  while (low <= high) {
    const midpoint = low + Math.floor((high - low) / 2);

    // codePointAt() yields undefined (not NaN) for an out-of-bounds index.
    const rangeStart = encodedArray.codePointAt(bucketSize * midpoint);
    if (rangeStart === undefined || codePoint < rangeStart) {
      high = midpoint - 1;
      continue;
    }

    // There is no next bucket after the final one, so its range is open-ended.
    const startOfNextRange = encodedArray.codePointAt(bucketSize * (midpoint + 1)) ?? Infinity;
    if (codePoint >= startOfNextRange) {
      low = midpoint + 1;
      continue;
    }

    // We found it!  The property code is the final code unit of the bucket.
    // Decode it here so that the fallback below is never offset by mistake.
    const propertyCode = encodedArray.charCodeAt(bucketSize * (midpoint + 1) - 1);
    return (propertyCode - PROPERTY_CODE_OFFSET) as WordBreakProperty;
  }

  // All items that are not found in the table are assigned the 'Other' property.
  return WordBreakProperty.Other;
}
33 changes: 33 additions & 0 deletions common/models/wordbreakers/test/test-search-property.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
/**
 * Smoke-test the default word breaker's property lookup (searchForProperty).
 */

import { assert } from 'chai';
import { searchForProperty } from '../build/obj/default/searchForProperty.js';
import { propertyMap } from '../build/obj/default/data.inc.js';

describe('searchForProperty', () => {
  /**
   * Asserts, pair by pair, that looking up each code point yields the index
   * of the named property within `propertyMap`.
   */
  const assertProperties = (expectations) => {
    for (const [codePoint, propertyName] of expectations) {
      assert.equal(searchForProperty(codePoint), propertyMap.indexOf(propertyName));
    }
  };

  it('correctly finds character classes for standard ASCII characters', () => {
    assertProperties([
      ['a'.codePointAt(0), 'ALetter'],
      ['Z'.codePointAt(0), 'ALetter'],
      ["'".codePointAt(0), 'Single_Quote'],
      ['"'.codePointAt(0), 'Double_Quote'],
      [','.codePointAt(0), 'MidNum'],
      ['.'.codePointAt(0), 'MidNumLet'],
      ['-'.codePointAt(0), 'Other']
    ]);
  });

  it('correctly finds character classes for specialized BMP characters', () => {
    assertProperties([
      [0x05D0, 'Hebrew_Letter'],
      [0x3031, 'Katakana'],
      // The final two code points of the BMP.
      [0xFFFE, 'Other'],
      [0xFFFF, 'Other']
    ]);
  });

  it('correctly finds character classes for non-BMP characters', () => {
    assertProperties([
      [0x0001F1E6, 'Regional_Indicator'],
      [0x00013430, 'Format'],
      // The first code point beyond the BMP.
      [0x00010000, 'ALetter']
    ]);
  });
});
100 changes: 65 additions & 35 deletions common/models/wordbreakers/tools/data-compiler/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@

// Original version found at: https://github.com/eddieantonio/unicode-default-word-boundary/blob/master/libexec/compile-word-break.js

// TODO: Adapt to produce two string-encoded arrays - one for BMP chars, one for non-BMP chars.

import fs from 'fs';
import path from 'path';

Expand Down Expand Up @@ -93,13 +91,54 @@ const categoryMap = new Map<string, number>();

// Assign each category the next sequential index, never handing out the
// back-tick's char code: the back-tick delimits the encoded strings, so
// avoiding it reduces complications.
// NOTE(review): the encoders add 0x20 to these indices before emitting them,
// while this check compares the un-offset index - confirm which value is
// actually meant to avoid the back-tick.
const backtickCode = '`'.charCodeAt(0);

for(const category of categories) {
  categoryMap.set(category, catIndexSeed);
  catIndexSeed += 1;

  if(catIndexSeed === backtickCode) {
    catIndexSeed += 1;
  }
}

// Partition the ranges into one table for starts within the BMP (<= 0xFFFF)
// and one table for starts beyond it.
const bmpRanges: typeof ranges = [];
const nonBmpRanges: typeof ranges = [];

for(const range of ranges) { // `ranges` is already sorted
  if(range.start <= 0xFFFF) {
    bmpRanges.push(range);
    continue;
  }

  const isFirstNonBmpRange = nonBmpRanges.length == 0;
  if(isFirstNonBmpRange) {
    // Crossing the BMP boundary: carry the property of the last BMP range
    // over to both sides of the boundary.
    const { property } = bmpRanges[bmpRanges.length - 1];

    // Close the BMP table with an explicit entry at its final code point.
    bmpRanges.push({
      start: 0xFFFF,
      property,
      end: undefined
    });

    // Ensure the non-BMP table always begins at 0x10000.
    if(range.start != 0x10000) {
      nonBmpRanges.push({
        start: 0x10000,
        property,
        end: undefined
      });
    }
  }

  nonBmpRanges.push(range);
}

//////////////////////// Creating the generated file /////////////////////////

// Save the output in the gen/ directory.
let stream = fs.createWriteStream(generatedFilename);

/**
 * Escapes one encoded table character so that it survives being written into
 * the back-tick-delimited template literal of the generated file.
 *
 * - The back-tick, backslash and `$` are prefixed with a backslash; an
 *   unescaped `$` followed by `{` would start a template substitution.
 * - Control characters below U+0020 are emitted as `\uXXXX`. This matters for
 *   CR in particular: a raw CR inside a template literal is normalized to LF
 *   when the generated file is parsed, silently altering the table.
 * - Lone surrogates are emitted as `\uXXXX`, as they cannot be represented in
 *   UTF-8 output.
 *
 * Anything else, including a complete surrogate pair, is returned unchanged.
 *
 * @param codedChar  A single character (one code point) to embed.
 * @returns Source text that evaluates to `codedChar` within a template literal.
 */
function escape(codedChar: string) {
  if(codedChar == '`' || codedChar == '\\' || codedChar == '$') {
    return '\\' + codedChar;
  }

  // Only single code units can be control chars or lone surrogates.
  if(codedChar.length == 1) {
    const code = codedChar.charCodeAt(0);
    const isControl = code < 0x20;
    const isLoneSurrogate = code >= 0xD800 && code <= 0xDFFF;

    if(isControl || isLoneSurrogate) {
      return '\\u' + ('0000' + code.toString(16).toUpperCase()).slice(-4);
    }
  }

  return codedChar;
}

// // Former entry in the original version by Eddie that was never included in our repo:
// export const extendedPictographic = ${extendedPictographicRegExp};

Expand All @@ -116,7 +155,7 @@ stream.write(`// Automatically generated file. DO NOT MODIFY.
export const enum WordBreakProperty {
${ /* Create enum values for each word break property */
Array.from(categories)
.map(x => ` ${x}`)
.map(x => ` ${x} = ${categoryMap.get(x)}`)
.join(',\n')
}
};
Expand All @@ -133,38 +172,29 @@ ${ /* Enumerate the plain-text names for ease of lookup at runtime */
}
];

/**
* Constants for indexing values in WORD_BREAK_PROPERTY.
*/
export const enum I {
Start = 0,
Value = 1
}

/**
* Defines a mapping of all characters to their assigned word-breaking
* property type.
*
* There are implicit buckets starting at the char with specified code \`number\`
* of an entry up to, but not including, the value in the next entry. All
* entries in each bucket share the same property value.
*
* Consider the following two consecutive buckets:
* - [0x0041, WordBreakProperty.ALetter]
* - [0x005B, WordBreakProperty.Other]
*
* For this example, all characters from 0x0041 to 0x005B (that is, 'A'-'Z')
* have the wordbreaking property \`ALetter\`.
*/
export const WORD_BREAK_PROPERTY: [number, WordBreakProperty][] = [
${
// TODO: Two versions: one that's BMP-encoded, one that's non-BMP encoded.
ranges.map(({start, property}) => (` [` +
`/*start*/ 0x${start.toString(16).toUpperCase()}, ` +
`WordBreakProperty.${property}],`
)).join('\n')
}
];
export const WORD_BREAK_PROPERTY_BMP: string = \`${
// To consider: emit `\uxxxx` codes instead of the raw char?
bmpRanges.map(({start, property}) => {
let codedStart = escape(String.fromCodePoint(start));

// Offset the encoded property value to lie within a friendlier range,
// with characters that render naturally within code editors.
const codedProp = escape(String.fromCharCode(categoryMap.get(property) + 0x20));
return `${codedStart}${codedProp}`;
}).join('')
}\`;

export const WORD_BREAK_PROPERTY_NON_BMP: string = \`${
// To consider: emit `\uxxxx` codes instead of the raw char?
nonBmpRanges.map(({start, property}) => {
const codedStart = escape(String.fromCodePoint(start));

// Offset the encoded property value to lie within a friendlier range,
// with characters that render naturally within code editors.
const codedProp = escape(String.fromCharCode(categoryMap.get(property) + 0x20));
return `${codedStart}${codedProp}`;
}).join('')
}\`;
`);

/**
Expand Down
Loading