From 7006630cfe307da4840651bc8b6da102660e3f1a Mon Sep 17 00:00:00 2001 From: Greg Brimble Date: Mon, 10 Feb 2025 05:23:03 -0500 Subject: [PATCH] Add code for interpolation search for asset manifest lookup (#8044) --- .changeset/stupid-turtles-hammer.md | 5 + .../asset-worker/crypto-polyfill.ts | 13 + .../asset-worker/fixtures/AssetManifest.bin | Bin 0 -> 140 bytes .../asset-worker/src/assets-manifest.ts | 91 ++++- .../asset-worker/src/experiment-analytics.ts | 54 +++ .../workers-shared/asset-worker/src/index.ts | 37 ++- .../tests/assets-manifest.test.ts | 312 ++++++++++++++++++ .../asset-worker/vitest.config.mts | 1 + 8 files changed, 507 insertions(+), 6 deletions(-) create mode 100644 .changeset/stupid-turtles-hammer.md create mode 100644 packages/workers-shared/asset-worker/crypto-polyfill.ts create mode 100644 packages/workers-shared/asset-worker/fixtures/AssetManifest.bin create mode 100644 packages/workers-shared/asset-worker/src/experiment-analytics.ts create mode 100644 packages/workers-shared/asset-worker/tests/assets-manifest.test.ts diff --git a/.changeset/stupid-turtles-hammer.md b/.changeset/stupid-turtles-hammer.md new file mode 100644 index 000000000000..8ba80591860b --- /dev/null +++ b/.changeset/stupid-turtles-hammer.md @@ -0,0 +1,5 @@ +--- +"@cloudflare/workers-shared": minor +--- + +chore: Adds analytics and code (zero-percent gated) for a new asset manifest search algorithm diff --git a/packages/workers-shared/asset-worker/crypto-polyfill.ts b/packages/workers-shared/asset-worker/crypto-polyfill.ts new file mode 100644 index 000000000000..9e54abef7899 --- /dev/null +++ b/packages/workers-shared/asset-worker/crypto-polyfill.ts @@ -0,0 +1,13 @@ +import { afterAll, beforeAll } from "vitest"; + +// Can be deleted once Node.js (where these tests run) version is bumped to one which includes this global :) + +beforeAll(() => { + // @ts-expect-error will go away once Node.js is bumped + globalThis.crypto = require("crypto"); +}); + +afterAll(() => { + 
// @ts-expect-error will go away once Node.js is bumped + delete globalThis.crypto; +}); diff --git a/packages/workers-shared/asset-worker/fixtures/AssetManifest.bin b/packages/workers-shared/asset-worker/fixtures/AssetManifest.bin new file mode 100644 index 0000000000000000000000000000000000000000..5524c4aab4caf5a0cf386ffa0d03a06f2189cfec GIT binary patch literal 140 zcmZQzzy!y1LmzKhm8P+7Q)%tggEyZrD!Zn4u0H!7M#FTRH9tEkW0L=s0JjgX>NW{U p3gS@rpnm<*htgLz%FYw&m>aIM9&D75fsmo0vi$vj_wU2Z0|0Y@Firpf literal 0 HcmV?d00001 diff --git a/packages/workers-shared/asset-worker/src/assets-manifest.ts b/packages/workers-shared/asset-worker/src/assets-manifest.ts index 6f2ad737d1c8..2816572f1b16 100644 --- a/packages/workers-shared/asset-worker/src/assets-manifest.ts +++ b/packages/workers-shared/asset-worker/src/assets-manifest.ts @@ -12,7 +12,7 @@ export class AssetsManifest { this.data = data; } - async get(pathname: string) { + async getWithBinarySearch(pathname: string) { const pathHash = await hashPath(pathname); const entry = binarySearch( new Uint8Array(this.data, HEADER_SIZE), @@ -20,9 +20,18 @@ export class AssetsManifest { ); return entry ? contentHashToKey(entry) : null; } + + async getWithInterpolationSearch(pathname: string) { + const pathHash = await hashPath(pathname); + const entry = interpolationSearch( + new Uint8Array(this.data, HEADER_SIZE), + pathHash + ); + return entry ? 
/**
 * Interprets the first 16 bytes of `uint8Array` as one unsigned, big-endian
 * 128-bit integer so that path hashes can take part in interpolation maths.
 *
 * NOTE(review): always reads exactly 16 bytes, i.e. it assumes
 * PATH_HASH_SIZE is 16 — confirm against the constants module.
 */
const uint8ArrayToNumber = (uint8Array: Uint8Array) => {
	const dataView = new DataView(uint8Array.buffer, uint8Array.byteOffset);
	// High 64 bits first, then low 64 bits (DataView reads big-endian by default).
	return (dataView.getBigUint64(0) << 64n) + dataView.getBigUint64(8);
};

/**
 * Finds the manifest entry whose path hash equals `searchValue` using
 * interpolation search.
 *
 * @param arr Manifest entries without the header: consecutive records of
 *   ENTRY_SIZE bytes, each starting with a PATH_HASH_SIZE-byte path hash.
 *   The records must be sorted ascending by path hash (the order produced by
 *   `compare`), otherwise results are undefined.
 * @param searchValue Path hash to look for; must be PATH_HASH_SIZE bytes long.
 * @returns A view over the whole matching entry (ENTRY_SIZE bytes, sharing
 *   `arr`'s buffer), or `false` when the manifest is empty or has no match.
 * @throws TypeError when `searchValue` is not PATH_HASH_SIZE bytes long and
 *   the manifest is non-empty.
 */
export const interpolationSearch = (
	arr: Uint8Array,
	searchValue: Uint8Array
) => {
	if (arr.byteLength === 0) {
		return false;
	}
	// Validate once up front instead of on every probe.
	if (searchValue.byteLength !== PATH_HASH_SIZE) {
		throw new TypeError(
			"Search value and current value are of different lengths"
		);
	}

	const pathHashAt = (index: number) =>
		new Uint8Array(
			arr.buffer,
			arr.byteOffset + index * ENTRY_SIZE,
			PATH_HASH_SIZE
		);

	const searchValueNumber = uint8ArrayToNumber(searchValue);
	let low = 0;
	// Ignore a trailing partial record rather than indexing past the buffer.
	let high = Math.floor(arr.byteLength / ENTRY_SIZE) - 1;

	while (low <= high) {
		const lowValueNumber = uint8ArrayToNumber(pathHashAt(low));
		const highValueNumber = uint8ArrayToNumber(pathHashAt(high));

		// A hash outside the remaining range cannot be present. Bailing out here
		// also keeps the probe index inside [low, high]; without it the
		// interpolation below can yield a negative or past-the-end index.
		if (
			searchValueNumber < lowValueNumber ||
			searchValueNumber > highValueNumber
		) {
			return false;
		}

		// When both bounds hold the same hash (always the case once low === high)
		// the interpolation denominator is zero; probe `low` directly instead of
		// dividing by 0n, which would throw a RangeError.
		const span = highValueNumber - lowValueNumber;
		const mid =
			span === 0n
				? low
				: low +
					Number(
						(BigInt(high - low) * (searchValueNumber - lowValueNumber)) / span
					);

		const cmp = compare(pathHashAt(mid), searchValue);
		if (cmp === 0) {
			return new Uint8Array(
				arr.buffer,
				arr.byteOffset + mid * ENTRY_SIZE,
				ENTRY_SIZE
			);
		} else if (cmp < 0) {
			low = mid + 1;
		} else {
			high = mid - 1;
		}
	}
	return false;
};
/**
 * Looks `pathname` up in the asset manifest and records how long the lookup
 * took.
 *
 * The search algorithm is chosen per call: interpolation search runs with
 * probability INTERPOLATION_EXPERIMENT_SAMPLE_RATE (0 here, so binary search
 * is always used). The chosen method and the elapsed time are written to the
 * experiment analytics whether the lookup returns or throws.
 *
 * @param pathname Path to look up in the manifest.
 * @returns The value resolved by the manifest lookup: a key derived from the
 *   matching entry's content hash, or `null` when no entry matches.
 */
async unstable_exists(pathname: string): Promise<string | null> {
	const experimentAnalytics = new ExperimentAnalytics(
		this.env.EXPERIMENT_ANALYTICS
	);
	const timer = new PerformanceTimer(this.env.UNSAFE_PERFORMANCE);

	const INTERPOLATION_EXPERIMENT_SAMPLE_RATE = 0;
	const searchMethod: "binary" | "interpolation" =
		Math.random() < INTERPOLATION_EXPERIMENT_SAMPLE_RATE
			? "interpolation"
			: "binary";
	experimentAnalytics.setData({ manifestReadMethod: searchMethod });

	// Account and experiment labels are only attached when all three bindings
	// are present.
	if (
		this.env.COLO_METADATA &&
		this.env.VERSION_METADATA &&
		this.env.CONFIG
	) {
		experimentAnalytics.setData({
			accountId: this.env.CONFIG.account_id,
			experimentName: "manifest-read-timing",
		});
	}

	const startedAtMs = timer.now();
	try {
		const manifest = new AssetsManifest(this.env.ASSETS_MANIFEST);
		// Await inside the try so the timing in `finally` covers the lookup.
		return searchMethod === "interpolation"
			? await manifest.getWithInterpolationSearch(pathname)
			: await manifest.getWithBinarySearch(pathname);
	} finally {
		experimentAnalytics.setData({
			manifestReadTime: timer.now() - startedAtMs,
		});
		experimentAnalytics.write();
	}
}
/**
 * Test helper that serialises asset entries into a manifest buffer.
 *
 * Each entry's path is hashed with `SHA_256` (truncated to PATH_HASH_SIZE
 * bytes), the entries are sorted with `compare` on those hashes, and every
 * record is written at `HEADER_SIZE + index * ENTRY_SIZE` with the path hash
 * at PATH_HASH_OFFSET and the decoded content hash at CONTENT_HASH_OFFSET.
 * Nothing is written into the header region.
 *
 * @param assetEntries Paths with their hex-encoded content hashes.
 * @returns The underlying buffer of the encoded manifest.
 */
const encode = async (
	assetEntries: { path: string; contentHash: string }[]
) => {
	const hashedEntries = await Promise.all(
		assetEntries.map(async ({ path, contentHash }) => {
			const pathHashBytes = await SHA_256(path, PATH_HASH_SIZE);
			return { path, contentHash, pathHashBytes };
		})
	);
	hashedEntries.sort((left, right) =>
		compare(left.pathHashBytes, right.pathHashBytes)
	);

	const manifestBytes = new Uint8Array(
		HEADER_SIZE + hashedEntries.length * ENTRY_SIZE
	);
	hashedEntries.forEach(({ pathHashBytes, contentHash }, index) => {
		const contentHashBytes = hexToBytes(contentHash);
		const recordStart = HEADER_SIZE + index * ENTRY_SIZE;

		manifestBytes.set(pathHashBytes, recordStart + PATH_HASH_OFFSET);
		manifestBytes.set(contentHashBytes, recordStart + CONTENT_HASH_OFFSET);
	});

	return manifestBytes.buffer;
};
/** Returns the synthetic asset path used for the entry numbered `id`. */
const makePathForId = (id: number) => "/path" + id;

/**
 * Test helper that generates `length` synthetic entries and encodes them.
 *
 * Entry `i` gets the path from `makePathForId(i)` and a 32-character content
 * hash made of the decimal digits of `i` right-padded with "f".
 *
 * @param length Number of entries to generate.
 * @returns The generated entries together with the manifest produced by
 *   `encode` for them.
 */
const makeManifestOfLength = async (length: number) => {
	const entries = [...new Array(length).keys()].map((id) => ({
		path: makePathForId(id),
		contentHash: String(id).padEnd(32, "f"),
	}));
	const manifest = await encode(entries);
	return { entries, manifest };
};
entries) { + const path = await hashPath(searchEntry.path); + const foundEntry = binarySearch( + new Uint8Array(manifest, HEADER_SIZE), + path + ) as Uint8Array; + expect(foundEntry).not.toBe(false); + expect( + new Uint8Array( + foundEntry.buffer, + CONTENT_HASH_OFFSET + foundEntry.byteOffset, + CONTENT_HASH_SIZE + ) + ).toEqual(hexToBytes(searchEntry.contentHash)); + } + }); + + it("works for a 20,000 entry manifest", async () => { + const { manifest, entries } = await makeManifestOfLength(20_000); + for (const searchEntry of entries) { + const path = await hashPath(searchEntry.path); + const foundEntry = binarySearch( + new Uint8Array(manifest, HEADER_SIZE), + path + ) as Uint8Array; + expect(foundEntry).not.toBe(false); + expect( + new Uint8Array( + foundEntry.buffer, + CONTENT_HASH_OFFSET + foundEntry.byteOffset, + CONTENT_HASH_SIZE + ) + ).toEqual(hexToBytes(searchEntry.contentHash)); + } + }); + }); + + describe("interpolation search", () => { + it("doesn't error for an empty manifest", async () => { + const { manifest } = await makeManifestOfLength(0); + const foundEntry = interpolationSearch( + new Uint8Array(manifest, HEADER_SIZE), + await hashPath("/path") + ); + expect(foundEntry).toBe(false); + }); + + it("works for a single entry manifest", async () => { + const { manifest, entries } = await makeManifestOfLength(1); + for (const searchEntry of entries) { + const path = await hashPath(searchEntry.path); + const foundEntry = interpolationSearch( + new Uint8Array(manifest, HEADER_SIZE), + path + ) as Uint8Array; + expect(foundEntry).not.toBe(false); + expect( + new Uint8Array( + foundEntry.buffer, + CONTENT_HASH_OFFSET + foundEntry.byteOffset, + CONTENT_HASH_SIZE + ) + ).toEqual(hexToBytes(searchEntry.contentHash)); + } + }); + + it("works for a two entry manifest", async () => { + const { manifest, entries } = await makeManifestOfLength(2); + for (const searchEntry of entries) { + const path = await hashPath(searchEntry.path); + const foundEntry = 
interpolationSearch( + new Uint8Array(manifest, HEADER_SIZE), + path + ) as Uint8Array; + expect(foundEntry).not.toBe(false); + expect( + new Uint8Array( + foundEntry.buffer, + CONTENT_HASH_OFFSET + foundEntry.byteOffset, + CONTENT_HASH_SIZE + ) + ).toEqual(hexToBytes(searchEntry.contentHash)); + } + }); + + it("works for a three entry manifest", async () => { + const { manifest, entries } = await makeManifestOfLength(3); + for (const searchEntry of entries) { + const path = await hashPath(searchEntry.path); + const foundEntry = interpolationSearch( + new Uint8Array(manifest, HEADER_SIZE), + path + ) as Uint8Array; + expect(foundEntry).not.toBe(false); + expect( + new Uint8Array( + foundEntry.buffer, + CONTENT_HASH_OFFSET + foundEntry.byteOffset, + CONTENT_HASH_SIZE + ) + ).toEqual(hexToBytes(searchEntry.contentHash)); + } + }); + + it("works for a 20,000 entry manifest", async () => { + const { manifest, entries } = await makeManifestOfLength(20_000); + for (const searchEntry of entries) { + const path = await hashPath(searchEntry.path); + const foundEntry = interpolationSearch( + new Uint8Array(manifest, HEADER_SIZE), + path + ) as Uint8Array; + expect(foundEntry).not.toBe(false); + expect( + new Uint8Array( + foundEntry.buffer, + CONTENT_HASH_OFFSET + foundEntry.byteOffset, + CONTENT_HASH_SIZE + ) + ).toEqual(hexToBytes(searchEntry.contentHash)); + } + }); + }); +}); diff --git a/packages/workers-shared/asset-worker/vitest.config.mts b/packages/workers-shared/asset-worker/vitest.config.mts index 19d3d456b85a..f26ff109b995 100644 --- a/packages/workers-shared/asset-worker/vitest.config.mts +++ b/packages/workers-shared/asset-worker/vitest.config.mts @@ -7,6 +7,7 @@ export default mergeConfig( test: { include: ["tests/**.{test,spec}.{ts,js}"], globals: true, + setupFiles: [import.meta.resolve("./crypto-polyfill.ts")], }, }) );