`Buffer.asciiWrite` is 3x slower than `Buffer.set` #169

ronag · 2024-05-20T16:55:29Z

I would assume that Buffer.set and Buffer.asciiWrite should be roughly the same (they are both essentially a memcpy). However this is not the case.

Given that the time per call stays roughly the same even as the string size grows, I would assume that most of it goes to call overhead.

cpu: Apple M2 Pro
runtime: node v21.6.0 (arm64-darwin)

benchmark       time (avg)             (min … max)       p75       p99      p999
-------------------------------------------------- -----------------------------
• 8
-------------------------------------------------- -----------------------------
asciiWrite   34.29 ns/iter     (31.17 ns … 493 ns)  34.77 ns  40.71 ns  70.13 ns
loopWrite    10.27 ns/iter    (9.28 ns … 1'906 ns)  10.11 ns  14.97 ns  21.99 ns
bufWrite     13.31 ns/iter   (11.47 ns … 5'753 ns)  12.43 ns  15.65 ns  49.74 ns

summary for 8
  loopWrite
   1.3x faster than bufWrite
   3.34x faster than asciiWrite

• 16
-------------------------------------------------- -----------------------------
asciiWrite   33.56 ns/iter     (31.35 ns … 160 ns)  33.77 ns  39.75 ns  66.26 ns
loopWrite    20.99 ns/iter   (18.17 ns … 6'464 ns)  19.08 ns  26.37 ns   57.6 ns
bufWrite      13.7 ns/iter      (12 ns … 7'717 ns)  12.92 ns  16.48 ns     34 ns

summary for 16
  bufWrite
   1.53x faster than loopWrite
   2.45x faster than asciiWrite

• 32
-------------------------------------------------- -----------------------------
asciiWrite   37.25 ns/iter     (34.87 ns … 221 ns)  37.62 ns  43.68 ns  79.14 ns
loopWrite    92.46 ns/iter   (75.13 ns … 9'072 ns)  82.25 ns    228 ns  1'358 ns
bufWrite      13.6 ns/iter     (12.57 ns … 473 ns)  13.61 ns  17.01 ns  32.76 ns

summary for 32
  bufWrite
   2.74x faster than asciiWrite
   6.8x faster than loopWrite

• 64
-------------------------------------------------- -----------------------------
asciiWrite   37.03 ns/iter   (33.41 ns … 4'565 ns)  37.03 ns  43.38 ns  86.69 ns
loopWrite      166 ns/iter     (150 ns … 2'707 ns)    165 ns    183 ns  2'318 ns
bufWrite     12.52 ns/iter     (11.72 ns … 554 ns)  12.63 ns  15.83 ns  23.99 ns

summary for 64
  bufWrite
   2.96x faster than asciiWrite
   13.25x faster than loopWrite

import { bench, group, run } from 'mitata'

// Shared 64-byte destination buffer that the write helpers below target.
// It is pre-filled with byte 88 (ASCII 'X').
const BUF = Buffer.allocUnsafe(64)
BUF.fill(88)

// 64 cache slots, all initially null; bufWrite lazily stores the encoded
// form of a string in the slot selected by its `index` argument.
const BUF_BUF = Array.from({ length: 64 }, () => null)

/**
 * Benchmark variant: writes `str` into the shared BUF through
 * `Buffer.prototype.asciiWrite`, starting at offset 0.
 *
 * @param {string} str - Text to write into BUF.
 * @returns {void} The byte count reported by `asciiWrite` is discarded.
 */
function asciiWrite(str) {
  const offset = 0
  BUF.asciiWrite(str, offset)
}

/**
 * Benchmark variant: copies the Buffer-encoded form of `str` into the shared
 * BUF with `BUF.set`.
 *
 * The encoded Buffer is created at most once per slot: if `BUF_BUF[index]` is
 * null/undefined it is filled with `Buffer.from(str)`; otherwise the cached
 * Buffer is reused and `str` is not consulted.
 *
 * @param {string} str - Text to encode when the slot is still empty.
 * @param {number} index - Slot in BUF_BUF holding the cached encoding.
 * @returns {void}
 */
function bufWrite(str, index) {
  let encoded = BUF_BUF[index]
  if (encoded === null || encoded === undefined) {
    encoded = Buffer.from(str)
    BUF_BUF[index] = encoded
  }
  BUF.set(encoded)
}

/**
 * Benchmark variant: copies `src` into the shared BUF one element at a time,
 * storing `src.charCodeAt(i)` at `BUF[i]` for every index of `src`.
 *
 * @param {string} src - Text whose UTF-16 code units are written into BUF.
 * @returns {void}
 */
function loopWrite(src) {
  let pos = 0
  while (pos < src.length) {
    BUF[pos] = src.charCodeAt(pos)
    pos += 1
  }
}

// Benchmark inputs: strings of 8, 16, 32 and 64 characters.
// The two longest are built by repeating the 16-character input.
const str8 = '01234567'
const str16 = '0123456789abcdef'
const str32 = str16.repeat(2)
const str64 = str16.repeat(4)

// One group per input length. Every group registers the same three writers
// (asciiWrite, loopWrite, bufWrite) against that length's input string.
// bufWrite's second argument picks the BUF_BUF slot used for that input.

// 8-character input
group('8', () => {
  bench('asciiWrite', () => {
    asciiWrite(str8)
  })
  bench('loopWrite', () => {
    loopWrite(str8)
  })
  bench('bufWrite', () => {
    bufWrite(str8, 1)
  })
})

// 16-character input
group('16', () => {
  bench('asciiWrite', () => {
    asciiWrite(str16)
  })
  bench('loopWrite', () => {
    loopWrite(str16)
  })
  bench('bufWrite', () => {
    bufWrite(str16, 2)
  })
})

// 32-character input
group('32', () => {
  bench('asciiWrite', () => {
    asciiWrite(str32)
  })
  bench('loopWrite', () => {
    loopWrite(str32)
  })
  bench('bufWrite', () => {
    bufWrite(str32, 3)
  })
})

// 64-character input
group('64', () => {
  bench('asciiWrite', () => {
    asciiWrite(str64)
  })
  bench('loopWrite', () => {
    loopWrite(str64)
  })
  bench('bufWrite', () => {
    bufWrite(str64, 4)
  })
})

// Execute everything registered above.
await run()

Refs: #168

The text was updated successfully, but these errors were encountered:

evanwashere · 2024-10-11T12:38:02Z

you are caching text encoding with BUF_BUF[index] ??= Buffer.from(str) so benchmark produces misleading results

Here is a fixed example:

import { run, bench, summary } from 'mitata'

// Three ways of getting a string of 'x' characters into a scratch Buffer,
// each parameterised by a 'size' argument configured with range 1..1024.
// For every size the string and the scratch buffer are created once in the
// generator; only the yielded function is the benchmarked work.
summary(() => {
  // Encodes the string into a new Buffer on every call, then copies it with set().
  bench('Buffer.set($size)', function* (state) {
    const size = state.get('size');
    const text = 'x'.repeat(size);
    const target = Buffer.allocUnsafe(size);
    const encode = () => Buffer.from(text, 'ascii');

    yield () => target.set(encode(), 0);
  })
    .compact()
    .range('size', 1, 1024);

  // Writes the string directly with asciiWrite().
  bench('Buffer.asciiWrite($size)', function* (state) {
    const size = state.get('size');
    const text = 'x'.repeat(size);
    const target = Buffer.allocUnsafe(size);

    yield () => target.asciiWrite(text, 0);
  })
    .compact()
    .range('size', 1, 1024);

  // Writes one char code per index in a plain JavaScript loop.
  bench('for loop ($size)', function* (state) {
    const size = state.get('size');
    const text = 'x'.repeat(size);
    const target = Buffer.allocUnsafe(size);

    yield () => {
      for (let pos = 0; pos < text.length; pos += 1) {
        target[pos] = text.charCodeAt(pos);
      }
    };
  })
    .compact()
    .range('size', 1, 1024);
});

// Execute everything registered above.
await run();

clk: ~3.25 GHz
cpu: Apple M2 Pro
runtime: node 22.9.0 (arm64-darwin)

benchmark              avg (min … max) p75   p99    (min … top 1%)
-------------------------------------- -------------------------------
Buffer.set(1)            44.55 ns/iter  44.90 ns  65.30 ns █▄▂▁▁▁▁▁▁▁▁
Buffer.set(8)            46.47 ns/iter  47.04 ns  67.72 ns █▆▃▂▁▁▁▁▁▁▁
Buffer.set(64)           81.05 ns/iter  82.12 ns 105.02 ns ▃██▃▂▁▁▁▁▁▁
Buffer.set(512)         128.24 ns/iter 132.17 ns 156.47 ns ▁▂▂▇█▆▄▂▂▂▁
Buffer.set(1024)        162.07 ns/iter 170.16 ns 198.58 ns ▂▂▃▃▄█▅▃▂▃▁
Buffer.asciiWrite(1)     31.07 ns/iter  31.39 ns  35.41 ns ▁▁▁▁▁▁▁▁█▃▁
Buffer.asciiWrite(8)     31.87 ns/iter  32.00 ns  36.10 ns ▁▁▁▁▁▁▁▁█▃▁
Buffer.asciiWrite(64)    34.58 ns/iter  34.82 ns  40.47 ns █▁▂▂▁▁▁▁▁▁▁
Buffer.asciiWrite(512)   43.15 ns/iter  43.71 ns  49.75 ns █▂▂▂▁▁▁▁▁▁▁
Buffer.asciiWrite(1024)  52.29 ns/iter  52.75 ns  59.33 ns █▂▂▂▂▁▁▁▁▁▁
for loop (1)              1.18 ns/iter   1.15 ns   2.15 ns █▁▁▁▁▁▁▁▁▁▁
for loop (8)              6.10 ns/iter   5.98 ns   8.69 ns █▁▁▁▁▁▁▁▁▁▁
for loop (64)           158.66 ns/iter 160.59 ns 170.77 ns █▇█▄▄▃▂▂▂▁▁
for loop (512)            1.22 µs/iter   1.24 µs   1.27 µs ▂▆▅▄▅▇▅█▄▄▁
for loop (1024)           2.39 µs/iter   2.40 µs   2.43 µs ▂█▂▆▇▃▅▅▄▃▁

summary
  Buffer.asciiWrite($size)
   3.1…1.43x faster than Buffer.set($size)
   45.72…-26.43x faster than for loop ($size)

ronag · 2024-10-11T12:48:22Z

you are caching text encoding with BUF_BUF[index] ??= Buffer.from(str) so benchmark produces misleading results

That's on purpose. The conversion to buffer should not be part of the benchmark.

evanwashere · 2024-10-11T12:54:14Z

you are caching text encoding with BUF_BUF[index] ??= Buffer.from(str) so benchmark produces misleading results

That's on purpose. The conversion to buffer should not be part of the benchmark.

The for loop and Buffer.asciiWrite always have to do text encoding, so I'm not sure what the purpose would be of comparing a memcpy of cached bytes against text encoding to bytes + memcpy.

ronag · 2024-10-11T13:06:06Z

always have to do text encoding

It does not. When it has a "one-byte" representation it can be memcpied. Though there is indeed something wrong with the benchmark. It should be latin1Write and not ascii. So maybe that would fix the difference.

evanwashere · 2024-10-11T14:01:44Z

always have to do text encoding

It does not. When it has a "one-byte" representation it can be memcpied. Though there is indeed something wrong with the benchmark. It should be latin1Write and not ascii. So maybe that would fix the difference.

Now it makes much more sense what you are trying to compare (found it). In this case the difference comes either from V8 doing something with the string when passing it to the fast function (checking whether the string is latin1? might depend on length? making a unique copy of the string for C++?) or from Uint8Array.set and memcpy being different:

Uint8Array memcpy 14256 bytes  181.47 ns/iter 187.24 ns 220.88 ns ▂▆█▆▄▃▃▂▂▁▁
asciiWrite(14256 bytes [lati.. 292.18 ns/iter 306.63 ns 319.10 ns ▁▁▁▁▁▁▁▁▁█▂
asciiWrite(14259 bytes [lati..   4.20 µs/iter   4.22 µs   4.23 µs ▃▁▃▅▅▃█▁▄█▂

summary
  Uint8Array memcpy 14256 bytes
   1.61x faster than asciiWrite(14256 bytes [latin1])
   23.16x faster than asciiWrite(14259 bytes [latin1 + 1 unicode])

ronag changed the title ~~asciiWrite is slow~~ Buffer.asciiWrite is slow May 20, 2024

ronag changed the title ~~Buffer.asciiWrite is slow~~ Buffer.asciiWrite is 3x slower than Buffer.set May 20, 2024

ronag closed this as completed Oct 11, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

`Buffer.asciiWrite` is 3x slower than `Buffer.set` #169

`Buffer.asciiWrite` is 3x slower than `Buffer.set` #169

ronag commented May 20, 2024 •

edited

Loading

evanwashere commented Oct 11, 2024

ronag commented Oct 11, 2024 •

edited

Loading

evanwashere commented Oct 11, 2024 •

edited

Loading

ronag commented Oct 11, 2024 •

edited

Loading

evanwashere commented Oct 11, 2024 •

edited

Loading

Buffer.asciiWrite is 3x slower than Buffer.set #169

Buffer.asciiWrite is 3x slower than Buffer.set #169

Comments

ronag commented May 20, 2024 • edited Loading

evanwashere commented Oct 11, 2024

ronag commented Oct 11, 2024 • edited Loading

evanwashere commented Oct 11, 2024 • edited Loading

ronag commented Oct 11, 2024 • edited Loading

evanwashere commented Oct 11, 2024 • edited Loading

`Buffer.asciiWrite` is 3x slower than `Buffer.set` #169

`Buffer.asciiWrite` is 3x slower than `Buffer.set` #169

ronag commented May 20, 2024 •

edited

Loading

ronag commented Oct 11, 2024 •

edited

Loading

evanwashere commented Oct 11, 2024 •

edited

Loading

ronag commented Oct 11, 2024 •

edited

Loading

evanwashere commented Oct 11, 2024 •

edited

Loading