From 2b93f5cf38ff1cf255d397f470a1008d701d787f Mon Sep 17 00:00:00 2001 From: Thomas Wolf Date: Thu, 18 Jul 2024 22:03:50 +0200 Subject: [PATCH] GH-524: Poly1305Mac: avoid needless byte copying The update() implementation copied _all_ bytes (successively) first into an internal 16-byte buffer and then processed that buffer. This is not needed if the input is long. Use the internal 16-byte buffer only for inputs shorter than 16 bytes, or if there is a leftover of less than 16 bytes at the end of a long input. In between, process 16-byte chunks directly from the input byte array. For 32kB inputs this saves us some 2048 calls to System.arraycopy() copying all those 32kB. The speedup is minimal but noticeable in benchmarking. Bug: https://github.com/apache/mina-sshd/issues/524 --- .../apache/sshd/common/mac/Poly1305Mac.java | 56 ++++++++++++------- 1 file changed, 35 insertions(+), 21 deletions(-) diff --git a/sshd-common/src/main/java/org/apache/sshd/common/mac/Poly1305Mac.java b/sshd-common/src/main/java/org/apache/sshd/common/mac/Poly1305Mac.java index 62c8edb7b..4fa809b2f 100644 --- a/sshd-common/src/main/java/org/apache/sshd/common/mac/Poly1305Mac.java +++ b/sshd-common/src/main/java/org/apache/sshd/common/mac/Poly1305Mac.java @@ -95,20 +95,36 @@ public void init(byte[] key) throws Exception { k1 = unpackIntLE(key, 20); k2 = unpackIntLE(key, 24); k3 = unpackIntLE(key, 28); + + currentBlockOffset = 0; } @Override public void update(byte[] in, int offset, int length) { - while (length > 0) { - if (currentBlockOffset == BLOCK_SIZE) { - processBlock(); - } - + if (currentBlockOffset > 0) { + // There is a partially filled block. 
int toCopy = Math.min(length, BLOCK_SIZE - currentBlockOffset); System.arraycopy(in, offset, currentBlock, currentBlockOffset, toCopy); offset += toCopy; length -= toCopy; currentBlockOffset += toCopy; + if (currentBlockOffset == BLOCK_SIZE) { + processBlock(currentBlock, 0, BLOCK_SIZE); + currentBlockOffset = 0; + } + if (length == 0) { + return; + } + } + while (length >= BLOCK_SIZE) { + processBlock(in, offset, BLOCK_SIZE); + offset += BLOCK_SIZE; + length -= BLOCK_SIZE; + } + if (length > 0) { + // Put remaining bytes into internal buffer (length < BLOCK_SIZE here). + System.arraycopy(in, offset, currentBlock, 0, length); + currentBlockOffset = length; } } @@ -125,7 +141,14 @@ public void doFinal(byte[] out, int offset) throws Exception { throw new BufferOverflowException(); } if (currentBlockOffset > 0) { - processBlock(); + if (currentBlockOffset < BLOCK_SIZE) { + // padding + currentBlock[currentBlockOffset] = 1; + for (int i = currentBlockOffset + 1; i < BLOCK_SIZE; i++) { + currentBlock[i] = 0; + } + } + processBlock(currentBlock, 0, currentBlockOffset); } h1 += h0 >>> 26; @@ -179,19 +202,12 @@ public void doFinal(byte[] out, int offset) throws Exception { reset(); } - private void processBlock() { - if (currentBlockOffset < BLOCK_SIZE) { - // padding - currentBlock[currentBlockOffset] = 1; - for (int i = currentBlockOffset + 1; i < BLOCK_SIZE; i++) { - currentBlock[i] = 0; - } - } + private void processBlock(byte[] block, int offset, int length) { - int t0 = unpackIntLE(currentBlock, 0); - int t1 = unpackIntLE(currentBlock, 4); - int t2 = unpackIntLE(currentBlock, 8); - int t3 = unpackIntLE(currentBlock, 12); + int t0 = unpackIntLE(block, offset); + int t1 = unpackIntLE(block, offset + 4); + int t2 = unpackIntLE(block, offset + 8); + int t3 = unpackIntLE(block, offset + 12); h0 += t0 & 0x3ffffff; h1 += (t0 >>> 26 | t1 << 6) & 0x3ffffff; @@ -199,7 +215,7 @@ private void processBlock() { h3 += (t2 >>> 14 | t3 << 18) & 0x3ffffff; h4 += t3 >>> 8; - if 
(currentBlockOffset == BLOCK_SIZE) { + if (length == BLOCK_SIZE) { h4 += 1 << 24; } @@ -226,8 +242,6 @@ private void processBlock() { h0 += (int) (tp4 >>> 26) * 5; h1 += h0 >>> 26; h0 &= 0x3ffffff; - - currentBlockOffset = 0; } private void reset() {