From 6bf960e5bd5d38ae691c390c245c33e175a27cdb Mon Sep 17 00:00:00 2001
From: greatroar <61184462+greatroar@users.noreply.github.com>
Date: Sat, 9 Dec 2023 17:46:34 +0100
Subject: [PATCH] zstd: Shorter and faster asm for decSymbol.newState (#896)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* zstd: Shorter asm for decSymbol.newState

The asm needs to compute decSymbol.newState, which is uint16(state >> 16),
or, equivalently (except for types), uint32(state) >> 16. This can be
accomplished by a MOVL+SHRL, the former of which is elided by avo, so we
get a single instruction for both the BMI2 and non-BMI2 cases.

Benchmarks show no difference on a new BMI2-supporting machine, but on
an older i7, decompression throughput is a tiny bit faster:

goos: linux
goarch: amd64
pkg: github.com/klauspost/compress/zstd
cpu: Intel(R) Core(TM) i7-3770K CPU @ 3.50GHz
                                       │     old      │               shift                │
                                       │     B/s      │     B/s       vs base              │
Decoder_DecodeAll/kppkn.gtb.zst-8        441.4Mi ± 2%   450.4Mi ± 0%  +2.03% (p=0.000 n=10)
Decoder_DecodeAll/geo.protodata.zst-8    1.148Gi ± 1%   1.152Gi ± 0%  +0.34% (p=0.009 n=10)
Decoder_DecodeAll/plrabn12.txt.zst-8     347.9Mi ± 0%   356.6Mi ± 1%  +2.48% (p=0.000 n=10)
Decoder_DecodeAll/lcet10.txt.zst-8       417.4Mi ± 0%   427.3Mi ± 0%  +2.37% (p=0.000 n=10)
Decoder_DecodeAll/asyoulik.txt.zst-8     347.1Mi ± 0%   352.7Mi ± 1%  +1.62% (p=0.003 n=10)
Decoder_DecodeAll/alice29.txt.zst-8      346.3Mi ± 1%   352.6Mi ± 0%  +1.83% (p=0.000 n=10)
Decoder_DecodeAll/html_x_4.zst-8         1.440Gi ± 0%   1.445Gi ± 0%  +0.29% (p=0.019 n=10)
Decoder_DecodeAll/paper-100k.pdf.zst-8   4.191Gi ± 0%   4.210Gi ± 0%  +0.45% (p=0.007 n=10)
Decoder_DecodeAll/fireworks.jpeg.zst-8   8.891Gi ± 0%   8.849Gi ± 0%  -0.47% (p=0.000 n=10)
Decoder_DecodeAll/urls.10K.zst-8         589.6Mi ± 0%   600.2Mi ± 0%  +1.80% (p=0.001 n=10)
Decoder_DecodeAll/html.zst-8             926.1Mi ± 1%   937.9Mi ± 0%  +1.27% (p=0.000 n=10)
Decoder_DecodeAll/comp-data.bin.zst-8    389.6Mi ± 0%   395.1Mi ± 0%  +1.40% (p=0.000 n=10)
geomean                                  832.6Mi        843.3Mi       +1.28%

* zstd: Remove unused
parameter in asm generator --- zstd/_generate/gen.go | 39 ++++-------- zstd/seqdec_amd64.s | 136 +++++++++++++++++------------------------- 2 files changed, 68 insertions(+), 107 deletions(-) diff --git a/zstd/_generate/gen.go b/zstd/_generate/gen.go index 7ef9a45aa8..03a3595d4a 100644 --- a/zstd/_generate/gen.go +++ b/zstd/_generate/gen.go @@ -316,7 +316,7 @@ func (o options) generateBody(name string, executeSingleTriple func(ctx *execute lowBits := GP64() BZHIQ(nBits, bits, lowBits) // lowBits = bits & ((1 << nBits) - 1)) SHRXQ(nBits, bits, bits) // bits >>= nBits - o.nextState(name+"_ofState", ofState, lowBits, "ofTable") + o.nextState(ofState, lowBits, "ofTable") } Comment("Update Match Length State") { @@ -324,22 +324,22 @@ func (o options) generateBody(name string, executeSingleTriple func(ctx *execute lowBits := GP64() BZHIQ(nBits, bits, lowBits) // lowBits = bits & ((1 << nBits) - 1)) SHRXQ(nBits, bits, bits) // lowBits >>= nBits - o.nextState(name+"_mlState", mlState, lowBits, "mlTable") + o.nextState(mlState, lowBits, "mlTable") } Comment("Update Literal Length State") { nBits := llState lowBits := GP64() BZHIQ(nBits, bits, lowBits) // lowBits = bits & ((1 << nBits) - 1)) - o.nextState(name+"_llState", llState, lowBits, "llTable") + o.nextState(llState, lowBits, "llTable") } } else { Comment("Update Literal Length State") - o.updateState(name+"_llState", llState, brValue, brBitsRead, "llTable") + o.updateState(llState, brValue, brBitsRead, "llTable") Comment("Update Match Length State") - o.updateState(name+"_mlState", mlState, brValue, brBitsRead, "mlTable") + o.updateState(mlState, brValue, brBitsRead, "mlTable") Comment("Update Offset State") - o.updateState(name+"_ofState", ofState, brValue, brBitsRead, "ofTable") + o.updateState(ofState, brValue, brBitsRead, "ofTable") } } Label(name + "_skip_update") @@ -631,8 +631,7 @@ func (o options) updateLength(name string, brValue, brBitsRead, state reg.GPVirt } } -func (o options) updateState(name string, 
state, brValue, brBitsRead reg.GPVirtual, table string) { - name = name + "_updateState" +func (o options) updateState(state, brValue, brBitsRead reg.GPVirtual, table string) { AX := GP64() MOVBQZX(state.As8(), AX) // AX = nBits // Check we have a reasonable nBits @@ -642,15 +641,8 @@ func (o options) updateState(name string, state, brValue, brBitsRead reg.GPVirtu }) DX := GP64() - if o.bmi2 { - tmp := GP64() - MOVQ(U32(16|(16<<8)), tmp) - BEXTRQ(tmp, state, DX) - } else { - MOVQ(state, DX) - SHRQ(U8(16), DX) - MOVWQZX(DX.As16(), DX) - } + MOVL(state.As32(), DX.As32()) // Clear the top 32 bits. + SHRL(U8(16), DX.As32()) { lowBits := o.getBits(AX, brValue, brBitsRead) @@ -681,17 +673,10 @@ func (o options) updateState(name string, state, brValue, brBitsRead reg.GPVirtu MOVQ(Mem{Base: tablePtr, Index: DX, Scale: 8}, state) } -func (o options) nextState(name string, state, lowBits reg.GPVirtual, table string) { +func (o options) nextState(state, lowBits reg.GPVirtual, table string) { DX := GP64() - if o.bmi2 { - tmp := GP64() - MOVQ(U32(16|(16<<8)), tmp) - BEXTRQ(tmp, state, DX) - } else { - MOVQ(state, DX) - SHRQ(U8(16), DX) - MOVWQZX(DX.As16(), DX) - } + MOVL(state.As32(), DX.As32()) // Clear the top 32 bits. 
+ SHRL(U8(16), DX.As32()) ADDQ(lowBits, DX) diff --git a/zstd/seqdec_amd64.s b/zstd/seqdec_amd64.s index 974b99725f..5b06174b89 100644 --- a/zstd/seqdec_amd64.s +++ b/zstd/seqdec_amd64.s @@ -157,8 +157,7 @@ sequenceDecs_decode_amd64_ll_update_zero: // Update Literal Length State MOVBQZX DI, R14 - SHRQ $0x10, DI - MOVWQZX DI, DI + SHRL $0x10, DI LEAQ (BX)(R14*1), CX MOVQ DX, R15 MOVQ CX, BX @@ -177,8 +176,7 @@ sequenceDecs_decode_amd64_ll_update_zero: // Update Match Length State MOVBQZX R8, R14 - SHRQ $0x10, R8 - MOVWQZX R8, R8 + SHRL $0x10, R8 LEAQ (BX)(R14*1), CX MOVQ DX, R15 MOVQ CX, BX @@ -197,8 +195,7 @@ sequenceDecs_decode_amd64_ll_update_zero: // Update Offset State MOVBQZX R9, R14 - SHRQ $0x10, R9 - MOVWQZX R9, R9 + SHRL $0x10, R9 LEAQ (BX)(R14*1), CX MOVQ DX, R15 MOVQ CX, BX @@ -459,8 +456,7 @@ sequenceDecs_decode_56_amd64_ll_update_zero: // Update Literal Length State MOVBQZX DI, R14 - SHRQ $0x10, DI - MOVWQZX DI, DI + SHRL $0x10, DI LEAQ (BX)(R14*1), CX MOVQ DX, R15 MOVQ CX, BX @@ -479,8 +475,7 @@ sequenceDecs_decode_56_amd64_ll_update_zero: // Update Match Length State MOVBQZX R8, R14 - SHRQ $0x10, R8 - MOVWQZX R8, R8 + SHRL $0x10, R8 LEAQ (BX)(R14*1), CX MOVQ DX, R15 MOVQ CX, BX @@ -499,8 +494,7 @@ sequenceDecs_decode_56_amd64_ll_update_zero: // Update Offset State MOVBQZX R9, R14 - SHRQ $0x10, R9 - MOVWQZX R9, R9 + SHRL $0x10, R9 LEAQ (BX)(R14*1), CX MOVQ DX, R15 MOVQ CX, BX @@ -772,11 +766,10 @@ sequenceDecs_decode_bmi2_fill_2_end: BZHIQ R14, R15, R15 // Update Offset State - BZHIQ R8, R15, CX - SHRXQ R8, R15, R15 - MOVQ $0x00001010, R14 - BEXTRQ R14, R8, R8 - ADDQ CX, R8 + BZHIQ R8, R15, CX + SHRXQ R8, R15, R15 + SHRL $0x10, R8 + ADDQ CX, R8 // Load ctx.ofTable MOVQ ctx+16(FP), CX @@ -784,11 +777,10 @@ sequenceDecs_decode_bmi2_fill_2_end: MOVQ (CX)(R8*8), R8 // Update Match Length State - BZHIQ DI, R15, CX - SHRXQ DI, R15, R15 - MOVQ $0x00001010, R14 - BEXTRQ R14, DI, DI - ADDQ CX, DI + BZHIQ DI, R15, CX + SHRXQ DI, R15, R15 + SHRL $0x10, DI + ADDQ 
CX, DI // Load ctx.mlTable MOVQ ctx+16(FP), CX @@ -796,10 +788,9 @@ sequenceDecs_decode_bmi2_fill_2_end: MOVQ (CX)(DI*8), DI // Update Literal Length State - BZHIQ SI, R15, CX - MOVQ $0x00001010, R14 - BEXTRQ R14, SI, SI - ADDQ CX, SI + BZHIQ SI, R15, CX + SHRL $0x10, SI + ADDQ CX, SI // Load ctx.llTable MOVQ ctx+16(FP), CX @@ -1032,11 +1023,10 @@ sequenceDecs_decode_56_bmi2_fill_end: BZHIQ R14, R15, R15 // Update Offset State - BZHIQ R8, R15, CX - SHRXQ R8, R15, R15 - MOVQ $0x00001010, R14 - BEXTRQ R14, R8, R8 - ADDQ CX, R8 + BZHIQ R8, R15, CX + SHRXQ R8, R15, R15 + SHRL $0x10, R8 + ADDQ CX, R8 // Load ctx.ofTable MOVQ ctx+16(FP), CX @@ -1044,11 +1034,10 @@ sequenceDecs_decode_56_bmi2_fill_end: MOVQ (CX)(R8*8), R8 // Update Match Length State - BZHIQ DI, R15, CX - SHRXQ DI, R15, R15 - MOVQ $0x00001010, R14 - BEXTRQ R14, DI, DI - ADDQ CX, DI + BZHIQ DI, R15, CX + SHRXQ DI, R15, R15 + SHRL $0x10, DI + ADDQ CX, DI // Load ctx.mlTable MOVQ ctx+16(FP), CX @@ -1056,10 +1045,9 @@ sequenceDecs_decode_56_bmi2_fill_end: MOVQ (CX)(DI*8), DI // Update Literal Length State - BZHIQ SI, R15, CX - MOVQ $0x00001010, R14 - BEXTRQ R14, SI, SI - ADDQ CX, SI + BZHIQ SI, R15, CX + SHRL $0x10, SI + ADDQ CX, SI // Load ctx.llTable MOVQ ctx+16(FP), CX @@ -1967,8 +1955,7 @@ sequenceDecs_decodeSync_amd64_ll_update_zero: // Update Literal Length State MOVBQZX DI, R13 - SHRQ $0x10, DI - MOVWQZX DI, DI + SHRL $0x10, DI LEAQ (BX)(R13*1), CX MOVQ DX, R14 MOVQ CX, BX @@ -1987,8 +1974,7 @@ sequenceDecs_decodeSync_amd64_ll_update_zero: // Update Match Length State MOVBQZX R8, R13 - SHRQ $0x10, R8 - MOVWQZX R8, R8 + SHRL $0x10, R8 LEAQ (BX)(R13*1), CX MOVQ DX, R14 MOVQ CX, BX @@ -2007,8 +1993,7 @@ sequenceDecs_decodeSync_amd64_ll_update_zero: // Update Offset State MOVBQZX R9, R13 - SHRQ $0x10, R9 - MOVWQZX R9, R9 + SHRL $0x10, R9 LEAQ (BX)(R13*1), CX MOVQ DX, R14 MOVQ CX, BX @@ -2514,11 +2499,10 @@ sequenceDecs_decodeSync_bmi2_fill_2_end: BZHIQ R13, R14, R14 // Update Offset State - BZHIQ R8, R14, 
CX - SHRXQ R8, R14, R14 - MOVQ $0x00001010, R13 - BEXTRQ R13, R8, R8 - ADDQ CX, R8 + BZHIQ R8, R14, CX + SHRXQ R8, R14, R14 + SHRL $0x10, R8 + ADDQ CX, R8 // Load ctx.ofTable MOVQ ctx+16(FP), CX @@ -2526,11 +2510,10 @@ sequenceDecs_decodeSync_bmi2_fill_2_end: MOVQ (CX)(R8*8), R8 // Update Match Length State - BZHIQ DI, R14, CX - SHRXQ DI, R14, R14 - MOVQ $0x00001010, R13 - BEXTRQ R13, DI, DI - ADDQ CX, DI + BZHIQ DI, R14, CX + SHRXQ DI, R14, R14 + SHRL $0x10, DI + ADDQ CX, DI // Load ctx.mlTable MOVQ ctx+16(FP), CX @@ -2538,10 +2521,9 @@ sequenceDecs_decodeSync_bmi2_fill_2_end: MOVQ (CX)(DI*8), DI // Update Literal Length State - BZHIQ SI, R14, CX - MOVQ $0x00001010, R13 - BEXTRQ R13, SI, SI - ADDQ CX, SI + BZHIQ SI, R14, CX + SHRL $0x10, SI + ADDQ CX, SI // Load ctx.llTable MOVQ ctx+16(FP), CX @@ -3055,8 +3037,7 @@ sequenceDecs_decodeSync_safe_amd64_ll_update_zero: // Update Literal Length State MOVBQZX DI, R13 - SHRQ $0x10, DI - MOVWQZX DI, DI + SHRL $0x10, DI LEAQ (BX)(R13*1), CX MOVQ DX, R14 MOVQ CX, BX @@ -3075,8 +3056,7 @@ sequenceDecs_decodeSync_safe_amd64_ll_update_zero: // Update Match Length State MOVBQZX R8, R13 - SHRQ $0x10, R8 - MOVWQZX R8, R8 + SHRL $0x10, R8 LEAQ (BX)(R13*1), CX MOVQ DX, R14 MOVQ CX, BX @@ -3095,8 +3075,7 @@ sequenceDecs_decodeSync_safe_amd64_ll_update_zero: // Update Offset State MOVBQZX R9, R13 - SHRQ $0x10, R9 - MOVWQZX R9, R9 + SHRL $0x10, R9 LEAQ (BX)(R13*1), CX MOVQ DX, R14 MOVQ CX, BX @@ -3704,11 +3683,10 @@ sequenceDecs_decodeSync_safe_bmi2_fill_2_end: BZHIQ R13, R14, R14 // Update Offset State - BZHIQ R8, R14, CX - SHRXQ R8, R14, R14 - MOVQ $0x00001010, R13 - BEXTRQ R13, R8, R8 - ADDQ CX, R8 + BZHIQ R8, R14, CX + SHRXQ R8, R14, R14 + SHRL $0x10, R8 + ADDQ CX, R8 // Load ctx.ofTable MOVQ ctx+16(FP), CX @@ -3716,11 +3694,10 @@ sequenceDecs_decodeSync_safe_bmi2_fill_2_end: MOVQ (CX)(R8*8), R8 // Update Match Length State - BZHIQ DI, R14, CX - SHRXQ DI, R14, R14 - MOVQ $0x00001010, R13 - BEXTRQ R13, DI, DI - ADDQ CX, DI + BZHIQ 
DI, R14, CX + SHRXQ DI, R14, R14 + SHRL $0x10, DI + ADDQ CX, DI // Load ctx.mlTable MOVQ ctx+16(FP), CX @@ -3728,10 +3705,9 @@ sequenceDecs_decodeSync_safe_bmi2_fill_2_end: MOVQ (CX)(DI*8), DI // Update Literal Length State - BZHIQ SI, R14, CX - MOVQ $0x00001010, R13 - BEXTRQ R13, SI, SI - ADDQ CX, SI + BZHIQ SI, R14, CX + SHRL $0x10, SI + ADDQ CX, SI // Load ctx.llTable MOVQ ctx+16(FP), CX