diff --git a/chacha20poly1305/_asm/chacha20poly1305_amd64_asm.go b/chacha20poly1305/_asm/chacha20poly1305_amd64_asm.go new file mode 100644 index 0000000000..e9ba153b4c --- /dev/null +++ b/chacha20poly1305/_asm/chacha20poly1305_amd64_asm.go @@ -0,0 +1,5516 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This assembly implementation was originally from https://golang.org/cl/24717 by Vlad Krasnov of CloudFlare. + +package main + +import ( + "fmt" + "os" + "strings" + + . "github.com/mmcloughlin/avo/build" + "github.com/mmcloughlin/avo/ir" + . "github.com/mmcloughlin/avo/operand" + . "github.com/mmcloughlin/avo/reg" + _ "golang.org/x/crypto/chacha20poly1305" +) + +//go:generate go run . -out ../chacha20poly1305_amd64.s -pkg chacha20poly1305 + +var ( + // General register allocation + oup GPPhysical = RDI + inp = RSI + inl = RBX + adp = RCX // free to reuse, after we hash the additional data + keyp = R8 // free to reuse, when we copy the key to stack + itr2 = R9 // general iterator + itr1 = RCX // general iterator + acc0 = R10 + acc1 = R11 + acc2 = R12 + t0 = R13 + t1 = R14 + t2 = R15 + t3 = R8 + + // Register and stack allocation for the SSE code + rStore Mem = Mem{Base: BP}.Offset(0 * 16) + sStore = Mem{Base: BP}.Offset(1 * 16) + state1Store = Mem{Base: BP}.Offset(2 * 16) + state2Store = Mem{Base: BP}.Offset(3 * 16) + tmpStore = Mem{Base: BP}.Offset(4 * 16) + ctr0Store = Mem{Base: BP}.Offset(5 * 16) + ctr1Store = Mem{Base: BP}.Offset(6 * 16) + ctr2Store = Mem{Base: BP}.Offset(7 * 16) + ctr3Store = Mem{Base: BP}.Offset(8 * 16) + A0 VecPhysical = X0 + A1 = X1 + A2 = X2 + B0 = X3 + B1 = X4 + B2 = X5 + C0 = X6 + C1 = X7 + C2 = X8 + D0 = X9 + D1 = X10 + D2 = X11 + T0 = X12 + T1 = X13 + T2 = X14 + T3 = X15 + A3 = T0 + B3 = T1 + C3 = T2 + D3 = T3 + + // Register and stack allocation for the AVX2 code + rsStoreAVX2 Mem = Mem{Base: BP}.Offset(0 * 32) + state1StoreAVX2 = Mem{Base: BP}.Offset(1 * 32) + state2StoreAVX2 = Mem{Base: BP}.Offset(2 * 32) + ctr0StoreAVX2 = Mem{Base: BP}.Offset(3 * 32) + ctr1StoreAVX2 = Mem{Base: BP}.Offset(4 * 32) + ctr2StoreAVX2 = Mem{Base: BP}.Offset(5 * 32) + ctr3StoreAVX2 = Mem{Base: BP}.Offset(6 * 32) + tmpStoreAVX2 = Mem{Base: BP}.Offset(7 * 32) // 256 bytes on stack + AA0 VecPhysical = Y0 + AA1 = Y5 + AA2 = Y6 + AA3 = Y7 + BB0 = Y14 + BB1 = Y9 + BB2 = Y10 + BB3 = Y11 + CC0 = Y12 + CC1 = Y13 + CC2 = Y8 + CC3 = Y15 + DD0 = Y4 + DD1 = Y1 + DD2 = Y2 + DD3 = Y3 + TT0 = DD3 + TT1 = AA3 + TT2 = BB3 + TT3 = CC3 +) + +const ThatPeskyUnicodeDot = "\u00b7" + +func main() { + Package("golang.org/x/crypto/chacha20poly1305") + ConstraintExpr("gc,!purego") + polyHashADInternal() + chacha20Poly1305Open() + chacha20Poly1305Seal() + Generate() + + var internalFunctions []string = []string{"·polyHashADInternal"} + removePeskyUnicodeDot(internalFunctions, "../chacha20poly1305_amd64.s") +} + +// Utility function to emit BYTE instruction +func BYTE(u8 U8) { + Instruction(&ir.Instruction{Opcode: "BYTE", Operands: []Op{u8}}) +} + +// PALIGNR $4, X3, X3 +func shiftB0Left() { + BYTE(U8(0x66)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xdb)) + BYTE(U8(0x04)) +} + +// PALIGNR $4, X4, X4 +func shiftB1Left() { + BYTE(U8(0x66)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xe4)) + BYTE(U8(0x04)) +} + +// PALIGNR $4, X5, X5 +func shiftB2Left() { + BYTE(U8(0x66)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xed)) + BYTE(U8(0x04)) 
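+	// (All of these shift helpers hand-encode PALIGNR as raw bytes, apparently to keep
+	// the generated assembly identical to the BYTE-encoded macros of the pre-Avo
+	// hand-written chacha20poly1305_amd64.s.)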
+} + +// PALIGNR $4, X13, X13 +func shiftB3Left() { + BYTE(U8(0x66)) + BYTE(U8(0x45)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xed)) + BYTE(U8(0x04)) +} + +// PALIGNR $8, X6, X6 +func shiftC0Left() { + BYTE(U8(0x66)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xf6)) + BYTE(U8(0x08)) +} + +// PALIGNR $8, X7, X7 +func shiftC1Left() { + BYTE(U8(0x66)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xff)) + BYTE(U8(0x08)) +} + +// PALIGNR $8, X8, X8 +func shiftC2Left() { + BYTE(U8(0x66)) + BYTE(U8(0x45)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xc0)) + BYTE(U8(0x08)) +} + +// PALIGNR $8, X14, X14 +func shiftC3Left() { + BYTE(U8(0x66)) + BYTE(U8(0x45)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xf6)) + BYTE(U8(0x08)) +} + +// PALIGNR $12, X9, X9 +func shiftD0Left() { + BYTE(U8(0x66)) + BYTE(U8(0x45)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xc9)) + BYTE(U8(0x0c)) +} + +// PALIGNR $12, X10, X10 +func shiftD1Left() { + BYTE(U8(0x66)) + BYTE(U8(0x45)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xd2)) + BYTE(U8(0x0c)) +} + +// PALIGNR $12, X11, X11 +func shiftD2Left() { + BYTE(U8(0x66)) + BYTE(U8(0x45)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xdb)) + BYTE(U8(0x0c)) +} + +// PALIGNR $12, X15, X15 +func shiftD3Left() { + BYTE(U8(0x66)) + BYTE(U8(0x45)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xff)) + BYTE(U8(0x0c)) +} + +// PALIGNR $12, X3, X3 +func shiftB0Right() { + BYTE(U8(0x66)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xdb)) + BYTE(U8(0x0c)) +} + +// PALIGNR $12, X4, X4 +func shiftB1Right() { + BYTE(U8(0x66)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xe4)) + BYTE(U8(0x0c)) +} + +// PALIGNR $12, X5, X5 +func shiftB2Right() { + BYTE(U8(0x66)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xed)) + BYTE(U8(0x0c)) +} + +// PALIGNR $12, X13, X13 +func shiftB3Right() { + BYTE(U8(0x66)) + BYTE(U8(0x45)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xed)) + BYTE(U8(0x0c)) +} + +func shiftC0Right() { + shiftC0Left() +} + +func shiftC1Right() { + shiftC1Left() +} + +func shiftC2Right() { + shiftC2Left() +} + +func shiftC3Right() { + shiftC3Left() +} + +// PALIGNR $4, X9, X9 +func shiftD0Right() { + BYTE(U8(0x66)) + BYTE(U8(0x45)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xc9)) + BYTE(U8(0x04)) +} + +// PALIGNR $4, X10, X10 +func shiftD1Right() { + BYTE(U8(0x66)) + BYTE(U8(0x45)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xd2)) + BYTE(U8(0x04)) +} + +// PALIGNR $4, X11, X11 +func shiftD2Right() { + BYTE(U8(0x66)) + BYTE(U8(0x45)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xdb)) + BYTE(U8(0x04)) +} + +// PALIGNR $4, X15, X15 +func shiftD3Right() { + BYTE(U8(0x66)) + BYTE(U8(0x45)) + BYTE(U8(0x0f)) + BYTE(U8(0x3a)) + BYTE(U8(0x0f)) + BYTE(U8(0xff)) + BYTE(U8(0x04)) +} + +// ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~SOME MACROS~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~## + +// Hack: ROL must be a #define macro as it is referenced by other macros +func defineROL() { + definition := + `#define ROL(N, R, T) \ + MOVO R, T; \ + PSLLL $(N), T; \ + PSRLL $(32-(N)), R; \ + PXOR T, R` + Comment("ROL rotates the uint32s in register R left by N bits, using temporary T.") + Instruction(&ir.Instruction{Opcode: definition}) +} + +// ROL rotates the uint32s in register R left by N bits, using temporary T. 
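+// For each 32-bit lane this is the standard rotate identity R = (R << N) | (R >> (32-N)),
+// i.e. what bits.RotateLeft32 computes for a single uint32, built from shift/shift/xor
+// because SSE2 has no packed 32-bit rotate instruction.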
+func ROL(N uint64, R, T VecPhysical) { + // Hack: ROL must be a #define macro as it is referenced by other macros + Instruction(&ir.Instruction{Opcode: fmt.Sprintf("ROL(%s, %s, %s)", I8(N).Asm(), R.Asm(), T.Asm())}) +} + +// Hack to get Avo to generate an #ifdef +// +// ROL16(R, T) definition depends on a compiler flag that specifies amd64 architectural level. +func defineROL16() { + definition := + `#ifdef GOAMD64_v2 + #define ROL16(R, T) PSHUFB ·rol16<>(SB), R + #else + #define ROL16(R, T) ROL(16, R, T) + #endif` + + Comment("ROL16 rotates the uint32s in register R left by 16, using temporary T if needed.") + Instruction(&ir.Instruction{Opcode: definition}) +} + +// Hack to emit macro call +// +// ROL16 rotates the uint32s in register R left by 16, using temporary T if needed. +func ROL16(R, T VecPhysical) { + Instruction(&ir.Instruction{Opcode: fmt.Sprintf("ROL16(%s, %s)", R.Asm(), T.Asm())}) +} + +// Hack to get Avo to generate an #ifdef +// +// ROL8(R, T) definition depends on a compiler flag that specifies amd64 architectural level. +func defineROL8() { + definition := + `#ifdef GOAMD64_v2 + #define ROL8(R, T) PSHUFB ·rol8<>(SB), R + #else + #define ROL8(R, T) ROL(8, R, T) + #endif` + + Comment("ROL8 rotates the uint32s in register R left by 8, using temporary T if needed.") + Instruction(&ir.Instruction{Opcode: definition}) +} + +// Hack to emit macro call +// +// ROL8 rotates the uint32s in register R left by 8, using temporary T if needed. +func ROL8(R, T VecPhysical) { + Instruction(&ir.Instruction{Opcode: fmt.Sprintf("ROL8(%s, %s)", R.Asm(), T.Asm())}) +} + +func chachaQR(A, B, C, D, T VecPhysical) { + PADDD(B, A) + PXOR(A, D) + ROL16(D, T) + PADDD(D, C) + PXOR(C, B) + MOVO(B, T) + PSLLL(Imm(12), T) + PSRLL(Imm(20), B) + PXOR(T, B) + PADDD(B, A) + PXOR(A, D) + ROL8(D, T) + PADDD(D, C) + PXOR(C, B) + MOVO(B, T) + PSLLL(Imm(7), T) + PSRLL(Imm(25), B) + PXOR(T, B) +} + +func chachaQR_AVX2(A, B, C, D, T VecPhysical) { + VPADDD(B, A, A) + VPXOR(A, D, D) + rol16 := rol16_DATA() + VPSHUFB(rol16, D, D) + VPADDD(D, C, C) + VPXOR(C, B, B) + VPSLLD(Imm(12), B, T) + VPSRLD(Imm(20), B, B) + VPXOR(T, B, B) + VPADDD(B, A, A) + VPXOR(A, D, D) + rol8 := rol8_DATA() + VPSHUFB(rol8, D, D) + VPADDD(D, C, C) + VPXOR(C, B, B) + VPSLLD(Imm(7), B, T) + VPSRLD(Imm(25), B, B) + VPXOR(T, B, B) +} + +func polyAdd(S Mem) { + ADDQ(S, acc0) + ADCQ(S.Offset(8), acc1) + ADCQ(Imm(1), acc2) +} + +func polyMulStage1() { + MOVQ(Mem{Base: BP}.Offset(0*8), RAX) + MOVQ(RAX, t2) + MULQ(acc0) + MOVQ(RAX, t0) + MOVQ(RDX, t1) + MOVQ(Mem{Base: BP}.Offset(0*8), RAX) + MULQ(acc1) + IMULQ(acc2, t2) + ADDQ(RAX, t1) + ADCQ(RDX, t2) +} + +func polyMulStage2() { + MOVQ(Mem{Base: BP}.Offset(1*8), RAX) + MOVQ(RAX, t3) + MULQ(acc0) + ADDQ(RAX, t1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, acc0) + MOVQ(Mem{Base: BP}.Offset(1*8), RAX) + MULQ(acc1) + ADDQ(RAX, t2) + ADCQ(Imm(0), RDX) +} + +func polyMulStage3() { + IMULQ(acc2, t3) + ADDQ(acc0, t2) + ADCQ(RDX, t3) +} + +func polyMulReduceStage() { + MOVQ(t0, acc0) + MOVQ(t1, acc1) + MOVQ(t2, acc2) + ANDQ(Imm(3), acc2) + MOVQ(t2, t0) + ANDQ(I8(-4), t0) + MOVQ(t3, t1) + SHRQ(Imm(2), t3, t2) + SHRQ(Imm(2), t3) + ADDQ(t0, acc0) + ADCQ(t1, acc1) + ADCQ(Imm(0), acc2) + ADDQ(t2, acc0) + ADCQ(t3, acc1) + ADCQ(Imm(0), acc2) +} + +func polyMulStage1_AVX2() { + MOVQ(Mem{Base: BP}.Offset(0*8), RDX) + MOVQ(RDX, t2) + MULXQ(acc0, t0, t1) + IMULQ(acc2, t2) + MULXQ(acc1, RAX, RDX) + ADDQ(RAX, t1) + ADCQ(RDX, t2) +} + +func polyMulStage2_AVX2() { + MOVQ(Mem{Base: BP}.Offset(1*8), RDX) + MULXQ(acc0, acc0, RAX) + 
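+	// MULX takes its second multiplicand implicitly from RDX (r1 here, loaded just above)
+	// and, unlike MUL, leaves the arithmetic flags untouched, so the ADD/ADC carry chain
+	// below survives being interleaved with the remaining multiply.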
ADDQ(acc0, t1) + MULXQ(acc1, acc1, t3) + ADCQ(acc1, t2) + ADCQ(Imm(0), t3) +} + +func polyMulStage3_AVX2() { + IMULQ(acc2, RDX) + ADDQ(RAX, t2) + ADCQ(RDX, t3) +} + +func polyMul() { + polyMulStage1() + polyMulStage2() + polyMulStage3() + polyMulReduceStage() +} + +func polyMulAVX2() { + polyMulStage1_AVX2() + polyMulStage2_AVX2() + polyMulStage3_AVX2() + polyMulReduceStage() +} + +// ---------------------------------------------------------------------------- +// ---------------------------------------------------------------------------- + +func polyHashADInternal() { + Function("polyHashADInternal<>") + Attributes(NOSPLIT) + AllocLocal(0) + + Comment("Hack: Must declare #define macros inside of a function due to Avo constraints") + defineROL() + defineROL8() + defineROL16() + + // adp points to beginning of additional data + // itr2 holds ad length + XORQ(acc0, acc0) + XORQ(acc1, acc1) + XORQ(acc2, acc2) + CMPQ(itr2, Imm(13)) + JNE(LabelRef("hashADLoop")) + + openFastTLSAD() + hashADLoop() + hashADTail() + hashADTailLoop() + hashADTailFinish() + hashADDone() +} + +// Special treatment for the TLS case of 13 bytes +func openFastTLSAD() { + Label("openFastTLSAD") + MOVQ(Mem{Base: adp}, acc0) + MOVQ(Mem{Base: adp}.Offset(5), acc1) + SHRQ(Imm(24), acc1) + MOVQ(U32(1), acc2) + polyMul() + RET() +} + +// Hash in 16 byte chunks +func hashADLoop() { + Label("hashADLoop") + Comment("Hash in 16 byte chunks") + CMPQ(itr2, Imm(16)) + JB(LabelRef("hashADTail")) + polyAdd(Mem{Base: adp}.Offset(0)) + LEAQ(Mem{Base: adp}.Offset(1*16), adp) + SUBQ(Imm(16), itr2) + polyMul() + JMP(LabelRef("hashADLoop")) +} + +func hashADTail() { + Label("hashADTail") + CMPQ(itr2, Imm(0)) + JE(LabelRef("hashADDone")) + + Comment("Hash last < 16 byte tail") + XORQ(t0, t0) + XORQ(t1, t1) + XORQ(t2, t2) + ADDQ(itr2, adp) +} + +func hashADTailLoop() { + Label("hashADTailLoop") + SHLQ(Imm(8), t0, t1) + SHLQ(Imm(8), t0) + // Hack to get Avo to emit: + // MOVB -1(adp), t2 + Instruction(&ir.Instruction{Opcode: "MOVB", Operands: []Op{Mem{Base: adp}.Offset(-1), t2}}) + XORQ(t2, t0) + DECQ(adp) + DECQ(itr2) + JNE(LabelRef("hashADTailLoop")) +} + +func hashADTailFinish() { + ADDQ(t0, acc0) + ADCQ(t1, acc1) + ADCQ(Imm(1), acc2) + polyMul() +} + +// Finished AD +func hashADDone() { + Label("hashADDone") + RET() +} + +// ---------------------------------------------------------------------------- +// ---------------------------------------------------------------------------- + +// Implements the following function fignature: +// +// func chacha20Poly1305Open(dst []byte, key []uint32, src []byte, ad []byte) bool +func chacha20Poly1305Open() { + Implement("chacha20Poly1305Open") + Attributes(0) + AllocLocal(288) + + Comment("For aligned stack access") + MOVQ(RSP, RBP) + ADDQ(Imm(32), RBP) + ANDQ(I8(-32), RBP) + + Load(Param("dst").Base(), oup) + Load(Param("key").Base(), keyp) + Load(Param("src").Base(), inp) + Load(Param("src").Len(), inl) + Load(Param("ad").Base(), adp) + + Comment("Check for AVX2 support") + CMPB(Mem{Symbol: Symbol{Name: ThatPeskyUnicodeDot + "useAVX2"}, Base: StaticBase}, Imm(1)) + JE(LabelRef("chacha20Poly1305Open_AVX2")) + + Comment("Special optimization, for very short buffers") + CMPQ(inl, Imm(128)) + JBE(LabelRef("openSSE128")) // About 16% faster + + Comment("For long buffers, prepare the poly key first") + chacha20Constants := chacha20Constants_DATA() + MOVOU(chacha20Constants, A0) + MOVOU(Mem{Base: keyp}.Offset(1*16), B0) + MOVOU(Mem{Base: keyp}.Offset(2*16), C0) + MOVOU(Mem{Base: keyp}.Offset(3*16), D0) + 
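+	// B0/C0 now hold the two halves of the 256-bit ChaCha20 key and D0 holds the 32-bit
+	// block counter followed by the 96-bit nonce; the constant row A0 is taken from the
+	// global rather than from the state slice.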
MOVO(D0, T1) + + Comment("Store state on stack for future use") + MOVO(B0, state1Store) + MOVO(C0, state2Store) + MOVO(D0, ctr3Store) + MOVQ(U32(10), itr2) + + openSSEPreparePolyKey() + openSSEMainLoop() + openSSEInternalLoop() + openSSEMainLoopDone() + openSSEFinalize() + + // ---------------------------------------------------------------------------- + // Special optimization for buffers smaller than 129 bytes + openSSE128() + openSSE128InnerCipherLoop() + openSSE128Open() + openSSETail16() + openSSETail16Store() + + // ---------------------------------------------------------------------------- + // Special optimization for the last 64 bytes of ciphertext + openSSETail64() + openSSETail64LoopA() + openSSETail64LoopB() + openSSETail64DecLoop() + openSSETail64DecLoopDone() + + // ---------------------------------------------------------------------------- + // Special optimization for the last 128 bytes of ciphertext + openSSETail128() + openSSETail128LoopA() + openSSETail128LoopB() + + // ---------------------------------------------------------------------------- + // Special optimization for the last 192 bytes of ciphertext + openSSETail192() + openSSLTail192LoopA() + openSSLTail192LoopB() + openSSLTail192Store() + + // ---------------------------------------------------------------------------- + // Special optimization for the last 256 bytes of ciphertext + openSSETail256() + openSSETail256Loop() + openSSETail256HashLoop() + + // ---------------------------------------------------------------------------- + // ------------------------- AVX2 Code ---------------------------------------- + chacha20Poly1305Open_AVX2() + openAVX2PreparePolyKey() + openAVX2InitialHash64() + openAVX2MainLoop() + openAVX2InternalLoop() + openAVX2MainLoopDone() + + // ---------------------------------------------------------------------------- + // Special optimization for buffers smaller than 193 bytes + openAVX2192() + openAVX2192InnerCipherLoop() + openAVX2ShortOpen() + openAVX2ShortOpenLoop() + openAVX2ShortTail32() + openAVX2ShortDone() + + // ---------------------------------------------------------------------------- + // Special optimization for buffers smaller than 321 bytes + openAVX2320() + openAVX2320InnerCipherLoop() + + // ---------------------------------------------------------------------------- + // Special optimization for the last 128 bytes of ciphertext + openAVX2Tail128() + openAVX2Tail128LoopA() + openAVX2Tail128LoopB() + openAVX2TailLoop() + openAVX2Tail() + openAVX2TailDone() + + // ---------------------------------------------------------------------------- + // Special optimization for the last 256 bytes of ciphertext + openAVX2Tail256() + openAVX2Tail256LoopA() + openAVX2Tail256LoopB() + openAVX2Tail256Hash() + openAVX2Tail256HashEnd() + + // ---------------------------------------------------------------------------- + // Special optimization for the last 384 bytes of ciphertext + openAVX2Tail384() + openAVX2Tail384LoopB() + openAVX2Tail384LoopA() + openAVX2Tail384Hash() + openAVX2Tail384HashEnd() + + // ---------------------------------------------------------------------------- + // Special optimization for the last 512 bytes of ciphertext + openAVX2Tail512() + openAVX2Tail512LoopB() + openAVX2Tail512LoopA() + openAVX2Tail512HashLoop() + openAVX2Tail512HashEnd() +} + +func openSSEPreparePolyKey() { + Label("openSSEPreparePolyKey") + chachaQR(A0, B0, C0, D0, T0) + shiftB0Left() + shiftC0Left() + shiftD0Left() + chachaQR(A0, B0, C0, D0, T0) + shiftB0Right() + shiftC0Right() + 
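+	// (the Left shifts above move rows B-D onto the diagonals for the second quarter
+	// round; the Right shifts here rotate them back into column order)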
shiftD0Right() + DECQ(itr2) + JNE(LabelRef("openSSEPreparePolyKey")) + + Comment("A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded") + chacha20Constants := chacha20Constants_DATA() + PADDL(chacha20Constants, A0) + PADDL(state1Store, B0) + + Comment("Clamp and store the key") + polyClampMask := polyClampMask_DATA() + PAND(polyClampMask, A0) + MOVO(A0, rStore) + MOVO(B0, sStore) + + Comment("Hash AAD") + Load(Param("ad").Len(), itr2) + CALL(LabelRef("polyHashADInternal<>(SB)")) +} + +func openSSEMainLoop() { + Label("openSSEMainLoop") + CMPQ(inl, U32(256)) + JB(LabelRef("openSSEMainLoopDone")) + + chacha20Constants := chacha20Constants_DATA() + sseIncMask := sseIncMask_DATA() + + Comment("Load state, increment counter blocks") + MOVO(chacha20Constants, A0) + MOVO(state1Store, B0) + MOVO(state2Store, C0) + MOVO(ctr3Store, D0) + PADDL(sseIncMask, D0) + MOVO(A0, A1) + MOVO(B0, B1) + MOVO(C0, C1) + MOVO(D0, D1) + PADDL(sseIncMask, D1) + MOVO(A1, A2) + MOVO(B1, B2) + MOVO(C1, C2) + MOVO(D1, D2) + PADDL(sseIncMask, D2) + MOVO(A2, A3) + MOVO(B2, B3) + MOVO(C2, C3) + MOVO(D2, D3) + PADDL(sseIncMask, D3) + + Comment("Store counters") + MOVO(D0, ctr0Store) + MOVO(D1, ctr1Store) + MOVO(D2, ctr2Store) + MOVO(D3, ctr3Store) + + Comment("There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash") + Comment("2 blocks, and for the remaining 4 only 1 block - for a total of 16") + MOVQ(U32(4), itr1) + MOVQ(inp, itr2) +} + +func openSSEInternalLoop() { + Label("openSSEInternalLoop") + MOVO(C3, tmpStore) + chachaQR(A0, B0, C0, D0, C3) + chachaQR(A1, B1, C1, D1, C3) + chachaQR(A2, B2, C2, D2, C3) + MOVO(tmpStore, C3) + MOVO(C1, tmpStore) + chachaQR(A3, B3, C3, D3, C1) + MOVO(tmpStore, C1) + polyAdd(Mem{Base: itr2}.Offset(0)) + shiftB0Left() + shiftB1Left() + shiftB2Left() + shiftB3Left() + shiftC0Left() + shiftC1Left() + shiftC2Left() + shiftC3Left() + shiftD0Left() + shiftD1Left() + shiftD2Left() + shiftD3Left() + polyMulStage1() + polyMulStage2() + LEAQ(Mem{Base: itr2}.Offset(2*8), itr2) + MOVO(C3, tmpStore) + chachaQR(A0, B0, C0, D0, C3) + chachaQR(A1, B1, C1, D1, C3) + chachaQR(A2, B2, C2, D2, C3) + MOVO(tmpStore, C3) + MOVO(C1, tmpStore) + polyMulStage3() + chachaQR(A3, B3, C3, D3, C1) + MOVO(tmpStore, C1) + polyMulReduceStage() + shiftB0Right() + shiftB1Right() + shiftB2Right() + shiftB3Right() + shiftC0Right() + shiftC1Right() + shiftC2Right() + shiftC3Right() + shiftD0Right() + shiftD1Right() + shiftD2Right() + shiftD3Right() + DECQ(itr1) + JGE(LabelRef("openSSEInternalLoop")) + + polyAdd(Mem{Base: itr2}.Offset(0)) + polyMul() + LEAQ(Mem{Base: itr2}.Offset(2*8), itr2) + + CMPQ(itr1, I8(-6)) + JG(LabelRef("openSSEInternalLoop")) + + chacha20Constants := chacha20Constants_DATA() + Comment("Add in the state") + PADDD(chacha20Constants, A0) + PADDD(chacha20Constants, A1) + PADDD(chacha20Constants, A2) + PADDD(chacha20Constants, A3) + PADDD(state1Store, B0) + PADDD(state1Store, B1) + PADDD(state1Store, B2) + PADDD(state1Store, B3) + PADDD(state2Store, C0) + PADDD(state2Store, C1) + PADDD(state2Store, C2) + PADDD(state2Store, C3) + PADDD(ctr0Store, D0) + PADDD(ctr1Store, D1) + PADDD(ctr2Store, D2) + PADDD(ctr3Store, D3) + + Comment("Load - xor - store") + MOVO(D3, tmpStore) + MOVOU(Mem{Base: inp}.Offset(0*16), D3) + PXOR(D3, A0) + MOVOU(A0, Mem{Base: oup}.Offset(0*16)) + MOVOU(Mem{Base: inp}.Offset(1*16), D3) + PXOR(D3, B0) + MOVOU(B0, Mem{Base: oup}.Offset(1*16)) + MOVOU(Mem{Base: inp}.Offset(2*16), D3) + PXOR(D3, C0) + MOVOU(C0, Mem{Base: oup}.Offset(2*16)) + MOVOU(Mem{Base: 
inp}.Offset(3*16), D3) + PXOR(D3, D0) + MOVOU(D0, Mem{Base: oup}.Offset(3*16)) + MOVOU(Mem{Base: inp}.Offset(4*16), D0) + PXOR(D0, A1) + MOVOU(A1, Mem{Base: oup}.Offset(4*16)) + MOVOU(Mem{Base: inp}.Offset(5*16), D0) + PXOR(D0, B1) + MOVOU(B1, Mem{Base: oup}.Offset(5*16)) + MOVOU(Mem{Base: inp}.Offset(6*16), D0) + PXOR(D0, C1) + MOVOU(C1, Mem{Base: oup}.Offset(6*16)) + MOVOU(Mem{Base: inp}.Offset(7*16), D0) + PXOR(D0, D1) + MOVOU(D1, Mem{Base: oup}.Offset(7*16)) + MOVOU(Mem{Base: inp}.Offset(8*16), D0) + PXOR(D0, A2) + MOVOU(A2, Mem{Base: oup}.Offset(8*16)) + MOVOU(Mem{Base: inp}.Offset(9*16), D0) + PXOR(D0, B2) + MOVOU(B2, Mem{Base: oup}.Offset(9*16)) + MOVOU(Mem{Base: inp}.Offset(10*16), D0) + PXOR(D0, C2) + MOVOU(C2, Mem{Base: oup}.Offset(10*16)) + MOVOU(Mem{Base: inp}.Offset(11*16), D0) + PXOR(D0, D2) + MOVOU(D2, Mem{Base: oup}.Offset(11*16)) + MOVOU(Mem{Base: inp}.Offset(12*16), D0) + PXOR(D0, A3) + MOVOU(A3, Mem{Base: oup}.Offset(12*16)) + MOVOU(Mem{Base: inp}.Offset(13*16), D0) + PXOR(D0, B3) + MOVOU(B3, Mem{Base: oup}.Offset(13*16)) + MOVOU(Mem{Base: inp}.Offset(14*16), D0) + PXOR(D0, C3) + MOVOU(C3, Mem{Base: oup}.Offset(14*16)) + MOVOU(Mem{Base: inp}.Offset(15*16), D0) + PXOR(tmpStore, D0) + MOVOU(D0, Mem{Base: oup}.Offset(15*16)) + LEAQ(Mem{Base: inp}.Offset(256), inp) + LEAQ(Mem{Base: oup}.Offset(256), oup) + SUBQ(U32(256), inl) + JMP(LabelRef("openSSEMainLoop")) +} + +func openSSEMainLoopDone() { + Label("openSSEMainLoopDone") + Comment("Handle the various tail sizes efficiently") + TESTQ(inl, inl) + JE(LabelRef("openSSEFinalize")) + CMPQ(inl, Imm(64)) + JBE(LabelRef("openSSETail64")) + CMPQ(inl, Imm(128)) + JBE(LabelRef("openSSETail128")) + CMPQ(inl, Imm(192)) + JBE(LabelRef("openSSETail192")) + JMP(LabelRef("openSSETail256")) +} + +func openSSEFinalize() { + Label("openSSEFinalize") + Comment("Hash in the PT, AAD lengths") + ADDQ(NewParamAddr("ad_len", 80), acc0) + ADCQ(NewParamAddr("src_len", 56), acc1) + ADCQ(Imm(1), acc2) + polyMul() + + Comment("Final reduce") + MOVQ(acc0, t0) + MOVQ(acc1, t1) + MOVQ(acc2, t2) + SUBQ(I8(-5), acc0) + SBBQ(I8(-1), acc1) + SBBQ(Imm(3), acc2) + CMOVQCS(t0, acc0) + CMOVQCS(t1, acc1) + CMOVQCS(t2, acc2) + + Comment("Add in the \"s\" part of the key") + ADDQ(sStore.Offset(0), acc0) + ADCQ(sStore.Offset(8), acc1) + + Comment("Finally, constant time compare to the tag at the end of the message") + XORQ(RAX, RAX) + MOVQ(U32(1), RDX) + XORQ(Mem{Base: inp}.Offset(0*8), acc0) + XORQ(Mem{Base: inp}.Offset(1*8), acc1) + ORQ(acc1, acc0) + CMOVQEQ(RDX, RAX) + + Comment("Return true iff tags are equal") + // Hack to get Avo to emit: + // MOVB AX, ret+96(FP) + Instruction(&ir.Instruction{Opcode: "MOVB", Operands: []Op{AX, NewParamAddr("ret", 96)}}) + RET() +} + +// ---------------------------------------------------------------------------- +// Special optimization for buffers smaller than 129 bytes + +// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks +func openSSE128() { + Label("openSSE128") + + chacha20Constants := chacha20Constants_DATA() + sseIncMask := sseIncMask_DATA() + + MOVOU(chacha20Constants, A0) + MOVOU(Mem{Base: keyp}.Offset(1*16), B0) + MOVOU(Mem{Base: keyp}.Offset(2*16), C0) + MOVOU(Mem{Base: keyp}.Offset(3*16), D0) + MOVO(A0, A1) + MOVO(B0, B1) + MOVO(C0, C1) + MOVO(D0, D1) + PADDL(sseIncMask, D1) + MOVO(A1, A2) + MOVO(B1, B2) + MOVO(C1, C2) + MOVO(D1, D2) + PADDL(sseIncMask, D2) + MOVO(B0, T1) + MOVO(C0, T2) + MOVO(D1, T3) + MOVQ(U32(10), itr2) +} + +func openSSE128InnerCipherLoop() { + 
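+	// Each pass is one ChaCha20 double round (a column round, a rotate onto the diagonals,
+	// a diagonal round, and a rotate back) over three independent blocks; itr2 counts 10
+	// passes for the full 20 rounds.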
Label("openSSE128InnerCipherLoop") + chachaQR(A0, B0, C0, D0, T0) + chachaQR(A1, B1, C1, D1, T0) + chachaQR(A2, B2, C2, D2, T0) + shiftB0Left() + shiftB1Left() + shiftB2Left() + shiftC0Left() + shiftC1Left() + shiftC2Left() + shiftD0Left() + shiftD1Left() + shiftD2Left() + chachaQR(A0, B0, C0, D0, T0) + chachaQR(A1, B1, C1, D1, T0) + chachaQR(A2, B2, C2, D2, T0) + shiftB0Right() + shiftB1Right() + shiftB2Right() + shiftC0Right() + shiftC1Right() + shiftC2Right() + shiftD0Right() + shiftD1Right() + shiftD2Right() + DECQ(itr2) + JNE(LabelRef("openSSE128InnerCipherLoop")) + + Comment("A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded") + + chacha20Constants := chacha20Constants_DATA() + PADDL(chacha20Constants, A0) + PADDL(chacha20Constants, A1) + PADDL(chacha20Constants, A2) + PADDL(T1, B0) + PADDL(T1, B1) + PADDL(T1, B2) + PADDL(T2, C1) + PADDL(T2, C2) + PADDL(T3, D1) + sseIncMask := sseIncMask_DATA() + PADDL(sseIncMask, T3) + PADDL(T3, D2) + + Comment("Clamp and store the key") + polyClampMask := polyClampMask_DATA() + PAND(polyClampMask, A0) + MOVOU(A0, rStore) + MOVOU(B0, sStore) + + Comment("Hash") + Load(Param("ad").Len(), itr2) + CALL(LabelRef("polyHashADInternal<>(SB)")) +} + +func openSSE128Open() { + Label("openSSE128Open") + CMPQ(inl, Imm(16)) + JB(LabelRef("openSSETail16")) + SUBQ(Imm(16), inl) + + Comment("Load for hashing") + polyAdd(Mem{Base: inp}.Offset(0)) + + Comment("Load for decryption") + MOVOU(Mem{Base: inp}, T0) + PXOR(T0, A1) + MOVOU(A1, Mem{Base: oup}) + LEAQ(Mem{Base: inp}.Offset(1*16), inp) + LEAQ(Mem{Base: oup}.Offset(1*16), oup) + polyMul() + + Comment("Shift the stream \"left\"") + MOVO(B1, A1) + MOVO(C1, B1) + MOVO(D1, C1) + MOVO(A2, D1) + MOVO(B2, A2) + MOVO(C2, B2) + MOVO(D2, C2) + JMP(LabelRef("openSSE128Open")) +} + +func openSSETail16() { + Label("openSSETail16") + TESTQ(inl, inl) + JE(LabelRef("openSSEFinalize")) + + Comment("We can safely load the CT from the end, because it is padded with the MAC") + MOVQ(inl, itr2) + SHLQ(Imm(4), itr2) + andMask := andMask_DATA() + LEAQ(andMask, t0) + MOVOU(Mem{Base: inp}, T0) + ADDQ(inl, inp) + PAND(Mem{Base: t0, Index: itr2, Scale: 1}.Offset(-16), T0) + MOVO(T0, tmpStore.Offset(0)) + MOVQ(T0, t0) + MOVQ(tmpStore.Offset(8), t1) + PXOR(A1, T0) +} + +func openSSETail16Store() { + Comment("We can only store one byte at a time, since plaintext can be shorter than 16 bytes") + Label("openSSETail16Store") + MOVQ(T0, t3) + // Hack to get Avo to emit: + // MOVB t3, (oup) + Instruction(&ir.Instruction{Opcode: "MOVB", Operands: []Op{t3, Mem{Base: oup}}}) + PSRLDQ(Imm(1), T0) + INCQ(oup) + DECQ(inl) + JNE(LabelRef("openSSETail16Store")) + ADDQ(t0, acc0) + ADCQ(t1, acc1) + ADCQ(Imm(1), acc2) + polyMul() + JMP(LabelRef("openSSEFinalize")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for the last 64 bytes of ciphertext + +// Need to decrypt up to 64 bytes - prepare single block +func openSSETail64() { + Label("openSSETail64") + chacha20Constants := chacha20Constants_DATA() + MOVO(chacha20Constants, A0) + MOVO(state1Store, B0) + MOVO(state2Store, C0) + MOVO(ctr3Store, D0) + sseIncMask := sseIncMask_DATA() + PADDL(sseIncMask, D0) + MOVO(D0, ctr0Store) + XORQ(itr2, itr2) + MOVQ(inl, itr1) + CMPQ(itr1, Imm(16)) + JB(LabelRef("openSSETail64LoopB")) +} + +// Perform ChaCha rounds, while hashing the remaining input +func openSSETail64LoopA() { + Label("openSSETail64LoopA") + polyAdd(Mem{Base: inp, Index: itr2, Scale: 1}.Offset(0)) + polyMul() + SUBQ(Imm(16), itr1) 
+} + +func openSSETail64LoopB() { + Label("openSSETail64LoopB") + ADDQ(Imm(16), itr2) + chachaQR(A0, B0, C0, D0, T0) + shiftB0Left() + shiftC0Left() + shiftD0Left() + chachaQR(A0, B0, C0, D0, T0) + shiftB0Right() + shiftC0Right() + shiftD0Right() + + CMPQ(itr1, Imm(16)) + JAE(LabelRef("openSSETail64LoopA")) + + CMPQ(itr2, Imm(160)) + JNE(LabelRef("openSSETail64LoopB")) + + chacha20Constants := chacha20Constants_DATA() + PADDL(chacha20Constants, A0) + PADDL(state1Store, B0) + PADDL(state2Store, C0) + PADDL(ctr0Store, D0) +} + +func openSSETail64DecLoop() { + Label("openSSETail64DecLoop") + CMPQ(inl, Imm(16)) + JB(LabelRef("openSSETail64DecLoopDone")) + SUBQ(Imm(16), inl) + MOVOU(Mem{Base: inp}, T0) + PXOR(T0, A0) + MOVOU(A0, Mem{Base: oup}) + LEAQ(Mem{Base: inp}.Offset(16), inp) + LEAQ(Mem{Base: oup}.Offset(16), oup) + MOVO(B0, A0) + MOVO(C0, B0) + MOVO(D0, C0) + JMP(LabelRef("openSSETail64DecLoop")) +} + +func openSSETail64DecLoopDone() { + Label("openSSETail64DecLoopDone") + MOVO(A0, A1) + JMP(LabelRef("openSSETail16")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for the last 128 bytes of ciphertext + +// Need to decrypt up to 128 bytes - prepare two blocks +func openSSETail128() { + Label("openSSETail128") + chacha20Constants := chacha20Constants_DATA() + MOVO(chacha20Constants, A1) + MOVO(state1Store, B1) + MOVO(state2Store, C1) + MOVO(ctr3Store, D1) + sseIncMask := sseIncMask_DATA() + PADDL(sseIncMask, D1) + MOVO(D1, ctr0Store) + MOVO(A1, A0) + MOVO(B1, B0) + MOVO(C1, C0) + MOVO(D1, D0) + PADDL(sseIncMask, D0) + MOVO(D0, ctr1Store) + XORQ(itr2, itr2) + MOVQ(inl, itr1) + ANDQ(I8(-16), itr1) +} + +// Perform ChaCha rounds, while hashing the remaining input +func openSSETail128LoopA() { + Label("openSSETail128LoopA") + polyAdd(Mem{Base: inp, Index: itr2, Scale: 1}.Offset(0)) + polyMul() +} + +func openSSETail128LoopB() { + Label("openSSETail128LoopB") + ADDQ(Imm(16), itr2) + chachaQR(A0, B0, C0, D0, T0) + chachaQR(A1, B1, C1, D1, T0) + shiftB0Left() + shiftC0Left() + shiftD0Left() + shiftB1Left() + shiftC1Left() + shiftD1Left() + chachaQR(A0, B0, C0, D0, T0) + chachaQR(A1, B1, C1, D1, T0) + shiftB0Right() + shiftC0Right() + shiftD0Right() + shiftB1Right() + shiftC1Right() + shiftD1Right() + + CMPQ(itr2, itr1) + JB(LabelRef("openSSETail128LoopA")) + + CMPQ(itr2, Imm(160)) + JNE(LabelRef("openSSETail128LoopB")) + + chacha20Constants := chacha20Constants_DATA() + PADDL(chacha20Constants, A0) + PADDL(chacha20Constants, A1) + PADDL(state1Store, B0) + PADDL(state1Store, B1) + PADDL(state2Store, C0) + PADDL(state2Store, C1) + PADDL(ctr1Store, D0) + PADDL(ctr0Store, D1) + + MOVOU(Mem{Base: inp}.Offset(0*16), T0) + MOVOU(Mem{Base: inp}.Offset(1*16), T1) + MOVOU(Mem{Base: inp}.Offset(2*16), T2) + MOVOU(Mem{Base: inp}.Offset(3*16), T3) + PXOR(T0, A1) + PXOR(T1, B1) + PXOR(T2, C1) + PXOR(T3, D1) + MOVOU(A1, Mem{Base: oup}.Offset(0*16)) + MOVOU(B1, Mem{Base: oup}.Offset(1*16)) + MOVOU(C1, Mem{Base: oup}.Offset(2*16)) + MOVOU(D1, Mem{Base: oup}.Offset(3*16)) + + SUBQ(Imm(64), inl) + LEAQ(Mem{Base: inp}.Offset(64), inp) + LEAQ(Mem{Base: oup}.Offset(64), oup) + JMP(LabelRef("openSSETail64DecLoop")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for the last 192 bytes of ciphertext + +// Need to decrypt up to 192 bytes - prepare three blocks +func openSSETail192() { + Label("openSSETail192") + chacha20Constants := chacha20Constants_DATA() + MOVO(chacha20Constants, A2) + 
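+	// The three blocks are generated so that A2/B2/C2/D2 carries the lowest counter and
+	// therefore keys the first 64 bytes of the remaining ciphertext.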
MOVO(state1Store, B2) + MOVO(state2Store, C2) + MOVO(ctr3Store, D2) + sseIncMask := sseIncMask_DATA() + PADDL(sseIncMask, D2) + MOVO(D2, ctr0Store) + MOVO(A2, A1) + MOVO(B2, B1) + MOVO(C2, C1) + MOVO(D2, D1) + PADDL(sseIncMask, D1) + MOVO(D1, ctr1Store) + MOVO(A1, A0) + MOVO(B1, B0) + MOVO(C1, C0) + MOVO(D1, D0) + PADDL(sseIncMask, D0) + MOVO(D0, ctr2Store) + + MOVQ(inl, itr1) + MOVQ(U32(160), itr2) + CMPQ(itr1, Imm(160)) + CMOVQGT(itr2, itr1) + ANDQ(I8(-16), itr1) + XORQ(itr2, itr2) +} + +// Perform ChaCha rounds, while hashing the remaining input +func openSSLTail192LoopA() { + Label("openSSLTail192LoopA") + polyAdd(Mem{Base: inp, Index: itr2, Scale: 1}.Offset(0)) + polyMul() +} + +func openSSLTail192LoopB() { + Label("openSSLTail192LoopB") + ADDQ(Imm(16), itr2) + chachaQR(A0, B0, C0, D0, T0) + chachaQR(A1, B1, C1, D1, T0) + chachaQR(A2, B2, C2, D2, T0) + shiftB0Left() + shiftC0Left() + shiftD0Left() + shiftB1Left() + shiftC1Left() + shiftD1Left() + shiftB2Left() + shiftC2Left() + shiftD2Left() + + chachaQR(A0, B0, C0, D0, T0) + chachaQR(A1, B1, C1, D1, T0) + chachaQR(A2, B2, C2, D2, T0) + shiftB0Right() + shiftC0Right() + shiftD0Right() + shiftB1Right() + shiftC1Right() + shiftD1Right() + shiftB2Right() + shiftC2Right() + shiftD2Right() + + CMPQ(itr2, itr1) + JB(LabelRef("openSSLTail192LoopA")) + + CMPQ(itr2, Imm(160)) + JNE(LabelRef("openSSLTail192LoopB")) + + CMPQ(inl, Imm(176)) + JB(LabelRef("openSSLTail192Store")) + + polyAdd(Mem{Base: inp}.Offset(160)) + polyMul() + + CMPQ(inl, Imm(192)) + JB(LabelRef("openSSLTail192Store")) + + polyAdd(Mem{Base: inp}.Offset(176)) + polyMul() +} + +func openSSLTail192Store() { + Label("openSSLTail192Store") + chacha20Constants := chacha20Constants_DATA() + PADDL(chacha20Constants, A0) + PADDL(chacha20Constants, A1) + PADDL(chacha20Constants, A2) + PADDL(state1Store, B0) + PADDL(state1Store, B1) + PADDL(state1Store, B2) + PADDL(state2Store, C0) + PADDL(state2Store, C1) + PADDL(state2Store, C2) + PADDL(ctr2Store, D0) + PADDL(ctr1Store, D1) + PADDL(ctr0Store, D2) + + MOVOU(Mem{Base: inp}.Offset(0*16), T0) + MOVOU(Mem{Base: inp}.Offset(1*16), T1) + MOVOU(Mem{Base: inp}.Offset(2*16), T2) + MOVOU(Mem{Base: inp}.Offset(3*16), T3) + PXOR(T0, A2) + PXOR(T1, B2) + PXOR(T2, C2) + PXOR(T3, D2) + MOVOU(A2, Mem{Base: oup}.Offset(0*16)) + MOVOU(B2, Mem{Base: oup}.Offset(1*16)) + MOVOU(C2, Mem{Base: oup}.Offset(2*16)) + MOVOU(D2, Mem{Base: oup}.Offset(3*16)) + + MOVOU(Mem{Base: inp}.Offset(4*16), T0) + MOVOU(Mem{Base: inp}.Offset(5*16), T1) + MOVOU(Mem{Base: inp}.Offset(6*16), T2) + MOVOU(Mem{Base: inp}.Offset(7*16), T3) + PXOR(T0, A1) + PXOR(T1, B1) + PXOR(T2, C1) + PXOR(T3, D1) + MOVOU(A1, Mem{Base: oup}.Offset(4*16)) + MOVOU(B1, Mem{Base: oup}.Offset(5*16)) + MOVOU(C1, Mem{Base: oup}.Offset(6*16)) + MOVOU(D1, Mem{Base: oup}.Offset(7*16)) + + SUBQ(Imm(128), inl) + LEAQ(Mem{Base: inp}.Offset(128), inp) + LEAQ(Mem{Base: oup}.Offset(128), oup) + JMP(LabelRef("openSSETail64DecLoop")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for the last 256 bytes of ciphertext + +// Need to decrypt up to 256 bytes - prepare four blocks +func openSSETail256() { + Label("openSSETail256") + chacha20Constants := chacha20Constants_DATA() + MOVO(chacha20Constants, A0) + MOVO(state1Store, B0) + MOVO(state2Store, C0) + MOVO(ctr3Store, D0) + sseIncMask := sseIncMask_DATA() + PADDL(sseIncMask, D0) + MOVO(A0, A1) + MOVO(B0, B1) + MOVO(C0, C1) + MOVO(D0, D1) + PADDL(sseIncMask, D1) + MOVO(A1, A2) + MOVO(B1, B2) + MOVO(C1, C2) 
+ MOVO(D1, D2) + PADDL(sseIncMask, D2) + MOVO(A2, A3) + MOVO(B2, B3) + MOVO(C2, C3) + MOVO(D2, D3) + PADDL(sseIncMask, D3) + + Comment("Store counters") + MOVO(D0, ctr0Store) + MOVO(D1, ctr1Store) + MOVO(D2, ctr2Store) + MOVO(D3, ctr3Store) + XORQ(itr2, itr2) +} + +// This loop inteleaves 8 ChaCha quarter rounds with 1 poly multiplication +func openSSETail256Loop() { + Label("openSSETail256Loop") + polyAdd(Mem{Base: inp, Index: itr2, Scale: 1}.Offset(0)) + MOVO(C3, tmpStore) + chachaQR(A0, B0, C0, D0, C3) + chachaQR(A1, B1, C1, D1, C3) + chachaQR(A2, B2, C2, D2, C3) + MOVO(tmpStore, C3) + MOVO(C1, tmpStore) + chachaQR(A3, B3, C3, D3, C1) + MOVO(tmpStore, C1) + shiftB0Left() + shiftB1Left() + shiftB2Left() + shiftB3Left() + shiftC0Left() + shiftC1Left() + shiftC2Left() + shiftC3Left() + shiftD0Left() + shiftD1Left() + shiftD2Left() + shiftD3Left() + polyMulStage1() + polyMulStage2() + MOVO(C3, tmpStore) + chachaQR(A0, B0, C0, D0, C3) + chachaQR(A1, B1, C1, D1, C3) + chachaQR(A2, B2, C2, D2, C3) + MOVO(tmpStore, C3) + MOVO(C1, tmpStore) + chachaQR(A3, B3, C3, D3, C1) + MOVO(tmpStore, C1) + polyMulStage3() + polyMulReduceStage() + shiftB0Right() + shiftB1Right() + shiftB2Right() + shiftB3Right() + shiftC0Right() + shiftC1Right() + shiftC2Right() + shiftC3Right() + shiftD0Right() + shiftD1Right() + shiftD2Right() + shiftD3Right() + ADDQ(Imm(2*8), itr2) + CMPQ(itr2, Imm(160)) + JB(LabelRef("openSSETail256Loop")) + MOVQ(inl, itr1) + ANDQ(I8(-16), itr1) +} + +func openSSETail256HashLoop() { + Label("openSSETail256HashLoop") + polyAdd(Mem{Base: inp, Index: itr2, Scale: 1}.Offset(0)) + polyMul() + ADDQ(Imm(2*8), itr2) + CMPQ(itr2, itr1) + JB(LabelRef("openSSETail256HashLoop")) + + Comment("Add in the state") + chacha20Constants := chacha20Constants_DATA() + PADDD(chacha20Constants, A0) + PADDD(chacha20Constants, A1) + PADDD(chacha20Constants, A2) + PADDD(chacha20Constants, A3) + PADDD(state1Store, B0) + PADDD(state1Store, B1) + PADDD(state1Store, B2) + PADDD(state1Store, B3) + PADDD(state2Store, C0) + PADDD(state2Store, C1) + PADDD(state2Store, C2) + PADDD(state2Store, C3) + PADDD(ctr0Store, D0) + PADDD(ctr1Store, D1) + PADDD(ctr2Store, D2) + PADDD(ctr3Store, D3) + MOVO(D3, tmpStore) + + Comment("Load - xor - store") + MOVOU(Mem{Base: inp}.Offset(0*16), D3) + PXOR(D3, A0) + MOVOU(Mem{Base: inp}.Offset(1*16), D3) + PXOR(D3, B0) + MOVOU(Mem{Base: inp}.Offset(2*16), D3) + PXOR(D3, C0) + MOVOU(Mem{Base: inp}.Offset(3*16), D3) + PXOR(D3, D0) + MOVOU(A0, Mem{Base: oup}.Offset(0*16)) + MOVOU(B0, Mem{Base: oup}.Offset(1*16)) + MOVOU(C0, Mem{Base: oup}.Offset(2*16)) + MOVOU(D0, Mem{Base: oup}.Offset(3*16)) + MOVOU(Mem{Base: inp}.Offset(4*16), A0) + MOVOU(Mem{Base: inp}.Offset(5*16), B0) + MOVOU(Mem{Base: inp}.Offset(6*16), C0) + MOVOU(Mem{Base: inp}.Offset(7*16), D0) + PXOR(A0, A1) + PXOR(B0, B1) + PXOR(C0, C1) + PXOR(D0, D1) + MOVOU(A1, Mem{Base: oup}.Offset(4*16)) + MOVOU(B1, Mem{Base: oup}.Offset(5*16)) + MOVOU(C1, Mem{Base: oup}.Offset(6*16)) + MOVOU(D1, Mem{Base: oup}.Offset(7*16)) + MOVOU(Mem{Base: inp}.Offset(8*16), A0) + MOVOU(Mem{Base: inp}.Offset(9*16), B0) + MOVOU(Mem{Base: inp}.Offset(10*16), C0) + MOVOU(Mem{Base: inp}.Offset(11*16), D0) + PXOR(A0, A2) + PXOR(B0, B2) + PXOR(C0, C2) + PXOR(D0, D2) + MOVOU(A2, Mem{Base: oup}.Offset(8*16)) + MOVOU(B2, Mem{Base: oup}.Offset(9*16)) + MOVOU(C2, Mem{Base: oup}.Offset(10*16)) + MOVOU(D2, Mem{Base: oup}.Offset(11*16)) + LEAQ(Mem{Base: inp}.Offset(192), inp) + LEAQ(Mem{Base: oup}.Offset(192), oup) + SUBQ(Imm(192), inl) + MOVO(A3, A0) + MOVO(B3, B0) + 
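+	// Leave the fourth (highest-counter) block in A0..D0 so the generic 64-byte tail
+	// loop can decrypt the remaining (at most 64) bytes.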
MOVO(C3, C0) + MOVO(tmpStore, D0) + + JMP(LabelRef("openSSETail64DecLoop")) +} + +// Functions to emit AVX instructions via BYTE directive + +// broadcasti128 16(r8), ymm14 +func VBROADCASTI128_16_R8_YMM14() { + BYTE(U8(0xc4)) + BYTE(U8(0x42)) + BYTE(U8(0x7d)) + BYTE(U8(0x5a)) + BYTE(U8(0x70)) + BYTE(U8(0x10)) +} + +// broadcasti128 32(r8), ymm12 +func VBROADCASTI128_32_R8_YMM12() { + BYTE(U8(0xc4)) + BYTE(U8(0x42)) + BYTE(U8(0x7d)) + BYTE(U8(0x5a)) + BYTE(U8(0x60)) + BYTE(U8(0x20)) +} + +// broadcasti128 48(r8), ymm4 +func VBROADCASTI128_48_R8_YMM4() { + BYTE(U8(0xc4)) + BYTE(U8(0xc2)) + BYTE(U8(0x7d)) + BYTE(U8(0x5a)) + BYTE(U8(0x60)) + BYTE(U8(0x30)) +} + +// ---------------------------------------------------------------------------- +// ------------------------- AVX2 Code ---------------------------------------- + +func chacha20Poly1305Open_AVX2() { + Label("chacha20Poly1305Open_AVX2") + VZEROUPPER() + chacha20Constants := chacha20Constants_DATA() + VMOVDQU(chacha20Constants, AA0) + VBROADCASTI128_16_R8_YMM14() + VBROADCASTI128_32_R8_YMM12() + VBROADCASTI128_48_R8_YMM4() + avx2InitMask := avx2InitMask_DATA() + VPADDD(avx2InitMask, DD0, DD0) + + Comment("Special optimization, for very short buffers") + CMPQ(inl, Imm(192)) + JBE(LabelRef("openAVX2192")) + CMPQ(inl, U32(320)) + JBE(LabelRef("openAVX2320")) + + Comment("For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream") + VMOVDQA(BB0, state1StoreAVX2) + VMOVDQA(CC0, state2StoreAVX2) + VMOVDQA(DD0, ctr3StoreAVX2) + MOVQ(U32(10), itr2) +} + +func openAVX2PreparePolyKey() { + Label("openAVX2PreparePolyKey") + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(12), DD0, DD0, DD0) + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(4), DD0, DD0, DD0) + DECQ(itr2) + JNE(LabelRef("openAVX2PreparePolyKey")) + + chacha20Constants := chacha20Constants_DATA() + VPADDD(chacha20Constants, AA0, AA0) + VPADDD(state1StoreAVX2, BB0, BB0) + VPADDD(state2StoreAVX2, CC0, CC0) + VPADDD(ctr3StoreAVX2, DD0, DD0) + + VPERM2I128(Imm(0x02), AA0, BB0, TT0) + + Comment("Clamp and store poly key") + polyClampMask := polyClampMask_DATA() + VPAND(polyClampMask, TT0, TT0) + VMOVDQA(TT0, rsStoreAVX2) + + Comment("Stream for the first 64 bytes") + VPERM2I128(Imm(0x13), AA0, BB0, AA0) + VPERM2I128(Imm(0x13), CC0, DD0, BB0) + + Comment("Hash AD + first 64 bytes") + // MOVQ ad_len+80(FP), itr2 + MOVQ(NewParamAddr("ad_len", 80), itr2) + CALL(LabelRef("polyHashADInternal<>(SB)")) + XORQ(itr1, itr1) +} + +func openAVX2InitialHash64() { + Label("openAVX2InitialHash64") + // polyAdd(0(inp)(itr1*1)) + polyAdd(Mem{Base: inp, Index: itr1, Scale: 1}.Offset(0)) + polyMulAVX2() + ADDQ(Imm(16), itr1) + CMPQ(itr1, Imm(64)) + JNE(LabelRef("openAVX2InitialHash64")) + + Comment("Decrypt the first 64 bytes") + VPXOR(Mem{Base: inp}.Offset(0*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(1*32), BB0, BB0) + VMOVDQU(AA0, Mem{Base: oup}.Offset(0*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(1*32)) + LEAQ(Mem{Base: inp}.Offset(2*32), inp) + LEAQ(Mem{Base: oup}.Offset(2*32), oup) + SUBQ(Imm(64), inl) +} + +func openAVX2MainLoop() { + Label("openAVX2MainLoop") + CMPQ(inl, U32(512)) + JB(LabelRef("openAVX2MainLoopDone")) + + Comment("Load state, increment counter blocks, store the incremented counters") + chacha20Constants := chacha20Constants_DATA() + VMOVDQU(chacha20Constants, AA0) + VMOVDQA(AA0, AA1) + VMOVDQA(AA0, 
AA2) + VMOVDQA(AA0, AA3) + VMOVDQA(state1StoreAVX2, BB0) + VMOVDQA(BB0, BB1) + VMOVDQA(BB0, BB2) + VMOVDQA(BB0, BB3) + VMOVDQA(state2StoreAVX2, CC0) + VMOVDQA(CC0, CC1) + VMOVDQA(CC0, CC2) + VMOVDQA(CC0, CC3) + VMOVDQA(ctr3StoreAVX2, DD0) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD0, DD0) + VPADDD(avx2IncMask, DD0, DD1) + VPADDD(avx2IncMask, DD1, DD2) + VPADDD(avx2IncMask, DD2, DD3) + VMOVDQA(DD0, ctr0StoreAVX2) + VMOVDQA(DD1, ctr1StoreAVX2) + VMOVDQA(DD2, ctr2StoreAVX2) + VMOVDQA(DD3, ctr3StoreAVX2) + XORQ(itr1, itr1) +} + +// Lets just say this spaghetti loop interleaves 2 quarter rounds with 3 poly multiplications +// Effectively per 512 bytes of stream we hash 480 bytes of ciphertext +func openAVX2InternalLoop() { + Label("openAVX2InternalLoop") + polyAdd(Mem{Base: inp, Index: itr1, Scale: 1}.Offset(0 * 8)) + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + polyMulStage1_AVX2() + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + rol16 := rol16_DATA() + VPSHUFB(rol16, DD0, DD0) + VPSHUFB(rol16, DD1, DD1) + VPSHUFB(rol16, DD2, DD2) + VPSHUFB(rol16, DD3, DD3) + polyMulStage2_AVX2() + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + polyMulStage3_AVX2() + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(12), BB0, CC3) + VPSRLD(Imm(20), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(12), BB1, CC3) + VPSRLD(Imm(20), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(12), BB2, CC3) + VPSRLD(Imm(20), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(12), BB3, CC3) + VPSRLD(Imm(20), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + polyMulReduceStage() + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + rol8 := rol8_DATA() + VPSHUFB(rol8, DD0, DD0) + VPSHUFB(rol8, DD1, DD1) + VPSHUFB(rol8, DD2, DD2) + VPSHUFB(rol8, DD3, DD3) + polyAdd(Mem{Base: inp, Index: itr1, Scale: 1}.Offset(2 * 8)) + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + polyMulStage1_AVX2() + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(7), BB0, CC3) + VPSRLD(Imm(25), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(7), BB1, CC3) + VPSRLD(Imm(25), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(7), BB2, CC3) + VPSRLD(Imm(25), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(7), BB3, CC3) + VPSRLD(Imm(25), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + polyMulStage2_AVX2() + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(4), BB1, BB1, BB1) + VPALIGNR(Imm(4), BB2, BB2, BB2) + VPALIGNR(Imm(4), BB3, BB3, BB3) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(8), CC3, CC3, CC3) + VPALIGNR(Imm(12), DD0, DD0, DD0) + VPALIGNR(Imm(12), DD1, DD1, DD1) + VPALIGNR(Imm(12), DD2, DD2, DD2) + VPALIGNR(Imm(12), DD3, DD3, DD3) + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + polyMulStage3_AVX2() + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + VPSHUFB(rol16, DD0, DD0) + VPSHUFB(rol16, DD1, DD1) + VPSHUFB(rol16, DD2, DD2) + VPSHUFB(rol16, DD3, DD3) + polyMulReduceStage() + 
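+	// Each pass of this loop advances all four 2-block states by one double round while
+	// hashing three 16-byte blocks (48 bytes); itr1 stops at 480, i.e. 10 passes, which
+	// is why only 480 of the 512 keystream bytes are hashed inside the loop and the
+	// remaining 32 are hashed just after it.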
VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + polyAdd(Mem{Base: inp, Index: itr1, Scale: 1}.Offset(4 * 8)) + LEAQ(Mem{Base: itr1}.Offset(6*8), itr1) + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(12), BB0, CC3) + VPSRLD(Imm(20), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(12), BB1, CC3) + VPSRLD(Imm(20), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(12), BB2, CC3) + VPSRLD(Imm(20), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(12), BB3, CC3) + VPSRLD(Imm(20), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + polyMulStage1_AVX2() + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + polyMulStage2_AVX2() + VPSHUFB(rol8, DD0, DD0) + VPSHUFB(rol8, DD1, DD1) + VPSHUFB(rol8, DD2, DD2) + VPSHUFB(rol8, DD3, DD3) + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + polyMulStage3_AVX2() + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(7), BB0, CC3) + VPSRLD(Imm(25), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(7), BB1, CC3) + VPSRLD(Imm(25), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(7), BB2, CC3) + VPSRLD(Imm(25), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(7), BB3, CC3) + VPSRLD(Imm(25), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + polyMulReduceStage() + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(12), BB1, BB1, BB1) + VPALIGNR(Imm(12), BB2, BB2, BB2) + VPALIGNR(Imm(12), BB3, BB3, BB3) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(8), CC3, CC3, CC3) + VPALIGNR(Imm(4), DD0, DD0, DD0) + VPALIGNR(Imm(4), DD1, DD1, DD1) + VPALIGNR(Imm(4), DD2, DD2, DD2) + VPALIGNR(Imm(4), DD3, DD3, DD3) + CMPQ(itr1, U32(480)) + JNE(LabelRef("openAVX2InternalLoop")) + + chacha20Constants := chacha20Constants_DATA() + VPADDD(chacha20Constants, AA0, AA0) + VPADDD(chacha20Constants, AA1, AA1) + VPADDD(chacha20Constants, AA2, AA2) + VPADDD(chacha20Constants, AA3, AA3) + VPADDD(state1StoreAVX2, BB0, BB0) + VPADDD(state1StoreAVX2, BB1, BB1) + VPADDD(state1StoreAVX2, BB2, BB2) + VPADDD(state1StoreAVX2, BB3, BB3) + VPADDD(state2StoreAVX2, CC0, CC0) + VPADDD(state2StoreAVX2, CC1, CC1) + VPADDD(state2StoreAVX2, CC2, CC2) + VPADDD(state2StoreAVX2, CC3, CC3) + VPADDD(ctr0StoreAVX2, DD0, DD0) + VPADDD(ctr1StoreAVX2, DD1, DD1) + VPADDD(ctr2StoreAVX2, DD2, DD2) + VPADDD(ctr3StoreAVX2, DD3, DD3) + VMOVDQA(CC3, tmpStoreAVX2) + + Comment("We only hashed 480 of the 512 bytes available - hash the remaining 32 here") + polyAdd(Mem{Base: inp}.Offset(480)) + polyMulAVX2() + VPERM2I128(Imm(0x02), AA0, BB0, CC3) + VPERM2I128(Imm(0x13), AA0, BB0, BB0) + VPERM2I128(Imm(0x02), CC0, DD0, AA0) + VPERM2I128(Imm(0x13), CC0, DD0, CC0) + VPXOR(Mem{Base: inp}.Offset(0*32), CC3, CC3) + VPXOR(Mem{Base: inp}.Offset(1*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(2*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(3*32), CC0, CC0) + VMOVDQU(CC3, Mem{Base: oup}.Offset(0*32)) + VMOVDQU(AA0, Mem{Base: oup}.Offset(1*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(2*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(3*32)) + VPERM2I128(Imm(0x02), AA1, BB1, AA0) + VPERM2I128(Imm(0x02), CC1, DD1, BB0) + VPERM2I128(Imm(0x13), AA1, BB1, CC0) + VPERM2I128(Imm(0x13), CC1, DD1, 
DD0) + VPXOR(Mem{Base: inp}.Offset(4*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(5*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(6*32), CC0, CC0) + VPXOR(Mem{Base: inp}.Offset(7*32), DD0, DD0) + VMOVDQU(AA0, Mem{Base: oup}.Offset(4*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(5*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(6*32)) + VMOVDQU(DD0, Mem{Base: oup}.Offset(7*32)) + + Comment("and here") + polyAdd(Mem{Base: inp}.Offset(496)) + polyMulAVX2() + VPERM2I128(Imm(0x02), AA2, BB2, AA0) + VPERM2I128(Imm(0x02), CC2, DD2, BB0) + VPERM2I128(Imm(0x13), AA2, BB2, CC0) + VPERM2I128(Imm(0x13), CC2, DD2, DD0) + VPXOR(Mem{Base: inp}.Offset(8*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(9*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(10*32), CC0, CC0) + VPXOR(Mem{Base: inp}.Offset(11*32), DD0, DD0) + VMOVDQU(AA0, Mem{Base: oup}.Offset(8*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(9*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(10*32)) + VMOVDQU(DD0, Mem{Base: oup}.Offset(11*32)) + VPERM2I128(Imm(0x02), AA3, BB3, AA0) + VPERM2I128(Imm(0x02), tmpStoreAVX2, DD3, BB0) + VPERM2I128(Imm(0x13), AA3, BB3, CC0) + VPERM2I128(Imm(0x13), tmpStoreAVX2, DD3, DD0) + VPXOR(Mem{Base: inp}.Offset(12*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(13*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(14*32), CC0, CC0) + VPXOR(Mem{Base: inp}.Offset(15*32), DD0, DD0) + VMOVDQU(AA0, Mem{Base: oup}.Offset(12*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(13*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(14*32)) + VMOVDQU(DD0, Mem{Base: oup}.Offset(15*32)) + LEAQ(Mem{Base: inp}.Offset(32*16), inp) + LEAQ(Mem{Base: oup}.Offset(32*16), oup) + SUBQ(U32(32*16), inl) + JMP(LabelRef("openAVX2MainLoop")) +} + +// Handle the various tail sizes efficiently +func openAVX2MainLoopDone() { + Label("openAVX2MainLoopDone") + Comment("Handle the various tail sizes efficiently") + TESTQ(inl, inl) + JE(LabelRef("openSSEFinalize")) + CMPQ(inl, Imm(128)) + JBE(LabelRef("openAVX2Tail128")) + CMPQ(inl, U32(256)) + JBE(LabelRef("openAVX2Tail256")) + CMPQ(inl, U32(384)) + JBE(LabelRef("openAVX2Tail384")) + JMP(LabelRef("openAVX2Tail512")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for buffers smaller than 193 bytes + +// For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks +func openAVX2192() { + Label("openAVX2192") + VMOVDQA(AA0, AA1) + VMOVDQA(BB0, BB1) + VMOVDQA(CC0, CC1) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD0, DD1) + VMOVDQA(AA0, AA2) + VMOVDQA(BB0, BB2) + VMOVDQA(CC0, CC2) + VMOVDQA(DD0, DD2) + VMOVDQA(DD1, TT3) + MOVQ(U32(10), itr2) +} + +func openAVX2192InnerCipherLoop() { + Label("openAVX2192InnerCipherLoop") + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(4), BB1, BB1, BB1) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(12), DD0, DD0, DD0) + VPALIGNR(Imm(12), DD1, DD1, DD1) + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(12), BB1, BB1, BB1) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(4), DD0, DD0, DD0) + VPALIGNR(Imm(4), DD1, DD1, DD1) + DECQ(itr2) + JNE(LabelRef("openAVX2192InnerCipherLoop")) + VPADDD(AA2, AA0, AA0) + VPADDD(AA2, AA1, AA1) + VPADDD(BB2, BB0, BB0) + VPADDD(BB2, BB1, BB1) + VPADDD(CC2, CC0, CC0) + VPADDD(CC2, CC1, CC1) + VPADDD(DD2, DD0, DD0) + VPADDD(TT3, DD1, DD1) + VPERM2I128(Imm(0x02), 
AA0, BB0, TT0) + + Comment("Clamp and store poly key") + polyClampMask := polyClampMask_DATA() + VPAND(polyClampMask, TT0, TT0) + VMOVDQA(TT0, rsStoreAVX2) + + Comment("Stream for up to 192 bytes") + VPERM2I128(Imm(0x13), AA0, BB0, AA0) + VPERM2I128(Imm(0x13), CC0, DD0, BB0) + VPERM2I128(Imm(0x02), AA1, BB1, CC0) + VPERM2I128(Imm(0x02), CC1, DD1, DD0) + VPERM2I128(Imm(0x13), AA1, BB1, AA1) + VPERM2I128(Imm(0x13), CC1, DD1, BB1) +} + +func openAVX2ShortOpen() { + Label("openAVX2ShortOpen") + Comment("Hash") + Load(Param("ad").Len(), itr2) + CALL(LabelRef("polyHashADInternal<>(SB)")) +} + +func openAVX2ShortOpenLoop() { + Label("openAVX2ShortOpenLoop") + CMPQ(inl, Imm(32)) + JB(LabelRef("openAVX2ShortTail32")) + SUBQ(Imm(32), inl) + + Comment("Load for hashing") + polyAdd(Mem{Base: inp}.Offset(0 * 8)) + polyMulAVX2() + polyAdd(Mem{Base: inp}.Offset(2 * 8)) + polyMulAVX2() + + Comment("Load for decryption") + VPXOR(Mem{Base: inp}, AA0, AA0) + VMOVDQU(AA0, Mem{Base: oup}) + LEAQ(Mem{Base: inp}.Offset(1*32), inp) + LEAQ(Mem{Base: oup}.Offset(1*32), oup) + + Comment("Shift stream left") + VMOVDQA(BB0, AA0) + VMOVDQA(CC0, BB0) + VMOVDQA(DD0, CC0) + VMOVDQA(AA1, DD0) + VMOVDQA(BB1, AA1) + VMOVDQA(CC1, BB1) + VMOVDQA(DD1, CC1) + VMOVDQA(AA2, DD1) + VMOVDQA(BB2, AA2) + JMP(LabelRef("openAVX2ShortOpenLoop")) +} + +func openAVX2ShortTail32() { + Label("openAVX2ShortTail32") + CMPQ(inl, Imm(16)) + VMOVDQA(A0, A1) + JB(LabelRef("openAVX2ShortDone")) + + SUBQ(Imm(16), inl) + + Comment("Load for hashing") + polyAdd(Mem{Base: inp}.Offset(0 * 8)) + polyMulAVX2() + + Comment("Load for decryption") + VPXOR(Mem{Base: inp}, A0, T0) + VMOVDQU(T0, Mem{Base: oup}) + LEAQ(Mem{Base: inp}.Offset(1*16), inp) + LEAQ(Mem{Base: oup}.Offset(1*16), oup) + VPERM2I128(Imm(0x11), AA0, AA0, AA0) + VMOVDQA(A0, A1) +} + +func openAVX2ShortDone() { + Label("openAVX2ShortDone") + VZEROUPPER() + JMP(LabelRef("openSSETail16")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for buffers smaller than 321 bytes + +// For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks +func openAVX2320() { + Label("openAVX2320") + VMOVDQA(AA0, AA1) + VMOVDQA(BB0, BB1) + VMOVDQA(CC0, CC1) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD0, DD1) + VMOVDQA(AA0, AA2) + VMOVDQA(BB0, BB2) + VMOVDQA(CC0, CC2) + VPADDD(avx2IncMask, DD1, DD2) + VMOVDQA(BB0, TT1) + VMOVDQA(CC0, TT2) + VMOVDQA(DD0, TT3) + MOVQ(U32(10), itr2) +} + +func openAVX2320InnerCipherLoop() { + Label("openAVX2320InnerCipherLoop") + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(4), BB1, BB1, BB1) + VPALIGNR(Imm(4), BB2, BB2, BB2) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(12), DD0, DD0, DD0) + VPALIGNR(Imm(12), DD1, DD1, DD1) + VPALIGNR(Imm(12), DD2, DD2, DD2) + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(12), BB1, BB1, BB1) + VPALIGNR(Imm(12), BB2, BB2, BB2) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(4), DD0, DD0, DD0) + VPALIGNR(Imm(4), DD1, DD1, DD1) + VPALIGNR(Imm(4), DD2, DD2, DD2) + DECQ(itr2) + JNE(LabelRef("openAVX2320InnerCipherLoop")) + + chacha20Constants := 
chacha20Constants_DATA() + VMOVDQA(chacha20Constants, TT0) + VPADDD(TT0, AA0, AA0) + VPADDD(TT0, AA1, AA1) + VPADDD(TT0, AA2, AA2) + VPADDD(TT1, BB0, BB0) + VPADDD(TT1, BB1, BB1) + VPADDD(TT1, BB2, BB2) + VPADDD(TT2, CC0, CC0) + VPADDD(TT2, CC1, CC1) + VPADDD(TT2, CC2, CC2) + avx2IncMask := avx2IncMask_DATA() + VMOVDQA(avx2IncMask, TT0) + VPADDD(TT3, DD0, DD0) + VPADDD(TT0, TT3, TT3) + VPADDD(TT3, DD1, DD1) + VPADDD(TT0, TT3, TT3) + VPADDD(TT3, DD2, DD2) + + Comment("Clamp and store poly key") + VPERM2I128(Imm(0x02), AA0, BB0, TT0) + polyClampMask := polyClampMask_DATA() + VPAND(polyClampMask, TT0, TT0) + VMOVDQA(TT0, rsStoreAVX2) + + Comment("Stream for up to 320 bytes") + VPERM2I128(Imm(0x13), AA0, BB0, AA0) + VPERM2I128(Imm(0x13), CC0, DD0, BB0) + VPERM2I128(Imm(0x02), AA1, BB1, CC0) + VPERM2I128(Imm(0x02), CC1, DD1, DD0) + VPERM2I128(Imm(0x13), AA1, BB1, AA1) + VPERM2I128(Imm(0x13), CC1, DD1, BB1) + VPERM2I128(Imm(0x02), AA2, BB2, CC1) + VPERM2I128(Imm(0x02), CC2, DD2, DD1) + VPERM2I128(Imm(0x13), AA2, BB2, AA2) + VPERM2I128(Imm(0x13), CC2, DD2, BB2) + JMP(LabelRef("openAVX2ShortOpen")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for the last 128 bytes of ciphertext + +// Need to decrypt up to 128 bytes - prepare two blocks +func openAVX2Tail128() { + Label("openAVX2Tail128") + Comment("Need to decrypt up to 128 bytes - prepare two blocks") + chacha20Constants := chacha20Constants_DATA() + VMOVDQA(chacha20Constants, AA1) + VMOVDQA(state1StoreAVX2, BB1) + VMOVDQA(state2StoreAVX2, CC1) + VMOVDQA(ctr3StoreAVX2, DD1) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD1, DD1) + VMOVDQA(DD1, DD0) + + XORQ(itr2, itr2) + MOVQ(inl, itr1) + ANDQ(I8(-16), itr1) + TESTQ(itr1, itr1) + JE(LabelRef("openAVX2Tail128LoopB")) +} + +// Perform ChaCha rounds, while hashing the remaining input +func openAVX2Tail128LoopA() { + Label("openAVX2Tail128LoopA") + polyAdd(Mem{Base: inp, Index: itr2, Scale: 1}.Offset(0)) + polyMulAVX2() +} + +func openAVX2Tail128LoopB() { + Label("openAVX2Tail128LoopB") + ADDQ(Imm(16), itr2) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR(Imm(4), BB1, BB1, BB1) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(12), DD1, DD1, DD1) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR(Imm(12), BB1, BB1, BB1) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(4), DD1, DD1, DD1) + CMPQ(itr2, itr1) + JB(LabelRef("openAVX2Tail128LoopA")) + CMPQ(itr2, Imm(160)) + JNE(LabelRef("openAVX2Tail128LoopB")) + + chacha20Constants := chacha20Constants_DATA() + VPADDD(chacha20Constants, AA1, AA1) + VPADDD(state1StoreAVX2, BB1, BB1) + VPADDD(state2StoreAVX2, CC1, CC1) + VPADDD(DD0, DD1, DD1) + VPERM2I128(Imm(0x02), AA1, BB1, AA0) + VPERM2I128(Imm(0x02), CC1, DD1, BB0) + VPERM2I128(Imm(0x13), AA1, BB1, CC0) + VPERM2I128(Imm(0x13), CC1, DD1, DD0) +} + +func openAVX2TailLoop() { + Label("openAVX2TailLoop") + CMPQ(inl, Imm(32)) + JB(LabelRef("openAVX2Tail")) + SUBQ(Imm(32), inl) + + Comment("Load for decryption") + VPXOR(Mem{Base: inp}, AA0, AA0) + VMOVDQU(AA0, Mem{Base: oup}) + LEAQ(Mem{Base: inp}.Offset(1*32), inp) + LEAQ(Mem{Base: oup}.Offset(1*32), oup) + VMOVDQA(BB0, AA0) + VMOVDQA(CC0, BB0) + VMOVDQA(DD0, CC0) + JMP(LabelRef("openAVX2TailLoop")) +} + +func openAVX2Tail() { + Label("openAVX2Tail") + CMPQ(inl, Imm(16)) + VMOVDQA(A0, A1) + JB(LabelRef("openAVX2TailDone")) + SUBQ(Imm(16), inl) + + Comment("Load for decryption") + VPXOR(Mem{Base: inp}, A0, T0) + VMOVDQU(T0, Mem{Base: oup}) + LEAQ(Mem{Base: inp}.Offset(1*16), 
inp) + LEAQ(Mem{Base: oup}.Offset(1*16), oup) + VPERM2I128(Imm(0x11), AA0, AA0, AA0) + VMOVDQA(A0, A1) +} + +func openAVX2TailDone() { + Label("openAVX2TailDone") + VZEROUPPER() + JMP(LabelRef("openSSETail16")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for the last 256 bytes of ciphertext + +// Need to decrypt up to 256 bytes - prepare four blocks +func openAVX2Tail256() { + Label("openAVX2Tail256") + chacha20Constants := chacha20Constants_DATA() + VMOVDQA(chacha20Constants, AA0) + VMOVDQA(AA0, AA1) + VMOVDQA(state1StoreAVX2, BB0) + VMOVDQA(BB0, BB1) + VMOVDQA(state2StoreAVX2, CC0) + VMOVDQA(CC0, CC1) + VMOVDQA(ctr3StoreAVX2, DD0) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD0, DD0) + VPADDD(avx2IncMask, DD0, DD1) + VMOVDQA(DD0, TT1) + VMOVDQA(DD1, TT2) + + Comment("Compute the number of iterations that will hash data") + MOVQ(inl, tmpStoreAVX2) + MOVQ(inl, itr1) + SUBQ(Imm(128), itr1) + SHRQ(Imm(4), itr1) + MOVQ(U32(10), itr2) + CMPQ(itr1, Imm(10)) + CMOVQGT(itr2, itr1) + MOVQ(inp, inl) + XORQ(itr2, itr2) +} + +func openAVX2Tail256LoopA() { + Label("openAVX2Tail256LoopA") + polyAdd(Mem{Base: inl}.Offset(0)) + polyMulAVX2() + LEAQ(Mem{Base: inl}.Offset(16), inl) +} + +// Perform ChaCha rounds, while hashing the remaining input +func openAVX2Tail256LoopB() { + Label("openAVX2Tail256LoopB") + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(4), BB1, BB1, BB1) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(12), DD0, DD0, DD0) + VPALIGNR(Imm(12), DD1, DD1, DD1) + INCQ(itr2) + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(12), BB1, BB1, BB1) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(4), DD0, DD0, DD0) + VPALIGNR(Imm(4), DD1, DD1, DD1) + CMPQ(itr2, itr1) + JB(LabelRef("openAVX2Tail256LoopA")) + + CMPQ(itr2, Imm(10)) + JNE(LabelRef("openAVX2Tail256LoopB")) + + MOVQ(inl, itr2) + SUBQ(inp, inl) + MOVQ(inl, itr1) + MOVQ(tmpStoreAVX2, inl) +} + +// Hash the remainder of data (if any) +func openAVX2Tail256Hash() { + Label("openAVX2Tail256Hash") + ADDQ(Imm(16), itr1) + CMPQ(itr1, inl) + JGT(LabelRef("openAVX2Tail256HashEnd")) + polyAdd(Mem{Base: itr2}.Offset(0)) + polyMulAVX2() + LEAQ(Mem{Base: itr2}.Offset(16), itr2) + JMP(LabelRef("openAVX2Tail256Hash")) +} + +// Store 128 bytes safely, then go to store loop +func openAVX2Tail256HashEnd() { + Label("openAVX2Tail256HashEnd") + chacha20Constants := chacha20Constants_DATA() + VPADDD(chacha20Constants, AA0, AA0) + VPADDD(chacha20Constants, AA1, AA1) + VPADDD(state1StoreAVX2, BB0, BB0) + VPADDD(state1StoreAVX2, BB1, BB1) + VPADDD(state2StoreAVX2, CC0, CC0) + VPADDD(state2StoreAVX2, CC1, CC1) + VPADDD(TT1, DD0, DD0) + VPADDD(TT2, DD1, DD1) + VPERM2I128(Imm(0x02), AA0, BB0, AA2) + VPERM2I128(Imm(0x02), CC0, DD0, BB2) + VPERM2I128(Imm(0x13), AA0, BB0, CC2) + VPERM2I128(Imm(0x13), CC0, DD0, DD2) + VPERM2I128(Imm(0x02), AA1, BB1, AA0) + VPERM2I128(Imm(0x02), CC1, DD1, BB0) + VPERM2I128(Imm(0x13), AA1, BB1, CC0) + VPERM2I128(Imm(0x13), CC1, DD1, DD0) + + VPXOR(Mem{Base: inp}.Offset(0*32), AA2, AA2) + VPXOR(Mem{Base: inp}.Offset(1*32), BB2, BB2) + VPXOR(Mem{Base: inp}.Offset(2*32), CC2, CC2) + VPXOR(Mem{Base: inp}.Offset(3*32), DD2, DD2) + VMOVDQU(AA2, Mem{Base: oup}.Offset(0*32)) + VMOVDQU(BB2, Mem{Base: oup}.Offset(1*32)) + VMOVDQU(CC2, 
Mem{Base: oup}.Offset(2*32)) + VMOVDQU(DD2, Mem{Base: oup}.Offset(3*32)) + LEAQ(Mem{Base: inp}.Offset(4*32), inp) + LEAQ(Mem{Base: oup}.Offset(4*32), oup) + SUBQ(Imm(4*32), inl) + + JMP(LabelRef("openAVX2TailLoop")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for the last 384 bytes of ciphertext + +// Need to decrypt up to 384 bytes - prepare six blocks +func openAVX2Tail384() { + Label("openAVX2Tail384") + Comment("Need to decrypt up to 384 bytes - prepare six blocks") + chacha20Constants := chacha20Constants_DATA() + VMOVDQA(chacha20Constants, AA0) + VMOVDQA(AA0, AA1) + VMOVDQA(AA0, AA2) + VMOVDQA(state1StoreAVX2, BB0) + VMOVDQA(BB0, BB1) + VMOVDQA(BB0, BB2) + VMOVDQA(state2StoreAVX2, CC0) + VMOVDQA(CC0, CC1) + VMOVDQA(CC0, CC2) + VMOVDQA(ctr3StoreAVX2, DD0) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD0, DD0) + VPADDD(avx2IncMask, DD0, DD1) + VPADDD(avx2IncMask, DD1, DD2) + VMOVDQA(DD0, ctr0StoreAVX2) + VMOVDQA(DD1, ctr1StoreAVX2) + VMOVDQA(DD2, ctr2StoreAVX2) + + Comment("Compute the number of iterations that will hash two blocks of data") + MOVQ(inl, tmpStoreAVX2) + MOVQ(inl, itr1) + SUBQ(U32(256), itr1) + SHRQ(Imm(4), itr1) + ADDQ(Imm(6), itr1) + MOVQ(U32(10), itr2) + CMPQ(itr1, Imm(10)) + CMOVQGT(itr2, itr1) + MOVQ(inp, inl) + XORQ(itr2, itr2) +} + +// Perform ChaCha rounds, while hashing the remaining input +func openAVX2Tail384LoopB() { + Label("openAVX2Tail384LoopB") + polyAdd(Mem{Base: inl}.Offset(0)) + polyMulAVX2() + LEAQ(Mem{Base: inl}.Offset(16), inl) +} + +func openAVX2Tail384LoopA() { + Label("openAVX2Tail384LoopA") + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(4), BB1, BB1, BB1) + VPALIGNR(Imm(4), BB2, BB2, BB2) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(12), DD0, DD0, DD0) + VPALIGNR(Imm(12), DD1, DD1, DD1) + VPALIGNR(Imm(12), DD2, DD2, DD2) + polyAdd(Mem{Base: inl}.Offset(0)) + polyMulAVX2() + LEAQ(Mem{Base: inl}.Offset(16), inl) + INCQ(itr2) + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(12), BB1, BB1, BB1) + VPALIGNR(Imm(12), BB2, BB2, BB2) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(4), DD0, DD0, DD0) + VPALIGNR(Imm(4), DD1, DD1, DD1) + VPALIGNR(Imm(4), DD2, DD2, DD2) + + CMPQ(itr2, itr1) + JB(LabelRef("openAVX2Tail384LoopB")) + + CMPQ(itr2, Imm(10)) + JNE(LabelRef("openAVX2Tail384LoopA")) + + MOVQ(inl, itr2) + SUBQ(inp, inl) + MOVQ(inl, itr1) + MOVQ(tmpStoreAVX2, inl) +} + +func openAVX2Tail384Hash() { + Label("openAVX2Tail384Hash") + ADDQ(Imm(16), itr1) + CMPQ(itr1, inl) + JGT(LabelRef("openAVX2Tail384HashEnd")) + polyAdd(Mem{Base: itr2}.Offset(0)) + polyMulAVX2() + LEAQ(Mem{Base: itr2}.Offset(16), itr2) + JMP(LabelRef("openAVX2Tail384Hash")) +} + +// Store 256 bytes safely, then go to store loop +func openAVX2Tail384HashEnd() { + Label("openAVX2Tail384HashEnd") + chacha20Constants := chacha20Constants_DATA() + VPADDD(chacha20Constants, AA0, AA0) + VPADDD(chacha20Constants, AA1, AA1) + VPADDD(chacha20Constants, AA2, AA2) + VPADDD(state1StoreAVX2, BB0, BB0) + VPADDD(state1StoreAVX2, BB1, BB1) + VPADDD(state1StoreAVX2, BB2, BB2) + VPADDD(state2StoreAVX2, CC0, CC0) + VPADDD(state2StoreAVX2, CC1, 
CC1) + VPADDD(state2StoreAVX2, CC2, CC2) + VPADDD(ctr0StoreAVX2, DD0, DD0) + VPADDD(ctr1StoreAVX2, DD1, DD1) + VPADDD(ctr2StoreAVX2, DD2, DD2) + VPERM2I128(Imm(0x02), AA0, BB0, TT0) + VPERM2I128(Imm(0x02), CC0, DD0, TT1) + VPERM2I128(Imm(0x13), AA0, BB0, TT2) + VPERM2I128(Imm(0x13), CC0, DD0, TT3) + VPXOR(Mem{Base: inp}.Offset(0*32), TT0, TT0) + VPXOR(Mem{Base: inp}.Offset(1*32), TT1, TT1) + VPXOR(Mem{Base: inp}.Offset(2*32), TT2, TT2) + VPXOR(Mem{Base: inp}.Offset(3*32), TT3, TT3) + VMOVDQU(TT0, Mem{Base: oup}.Offset(0*32)) + VMOVDQU(TT1, Mem{Base: oup}.Offset(1*32)) + VMOVDQU(TT2, Mem{Base: oup}.Offset(2*32)) + VMOVDQU(TT3, Mem{Base: oup}.Offset(3*32)) + VPERM2I128(Imm(0x02), AA1, BB1, TT0) + VPERM2I128(Imm(0x02), CC1, DD1, TT1) + VPERM2I128(Imm(0x13), AA1, BB1, TT2) + VPERM2I128(Imm(0x13), CC1, DD1, TT3) + VPXOR(Mem{Base: inp}.Offset(4*32), TT0, TT0) + VPXOR(Mem{Base: inp}.Offset(5*32), TT1, TT1) + VPXOR(Mem{Base: inp}.Offset(6*32), TT2, TT2) + VPXOR(Mem{Base: inp}.Offset(7*32), TT3, TT3) + VMOVDQU(TT0, Mem{Base: oup}.Offset(4*32)) + VMOVDQU(TT1, Mem{Base: oup}.Offset(5*32)) + VMOVDQU(TT2, Mem{Base: oup}.Offset(6*32)) + VMOVDQU(TT3, Mem{Base: oup}.Offset(7*32)) + VPERM2I128(Imm(0x02), AA2, BB2, AA0) + VPERM2I128(Imm(0x02), CC2, DD2, BB0) + VPERM2I128(Imm(0x13), AA2, BB2, CC0) + VPERM2I128(Imm(0x13), CC2, DD2, DD0) + LEAQ(Mem{Base: inp}.Offset(8*32), inp) + LEAQ(Mem{Base: oup}.Offset(8*32), oup) + SUBQ(U32(8*32), inl) + JMP(LabelRef("openAVX2TailLoop")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for the last 512 bytes of ciphertext + +func openAVX2Tail512() { + Label("openAVX2Tail512") + chacha20Constants := chacha20Constants_DATA() + VMOVDQU(chacha20Constants, AA0) + VMOVDQA(AA0, AA1) + VMOVDQA(AA0, AA2) + VMOVDQA(AA0, AA3) + VMOVDQA(state1StoreAVX2, BB0) + VMOVDQA(BB0, BB1) + VMOVDQA(BB0, BB2) + VMOVDQA(BB0, BB3) + VMOVDQA(state2StoreAVX2, CC0) + VMOVDQA(CC0, CC1) + VMOVDQA(CC0, CC2) + VMOVDQA(CC0, CC3) + VMOVDQA(ctr3StoreAVX2, DD0) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD0, DD0) + VPADDD(avx2IncMask, DD0, DD1) + VPADDD(avx2IncMask, DD1, DD2) + VPADDD(avx2IncMask, DD2, DD3) + VMOVDQA(DD0, ctr0StoreAVX2) + VMOVDQA(DD1, ctr1StoreAVX2) + VMOVDQA(DD2, ctr2StoreAVX2) + VMOVDQA(DD3, ctr3StoreAVX2) + XORQ(itr1, itr1) + MOVQ(inp, itr2) +} + +func openAVX2Tail512LoopB() { + Label("openAVX2Tail512LoopB") + polyAdd(Mem{Base: itr2}.Offset(0)) + polyMulAVX2() + LEAQ(Mem{Base: itr2}.Offset(2*8), itr2) +} + +func openAVX2Tail512LoopA() { + Label("openAVX2Tail512LoopA") + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + rol16 := rol16_DATA() + VPSHUFB(rol16, DD0, DD0) + VPSHUFB(rol16, DD1, DD1) + VPSHUFB(rol16, DD2, DD2) + VPSHUFB(rol16, DD3, DD3) + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(12), BB0, CC3) + VPSRLD(Imm(20), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(12), BB1, CC3) + VPSRLD(Imm(20), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(12), BB2, CC3) + VPSRLD(Imm(20), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(12), BB3, CC3) + VPSRLD(Imm(20), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + polyAdd(Mem{Base: itr2}.Offset(0 * 8)) + polyMulAVX2() + VPADDD(BB0, AA0, AA0) 
+ VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + rol8 := rol8_DATA() + VPSHUFB(rol8, DD0, DD0) + VPSHUFB(rol8, DD1, DD1) + VPSHUFB(rol8, DD2, DD2) + VPSHUFB(rol8, DD3, DD3) + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(7), BB0, CC3) + VPSRLD(Imm(25), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(7), BB1, CC3) + VPSRLD(Imm(25), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(7), BB2, CC3) + VPSRLD(Imm(25), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(7), BB3, CC3) + VPSRLD(Imm(25), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(4), BB1, BB1, BB1) + VPALIGNR(Imm(4), BB2, BB2, BB2) + VPALIGNR(Imm(4), BB3, BB3, BB3) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(8), CC3, CC3, CC3) + VPALIGNR(Imm(12), DD0, DD0, DD0) + VPALIGNR(Imm(12), DD1, DD1, DD1) + VPALIGNR(Imm(12), DD2, DD2, DD2) + VPALIGNR(Imm(12), DD3, DD3, DD3) + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + VPSHUFB(rol16, DD0, DD0) + VPSHUFB(rol16, DD1, DD1) + VPSHUFB(rol16, DD2, DD2) + VPSHUFB(rol16, DD3, DD3) + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + polyAdd(Mem{Base: itr2}.Offset(2 * 8)) + polyMulAVX2() + LEAQ(Mem{Base: itr2}.Offset(4*8), itr2) + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(12), BB0, CC3) + VPSRLD(Imm(20), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(12), BB1, CC3) + VPSRLD(Imm(20), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(12), BB2, CC3) + VPSRLD(Imm(20), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(12), BB3, CC3) + VPSRLD(Imm(20), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + VPSHUFB(rol8, DD0, DD0) + VPSHUFB(rol8, DD1, DD1) + VPSHUFB(rol8, DD2, DD2) + VPSHUFB(rol8, DD3, DD3) + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(7), BB0, CC3) + VPSRLD(Imm(25), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(7), BB1, CC3) + VPSRLD(Imm(25), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(7), BB2, CC3) + VPSRLD(Imm(25), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(7), BB3, CC3) + VPSRLD(Imm(25), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(12), BB1, BB1, BB1) + VPALIGNR(Imm(12), BB2, BB2, BB2) + VPALIGNR(Imm(12), BB3, BB3, BB3) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(8), CC3, CC3, CC3) + VPALIGNR(Imm(4), DD0, DD0, DD0) + VPALIGNR(Imm(4), DD1, DD1, DD1) + VPALIGNR(Imm(4), DD2, DD2, DD2) + VPALIGNR(Imm(4), DD3, DD3, DD3) + INCQ(itr1) + CMPQ(itr1, Imm(4)) + JLT(LabelRef("openAVX2Tail512LoopB")) + 
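+	// Only the first four double rounds take the extra trip through openAVX2Tail512LoopB
+	// (48 bytes of ciphertext hashed per iteration); the remaining six run through
+	// openAVX2Tail512LoopA alone (32 bytes per iteration), so 4*48 + 6*32 = 384 bytes are
+	// hashed during the rounds and any leftover ciphertext is absorbed in
+	// openAVX2Tail512HashLoop below.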
+ CMPQ(itr1, Imm(10)) + JNE(LabelRef("openAVX2Tail512LoopA")) + + MOVQ(inl, itr1) + SUBQ(U32(384), itr1) + ANDQ(I8(-16), itr1) +} + +func openAVX2Tail512HashLoop() { + Label("openAVX2Tail512HashLoop") + TESTQ(itr1, itr1) + JE(LabelRef("openAVX2Tail512HashEnd")) + polyAdd(Mem{Base: itr2}.Offset(0)) + polyMulAVX2() + LEAQ(Mem{Base: itr2}.Offset(16), itr2) + SUBQ(Imm(16), itr1) + JMP(LabelRef("openAVX2Tail512HashLoop")) +} + +func openAVX2Tail512HashEnd() { + Label("openAVX2Tail512HashEnd") + chacha20Constants := chacha20Constants_DATA() + VPADDD(chacha20Constants, AA0, AA0) + VPADDD(chacha20Constants, AA1, AA1) + VPADDD(chacha20Constants, AA2, AA2) + VPADDD(chacha20Constants, AA3, AA3) + VPADDD(state1StoreAVX2, BB0, BB0) + VPADDD(state1StoreAVX2, BB1, BB1) + VPADDD(state1StoreAVX2, BB2, BB2) + VPADDD(state1StoreAVX2, BB3, BB3) + VPADDD(state2StoreAVX2, CC0, CC0) + VPADDD(state2StoreAVX2, CC1, CC1) + VPADDD(state2StoreAVX2, CC2, CC2) + VPADDD(state2StoreAVX2, CC3, CC3) + VPADDD(ctr0StoreAVX2, DD0, DD0) + VPADDD(ctr1StoreAVX2, DD1, DD1) + VPADDD(ctr2StoreAVX2, DD2, DD2) + VPADDD(ctr3StoreAVX2, DD3, DD3) + VMOVDQA(CC3, tmpStoreAVX2) + VPERM2I128(Imm(0x02), AA0, BB0, CC3) + VPERM2I128(Imm(0x13), AA0, BB0, BB0) + VPERM2I128(Imm(0x02), CC0, DD0, AA0) + VPERM2I128(Imm(0x13), CC0, DD0, CC0) + VPXOR(Mem{Base: inp}.Offset(0*32), CC3, CC3) + VPXOR(Mem{Base: inp}.Offset(1*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(2*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(3*32), CC0, CC0) + VMOVDQU(CC3, Mem{Base: oup}.Offset(0*32)) + VMOVDQU(AA0, Mem{Base: oup}.Offset(1*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(2*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(3*32)) + VPERM2I128(Imm(0x02), AA1, BB1, AA0) + VPERM2I128(Imm(0x02), CC1, DD1, BB0) + VPERM2I128(Imm(0x13), AA1, BB1, CC0) + VPERM2I128(Imm(0x13), CC1, DD1, DD0) + VPXOR(Mem{Base: inp}.Offset(4*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(5*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(6*32), CC0, CC0) + VPXOR(Mem{Base: inp}.Offset(7*32), DD0, DD0) + VMOVDQU(AA0, Mem{Base: oup}.Offset(4*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(5*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(6*32)) + VMOVDQU(DD0, Mem{Base: oup}.Offset(7*32)) + VPERM2I128(Imm(0x02), AA2, BB2, AA0) + VPERM2I128(Imm(0x02), CC2, DD2, BB0) + VPERM2I128(Imm(0x13), AA2, BB2, CC0) + VPERM2I128(Imm(0x13), CC2, DD2, DD0) + VPXOR(Mem{Base: inp}.Offset(8*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(9*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(10*32), CC0, CC0) + VPXOR(Mem{Base: inp}.Offset(11*32), DD0, DD0) + VMOVDQU(AA0, Mem{Base: oup}.Offset(8*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(9*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(10*32)) + VMOVDQU(DD0, Mem{Base: oup}.Offset(11*32)) + VPERM2I128(Imm(0x02), AA3, BB3, AA0) + VPERM2I128(Imm(0x02), tmpStoreAVX2, DD3, BB0) + VPERM2I128(Imm(0x13), AA3, BB3, CC0) + VPERM2I128(Imm(0x13), tmpStoreAVX2, DD3, DD0) + + LEAQ(Mem{Base: inp}.Offset(12*32), inp) + LEAQ(Mem{Base: oup}.Offset(12*32), oup) + SUBQ(U32(12*32), inl) + + JMP(LabelRef("openAVX2TailLoop")) +} + +// ---------------------------------------------------------------------------- +// ---------------------------------------------------------------------------- + +// Implements the following function signature: +// +// func chacha20Poly1305Seal(dst []byte, key []uint32, src, ad []byte) +func chacha20Poly1305Seal() { + Implement("chacha20Poly1305Seal") + Attributes(0) + AllocLocal(288) + + MOVQ(RSP, RBP) + ADDQ(Imm(32), RBP) + ANDQ(I32(-32), RBP) + Load(Param("dst").Base(), oup) + Load(Param("key").Base(), 
keyp) + Load(Param("src").Base(), inp) + Load(Param("src").Len(), inl) + Load(Param("ad").Base(), adp) + + CMPB(Mem{Symbol: Symbol{Name: ThatPeskyUnicodeDot + "useAVX2"}, Base: StaticBase}, Imm(1)) + JE(LabelRef("chacha20Poly1305Seal_AVX2")) + + Comment("Special optimization, for very short buffers") + CMPQ(inl, Imm(128)) + JBE(LabelRef("sealSSE128")) + + Comment("In the seal case - prepare the poly key + 3 blocks of stream in the first iteration") + chacha20Constants := chacha20Constants_DATA() + MOVOU(chacha20Constants, A0) + MOVOU(Mem{Base: keyp}.Offset(1*16), B0) + MOVOU(Mem{Base: keyp}.Offset(2*16), C0) + MOVOU(Mem{Base: keyp}.Offset(3*16), D0) + + Comment("Store state on stack for future use") + MOVO(B0, state1Store) + MOVO(C0, state2Store) + + Comment("Load state, increment counter blocks") + MOVO(A0, A1) + MOVO(B0, B1) + MOVO(C0, C1) + MOVO(D0, D1) + sseIncMask := sseIncMask_DATA() + PADDL(sseIncMask, D1) + MOVO(A1, A2) + MOVO(B1, B2) + MOVO(C1, C2) + MOVO(D1, D2) + PADDL(sseIncMask, D2) + MOVO(A2, A3) + MOVO(B2, B3) + MOVO(C2, C3) + MOVO(D2, D3) + PADDL(sseIncMask, D3) + + Comment("Store counters") + MOVO(D0, ctr0Store) + MOVO(D1, ctr1Store) + MOVO(D2, ctr2Store) + MOVO(D3, ctr3Store) + MOVQ(U32(10), itr2) + + sealSSEIntroLoop() + sealSSEMainLoop() + + // ---------------------------------------------------------------------------- + // Special optimization for the last 64 bytes of plaintext + sealSSETail64() + sealSSETail64LoopA() + sealSSETail64LoopB() + + // ---------------------------------------------------------------------------- + // Special optimization for the last 128 bytes of plaintext + sealSSETail128() + sealSSETail128LoopA() + sealSSETail128LoopB() + + // ---------------------------------------------------------------------------- + // Special optimization for the last 192 bytes of plaintext + sealSSETail192() + sealSSETail192LoopA() + sealSSETail192LoopB() + + // ---------------------------------------------------------------------------- + // Special seal optimization for buffers smaller than 129 bytes + sealSSE128() + sealSSE128SealHash() + sealSSE128Seal() + sealSSETail() + sealSSETailLoadLoop() + sealSSEFinalize() + + // ---------------------------------------------------------------------------- + // ------------------------- AVX2 Code ---------------------------------------- + chacha20Poly1305Seal_AVX2() + sealAVX2IntroLoop() + sealAVX2MainLoop() + sealAVX2InternalLoop() + sealAVX2InternalLoopStart() + + // ---------------------------------------------------------------------------- + // Special optimization for buffers smaller than 193 bytes + seal192AVX2() + sealAVX2192InnerCipherLoop() + sealAVX2ShortSeal() + sealAVX2SealHash() + sealAVX2ShortSealLoop() + sealAVX2ShortTail32() + sealAVX2ShortDone() + + // ---------------------------------------------------------------------------- + // Special optimization for buffers smaller than 321 bytes + seal320AVX2() + sealAVX2320InnerCipherLoop() + + // ---------------------------------------------------------------------------- + // Special optimization for the last 128 bytes of ciphertext + sealAVX2Tail128() + sealAVX2Tail128LoopA() + sealAVX2Tail128LoopB() + + // ---------------------------------------------------------------------------- + // Special optimization for the last 256 bytes of ciphertext + sealAVX2Tail256() + sealAVX2Tail256LoopA() + sealAVX2Tail256LoopB() + + // ---------------------------------------------------------------------------- + // Special optimization for the last 384 bytes of ciphertext 
+ sealAVX2Tail384() + sealAVX2Tail384LoopA() + sealAVX2Tail384LoopB() + + // ---------------------------------------------------------------------------- + // Special optimization for the last 512 bytes of ciphertext + sealAVX2Tail512() + sealAVX2Tail512LoopA() + sealAVX2Tail512LoopB() +} + +func sealSSEIntroLoop() { + Label("sealSSEIntroLoop") + MOVO(C3, tmpStore) + chachaQR(A0, B0, C0, D0, C3) + chachaQR(A1, B1, C1, D1, C3) + chachaQR(A2, B2, C2, D2, C3) + MOVO(tmpStore, C3) + MOVO(C1, tmpStore) + chachaQR(A3, B3, C3, D3, C1) + MOVO(tmpStore, C1) + shiftB0Left() + shiftB1Left() + shiftB2Left() + shiftB3Left() + shiftC0Left() + shiftC1Left() + shiftC2Left() + shiftC3Left() + shiftD0Left() + shiftD1Left() + shiftD2Left() + shiftD3Left() + + MOVO(C3, tmpStore) + chachaQR(A0, B0, C0, D0, C3) + chachaQR(A1, B1, C1, D1, C3) + chachaQR(A2, B2, C2, D2, C3) + MOVO(tmpStore, C3) + MOVO(C1, tmpStore) + chachaQR(A3, B3, C3, D3, C1) + MOVO(tmpStore, C1) + shiftB0Right() + shiftB1Right() + shiftB2Right() + shiftB3Right() + shiftC0Right() + shiftC1Right() + shiftC2Right() + shiftC3Right() + shiftD0Right() + shiftD1Right() + shiftD2Right() + shiftD3Right() + DECQ(itr2) + JNE(LabelRef("sealSSEIntroLoop")) + + Comment("Add in the state") + chacha20Constants := chacha20Constants_DATA() + PADDD(chacha20Constants, A0) + PADDD(chacha20Constants, A1) + PADDD(chacha20Constants, A2) + PADDD(chacha20Constants, A3) + PADDD(state1Store, B0) + PADDD(state1Store, B1) + PADDD(state1Store, B2) + PADDD(state1Store, B3) + PADDD(state2Store, C1) + PADDD(state2Store, C2) + PADDD(state2Store, C3) + PADDD(ctr1Store, D1) + PADDD(ctr2Store, D2) + PADDD(ctr3Store, D3) + + Comment("Clamp and store the key") + polyClampMask := polyClampMask_DATA() + PAND(polyClampMask, A0) + MOVO(A0, rStore) + MOVO(B0, sStore) + + Comment("Hash AAD") + MOVQ(NewParamAddr("ad_len", 80), itr2) + CALL(LabelRef("polyHashADInternal<>(SB)")) + + MOVOU(Mem{Base: inp}.Offset(0*16), A0) + MOVOU(Mem{Base: inp}.Offset(1*16), B0) + MOVOU(Mem{Base: inp}.Offset(2*16), C0) + MOVOU(Mem{Base: inp}.Offset(3*16), D0) + PXOR(A0, A1) + PXOR(B0, B1) + PXOR(C0, C1) + PXOR(D0, D1) + MOVOU(A1, Mem{Base: oup}.Offset(0*16)) + MOVOU(B1, Mem{Base: oup}.Offset(1*16)) + MOVOU(C1, Mem{Base: oup}.Offset(2*16)) + MOVOU(D1, Mem{Base: oup}.Offset(3*16)) + MOVOU(Mem{Base: inp}.Offset(4*16), A0) + MOVOU(Mem{Base: inp}.Offset(5*16), B0) + MOVOU(Mem{Base: inp}.Offset(6*16), C0) + MOVOU(Mem{Base: inp}.Offset(7*16), D0) + PXOR(A0, A2) + PXOR(B0, B2) + PXOR(C0, C2) + PXOR(D0, D2) + MOVOU(A2, Mem{Base: oup}.Offset(4*16)) + MOVOU(B2, Mem{Base: oup}.Offset(5*16)) + MOVOU(C2, Mem{Base: oup}.Offset(6*16)) + MOVOU(D2, Mem{Base: oup}.Offset(7*16)) + + MOVQ(U32(128), itr1) + SUBQ(Imm(128), inl) + LEAQ(Mem{Base: inp}.Offset(128), inp) + + MOVO(A3, A1) + MOVO(B3, B1) + MOVO(C3, C1) + MOVO(D3, D1) + + CMPQ(inl, Imm(64)) + JBE(LabelRef("sealSSE128SealHash")) + + MOVOU(Mem{Base: inp}.Offset(0*16), A0) + MOVOU(Mem{Base: inp}.Offset(1*16), B0) + MOVOU(Mem{Base: inp}.Offset(2*16), C0) + MOVOU(Mem{Base: inp}.Offset(3*16), D0) + PXOR(A0, A3) + PXOR(B0, B3) + PXOR(C0, C3) + PXOR(D0, D3) + MOVOU(A3, Mem{Base: oup}.Offset(8*16)) + MOVOU(B3, Mem{Base: oup}.Offset(9*16)) + MOVOU(C3, Mem{Base: oup}.Offset(10*16)) + MOVOU(D3, Mem{Base: oup}.Offset(11*16)) + + ADDQ(Imm(64), itr1) + SUBQ(Imm(64), inl) + LEAQ(Mem{Base: inp}.Offset(64), inp) + + MOVQ(U32(2), itr1) + MOVQ(U32(8), itr2) + + CMPQ(inl, Imm(64)) + JBE(LabelRef("sealSSETail64")) + CMPQ(inl, Imm(128)) + JBE(LabelRef("sealSSETail128")) + CMPQ(inl, Imm(192)) + 
JBE(LabelRef("sealSSETail192")) +} + +func sealSSEMainLoop() { + Label("sealSSEMainLoop") + Comment("Load state, increment counter blocks") + chacha20Constants := chacha20Constants_DATA() + MOVO(chacha20Constants, A0) + MOVO(state1Store, B0) + MOVO(state2Store, C0) + MOVO(ctr3Store, D0) + sseIncMask := sseIncMask_DATA() + PADDL(sseIncMask, D0) + MOVO(A0, A1) + MOVO(B0, B1) + MOVO(C0, C1) + MOVO(D0, D1) + PADDL(sseIncMask, D1) + MOVO(A1, A2) + MOVO(B1, B2) + MOVO(C1, C2) + MOVO(D1, D2) + PADDL(sseIncMask, D2) + MOVO(A2, A3) + MOVO(B2, B3) + MOVO(C2, C3) + MOVO(D2, D3) + PADDL(sseIncMask, D3) + + Comment("Store counters") + MOVO(D0, ctr0Store) + MOVO(D1, ctr1Store) + MOVO(D2, ctr2Store) + MOVO(D3, ctr3Store) + + Label("sealSSEInnerLoop") + MOVO(C3, tmpStore) + chachaQR(A0, B0, C0, D0, C3) + chachaQR(A1, B1, C1, D1, C3) + chachaQR(A2, B2, C2, D2, C3) + MOVO(tmpStore, C3) + MOVO(C1, tmpStore) + chachaQR(A3, B3, C3, D3, C1) + MOVO(tmpStore, C1) + polyAdd(Mem{Base: oup}.Offset(0)) + shiftB0Left() + shiftB1Left() + shiftB2Left() + shiftB3Left() + shiftC0Left() + shiftC1Left() + shiftC2Left() + shiftC3Left() + shiftD0Left() + shiftD1Left() + shiftD2Left() + shiftD3Left() + polyMulStage1() + polyMulStage2() + LEAQ(Mem{Base: oup}.Offset(2*8), oup) + MOVO(C3, tmpStore) + chachaQR(A0, B0, C0, D0, C3) + chachaQR(A1, B1, C1, D1, C3) + chachaQR(A2, B2, C2, D2, C3) + MOVO(tmpStore, C3) + MOVO(C1, tmpStore) + polyMulStage3() + chachaQR(A3, B3, C3, D3, C1) + MOVO(tmpStore, C1) + polyMulReduceStage() + shiftB0Right() + shiftB1Right() + shiftB2Right() + shiftB3Right() + shiftC0Right() + shiftC1Right() + shiftC2Right() + shiftC3Right() + shiftD0Right() + shiftD1Right() + shiftD2Right() + shiftD3Right() + DECQ(itr2) + JGE(LabelRef("sealSSEInnerLoop")) + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + LEAQ(Mem{Base: oup}.Offset(2*8), oup) + DECQ(itr1) + JG(LabelRef("sealSSEInnerLoop")) + + Comment("Add in the state") + PADDD(chacha20Constants, A0) + PADDD(chacha20Constants, A1) + PADDD(chacha20Constants, A2) + PADDD(chacha20Constants, A3) + PADDD(state1Store, B0) + PADDD(state1Store, B1) + PADDD(state1Store, B2) + PADDD(state1Store, B3) + PADDD(state2Store, C0) + PADDD(state2Store, C1) + PADDD(state2Store, C2) + PADDD(state2Store, C3) + PADDD(ctr0Store, D0) + PADDD(ctr1Store, D1) + PADDD(ctr2Store, D2) + PADDD(ctr3Store, D3) + MOVO(D3, tmpStore) + + Comment("Load - xor - store") + MOVOU(Mem{Base: inp}.Offset(0*16), D3) + PXOR(D3, A0) + MOVOU(Mem{Base: inp}.Offset(1*16), D3) + PXOR(D3, B0) + MOVOU(Mem{Base: inp}.Offset(2*16), D3) + PXOR(D3, C0) + MOVOU(Mem{Base: inp}.Offset(3*16), D3) + PXOR(D3, D0) + MOVOU(A0, Mem{Base: oup}.Offset(0*16)) + MOVOU(B0, Mem{Base: oup}.Offset(1*16)) + MOVOU(C0, Mem{Base: oup}.Offset(2*16)) + MOVOU(D0, Mem{Base: oup}.Offset(3*16)) + MOVO(tmpStore, D3) + + MOVOU(Mem{Base: inp}.Offset(4*16), A0) + MOVOU(Mem{Base: inp}.Offset(5*16), B0) + MOVOU(Mem{Base: inp}.Offset(6*16), C0) + MOVOU(Mem{Base: inp}.Offset(7*16), D0) + PXOR(A0, A1) + PXOR(B0, B1) + PXOR(C0, C1) + PXOR(D0, D1) + MOVOU(A1, Mem{Base: oup}.Offset(4*16)) + MOVOU(B1, Mem{Base: oup}.Offset(5*16)) + MOVOU(C1, Mem{Base: oup}.Offset(6*16)) + MOVOU(D1, Mem{Base: oup}.Offset(7*16)) + MOVOU(Mem{Base: inp}.Offset(8*16), A0) + MOVOU(Mem{Base: inp}.Offset(9*16), B0) + MOVOU(Mem{Base: inp}.Offset(10*16), C0) + MOVOU(Mem{Base: inp}.Offset(11*16), D0) + PXOR(A0, A2) + PXOR(B0, B2) + PXOR(C0, C2) + PXOR(D0, D2) + MOVOU(A2, Mem{Base: oup}.Offset(8*16)) + MOVOU(B2, Mem{Base: oup}.Offset(9*16)) + MOVOU(C2, Mem{Base: oup}.Offset(10*16)) + 
MOVOU(D2, Mem{Base: oup}.Offset(11*16)) + ADDQ(Imm(192), inp) + MOVQ(U32(192), itr1) + SUBQ(Imm(192), inl) + MOVO(A3, A1) + MOVO(B3, B1) + MOVO(C3, C1) + MOVO(D3, D1) + CMPQ(inl, Imm(64)) + JBE(LabelRef("sealSSE128SealHash")) + MOVOU(Mem{Base: inp}.Offset(0*16), A0) + MOVOU(Mem{Base: inp}.Offset(1*16), B0) + MOVOU(Mem{Base: inp}.Offset(2*16), C0) + MOVOU(Mem{Base: inp}.Offset(3*16), D0) + PXOR(A0, A3) + PXOR(B0, B3) + PXOR(C0, C3) + PXOR(D0, D3) + MOVOU(A3, Mem{Base: oup}.Offset(12*16)) + MOVOU(B3, Mem{Base: oup}.Offset(13*16)) + MOVOU(C3, Mem{Base: oup}.Offset(14*16)) + MOVOU(D3, Mem{Base: oup}.Offset(15*16)) + LEAQ(Mem{Base: inp}.Offset(64), inp) + SUBQ(Imm(64), inl) + MOVQ(U32(6), itr1) + MOVQ(U32(4), itr2) + CMPQ(inl, Imm(192)) + JG(LabelRef("sealSSEMainLoop")) + + MOVQ(inl, itr1) + TESTQ(inl, inl) + JE(LabelRef("sealSSE128SealHash")) + MOVQ(U32(6), itr1) + CMPQ(inl, Imm(64)) + JBE(LabelRef("sealSSETail64")) + CMPQ(inl, Imm(128)) + JBE(LabelRef("sealSSETail128")) + JMP(LabelRef("sealSSETail192")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for the last 64 bytes of plaintext + +// Need to encrypt up to 64 bytes - prepare single block, hash 192 or 256 bytes +func sealSSETail64() { + Label("sealSSETail64") + chacha20Constants := chacha20Constants_DATA() + MOVO(chacha20Constants, A1) + MOVO(state1Store, B1) + MOVO(state2Store, C1) + MOVO(ctr3Store, D1) + sseIncMask := sseIncMask_DATA() + PADDL(sseIncMask, D1) + MOVO(D1, ctr0Store) +} + +// Perform ChaCha rounds, while hashing the previously encrypted ciphertext +func sealSSETail64LoopA() { + Label("sealSSETail64LoopA") + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + LEAQ(Mem{Base: oup}.Offset(16), oup) +} + +func sealSSETail64LoopB() { + Label("sealSSETail64LoopB") + chachaQR(A1, B1, C1, D1, T1) + shiftB1Left() + shiftC1Left() + shiftD1Left() + chachaQR(A1, B1, C1, D1, T1) + shiftB1Right() + shiftC1Right() + shiftD1Right() + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + LEAQ(Mem{Base: oup}.Offset(16), oup) + + DECQ(itr1) + JG(LabelRef("sealSSETail64LoopA")) + + DECQ(itr2) + JGE(LabelRef("sealSSETail64LoopB")) + chacha20Constants := chacha20Constants_DATA() + PADDL(chacha20Constants, A1) + PADDL(state1Store, B1) + PADDL(state2Store, C1) + PADDL(ctr0Store, D1) + + JMP(LabelRef("sealSSE128Seal")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for the last 128 bytes of plaintext + +// Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes +func sealSSETail128() { + Label("sealSSETail128") + chacha20Constants := chacha20Constants_DATA() + MOVO(chacha20Constants, A0) + MOVO(state1Store, B0) + MOVO(state2Store, C0) + MOVO(ctr3Store, D0) + sseIncMask := sseIncMask_DATA() + PADDL(sseIncMask, D0) + MOVO(D0, ctr0Store) + MOVO(A0, A1) + MOVO(B0, B1) + MOVO(C0, C1) + MOVO(D0, D1) + PADDL(sseIncMask, D1) + MOVO(D1, ctr1Store) +} + +// Perform ChaCha rounds, while hashing the previously encrypted ciphertext +func sealSSETail128LoopA() { + Label("sealSSETail128LoopA") + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + LEAQ(Mem{Base: oup}.Offset(16), oup) +} + +func sealSSETail128LoopB() { + Label("sealSSETail128LoopB") + chachaQR(A0, B0, C0, D0, T0) + chachaQR(A1, B1, C1, D1, T0) + shiftB0Left() + shiftC0Left() + shiftD0Left() + shiftB1Left() + shiftC1Left() + shiftD1Left() + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + LEAQ(Mem{Base: oup}.Offset(16), oup) + chachaQR(A0, B0, C0, D0, T0) + 
chachaQR(A1, B1, C1, D1, T0) + shiftB0Right() + shiftC0Right() + shiftD0Right() + shiftB1Right() + shiftC1Right() + shiftD1Right() + + DECQ(itr1) + JG(LabelRef("sealSSETail128LoopA")) + + DECQ(itr2) + JGE(LabelRef("sealSSETail128LoopB")) + + chacha20Constants := chacha20Constants_DATA() + PADDL(chacha20Constants, A0) + PADDL(chacha20Constants, A1) + PADDL(state1Store, B0) + PADDL(state1Store, B1) + PADDL(state2Store, C0) + PADDL(state2Store, C1) + PADDL(ctr0Store, D0) + PADDL(ctr1Store, D1) + + MOVOU(Mem{Base: inp}.Offset(0*16), T0) + MOVOU(Mem{Base: inp}.Offset(1*16), T1) + MOVOU(Mem{Base: inp}.Offset(2*16), T2) + MOVOU(Mem{Base: inp}.Offset(3*16), T3) + PXOR(T0, A0) + PXOR(T1, B0) + PXOR(T2, C0) + PXOR(T3, D0) + MOVOU(A0, Mem{Base: oup}.Offset(0*16)) + MOVOU(B0, Mem{Base: oup}.Offset(1*16)) + MOVOU(C0, Mem{Base: oup}.Offset(2*16)) + MOVOU(D0, Mem{Base: oup}.Offset(3*16)) + + MOVQ(U32(64), itr1) + LEAQ(Mem{Base: inp}.Offset(64), inp) + SUBQ(Imm(64), inl) + + JMP(LabelRef("sealSSE128SealHash")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for the last 192 bytes of plaintext + +// Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes +func sealSSETail192() { + Label("sealSSETail192") + chacha20Constants := chacha20Constants_DATA() + MOVO(chacha20Constants, A0) + MOVO(state1Store, B0) + MOVO(state2Store, C0) + MOVO(ctr3Store, D0) + sseIncMask := sseIncMask_DATA() + PADDL(sseIncMask, D0) + MOVO(D0, ctr0Store) + MOVO(A0, A1) + MOVO(B0, B1) + MOVO(C0, C1) + MOVO(D0, D1) + PADDL(sseIncMask, D1) + MOVO(D1, ctr1Store) + MOVO(A1, A2) + MOVO(B1, B2) + MOVO(C1, C2) + MOVO(D1, D2) + PADDL(sseIncMask, D2) + MOVO(D2, ctr2Store) +} + +// Perform ChaCha rounds, while hashing the previously encrypted ciphertext +func sealSSETail192LoopA() { + Label("sealSSETail192LoopA") + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + LEAQ(Mem{Base: oup}.Offset(16), oup) +} + +func sealSSETail192LoopB() { + Label("sealSSETail192LoopB") + chachaQR(A0, B0, C0, D0, T0) + chachaQR(A1, B1, C1, D1, T0) + chachaQR(A2, B2, C2, D2, T0) + shiftB0Left() + shiftC0Left() + shiftD0Left() + shiftB1Left() + shiftC1Left() + shiftD1Left() + shiftB2Left() + shiftC2Left() + shiftD2Left() + + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + LEAQ(Mem{Base: oup}.Offset(16), oup) + + chachaQR(A0, B0, C0, D0, T0) + chachaQR(A1, B1, C1, D1, T0) + chachaQR(A2, B2, C2, D2, T0) + shiftB0Right() + shiftC0Right() + shiftD0Right() + shiftB1Right() + shiftC1Right() + shiftD1Right() + shiftB2Right() + shiftC2Right() + shiftD2Right() + + DECQ(itr1) + JG(LabelRef("sealSSETail192LoopA")) + + DECQ(itr2) + JGE(LabelRef("sealSSETail192LoopB")) + + chacha20Constants := chacha20Constants_DATA() + PADDL(chacha20Constants, A0) + PADDL(chacha20Constants, A1) + PADDL(chacha20Constants, A2) + PADDL(state1Store, B0) + PADDL(state1Store, B1) + PADDL(state1Store, B2) + PADDL(state2Store, C0) + PADDL(state2Store, C1) + PADDL(state2Store, C2) + PADDL(ctr0Store, D0) + PADDL(ctr1Store, D1) + PADDL(ctr2Store, D2) + + MOVOU(Mem{Base: inp}.Offset(0*16), T0) + MOVOU(Mem{Base: inp}.Offset(1*16), T1) + MOVOU(Mem{Base: inp}.Offset(2*16), T2) + MOVOU(Mem{Base: inp}.Offset(3*16), T3) + PXOR(T0, A0) + PXOR(T1, B0) + PXOR(T2, C0) + PXOR(T3, D0) + MOVOU(A0, Mem{Base: oup}.Offset(0*16)) + MOVOU(B0, Mem{Base: oup}.Offset(1*16)) + MOVOU(C0, Mem{Base: oup}.Offset(2*16)) + MOVOU(D0, Mem{Base: oup}.Offset(3*16)) + MOVOU(Mem{Base: inp}.Offset(4*16), T0) + MOVOU(Mem{Base: inp}.Offset(5*16), T1) + 
MOVOU(Mem{Base: inp}.Offset(6*16), T2) + MOVOU(Mem{Base: inp}.Offset(7*16), T3) + PXOR(T0, A1) + PXOR(T1, B1) + PXOR(T2, C1) + PXOR(T3, D1) + MOVOU(A1, Mem{Base: oup}.Offset(4*16)) + MOVOU(B1, Mem{Base: oup}.Offset(5*16)) + MOVOU(C1, Mem{Base: oup}.Offset(6*16)) + MOVOU(D1, Mem{Base: oup}.Offset(7*16)) + + MOVO(A2, A1) + MOVO(B2, B1) + MOVO(C2, C1) + MOVO(D2, D1) + MOVQ(U32(128), itr1) + LEAQ(Mem{Base: inp}.Offset(128), inp) + SUBQ(Imm(128), inl) + + JMP(LabelRef("sealSSE128SealHash")) +} + +// ---------------------------------------------------------------------------- +// Special seal optimization for buffers smaller than 129 bytes + +// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks +func sealSSE128() { + Label("sealSSE128") + chacha20Constants := chacha20Constants_DATA() + MOVOU(chacha20Constants, A0) + MOVOU(Mem{Base: keyp}.Offset(1*16), B0) + MOVOU(Mem{Base: keyp}.Offset(2*16), C0) + MOVOU(Mem{Base: keyp}.Offset(3*16), D0) + MOVO(A0, A1) + MOVO(B0, B1) + MOVO(C0, C1) + MOVO(D0, D1) + sseIncMask := sseIncMask_DATA() + PADDL(sseIncMask, D1) + MOVO(A1, A2) + MOVO(B1, B2) + MOVO(C1, C2) + MOVO(D1, D2) + PADDL(sseIncMask, D2) + MOVO(B0, T1) + MOVO(C0, T2) + MOVO(D1, T3) + MOVQ(U32(10), itr2) + + Label("sealSSE128InnerCipherLoop") + chachaQR(A0, B0, C0, D0, T0) + chachaQR(A1, B1, C1, D1, T0) + chachaQR(A2, B2, C2, D2, T0) + shiftB0Left() + shiftB1Left() + shiftB2Left() + shiftC0Left() + shiftC1Left() + shiftC2Left() + shiftD0Left() + shiftD1Left() + shiftD2Left() + chachaQR(A0, B0, C0, D0, T0) + chachaQR(A1, B1, C1, D1, T0) + chachaQR(A2, B2, C2, D2, T0) + shiftB0Right() + shiftB1Right() + shiftB2Right() + shiftC0Right() + shiftC1Right() + shiftC2Right() + shiftD0Right() + shiftD1Right() + shiftD2Right() + DECQ(itr2) + JNE(LabelRef("sealSSE128InnerCipherLoop")) + + Comment("A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded") + PADDL(chacha20Constants, A0) + PADDL(chacha20Constants, A1) + PADDL(chacha20Constants, A2) + PADDL(T1, B0) + PADDL(T1, B1) + PADDL(T1, B2) + PADDL(T2, C1) + PADDL(T2, C2) + PADDL(T3, D1) + PADDL(sseIncMask, T3) + PADDL(T3, D2) + polyClampMask := polyClampMask_DATA() + PAND(polyClampMask, A0) + MOVOU(A0, rStore) + MOVOU(B0, sStore) + + Comment("Hash") + MOVQ(NewParamAddr("ad_len", 80), itr2) + CALL(LabelRef("polyHashADInternal<>(SB)")) + XORQ(itr1, itr1) +} + +// itr1 holds the number of bytes encrypted but not yet hashed +func sealSSE128SealHash() { + Label("sealSSE128SealHash") + CMPQ(itr1, Imm(16)) + JB(LabelRef("sealSSE128Seal")) + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + + SUBQ(Imm(16), itr1) + ADDQ(Imm(16), oup) + + JMP(LabelRef("sealSSE128SealHash")) +} + +func sealSSE128Seal() { + Label("sealSSE128Seal") + CMPQ(inl, Imm(16)) + JB(LabelRef("sealSSETail")) + SUBQ(Imm(16), inl) + + Comment("Load for decryption") + MOVOU(Mem{Base: inp}, T0) + PXOR(T0, A1) + MOVOU(A1, Mem{Base: oup}) + LEAQ(Mem{Base: inp}.Offset(1*16), inp) + LEAQ(Mem{Base: oup}.Offset(1*16), oup) + + Comment("Extract for hashing") + MOVQ(A1, t0) + PSRLDQ(Imm(8), A1) + MOVQ(A1, t1) + ADDQ(t0, acc0) + ADCQ(t1, acc1) + ADCQ(Imm(1), acc2) + polyMul() + + Comment("Shift the stream \"left\"") + MOVO(B1, A1) + MOVO(C1, B1) + MOVO(D1, C1) + MOVO(A2, D1) + MOVO(B2, A2) + MOVO(C2, B2) + MOVO(D2, C2) + JMP(LabelRef("sealSSE128Seal")) +} + +func sealSSETail() { + Label("sealSSETail") + TESTQ(inl, inl) + JE(LabelRef("sealSSEFinalize")) + + Comment("We can only load the PT one byte at a time to avoid read after end of buffer") + MOVQ(inl, 
itr2) + SHLQ(Imm(4), itr2) + andMask := andMask_DATA() + LEAQ(andMask, t0) + MOVQ(inl, itr1) + LEAQ(Mem{Base: inp, Index: inl, Scale: 1}.Offset(-1), inp) + XORQ(t2, t2) + XORQ(t3, t3) + XORQ(RAX, RAX) +} + +func sealSSETailLoadLoop() { + Label("sealSSETailLoadLoop") + SHLQ(Imm(8), t2, t3) + SHLQ(Imm(8), t2) + // Hack to get Avo to emit: + // MOVB (inp), AX + Instruction(&ir.Instruction{Opcode: "MOVB", Operands: []Op{Mem{Base: inp}, AX}}) + XORQ(RAX, t2) + LEAQ(Mem{Base: inp}.Offset(-1), inp) + DECQ(itr1) + JNE(LabelRef("sealSSETailLoadLoop")) + MOVQ(t2, tmpStore.Offset(0)) + MOVQ(t3, tmpStore.Offset(8)) + PXOR(tmpStore.Offset(0), A1) + MOVOU(A1, Mem{Base: oup}) + MOVOU(Mem{Base: t0, Index: itr2, Scale: 1}.Offset(-16), T0) + PAND(T0, A1) + MOVQ(A1, t0) + PSRLDQ(Imm(8), A1) + MOVQ(A1, t1) + ADDQ(t0, acc0) + ADCQ(t1, acc1) + ADCQ(Imm(1), acc2) + polyMul() + + ADDQ(inl, oup) +} + +func sealSSEFinalize() { + Label("sealSSEFinalize") + Comment("Hash in the buffer lengths") + ADDQ(NewParamAddr("ad_len", 80), acc0) + ADCQ(NewParamAddr("src_len", 56), acc1) + ADCQ(Imm(1), acc2) + polyMul() + + Comment("Final reduce") + MOVQ(acc0, t0) + MOVQ(acc1, t1) + MOVQ(acc2, t2) + SUBQ(I8(-5), acc0) + SBBQ(I8(-1), acc1) + SBBQ(Imm(3), acc2) + CMOVQCS(t0, acc0) + CMOVQCS(t1, acc1) + CMOVQCS(t2, acc2) + + Comment("Add in the \"s\" part of the key") + ADDQ(sStore.Offset(0), acc0) + ADCQ(sStore.Offset(8), acc1) + + Comment("Finally store the tag at the end of the message") + MOVQ(acc0, Mem{Base: oup}.Offset(0*8)) + MOVQ(acc1, Mem{Base: oup}.Offset(1*8)) + RET() +} + +// ---------------------------------------------------------------------------- +// ------------------------- AVX2 Code ---------------------------------------- + +func chacha20Poly1305Seal_AVX2() { + Label("chacha20Poly1305Seal_AVX2") + VZEROUPPER() + chacha20Constants := chacha20Constants_DATA() + VMOVDQU(chacha20Constants, AA0) + VBROADCASTI128_16_R8_YMM14() + VBROADCASTI128_32_R8_YMM12() + VBROADCASTI128_48_R8_YMM4() + avx2InitMask := avx2InitMask_DATA() + VPADDD(avx2InitMask, DD0, DD0) + + Comment("Special optimizations, for very short buffers") + CMPQ(inl, U32(192)) + JBE(LabelRef("seal192AVX2")) + CMPQ(inl, U32(320)) + JBE(LabelRef("seal320AVX2")) + + Comment("For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream") + VMOVDQA(AA0, AA1) + VMOVDQA(AA0, AA2) + VMOVDQA(AA0, AA3) + VMOVDQA(BB0, BB1) + VMOVDQA(BB0, BB2) + VMOVDQA(BB0, BB3) + VMOVDQA(BB0, state1StoreAVX2) + VMOVDQA(CC0, CC1) + VMOVDQA(CC0, CC2) + VMOVDQA(CC0, CC3) + VMOVDQA(CC0, state2StoreAVX2) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD0, DD1) + VMOVDQA(DD0, ctr0StoreAVX2) + VPADDD(avx2IncMask, DD1, DD2) + VMOVDQA(DD1, ctr1StoreAVX2) + VPADDD(avx2IncMask, DD2, DD3) + VMOVDQA(DD2, ctr2StoreAVX2) + VMOVDQA(DD3, ctr3StoreAVX2) + MOVQ(U32(10), itr2) +} + +func sealAVX2IntroLoop() { + Label("sealAVX2IntroLoop") + VMOVDQA(CC3, tmpStoreAVX2) + chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3) + chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3) + chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) + VMOVDQA(tmpStoreAVX2, CC3) + VMOVDQA(CC1, tmpStoreAVX2) + chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) + VMOVDQA(tmpStoreAVX2, CC1) + + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(12), DD0, DD0, DD0) + VPALIGNR(Imm(4), BB1, BB1, BB1) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(12), DD1, DD1, DD1) + VPALIGNR(Imm(4), BB2, BB2, BB2) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(12), DD2, DD2, DD2) + VPALIGNR(Imm(4), BB3, BB3, BB3) + 
VPALIGNR(Imm(8), CC3, CC3, CC3) + VPALIGNR(Imm(12), DD3, DD3, DD3) + + VMOVDQA(CC3, tmpStoreAVX2) + chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3) + chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3) + chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) + VMOVDQA(tmpStoreAVX2, CC3) + VMOVDQA(CC1, tmpStoreAVX2) + chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) + VMOVDQA(tmpStoreAVX2, CC1) + + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(4), DD0, DD0, DD0) + VPALIGNR(Imm(12), BB1, BB1, BB1) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(4), DD1, DD1, DD1) + VPALIGNR(Imm(12), BB2, BB2, BB2) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(4), DD2, DD2, DD2) + VPALIGNR(Imm(12), BB3, BB3, BB3) + VPALIGNR(Imm(8), CC3, CC3, CC3) + VPALIGNR(Imm(4), DD3, DD3, DD3) + DECQ(itr2) + JNE(LabelRef("sealAVX2IntroLoop")) + + chacha20Constants := chacha20Constants_DATA() + VPADDD(chacha20Constants, AA0, AA0) + VPADDD(chacha20Constants, AA1, AA1) + VPADDD(chacha20Constants, AA2, AA2) + VPADDD(chacha20Constants, AA3, AA3) + VPADDD(state1StoreAVX2, BB0, BB0) + VPADDD(state1StoreAVX2, BB1, BB1) + VPADDD(state1StoreAVX2, BB2, BB2) + VPADDD(state1StoreAVX2, BB3, BB3) + VPADDD(state2StoreAVX2, CC0, CC0) + VPADDD(state2StoreAVX2, CC1, CC1) + VPADDD(state2StoreAVX2, CC2, CC2) + VPADDD(state2StoreAVX2, CC3, CC3) + VPADDD(ctr0StoreAVX2, DD0, DD0) + VPADDD(ctr1StoreAVX2, DD1, DD1) + VPADDD(ctr2StoreAVX2, DD2, DD2) + VPADDD(ctr3StoreAVX2, DD3, DD3) + + VPERM2I128(Imm(0x13), CC0, DD0, CC0) + VPERM2I128(Imm(0x02), AA0, BB0, DD0) + VPERM2I128(Imm(0x13), AA0, BB0, AA0) + + Comment("Clamp and store poly key") + polyClampMask := polyClampMask_DATA() + VPAND(polyClampMask, DD0, DD0) + VMOVDQA(DD0, rsStoreAVX2) + + Comment("Hash AD") + MOVQ(NewParamAddr("ad_len", 80), itr2) + CALL(LabelRef("polyHashADInternal<>(SB)")) + + Comment("Can store at least 320 bytes") + VPXOR(Mem{Base: inp}.Offset(0*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(1*32), CC0, CC0) + VMOVDQU(AA0, Mem{Base: oup}.Offset(0*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(1*32)) + + VPERM2I128(Imm(0x02), AA1, BB1, AA0) + VPERM2I128(Imm(0x02), CC1, DD1, BB0) + VPERM2I128(Imm(0x13), AA1, BB1, CC0) + VPERM2I128(Imm(0x13), CC1, DD1, DD0) + VPXOR(Mem{Base: inp}.Offset(2*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(3*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(4*32), CC0, CC0) + VPXOR(Mem{Base: inp}.Offset(5*32), DD0, DD0) + VMOVDQU(AA0, Mem{Base: oup}.Offset(2*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(3*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(4*32)) + VMOVDQU(DD0, Mem{Base: oup}.Offset(5*32)) + VPERM2I128(Imm(0x02), AA2, BB2, AA0) + VPERM2I128(Imm(0x02), CC2, DD2, BB0) + VPERM2I128(Imm(0x13), AA2, BB2, CC0) + VPERM2I128(Imm(0x13), CC2, DD2, DD0) + VPXOR(Mem{Base: inp}.Offset(6*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(7*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(8*32), CC0, CC0) + VPXOR(Mem{Base: inp}.Offset(9*32), DD0, DD0) + VMOVDQU(AA0, Mem{Base: oup}.Offset(6*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(7*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(8*32)) + VMOVDQU(DD0, Mem{Base: oup}.Offset(9*32)) + + MOVQ(U32(320), itr1) + SUBQ(U32(320), inl) + LEAQ(Mem{Base: inp}.Offset(320), inp) + + VPERM2I128(Imm(0x02), AA3, BB3, AA0) + VPERM2I128(Imm(0x02), CC3, DD3, BB0) + VPERM2I128(Imm(0x13), AA3, BB3, CC0) + VPERM2I128(Imm(0x13), CC3, DD3, DD0) + CMPQ(inl, Imm(128)) + JBE(LabelRef("sealAVX2SealHash")) + + VPXOR(Mem{Base: inp}.Offset(0*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(1*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(2*32), CC0, CC0) + VPXOR(Mem{Base: 
inp}.Offset(3*32), DD0, DD0) + VMOVDQU(AA0, Mem{Base: oup}.Offset(10*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(11*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(12*32)) + VMOVDQU(DD0, Mem{Base: oup}.Offset(13*32)) + SUBQ(Imm(128), inl) + LEAQ(Mem{Base: inp}.Offset(128), inp) + + MOVQ(U32(8), itr1) + MOVQ(U32(2), itr2) + + CMPQ(inl, Imm(128)) + JBE(LabelRef("sealAVX2Tail128")) + CMPQ(inl, U32(256)) + JBE(LabelRef("sealAVX2Tail256")) + CMPQ(inl, U32(384)) + JBE(LabelRef("sealAVX2Tail384")) + CMPQ(inl, U32(512)) + JBE(LabelRef("sealAVX2Tail512")) + + Comment("We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop") + VMOVDQA(chacha20Constants, AA0) + VMOVDQA(AA0, AA1) + VMOVDQA(AA0, AA2) + VMOVDQA(AA0, AA3) + VMOVDQA(state1StoreAVX2, BB0) + VMOVDQA(BB0, BB1) + VMOVDQA(BB0, BB2) + VMOVDQA(BB0, BB3) + VMOVDQA(state2StoreAVX2, CC0) + VMOVDQA(CC0, CC1) + VMOVDQA(CC0, CC2) + VMOVDQA(CC0, CC3) + VMOVDQA(ctr3StoreAVX2, DD0) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD0, DD0) + VPADDD(avx2IncMask, DD0, DD1) + VPADDD(avx2IncMask, DD1, DD2) + VPADDD(avx2IncMask, DD2, DD3) + VMOVDQA(DD0, ctr0StoreAVX2) + VMOVDQA(DD1, ctr1StoreAVX2) + VMOVDQA(DD2, ctr2StoreAVX2) + VMOVDQA(DD3, ctr3StoreAVX2) + + VMOVDQA(CC3, tmpStoreAVX2) + chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3) + chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3) + chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) + VMOVDQA(tmpStoreAVX2, CC3) + VMOVDQA(CC1, tmpStoreAVX2) + chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) + VMOVDQA(tmpStoreAVX2, CC1) + + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(12), DD0, DD0, DD0) + VPALIGNR(Imm(4), BB1, BB1, BB1) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(12), DD1, DD1, DD1) + VPALIGNR(Imm(4), BB2, BB2, BB2) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(12), DD2, DD2, DD2) + VPALIGNR(Imm(4), BB3, BB3, BB3) + VPALIGNR(Imm(8), CC3, CC3, CC3) + VPALIGNR(Imm(12), DD3, DD3, DD3) + + VMOVDQA(CC3, tmpStoreAVX2) + chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3) + chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3) + chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) + VMOVDQA(tmpStoreAVX2, CC3) + VMOVDQA(CC1, tmpStoreAVX2) + chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) + VMOVDQA(tmpStoreAVX2, CC1) + + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(4), DD0, DD0, DD0) + VPALIGNR(Imm(12), BB1, BB1, BB1) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(4), DD1, DD1, DD1) + VPALIGNR(Imm(12), BB2, BB2, BB2) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(4), DD2, DD2, DD2) + VPALIGNR(Imm(12), BB3, BB3, BB3) + VPALIGNR(Imm(8), CC3, CC3, CC3) + VPALIGNR(Imm(4), DD3, DD3, DD3) + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + rol16 := rol16_DATA() + VPSHUFB(rol16, DD0, DD0) + VPSHUFB(rol16, DD1, DD1) + VPSHUFB(rol16, DD2, DD2) + VPSHUFB(rol16, DD3, DD3) + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(12), BB0, CC3) + VPSRLD(Imm(20), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(12), BB1, CC3) + VPSRLD(Imm(20), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(12), BB2, CC3) + VPSRLD(Imm(20), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(12), BB3, CC3) + VPSRLD(Imm(20), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + + 
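+	// The rounds above have already executed part of the first ChaCha double round, so we
+	// join the main seal loop mid-iteration at sealAVX2InternalLoopStart with itr1 preset to 9.
+	// The output pointer is pulled back 16 bytes so that the polyAdd offsets used inside the
+	// loop (2*8, 4*8, ...) keep hashing the ciphertext exactly where the intro code stopped.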
SUBQ(Imm(16), oup) // Adjust the pointer + MOVQ(U32(9), itr1) + JMP(LabelRef("sealAVX2InternalLoopStart")) +} + +// Load state, increment counter blocks, store the incremented counters +func sealAVX2MainLoop() { + Label("sealAVX2MainLoop") + chacha20Constants := chacha20Constants_DATA() + VMOVDQU(chacha20Constants, AA0) + VMOVDQA(AA0, AA1) + VMOVDQA(AA0, AA2) + VMOVDQA(AA0, AA3) + VMOVDQA(state1StoreAVX2, BB0) + VMOVDQA(BB0, BB1) + VMOVDQA(BB0, BB2) + VMOVDQA(BB0, BB3) + VMOVDQA(state2StoreAVX2, CC0) + VMOVDQA(CC0, CC1) + VMOVDQA(CC0, CC2) + VMOVDQA(CC0, CC3) + VMOVDQA(ctr3StoreAVX2, DD0) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD0, DD0) + VPADDD(avx2IncMask, DD0, DD1) + VPADDD(avx2IncMask, DD1, DD2) + VPADDD(avx2IncMask, DD2, DD3) + VMOVDQA(DD0, ctr0StoreAVX2) + VMOVDQA(DD1, ctr1StoreAVX2) + VMOVDQA(DD2, ctr2StoreAVX2) + VMOVDQA(DD3, ctr3StoreAVX2) + MOVQ(U32(10), itr1) +} + +func sealAVX2InternalLoop() { + Label("sealAVX2InternalLoop") + polyAdd(Mem{Base: oup}.Offset(0 * 8)) + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + polyMulStage1_AVX2() + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + rol16 := rol16_DATA() + VPSHUFB(rol16, DD0, DD0) + VPSHUFB(rol16, DD1, DD1) + VPSHUFB(rol16, DD2, DD2) + VPSHUFB(rol16, DD3, DD3) + polyMulStage2_AVX2() + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + polyMulStage3_AVX2() + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(12), BB0, CC3) + VPSRLD(Imm(20), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(12), BB1, CC3) + VPSRLD(Imm(20), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(12), BB2, CC3) + VPSRLD(Imm(20), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(12), BB3, CC3) + VPSRLD(Imm(20), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + polyMulReduceStage() +} + +func sealAVX2InternalLoopStart() { + Label("sealAVX2InternalLoopStart") + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + rol8 := rol8_DATA() + VPSHUFB(rol8, DD0, DD0) + VPSHUFB(rol8, DD1, DD1) + VPSHUFB(rol8, DD2, DD2) + VPSHUFB(rol8, DD3, DD3) + polyAdd(Mem{Base: oup}.Offset(2 * 8)) + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + polyMulStage1_AVX2() + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(7), BB0, CC3) + VPSRLD(Imm(25), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(7), BB1, CC3) + VPSRLD(Imm(25), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(7), BB2, CC3) + VPSRLD(Imm(25), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(7), BB3, CC3) + VPSRLD(Imm(25), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + polyMulStage2_AVX2() + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(4), BB1, BB1, BB1) + VPALIGNR(Imm(4), BB2, BB2, BB2) + VPALIGNR(Imm(4), BB3, BB3, BB3) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(8), CC3, CC3, CC3) + VPALIGNR(Imm(12), DD0, DD0, DD0) + VPALIGNR(Imm(12), DD1, DD1, DD1) + VPALIGNR(Imm(12), DD2, DD2, DD2) + VPALIGNR(Imm(12), DD3, DD3, DD3) + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + polyMulStage3_AVX2() + 
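+	// The polyMulStage1/2/3_AVX2 and polyMulReduceStage helpers (defined earlier in this file)
+	// split a single Poly1305 multiply-and-reduce into stages so the scalar multiplication
+	// overlaps with the surrounding vector ChaCha instructions instead of stalling the round.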
VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + rol16 := rol16_DATA() + VPSHUFB(rol16, DD0, DD0) + VPSHUFB(rol16, DD1, DD1) + VPSHUFB(rol16, DD2, DD2) + VPSHUFB(rol16, DD3, DD3) + polyMulReduceStage() + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + polyAdd(Mem{Base: oup}.Offset(4 * 8)) + LEAQ(Mem{Base: oup}.Offset(6*8), oup) + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(12), BB0, CC3) + VPSRLD(Imm(20), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(12), BB1, CC3) + VPSRLD(Imm(20), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(12), BB2, CC3) + VPSRLD(Imm(20), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(12), BB3, CC3) + VPSRLD(Imm(20), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + polyMulStage1_AVX2() + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + polyMulStage2_AVX2() + VPSHUFB(rol8, DD0, DD0) + VPSHUFB(rol8, DD1, DD1) + VPSHUFB(rol8, DD2, DD2) + VPSHUFB(rol8, DD3, DD3) + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + polyMulStage3_AVX2() + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(7), BB0, CC3) + VPSRLD(Imm(25), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(7), BB1, CC3) + VPSRLD(Imm(25), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(7), BB2, CC3) + VPSRLD(Imm(25), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(7), BB3, CC3) + VPSRLD(Imm(25), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + polyMulReduceStage() + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(12), BB1, BB1, BB1) + VPALIGNR(Imm(12), BB2, BB2, BB2) + VPALIGNR(Imm(12), BB3, BB3, BB3) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(8), CC3, CC3, CC3) + VPALIGNR(Imm(4), DD0, DD0, DD0) + VPALIGNR(Imm(4), DD1, DD1, DD1) + VPALIGNR(Imm(4), DD2, DD2, DD2) + VPALIGNR(Imm(4), DD3, DD3, DD3) + DECQ(itr1) + JNE(LabelRef("sealAVX2InternalLoop")) + + chacha20Constants := chacha20Constants_DATA() + VPADDD(chacha20Constants, AA0, AA0) + VPADDD(chacha20Constants, AA1, AA1) + VPADDD(chacha20Constants, AA2, AA2) + VPADDD(chacha20Constants, AA3, AA3) + VPADDD(state1StoreAVX2, BB0, BB0) + VPADDD(state1StoreAVX2, BB1, BB1) + VPADDD(state1StoreAVX2, BB2, BB2) + VPADDD(state1StoreAVX2, BB3, BB3) + VPADDD(state2StoreAVX2, CC0, CC0) + VPADDD(state2StoreAVX2, CC1, CC1) + VPADDD(state2StoreAVX2, CC2, CC2) + VPADDD(state2StoreAVX2, CC3, CC3) + VPADDD(ctr0StoreAVX2, DD0, DD0) + VPADDD(ctr1StoreAVX2, DD1, DD1) + VPADDD(ctr2StoreAVX2, DD2, DD2) + VPADDD(ctr3StoreAVX2, DD3, DD3) + VMOVDQA(CC3, tmpStoreAVX2) + + Comment("We only hashed 480 of the 512 bytes available - hash the remaining 32 here") + polyAdd(Mem{Base: oup}.Offset(0 * 8)) + polyMulAVX2() + LEAQ(Mem{Base: oup}.Offset(4*8), oup) + VPERM2I128(Imm(0x02), AA0, BB0, CC3) + VPERM2I128(Imm(0x13), AA0, BB0, BB0) + VPERM2I128(Imm(0x02), CC0, DD0, AA0) + VPERM2I128(Imm(0x13), CC0, DD0, CC0) + VPXOR(Mem{Base: inp}.Offset(0*32), CC3, CC3) + VPXOR(Mem{Base: inp}.Offset(1*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(2*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(3*32), CC0, CC0) + VMOVDQU(CC3, Mem{Base: oup}.Offset(0*32)) + VMOVDQU(AA0, Mem{Base: 
oup}.Offset(1*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(2*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(3*32)) + VPERM2I128(Imm(0x02), AA1, BB1, AA0) + VPERM2I128(Imm(0x02), CC1, DD1, BB0) + VPERM2I128(Imm(0x13), AA1, BB1, CC0) + VPERM2I128(Imm(0x13), CC1, DD1, DD0) + VPXOR(Mem{Base: inp}.Offset(4*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(5*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(6*32), CC0, CC0) + VPXOR(Mem{Base: inp}.Offset(7*32), DD0, DD0) + VMOVDQU(AA0, Mem{Base: oup}.Offset(4*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(5*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(6*32)) + VMOVDQU(DD0, Mem{Base: oup}.Offset(7*32)) + + Comment("and here") + polyAdd(Mem{Base: oup}.Offset(-2 * 8)) + polyMulAVX2() + VPERM2I128(Imm(0x02), AA2, BB2, AA0) + VPERM2I128(Imm(0x02), CC2, DD2, BB0) + VPERM2I128(Imm(0x13), AA2, BB2, CC0) + VPERM2I128(Imm(0x13), CC2, DD2, DD0) + VPXOR(Mem{Base: inp}.Offset(8*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(9*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(10*32), CC0, CC0) + VPXOR(Mem{Base: inp}.Offset(11*32), DD0, DD0) + VMOVDQU(AA0, Mem{Base: oup}.Offset(8*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(9*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(10*32)) + VMOVDQU(DD0, Mem{Base: oup}.Offset(11*32)) + VPERM2I128(Imm(0x02), AA3, BB3, AA0) + VPERM2I128(Imm(0x02), tmpStoreAVX2, DD3, BB0) + VPERM2I128(Imm(0x13), AA3, BB3, CC0) + VPERM2I128(Imm(0x13), tmpStoreAVX2, DD3, DD0) + VPXOR(Mem{Base: inp}.Offset(12*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(13*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(14*32), CC0, CC0) + VPXOR(Mem{Base: inp}.Offset(15*32), DD0, DD0) + VMOVDQU(AA0, Mem{Base: oup}.Offset(12*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(13*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(14*32)) + VMOVDQU(DD0, Mem{Base: oup}.Offset(15*32)) + LEAQ(Mem{Base: inp}.Offset(32*16), inp) + SUBQ(U32(32*16), inl) + CMPQ(inl, U32(512)) + JG(LabelRef("sealAVX2MainLoop")) + + Comment("Tail can only hash 480 bytes") + polyAdd(Mem{Base: oup}.Offset(0 * 8)) + polyMulAVX2() + polyAdd(Mem{Base: oup}.Offset(2 * 8)) + polyMulAVX2() + LEAQ(Mem{Base: oup}.Offset(32), oup) + + MOVQ(U32(10), itr1) + MOVQ(U32(0), itr2) + CMPQ(inl, Imm(128)) + JBE(LabelRef("sealAVX2Tail128")) + CMPQ(inl, U32(256)) + JBE(LabelRef("sealAVX2Tail256")) + CMPQ(inl, U32(384)) + JBE(LabelRef("sealAVX2Tail384")) + JMP(LabelRef("sealAVX2Tail512")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for buffers smaller than 193 bytes + +// For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks +func seal192AVX2() { + Label("seal192AVX2") + VMOVDQA(AA0, AA1) + VMOVDQA(BB0, BB1) + VMOVDQA(CC0, CC1) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD0, DD1) + VMOVDQA(AA0, AA2) + VMOVDQA(BB0, BB2) + VMOVDQA(CC0, CC2) + VMOVDQA(DD0, DD2) + VMOVDQA(DD1, TT3) + MOVQ(U32(10), itr2) +} + +func sealAVX2192InnerCipherLoop() { + Label("sealAVX2192InnerCipherLoop") + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(4), BB1, BB1, BB1) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(12), DD0, DD0, DD0) + VPALIGNR(Imm(12), DD1, DD1, DD1) + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(12), BB1, BB1, BB1) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(4), DD0, DD0, DD0) + VPALIGNR(Imm(4), DD1, DD1, DD1) + DECQ(itr2) + 
JNE(LabelRef("sealAVX2192InnerCipherLoop")) + VPADDD(AA2, AA0, AA0) + VPADDD(AA2, AA1, AA1) + VPADDD(BB2, BB0, BB0) + VPADDD(BB2, BB1, BB1) + VPADDD(CC2, CC0, CC0) + VPADDD(CC2, CC1, CC1) + VPADDD(DD2, DD0, DD0) + VPADDD(TT3, DD1, DD1) + VPERM2I128(Imm(0x02), AA0, BB0, TT0) + + Comment("Clamp and store poly key") + polyClampMask := polyClampMask_DATA() + VPAND(polyClampMask, TT0, TT0) + VMOVDQA(TT0, rsStoreAVX2) + + Comment("Stream for up to 192 bytes") + VPERM2I128(Imm(0x13), AA0, BB0, AA0) + VPERM2I128(Imm(0x13), CC0, DD0, BB0) + VPERM2I128(Imm(0x02), AA1, BB1, CC0) + VPERM2I128(Imm(0x02), CC1, DD1, DD0) + VPERM2I128(Imm(0x13), AA1, BB1, AA1) + VPERM2I128(Imm(0x13), CC1, DD1, BB1) +} + +func sealAVX2ShortSeal() { + Label("sealAVX2ShortSeal") + Comment("Hash aad") + MOVQ(NewParamAddr("ad_len", 80), itr2) + CALL(LabelRef("polyHashADInternal<>(SB)")) + XORQ(itr1, itr1) +} + +func sealAVX2SealHash() { + Label("sealAVX2SealHash") + Comment("itr1 holds the number of bytes encrypted but not yet hashed") + CMPQ(itr1, Imm(16)) + JB(LabelRef("sealAVX2ShortSealLoop")) + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + SUBQ(Imm(16), itr1) + ADDQ(Imm(16), oup) + JMP(LabelRef("sealAVX2SealHash")) +} + +func sealAVX2ShortSealLoop() { + Label("sealAVX2ShortSealLoop") + CMPQ(inl, Imm(32)) + JB(LabelRef("sealAVX2ShortTail32")) + SUBQ(Imm(32), inl) + + Comment("Load for encryption") + VPXOR(Mem{Base: inp}, AA0, AA0) + VMOVDQU(AA0, Mem{Base: oup}) + LEAQ(Mem{Base: inp}.Offset(1*32), inp) + + Comment("Now can hash") + polyAdd(Mem{Base: oup}.Offset(0 * 8)) + polyMulAVX2() + polyAdd(Mem{Base: oup}.Offset(2 * 8)) + polyMulAVX2() + LEAQ(Mem{Base: oup}.Offset(1*32), oup) + + Comment("Shift stream left") + VMOVDQA(BB0, AA0) + VMOVDQA(CC0, BB0) + VMOVDQA(DD0, CC0) + VMOVDQA(AA1, DD0) + VMOVDQA(BB1, AA1) + VMOVDQA(CC1, BB1) + VMOVDQA(DD1, CC1) + VMOVDQA(AA2, DD1) + VMOVDQA(BB2, AA2) + JMP(LabelRef("sealAVX2ShortSealLoop")) +} + +func sealAVX2ShortTail32() { + Label("sealAVX2ShortTail32") + CMPQ(inl, Imm(16)) + VMOVDQA(A0, A1) + JB(LabelRef("sealAVX2ShortDone")) + + SUBQ(Imm(16), inl) + + Comment("Load for encryption") + VPXOR(Mem{Base: inp}, A0, T0) + VMOVDQU(T0, Mem{Base: oup}) + LEAQ(Mem{Base: inp}.Offset(1*16), inp) + + Comment("Hash") + polyAdd(Mem{Base: oup}.Offset(0 * 8)) + polyMulAVX2() + LEAQ(Mem{Base: oup}.Offset(1*16), oup) + VPERM2I128(Imm(0x11), AA0, AA0, AA0) + VMOVDQA(A0, A1) +} + +func sealAVX2ShortDone() { + Label("sealAVX2ShortDone") + VZEROUPPER() + JMP(LabelRef("sealSSETail")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for buffers smaller than 321 bytes + +// For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks +func seal320AVX2() { + Label("seal320AVX2") + VMOVDQA(AA0, AA1) + VMOVDQA(BB0, BB1) + VMOVDQA(CC0, CC1) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD0, DD1) + VMOVDQA(AA0, AA2) + VMOVDQA(BB0, BB2) + VMOVDQA(CC0, CC2) + VPADDD(avx2IncMask, DD1, DD2) + VMOVDQA(BB0, TT1) + VMOVDQA(CC0, TT2) + VMOVDQA(DD0, TT3) + MOVQ(U32(10), itr2) +} + +func sealAVX2320InnerCipherLoop() { + Label("sealAVX2320InnerCipherLoop") + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(4), BB1, BB1, BB1) + VPALIGNR(Imm(4), BB2, BB2, BB2) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(12), DD0, DD0, DD0) + 
VPALIGNR(Imm(12), DD1, DD1, DD1) + VPALIGNR(Imm(12), DD2, DD2, DD2) + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(12), BB1, BB1, BB1) + VPALIGNR(Imm(12), BB2, BB2, BB2) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(4), DD0, DD0, DD0) + VPALIGNR(Imm(4), DD1, DD1, DD1) + VPALIGNR(Imm(4), DD2, DD2, DD2) + DECQ(itr2) + JNE(LabelRef("sealAVX2320InnerCipherLoop")) + + chacha20Constants := chacha20Constants_DATA() + VMOVDQA(chacha20Constants, TT0) + VPADDD(TT0, AA0, AA0) + VPADDD(TT0, AA1, AA1) + VPADDD(TT0, AA2, AA2) + VPADDD(TT1, BB0, BB0) + VPADDD(TT1, BB1, BB1) + VPADDD(TT1, BB2, BB2) + VPADDD(TT2, CC0, CC0) + VPADDD(TT2, CC1, CC1) + VPADDD(TT2, CC2, CC2) + avx2IncMask := avx2IncMask_DATA() + VMOVDQA(avx2IncMask, TT0) + VPADDD(TT3, DD0, DD0) + VPADDD(TT0, TT3, TT3) + VPADDD(TT3, DD1, DD1) + VPADDD(TT0, TT3, TT3) + VPADDD(TT3, DD2, DD2) + + Comment("Clamp and store poly key") + VPERM2I128(Imm(0x02), AA0, BB0, TT0) + polyClampMask := polyClampMask_DATA() + VPAND(polyClampMask, TT0, TT0) + VMOVDQA(TT0, rsStoreAVX2) + + Comment("Stream for up to 320 bytes") + VPERM2I128(Imm(0x13), AA0, BB0, AA0) + VPERM2I128(Imm(0x13), CC0, DD0, BB0) + VPERM2I128(Imm(0x02), AA1, BB1, CC0) + VPERM2I128(Imm(0x02), CC1, DD1, DD0) + VPERM2I128(Imm(0x13), AA1, BB1, AA1) + VPERM2I128(Imm(0x13), CC1, DD1, BB1) + VPERM2I128(Imm(0x02), AA2, BB2, CC1) + VPERM2I128(Imm(0x02), CC2, DD2, DD1) + VPERM2I128(Imm(0x13), AA2, BB2, AA2) + VPERM2I128(Imm(0x13), CC2, DD2, BB2) + JMP(LabelRef("sealAVX2ShortSeal")) +} + +// Need to decrypt up to 128 bytes - prepare two blocks: +// - If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed. +// - If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed. 
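For orientation, the chachaQR_AVX2 calls and the VPALIGNR byte rotations in the loops above implement the standard ChaCha20 double round, vectorized across four blocks per YMM row; the MOVQ(U32(10), itr2) loop counts correspond to ten double rounds, i.e. ChaCha20's 20 rounds. A minimal pure-Go sketch of the scalar equivalent follows (illustrative only; the package and function names are not part of this change, and the real code processes several blocks per register):

package chachasketch

import "math/bits"

// quarterRound applies the ChaCha20 quarter round to four state words, using
// the same rotation amounts (16, 12, 8, 7) that the generator emits via the
// rol16/rol8 shuffles and the VPSLLD/VPSRLD pairs.
func quarterRound(s *[16]uint32, a, b, c, d int) {
	s[a] += s[b]
	s[d] = bits.RotateLeft32(s[d]^s[a], 16)
	s[c] += s[d]
	s[b] = bits.RotateLeft32(s[b]^s[c], 12)
	s[a] += s[b]
	s[d] = bits.RotateLeft32(s[d]^s[a], 8)
	s[c] += s[d]
	s[b] = bits.RotateLeft32(s[b]^s[c], 7)
}

// doubleRound is one column round followed by one diagonal round. In the
// vectorized code the diagonal round is reached by rotating the B, C and D
// rows left by 4, 8 and 12 bytes with VPALIGNR, then rotating them back.
func doubleRound(s *[16]uint32) {
	quarterRound(s, 0, 4, 8, 12)
	quarterRound(s, 1, 5, 9, 13)
	quarterRound(s, 2, 6, 10, 14)
	quarterRound(s, 3, 7, 11, 15)
	quarterRound(s, 0, 5, 10, 15)
	quarterRound(s, 1, 6, 11, 12)
	quarterRound(s, 2, 7, 8, 13)
	quarterRound(s, 3, 4, 9, 14)
}

Running doubleRound ten times and then adding the initial state back in corresponds to the VPADDD of chacha20Constants, state1StoreAVX2, state2StoreAVX2 and the counter stores that each of the loops above performs after its final iteration.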
+func sealAVX2Tail128() { + Label("sealAVX2Tail128") + chacha20Constants := chacha20Constants_DATA() + VMOVDQA(chacha20Constants, AA0) + VMOVDQA(state1StoreAVX2, BB0) + VMOVDQA(state2StoreAVX2, CC0) + VMOVDQA(ctr3StoreAVX2, DD0) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD0, DD0) + VMOVDQA(DD0, DD1) +} + +func sealAVX2Tail128LoopA() { + Label("sealAVX2Tail128LoopA") + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + LEAQ(Mem{Base: oup}.Offset(16), oup) +} + +func sealAVX2Tail128LoopB() { + Label("sealAVX2Tail128LoopB") + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(12), DD0, DD0, DD0) + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + polyAdd(Mem{Base: oup}.Offset(16)) + polyMul() + LEAQ(Mem{Base: oup}.Offset(32), oup) + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(4), DD0, DD0, DD0) + DECQ(itr1) + JG(LabelRef("sealAVX2Tail128LoopA")) + DECQ(itr2) + JGE(LabelRef("sealAVX2Tail128LoopB")) + + chacha20Constants := chacha20Constants_DATA() + VPADDD(chacha20Constants, AA0, AA1) + VPADDD(state1StoreAVX2, BB0, BB1) + VPADDD(state2StoreAVX2, CC0, CC1) + VPADDD(DD1, DD0, DD1) + + VPERM2I128(Imm(0x02), AA1, BB1, AA0) + VPERM2I128(Imm(0x02), CC1, DD1, BB0) + VPERM2I128(Imm(0x13), AA1, BB1, CC0) + VPERM2I128(Imm(0x13), CC1, DD1, DD0) + JMP(LabelRef("sealAVX2ShortSealLoop")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for the last 256 bytes of ciphertext + +// Need to decrypt up to 256 bytes - prepare two blocks +// - If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed +// - If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed +func sealAVX2Tail256() { + Label("sealAVX2Tail256") + chacha20Constants := chacha20Constants_DATA() + VMOVDQA(chacha20Constants, AA0) + VMOVDQA(chacha20Constants, AA1) + VMOVDQA(state1StoreAVX2, BB0) + VMOVDQA(state1StoreAVX2, BB1) + VMOVDQA(state2StoreAVX2, CC0) + VMOVDQA(state2StoreAVX2, CC1) + VMOVDQA(ctr3StoreAVX2, DD0) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD0, DD0) + VPADDD(avx2IncMask, DD0, DD1) + VMOVDQA(DD0, TT1) + VMOVDQA(DD1, TT2) +} + +func sealAVX2Tail256LoopA() { + Label("sealAVX2Tail256LoopA") + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + LEAQ(Mem{Base: oup}.Offset(16), oup) +} + +// LIne 2493 +func sealAVX2Tail256LoopB() { + Label("sealAVX2Tail256LoopB") + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(4), BB1, BB1, BB1) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(12), DD0, DD0, DD0) + VPALIGNR(Imm(12), DD1, DD1, DD1) + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + polyAdd(Mem{Base: oup}.Offset(16)) + polyMul() + LEAQ(Mem{Base: oup}.Offset(32), oup) + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(12), BB1, BB1, BB1) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(4), DD0, DD0, DD0) + VPALIGNR(Imm(4), DD1, DD1, DD1) + DECQ(itr1) + JG(LabelRef("sealAVX2Tail256LoopA")) + DECQ(itr2) + JGE(LabelRef("sealAVX2Tail256LoopB")) + + chacha20Constants := chacha20Constants_DATA() + VPADDD(chacha20Constants, AA0, AA0) + VPADDD(chacha20Constants, AA1, AA1) + VPADDD(state1StoreAVX2, BB0, BB0) + 
VPADDD(state1StoreAVX2, BB1, BB1) + VPADDD(state2StoreAVX2, CC0, CC0) + VPADDD(state2StoreAVX2, CC1, CC1) + VPADDD(TT1, DD0, DD0) + VPADDD(TT2, DD1, DD1) + VPERM2I128(Imm(0x02), AA0, BB0, TT0) + VPERM2I128(Imm(0x02), CC0, DD0, TT1) + VPERM2I128(Imm(0x13), AA0, BB0, TT2) + VPERM2I128(Imm(0x13), CC0, DD0, TT3) + VPXOR(Mem{Base: inp}.Offset(0*32), TT0, TT0) + VPXOR(Mem{Base: inp}.Offset(1*32), TT1, TT1) + VPXOR(Mem{Base: inp}.Offset(2*32), TT2, TT2) + VPXOR(Mem{Base: inp}.Offset(3*32), TT3, TT3) + VMOVDQU(TT0, Mem{Base: oup}.Offset(0*32)) + VMOVDQU(TT1, Mem{Base: oup}.Offset(1*32)) + VMOVDQU(TT2, Mem{Base: oup}.Offset(2*32)) + VMOVDQU(TT3, Mem{Base: oup}.Offset(3*32)) + MOVQ(U32(128), itr1) + LEAQ(Mem{Base: inp}.Offset(128), inp) + SUBQ(Imm(128), inl) + VPERM2I128(Imm(0x02), AA1, BB1, AA0) + VPERM2I128(Imm(0x02), CC1, DD1, BB0) + VPERM2I128(Imm(0x13), AA1, BB1, CC0) + VPERM2I128(Imm(0x13), CC1, DD1, DD0) + + JMP(LabelRef("sealAVX2SealHash")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for the last 384 bytes of ciphertext + +// Need to decrypt up to 384 bytes - prepare two blocks +// - If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed +// - If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed +func sealAVX2Tail384() { + Label("sealAVX2Tail384") + chacha20Constants := chacha20Constants_DATA() + VMOVDQA(chacha20Constants, AA0) + VMOVDQA(AA0, AA1) + VMOVDQA(AA0, AA2) + VMOVDQA(state1StoreAVX2, BB0) + VMOVDQA(BB0, BB1) + VMOVDQA(BB0, BB2) + VMOVDQA(state2StoreAVX2, CC0) + VMOVDQA(CC0, CC1) + VMOVDQA(CC0, CC2) + VMOVDQA(ctr3StoreAVX2, DD0) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD0, DD0) + VPADDD(avx2IncMask, DD0, DD1) + VPADDD(avx2IncMask, DD1, DD2) + VMOVDQA(DD0, TT1) + VMOVDQA(DD1, TT2) + VMOVDQA(DD2, TT3) +} + +func sealAVX2Tail384LoopA() { + Label("sealAVX2Tail384LoopA") + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + LEAQ(Mem{Base: oup}.Offset(16), oup) +} + +func sealAVX2Tail384LoopB() { + Label("sealAVX2Tail384LoopB") + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(4), BB1, BB1, BB1) + VPALIGNR(Imm(4), BB2, BB2, BB2) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(12), DD0, DD0, DD0) + VPALIGNR(Imm(12), DD1, DD1, DD1) + VPALIGNR(Imm(12), DD2, DD2, DD2) + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + polyAdd(Mem{Base: oup}.Offset(16)) + polyMul() + LEAQ(Mem{Base: oup}.Offset(32), oup) + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(12), BB1, BB1, BB1) + VPALIGNR(Imm(12), BB2, BB2, BB2) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(4), DD0, DD0, DD0) + VPALIGNR(Imm(4), DD1, DD1, DD1) + VPALIGNR(Imm(4), DD2, DD2, DD2) + DECQ(itr1) + JG(LabelRef("sealAVX2Tail384LoopA")) + DECQ(itr2) + JGE(LabelRef("sealAVX2Tail384LoopB")) + + chacha20Constants := chacha20Constants_DATA() + VPADDD(chacha20Constants, AA0, AA0) + VPADDD(chacha20Constants, AA1, AA1) + VPADDD(chacha20Constants, AA2, AA2) + VPADDD(state1StoreAVX2, BB0, BB0) + VPADDD(state1StoreAVX2, BB1, BB1) + VPADDD(state1StoreAVX2, BB2, BB2) + VPADDD(state2StoreAVX2, CC0, CC0) + VPADDD(state2StoreAVX2, 
CC1, CC1) + VPADDD(state2StoreAVX2, CC2, CC2) + VPADDD(TT1, DD0, DD0) + VPADDD(TT2, DD1, DD1) + VPADDD(TT3, DD2, DD2) + VPERM2I128(Imm(0x02), AA0, BB0, TT0) + VPERM2I128(Imm(0x02), CC0, DD0, TT1) + VPERM2I128(Imm(0x13), AA0, BB0, TT2) + VPERM2I128(Imm(0x13), CC0, DD0, TT3) + VPXOR(Mem{Base: inp}.Offset(0*32), TT0, TT0) + VPXOR(Mem{Base: inp}.Offset(1*32), TT1, TT1) + VPXOR(Mem{Base: inp}.Offset(2*32), TT2, TT2) + VPXOR(Mem{Base: inp}.Offset(3*32), TT3, TT3) + VMOVDQU(TT0, Mem{Base: oup}.Offset(0*32)) + VMOVDQU(TT1, Mem{Base: oup}.Offset(1*32)) + VMOVDQU(TT2, Mem{Base: oup}.Offset(2*32)) + VMOVDQU(TT3, Mem{Base: oup}.Offset(3*32)) + VPERM2I128(Imm(0x02), AA1, BB1, TT0) + VPERM2I128(Imm(0x02), CC1, DD1, TT1) + VPERM2I128(Imm(0x13), AA1, BB1, TT2) + VPERM2I128(Imm(0x13), CC1, DD1, TT3) + VPXOR(Mem{Base: inp}.Offset(4*32), TT0, TT0) + VPXOR(Mem{Base: inp}.Offset(5*32), TT1, TT1) + VPXOR(Mem{Base: inp}.Offset(6*32), TT2, TT2) + VPXOR(Mem{Base: inp}.Offset(7*32), TT3, TT3) + VMOVDQU(TT0, Mem{Base: oup}.Offset(4*32)) + VMOVDQU(TT1, Mem{Base: oup}.Offset(5*32)) + VMOVDQU(TT2, Mem{Base: oup}.Offset(6*32)) + VMOVDQU(TT3, Mem{Base: oup}.Offset(7*32)) + MOVQ(U32(256), itr1) + LEAQ(Mem{Base: inp}.Offset(256), inp) + SUBQ(U32(256), inl) + VPERM2I128(Imm(0x02), AA2, BB2, AA0) + VPERM2I128(Imm(0x02), CC2, DD2, BB0) + VPERM2I128(Imm(0x13), AA2, BB2, CC0) + VPERM2I128(Imm(0x13), CC2, DD2, DD0) + + JMP(LabelRef("sealAVX2SealHash")) +} + +// ---------------------------------------------------------------------------- +// Special optimization for the last 512 bytes of ciphertext + +// Need to decrypt up to 512 bytes - prepare two blocks +// - If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed +// - If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed +func sealAVX2Tail512() { + Label("sealAVX2Tail512") + chacha20Constants := chacha20Constants_DATA() + VMOVDQA(chacha20Constants, AA0) + VMOVDQA(AA0, AA1) + VMOVDQA(AA0, AA2) + VMOVDQA(AA0, AA3) + VMOVDQA(state1StoreAVX2, BB0) + VMOVDQA(BB0, BB1) + VMOVDQA(BB0, BB2) + VMOVDQA(BB0, BB3) + VMOVDQA(state2StoreAVX2, CC0) + VMOVDQA(CC0, CC1) + VMOVDQA(CC0, CC2) + VMOVDQA(CC0, CC3) + VMOVDQA(ctr3StoreAVX2, DD0) + avx2IncMask := avx2IncMask_DATA() + VPADDD(avx2IncMask, DD0, DD0) + VPADDD(avx2IncMask, DD0, DD1) + VPADDD(avx2IncMask, DD1, DD2) + VPADDD(avx2IncMask, DD2, DD3) + VMOVDQA(DD0, ctr0StoreAVX2) + VMOVDQA(DD1, ctr1StoreAVX2) + VMOVDQA(DD2, ctr2StoreAVX2) + VMOVDQA(DD3, ctr3StoreAVX2) +} + +func sealAVX2Tail512LoopA() { + Label("sealAVX2Tail512LoopA") + polyAdd(Mem{Base: oup}.Offset(0)) + polyMul() + LEAQ(Mem{Base: oup}.Offset(16), oup) +} + +func sealAVX2Tail512LoopB() { + Label("sealAVX2Tail512LoopB") + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + rol16 := rol16_DATA() + VPSHUFB(rol16, DD0, DD0) + VPSHUFB(rol16, DD1, DD1) + VPSHUFB(rol16, DD2, DD2) + VPSHUFB(rol16, DD3, DD3) + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(12), BB0, CC3) + VPSRLD(Imm(20), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(12), BB1, CC3) + VPSRLD(Imm(20), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(12), BB2, CC3) + VPSRLD(Imm(20), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(12), BB3, CC3) + 
VPSRLD(Imm(20), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + polyAdd(Mem{Base: oup}.Offset(0 * 8)) + polyMulAVX2() + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + rol8 := rol8_DATA() + VPSHUFB(rol8, DD0, DD0) + VPSHUFB(rol8, DD1, DD1) + VPSHUFB(rol8, DD2, DD2) + VPSHUFB(rol8, DD3, DD3) + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(7), BB0, CC3) + VPSRLD(Imm(25), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(7), BB1, CC3) + VPSRLD(Imm(25), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(7), BB2, CC3) + VPSRLD(Imm(25), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(7), BB3, CC3) + VPSRLD(Imm(25), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + VPALIGNR(Imm(4), BB0, BB0, BB0) + VPALIGNR(Imm(4), BB1, BB1, BB1) + VPALIGNR(Imm(4), BB2, BB2, BB2) + VPALIGNR(Imm(4), BB3, BB3, BB3) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(8), CC3, CC3, CC3) + VPALIGNR(Imm(12), DD0, DD0, DD0) + VPALIGNR(Imm(12), DD1, DD1, DD1) + VPALIGNR(Imm(12), DD2, DD2, DD2) + VPALIGNR(Imm(12), DD3, DD3, DD3) + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + VPSHUFB(rol16, DD0, DD0) + VPSHUFB(rol16, DD1, DD1) + VPSHUFB(rol16, DD2, DD2) + VPSHUFB(rol16, DD3, DD3) + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + polyAdd(Mem{Base: oup}.Offset(2 * 8)) + polyMulAVX2() + LEAQ(Mem{Base: oup}.Offset(4*8), oup) + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(12), BB0, CC3) + VPSRLD(Imm(20), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(12), BB1, CC3) + VPSRLD(Imm(20), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(12), BB2, CC3) + VPSRLD(Imm(20), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(12), BB3, CC3) + VPSRLD(Imm(20), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + VPADDD(BB0, AA0, AA0) + VPADDD(BB1, AA1, AA1) + VPADDD(BB2, AA2, AA2) + VPADDD(BB3, AA3, AA3) + VPXOR(AA0, DD0, DD0) + VPXOR(AA1, DD1, DD1) + VPXOR(AA2, DD2, DD2) + VPXOR(AA3, DD3, DD3) + VPSHUFB(rol8, DD0, DD0) + VPSHUFB(rol8, DD1, DD1) + VPSHUFB(rol8, DD2, DD2) + VPSHUFB(rol8, DD3, DD3) + VPADDD(DD0, CC0, CC0) + VPADDD(DD1, CC1, CC1) + VPADDD(DD2, CC2, CC2) + VPADDD(DD3, CC3, CC3) + VPXOR(CC0, BB0, BB0) + VPXOR(CC1, BB1, BB1) + VPXOR(CC2, BB2, BB2) + VPXOR(CC3, BB3, BB3) + VMOVDQA(CC3, tmpStoreAVX2) + VPSLLD(Imm(7), BB0, CC3) + VPSRLD(Imm(25), BB0, BB0) + VPXOR(CC3, BB0, BB0) + VPSLLD(Imm(7), BB1, CC3) + VPSRLD(Imm(25), BB1, BB1) + VPXOR(CC3, BB1, BB1) + VPSLLD(Imm(7), BB2, CC3) + VPSRLD(Imm(25), BB2, BB2) + VPXOR(CC3, BB2, BB2) + VPSLLD(Imm(7), BB3, CC3) + VPSRLD(Imm(25), BB3, BB3) + VPXOR(CC3, BB3, BB3) + VMOVDQA(tmpStoreAVX2, CC3) + VPALIGNR(Imm(12), BB0, BB0, BB0) + VPALIGNR(Imm(12), BB1, BB1, BB1) + VPALIGNR(Imm(12), BB2, BB2, BB2) + VPALIGNR(Imm(12), BB3, BB3, BB3) + VPALIGNR(Imm(8), CC0, CC0, CC0) + VPALIGNR(Imm(8), CC1, CC1, CC1) + VPALIGNR(Imm(8), CC2, CC2, CC2) + VPALIGNR(Imm(8), CC3, CC3, CC3) + VPALIGNR(Imm(4), DD0, DD0, DD0) + VPALIGNR(Imm(4), 
DD1, DD1, DD1) + VPALIGNR(Imm(4), DD2, DD2, DD2) + VPALIGNR(Imm(4), DD3, DD3, DD3) + + DECQ(itr1) + JG(LabelRef("sealAVX2Tail512LoopA")) + DECQ(itr2) + JGE(LabelRef("sealAVX2Tail512LoopB")) + + chacha20Constants := chacha20Constants_DATA() + VPADDD(chacha20Constants, AA0, AA0) + VPADDD(chacha20Constants, AA1, AA1) + VPADDD(chacha20Constants, AA2, AA2) + VPADDD(chacha20Constants, AA3, AA3) + VPADDD(state1StoreAVX2, BB0, BB0) + VPADDD(state1StoreAVX2, BB1, BB1) + VPADDD(state1StoreAVX2, BB2, BB2) + VPADDD(state1StoreAVX2, BB3, BB3) + VPADDD(state2StoreAVX2, CC0, CC0) + VPADDD(state2StoreAVX2, CC1, CC1) + VPADDD(state2StoreAVX2, CC2, CC2) + VPADDD(state2StoreAVX2, CC3, CC3) + VPADDD(ctr0StoreAVX2, DD0, DD0) + VPADDD(ctr1StoreAVX2, DD1, DD1) + VPADDD(ctr2StoreAVX2, DD2, DD2) + VPADDD(ctr3StoreAVX2, DD3, DD3) + VMOVDQA(CC3, tmpStoreAVX2) + VPERM2I128(Imm(0x02), AA0, BB0, CC3) + VPXOR(Mem{Base: inp}.Offset(0*32), CC3, CC3) + VMOVDQU(CC3, Mem{Base: oup}.Offset(0*32)) + VPERM2I128(Imm(0x02), CC0, DD0, CC3) + VPXOR(Mem{Base: inp}.Offset(1*32), CC3, CC3) + VMOVDQU(CC3, Mem{Base: oup}.Offset(1*32)) + VPERM2I128(Imm(0x13), AA0, BB0, CC3) + VPXOR(Mem{Base: inp}.Offset(2*32), CC3, CC3) + VMOVDQU(CC3, Mem{Base: oup}.Offset(2*32)) + VPERM2I128(Imm(0x13), CC0, DD0, CC3) + VPXOR(Mem{Base: inp}.Offset(3*32), CC3, CC3) + VMOVDQU(CC3, Mem{Base: oup}.Offset(3*32)) + + VPERM2I128(Imm(0x02), AA1, BB1, AA0) + VPERM2I128(Imm(0x02), CC1, DD1, BB0) + VPERM2I128(Imm(0x13), AA1, BB1, CC0) + VPERM2I128(Imm(0x13), CC1, DD1, DD0) + VPXOR(Mem{Base: inp}.Offset(4*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(5*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(6*32), CC0, CC0) + VPXOR(Mem{Base: inp}.Offset(7*32), DD0, DD0) + VMOVDQU(AA0, Mem{Base: oup}.Offset(4*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(5*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(6*32)) + VMOVDQU(DD0, Mem{Base: oup}.Offset(7*32)) + + VPERM2I128(Imm(0x02), AA2, BB2, AA0) + VPERM2I128(Imm(0x02), CC2, DD2, BB0) + VPERM2I128(Imm(0x13), AA2, BB2, CC0) + VPERM2I128(Imm(0x13), CC2, DD2, DD0) + VPXOR(Mem{Base: inp}.Offset(8*32), AA0, AA0) + VPXOR(Mem{Base: inp}.Offset(9*32), BB0, BB0) + VPXOR(Mem{Base: inp}.Offset(10*32), CC0, CC0) + VPXOR(Mem{Base: inp}.Offset(11*32), DD0, DD0) + VMOVDQU(AA0, Mem{Base: oup}.Offset(8*32)) + VMOVDQU(BB0, Mem{Base: oup}.Offset(9*32)) + VMOVDQU(CC0, Mem{Base: oup}.Offset(10*32)) + VMOVDQU(DD0, Mem{Base: oup}.Offset(11*32)) + + MOVQ(U32(384), itr1) + LEAQ(Mem{Base: inp}.Offset(384), inp) + SUBQ(U32(384), inl) + VPERM2I128(Imm(0x02), AA3, BB3, AA0) + VPERM2I128(Imm(0x02), tmpStoreAVX2, DD3, BB0) + VPERM2I128(Imm(0x13), AA3, BB3, CC0) + VPERM2I128(Imm(0x13), tmpStoreAVX2, DD3, DD0) + + JMP(LabelRef("sealAVX2SealHash")) +} + +// ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~DATA SECTION~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~## + +var ( + // Pointers for memoizing DATA section symbols + chacha20Constants_DATA_ptr, + rol16_DATA_ptr, + rol8_DATA_ptr, + sseIncMask_DATA_ptr, + avx2IncMask_DATA_ptr, + avx2InitMask_DATA_ptr, + polyClampMask_DATA_ptr, + andMask_DATA_ptr *Mem +) + +var nothingUpMySleeve = [8]uint32{ + 0x61707865, + 0x3320646e, + 0x79622d32, + 0x6b206574, + 0x61707865, + 0x3320646e, + 0x79622d32, + 0x6b206574, +} + +// ChaCha20 constants +func chacha20Constants_DATA() Mem { + if chacha20Constants_DATA_ptr != nil { + return *chacha20Constants_DATA_ptr + } + + chacha20Constants := GLOBL(ThatPeskyUnicodeDot+"chacha20Constants", NOPTR|RODATA) + chacha20Constants_DATA_ptr = &chacha20Constants + for i, v := range nothingUpMySleeve { + DATA(i*4, 
U32(v)) + } + return chacha20Constants +} + +var rol16Consts = [4]uint64{ + 0x0504070601000302, + 0x0D0C0F0E09080B0A, + 0x0504070601000302, + 0x0D0C0F0E09080B0A, +} + +// <<< 16 with PSHUFB +func rol16_DATA() Mem { + if rol16_DATA_ptr != nil { + return *rol16_DATA_ptr + } + + rol16 := GLOBL(ThatPeskyUnicodeDot+"rol16", NOPTR|RODATA) + rol16_DATA_ptr = &rol16 + for i, v := range rol16Consts { + DATA(i*8, U64(v)) + } + return rol16 +} + +var rol8Consts = [4]uint64{ + 0x0605040702010003, + 0x0E0D0C0F0A09080B, + 0x0605040702010003, + 0x0E0D0C0F0A09080B, +} + +// <<< 8 with PSHUFB +func rol8_DATA() Mem { + if rol8_DATA_ptr != nil { + return *rol8_DATA_ptr + } + + rol8 := GLOBL(ThatPeskyUnicodeDot+"rol8", NOPTR|RODATA) + rol8_DATA_ptr = &rol8 + for i, v := range rol8Consts { + DATA(i*8, U64(v)) + } + return rol8 +} + +var avx2InitMaskConsts = [4]uint64{ + 0x0, + 0x0, + 0x1, + 0x0, +} + +func avx2InitMask_DATA() Mem { + if avx2InitMask_DATA_ptr != nil { + return *avx2InitMask_DATA_ptr + } + + avx2InitMask := GLOBL(ThatPeskyUnicodeDot+"avx2InitMask", NOPTR|RODATA) + avx2InitMask_DATA_ptr = &avx2InitMask + for i, v := range avx2InitMaskConsts { + DATA(i*8, U64(v)) + } + return avx2InitMask +} + +var avx2IncMaskConsts = [4]uint64{ + 0x2, + 0x0, + 0x2, + 0x0, +} + +func avx2IncMask_DATA() Mem { + if avx2IncMask_DATA_ptr != nil { + return *avx2IncMask_DATA_ptr + } + + avx2IncMask := GLOBL(ThatPeskyUnicodeDot+"avx2IncMask", NOPTR|RODATA) + avx2IncMask_DATA_ptr = &avx2IncMask + for i, v := range avx2IncMaskConsts { + DATA(i*8, U64(v)) + } + return avx2IncMask +} + +var polyClampMaskConsts = [4]uint64{ + 0x0FFFFFFC0FFFFFFF, + 0x0FFFFFFC0FFFFFFC, + 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, +} + +// Poly1305 key clamp +func polyClampMask_DATA() Mem { + if polyClampMask_DATA_ptr != nil { + return *polyClampMask_DATA_ptr + } + + polyClampMask := GLOBL(ThatPeskyUnicodeDot+"polyClampMask", NOPTR|RODATA) + polyClampMask_DATA_ptr = &polyClampMask + for i, v := range polyClampMaskConsts { + DATA(i*8, U64(v)) + } + return polyClampMask +} + +var sseIncMaskConsts = [2]uint64{ + 0x1, + 0x0, +} + +func sseIncMask_DATA() Mem { + if sseIncMask_DATA_ptr != nil { + return *sseIncMask_DATA_ptr + } + + sseIncMask := GLOBL(ThatPeskyUnicodeDot+"sseIncMask", NOPTR|RODATA) + sseIncMask_DATA_ptr = &sseIncMask + for i, v := range sseIncMaskConsts { + DATA(i*8, U64(v)) + } + return sseIncMask +} + +var andMaskConsts = [30]uint64{ + 0x00000000000000ff, + 0x0000000000000000, + 0x000000000000ffff, + 0x0000000000000000, + 0x0000000000ffffff, + 0x0000000000000000, + 0x00000000ffffffff, + 0x0000000000000000, + 0x000000ffffffffff, + 0x0000000000000000, + 0x0000ffffffffffff, + 0x0000000000000000, + 0x00ffffffffffffff, + 0x0000000000000000, + 0xffffffffffffffff, + 0x0000000000000000, + 0xffffffffffffffff, + 0x00000000000000ff, + 0xffffffffffffffff, + 0x000000000000ffff, + 0xffffffffffffffff, + 0x0000000000ffffff, + 0xffffffffffffffff, + 0x00000000ffffffff, + 0xffffffffffffffff, + 0x000000ffffffffff, + 0xffffffffffffffff, + 0x0000ffffffffffff, + 0xffffffffffffffff, + 0x00ffffffffffffff, +} + +func andMask_DATA() Mem { + if andMask_DATA_ptr != nil { + return *andMask_DATA_ptr + } + + andMask := GLOBL(ThatPeskyUnicodeDot+"andMask", NOPTR|RODATA) + andMask_DATA_ptr = &andMask + for i, v := range andMaskConsts { + DATA(i*8, U64(v)) + } + return andMask +} + +// removePeskyUnicodeDot strips the dot from the relevant TEXT directives such that they +// can exist as internal assembly functions +// +// Avo v0.6.0 does not support the generation of 
internal assembly functions. Go's unicode +// dot tells the compiler to link a TEXT symbol to a function in the current Go package +// (or another package if specified). Avo unconditionally prepends the unicode dot to all +// TEXT symbols, making it impossible to emit an internal function without this hack. +// +// There is a pending PR to add internal functions to Avo: +// https://github.com/mmcloughlin/avo/pull/443 +// +// If merged it should allow the usage of InternalFunction("NAME") for the specified functions +func removePeskyUnicodeDot(internalFunctions []string, target string) { + bytes, err := os.ReadFile(target) + if err != nil { + panic(err) + } + + content := string(bytes) + + for _, from := range internalFunctions { + to := strings.ReplaceAll(from, ThatPeskyUnicodeDot, "") + content = strings.ReplaceAll(content, from, to) + } + + err = os.WriteFile(target, []byte(content), 0644) + if err != nil { + panic(err) + } +} diff --git a/chacha20poly1305/_asm/go.mod b/chacha20poly1305/_asm/go.mod new file mode 100644 index 0000000000..957baf2a64 --- /dev/null +++ b/chacha20poly1305/_asm/go.mod @@ -0,0 +1,15 @@ +module chacha20poly1305/_asm + +go 1.23 + +require ( + github.com/mmcloughlin/avo v0.6.0 + golang.org/x/crypto v0.26.0 +) + +require ( + golang.org/x/mod v0.20.0 // indirect + golang.org/x/sync v0.8.0 // indirect + golang.org/x/sys v0.24.0 // indirect + golang.org/x/tools v0.24.0 // indirect +) diff --git a/chacha20poly1305/_asm/go.sum b/chacha20poly1305/_asm/go.sum new file mode 100644 index 0000000000..62ea9dfb70 --- /dev/null +++ b/chacha20poly1305/_asm/go.sum @@ -0,0 +1,12 @@ +github.com/mmcloughlin/avo v0.6.0 h1:QH6FU8SKoTLaVs80GA8TJuLNkUYl4VokHKlPhVDg4YY= +github.com/mmcloughlin/avo v0.6.0/go.mod h1:8CoAGaCSYXtCPR+8y18Y9aB/kxb8JSS6FRI7mSkvD+8= +golang.org/x/crypto v0.26.0 h1:RrRspgV4mU+YwB4FYnuBoKsUapNIL5cohGAmSH3azsw= +golang.org/x/crypto v0.26.0/go.mod h1:GY7jblb9wI+FOo5y8/S2oY4zWP07AkOJ4+jxCqdqn54= +golang.org/x/mod v0.20.0 h1:utOm6MM3R3dnawAiJgn0y+xvuYRsm1RKM/4giyfDgV0= +golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= +golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sys v0.24.0 h1:Twjiwq9dn6R1fQcyiK+wQyHWfaz/BJB+YIpzU/Cv3Xg= +golang.org/x/sys v0.24.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/tools v0.24.0 h1:J1shsA93PJUEVaUSaay7UXAyE8aimq3GW0pjlolpa24= +golang.org/x/tools v0.24.0/go.mod h1:YhNqVBIfWHdzvTLs0d8LCuMhkKUgSUKldakyV7W/WDQ= diff --git a/chacha20poly1305/chacha20poly1305_amd64.s b/chacha20poly1305/chacha20poly1305_amd64.s index 731d2ac6db..fd5ee845f9 100644 --- a/chacha20poly1305/chacha20poly1305_amd64.s +++ b/chacha20poly1305/chacha20poly1305_amd64.s @@ -1,2715 +1,9762 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// This file was originally from https://golang.org/cl/24717 by Vlad Krasnov of CloudFlare. +// Code generated by command: go run chacha20poly1305_amd64_asm.go -out ../chacha20poly1305_amd64.s -pkg chacha20poly1305. DO NOT EDIT. 
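Throughout the regenerated assembly below, the former polyAdd, polyMul and polyMulReduceStage macros appear expanded inline as long ADDQ/ADCQ/MULQ/IMULQ sequences. As a reading aid, here is a hedged pure-Go sketch of the Poly1305 block step they implement, h = (h + m + 2^128) * r mod 2^130 - 5, over 64-bit limbs; the package and identifier names are illustrative and not part of this change:

package polysketch

import "math/bits"

// polyBlock absorbs one 16-byte block m (m0, m1: little-endian halves) into
// the accumulator h (h0, h1 plus a small overflow limb h2) and multiplies by
// the clamped key half r (r0, r1), reducing modulo 2^130 - 5.
func polyBlock(h0, h1, h2, r0, r1, m0, m1 uint64) (uint64, uint64, uint64) {
	// polyAdd: h += m, plus the 2^128 padding bit of a full block.
	var c uint64
	h0, c = bits.Add64(h0, m0, 0)
	h1, c = bits.Add64(h1, m1, c)
	h2 += c + 1

	// polyMul: schoolbook multiply of (h2:h1:h0) by (r1:r0). Clamping keeps
	// r0 and r1 below 2^60 and h2 stays small, so h2*r0 and h2*r1 fit in a
	// single 64-bit limb (the IMULQ instructions in the expanded code).
	h0r0hi, h0r0lo := bits.Mul64(h0, r0)
	h1r0hi, h1r0lo := bits.Mul64(h1, r0)
	h0r1hi, h0r1lo := bits.Mul64(h0, r1)
	h1r1hi, h1r1lo := bits.Mul64(h1, r1)
	h2r0 := h2 * r0
	h2r1 := h2 * r1

	// Accumulate the partial products into t3:t2:t1:t0.
	m1lo, c := bits.Add64(h1r0lo, h0r1lo, 0)
	m1hi, _ := bits.Add64(h1r0hi, h0r1hi, c)
	m2lo, c := bits.Add64(h2r0, h1r1lo, 0)
	m2hi, _ := bits.Add64(h1r1hi, 0, c)
	t0 := h0r0lo
	t1, c := bits.Add64(m1lo, h0r0hi, 0)
	t2, c := bits.Add64(m2lo, m1hi, c)
	t3, _ := bits.Add64(h2r1, m2hi, c)

	// polyMulReduceStage: keep the low 130 bits and fold the excess back in
	// as cc + cc>>2, using 2^130 = 5 (mod 2^130 - 5) and 5*(cc/4) = cc + cc/4.
	h0, h1, h2 = t0, t1, t2&3
	cc0, cc1 := t2&^uint64(3), t3
	h0, c = bits.Add64(h0, cc0, 0)
	h1, c = bits.Add64(h1, cc1, c)
	h2 += c
	cc0, cc1 = cc0>>2|cc1<<62, cc1>>2
	h0, c = bits.Add64(h0, cc0, 0)
	h1, c = bits.Add64(h1, cc1, c)
	h2 += c
	return h0, h1, h2
}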
//go:build gc && !purego #include "textflag.h" -// General register allocation -#define oup DI -#define inp SI -#define inl BX -#define adp CX // free to reuse, after we hash the additional data -#define keyp R8 // free to reuse, when we copy the key to stack -#define itr2 R9 // general iterator -#define itr1 CX // general iterator -#define acc0 R10 -#define acc1 R11 -#define acc2 R12 -#define t0 R13 -#define t1 R14 -#define t2 R15 -#define t3 R8 -// Register and stack allocation for the SSE code -#define rStore (0*16)(BP) -#define sStore (1*16)(BP) -#define state1Store (2*16)(BP) -#define state2Store (3*16)(BP) -#define tmpStore (4*16)(BP) -#define ctr0Store (5*16)(BP) -#define ctr1Store (6*16)(BP) -#define ctr2Store (7*16)(BP) -#define ctr3Store (8*16)(BP) -#define A0 X0 -#define A1 X1 -#define A2 X2 -#define B0 X3 -#define B1 X4 -#define B2 X5 -#define C0 X6 -#define C1 X7 -#define C2 X8 -#define D0 X9 -#define D1 X10 -#define D2 X11 -#define T0 X12 -#define T1 X13 -#define T2 X14 -#define T3 X15 -#define A3 T0 -#define B3 T1 -#define C3 T2 -#define D3 T3 -// Register and stack allocation for the AVX2 code -#define rsStoreAVX2 (0*32)(BP) -#define state1StoreAVX2 (1*32)(BP) -#define state2StoreAVX2 (2*32)(BP) -#define ctr0StoreAVX2 (3*32)(BP) -#define ctr1StoreAVX2 (4*32)(BP) -#define ctr2StoreAVX2 (5*32)(BP) -#define ctr3StoreAVX2 (6*32)(BP) -#define tmpStoreAVX2 (7*32)(BP) // 256 bytes on stack -#define AA0 Y0 -#define AA1 Y5 -#define AA2 Y6 -#define AA3 Y7 -#define BB0 Y14 -#define BB1 Y9 -#define BB2 Y10 -#define BB3 Y11 -#define CC0 Y12 -#define CC1 Y13 -#define CC2 Y8 -#define CC3 Y15 -#define DD0 Y4 -#define DD1 Y1 -#define DD2 Y2 -#define DD3 Y3 -#define TT0 DD3 -#define TT1 AA3 -#define TT2 BB3 -#define TT3 CC3 -// ChaCha20 constants -DATA ·chacha20Constants<>+0x00(SB)/4, $0x61707865 -DATA ·chacha20Constants<>+0x04(SB)/4, $0x3320646e -DATA ·chacha20Constants<>+0x08(SB)/4, $0x79622d32 -DATA ·chacha20Constants<>+0x0c(SB)/4, $0x6b206574 -DATA ·chacha20Constants<>+0x10(SB)/4, $0x61707865 -DATA ·chacha20Constants<>+0x14(SB)/4, $0x3320646e -DATA ·chacha20Constants<>+0x18(SB)/4, $0x79622d32 -DATA ·chacha20Constants<>+0x1c(SB)/4, $0x6b206574 -// <<< 16 with PSHUFB -DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302 -DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A -DATA ·rol16<>+0x10(SB)/8, $0x0504070601000302 -DATA ·rol16<>+0x18(SB)/8, $0x0D0C0F0E09080B0A -// <<< 8 with PSHUFB -DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003 -DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B -DATA ·rol8<>+0x10(SB)/8, $0x0605040702010003 -DATA ·rol8<>+0x18(SB)/8, $0x0E0D0C0F0A09080B - -DATA ·avx2InitMask<>+0x00(SB)/8, $0x0 -DATA ·avx2InitMask<>+0x08(SB)/8, $0x0 -DATA ·avx2InitMask<>+0x10(SB)/8, $0x1 -DATA ·avx2InitMask<>+0x18(SB)/8, $0x0 - -DATA ·avx2IncMask<>+0x00(SB)/8, $0x2 -DATA ·avx2IncMask<>+0x08(SB)/8, $0x0 -DATA ·avx2IncMask<>+0x10(SB)/8, $0x2 -DATA ·avx2IncMask<>+0x18(SB)/8, $0x0 -// Poly1305 key clamp -DATA ·polyClampMask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF -DATA ·polyClampMask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC -DATA ·polyClampMask<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF -DATA ·polyClampMask<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF - -DATA ·sseIncMask<>+0x00(SB)/8, $0x1 -DATA ·sseIncMask<>+0x08(SB)/8, $0x0 -// To load/store the last < 16 bytes in a buffer -DATA ·andMask<>+0x00(SB)/8, $0x00000000000000ff -DATA ·andMask<>+0x08(SB)/8, $0x0000000000000000 -DATA ·andMask<>+0x10(SB)/8, $0x000000000000ffff -DATA ·andMask<>+0x18(SB)/8, $0x0000000000000000 -DATA ·andMask<>+0x20(SB)/8, $0x0000000000ffffff -DATA ·andMask<>+0x28(SB)/8, 
$0x0000000000000000 -DATA ·andMask<>+0x30(SB)/8, $0x00000000ffffffff -DATA ·andMask<>+0x38(SB)/8, $0x0000000000000000 -DATA ·andMask<>+0x40(SB)/8, $0x000000ffffffffff -DATA ·andMask<>+0x48(SB)/8, $0x0000000000000000 -DATA ·andMask<>+0x50(SB)/8, $0x0000ffffffffffff -DATA ·andMask<>+0x58(SB)/8, $0x0000000000000000 -DATA ·andMask<>+0x60(SB)/8, $0x00ffffffffffffff -DATA ·andMask<>+0x68(SB)/8, $0x0000000000000000 -DATA ·andMask<>+0x70(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0x78(SB)/8, $0x0000000000000000 -DATA ·andMask<>+0x80(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0x88(SB)/8, $0x00000000000000ff -DATA ·andMask<>+0x90(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0x98(SB)/8, $0x000000000000ffff -DATA ·andMask<>+0xa0(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0xa8(SB)/8, $0x0000000000ffffff -DATA ·andMask<>+0xb0(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0xb8(SB)/8, $0x00000000ffffffff -DATA ·andMask<>+0xc0(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0xc8(SB)/8, $0x000000ffffffffff -DATA ·andMask<>+0xd0(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0xd8(SB)/8, $0x0000ffffffffffff -DATA ·andMask<>+0xe0(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0xe8(SB)/8, $0x00ffffffffffffff - -GLOBL ·chacha20Constants<>(SB), (NOPTR+RODATA), $32 -GLOBL ·rol16<>(SB), (NOPTR+RODATA), $32 -GLOBL ·rol8<>(SB), (NOPTR+RODATA), $32 -GLOBL ·sseIncMask<>(SB), (NOPTR+RODATA), $16 -GLOBL ·avx2IncMask<>(SB), (NOPTR+RODATA), $32 -GLOBL ·avx2InitMask<>(SB), (NOPTR+RODATA), $32 -GLOBL ·polyClampMask<>(SB), (NOPTR+RODATA), $32 -GLOBL ·andMask<>(SB), (NOPTR+RODATA), $240 -// No PALIGNR in Go ASM yet (but VPALIGNR is present). -#define shiftB0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X3, X3 -#define shiftB1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x04 // PALIGNR $4, X4, X4 -#define shiftB2Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X5, X5 -#define shiftB3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X13, X13 -#define shiftC0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X6, X6 -#define shiftC1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x08 // PALIGNR $8, X7, X7 -#define shiftC2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc0; BYTE $0x08 // PALIGNR $8, X8, X8 -#define shiftC3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X14, X14 -#define shiftD0Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x0c // PALIGNR $12, X9, X9 -#define shiftD1Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x0c // PALIGNR $12, X10, X10 -#define shiftD2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X11, X11 -#define shiftD3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x0c // PALIGNR $12, X15, X15 -#define shiftB0Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X3, X3 -#define shiftB1Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x0c // PALIGNR $12, X4, X4 -#define shiftB2Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X5, X5 -#define shiftB3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR 
$12, X13, X13 -#define shiftC0Right shiftC0Left -#define shiftC1Right shiftC1Left -#define shiftC2Right shiftC2Left -#define shiftC3Right shiftC3Left -#define shiftD0Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x04 // PALIGNR $4, X9, X9 -#define shiftD1Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x04 // PALIGNR $4, X10, X10 -#define shiftD2Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X11, X11 -#define shiftD3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x04 // PALIGNR $4, X15, X15 - -// Some macros - -// ROL rotates the uint32s in register R left by N bits, using temporary T. -#define ROL(N, R, T) \ - MOVO R, T; PSLLL $(N), T; PSRLL $(32-(N)), R; PXOR T, R - -// ROL16 rotates the uint32s in register R left by 16, using temporary T if needed. -#ifdef GOAMD64_v2 -#define ROL16(R, T) PSHUFB ·rol16<>(SB), R -#else -#define ROL16(R, T) ROL(16, R, T) -#endif - -// ROL8 rotates the uint32s in register R left by 8, using temporary T if needed. -#ifdef GOAMD64_v2 -#define ROL8(R, T) PSHUFB ·rol8<>(SB), R -#else -#define ROL8(R, T) ROL(8, R, T) -#endif - -#define chachaQR(A, B, C, D, T) \ - PADDD B, A; PXOR A, D; ROL16(D, T) \ - PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B \ - PADDD B, A; PXOR A, D; ROL8(D, T) \ - PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B - -#define chachaQR_AVX2(A, B, C, D, T) \ - VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol16<>(SB), D, D \ - VPADDD D, C, C; VPXOR C, B, B; VPSLLD $12, B, T; VPSRLD $20, B, B; VPXOR T, B, B \ - VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol8<>(SB), D, D \ - VPADDD D, C, C; VPXOR C, B, B; VPSLLD $7, B, T; VPSRLD $25, B, B; VPXOR T, B, B - -#define polyAdd(S) ADDQ S, acc0; ADCQ 8+S, acc1; ADCQ $1, acc2 -#define polyMulStage1 MOVQ (0*8)(BP), AX; MOVQ AX, t2; MULQ acc0; MOVQ AX, t0; MOVQ DX, t1; MOVQ (0*8)(BP), AX; MULQ acc1; IMULQ acc2, t2; ADDQ AX, t1; ADCQ DX, t2 -#define polyMulStage2 MOVQ (1*8)(BP), AX; MOVQ AX, t3; MULQ acc0; ADDQ AX, t1; ADCQ $0, DX; MOVQ DX, acc0; MOVQ (1*8)(BP), AX; MULQ acc1; ADDQ AX, t2; ADCQ $0, DX -#define polyMulStage3 IMULQ acc2, t3; ADDQ acc0, t2; ADCQ DX, t3 -#define polyMulReduceStage MOVQ t0, acc0; MOVQ t1, acc1; MOVQ t2, acc2; ANDQ $3, acc2; MOVQ t2, t0; ANDQ $-4, t0; MOVQ t3, t1; SHRQ $2, t3, t2; SHRQ $2, t3; ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $0, acc2; ADDQ t2, acc0; ADCQ t3, acc1; ADCQ $0, acc2 - -#define polyMulStage1_AVX2 MOVQ (0*8)(BP), DX; MOVQ DX, t2; MULXQ acc0, t0, t1; IMULQ acc2, t2; MULXQ acc1, AX, DX; ADDQ AX, t1; ADCQ DX, t2 -#define polyMulStage2_AVX2 MOVQ (1*8)(BP), DX; MULXQ acc0, acc0, AX; ADDQ acc0, t1; MULXQ acc1, acc1, t3; ADCQ acc1, t2; ADCQ $0, t3 -#define polyMulStage3_AVX2 IMULQ acc2, DX; ADDQ AX, t2; ADCQ DX, t3 - -#define polyMul polyMulStage1; polyMulStage2; polyMulStage3; polyMulReduceStage -#define polyMulAVX2 polyMulStage1_AVX2; polyMulStage2_AVX2; polyMulStage3_AVX2; polyMulReduceStage -// ---------------------------------------------------------------------------- + +// func polyHashADInternal<>() TEXT polyHashADInternal<>(SB), NOSPLIT, $0 - // adp points to beginning of additional data - // itr2 holds ad length - XORQ acc0, acc0 - XORQ acc1, acc1 - XORQ acc2, acc2 - CMPQ itr2, $13 - JNE hashADLoop - -openFastTLSAD: - // Special treatment for the TLS case of 13 bytes - MOVQ (adp), acc0 - MOVQ 5(adp), acc1 - SHRQ $24, acc1 - MOVQ $1, acc2 - polyMul + // Hack: 
Must declare #define macros inside of a function due to Avo constraints + // ROL rotates the uint32s in register R left by N bits, using temporary T. + #define ROL(N, R, T) \ + MOVO R, T; \ + PSLLL $(N), T; \ + PSRLL $(32-(N)), R; \ + PXOR T, R + + // ROL8 rotates the uint32s in register R left by 8, using temporary T if needed. + #ifdef GOAMD64_v2 + #define ROL8(R, T) PSHUFB ·rol8<>(SB), R + #else + #define ROL8(R, T) ROL(8, R, T) + #endif + + // ROL16 rotates the uint32s in register R left by 16, using temporary T if needed. + #ifdef GOAMD64_v2 + #define ROL16(R, T) PSHUFB ·rol16<>(SB), R + #else + #define ROL16(R, T) ROL(16, R, T) + #endif + XORQ R10, R10 + XORQ R11, R11 + XORQ R12, R12 + CMPQ R9, $0x0d + JNE hashADLoop + MOVQ (CX), R10 + MOVQ 5(CX), R11 + SHRQ $0x18, R11 + MOVQ $0x00000001, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 RET hashADLoop: // Hash in 16 byte chunks - CMPQ itr2, $16 - JB hashADTail - polyAdd(0(adp)) - LEAQ (1*16)(adp), adp - SUBQ $16, itr2 - polyMul - JMP hashADLoop + CMPQ R9, $0x10 + JB hashADTail + ADDQ (CX), R10 + ADCQ 8(CX), R11 + ADCQ $0x01, R12 + LEAQ 16(CX), CX + SUBQ $0x10, R9 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + JMP hashADLoop hashADTail: - CMPQ itr2, $0 + CMPQ R9, $0x00 JE hashADDone // Hash last < 16 byte tail - XORQ t0, t0 - XORQ t1, t1 - XORQ t2, t2 - ADDQ itr2, adp + XORQ R13, R13 + XORQ R14, R14 + XORQ R15, R15 + ADDQ R9, CX hashADTailLoop: - SHLQ $8, t0, t1 - SHLQ $8, t0 - MOVB -1(adp), t2 - XORQ t2, t0 - DECQ adp - DECQ itr2 - JNE hashADTailLoop - -hashADTailFinish: - ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 - polyMul - - // Finished AD + SHLQ $0x08, R13, R14 + SHLQ $0x08, R13 + MOVB -1(CX), R15 + XORQ R15, R13 + DECQ CX + DECQ R9 + JNE hashADTailLoop + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + hashADDone: RET -// 
---------------------------------------------------------------------------- -// func chacha20Poly1305Open(dst, key, src, ad []byte) bool -TEXT ·chacha20Poly1305Open(SB), 0, $288-97 +// func chacha20Poly1305Open(dst []byte, key []uint32, src []byte, ad []byte) bool +// Requires: AVX, AVX2, BMI2, CMOV, SSE2 +TEXT ·chacha20Poly1305Open(SB), $288-97 // For aligned stack access MOVQ SP, BP - ADDQ $32, BP + ADDQ $0x20, BP ANDQ $-32, BP - MOVQ dst+0(FP), oup - MOVQ key+24(FP), keyp - MOVQ src+48(FP), inp - MOVQ src_len+56(FP), inl - MOVQ ad+72(FP), adp + MOVQ dst_base+0(FP), DI + MOVQ key_base+24(FP), R8 + MOVQ src_base+48(FP), SI + MOVQ src_len+56(FP), BX + MOVQ ad_base+72(FP), CX // Check for AVX2 support - CMPB ·useAVX2(SB), $1 + CMPB ·useAVX2+0(SB), $0x01 JE chacha20Poly1305Open_AVX2 // Special optimization, for very short buffers - CMPQ inl, $128 - JBE openSSE128 // About 16% faster + CMPQ BX, $0x80 + JBE openSSE128 // For long buffers, prepare the poly key first - MOVOU ·chacha20Constants<>(SB), A0 - MOVOU (1*16)(keyp), B0 - MOVOU (2*16)(keyp), C0 - MOVOU (3*16)(keyp), D0 - MOVO D0, T1 + MOVOU ·chacha20Constants<>+0(SB), X0 + MOVOU 16(R8), X3 + MOVOU 32(R8), X6 + MOVOU 48(R8), X9 + MOVO X9, X13 // Store state on stack for future use - MOVO B0, state1Store - MOVO C0, state2Store - MOVO D0, ctr3Store - MOVQ $10, itr2 + MOVO X3, 32(BP) + MOVO X6, 48(BP) + MOVO X9, 128(BP) + MOVQ $0x0000000a, R9 openSSEPreparePolyKey: - chachaQR(A0, B0, C0, D0, T0) - shiftB0Left; shiftC0Left; shiftD0Left - chachaQR(A0, B0, C0, D0, T0) - shiftB0Right; shiftC0Right; shiftD0Right - DECQ itr2 - JNE openSSEPreparePolyKey + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + DECQ R9 + JNE openSSEPreparePolyKey // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded - PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0 + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL 32(BP), X3 // Clamp and store the key - PAND ·polyClampMask<>(SB), A0 - MOVO A0, rStore; MOVO B0, sStore + PAND ·polyClampMask<>+0(SB), X0 + MOVO X0, (BP) + MOVO X3, 16(BP) // Hash AAD - MOVQ ad_len+80(FP), itr2 + MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) openSSEMainLoop: - CMPQ inl, $256 + CMPQ BX, $0x00000100 JB openSSEMainLoopDone // Load state, increment counter blocks - MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0 - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL 
·sseIncMask<>(SB), D2 - MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 + MOVO ·chacha20Constants<>+0(SB), X0 + MOVO 32(BP), X3 + MOVO 48(BP), X6 + MOVO 128(BP), X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X1, X2 + MOVO X4, X5 + MOVO X7, X8 + MOVO X10, X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X2, X12 + MOVO X5, X13 + MOVO X8, X14 + MOVO X11, X15 + PADDL ·sseIncMask<>+0(SB), X15 // Store counters - MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store + MOVO X9, 80(BP) + MOVO X10, 96(BP) + MOVO X11, 112(BP) + MOVO X15, 128(BP) - // There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16 - MOVQ $4, itr1 - MOVQ inp, itr2 + // There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash + // 2 blocks, and for the remaining 4 only 1 block - for a total of 16 + MOVQ $0x00000004, CX + MOVQ SI, R9 openSSEInternalLoop: - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - polyAdd(0(itr2)) - shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left - shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left - shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left - polyMulStage1 - polyMulStage2 - LEAQ (2*8)(itr2), itr2 - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - polyMulStage3 - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - polyMulReduceStage - shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right - shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right - shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right - DECQ itr1 - JGE openSSEInternalLoop - - polyAdd(0(itr2)) - polyMul - LEAQ (2*8)(itr2), itr2 - - CMPQ itr1, $-6 - JG openSSEInternalLoop + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, R12 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE 
$0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x0c + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + LEAQ 16(R9), R9 + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + 
BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x04 + DECQ CX + JGE openSSEInternalLoop + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(R9), R9 + CMPQ CX, $-6 + JG openSSEInternalLoop // Add in the state - PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 - PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 - PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 - PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 + PADDD ·chacha20Constants<>+0(SB), X0 + PADDD ·chacha20Constants<>+0(SB), X1 + PADDD ·chacha20Constants<>+0(SB), X2 + PADDD ·chacha20Constants<>+0(SB), X12 + PADDD 32(BP), X3 + PADDD 32(BP), X4 + PADDD 32(BP), X5 + PADDD 32(BP), X13 + PADDD 48(BP), X6 + PADDD 48(BP), X7 + PADDD 48(BP), X8 + PADDD 48(BP), X14 + PADDD 80(BP), X9 + PADDD 96(BP), X10 + PADDD 112(BP), X11 + PADDD 128(BP), X15 // Load - xor - store - MOVO D3, tmpStore - MOVOU (0*16)(inp), D3; PXOR D3, A0; MOVOU A0, (0*16)(oup) - MOVOU (1*16)(inp), D3; PXOR D3, B0; MOVOU B0, (1*16)(oup) - MOVOU (2*16)(inp), D3; PXOR D3, C0; MOVOU C0, (2*16)(oup) - MOVOU (3*16)(inp), D3; PXOR D3, D0; MOVOU D0, (3*16)(oup) - MOVOU (4*16)(inp), D0; PXOR D0, A1; MOVOU A1, (4*16)(oup) - MOVOU (5*16)(inp), D0; PXOR D0, B1; MOVOU B1, (5*16)(oup) - MOVOU (6*16)(inp), D0; PXOR D0, C1; MOVOU C1, (6*16)(oup) - MOVOU (7*16)(inp), D0; PXOR D0, D1; MOVOU D1, (7*16)(oup) - MOVOU (8*16)(inp), D0; PXOR D0, A2; MOVOU A2, (8*16)(oup) - MOVOU (9*16)(inp), D0; PXOR D0, B2; MOVOU B2, (9*16)(oup) - MOVOU (10*16)(inp), D0; PXOR D0, C2; MOVOU C2, (10*16)(oup) - MOVOU (11*16)(inp), D0; PXOR D0, D2; MOVOU D2, (11*16)(oup) - MOVOU (12*16)(inp), D0; PXOR D0, A3; MOVOU A3, (12*16)(oup) - MOVOU (13*16)(inp), D0; PXOR D0, B3; MOVOU B3, (13*16)(oup) - MOVOU (14*16)(inp), D0; PXOR D0, C3; MOVOU C3, (14*16)(oup) - MOVOU (15*16)(inp), D0; PXOR tmpStore, D0; MOVOU D0, (15*16)(oup) - LEAQ 256(inp), inp - LEAQ 256(oup), oup - SUBQ $256, inl + MOVO X15, 64(BP) + MOVOU (SI), X15 + PXOR X15, X0 + MOVOU X0, (DI) + MOVOU 16(SI), X15 + PXOR X15, X3 + MOVOU X3, 16(DI) + MOVOU 32(SI), X15 + PXOR X15, X6 + MOVOU X6, 32(DI) + MOVOU 48(SI), X15 + PXOR X15, X9 + MOVOU X9, 48(DI) + MOVOU 64(SI), X9 + PXOR X9, X1 + MOVOU X1, 64(DI) + MOVOU 80(SI), X9 + PXOR X9, X4 + MOVOU X4, 80(DI) + MOVOU 96(SI), X9 + PXOR X9, X7 + MOVOU X7, 96(DI) + MOVOU 112(SI), X9 + PXOR X9, X10 + MOVOU X10, 112(DI) + MOVOU 128(SI), X9 + PXOR X9, X2 + MOVOU X2, 128(DI) + MOVOU 144(SI), X9 + PXOR X9, X5 + MOVOU X5, 144(DI) + MOVOU 160(SI), X9 + PXOR X9, X8 + MOVOU X8, 160(DI) + MOVOU 176(SI), X9 + PXOR X9, X11 + MOVOU X11, 176(DI) + MOVOU 192(SI), X9 + PXOR X9, X12 + MOVOU X12, 192(DI) + MOVOU 208(SI), X9 + PXOR X9, X13 + MOVOU X13, 208(DI) + MOVOU 
224(SI), X9 + PXOR X9, X14 + MOVOU X14, 224(DI) + MOVOU 240(SI), X9 + PXOR 64(BP), X9 + MOVOU X9, 240(DI) + LEAQ 256(SI), SI + LEAQ 256(DI), DI + SUBQ $0x00000100, BX JMP openSSEMainLoop openSSEMainLoopDone: // Handle the various tail sizes efficiently - TESTQ inl, inl + TESTQ BX, BX JE openSSEFinalize - CMPQ inl, $64 + CMPQ BX, $0x40 JBE openSSETail64 - CMPQ inl, $128 + CMPQ BX, $0x80 JBE openSSETail128 - CMPQ inl, $192 + CMPQ BX, $0xc0 JBE openSSETail192 JMP openSSETail256 openSSEFinalize: // Hash in the PT, AAD lengths - ADDQ ad_len+80(FP), acc0; ADCQ src_len+56(FP), acc1; ADCQ $1, acc2 - polyMul + ADDQ ad_len+80(FP), R10 + ADCQ src_len+56(FP), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 // Final reduce - MOVQ acc0, t0 - MOVQ acc1, t1 - MOVQ acc2, t2 - SUBQ $-5, acc0 - SBBQ $-1, acc1 - SBBQ $3, acc2 - CMOVQCS t0, acc0 - CMOVQCS t1, acc1 - CMOVQCS t2, acc2 + MOVQ R10, R13 + MOVQ R11, R14 + MOVQ R12, R15 + SUBQ $-5, R10 + SBBQ $-1, R11 + SBBQ $0x03, R12 + CMOVQCS R13, R10 + CMOVQCS R14, R11 + CMOVQCS R15, R12 // Add in the "s" part of the key - ADDQ 0+sStore, acc0 - ADCQ 8+sStore, acc1 + ADDQ 16(BP), R10 + ADCQ 24(BP), R11 // Finally, constant time compare to the tag at the end of the message XORQ AX, AX - MOVQ $1, DX - XORQ (0*8)(inp), acc0 - XORQ (1*8)(inp), acc1 - ORQ acc1, acc0 + MOVQ $0x00000001, DX + XORQ (SI), R10 + XORQ 8(SI), R11 + ORQ R11, R10 CMOVQEQ DX, AX // Return true iff tags are equal MOVB AX, ret+96(FP) RET -// ---------------------------------------------------------------------------- -// Special optimization for buffers smaller than 129 bytes openSSE128: - // For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks - MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0 - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 - MOVO B0, T1; MOVO C0, T2; MOVO D1, T3 - MOVQ $10, itr2 + MOVOU ·chacha20Constants<>+0(SB), X0 + MOVOU 16(R8), X3 + MOVOU 32(R8), X6 + MOVOU 48(R8), X9 + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X1, X2 + MOVO X4, X5 + MOVO X7, X8 + MOVO X10, X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X3, X13 + MOVO X6, X14 + MOVO X10, X15 + MOVQ $0x0000000a, R9 openSSE128InnerCipherLoop: - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Left; shiftB1Left; shiftB2Left - shiftC0Left; shiftC1Left; shiftC2Left - shiftD0Left; shiftD1Left; shiftD2Left - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Right; shiftB1Right; shiftB2Right - shiftC0Right; shiftC1Right; shiftC2Right - shiftD0Right; shiftD1Right; shiftD2Right - DECQ itr2 - JNE openSSE128InnerCipherLoop + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL 
$0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + DECQ R9 + JNE openSSE128InnerCipherLoop // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded - PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 - PADDL T1, B0; PADDL T1, B1; PADDL T1, B2 - PADDL T2, C1; PADDL T2, C2 - PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2 + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL ·chacha20Constants<>+0(SB), X1 
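(Aside, not part of the diff: the PAND ·polyClampMask<> / MOVOU pair that follows stores the Poly1305 one-time key derived from this first ChaCha20 block — the low 16 bytes become r, clamped per RFC 8439, and the next 16 bytes become s. A scalar Go sketch of the same derivation, with a hypothetical package/function name chosen only for illustration; the two masks are the ·polyClampMask<> constants used by the PAND:)

package poly1305sketch // illustrative only, not code from this CL

import "encoding/binary"

// clampOneTimeKey splits the first 32 bytes of ChaCha20 keystream into the
// Poly1305 one-time key (r, s), applying the RFC 8439 clamp to r.
func clampOneTimeKey(stream [32]byte) (r, s [2]uint64) {
	r[0] = binary.LittleEndian.Uint64(stream[0:8]) & 0x0FFFFFFC0FFFFFFF
	r[1] = binary.LittleEndian.Uint64(stream[8:16]) & 0x0FFFFFFC0FFFFFFC
	s[0] = binary.LittleEndian.Uint64(stream[16:24])
	s[1] = binary.LittleEndian.Uint64(stream[24:32])
	return r, s
}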
+ PADDL ·chacha20Constants<>+0(SB), X2 + PADDL X13, X3 + PADDL X13, X4 + PADDL X13, X5 + PADDL X14, X7 + PADDL X14, X8 + PADDL X15, X10 + PADDL ·sseIncMask<>+0(SB), X15 + PADDL X15, X11 // Clamp and store the key - PAND ·polyClampMask<>(SB), A0 - MOVOU A0, rStore; MOVOU B0, sStore + PAND ·polyClampMask<>+0(SB), X0 + MOVOU X0, (BP) + MOVOU X3, 16(BP) // Hash - MOVQ ad_len+80(FP), itr2 + MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) openSSE128Open: - CMPQ inl, $16 + CMPQ BX, $0x10 JB openSSETail16 - SUBQ $16, inl + SUBQ $0x10, BX // Load for hashing - polyAdd(0(inp)) + ADDQ (SI), R10 + ADCQ 8(SI), R11 + ADCQ $0x01, R12 // Load for decryption - MOVOU (inp), T0; PXOR T0, A1; MOVOU A1, (oup) - LEAQ (1*16)(inp), inp - LEAQ (1*16)(oup), oup - polyMul + MOVOU (SI), X12 + PXOR X12, X1 + MOVOU X1, (DI) + LEAQ 16(SI), SI + LEAQ 16(DI), DI + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 // Shift the stream "left" - MOVO B1, A1 - MOVO C1, B1 - MOVO D1, C1 - MOVO A2, D1 - MOVO B2, A2 - MOVO C2, B2 - MOVO D2, C2 + MOVO X4, X1 + MOVO X7, X4 + MOVO X10, X7 + MOVO X2, X10 + MOVO X5, X2 + MOVO X8, X5 + MOVO X11, X8 JMP openSSE128Open openSSETail16: - TESTQ inl, inl + TESTQ BX, BX JE openSSEFinalize // We can safely load the CT from the end, because it is padded with the MAC - MOVQ inl, itr2 - SHLQ $4, itr2 - LEAQ ·andMask<>(SB), t0 - MOVOU (inp), T0 - ADDQ inl, inp - PAND -16(t0)(itr2*1), T0 - MOVO T0, 0+tmpStore - MOVQ T0, t0 - MOVQ 8+tmpStore, t1 - PXOR A1, T0 + MOVQ BX, R9 + SHLQ $0x04, R9 + LEAQ ·andMask<>+0(SB), R13 + MOVOU (SI), X12 + ADDQ BX, SI + PAND -16(R13)(R9*1), X12 + MOVO X12, 64(BP) + MOVQ X12, R13 + MOVQ 72(BP), R14 + PXOR X1, X12 // We can only store one byte at a time, since plaintext can be shorter than 16 bytes openSSETail16Store: - MOVQ T0, t3 - MOVB t3, (oup) - PSRLDQ $1, T0 - INCQ oup - DECQ inl + MOVQ X12, R8 + MOVB R8, (DI) + PSRLDQ $0x01, X12 + INCQ DI + DECQ BX JNE openSSETail16Store - ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 - polyMul + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 JMP openSSEFinalize -// ---------------------------------------------------------------------------- -// Special optimization for the last 64 bytes of ciphertext openSSETail64: - // Need to decrypt up to 64 bytes - prepare single block - MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, 
ctr0Store - XORQ itr2, itr2 - MOVQ inl, itr1 - CMPQ itr1, $16 - JB openSSETail64LoopB + MOVO ·chacha20Constants<>+0(SB), X0 + MOVO 32(BP), X3 + MOVO 48(BP), X6 + MOVO 128(BP), X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X9, 80(BP) + XORQ R9, R9 + MOVQ BX, CX + CMPQ CX, $0x10 + JB openSSETail64LoopB openSSETail64LoopA: - // Perform ChaCha rounds, while hashing the remaining input - polyAdd(0(inp)(itr2*1)) - polyMul - SUBQ $16, itr1 + ADDQ (SI)(R9*1), R10 + ADCQ 8(SI)(R9*1), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + SUBQ $0x10, CX openSSETail64LoopB: - ADDQ $16, itr2 - chachaQR(A0, B0, C0, D0, T0) - shiftB0Left; shiftC0Left; shiftD0Left - chachaQR(A0, B0, C0, D0, T0) - shiftB0Right; shiftC0Right; shiftD0Right - - CMPQ itr1, $16 - JAE openSSETail64LoopA - - CMPQ itr2, $160 - JNE openSSETail64LoopB - - PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0; PADDL state2Store, C0; PADDL ctr0Store, D0 + ADDQ $0x10, R9 + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + CMPQ CX, $0x10 + JAE openSSETail64LoopA + CMPQ R9, $0xa0 + JNE openSSETail64LoopB + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL 32(BP), X3 + PADDL 48(BP), X6 + PADDL 80(BP), X9 openSSETail64DecLoop: - CMPQ inl, $16 + CMPQ BX, $0x10 JB openSSETail64DecLoopDone - SUBQ $16, inl - MOVOU (inp), T0 - PXOR T0, A0 - MOVOU A0, (oup) - LEAQ 16(inp), inp - LEAQ 16(oup), oup - MOVO B0, A0 - MOVO C0, B0 - MOVO D0, C0 + SUBQ $0x10, BX + MOVOU (SI), X12 + PXOR X12, X0 + MOVOU X0, (DI) + LEAQ 16(SI), SI + LEAQ 16(DI), DI + MOVO X3, X0 + MOVO X6, X3 + MOVO X9, X6 JMP openSSETail64DecLoop openSSETail64DecLoopDone: - MOVO A0, A1 + MOVO X0, X1 JMP openSSETail16 -// ---------------------------------------------------------------------------- -// Special optimization for the last 128 bytes of ciphertext openSSETail128: - // Need to decrypt up to 128 bytes - prepare two blocks - MOVO ·chacha20Constants<>(SB), A1; MOVO state1Store, B1; MOVO state2Store, C1; MOVO ctr3Store, D1; PADDL 
·sseIncMask<>(SB), D1; MOVO D1, ctr0Store - MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr1Store - XORQ itr2, itr2 - MOVQ inl, itr1 - ANDQ $-16, itr1 + MOVO ·chacha20Constants<>+0(SB), X1 + MOVO 32(BP), X4 + MOVO 48(BP), X7 + MOVO 128(BP), X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X10, 80(BP) + MOVO X1, X0 + MOVO X4, X3 + MOVO X7, X6 + MOVO X10, X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X9, 96(BP) + XORQ R9, R9 + MOVQ BX, CX + ANDQ $-16, CX openSSETail128LoopA: - // Perform ChaCha rounds, while hashing the remaining input - polyAdd(0(inp)(itr2*1)) - polyMul + ADDQ (SI)(R9*1), R10 + ADCQ 8(SI)(R9*1), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 openSSETail128LoopB: - ADDQ $16, itr2 - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) - shiftB0Left; shiftC0Left; shiftD0Left - shiftB1Left; shiftC1Left; shiftD1Left - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) - shiftB0Right; shiftC0Right; shiftD0Right - shiftB1Right; shiftC1Right; shiftD1Right - - CMPQ itr2, itr1 - JB openSSETail128LoopA - - CMPQ itr2, $160 - JNE openSSETail128LoopB - - PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1 - PADDL state1Store, B0; PADDL state1Store, B1 - PADDL state2Store, C0; PADDL state2Store, C1 - PADDL ctr1Store, D0; PADDL ctr0Store, D1 - - MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 - PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1 - MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup) - - SUBQ $64, inl - LEAQ 64(inp), inp - LEAQ 64(oup), oup - JMP openSSETail64DecLoop - -// ---------------------------------------------------------------------------- -// Special optimization for the last 192 bytes of ciphertext + ADDQ $0x10, R9 + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL 
$0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + CMPQ R9, CX + JB openSSETail128LoopA + CMPQ R9, $0xa0 + JNE openSSETail128LoopB + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL ·chacha20Constants<>+0(SB), X1 + PADDL 32(BP), X3 + PADDL 32(BP), X4 + PADDL 48(BP), X6 + PADDL 48(BP), X7 + PADDL 96(BP), X9 + PADDL 80(BP), X10 + MOVOU (SI), X12 + MOVOU 16(SI), X13 + MOVOU 32(SI), X14 + MOVOU 48(SI), X15 + PXOR X12, X1 + PXOR X13, X4 + PXOR X14, X7 + PXOR X15, X10 + MOVOU X1, (DI) + MOVOU X4, 16(DI) + MOVOU X7, 32(DI) + MOVOU X10, 48(DI) + SUBQ $0x40, BX + LEAQ 64(SI), SI + LEAQ 64(DI), DI + JMP openSSETail64DecLoop + openSSETail192: - // Need to decrypt up to 192 bytes - prepare three blocks - MOVO ·chacha20Constants<>(SB), A2; MOVO state1Store, B2; MOVO state2Store, C2; MOVO ctr3Store, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr0Store - MOVO A2, A1; MOVO B2, B1; MOVO C2, C1; MOVO D2, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store - MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr2Store - - MOVQ inl, itr1 - MOVQ $160, itr2 - CMPQ itr1, $160 - CMOVQGT itr2, itr1 - ANDQ $-16, itr1 - XORQ itr2, itr2 + MOVO ·chacha20Constants<>+0(SB), X2 + MOVO 32(BP), X5 + MOVO 48(BP), X8 + MOVO 128(BP), X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X11, 80(BP) + MOVO X2, X1 + MOVO X5, X4 + MOVO X8, X7 + MOVO X11, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X10, 96(BP) + MOVO X1, X0 + MOVO X4, X3 + MOVO X7, X6 + MOVO X10, X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X9, 112(BP) + MOVQ BX, CX + MOVQ $0x000000a0, R9 + CMPQ CX, $0xa0 + CMOVQGT R9, CX + ANDQ $-16, CX + XORQ R9, R9 openSSLTail192LoopA: - // Perform ChaCha rounds, while hashing the remaining input - polyAdd(0(inp)(itr2*1)) - polyMul + ADDQ (SI)(R9*1), R10 + ADCQ 8(SI)(R9*1), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 openSSLTail192LoopB: - ADDQ $16, itr2 - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Left; shiftC0Left; shiftD0Left - shiftB1Left; shiftC1Left; shiftD1Left - shiftB2Left; 
shiftC2Left; shiftD2Left - - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Right; shiftC0Right; shiftD0Right - shiftB1Right; shiftC1Right; shiftD1Right - shiftB2Right; shiftC2Right; shiftD2Right - - CMPQ itr2, itr1 - JB openSSLTail192LoopA - - CMPQ itr2, $160 - JNE openSSLTail192LoopB - - CMPQ inl, $176 - JB openSSLTail192Store - - polyAdd(160(inp)) - polyMul - - CMPQ inl, $192 - JB openSSLTail192Store - - polyAdd(176(inp)) - polyMul + ADDQ $0x10, R9 + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c 
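(Aside, not part of the diff: each PADDD/PXOR pair above followed by a rotation — the ROL16/ROL8 macros and the PSLLL/PSRLL pairs for 12 and 7 — is one step of the ChaCha20 quarter round, and the BYTE-encoded PALIGNR instructions only rotate the B/C/D rows to switch between column and diagonal rounds. For reference, a scalar sketch of the quarter round, with a hypothetical package name:)

package chachasketch // illustrative only, not code from this CL

import "math/bits"

// quarterRound is the scalar form of the vectorized quarter round:
// add, xor, rotate by 16, 12, 8 and 7.
func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
	a += b
	d = bits.RotateLeft32(d^a, 16)
	c += d
	b = bits.RotateLeft32(b^c, 12)
	a += b
	d = bits.RotateLeft32(d^a, 8)
	c += d
	b = bits.RotateLeft32(b^c, 7)
	return a, b, c, d
}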
+ BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + CMPQ R9, CX + JB openSSLTail192LoopA + CMPQ R9, $0xa0 + JNE openSSLTail192LoopB + CMPQ BX, $0xb0 + JB openSSLTail192Store + ADDQ 160(SI), R10 + ADCQ 168(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + CMPQ BX, $0xc0 + JB openSSLTail192Store + ADDQ 176(SI), R10 + ADCQ 184(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 openSSLTail192Store: - PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 - PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2 - PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2 - PADDL ctr2Store, D0; PADDL ctr1Store, D1; PADDL ctr0Store, D2 - - MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 - PXOR T0, A2; PXOR T1, B2; PXOR T2, C2; PXOR T3, D2 - MOVOU A2, (0*16)(oup); MOVOU B2, (1*16)(oup); MOVOU C2, (2*16)(oup); MOVOU D2, (3*16)(oup) - - MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3 - PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1 - MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) - - SUBQ $128, inl - LEAQ 128(inp), inp - LEAQ 128(oup), oup - JMP openSSETail64DecLoop - -// ---------------------------------------------------------------------------- -// Special optimization for the last 256 bytes of ciphertext + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL ·chacha20Constants<>+0(SB), X1 + PADDL ·chacha20Constants<>+0(SB), X2 + PADDL 32(BP), X3 + PADDL 32(BP), X4 + PADDL 32(BP), X5 + PADDL 48(BP), X6 + PADDL 48(BP), X7 + PADDL 48(BP), X8 + PADDL 112(BP), X9 + PADDL 96(BP), X10 + PADDL 80(BP), X11 + MOVOU (SI), X12 + MOVOU 16(SI), X13 + MOVOU 32(SI), X14 + MOVOU 48(SI), X15 + PXOR X12, X2 + PXOR X13, X5 + PXOR X14, X8 + PXOR X15, X11 + MOVOU X2, (DI) + MOVOU X5, 16(DI) + MOVOU X8, 32(DI) + MOVOU X11, 48(DI) + MOVOU 64(SI), X12 + MOVOU 80(SI), X13 + MOVOU 96(SI), X14 + MOVOU 112(SI), X15 + PXOR X12, X1 + PXOR X13, X4 + PXOR X14, X7 + PXOR X15, X10 + MOVOU X1, 64(DI) + MOVOU X4, 80(DI) + MOVOU X7, 96(DI) + MOVOU X10, 112(DI) + SUBQ $0x80, BX + LEAQ 128(SI), SI + LEAQ 128(DI), DI + JMP openSSETail64DecLoop + openSSETail256: - // Need to decrypt up to 256 
bytes - prepare four blocks - MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0 - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 - MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 + MOVO ·chacha20Constants<>+0(SB), X0 + MOVO 32(BP), X3 + MOVO 48(BP), X6 + MOVO 128(BP), X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X1, X2 + MOVO X4, X5 + MOVO X7, X8 + MOVO X10, X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X2, X12 + MOVO X5, X13 + MOVO X8, X14 + MOVO X11, X15 + PADDL ·sseIncMask<>+0(SB), X15 // Store counters - MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store - XORQ itr2, itr2 + MOVO X9, 80(BP) + MOVO X10, 96(BP) + MOVO X11, 112(BP) + MOVO X15, 128(BP) + XORQ R9, R9 openSSETail256Loop: - // This loop inteleaves 8 ChaCha quarter rounds with 1 poly multiplication - polyAdd(0(inp)(itr2*1)) - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left - shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left - shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left - polyMulStage1 - polyMulStage2 - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - polyMulStage3 - polyMulReduceStage - shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right - shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right - shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right - ADDQ $2*8, itr2 - CMPQ itr2, $160 - JB openSSETail256Loop - MOVQ inl, itr1 - ANDQ $-16, itr1 + ADDQ (SI)(R9*1), R10 + ADCQ 8(SI)(R9*1), R11 + ADCQ $0x01, R12 + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + 
BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x0c + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + 
BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x04 + ADDQ $0x10, R9 + CMPQ R9, $0xa0 + JB openSSETail256Loop + MOVQ BX, CX + ANDQ $-16, CX openSSETail256HashLoop: - polyAdd(0(inp)(itr2*1)) - polyMul - ADDQ $2*8, itr2 - CMPQ itr2, itr1 - JB openSSETail256HashLoop + ADDQ (SI)(R9*1), R10 + ADCQ 8(SI)(R9*1), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + ADDQ $0x10, R9 + CMPQ R9, CX + JB openSSETail256HashLoop // Add in the state - PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 - PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 - PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 - PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 - MOVO D3, tmpStore + PADDD ·chacha20Constants<>+0(SB), X0 + PADDD ·chacha20Constants<>+0(SB), X1 + PADDD ·chacha20Constants<>+0(SB), X2 + PADDD ·chacha20Constants<>+0(SB), X12 + PADDD 32(BP), X3 + PADDD 32(BP), X4 + PADDD 32(BP), X5 + PADDD 32(BP), X13 + PADDD 48(BP), X6 + PADDD 48(BP), X7 + PADDD 48(BP), X8 + PADDD 48(BP), X14 + PADDD 80(BP), X9 + PADDD 96(BP), X10 + PADDD 112(BP), X11 + PADDD 128(BP), X15 + MOVO X15, 64(BP) // Load - xor - store - MOVOU (0*16)(inp), D3; PXOR D3, A0 - MOVOU (1*16)(inp), D3; PXOR D3, B0 - MOVOU (2*16)(inp), D3; PXOR D3, C0 - MOVOU (3*16)(inp), D3; PXOR D3, D0 - MOVOU A0, (0*16)(oup) - MOVOU B0, (1*16)(oup) - MOVOU C0, (2*16)(oup) - MOVOU D0, (3*16)(oup) - MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0 - PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1 - MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) - MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0 - PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2 - MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup) - LEAQ 192(inp), inp - LEAQ 192(oup), oup - SUBQ $192, inl - MOVO A3, A0 - MOVO B3, B0 - MOVO C3, C0 - MOVO tmpStore, D0 - - JMP openSSETail64DecLoop - -// ---------------------------------------------------------------------------- -// ------------------------- AVX2 Code ---------------------------------------- + MOVOU (SI), X15 + PXOR X15, X0 + MOVOU 16(SI), X15 + PXOR X15, X3 + MOVOU 32(SI), X15 + PXOR X15, X6 + MOVOU 48(SI), X15 + PXOR X15, X9 + MOVOU X0, (DI) + MOVOU X3, 16(DI) + MOVOU X6, 32(DI) + MOVOU X9, 48(DI) + MOVOU 64(SI), X0 + MOVOU 80(SI), X3 + MOVOU 96(SI), X6 + MOVOU 112(SI), X9 + PXOR X0, X1 + PXOR X3, X4 + PXOR X6, X7 + PXOR X9, X10 + MOVOU X1, 64(DI) + MOVOU X4, 80(DI) + MOVOU X7, 96(DI) + MOVOU X10, 112(DI) + MOVOU 128(SI), X0 + MOVOU 144(SI), X3 + MOVOU 160(SI), X6 + MOVOU 176(SI), X9 + PXOR X0, X2 + PXOR X3, X5 + PXOR X6, X8 + PXOR X9, X11 + MOVOU X2, 128(DI) + MOVOU X5, 
144(DI) + MOVOU X8, 160(DI) + MOVOU X11, 176(DI) + LEAQ 192(SI), SI + LEAQ 192(DI), DI + SUBQ $0xc0, BX + MOVO X12, X0 + MOVO X13, X3 + MOVO X14, X6 + MOVO 64(BP), X9 + JMP openSSETail64DecLoop + chacha20Poly1305Open_AVX2: VZEROUPPER - VMOVDQU ·chacha20Constants<>(SB), AA0 - BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14 - BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12 - BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4 - VPADDD ·avx2InitMask<>(SB), DD0, DD0 + VMOVDQU ·chacha20Constants<>+0(SB), Y0 + BYTE $0xc4 + BYTE $0x42 + BYTE $0x7d + BYTE $0x5a + BYTE $0x70 + BYTE $0x10 + BYTE $0xc4 + BYTE $0x42 + BYTE $0x7d + BYTE $0x5a + BYTE $0x60 + BYTE $0x20 + BYTE $0xc4 + BYTE $0xc2 + BYTE $0x7d + BYTE $0x5a + BYTE $0x60 + BYTE $0x30 + VPADDD ·avx2InitMask<>+0(SB), Y4, Y4 // Special optimization, for very short buffers - CMPQ inl, $192 + CMPQ BX, $0xc0 JBE openAVX2192 - CMPQ inl, $320 + CMPQ BX, $0x00000140 JBE openAVX2320 // For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream - VMOVDQA BB0, state1StoreAVX2 - VMOVDQA CC0, state2StoreAVX2 - VMOVDQA DD0, ctr3StoreAVX2 - MOVQ $10, itr2 + VMOVDQA Y14, 32(BP) + VMOVDQA Y12, 64(BP) + VMOVDQA Y4, 192(BP) + MOVQ $0x0000000a, R9 openAVX2PreparePolyKey: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0 - DECQ itr2 - JNE openAVX2PreparePolyKey - - VPADDD ·chacha20Constants<>(SB), AA0, AA0 - VPADDD state1StoreAVX2, BB0, BB0 - VPADDD state2StoreAVX2, CC0, CC0 - VPADDD ctr3StoreAVX2, DD0, DD0 - - VPERM2I128 $0x02, AA0, BB0, TT0 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x04, Y4, Y4, Y4 + DECQ R9 + JNE openAVX2PreparePolyKey + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD 32(BP), Y14, Y14 + VPADDD 64(BP), Y12, Y12 + VPADDD 192(BP), Y4, Y4 + VPERM2I128 $0x02, Y0, Y14, Y3 // Clamp and store poly key - VPAND ·polyClampMask<>(SB), TT0, TT0 - VMOVDQA TT0, rsStoreAVX2 + VPAND ·polyClampMask<>+0(SB), Y3, Y3 + VMOVDQA Y3, (BP) // Stream for the first 64 bytes - VPERM2I128 $0x13, AA0, BB0, AA0 - VPERM2I128 $0x13, CC0, DD0, BB0 + VPERM2I128 $0x13, Y0, Y14, Y0 + VPERM2I128 $0x13, Y12, Y4, Y14 // Hash AD + first 64 bytes - MOVQ ad_len+80(FP), itr2 + MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) - XORQ itr1, itr1 + XORQ CX, CX openAVX2InitialHash64: - polyAdd(0(inp)(itr1*1)) - polyMulAVX2 - ADDQ $16, itr1 - 
CMPQ itr1, $64 - JNE openAVX2InitialHash64 + ADDQ (SI)(CX*1), R10 + ADCQ 8(SI)(CX*1), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + ADDQ $0x10, CX + CMPQ CX, $0x40 + JNE openAVX2InitialHash64 // Decrypt the first 64 bytes - VPXOR (0*32)(inp), AA0, AA0 - VPXOR (1*32)(inp), BB0, BB0 - VMOVDQU AA0, (0*32)(oup) - VMOVDQU BB0, (1*32)(oup) - LEAQ (2*32)(inp), inp - LEAQ (2*32)(oup), oup - SUBQ $64, inl + VPXOR (SI), Y0, Y0 + VPXOR 32(SI), Y14, Y14 + VMOVDQU Y0, (DI) + VMOVDQU Y14, 32(DI) + LEAQ 64(SI), SI + LEAQ 64(DI), DI + SUBQ $0x40, BX openAVX2MainLoop: - CMPQ inl, $512 + CMPQ BX, $0x00000200 JB openAVX2MainLoopDone // Load state, increment counter blocks, store the incremented counters - VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 - VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 - VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 - XORQ itr1, itr1 + VMOVDQU ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA Y0, Y7 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA Y14, Y11 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA Y12, Y15 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 + VMOVDQA Y4, 96(BP) + VMOVDQA Y1, 128(BP) + VMOVDQA Y2, 160(BP) + VMOVDQA Y3, 192(BP) + XORQ CX, CX openAVX2InternalLoop: - // Lets just say this spaghetti loop interleaves 2 quarter rounds with 3 poly multiplications - // Effectively per 512 bytes of stream we hash 480 bytes of ciphertext - polyAdd(0*8(inp)(itr1*1)) - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - polyMulStage1_AVX2 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - polyMulStage2_AVX2 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - polyMulStage3_AVX2 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulReduceStage - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR 
AA3, DD3, DD3 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - polyAdd(2*8(inp)(itr1*1)) - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - polyMulStage1_AVX2 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulStage2_AVX2 - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - polyMulStage3_AVX2 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - polyMulReduceStage - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - polyAdd(4*8(inp)(itr1*1)) - LEAQ (6*8)(itr1), itr1 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulStage1_AVX2 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - polyMulStage2_AVX2 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - polyMulStage3_AVX2 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulReduceStage - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 - CMPQ itr1, $480 + ADDQ (SI)(CX*1), R10 + ADCQ 8(SI)(CX*1), R11 + ADCQ $0x01, R12 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB 
·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + ADDQ 16(SI)(CX*1), R10 + ADCQ 24(SI)(CX*1), R11 + ADCQ $0x01, R12 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x04, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPALIGNR $0x0c, Y3, Y3, Y3 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + ADDQ 32(SI)(CX*1), R10 + ADCQ 40(SI)(CX*1), R11 + ADCQ $0x01, R12 + LEAQ 48(CX), CX + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD 
$0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x0c, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + VPALIGNR $0x04, Y3, Y3, Y3 + CMPQ CX, $0x000001e0 JNE openAVX2InternalLoop - - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 - VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 - VMOVDQA CC3, tmpStoreAVX2 + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 32(BP), Y11, Y11 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD 64(BP), Y15, Y15 + VPADDD 96(BP), Y4, Y4 + VPADDD 128(BP), Y1, Y1 + VPADDD 160(BP), Y2, Y2 + VPADDD 192(BP), Y3, Y3 + VMOVDQA Y15, 224(BP) // We only hashed 480 of the 512 bytes available - hash the remaining 32 here - polyAdd(480(inp)) - polyMulAVX2 - VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0 - VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0 - VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup) - VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 - VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; 
VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 - VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) + ADDQ 480(SI), R10 + ADCQ 488(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPERM2I128 $0x02, Y0, Y14, Y15 + VPERM2I128 $0x13, Y0, Y14, Y14 + VPERM2I128 $0x02, Y12, Y4, Y0 + VPERM2I128 $0x13, Y12, Y4, Y12 + VPXOR (SI), Y15, Y15 + VPXOR 32(SI), Y0, Y0 + VPXOR 64(SI), Y14, Y14 + VPXOR 96(SI), Y12, Y12 + VMOVDQU Y15, (DI) + VMOVDQU Y0, 32(DI) + VMOVDQU Y14, 64(DI) + VMOVDQU Y12, 96(DI) + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 + VPXOR 128(SI), Y0, Y0 + VPXOR 160(SI), Y14, Y14 + VPXOR 192(SI), Y12, Y12 + VPXOR 224(SI), Y4, Y4 + VMOVDQU Y0, 128(DI) + VMOVDQU Y14, 160(DI) + VMOVDQU Y12, 192(DI) + VMOVDQU Y4, 224(DI) // and here - polyAdd(496(inp)) - polyMulAVX2 - VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 - VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 - VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) - VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 - VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0 - VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup) - LEAQ (32*16)(inp), inp - LEAQ (32*16)(oup), oup - SUBQ $(32*16), inl + ADDQ 496(SI), R10 + ADCQ 504(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + VPXOR 256(SI), Y0, Y0 + VPXOR 288(SI), Y14, Y14 + VPXOR 320(SI), Y12, Y12 + VPXOR 352(SI), Y4, Y4 + VMOVDQU Y0, 256(DI) + VMOVDQU Y14, 288(DI) + VMOVDQU Y12, 320(DI) + VMOVDQU Y4, 352(DI) + VPERM2I128 $0x02, Y7, Y11, Y0 + VPERM2I128 $0x02, 224(BP), Y3, Y14 + VPERM2I128 $0x13, Y7, Y11, Y12 + VPERM2I128 $0x13, 224(BP), Y3, Y4 + VPXOR 384(SI), Y0, Y0 + VPXOR 416(SI), Y14, Y14 + VPXOR 448(SI), Y12, Y12 + VPXOR 480(SI), Y4, Y4 + VMOVDQU Y0, 384(DI) + VMOVDQU Y14, 416(DI) + VMOVDQU Y12, 448(DI) + VMOVDQU Y4, 480(DI) + LEAQ 512(SI), SI + LEAQ 512(DI), DI + SUBQ $0x00000200, BX JMP openAVX2MainLoop 
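(Aside, not part of the diff: the MULXQ/ADCQ blocks interleaved through the AVX2 main loop above, and the equivalent MULQ sequences in the SSE paths, are the fully expanded form of the old polyAdd/polyMul macros. Each pair folds 16 bytes of ciphertext into the Poly1305 accumulator and multiplies by the clamped key r, reducing modulo 2^130-5. A scalar Go sketch of one such step, with hypothetical names, is shown below; h is three 64-bit limbs with h[2] only a few bits wide:)

package poly1305sketch // illustrative only, not code from this CL

import (
	"encoding/binary"
	"math/bits"
)

// polyAdd folds one 16-byte block into the accumulator, with the 0x01 pad bit
// (the ADDQ/ADCQ/ADCQ $0x01 triples in the assembly).
func polyAdd(h *[3]uint64, block []byte) {
	var c uint64
	h[0], c = bits.Add64(h[0], binary.LittleEndian.Uint64(block[0:8]), 0)
	h[1], c = bits.Add64(h[1], binary.LittleEndian.Uint64(block[8:16]), c)
	h[2] += c + 1
}

// polyMul multiplies the accumulator by the clamped key r and reduces
// modulo 2^130-5.
func polyMul(h *[3]uint64, r [2]uint64) {
	h0, h1, h2 := h[0], h[1], h[2]
	r0, r1 := r[0], r[1]

	// Schoolbook multiply. Because r is clamped, r0*h2 and r1*h2 fit in a
	// single limb and the whole product fits in four limbs t0..t3.
	t1, t0 := bits.Mul64(r0, h0)
	d1, d0 := bits.Mul64(r0, h1)
	t2 := r0 * h2
	var c uint64
	t1, c = bits.Add64(t1, d0, 0)
	t2, _ = bits.Add64(t2, d1, c)

	e1, e0 := bits.Mul64(r1, h0)
	t3 := r1 * h2
	t1, c = bits.Add64(t1, e0, 0)
	e1 += c
	f1, f0 := bits.Mul64(r1, h1)
	t2, c = bits.Add64(t2, f0, 0)
	f1 += c
	t2, c = bits.Add64(t2, e1, 0)
	t3, _ = bits.Add64(t3, f1, c)

	// Reduce: 2^130 is congruent to 5 mod 2^130-5, so the bits above 2^130
	// are folded back in as 4*q + q -- the ANDQ $-4 / SHRQ $0x02 pair above.
	h0, h1, h2 = t0, t1, t2&3
	h0, c = bits.Add64(h0, t2&^3, 0)
	h1, c = bits.Add64(h1, t3, c)
	h2 += c
	h0, c = bits.Add64(h0, t2>>2|t3<<62, 0)
	h1, c = bits.Add64(h1, t3>>2, c)
	h2 += c

	h[0], h[1], h[2] = h0, h1, h2
}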
openAVX2MainLoopDone: // Handle the various tail sizes efficiently - TESTQ inl, inl + TESTQ BX, BX JE openSSEFinalize - CMPQ inl, $128 + CMPQ BX, $0x80 JBE openAVX2Tail128 - CMPQ inl, $256 + CMPQ BX, $0x00000100 JBE openAVX2Tail256 - CMPQ inl, $384 + CMPQ BX, $0x00000180 JBE openAVX2Tail384 JMP openAVX2Tail512 -// ---------------------------------------------------------------------------- -// Special optimization for buffers smaller than 193 bytes openAVX2192: - // For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks - VMOVDQA AA0, AA1 - VMOVDQA BB0, BB1 - VMOVDQA CC0, CC1 - VPADDD ·avx2IncMask<>(SB), DD0, DD1 - VMOVDQA AA0, AA2 - VMOVDQA BB0, BB2 - VMOVDQA CC0, CC2 - VMOVDQA DD0, DD2 - VMOVDQA DD1, TT3 - MOVQ $10, itr2 + VMOVDQA Y0, Y5 + VMOVDQA Y14, Y9 + VMOVDQA Y12, Y13 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y0, Y6 + VMOVDQA Y14, Y10 + VMOVDQA Y12, Y8 + VMOVDQA Y4, Y2 + VMOVDQA Y1, Y15 + MOVQ $0x0000000a, R9 openAVX2192InnerCipherLoop: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 - DECQ itr2 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + DECQ R9 JNE openAVX2192InnerCipherLoop - VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1 - VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1 - VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1 - VPADDD DD2, DD0, DD0; VPADDD 
TT3, DD1, DD1 - VPERM2I128 $0x02, AA0, BB0, TT0 + VPADDD Y6, Y0, Y0 + VPADDD Y6, Y5, Y5 + VPADDD Y10, Y14, Y14 + VPADDD Y10, Y9, Y9 + VPADDD Y8, Y12, Y12 + VPADDD Y8, Y13, Y13 + VPADDD Y2, Y4, Y4 + VPADDD Y15, Y1, Y1 + VPERM2I128 $0x02, Y0, Y14, Y3 // Clamp and store poly key - VPAND ·polyClampMask<>(SB), TT0, TT0 - VMOVDQA TT0, rsStoreAVX2 + VPAND ·polyClampMask<>+0(SB), Y3, Y3 + VMOVDQA Y3, (BP) // Stream for up to 192 bytes - VPERM2I128 $0x13, AA0, BB0, AA0 - VPERM2I128 $0x13, CC0, DD0, BB0 - VPERM2I128 $0x02, AA1, BB1, CC0 - VPERM2I128 $0x02, CC1, DD1, DD0 - VPERM2I128 $0x13, AA1, BB1, AA1 - VPERM2I128 $0x13, CC1, DD1, BB1 + VPERM2I128 $0x13, Y0, Y14, Y0 + VPERM2I128 $0x13, Y12, Y4, Y14 + VPERM2I128 $0x02, Y5, Y9, Y12 + VPERM2I128 $0x02, Y13, Y1, Y4 + VPERM2I128 $0x13, Y5, Y9, Y5 + VPERM2I128 $0x13, Y13, Y1, Y9 openAVX2ShortOpen: // Hash - MOVQ ad_len+80(FP), itr2 + MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) openAVX2ShortOpenLoop: - CMPQ inl, $32 + CMPQ BX, $0x20 JB openAVX2ShortTail32 - SUBQ $32, inl + SUBQ $0x20, BX // Load for hashing - polyAdd(0*8(inp)) - polyMulAVX2 - polyAdd(2*8(inp)) - polyMulAVX2 + ADDQ (SI), R10 + ADCQ 8(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + ADDQ 16(SI), R10 + ADCQ 24(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 // Load for decryption - VPXOR (inp), AA0, AA0 - VMOVDQU AA0, (oup) - LEAQ (1*32)(inp), inp - LEAQ (1*32)(oup), oup + VPXOR (SI), Y0, Y0 + VMOVDQU Y0, (DI) + LEAQ 32(SI), SI + LEAQ 32(DI), DI // Shift stream left - VMOVDQA BB0, AA0 - VMOVDQA CC0, BB0 - VMOVDQA DD0, CC0 - VMOVDQA AA1, DD0 - VMOVDQA BB1, AA1 - VMOVDQA CC1, BB1 - VMOVDQA DD1, CC1 - VMOVDQA AA2, DD1 - VMOVDQA BB2, AA2 + VMOVDQA Y14, Y0 + VMOVDQA Y12, Y14 + VMOVDQA Y4, Y12 + VMOVDQA Y5, Y4 + VMOVDQA Y9, Y5 + VMOVDQA Y13, Y9 + VMOVDQA Y1, Y13 + VMOVDQA Y6, Y1 + VMOVDQA Y10, Y6 JMP openAVX2ShortOpenLoop openAVX2ShortTail32: - CMPQ inl, $16 - VMOVDQA A0, A1 + CMPQ BX, $0x10 + VMOVDQA X0, X1 JB openAVX2ShortDone - - SUBQ $16, inl + SUBQ $0x10, BX // Load for hashing - polyAdd(0*8(inp)) - polyMulAVX2 + ADDQ (SI), R10 + ADCQ 8(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 
+ ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 // Load for decryption - VPXOR (inp), A0, T0 - VMOVDQU T0, (oup) - LEAQ (1*16)(inp), inp - LEAQ (1*16)(oup), oup - VPERM2I128 $0x11, AA0, AA0, AA0 - VMOVDQA A0, A1 + VPXOR (SI), X0, X12 + VMOVDQU X12, (DI) + LEAQ 16(SI), SI + LEAQ 16(DI), DI + VPERM2I128 $0x11, Y0, Y0, Y0 + VMOVDQA X0, X1 openAVX2ShortDone: VZEROUPPER JMP openSSETail16 -// ---------------------------------------------------------------------------- -// Special optimization for buffers smaller than 321 bytes openAVX2320: - // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks - VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1 - VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2 - VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3 - MOVQ $10, itr2 + VMOVDQA Y0, Y5 + VMOVDQA Y14, Y9 + VMOVDQA Y12, Y13 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y0, Y6 + VMOVDQA Y14, Y10 + VMOVDQA Y12, Y8 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VMOVDQA Y14, Y7 + VMOVDQA Y12, Y11 + VMOVDQA Y4, Y15 + MOVQ $0x0000000a, R9 openAVX2320InnerCipherLoop: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 - DECQ itr2 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, 
Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + DECQ R9 JNE openAVX2320InnerCipherLoop - - VMOVDQA ·chacha20Constants<>(SB), TT0 - VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2 - VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2 - VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2 - VMOVDQA ·avx2IncMask<>(SB), TT0 - VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3 - VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3 - VPADDD TT3, DD2, DD2 + VMOVDQA ·chacha20Constants<>+0(SB), Y3 + VPADDD Y3, Y0, Y0 + VPADDD Y3, Y5, Y5 + VPADDD Y3, Y6, Y6 + VPADDD Y7, Y14, Y14 + VPADDD Y7, Y9, Y9 + VPADDD Y7, Y10, Y10 + VPADDD Y11, Y12, Y12 + VPADDD Y11, Y13, Y13 + VPADDD Y11, Y8, Y8 + VMOVDQA ·avx2IncMask<>+0(SB), Y3 + VPADDD Y15, Y4, Y4 + VPADDD Y3, Y15, Y15 + VPADDD Y15, Y1, Y1 + VPADDD Y3, Y15, Y15 + VPADDD Y15, Y2, Y2 // Clamp and store poly key - VPERM2I128 $0x02, AA0, BB0, TT0 - VPAND ·polyClampMask<>(SB), TT0, TT0 - VMOVDQA TT0, rsStoreAVX2 + VPERM2I128 $0x02, Y0, Y14, Y3 + VPAND ·polyClampMask<>+0(SB), Y3, Y3 + VMOVDQA Y3, (BP) // Stream for up to 320 bytes - VPERM2I128 $0x13, AA0, BB0, AA0 - VPERM2I128 $0x13, CC0, DD0, BB0 - VPERM2I128 $0x02, AA1, BB1, CC0 - VPERM2I128 $0x02, CC1, DD1, DD0 - VPERM2I128 $0x13, AA1, BB1, AA1 - VPERM2I128 $0x13, CC1, DD1, BB1 - VPERM2I128 $0x02, AA2, BB2, CC1 - VPERM2I128 $0x02, CC2, DD2, DD1 - VPERM2I128 $0x13, AA2, BB2, AA2 - VPERM2I128 $0x13, CC2, DD2, BB2 + VPERM2I128 $0x13, Y0, Y14, Y0 + VPERM2I128 $0x13, Y12, Y4, Y14 + VPERM2I128 $0x02, Y5, Y9, Y12 + VPERM2I128 $0x02, Y13, Y1, Y4 + VPERM2I128 $0x13, Y5, Y9, Y5 + VPERM2I128 $0x13, Y13, Y1, Y9 + VPERM2I128 $0x02, Y6, Y10, Y13 + VPERM2I128 $0x02, Y8, Y2, Y1 + VPERM2I128 $0x13, Y6, Y10, Y6 + VPERM2I128 $0x13, Y8, Y2, Y10 JMP openAVX2ShortOpen -// ---------------------------------------------------------------------------- -// Special optimization for the last 128 bytes of ciphertext openAVX2Tail128: // Need to decrypt up to 128 bytes - prepare two blocks - VMOVDQA ·chacha20Constants<>(SB), AA1 - VMOVDQA state1StoreAVX2, BB1 - VMOVDQA state2StoreAVX2, CC1 - VMOVDQA ctr3StoreAVX2, DD1 - VPADDD ·avx2IncMask<>(SB), DD1, DD1 - VMOVDQA DD1, DD0 - - XORQ itr2, itr2 - MOVQ inl, itr1 - ANDQ $-16, itr1 - TESTQ itr1, itr1 - JE openAVX2Tail128LoopB + VMOVDQA ·chacha20Constants<>+0(SB), Y5 + VMOVDQA 32(BP), Y9 + VMOVDQA 64(BP), Y13 + VMOVDQA 192(BP), Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y1 + 
VMOVDQA Y1, Y4 + XORQ R9, R9 + MOVQ BX, CX + ANDQ $-16, CX + TESTQ CX, CX + JE openAVX2Tail128LoopB openAVX2Tail128LoopA: - // Perform ChaCha rounds, while hashing the remaining input - polyAdd(0(inp)(itr2*1)) - polyMulAVX2 + ADDQ (SI)(R9*1), R10 + ADCQ 8(SI)(R9*1), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 openAVX2Tail128LoopB: - ADDQ $16, itr2 - chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $4, BB1, BB1, BB1 - VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $12, DD1, DD1, DD1 - chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $12, BB1, BB1, BB1 - VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $4, DD1, DD1, DD1 - CMPQ itr2, itr1 - JB openAVX2Tail128LoopA - CMPQ itr2, $160 - JNE openAVX2Tail128LoopB - - VPADDD ·chacha20Constants<>(SB), AA1, AA1 - VPADDD state1StoreAVX2, BB1, BB1 - VPADDD state2StoreAVX2, CC1, CC1 - VPADDD DD0, DD1, DD1 - VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 + ADDQ $0x10, R9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y1, Y1, Y1 + CMPQ R9, CX + JB openAVX2Tail128LoopA + CMPQ R9, $0xa0 + JNE openAVX2Tail128LoopB + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD 32(BP), Y9, Y9 + VPADDD 64(BP), Y13, Y13 + VPADDD Y4, Y1, Y1 + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 openAVX2TailLoop: - CMPQ inl, $32 + CMPQ BX, $0x20 JB openAVX2Tail - SUBQ $32, inl + SUBQ $0x20, BX // Load for decryption - VPXOR (inp), AA0, AA0 - VMOVDQU AA0, (oup) - LEAQ (1*32)(inp), inp - LEAQ (1*32)(oup), oup - VMOVDQA BB0, AA0 - VMOVDQA CC0, BB0 - VMOVDQA DD0, CC0 + VPXOR (SI), Y0, Y0 + VMOVDQU Y0, (DI) + LEAQ 32(SI), SI + LEAQ 32(DI), DI + VMOVDQA Y14, Y0 + VMOVDQA Y12, Y14 + VMOVDQA Y4, Y12 JMP openAVX2TailLoop openAVX2Tail: - CMPQ inl, $16 - VMOVDQA A0, A1 + CMPQ BX, $0x10 + VMOVDQA X0, X1 JB openAVX2TailDone - SUBQ $16, inl + SUBQ $0x10, BX // Load for decryption - VPXOR (inp), A0, T0 - VMOVDQU T0, (oup) - LEAQ (1*16)(inp), inp - LEAQ (1*16)(oup), oup - VPERM2I128 $0x11, AA0, AA0, AA0 - VMOVDQA A0, A1 + VPXOR (SI), X0, X12 + VMOVDQU X12, (DI) + LEAQ 16(SI), SI + LEAQ 16(DI), DI + VPERM2I128 $0x11, Y0, Y0, Y0 + VMOVDQA X0, 
X1 openAVX2TailDone: VZEROUPPER JMP openSSETail16 -// ---------------------------------------------------------------------------- -// Special optimization for the last 256 bytes of ciphertext openAVX2Tail256: - // Need to decrypt up to 256 bytes - prepare four blocks - VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1 - VMOVDQA ctr3StoreAVX2, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD1 - VMOVDQA DD0, TT1 - VMOVDQA DD1, TT2 + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y4, Y7 + VMOVDQA Y1, Y11 // Compute the number of iterations that will hash data - MOVQ inl, tmpStoreAVX2 - MOVQ inl, itr1 - SUBQ $128, itr1 - SHRQ $4, itr1 - MOVQ $10, itr2 - CMPQ itr1, $10 - CMOVQGT itr2, itr1 - MOVQ inp, inl - XORQ itr2, itr2 + MOVQ BX, 224(BP) + MOVQ BX, CX + SUBQ $0x80, CX + SHRQ $0x04, CX + MOVQ $0x0000000a, R9 + CMPQ CX, $0x0a + CMOVQGT R9, CX + MOVQ SI, BX + XORQ R9, R9 openAVX2Tail256LoopA: - polyAdd(0(inl)) - polyMulAVX2 - LEAQ 16(inl), inl + ADDQ (BX), R10 + ADCQ 8(BX), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(BX), BX - // Perform ChaCha rounds, while hashing the remaining input openAVX2Tail256LoopB: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 - INCQ itr2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 - CMPQ itr2, itr1 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + INCQ R9 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD 
$0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + CMPQ R9, CX JB openAVX2Tail256LoopA + CMPQ R9, $0x0a + JNE openAVX2Tail256LoopB + MOVQ BX, R9 + SUBQ SI, BX + MOVQ BX, CX + MOVQ 224(BP), BX - CMPQ itr2, $10 - JNE openAVX2Tail256LoopB - - MOVQ inl, itr2 - SUBQ inp, inl - MOVQ inl, itr1 - MOVQ tmpStoreAVX2, inl - - // Hash the remainder of data (if any) openAVX2Tail256Hash: - ADDQ $16, itr1 - CMPQ itr1, inl - JGT openAVX2Tail256HashEnd - polyAdd (0(itr2)) - polyMulAVX2 - LEAQ 16(itr2), itr2 - JMP openAVX2Tail256Hash - -// Store 128 bytes safely, then go to store loop + ADDQ $0x10, CX + CMPQ CX, BX + JGT openAVX2Tail256HashEnd + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(R9), R9 + JMP openAVX2Tail256Hash + openAVX2Tail256HashEnd: - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1 - VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1 - VPERM2I128 $0x02, AA0, BB0, AA2; VPERM2I128 $0x02, CC0, DD0, BB2; VPERM2I128 $0x13, AA0, BB0, CC2; VPERM2I128 $0x13, CC0, DD0, DD2 - VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 - - VPXOR (0*32)(inp), AA2, AA2; VPXOR (1*32)(inp), BB2, BB2; VPXOR (2*32)(inp), CC2, CC2; VPXOR (3*32)(inp), DD2, DD2 - VMOVDQU AA2, (0*32)(oup); VMOVDQU BB2, (1*32)(oup); VMOVDQU CC2, (2*32)(oup); VMOVDQU DD2, (3*32)(oup) - LEAQ (4*32)(inp), inp - LEAQ (4*32)(oup), oup - SUBQ $4*32, inl - - JMP openAVX2TailLoop - -// ---------------------------------------------------------------------------- -// Special optimization for the last 384 bytes of ciphertext + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD Y7, Y4, Y4 + VPADDD Y11, Y1, Y1 + VPERM2I128 $0x02, Y0, Y14, Y6 + VPERM2I128 $0x02, Y12, Y4, Y10 + VPERM2I128 $0x13, Y0, Y14, Y8 + VPERM2I128 $0x13, Y12, Y4, Y2 + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 + VPXOR (SI), Y6, Y6 + VPXOR 32(SI), Y10, Y10 + 
VPXOR 64(SI), Y8, Y8 + VPXOR 96(SI), Y2, Y2 + VMOVDQU Y6, (DI) + VMOVDQU Y10, 32(DI) + VMOVDQU Y8, 64(DI) + VMOVDQU Y2, 96(DI) + LEAQ 128(SI), SI + LEAQ 128(DI), DI + SUBQ $0x80, BX + JMP openAVX2TailLoop + openAVX2Tail384: // Need to decrypt up to 384 bytes - prepare six blocks - VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2 - VMOVDQA ctr3StoreAVX2, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD1 - VPADDD ·avx2IncMask<>(SB), DD1, DD2 - VMOVDQA DD0, ctr0StoreAVX2 - VMOVDQA DD1, ctr1StoreAVX2 - VMOVDQA DD2, ctr2StoreAVX2 + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VMOVDQA Y4, 96(BP) + VMOVDQA Y1, 128(BP) + VMOVDQA Y2, 160(BP) // Compute the number of iterations that will hash two blocks of data - MOVQ inl, tmpStoreAVX2 - MOVQ inl, itr1 - SUBQ $256, itr1 - SHRQ $4, itr1 - ADDQ $6, itr1 - MOVQ $10, itr2 - CMPQ itr1, $10 - CMOVQGT itr2, itr1 - MOVQ inp, inl - XORQ itr2, itr2 - - // Perform ChaCha rounds, while hashing the remaining input + MOVQ BX, 224(BP) + MOVQ BX, CX + SUBQ $0x00000100, CX + SHRQ $0x04, CX + ADDQ $0x06, CX + MOVQ $0x0000000a, R9 + CMPQ CX, $0x0a + CMOVQGT R9, CX + MOVQ SI, BX + XORQ R9, R9 + openAVX2Tail384LoopB: - polyAdd(0(inl)) - polyMulAVX2 - LEAQ 16(inl), inl + ADDQ (BX), R10 + ADCQ 8(BX), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(BX), BX openAVX2Tail384LoopA: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 - polyAdd(0(inl)) - polyMulAVX2 - LEAQ 16(inl), inl - INCQ itr2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 - - CMPQ itr2, itr1 - JB openAVX2Tail384LoopB - - CMPQ itr2, $10 - JNE openAVX2Tail384LoopA - - MOVQ inl, itr2 - SUBQ inp, inl - MOVQ inl, itr1 - MOVQ tmpStoreAVX2, inl + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + 
VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + ADDQ (BX), R10 + ADCQ 8(BX), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(BX), BX + INCQ R9 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + CMPQ R9, CX + JB openAVX2Tail384LoopB + CMPQ R9, $0x0a + JNE openAVX2Tail384LoopA + MOVQ BX, R9 + SUBQ SI, BX + MOVQ BX, CX + MOVQ 224(BP), BX openAVX2Tail384Hash: - ADDQ $16, itr1 - CMPQ itr1, inl - JGT openAVX2Tail384HashEnd - polyAdd(0(itr2)) - polyMulAVX2 - LEAQ 16(itr2), itr2 - JMP openAVX2Tail384Hash - -// Store 256 bytes safely, then go to store loop + ADDQ $0x10, CX + CMPQ CX, BX + JGT openAVX2Tail384HashEnd + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, 
R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(R9), R9 + JMP openAVX2Tail384Hash + openAVX2Tail384HashEnd: - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2 - VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2 - VPERM2I128 $0x02, AA0, BB0, TT0; VPERM2I128 $0x02, CC0, DD0, TT1; VPERM2I128 $0x13, AA0, BB0, TT2; VPERM2I128 $0x13, CC0, DD0, TT3 - VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3 - VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup) - VPERM2I128 $0x02, AA1, BB1, TT0; VPERM2I128 $0x02, CC1, DD1, TT1; VPERM2I128 $0x13, AA1, BB1, TT2; VPERM2I128 $0x13, CC1, DD1, TT3 - VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3 - VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup) - VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 - LEAQ (8*32)(inp), inp - LEAQ (8*32)(oup), oup - SUBQ $8*32, inl + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD 96(BP), Y4, Y4 + VPADDD 128(BP), Y1, Y1 + VPADDD 160(BP), Y2, Y2 + VPERM2I128 $0x02, Y0, Y14, Y3 + VPERM2I128 $0x02, Y12, Y4, Y7 + VPERM2I128 $0x13, Y0, Y14, Y11 + VPERM2I128 $0x13, Y12, Y4, Y15 + VPXOR (SI), Y3, Y3 + VPXOR 32(SI), Y7, Y7 + VPXOR 64(SI), Y11, Y11 + VPXOR 96(SI), Y15, Y15 + VMOVDQU Y3, (DI) + VMOVDQU Y7, 32(DI) + VMOVDQU Y11, 64(DI) + VMOVDQU Y15, 96(DI) + VPERM2I128 $0x02, Y5, Y9, Y3 + VPERM2I128 $0x02, Y13, Y1, Y7 + VPERM2I128 $0x13, Y5, Y9, Y11 + VPERM2I128 $0x13, Y13, Y1, Y15 + VPXOR 128(SI), Y3, Y3 + VPXOR 160(SI), Y7, Y7 + VPXOR 192(SI), Y11, Y11 + VPXOR 224(SI), Y15, Y15 + VMOVDQU Y3, 128(DI) + VMOVDQU Y7, 160(DI) + VMOVDQU Y11, 192(DI) + VMOVDQU Y15, 224(DI) + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + LEAQ 256(SI), SI + LEAQ 256(DI), DI + SUBQ $0x00000100, BX JMP openAVX2TailLoop -// ---------------------------------------------------------------------------- -// Special optimization for the last 512 bytes of ciphertext openAVX2Tail512: - VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, 
CC3 - VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 - VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 - XORQ itr1, itr1 - MOVQ inp, itr2 + VMOVDQU ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA Y0, Y7 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA Y14, Y11 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA Y12, Y15 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 + VMOVDQA Y4, 96(BP) + VMOVDQA Y1, 128(BP) + VMOVDQA Y2, 160(BP) + VMOVDQA Y3, 192(BP) + XORQ CX, CX + MOVQ SI, R9 openAVX2Tail512LoopB: - polyAdd(0(itr2)) - polyMulAVX2 - LEAQ (2*8)(itr2), itr2 + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(R9), R9 openAVX2Tail512LoopA: - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyAdd(0*8(itr2)) - polyMulAVX2 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 - VPADDD BB0, AA0, AA0; 
VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - polyAdd(2*8(itr2)) - polyMulAVX2 - LEAQ (4*8)(itr2), itr2 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 - INCQ itr1 - CMPQ itr1, $4 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + 
VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x04, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPALIGNR $0x0c, Y3, Y3, Y3 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + ADDQ 16(R9), R10 + ADCQ 24(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(R9), R9 + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x0c, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + VPALIGNR $0x04, Y3, Y3, Y3 + INCQ CX + CMPQ CX, $0x04 JLT openAVX2Tail512LoopB 
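
The VPADDD/VPXOR/VPSHUFB/VPSLLD+VPSRLD runs in this tail loop are the ChaCha20 quarter round applied to eight 32-bit lanes per YMM register: the rol16 and rol8 byte-shuffle tables stand in for 16- and 8-bit rotates, while the shift/shift/xor triples implement the 12- and 7-bit rotates. A scalar Go sketch of the quarter round those vector sequences compute (the function name is illustrative, not part of the package):

package sketch

import "math/bits"

// quarterRound is the scalar ChaCha20 quarter round that the vectorized
// instruction groups above perform lane-by-lane.
func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
	a += b
	d ^= a
	d = bits.RotateLeft32(d, 16) // VPSHUFB with the rol16 mask
	c += d
	b ^= c
	b = bits.RotateLeft32(b, 12) // VPSLLD $0x0c / VPSRLD $0x14 / VPXOR
	a += b
	d ^= a
	d = bits.RotateLeft32(d, 8) // VPSHUFB with the rol8 mask
	c += d
	b ^= c
	b = bits.RotateLeft32(b, 7) // VPSLLD $0x07 / VPSRLD $0x19 / VPXOR
	return a, b, c, d
}
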
- - CMPQ itr1, $10 - JNE openAVX2Tail512LoopA - - MOVQ inl, itr1 - SUBQ $384, itr1 - ANDQ $-16, itr1 + CMPQ CX, $0x0a + JNE openAVX2Tail512LoopA + MOVQ BX, CX + SUBQ $0x00000180, CX + ANDQ $-16, CX openAVX2Tail512HashLoop: - TESTQ itr1, itr1 + TESTQ CX, CX JE openAVX2Tail512HashEnd - polyAdd(0(itr2)) - polyMulAVX2 - LEAQ 16(itr2), itr2 - SUBQ $16, itr1 + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(R9), R9 + SUBQ $0x10, CX JMP openAVX2Tail512HashLoop openAVX2Tail512HashEnd: - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 - VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 - VMOVDQA CC3, tmpStoreAVX2 - VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0 - VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0 - VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup) - VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 - VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 - VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) - VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 - VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 - VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) - VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 - - LEAQ (12*32)(inp), inp - LEAQ (12*32)(oup), oup - SUBQ $12*32, inl - - JMP openAVX2TailLoop - -// ---------------------------------------------------------------------------- -// ---------------------------------------------------------------------------- -// func chacha20Poly1305Seal(dst, key, src, ad []byte) -TEXT ·chacha20Poly1305Seal(SB), 0, $288-96 - // For aligned stack access + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 32(BP), Y11, Y11 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), 
Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD 64(BP), Y15, Y15 + VPADDD 96(BP), Y4, Y4 + VPADDD 128(BP), Y1, Y1 + VPADDD 160(BP), Y2, Y2 + VPADDD 192(BP), Y3, Y3 + VMOVDQA Y15, 224(BP) + VPERM2I128 $0x02, Y0, Y14, Y15 + VPERM2I128 $0x13, Y0, Y14, Y14 + VPERM2I128 $0x02, Y12, Y4, Y0 + VPERM2I128 $0x13, Y12, Y4, Y12 + VPXOR (SI), Y15, Y15 + VPXOR 32(SI), Y0, Y0 + VPXOR 64(SI), Y14, Y14 + VPXOR 96(SI), Y12, Y12 + VMOVDQU Y15, (DI) + VMOVDQU Y0, 32(DI) + VMOVDQU Y14, 64(DI) + VMOVDQU Y12, 96(DI) + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 + VPXOR 128(SI), Y0, Y0 + VPXOR 160(SI), Y14, Y14 + VPXOR 192(SI), Y12, Y12 + VPXOR 224(SI), Y4, Y4 + VMOVDQU Y0, 128(DI) + VMOVDQU Y14, 160(DI) + VMOVDQU Y12, 192(DI) + VMOVDQU Y4, 224(DI) + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + VPXOR 256(SI), Y0, Y0 + VPXOR 288(SI), Y14, Y14 + VPXOR 320(SI), Y12, Y12 + VPXOR 352(SI), Y4, Y4 + VMOVDQU Y0, 256(DI) + VMOVDQU Y14, 288(DI) + VMOVDQU Y12, 320(DI) + VMOVDQU Y4, 352(DI) + VPERM2I128 $0x02, Y7, Y11, Y0 + VPERM2I128 $0x02, 224(BP), Y3, Y14 + VPERM2I128 $0x13, Y7, Y11, Y12 + VPERM2I128 $0x13, 224(BP), Y3, Y4 + LEAQ 384(SI), SI + LEAQ 384(DI), DI + SUBQ $0x00000180, BX + JMP openAVX2TailLoop + +DATA ·chacha20Constants<>+0(SB)/4, $0x61707865 +DATA ·chacha20Constants<>+4(SB)/4, $0x3320646e +DATA ·chacha20Constants<>+8(SB)/4, $0x79622d32 +DATA ·chacha20Constants<>+12(SB)/4, $0x6b206574 +DATA ·chacha20Constants<>+16(SB)/4, $0x61707865 +DATA ·chacha20Constants<>+20(SB)/4, $0x3320646e +DATA ·chacha20Constants<>+24(SB)/4, $0x79622d32 +DATA ·chacha20Constants<>+28(SB)/4, $0x6b206574 +GLOBL ·chacha20Constants<>(SB), RODATA|NOPTR, $32 + +DATA ·polyClampMask<>+0(SB)/8, $0x0ffffffc0fffffff +DATA ·polyClampMask<>+8(SB)/8, $0x0ffffffc0ffffffc +DATA ·polyClampMask<>+16(SB)/8, $0xffffffffffffffff +DATA ·polyClampMask<>+24(SB)/8, $0xffffffffffffffff +GLOBL ·polyClampMask<>(SB), RODATA|NOPTR, $32 + +DATA ·sseIncMask<>+0(SB)/8, $0x0000000000000001 +DATA ·sseIncMask<>+8(SB)/8, $0x0000000000000000 +GLOBL ·sseIncMask<>(SB), RODATA|NOPTR, $16 + +DATA ·andMask<>+0(SB)/8, $0x00000000000000ff +DATA ·andMask<>+8(SB)/8, $0x0000000000000000 +DATA ·andMask<>+16(SB)/8, $0x000000000000ffff +DATA ·andMask<>+24(SB)/8, $0x0000000000000000 +DATA ·andMask<>+32(SB)/8, $0x0000000000ffffff +DATA ·andMask<>+40(SB)/8, $0x0000000000000000 +DATA ·andMask<>+48(SB)/8, $0x00000000ffffffff +DATA ·andMask<>+56(SB)/8, $0x0000000000000000 +DATA ·andMask<>+64(SB)/8, $0x000000ffffffffff +DATA ·andMask<>+72(SB)/8, $0x0000000000000000 +DATA ·andMask<>+80(SB)/8, $0x0000ffffffffffff +DATA ·andMask<>+88(SB)/8, $0x0000000000000000 +DATA ·andMask<>+96(SB)/8, $0x00ffffffffffffff +DATA ·andMask<>+104(SB)/8, $0x0000000000000000 +DATA ·andMask<>+112(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+120(SB)/8, $0x0000000000000000 +DATA ·andMask<>+128(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+136(SB)/8, $0x00000000000000ff +DATA ·andMask<>+144(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+152(SB)/8, $0x000000000000ffff +DATA ·andMask<>+160(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+168(SB)/8, $0x0000000000ffffff +DATA ·andMask<>+176(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+184(SB)/8, $0x00000000ffffffff +DATA ·andMask<>+192(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+200(SB)/8, $0x000000ffffffffff +DATA ·andMask<>+208(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+216(SB)/8, $0x0000ffffffffffff +DATA ·andMask<>+224(SB)/8, 
$0xffffffffffffffff +DATA ·andMask<>+232(SB)/8, $0x00ffffffffffffff +GLOBL ·andMask<>(SB), RODATA|NOPTR, $240 + +DATA ·avx2InitMask<>+0(SB)/8, $0x0000000000000000 +DATA ·avx2InitMask<>+8(SB)/8, $0x0000000000000000 +DATA ·avx2InitMask<>+16(SB)/8, $0x0000000000000001 +DATA ·avx2InitMask<>+24(SB)/8, $0x0000000000000000 +GLOBL ·avx2InitMask<>(SB), RODATA|NOPTR, $32 + +DATA ·rol16<>+0(SB)/8, $0x0504070601000302 +DATA ·rol16<>+8(SB)/8, $0x0d0c0f0e09080b0a +DATA ·rol16<>+16(SB)/8, $0x0504070601000302 +DATA ·rol16<>+24(SB)/8, $0x0d0c0f0e09080b0a +GLOBL ·rol16<>(SB), RODATA|NOPTR, $32 + +DATA ·rol8<>+0(SB)/8, $0x0605040702010003 +DATA ·rol8<>+8(SB)/8, $0x0e0d0c0f0a09080b +DATA ·rol8<>+16(SB)/8, $0x0605040702010003 +DATA ·rol8<>+24(SB)/8, $0x0e0d0c0f0a09080b +GLOBL ·rol8<>(SB), RODATA|NOPTR, $32 + +DATA ·avx2IncMask<>+0(SB)/8, $0x0000000000000002 +DATA ·avx2IncMask<>+8(SB)/8, $0x0000000000000000 +DATA ·avx2IncMask<>+16(SB)/8, $0x0000000000000002 +DATA ·avx2IncMask<>+24(SB)/8, $0x0000000000000000 +GLOBL ·avx2IncMask<>(SB), RODATA|NOPTR, $32 + +// func chacha20Poly1305Seal(dst []byte, key []uint32, src []byte, ad []byte) +// Requires: AVX, AVX2, BMI2, CMOV, SSE2 +TEXT ·chacha20Poly1305Seal(SB), $288-96 MOVQ SP, BP - ADDQ $32, BP + ADDQ $0x20, BP ANDQ $-32, BP - MOVQ dst+0(FP), oup - MOVQ key+24(FP), keyp - MOVQ src+48(FP), inp - MOVQ src_len+56(FP), inl - MOVQ ad+72(FP), adp - - CMPB ·useAVX2(SB), $1 + MOVQ dst_base+0(FP), DI + MOVQ key_base+24(FP), R8 + MOVQ src_base+48(FP), SI + MOVQ src_len+56(FP), BX + MOVQ ad_base+72(FP), CX + CMPB ·useAVX2+0(SB), $0x01 JE chacha20Poly1305Seal_AVX2 // Special optimization, for very short buffers - CMPQ inl, $128 - JBE sealSSE128 // About 15% faster + CMPQ BX, $0x80 + JBE sealSSE128 // In the seal case - prepare the poly key + 3 blocks of stream in the first iteration - MOVOU ·chacha20Constants<>(SB), A0 - MOVOU (1*16)(keyp), B0 - MOVOU (2*16)(keyp), C0 - MOVOU (3*16)(keyp), D0 + MOVOU ·chacha20Constants<>+0(SB), X0 + MOVOU 16(R8), X3 + MOVOU 32(R8), X6 + MOVOU 48(R8), X9 // Store state on stack for future use - MOVO B0, state1Store - MOVO C0, state2Store + MOVO X3, 32(BP) + MOVO X6, 48(BP) // Load state, increment counter blocks - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 - MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X1, X2 + MOVO X4, X5 + MOVO X7, X8 + MOVO X10, X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X2, X12 + MOVO X5, X13 + MOVO X8, X14 + MOVO X11, X15 + PADDL ·sseIncMask<>+0(SB), X15 // Store counters - MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store - MOVQ $10, itr2 + MOVO X9, 80(BP) + MOVO X10, 96(BP) + MOVO X11, 112(BP) + MOVO X15, 128(BP) + MOVQ $0x0000000a, R9 sealSSEIntroLoop: - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left - shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left - shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left - - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - shiftB0Right; shiftB1Right; 
shiftB2Right; shiftB3Right - shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right - shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right - DECQ itr2 - JNE sealSSEIntroLoop + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x0c + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + 
PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x04 + DECQ R9 + JNE sealSSEIntroLoop // Add in the state - PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 - PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 - PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 - PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 + PADDD ·chacha20Constants<>+0(SB), X0 + PADDD ·chacha20Constants<>+0(SB), X1 + PADDD ·chacha20Constants<>+0(SB), X2 + PADDD ·chacha20Constants<>+0(SB), X12 + PADDD 32(BP), X3 + PADDD 32(BP), X4 + PADDD 32(BP), X5 + PADDD 32(BP), X13 + PADDD 48(BP), X7 + PADDD 48(BP), X8 + PADDD 48(BP), X14 + PADDD 96(BP), X10 + PADDD 112(BP), X11 + PADDD 128(BP), X15 // Clamp and store the key - PAND ·polyClampMask<>(SB), A0 - MOVO A0, rStore - MOVO B0, sStore + PAND ·polyClampMask<>+0(SB), X0 + MOVO X0, (BP) + MOVO X3, 16(BP) // Hash AAD - MOVQ ad_len+80(FP), itr2 - CALL polyHashADInternal<>(SB) - - MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0 - PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1 - MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup) - MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0 - PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2 - MOVOU A2, (4*16)(oup); MOVOU B2, (5*16)(oup); MOVOU C2, (6*16)(oup); MOVOU D2, (7*16)(oup) - - MOVQ $128, itr1 - SUBQ $128, inl - LEAQ 128(inp), inp - - MOVO A3, A1; MOVO B3, B1; MOVO C3, C1; MOVO D3, D1 - - CMPQ inl, $64 - JBE sealSSE128SealHash - - MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0 - PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3 - MOVOU A3, (8*16)(oup); MOVOU B3, (9*16)(oup); MOVOU C3, (10*16)(oup); MOVOU D3, (11*16)(oup) - - ADDQ $64, itr1 - SUBQ $64, inl - LEAQ 64(inp), inp - - MOVQ $2, itr1 - MOVQ $8, itr2 - - CMPQ inl, $64 - JBE sealSSETail64 - CMPQ inl, $128 - JBE sealSSETail128 - CMPQ inl, $192 - JBE sealSSETail192 + MOVQ ad_len+80(FP), R9 + CALL polyHashADInternal<>(SB) + MOVOU (SI), X0 + MOVOU 16(SI), X3 + MOVOU 32(SI), X6 + MOVOU 48(SI), X9 + PXOR X0, X1 + PXOR X3, X4 + PXOR X6, X7 + PXOR X9, X10 + MOVOU X1, (DI) + MOVOU X4, 16(DI) + MOVOU X7, 32(DI) + MOVOU X10, 48(DI) + MOVOU 64(SI), X0 + MOVOU 80(SI), X3 + MOVOU 96(SI), 
X6 + MOVOU 112(SI), X9 + PXOR X0, X2 + PXOR X3, X5 + PXOR X6, X8 + PXOR X9, X11 + MOVOU X2, 64(DI) + MOVOU X5, 80(DI) + MOVOU X8, 96(DI) + MOVOU X11, 112(DI) + MOVQ $0x00000080, CX + SUBQ $0x80, BX + LEAQ 128(SI), SI + MOVO X12, X1 + MOVO X13, X4 + MOVO X14, X7 + MOVO X15, X10 + CMPQ BX, $0x40 + JBE sealSSE128SealHash + MOVOU (SI), X0 + MOVOU 16(SI), X3 + MOVOU 32(SI), X6 + MOVOU 48(SI), X9 + PXOR X0, X12 + PXOR X3, X13 + PXOR X6, X14 + PXOR X9, X15 + MOVOU X12, 128(DI) + MOVOU X13, 144(DI) + MOVOU X14, 160(DI) + MOVOU X15, 176(DI) + ADDQ $0x40, CX + SUBQ $0x40, BX + LEAQ 64(SI), SI + MOVQ $0x00000002, CX + MOVQ $0x00000008, R9 + CMPQ BX, $0x40 + JBE sealSSETail64 + CMPQ BX, $0x80 + JBE sealSSETail128 + CMPQ BX, $0xc0 + JBE sealSSETail192 sealSSEMainLoop: // Load state, increment counter blocks - MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0 - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 - MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 + MOVO ·chacha20Constants<>+0(SB), X0 + MOVO 32(BP), X3 + MOVO 48(BP), X6 + MOVO 128(BP), X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X1, X2 + MOVO X4, X5 + MOVO X7, X8 + MOVO X10, X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X2, X12 + MOVO X5, X13 + MOVO X8, X14 + MOVO X11, X15 + PADDL ·sseIncMask<>+0(SB), X15 // Store counters - MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store + MOVO X9, 80(BP) + MOVO X10, 96(BP) + MOVO X11, 112(BP) + MOVO X15, 128(BP) sealSSEInnerLoop: - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - polyAdd(0(oup)) - shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left - shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left - shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left - polyMulStage1 - polyMulStage2 - LEAQ (2*8)(oup), oup - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - polyMulStage3 - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - polyMulReduceStage - shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right - shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right - shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right - DECQ itr2 - JGE sealSSEInnerLoop - polyAdd(0(oup)) - polyMul - LEAQ (2*8)(oup), oup - DECQ itr1 - JG sealSSEInnerLoop + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + 
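The unrolled PADDD/PXOR/ROL16/PSLLL/PSRLL runs throughout these hunks are the generated expansion of the removed chachaQR macro: the ChaCha20 quarter-round with its 16/12/8/7-bit rotations, computed on four (SSE) or eight (AVX2) columns at a time. As a reference only (this sketch is not part of the generated file), the same quarter-round in scalar Go:

package main

import (
	"fmt"
	"math/bits"
)

// quarterRound is the scalar form of the ChaCha20 quarter-round that the
// vectorized add/xor/rotate sequences in the assembly compute in parallel.
func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
	a += b
	d = bits.RotateLeft32(d^a, 16) // ROL16
	c += d
	b = bits.RotateLeft32(b^c, 12) // PSLLL $0x0c / PSRLL $0x14
	a += b
	d = bits.RotateLeft32(d^a, 8) // ROL8
	c += d
	b = bits.RotateLeft32(b^c, 7) // PSLLL $0x07 / PSRLL $0x19
	return a, b, c, d
}

func main() {
	// Quarter-round test vector from RFC 8439, section 2.1.1.
	a, b, c, d := quarterRound(0x11111111, 0x01020304, 0x9b8d6f43, 0x01234567)
	fmt.Printf("%08x %08x %08x %08x\n", a, b, c, d) // ea2a92f4 cb1cf8ce 4581472e 5881c4bb
}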
MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x0c + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + LEAQ 16(DI), DI + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + 
BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x04 + DECQ R9 + JGE sealSSEInnerLoop + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI + DECQ CX + JG sealSSEInnerLoop // Add in the state - PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 - PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 - PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 - PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 - MOVO D3, tmpStore + PADDD ·chacha20Constants<>+0(SB), X0 + PADDD ·chacha20Constants<>+0(SB), X1 + PADDD ·chacha20Constants<>+0(SB), X2 + PADDD ·chacha20Constants<>+0(SB), X12 + PADDD 32(BP), X3 + PADDD 32(BP), X4 + PADDD 32(BP), X5 + PADDD 32(BP), X13 + PADDD 48(BP), X6 + PADDD 48(BP), X7 + PADDD 48(BP), X8 + PADDD 48(BP), X14 + PADDD 80(BP), X9 + PADDD 96(BP), X10 + PADDD 112(BP), X11 + PADDD 128(BP), X15 + MOVO X15, 64(BP) // Load - xor - store - MOVOU (0*16)(inp), D3; PXOR D3, A0 - MOVOU (1*16)(inp), D3; PXOR D3, B0 - MOVOU (2*16)(inp), D3; PXOR D3, C0 - MOVOU (3*16)(inp), D3; PXOR D3, D0 - MOVOU A0, (0*16)(oup) - MOVOU B0, (1*16)(oup) - MOVOU C0, (2*16)(oup) - MOVOU D0, (3*16)(oup) - MOVO tmpStore, D3 - - MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0 - PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1 - MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) - MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0 - PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2 - MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup) - ADDQ $192, inp - MOVQ $192, itr1 - SUBQ $192, inl - MOVO A3, A1 - MOVO B3, B1 - MOVO C3, C1 - MOVO D3, D1 - CMPQ inl, $64 + MOVOU (SI), X15 + PXOR X15, X0 + MOVOU 16(SI), X15 + PXOR X15, X3 + MOVOU 32(SI), X15 + PXOR X15, X6 + MOVOU 48(SI), X15 + PXOR X15, X9 + MOVOU X0, (DI) + MOVOU X3, 16(DI) + MOVOU X6, 32(DI) + MOVOU X9, 48(DI) + MOVO 64(BP), X15 + MOVOU 64(SI), 
X0 + MOVOU 80(SI), X3 + MOVOU 96(SI), X6 + MOVOU 112(SI), X9 + PXOR X0, X1 + PXOR X3, X4 + PXOR X6, X7 + PXOR X9, X10 + MOVOU X1, 64(DI) + MOVOU X4, 80(DI) + MOVOU X7, 96(DI) + MOVOU X10, 112(DI) + MOVOU 128(SI), X0 + MOVOU 144(SI), X3 + MOVOU 160(SI), X6 + MOVOU 176(SI), X9 + PXOR X0, X2 + PXOR X3, X5 + PXOR X6, X8 + PXOR X9, X11 + MOVOU X2, 128(DI) + MOVOU X5, 144(DI) + MOVOU X8, 160(DI) + MOVOU X11, 176(DI) + ADDQ $0xc0, SI + MOVQ $0x000000c0, CX + SUBQ $0xc0, BX + MOVO X12, X1 + MOVO X13, X4 + MOVO X14, X7 + MOVO X15, X10 + CMPQ BX, $0x40 JBE sealSSE128SealHash - MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0 - PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3 - MOVOU A3, (12*16)(oup); MOVOU B3, (13*16)(oup); MOVOU C3, (14*16)(oup); MOVOU D3, (15*16)(oup) - LEAQ 64(inp), inp - SUBQ $64, inl - MOVQ $6, itr1 - MOVQ $4, itr2 - CMPQ inl, $192 + MOVOU (SI), X0 + MOVOU 16(SI), X3 + MOVOU 32(SI), X6 + MOVOU 48(SI), X9 + PXOR X0, X12 + PXOR X3, X13 + PXOR X6, X14 + PXOR X9, X15 + MOVOU X12, 192(DI) + MOVOU X13, 208(DI) + MOVOU X14, 224(DI) + MOVOU X15, 240(DI) + LEAQ 64(SI), SI + SUBQ $0x40, BX + MOVQ $0x00000006, CX + MOVQ $0x00000004, R9 + CMPQ BX, $0xc0 JG sealSSEMainLoop - - MOVQ inl, itr1 - TESTQ inl, inl + MOVQ BX, CX + TESTQ BX, BX JE sealSSE128SealHash - MOVQ $6, itr1 - CMPQ inl, $64 + MOVQ $0x00000006, CX + CMPQ BX, $0x40 JBE sealSSETail64 - CMPQ inl, $128 + CMPQ BX, $0x80 JBE sealSSETail128 JMP sealSSETail192 -// ---------------------------------------------------------------------------- -// Special optimization for the last 64 bytes of plaintext sealSSETail64: - // Need to encrypt up to 64 bytes - prepare single block, hash 192 or 256 bytes - MOVO ·chacha20Constants<>(SB), A1 - MOVO state1Store, B1 - MOVO state2Store, C1 - MOVO ctr3Store, D1 - PADDL ·sseIncMask<>(SB), D1 - MOVO D1, ctr0Store + MOVO ·chacha20Constants<>+0(SB), X1 + MOVO 32(BP), X4 + MOVO 48(BP), X7 + MOVO 128(BP), X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X10, 80(BP) sealSSETail64LoopA: - // Perform ChaCha rounds, while hashing the previously encrypted ciphertext - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI sealSSETail64LoopB: - chachaQR(A1, B1, C1, D1, T1) - shiftB1Left; shiftC1Left; shiftD1Left - chachaQR(A1, B1, C1, D1, T1) - shiftB1Right; shiftC1Right; shiftD1Right - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup - - DECQ itr1 - JG sealSSETail64LoopA - - DECQ itr2 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X13) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X13 + PSLLL $0x0c, X13 + PSRLL $0x14, X4 + PXOR X13, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X13) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X13 + PSLLL $0x07, X13 + PSRLL $0x19, X4 + PXOR X13, X4 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 
+ BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X13) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X13 + PSLLL $0x0c, X13 + PSRLL $0x14, X4 + PXOR X13, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X13) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X13 + PSLLL $0x07, X13 + PSRLL $0x19, X4 + PXOR X13, X4 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI + DECQ CX + JG sealSSETail64LoopA + DECQ R9 JGE sealSSETail64LoopB - PADDL ·chacha20Constants<>(SB), A1 - PADDL state1Store, B1 - PADDL state2Store, C1 - PADDL ctr0Store, D1 + PADDL ·chacha20Constants<>+0(SB), X1 + PADDL 32(BP), X4 + PADDL 48(BP), X7 + PADDL 80(BP), X10 + JMP sealSSE128Seal - JMP sealSSE128Seal - -// ---------------------------------------------------------------------------- -// Special optimization for the last 128 bytes of plaintext sealSSETail128: - // Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes - MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store + MOVO ·chacha20Constants<>+0(SB), X0 + MOVO 32(BP), X3 + MOVO 48(BP), X6 + MOVO 128(BP), X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X9, 80(BP) + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X10, 96(BP) sealSSETail128LoopA: - // Perform ChaCha rounds, while hashing the previously encrypted ciphertext - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI sealSSETail128LoopB: - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) - shiftB0Left; shiftC0Left; shiftD0Left - shiftB1Left; shiftC1Left; shiftD1Left - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) - shiftB0Right; shiftC0Right; shiftD0Right - shiftB1Right; shiftC1Right; shiftD1Right - - DECQ itr1 - JG sealSSETail128LoopA - - DECQ itr2 
- JGE sealSSETail128LoopB - - PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1 - PADDL state1Store, B0; PADDL state1Store, B1 - PADDL state2Store, C0; PADDL state2Store, C1 - PADDL ctr0Store, D0; PADDL ctr1Store, D1 - - MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 - PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0 - MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup) - - MOVQ $64, itr1 - LEAQ 64(inp), inp - SUBQ $64, inl - - JMP sealSSE128SealHash - -// ---------------------------------------------------------------------------- -// Special optimization for the last 192 bytes of plaintext + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + DECQ CX + JG sealSSETail128LoopA + DECQ R9 + JGE 
sealSSETail128LoopB + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL ·chacha20Constants<>+0(SB), X1 + PADDL 32(BP), X3 + PADDL 32(BP), X4 + PADDL 48(BP), X6 + PADDL 48(BP), X7 + PADDL 80(BP), X9 + PADDL 96(BP), X10 + MOVOU (SI), X12 + MOVOU 16(SI), X13 + MOVOU 32(SI), X14 + MOVOU 48(SI), X15 + PXOR X12, X0 + PXOR X13, X3 + PXOR X14, X6 + PXOR X15, X9 + MOVOU X0, (DI) + MOVOU X3, 16(DI) + MOVOU X6, 32(DI) + MOVOU X9, 48(DI) + MOVQ $0x00000040, CX + LEAQ 64(SI), SI + SUBQ $0x40, BX + JMP sealSSE128SealHash + sealSSETail192: - // Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes - MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr2Store + MOVO ·chacha20Constants<>+0(SB), X0 + MOVO 32(BP), X3 + MOVO 48(BP), X6 + MOVO 128(BP), X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X9, 80(BP) + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X10, 96(BP) + MOVO X1, X2 + MOVO X4, X5 + MOVO X7, X8 + MOVO X10, X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X11, 112(BP) sealSSETail192LoopA: - // Perform ChaCha rounds, while hashing the previously encrypted ciphertext - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI sealSSETail192LoopB: - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Left; shiftC0Left; shiftD0Left - shiftB1Left; shiftC1Left; shiftD1Left - shiftB2Left; shiftC2Left; shiftD2Left - - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup - - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Right; shiftC0Right; shiftD0Right - shiftB1Right; shiftC1Right; shiftD1Right - shiftB2Right; shiftC2Right; shiftD2Right - - DECQ itr1 - JG sealSSETail192LoopA - - DECQ itr2 - JGE sealSSETail192LoopB - - PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 - PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2 - PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2 - PADDL ctr0Store, D0; PADDL ctr1Store, D1; PADDL ctr2Store, D2 - - MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 - PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0 - MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup) - MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3 - PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1 - MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) - - MOVO A2, A1 - MOVO B2, B1 - MOVO C2, 
C1 - MOVO D2, D1 - MOVQ $128, itr1 - LEAQ 128(inp), inp - SUBQ $128, inl - - JMP sealSSE128SealHash - -// ---------------------------------------------------------------------------- -// Special seal optimization for buffers smaller than 129 bytes + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE 
$0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + DECQ CX + JG sealSSETail192LoopA + DECQ R9 + JGE sealSSETail192LoopB + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL ·chacha20Constants<>+0(SB), X1 + PADDL ·chacha20Constants<>+0(SB), X2 + PADDL 32(BP), X3 + PADDL 32(BP), X4 + PADDL 32(BP), X5 + PADDL 48(BP), X6 + PADDL 48(BP), X7 + PADDL 48(BP), X8 + PADDL 80(BP), X9 + PADDL 96(BP), X10 + PADDL 112(BP), X11 + MOVOU (SI), X12 + MOVOU 16(SI), X13 + MOVOU 32(SI), X14 + MOVOU 48(SI), X15 + PXOR X12, X0 + PXOR X13, X3 + PXOR X14, X6 + PXOR X15, X9 + MOVOU X0, (DI) + MOVOU X3, 16(DI) + MOVOU X6, 32(DI) + MOVOU X9, 48(DI) + MOVOU 64(SI), X12 + MOVOU 80(SI), X13 + MOVOU 96(SI), X14 + MOVOU 112(SI), X15 + PXOR X12, X1 + PXOR X13, X4 + PXOR X14, X7 + PXOR X15, X10 + MOVOU X1, 64(DI) + MOVOU X4, 80(DI) + MOVOU X7, 96(DI) + MOVOU X10, 112(DI) + MOVO X2, X1 + MOVO X5, X4 + MOVO X8, X7 + MOVO X11, X10 + MOVQ $0x00000080, CX + LEAQ 128(SI), SI + SUBQ $0x80, BX + JMP sealSSE128SealHash + sealSSE128: - // For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks - MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0 - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 - MOVO B0, T1; MOVO C0, T2; MOVO D1, T3 - MOVQ $10, itr2 + MOVOU ·chacha20Constants<>+0(SB), X0 + MOVOU 16(R8), X3 + MOVOU 32(R8), X6 + MOVOU 48(R8), X9 + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X1, X2 + MOVO X4, X5 + MOVO X7, X8 + MOVO X10, X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X3, X13 + MOVO X6, X14 + MOVO X10, X15 + MOVQ $0x0000000a, R9 sealSSE128InnerCipherLoop: - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Left; shiftB1Left; shiftB2Left - shiftC0Left; shiftC1Left; shiftC2Left - shiftD0Left; shiftD1Left; shiftD2Left - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Right; shiftB1Right; shiftB2Right - shiftC0Right; shiftC1Right; shiftC2Right - shiftD0Right; shiftD1Right; shiftD2Right - DECQ itr2 - JNE sealSSE128InnerCipherLoop + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, 
X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + DECQ R9 + JNE sealSSE128InnerCipherLoop // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded - PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 - PADDL T1, B0; PADDL T1, B1; PADDL T1, B2 - PADDL T2, C1; PADDL T2, C2 - PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2 - PAND ·polyClampMask<>(SB), A0 - MOVOU A0, rStore - MOVOU B0, sStore + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL ·chacha20Constants<>+0(SB), X1 + PADDL ·chacha20Constants<>+0(SB), X2 + PADDL X13, X3 + PADDL X13, X4 + PADDL X13, X5 + PADDL X14, X7 + PADDL X14, X8 + PADDL X15, X10 + PADDL ·sseIncMask<>+0(SB), X15 + PADDL X15, X11 + PAND ·polyClampMask<>+0(SB), X0 + MOVOU X0, (BP) + MOVOU X3, 16(BP) // Hash - MOVQ ad_len+80(FP), itr2 + MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) - XORQ itr1, itr1 + XORQ CX, CX sealSSE128SealHash: - // itr1 holds the number of bytes encrypted but not yet hashed - CMPQ itr1, $16 - JB sealSSE128Seal - polyAdd(0(oup)) - polyMul - - SUBQ $16, itr1 - ADDQ $16, 
oup - - JMP sealSSE128SealHash + CMPQ CX, $0x10 + JB sealSSE128Seal + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + SUBQ $0x10, CX + ADDQ $0x10, DI + JMP sealSSE128SealHash sealSSE128Seal: - CMPQ inl, $16 + CMPQ BX, $0x10 JB sealSSETail - SUBQ $16, inl + SUBQ $0x10, BX // Load for decryption - MOVOU (inp), T0 - PXOR T0, A1 - MOVOU A1, (oup) - LEAQ (1*16)(inp), inp - LEAQ (1*16)(oup), oup + MOVOU (SI), X12 + PXOR X12, X1 + MOVOU X1, (DI) + LEAQ 16(SI), SI + LEAQ 16(DI), DI // Extract for hashing - MOVQ A1, t0 - PSRLDQ $8, A1 - MOVQ A1, t1 - ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 - polyMul + MOVQ X1, R13 + PSRLDQ $0x08, X1 + MOVQ X1, R14 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 // Shift the stream "left" - MOVO B1, A1 - MOVO C1, B1 - MOVO D1, C1 - MOVO A2, D1 - MOVO B2, A2 - MOVO C2, B2 - MOVO D2, C2 + MOVO X4, X1 + MOVO X7, X4 + MOVO X10, X7 + MOVO X2, X10 + MOVO X5, X2 + MOVO X8, X5 + MOVO X11, X8 JMP sealSSE128Seal sealSSETail: - TESTQ inl, inl + TESTQ BX, BX JE sealSSEFinalize // We can only load the PT one byte at a time to avoid read after end of buffer - MOVQ inl, itr2 - SHLQ $4, itr2 - LEAQ ·andMask<>(SB), t0 - MOVQ inl, itr1 - LEAQ -1(inp)(inl*1), inp - XORQ t2, t2 - XORQ t3, t3 + MOVQ BX, R9 + SHLQ $0x04, R9 + LEAQ ·andMask<>+0(SB), R13 + MOVQ BX, CX + LEAQ -1(SI)(BX*1), SI + XORQ R15, R15 + XORQ R8, R8 XORQ AX, AX sealSSETailLoadLoop: - SHLQ $8, t2, t3 - SHLQ $8, t2 - MOVB (inp), AX - XORQ AX, t2 - LEAQ -1(inp), inp - DECQ itr1 + SHLQ $0x08, R15, R8 + SHLQ $0x08, R15 + MOVB (SI), AX + XORQ AX, R15 + LEAQ -1(SI), SI + DECQ CX JNE sealSSETailLoadLoop - MOVQ t2, 0+tmpStore - MOVQ t3, 8+tmpStore - PXOR 0+tmpStore, A1 - MOVOU A1, (oup) - MOVOU -16(t0)(itr2*1), T0 - PAND T0, A1 - MOVQ A1, t0 - PSRLDQ $8, A1 - MOVQ A1, t1 - ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 - polyMul - - ADDQ inl, oup + MOVQ R15, 64(BP) + MOVQ R8, 72(BP) + PXOR 64(BP), X1 + MOVOU X1, (DI) + MOVOU -16(R13)(R9*1), X12 + PAND X12, X1 + MOVQ X1, R13 + PSRLDQ $0x08, X1 + MOVQ X1, R14 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, 
R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + ADDQ BX, DI sealSSEFinalize: // Hash in the buffer lengths - ADDQ ad_len+80(FP), acc0 - ADCQ src_len+56(FP), acc1 - ADCQ $1, acc2 - polyMul + ADDQ ad_len+80(FP), R10 + ADCQ src_len+56(FP), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 // Final reduce - MOVQ acc0, t0 - MOVQ acc1, t1 - MOVQ acc2, t2 - SUBQ $-5, acc0 - SBBQ $-1, acc1 - SBBQ $3, acc2 - CMOVQCS t0, acc0 - CMOVQCS t1, acc1 - CMOVQCS t2, acc2 + MOVQ R10, R13 + MOVQ R11, R14 + MOVQ R12, R15 + SUBQ $-5, R10 + SBBQ $-1, R11 + SBBQ $0x03, R12 + CMOVQCS R13, R10 + CMOVQCS R14, R11 + CMOVQCS R15, R12 // Add in the "s" part of the key - ADDQ 0+sStore, acc0 - ADCQ 8+sStore, acc1 + ADDQ 16(BP), R10 + ADCQ 24(BP), R11 // Finally store the tag at the end of the message - MOVQ acc0, (0*8)(oup) - MOVQ acc1, (1*8)(oup) + MOVQ R10, (DI) + MOVQ R11, 8(DI) RET -// ---------------------------------------------------------------------------- -// ------------------------- AVX2 Code ---------------------------------------- chacha20Poly1305Seal_AVX2: VZEROUPPER - VMOVDQU ·chacha20Constants<>(SB), AA0 - BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14 - BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12 - BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4 - VPADDD ·avx2InitMask<>(SB), DD0, DD0 + VMOVDQU ·chacha20Constants<>+0(SB), Y0 + BYTE $0xc4 + BYTE $0x42 + BYTE $0x7d + BYTE $0x5a + BYTE $0x70 + BYTE $0x10 + BYTE $0xc4 + BYTE $0x42 + BYTE $0x7d + BYTE $0x5a + BYTE $0x60 + BYTE $0x20 + BYTE $0xc4 + BYTE $0xc2 + BYTE $0x7d + BYTE $0x5a + BYTE $0x60 + BYTE $0x30 + VPADDD ·avx2InitMask<>+0(SB), Y4, Y4 // Special optimizations, for very short buffers - CMPQ inl, $192 - JBE seal192AVX2 // 33% faster - CMPQ inl, $320 - JBE seal320AVX2 // 17% faster + CMPQ BX, $0x000000c0 + JBE seal192AVX2 + CMPQ BX, $0x00000140 + JBE seal320AVX2 // For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream - VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 - VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3; VMOVDQA BB0, state1StoreAVX2 - VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3; VMOVDQA CC0, state2StoreAVX2 - VPADDD ·avx2IncMask<>(SB), DD0, DD1; VMOVDQA DD0, ctr0StoreAVX2 - VPADDD ·avx2IncMask<>(SB), DD1, DD2; VMOVDQA DD1, ctr1StoreAVX2 - VPADDD ·avx2IncMask<>(SB), DD2, DD3; VMOVDQA DD2, ctr2StoreAVX2 - VMOVDQA DD3, ctr3StoreAVX2 - MOVQ $10, itr2 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA Y0, Y7 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA Y14, Y11 + VMOVDQA Y14, 32(BP) + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA Y12, Y15 + 
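The scalar MULQ/IMULQ/ADCQ/SHRQ blocks interleaved with the vector code are the expansion of the removed polyAdd/polyMul macros: accumulate a 16-byte block (with the ADCQ $0x01 pad bit), multiply by the clamped r modulo 2^130-5, and at the end perform the final reduce, add the "s" half of the key, and store the 16-byte tag. A big.Int sketch of the same arithmetic follows; it is an illustrative reference rather than the limb-based code the assembly uses, and the key/message in main are placeholders:

package main

import (
	"fmt"
	"math/big"
)

// p is the Poly1305 prime 2^130 - 5.
var p = new(big.Int).Sub(new(big.Int).Lsh(big.NewInt(1), 130), big.NewInt(5))

// polyTag mirrors what the assembly does with its accumulator in R10:R11:R12
// and r at 0(BP)/8(BP): clamp r, absorb 16-byte blocks with a high pad bit,
// multiply by r mod p, then add s and keep the low 128 bits as the tag.
func polyTag(key [32]byte, msg []byte) [16]byte {
	clamp, _ := new(big.Int).SetString("0ffffffc0ffffffc0ffffffc0fffffff", 16)
	r := new(big.Int).And(leToInt(key[:16]), clamp) // the PAND ·polyClampMask step
	s := leToInt(key[16:])

	h := new(big.Int)
	for len(msg) > 0 {
		n := 16
		if len(msg) < n {
			n = len(msg) // short final block, handled by the tail paths above
		}
		block := append(append([]byte{}, msg[:n]...), 1) // the ADCQ $0x01 pad bit
		h.Add(h, leToInt(block))
		h.Mul(h, r).Mod(h, p) // polyMul plus reduction
		msg = msg[n:]
	}
	h.Add(h, s) // "add in the s part of the key"

	var tag [16]byte
	b := h.Bytes()
	for i := 0; i < 16 && i < len(b); i++ {
		tag[i] = b[len(b)-1-i] // low 128 bits, little-endian
	}
	return tag
}

// leToInt interprets b as a little-endian unsigned integer.
func leToInt(b []byte) *big.Int {
	be := make([]byte, len(b))
	for i := range b {
		be[i] = b[len(b)-1-i]
	}
	return new(big.Int).SetBytes(be)
}

func main() {
	var key [32]byte // placeholder key, just to exercise the function
	key[0] = 1
	fmt.Printf("%x\n", polyTag(key, []byte("hello, poly1305")))
}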
VMOVDQA Y12, 64(BP) + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y4, 96(BP) + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VMOVDQA Y1, 128(BP) + VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 + VMOVDQA Y2, 160(BP) + VMOVDQA Y3, 192(BP) + MOVQ $0x0000000a, R9 sealAVX2IntroLoop: - VMOVDQA CC3, tmpStoreAVX2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) - VMOVDQA tmpStoreAVX2, CC3 - VMOVDQA CC1, tmpStoreAVX2 - chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) - VMOVDQA tmpStoreAVX2, CC1 - - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0 - VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1 - VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2 - VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3 - - VMOVDQA CC3, tmpStoreAVX2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) - VMOVDQA tmpStoreAVX2, CC3 - VMOVDQA CC1, tmpStoreAVX2 - chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) - VMOVDQA tmpStoreAVX2, CC1 - - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0 - VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1 - VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2 - VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3 - DECQ itr2 - JNE sealAVX2IntroLoop - - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 - VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 - - VPERM2I128 $0x13, CC0, DD0, CC0 // Stream bytes 96 - 127 - VPERM2I128 $0x02, AA0, BB0, DD0 // The Poly1305 key - VPERM2I128 $0x13, AA0, BB0, AA0 // Stream bytes 64 - 95 + VMOVDQA Y15, 224(BP) + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VMOVDQA 224(BP), Y15 + VMOVDQA Y13, 224(BP) + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD 
Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x0c, Y11, Y13 + VPSRLD $0x14, Y11, Y11 + VPXOR Y13, Y11, Y11 + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x07, Y11, Y13 + VPSRLD $0x19, Y11, Y11 + VPXOR Y13, Y11, Y11 + VMOVDQA 224(BP), Y13 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPALIGNR $0x04, Y11, Y11, Y11 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x0c, Y3, Y3, Y3 + VMOVDQA Y15, 224(BP) + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VMOVDQA 224(BP), Y15 + VMOVDQA Y13, 224(BP) + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x0c, Y11, Y13 + VPSRLD $0x14, Y11, Y11 + VPXOR Y13, Y11, Y11 + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x07, Y11, Y13 + VPSRLD $0x19, Y11, Y11 + VPXOR Y13, Y11, Y11 + VMOVDQA 224(BP), Y13 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x04, Y2, Y2, Y2 + VPALIGNR $0x0c, Y11, Y11, Y11 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x04, Y3, Y3, Y3 + DECQ R9 + JNE sealAVX2IntroLoop + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 32(BP), Y11, Y11 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD 64(BP), Y15, Y15 + VPADDD 96(BP), Y4, Y4 + VPADDD 128(BP), Y1, Y1 + VPADDD 160(BP), Y2, Y2 + VPADDD 192(BP), Y3, Y3 + VPERM2I128 $0x13, Y12, Y4, Y12 + VPERM2I128 $0x02, Y0, Y14, Y4 + VPERM2I128 $0x13, Y0, Y14, Y0 // Clamp and store poly key - VPAND ·polyClampMask<>(SB), DD0, DD0 - VMOVDQA DD0, rsStoreAVX2 + VPAND ·polyClampMask<>+0(SB), Y4, Y4 + VMOVDQA Y4, (BP) // Hash AD - MOVQ ad_len+80(FP), itr2 + MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) // Can store at least 320 bytes - VPXOR (0*32)(inp), AA0, AA0 - VPXOR 
(1*32)(inp), CC0, CC0 - VMOVDQU AA0, (0*32)(oup) - VMOVDQU CC0, (1*32)(oup) - - VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 - VPXOR (2*32)(inp), AA0, AA0; VPXOR (3*32)(inp), BB0, BB0; VPXOR (4*32)(inp), CC0, CC0; VPXOR (5*32)(inp), DD0, DD0 - VMOVDQU AA0, (2*32)(oup); VMOVDQU BB0, (3*32)(oup); VMOVDQU CC0, (4*32)(oup); VMOVDQU DD0, (5*32)(oup) - VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 - VPXOR (6*32)(inp), AA0, AA0; VPXOR (7*32)(inp), BB0, BB0; VPXOR (8*32)(inp), CC0, CC0; VPXOR (9*32)(inp), DD0, DD0 - VMOVDQU AA0, (6*32)(oup); VMOVDQU BB0, (7*32)(oup); VMOVDQU CC0, (8*32)(oup); VMOVDQU DD0, (9*32)(oup) - - MOVQ $320, itr1 - SUBQ $320, inl - LEAQ 320(inp), inp - - VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, CC3, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, CC3, DD3, DD0 - CMPQ inl, $128 + VPXOR (SI), Y0, Y0 + VPXOR 32(SI), Y12, Y12 + VMOVDQU Y0, (DI) + VMOVDQU Y12, 32(DI) + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 + VPXOR 64(SI), Y0, Y0 + VPXOR 96(SI), Y14, Y14 + VPXOR 128(SI), Y12, Y12 + VPXOR 160(SI), Y4, Y4 + VMOVDQU Y0, 64(DI) + VMOVDQU Y14, 96(DI) + VMOVDQU Y12, 128(DI) + VMOVDQU Y4, 160(DI) + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + VPXOR 192(SI), Y0, Y0 + VPXOR 224(SI), Y14, Y14 + VPXOR 256(SI), Y12, Y12 + VPXOR 288(SI), Y4, Y4 + VMOVDQU Y0, 192(DI) + VMOVDQU Y14, 224(DI) + VMOVDQU Y12, 256(DI) + VMOVDQU Y4, 288(DI) + MOVQ $0x00000140, CX + SUBQ $0x00000140, BX + LEAQ 320(SI), SI + VPERM2I128 $0x02, Y7, Y11, Y0 + VPERM2I128 $0x02, Y15, Y3, Y14 + VPERM2I128 $0x13, Y7, Y11, Y12 + VPERM2I128 $0x13, Y15, Y3, Y4 + CMPQ BX, $0x80 JBE sealAVX2SealHash - - VPXOR (0*32)(inp), AA0, AA0; VPXOR (1*32)(inp), BB0, BB0; VPXOR (2*32)(inp), CC0, CC0; VPXOR (3*32)(inp), DD0, DD0 - VMOVDQU AA0, (10*32)(oup); VMOVDQU BB0, (11*32)(oup); VMOVDQU CC0, (12*32)(oup); VMOVDQU DD0, (13*32)(oup) - SUBQ $128, inl - LEAQ 128(inp), inp - - MOVQ $8, itr1 - MOVQ $2, itr2 - - CMPQ inl, $128 - JBE sealAVX2Tail128 - CMPQ inl, $256 - JBE sealAVX2Tail256 - CMPQ inl, $384 - JBE sealAVX2Tail384 - CMPQ inl, $512 - JBE sealAVX2Tail512 + VPXOR (SI), Y0, Y0 + VPXOR 32(SI), Y14, Y14 + VPXOR 64(SI), Y12, Y12 + VPXOR 96(SI), Y4, Y4 + VMOVDQU Y0, 320(DI) + VMOVDQU Y14, 352(DI) + VMOVDQU Y12, 384(DI) + VMOVDQU Y4, 416(DI) + SUBQ $0x80, BX + LEAQ 128(SI), SI + MOVQ $0x00000008, CX + MOVQ $0x00000002, R9 + CMPQ BX, $0x80 + JBE sealAVX2Tail128 + CMPQ BX, $0x00000100 + JBE sealAVX2Tail256 + CMPQ BX, $0x00000180 + JBE sealAVX2Tail384 + CMPQ BX, $0x00000200 + JBE sealAVX2Tail512 // We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop - VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 - VMOVDQA ctr3StoreAVX2, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 - VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 - - VMOVDQA CC3, tmpStoreAVX2 - 
chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) - VMOVDQA tmpStoreAVX2, CC3 - VMOVDQA CC1, tmpStoreAVX2 - chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) - VMOVDQA tmpStoreAVX2, CC1 - - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0 - VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1 - VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2 - VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3 - - VMOVDQA CC3, tmpStoreAVX2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) - VMOVDQA tmpStoreAVX2, CC3 - VMOVDQA CC1, tmpStoreAVX2 - chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) - VMOVDQA tmpStoreAVX2, CC1 - - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0 - VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1 - VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2 - VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - - SUBQ $16, oup // Adjust the pointer - MOVQ $9, itr1 - JMP sealAVX2InternalLoopStart + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA Y0, Y7 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA Y14, Y11 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA Y12, Y15 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 + VMOVDQA Y4, 96(BP) + VMOVDQA Y1, 128(BP) + VMOVDQA Y2, 160(BP) + VMOVDQA Y3, 192(BP) + VMOVDQA Y15, 224(BP) + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + 
VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VMOVDQA 224(BP), Y15 + VMOVDQA Y13, 224(BP) + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x0c, Y11, Y13 + VPSRLD $0x14, Y11, Y11 + VPXOR Y13, Y11, Y11 + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x07, Y11, Y13 + VPSRLD $0x19, Y11, Y11 + VPXOR Y13, Y11, Y11 + VMOVDQA 224(BP), Y13 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPALIGNR $0x04, Y11, Y11, Y11 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x0c, Y3, Y3, Y3 + VMOVDQA Y15, 224(BP) + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VMOVDQA 224(BP), Y15 + VMOVDQA Y13, 224(BP) + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x0c, Y11, Y13 + VPSRLD $0x14, Y11, Y11 + VPXOR Y13, Y11, Y11 + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x07, Y11, Y13 + VPSRLD $0x19, Y11, Y11 + VPXOR Y13, Y11, Y11 + VMOVDQA 224(BP), Y13 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x04, Y2, Y2, Y2 + VPALIGNR $0x0c, Y11, Y11, Y11 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x04, Y3, Y3, Y3 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR 
Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + SUBQ $0x10, DI + MOVQ $0x00000009, CX + JMP sealAVX2InternalLoopStart sealAVX2MainLoop: - // Load state, increment counter blocks, store the incremented counters - VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 - VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 - VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 - MOVQ $10, itr1 + VMOVDQU ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA Y0, Y7 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA Y14, Y11 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA Y12, Y15 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 + VMOVDQA Y4, 96(BP) + VMOVDQA Y1, 128(BP) + VMOVDQA Y2, 160(BP) + VMOVDQA Y3, 192(BP) + MOVQ $0x0000000a, CX sealAVX2InternalLoop: - polyAdd(0*8(oup)) - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - polyMulStage1_AVX2 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - polyMulStage2_AVX2 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - polyMulStage3_AVX2 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulReduceStage + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + 
MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 sealAVX2InternalLoopStart: - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - polyAdd(2*8(oup)) - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - polyMulStage1_AVX2 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulStage2_AVX2 - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - polyMulStage3_AVX2 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - polyMulReduceStage - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - polyAdd(4*8(oup)) - LEAQ (6*8)(oup), oup - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulStage1_AVX2 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - polyMulStage2_AVX2 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - polyMulStage3_AVX2 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulReduceStage - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 - DECQ itr1 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 
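
The MULXQ/IMULQ/ADCQ/SHRQ runs interleaved with the vector instructions above are the open-coded Poly1305 accumulate-multiply-reduce that the old polyAdd/polyMul macros expanded to: the accumulator lives in R10/R11/R12, the clamped key limbs in 0(BP) and 8(BP), and the carry above 2^130 is folded back in as c*4 plus c. A minimal Go sketch of the same arithmetic, with illustrative limb names rather than the package's actual helpers:

```go
package main

import (
	"fmt"
	"math/bits"
)

// polyMulReduce sketches what each inlined MULXQ/ADCQ/SHRQ sequence computes:
// h = (h * r) mod 2^130 - 5, with the accumulator in three limbs (h0, h1, h2,
// i.e. R10, R11, R12) and the clamped key r in two limbs loaded from 0(BP)
// and 8(BP). Names are illustrative, not the generator's identifiers.
func polyMulReduce(h0, h1, h2, r0, r1 uint64) (uint64, uint64, uint64) {
	// Schoolbook product t3:t2:t1:t0 = (h2:h1:h0) * (r1:r0).
	hi00, lo00 := bits.Mul64(h0, r0)
	hi10, lo10 := bits.Mul64(h1, r0)
	hi01, lo01 := bits.Mul64(h0, r1)
	hi11, lo11 := bits.Mul64(h1, r1)

	t0 := lo00
	t1, c := bits.Add64(hi00, lo10, 0)
	t2 := h2*r0 + hi10 + c // r is clamped and h2 is tiny, so this cannot overflow
	t3 := hi11

	t1, c = bits.Add64(t1, lo01, 0)
	t2, c = bits.Add64(t2, lo11, c)
	t3 += c
	t2, c = bits.Add64(t2, hi01, 0)
	t3 += h2*r1 + c

	// Reduction: bits at and above 2^130 are worth 5 each (2^130 ≡ 5 mod p),
	// so they are folded back in as c*4 followed by c, matching the ANDQ/SHRQ
	// pair in the assembly.
	h0, h1, h2 = t0, t1, t2&3
	h0, c = bits.Add64(h0, t2&^uint64(3), 0)
	h1, c = bits.Add64(h1, t3, c)
	h2 += c
	h0, c = bits.Add64(h0, t2>>2|t3<<62, 0)
	h1, c = bits.Add64(h1, t3>>2, c)
	h2 += c
	return h0, h1, h2
}

func main() {
	// Multiplying h = 1 by a clamped r leaves r unchanged.
	h0, h1, h2 := polyMulReduce(1, 0, 0, 0x0ffffffc0fffffff, 0x0ffffffc0ffffffc)
	fmt.Printf("%x %016x %016x\n", h2, h1, h0)
}
```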
+ VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + ADDQ 16(DI), R10 + ADCQ 24(DI), R11 + ADCQ $0x01, R12 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x04, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPALIGNR $0x0c, Y3, Y3, Y3 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + ADDQ 32(DI), R10 + ADCQ 40(DI), R11 + ADCQ $0x01, R12 + LEAQ 48(DI), DI + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + 
VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x0c, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + VPALIGNR $0x04, Y3, Y3, Y3 + DECQ CX JNE sealAVX2InternalLoop - - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 - VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 - VMOVDQA CC3, tmpStoreAVX2 + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 32(BP), Y11, Y11 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD 64(BP), Y15, Y15 + VPADDD 96(BP), Y4, Y4 + VPADDD 128(BP), Y1, Y1 + VPADDD 160(BP), Y2, Y2 + VPADDD 192(BP), Y3, Y3 + VMOVDQA Y15, 224(BP) // We only hashed 480 of the 512 bytes available - hash the remaining 32 here - polyAdd(0*8(oup)) - polyMulAVX2 - LEAQ (4*8)(oup), oup - VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0 - VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0 - VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup) - VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 - VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 - VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(DI), DI + VPERM2I128 $0x02, Y0, Y14, Y15 + VPERM2I128 $0x13, Y0, Y14, Y14 + VPERM2I128 $0x02, Y12, Y4, Y0 + VPERM2I128 $0x13, Y12, Y4, Y12 + VPXOR (SI), Y15, Y15 + VPXOR 32(SI), Y0, Y0 + VPXOR 64(SI), Y14, Y14 + VPXOR 96(SI), Y12, Y12 + VMOVDQU Y15, (DI) + VMOVDQU Y0, 32(DI) + VMOVDQU Y14, 
64(DI) + VMOVDQU Y12, 96(DI) + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 + VPXOR 128(SI), Y0, Y0 + VPXOR 160(SI), Y14, Y14 + VPXOR 192(SI), Y12, Y12 + VPXOR 224(SI), Y4, Y4 + VMOVDQU Y0, 128(DI) + VMOVDQU Y14, 160(DI) + VMOVDQU Y12, 192(DI) + VMOVDQU Y4, 224(DI) // and here - polyAdd(-2*8(oup)) - polyMulAVX2 - VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 - VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 - VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) - VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 - VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0 - VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup) - LEAQ (32*16)(inp), inp - SUBQ $(32*16), inl - CMPQ inl, $512 + ADDQ -16(DI), R10 + ADCQ -8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + VPXOR 256(SI), Y0, Y0 + VPXOR 288(SI), Y14, Y14 + VPXOR 320(SI), Y12, Y12 + VPXOR 352(SI), Y4, Y4 + VMOVDQU Y0, 256(DI) + VMOVDQU Y14, 288(DI) + VMOVDQU Y12, 320(DI) + VMOVDQU Y4, 352(DI) + VPERM2I128 $0x02, Y7, Y11, Y0 + VPERM2I128 $0x02, 224(BP), Y3, Y14 + VPERM2I128 $0x13, Y7, Y11, Y12 + VPERM2I128 $0x13, 224(BP), Y3, Y4 + VPXOR 384(SI), Y0, Y0 + VPXOR 416(SI), Y14, Y14 + VPXOR 448(SI), Y12, Y12 + VPXOR 480(SI), Y4, Y4 + VMOVDQU Y0, 384(DI) + VMOVDQU Y14, 416(DI) + VMOVDQU Y12, 448(DI) + VMOVDQU Y4, 480(DI) + LEAQ 512(SI), SI + SUBQ $0x00000200, BX + CMPQ BX, $0x00000200 JG sealAVX2MainLoop // Tail can only hash 480 bytes - polyAdd(0*8(oup)) - polyMulAVX2 - polyAdd(2*8(oup)) - polyMulAVX2 - LEAQ 32(oup), oup - - MOVQ $10, itr1 - MOVQ $0, itr2 - CMPQ inl, $128 - JBE sealAVX2Tail128 - CMPQ inl, $256 - JBE sealAVX2Tail256 - CMPQ inl, $384 - JBE sealAVX2Tail384 - JMP sealAVX2Tail512 - -// ---------------------------------------------------------------------------- -// Special optimization for buffers smaller than 193 bytes + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + ADDQ 16(DI), R10 + ADCQ 
24(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(DI), DI + MOVQ $0x0000000a, CX + MOVQ $0x00000000, R9 + CMPQ BX, $0x80 + JBE sealAVX2Tail128 + CMPQ BX, $0x00000100 + JBE sealAVX2Tail256 + CMPQ BX, $0x00000180 + JBE sealAVX2Tail384 + JMP sealAVX2Tail512 + seal192AVX2: - // For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks - VMOVDQA AA0, AA1 - VMOVDQA BB0, BB1 - VMOVDQA CC0, CC1 - VPADDD ·avx2IncMask<>(SB), DD0, DD1 - VMOVDQA AA0, AA2 - VMOVDQA BB0, BB2 - VMOVDQA CC0, CC2 - VMOVDQA DD0, DD2 - VMOVDQA DD1, TT3 - MOVQ $10, itr2 + VMOVDQA Y0, Y5 + VMOVDQA Y14, Y9 + VMOVDQA Y12, Y13 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y0, Y6 + VMOVDQA Y14, Y10 + VMOVDQA Y12, Y8 + VMOVDQA Y4, Y2 + VMOVDQA Y1, Y15 + MOVQ $0x0000000a, R9 sealAVX2192InnerCipherLoop: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 - DECQ itr2 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR 
$0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + DECQ R9 JNE sealAVX2192InnerCipherLoop - VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1 - VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1 - VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1 - VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1 - VPERM2I128 $0x02, AA0, BB0, TT0 + VPADDD Y6, Y0, Y0 + VPADDD Y6, Y5, Y5 + VPADDD Y10, Y14, Y14 + VPADDD Y10, Y9, Y9 + VPADDD Y8, Y12, Y12 + VPADDD Y8, Y13, Y13 + VPADDD Y2, Y4, Y4 + VPADDD Y15, Y1, Y1 + VPERM2I128 $0x02, Y0, Y14, Y3 // Clamp and store poly key - VPAND ·polyClampMask<>(SB), TT0, TT0 - VMOVDQA TT0, rsStoreAVX2 + VPAND ·polyClampMask<>+0(SB), Y3, Y3 + VMOVDQA Y3, (BP) // Stream for up to 192 bytes - VPERM2I128 $0x13, AA0, BB0, AA0 - VPERM2I128 $0x13, CC0, DD0, BB0 - VPERM2I128 $0x02, AA1, BB1, CC0 - VPERM2I128 $0x02, CC1, DD1, DD0 - VPERM2I128 $0x13, AA1, BB1, AA1 - VPERM2I128 $0x13, CC1, DD1, BB1 + VPERM2I128 $0x13, Y0, Y14, Y0 + VPERM2I128 $0x13, Y12, Y4, Y14 + VPERM2I128 $0x02, Y5, Y9, Y12 + VPERM2I128 $0x02, Y13, Y1, Y4 + VPERM2I128 $0x13, Y5, Y9, Y5 + VPERM2I128 $0x13, Y13, Y1, Y9 sealAVX2ShortSeal: // Hash aad - MOVQ ad_len+80(FP), itr2 + MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) - XORQ itr1, itr1 + XORQ CX, CX sealAVX2SealHash: // itr1 holds the number of bytes encrypted but not yet hashed - CMPQ itr1, $16 - JB sealAVX2ShortSealLoop - polyAdd(0(oup)) - polyMul - SUBQ $16, itr1 - ADDQ $16, oup - JMP sealAVX2SealHash + CMPQ CX, $0x10 + JB sealAVX2ShortSealLoop + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + SUBQ $0x10, CX + ADDQ $0x10, DI + JMP sealAVX2SealHash sealAVX2ShortSealLoop: - CMPQ inl, $32 + CMPQ BX, $0x20 JB sealAVX2ShortTail32 - SUBQ $32, inl + SUBQ $0x20, BX // Load for encryption - VPXOR (inp), AA0, AA0 - VMOVDQU AA0, (oup) - LEAQ (1*32)(inp), inp + VPXOR (SI), Y0, Y0 + VMOVDQU Y0, (DI) + LEAQ 32(SI), SI // Now can hash - polyAdd(0*8(oup)) - polyMulAVX2 - polyAdd(2*8(oup)) - polyMulAVX2 - LEAQ (1*32)(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + ADDQ 16(DI), R10 + ADCQ 24(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + 
MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(DI), DI // Shift stream left - VMOVDQA BB0, AA0 - VMOVDQA CC0, BB0 - VMOVDQA DD0, CC0 - VMOVDQA AA1, DD0 - VMOVDQA BB1, AA1 - VMOVDQA CC1, BB1 - VMOVDQA DD1, CC1 - VMOVDQA AA2, DD1 - VMOVDQA BB2, AA2 + VMOVDQA Y14, Y0 + VMOVDQA Y12, Y14 + VMOVDQA Y4, Y12 + VMOVDQA Y5, Y4 + VMOVDQA Y9, Y5 + VMOVDQA Y13, Y9 + VMOVDQA Y1, Y13 + VMOVDQA Y6, Y1 + VMOVDQA Y10, Y6 JMP sealAVX2ShortSealLoop sealAVX2ShortTail32: - CMPQ inl, $16 - VMOVDQA A0, A1 + CMPQ BX, $0x10 + VMOVDQA X0, X1 JB sealAVX2ShortDone - - SUBQ $16, inl + SUBQ $0x10, BX // Load for encryption - VPXOR (inp), A0, T0 - VMOVDQU T0, (oup) - LEAQ (1*16)(inp), inp + VPXOR (SI), X0, X12 + VMOVDQU X12, (DI) + LEAQ 16(SI), SI // Hash - polyAdd(0*8(oup)) - polyMulAVX2 - LEAQ (1*16)(oup), oup - VPERM2I128 $0x11, AA0, AA0, AA0 - VMOVDQA A0, A1 + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI + VPERM2I128 $0x11, Y0, Y0, Y0 + VMOVDQA X0, X1 sealAVX2ShortDone: VZEROUPPER JMP sealSSETail -// ---------------------------------------------------------------------------- -// Special optimization for buffers smaller than 321 bytes seal320AVX2: - // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks - VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1 - VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2 - VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3 - MOVQ $10, itr2 + VMOVDQA Y0, Y5 + VMOVDQA Y14, Y9 + VMOVDQA Y12, Y13 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y0, Y6 + VMOVDQA Y14, Y10 + VMOVDQA Y12, Y8 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VMOVDQA Y14, Y7 + VMOVDQA Y12, Y11 + VMOVDQA Y4, Y15 + MOVQ $0x0000000a, R9 sealAVX2320InnerCipherLoop: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 - DECQ itr2 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 
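
Each VPADDD/VPXOR/VPSHUFB and VPSLLD/VPSRLD/VPXOR group in these inner cipher loops is one step of the ChaCha20 quarter round applied to eight 32-bit lanes per YMM register: the 16- and 8-bit rotations use the ·rol16<>/·rol8<> byte-shuffle tables via VPSHUFB, while the 12- and 7-bit rotations need the shift-shift-xor pattern. A scalar sketch of the quarter round, checked against the RFC 8439 test vector:

```go
package main

import (
	"fmt"
	"math/bits"
)

// quarterRound is the scalar form of the operation the AVX2 code above
// performs eight lanes at a time.
func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
	a += b
	d = bits.RotateLeft32(d^a, 16)
	c += d
	b = bits.RotateLeft32(b^c, 12)
	a += b
	d = bits.RotateLeft32(d^a, 8)
	c += d
	b = bits.RotateLeft32(b^c, 7)
	return a, b, c, d
}

func main() {
	// Test vector from RFC 8439, section 2.1.1.
	a, b, c, d := quarterRound(0x11111111, 0x01020304, 0x9b8d6f43, 0x01234567)
	fmt.Printf("%08x %08x %08x %08x\n", a, b, c, d)
	// Expected: ea2a92f4 cb1cf8ce 4581472e 5881c4bb
}
```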
+ VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + DECQ R9 JNE sealAVX2320InnerCipherLoop - - VMOVDQA ·chacha20Constants<>(SB), TT0 - VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2 - VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2 - VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2 - VMOVDQA ·avx2IncMask<>(SB), TT0 - VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3 - VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3 - VPADDD TT3, DD2, DD2 + VMOVDQA ·chacha20Constants<>+0(SB), Y3 + VPADDD Y3, Y0, Y0 + VPADDD Y3, Y5, Y5 + VPADDD Y3, Y6, Y6 + VPADDD Y7, Y14, Y14 + VPADDD Y7, Y9, Y9 + VPADDD Y7, Y10, Y10 + VPADDD Y11, Y12, Y12 + VPADDD Y11, Y13, Y13 + VPADDD Y11, Y8, Y8 + VMOVDQA ·avx2IncMask<>+0(SB), Y3 + VPADDD Y15, Y4, Y4 + VPADDD Y3, Y15, Y15 + VPADDD Y15, Y1, Y1 + VPADDD Y3, Y15, Y15 + VPADDD Y15, Y2, Y2 // Clamp and store poly key - VPERM2I128 $0x02, AA0, BB0, TT0 - VPAND ·polyClampMask<>(SB), TT0, TT0 - VMOVDQA TT0, rsStoreAVX2 + VPERM2I128 $0x02, Y0, Y14, Y3 + VPAND ·polyClampMask<>+0(SB), Y3, Y3 + VMOVDQA Y3, (BP) // Stream for up to 320 bytes - VPERM2I128 $0x13, AA0, BB0, AA0 - 
VPERM2I128 $0x13, CC0, DD0, BB0 - VPERM2I128 $0x02, AA1, BB1, CC0 - VPERM2I128 $0x02, CC1, DD1, DD0 - VPERM2I128 $0x13, AA1, BB1, AA1 - VPERM2I128 $0x13, CC1, DD1, BB1 - VPERM2I128 $0x02, AA2, BB2, CC1 - VPERM2I128 $0x02, CC2, DD2, DD1 - VPERM2I128 $0x13, AA2, BB2, AA2 - VPERM2I128 $0x13, CC2, DD2, BB2 + VPERM2I128 $0x13, Y0, Y14, Y0 + VPERM2I128 $0x13, Y12, Y4, Y14 + VPERM2I128 $0x02, Y5, Y9, Y12 + VPERM2I128 $0x02, Y13, Y1, Y4 + VPERM2I128 $0x13, Y5, Y9, Y5 + VPERM2I128 $0x13, Y13, Y1, Y9 + VPERM2I128 $0x02, Y6, Y10, Y13 + VPERM2I128 $0x02, Y8, Y2, Y1 + VPERM2I128 $0x13, Y6, Y10, Y6 + VPERM2I128 $0x13, Y8, Y2, Y10 JMP sealAVX2ShortSeal -// ---------------------------------------------------------------------------- -// Special optimization for the last 128 bytes of ciphertext sealAVX2Tail128: - // Need to decrypt up to 128 bytes - prepare two blocks - // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed - // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed - VMOVDQA ·chacha20Constants<>(SB), AA0 - VMOVDQA state1StoreAVX2, BB0 - VMOVDQA state2StoreAVX2, CC0 - VMOVDQA ctr3StoreAVX2, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD0 - VMOVDQA DD0, DD1 + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA 32(BP), Y14 + VMOVDQA 64(BP), Y12 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VMOVDQA Y4, Y1 sealAVX2Tail128LoopA: - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI sealAVX2Tail128LoopB: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) - polyAdd(0(oup)) - polyMul - VPALIGNR $4, BB0, BB0, BB0 - VPALIGNR $8, CC0, CC0, CC0 - VPALIGNR $12, DD0, DD0, DD0 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) - polyAdd(16(oup)) - polyMul - LEAQ 32(oup), oup - VPALIGNR $12, BB0, BB0, BB0 - VPALIGNR $8, CC0, CC0, CC0 - VPALIGNR $4, DD0, DD0, DD0 - DECQ itr1 - JG sealAVX2Tail128LoopA - DECQ itr2 - JGE sealAVX2Tail128LoopB - - VPADDD ·chacha20Constants<>(SB), AA0, AA1 - VPADDD state1StoreAVX2, BB0, BB1 - VPADDD state2StoreAVX2, CC0, CC1 - VPADDD DD1, DD0, DD1 - - VPERM2I128 $0x02, AA1, BB1, AA0 - VPERM2I128 $0x02, CC1, DD1, BB0 - VPERM2I128 $0x13, AA1, BB1, CC0 - VPERM2I128 $0x13, CC1, DD1, DD0 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + 
ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + ADDQ 16(DI), R10 + ADCQ 24(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(DI), DI + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x04, Y4, Y4, Y4 + DECQ CX + JG sealAVX2Tail128LoopA + DECQ R9 + JGE sealAVX2Tail128LoopB + VPADDD ·chacha20Constants<>+0(SB), Y0, Y5 + VPADDD 32(BP), Y14, Y9 + VPADDD 64(BP), Y12, Y13 + VPADDD Y1, Y4, Y1 + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 JMP sealAVX2ShortSealLoop -// ---------------------------------------------------------------------------- -// Special optimization for the last 256 bytes of ciphertext sealAVX2Tail256: - // Need to decrypt up to 256 bytes - prepare two blocks - // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed - // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed - VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA ·chacha20Constants<>(SB), AA1 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA state1StoreAVX2, BB1 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA state2StoreAVX2, CC1 - VMOVDQA ctr3StoreAVX2, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD1 - VMOVDQA DD0, TT1 - VMOVDQA DD1, TT2 + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA ·chacha20Constants<>+0(SB), Y5 + VMOVDQA 32(BP), Y14 + VMOVDQA 32(BP), Y9 + VMOVDQA 64(BP), Y12 + VMOVDQA 64(BP), Y13 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y4, Y7 + VMOVDQA Y1, Y11 sealAVX2Tail256LoopA: - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ 
R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI sealAVX2Tail256LoopB: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - polyAdd(0(oup)) - polyMul - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - polyAdd(16(oup)) - polyMul - LEAQ 32(oup), oup - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 - DECQ itr1 - JG sealAVX2Tail256LoopA - DECQ itr2 - JGE sealAVX2Tail256LoopB - - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1 - VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1 - VPERM2I128 $0x02, AA0, BB0, TT0 - VPERM2I128 $0x02, CC0, DD0, TT1 - VPERM2I128 $0x13, AA0, BB0, TT2 - VPERM2I128 $0x13, CC0, DD0, TT3 - VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3 - VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup) - MOVQ $128, itr1 - LEAQ 128(inp), inp - SUBQ $128, inl - VPERM2I128 $0x02, AA1, BB1, AA0 - VPERM2I128 $0x02, CC1, DD1, BB0 - VPERM2I128 $0x13, AA1, BB1, CC0 - VPERM2I128 $0x13, CC1, DD1, DD0 - - JMP sealAVX2SealHash - -// ---------------------------------------------------------------------------- -// Special optimization for the last 384 bytes of ciphertext + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + 
VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + ADDQ 16(DI), R10 + ADCQ 24(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(DI), DI + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + DECQ CX + JG sealAVX2Tail256LoopA + DECQ R9 + JGE sealAVX2Tail256LoopB + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD Y7, Y4, Y4 + VPADDD Y11, Y1, Y1 + VPERM2I128 $0x02, Y0, Y14, Y3 + VPERM2I128 $0x02, Y12, Y4, Y7 + VPERM2I128 $0x13, Y0, Y14, Y11 + VPERM2I128 $0x13, Y12, Y4, Y15 + VPXOR (SI), Y3, Y3 + VPXOR 32(SI), Y7, Y7 + VPXOR 64(SI), Y11, Y11 + VPXOR 96(SI), Y15, Y15 + VMOVDQU Y3, (DI) + VMOVDQU Y7, 32(DI) + VMOVDQU Y11, 64(DI) + VMOVDQU Y15, 96(DI) + MOVQ $0x00000080, CX + LEAQ 128(SI), SI + SUBQ $0x80, BX + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 + JMP sealAVX2SealHash + sealAVX2Tail384: - // Need to decrypt up to 384 bytes - prepare two blocks - // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed - // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed - VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2 - VMOVDQA ctr3StoreAVX2, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2 - VMOVDQA DD0, TT1; VMOVDQA DD1, TT2; VMOVDQA DD2, TT3 + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VMOVDQA Y4, Y7 + VMOVDQA Y1, Y11 + VMOVDQA Y2, Y15 sealAVX2Tail384LoopA: - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 
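
In the seal192/seal320 paths above, the Poly1305 key is taken from the first 32 bytes of keystream and the r half is masked with ·polyClampMask<> (the VPAND before the VMOVDQA to 0(BP)). A small sketch of that clamp on two little-endian 64-bit limbs, with illustrative names:

```go
package main

import "fmt"

// clampR clears the bits RFC 8439 requires to be zero in the Poly1305 r key,
// matching the VPAND with ·polyClampMask<> in the key-derivation paths.
// r0 is the little-endian low half of r, r1 the high half.
func clampR(r0, r1 uint64) (uint64, uint64) {
	r0 &= 0x0ffffffc0fffffff
	r1 &= 0x0ffffffc0ffffffc
	return r0, r1
}

func main() {
	r0, r1 := clampR(0xffffffffffffffff, 0xffffffffffffffff)
	fmt.Printf("%016x %016x\n", r1, r0)
}
```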
+ ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI sealAVX2Tail384LoopB: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - polyAdd(0(oup)) - polyMul - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - polyAdd(16(oup)) - polyMul - LEAQ 32(oup), oup - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 - DECQ itr1 - JG sealAVX2Tail384LoopA - DECQ itr2 - JGE sealAVX2Tail384LoopB - - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2 - VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1; VPADDD TT3, DD2, DD2 - VPERM2I128 $0x02, AA0, BB0, TT0 - VPERM2I128 $0x02, CC0, DD0, TT1 - VPERM2I128 $0x13, AA0, BB0, TT2 - VPERM2I128 $0x13, CC0, DD0, TT3 - VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3 - VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup) - VPERM2I128 $0x02, AA1, BB1, TT0 - VPERM2I128 $0x02, CC1, DD1, TT1 - VPERM2I128 $0x13, AA1, BB1, TT2 - VPERM2I128 $0x13, CC1, DD1, TT3 - VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3 - VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup) - MOVQ $256, itr1 - LEAQ 256(inp), inp - SUBQ $256, inl - VPERM2I128 $0x02, AA2, BB2, AA0 - VPERM2I128 $0x02, CC2, DD2, BB0 - VPERM2I128 $0x13, AA2, BB2, CC0 - VPERM2I128 $0x13, CC2, DD2, DD0 - - JMP sealAVX2SealHash - -// ---------------------------------------------------------------------------- -// Special optimization for the last 512 bytes of ciphertext + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + 
VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + ADDQ 16(DI), R10 + ADCQ 24(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(DI), DI + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + DECQ CX + JG sealAVX2Tail384LoopA + DECQ R9 + JGE sealAVX2Tail384LoopB + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD 
32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD Y7, Y4, Y4 + VPADDD Y11, Y1, Y1 + VPADDD Y15, Y2, Y2 + VPERM2I128 $0x02, Y0, Y14, Y3 + VPERM2I128 $0x02, Y12, Y4, Y7 + VPERM2I128 $0x13, Y0, Y14, Y11 + VPERM2I128 $0x13, Y12, Y4, Y15 + VPXOR (SI), Y3, Y3 + VPXOR 32(SI), Y7, Y7 + VPXOR 64(SI), Y11, Y11 + VPXOR 96(SI), Y15, Y15 + VMOVDQU Y3, (DI) + VMOVDQU Y7, 32(DI) + VMOVDQU Y11, 64(DI) + VMOVDQU Y15, 96(DI) + VPERM2I128 $0x02, Y5, Y9, Y3 + VPERM2I128 $0x02, Y13, Y1, Y7 + VPERM2I128 $0x13, Y5, Y9, Y11 + VPERM2I128 $0x13, Y13, Y1, Y15 + VPXOR 128(SI), Y3, Y3 + VPXOR 160(SI), Y7, Y7 + VPXOR 192(SI), Y11, Y11 + VPXOR 224(SI), Y15, Y15 + VMOVDQU Y3, 128(DI) + VMOVDQU Y7, 160(DI) + VMOVDQU Y11, 192(DI) + VMOVDQU Y15, 224(DI) + MOVQ $0x00000100, CX + LEAQ 256(SI), SI + SUBQ $0x00000100, BX + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + JMP sealAVX2SealHash + sealAVX2Tail512: - // Need to decrypt up to 512 bytes - prepare two blocks - // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed - // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed - VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 - VMOVDQA ctr3StoreAVX2, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 - VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA Y0, Y7 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA Y14, Y11 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA Y12, Y15 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 + VMOVDQA Y4, 96(BP) + VMOVDQA Y1, 128(BP) + VMOVDQA Y2, 160(BP) + VMOVDQA Y3, 192(BP) sealAVX2Tail512LoopA: - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI sealAVX2Tail512LoopB: - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR 
CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyAdd(0*8(oup)) - polyMulAVX2 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - polyAdd(2*8(oup)) - polyMulAVX2 - LEAQ (4*8)(oup), oup - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 - - DECQ itr1 - JG 
sealAVX2Tail512LoopA - DECQ itr2 - JGE sealAVX2Tail512LoopB - - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 - VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 - VMOVDQA CC3, tmpStoreAVX2 - VPERM2I128 $0x02, AA0, BB0, CC3 - VPXOR (0*32)(inp), CC3, CC3 - VMOVDQU CC3, (0*32)(oup) - VPERM2I128 $0x02, CC0, DD0, CC3 - VPXOR (1*32)(inp), CC3, CC3 - VMOVDQU CC3, (1*32)(oup) - VPERM2I128 $0x13, AA0, BB0, CC3 - VPXOR (2*32)(inp), CC3, CC3 - VMOVDQU CC3, (2*32)(oup) - VPERM2I128 $0x13, CC0, DD0, CC3 - VPXOR (3*32)(inp), CC3, CC3 - VMOVDQU CC3, (3*32)(oup) - - VPERM2I128 $0x02, AA1, BB1, AA0 - VPERM2I128 $0x02, CC1, DD1, BB0 - VPERM2I128 $0x13, AA1, BB1, CC0 - VPERM2I128 $0x13, CC1, DD1, DD0 - VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 - VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) - - VPERM2I128 $0x02, AA2, BB2, AA0 - VPERM2I128 $0x02, CC2, DD2, BB0 - VPERM2I128 $0x13, AA2, BB2, CC0 - VPERM2I128 $0x13, CC2, DD2, DD0 - VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 - VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) - - MOVQ $384, itr1 - LEAQ 384(inp), inp - SUBQ $384, inl - VPERM2I128 $0x02, AA3, BB3, AA0 - VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0 - VPERM2I128 $0x13, AA3, BB3, CC0 - VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 - - JMP sealAVX2SealHash + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + 
VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x04, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPALIGNR $0x0c, Y3, Y3, Y3 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + ADDQ 16(DI), R10 + ADCQ 24(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(DI), DI + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x0c, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + VPALIGNR $0x04, Y3, Y3, Y3 + DECQ CX + JG sealAVX2Tail512LoopA + DECQ R9 + JGE 
sealAVX2Tail512LoopB + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 32(BP), Y11, Y11 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD 64(BP), Y15, Y15 + VPADDD 96(BP), Y4, Y4 + VPADDD 128(BP), Y1, Y1 + VPADDD 160(BP), Y2, Y2 + VPADDD 192(BP), Y3, Y3 + VMOVDQA Y15, 224(BP) + VPERM2I128 $0x02, Y0, Y14, Y15 + VPXOR (SI), Y15, Y15 + VMOVDQU Y15, (DI) + VPERM2I128 $0x02, Y12, Y4, Y15 + VPXOR 32(SI), Y15, Y15 + VMOVDQU Y15, 32(DI) + VPERM2I128 $0x13, Y0, Y14, Y15 + VPXOR 64(SI), Y15, Y15 + VMOVDQU Y15, 64(DI) + VPERM2I128 $0x13, Y12, Y4, Y15 + VPXOR 96(SI), Y15, Y15 + VMOVDQU Y15, 96(DI) + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 + VPXOR 128(SI), Y0, Y0 + VPXOR 160(SI), Y14, Y14 + VPXOR 192(SI), Y12, Y12 + VPXOR 224(SI), Y4, Y4 + VMOVDQU Y0, 128(DI) + VMOVDQU Y14, 160(DI) + VMOVDQU Y12, 192(DI) + VMOVDQU Y4, 224(DI) + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + VPXOR 256(SI), Y0, Y0 + VPXOR 288(SI), Y14, Y14 + VPXOR 320(SI), Y12, Y12 + VPXOR 352(SI), Y4, Y4 + VMOVDQU Y0, 256(DI) + VMOVDQU Y14, 288(DI) + VMOVDQU Y12, 320(DI) + VMOVDQU Y4, 352(DI) + MOVQ $0x00000180, CX + LEAQ 384(SI), SI + SUBQ $0x00000180, BX + VPERM2I128 $0x02, Y7, Y11, Y0 + VPERM2I128 $0x02, 224(BP), Y3, Y14 + VPERM2I128 $0x13, Y7, Y11, Y12 + VPERM2I128 $0x13, 224(BP), Y3, Y4 + JMP sealAVX2SealHash
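For orientation only (this note and the sketch below are not part of this CL or of the generated file): the sealAVX2Tail384/sealAVX2Tail512 loops above interleave two independent computations. The VPADDD/VPXOR/VPSHUFB/VPSLLD/VPSRLD groups are ChaCha20 quarter rounds, vectorized so that, roughly, each 256-bit register carries one row of the ChaCha state for two blocks at a time (one per 128-bit lane), which is why VPERM2I128 is needed at the end to reassemble contiguous 64-byte blocks before the VPXOR/VMOVDQU with the input at (SI). The VPALIGNR $0x04/$0x08/$0x0c rotations diagonalize the state between the column and diagonal half-rounds and are undone by the mirrored $0x0c/$0x08/$0x04 rotations. The scalar ADDQ/ADCQ/MULQ (or MULXQ, where BMI2 is used) blocks are the usual Poly1305 accumulate-and-multiply step: they fold 16 bytes of already-written ciphertext at (DI) into the accumulator held in R10/R11/R12, multiply by r loaded from (BP) and 8(BP), and reduce modulo 2^130 - 5. The following Go sketch shows the scalar equivalent of the quarter round and double round; the names (package sketch, quarterRound, doubleRound) are illustrative, not code from this change.

package sketch

import "math/bits"

// quarterRound is the scalar ChaCha20 quarter round (RFC 8439). In the AVX2
// code above, the 16- and 8-bit rotations are VPSHUFB byte shuffles with the
// rol16/rol8 masks, and the 12- and 7-bit rotations are VPSLLD/VPSRLD/VPXOR.
func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
	a += b
	d = bits.RotateLeft32(d^a, 16)
	c += d
	b = bits.RotateLeft32(b^c, 12)
	a += b
	d = bits.RotateLeft32(d^a, 8)
	c += d
	b = bits.RotateLeft32(b^c, 7)
	return a, b, c, d
}

// doubleRound applies the column round followed by the diagonal round to a
// single 16-word state. The vector code performs the same work on several
// states at once, using VPALIGNR lane rotations of the b, c and d rows in
// place of the re-indexing done here.
func doubleRound(s *[16]uint32) {
	s[0], s[4], s[8], s[12] = quarterRound(s[0], s[4], s[8], s[12])
	s[1], s[5], s[9], s[13] = quarterRound(s[1], s[5], s[9], s[13])
	s[2], s[6], s[10], s[14] = quarterRound(s[2], s[6], s[10], s[14])
	s[3], s[7], s[11], s[15] = quarterRound(s[3], s[7], s[11], s[15])
	s[0], s[5], s[10], s[15] = quarterRound(s[0], s[5], s[10], s[15])
	s[1], s[6], s[11], s[12] = quarterRound(s[1], s[6], s[11], s[12])
	s[2], s[7], s[8], s[13] = quarterRound(s[2], s[7], s[8], s[13])
	s[3], s[4], s[9], s[14] = quarterRound(s[3], s[4], s[9], s[14])
}

Each pass through sealAVX2Tail384LoopB or sealAVX2Tail512LoopB corresponds to one such double round across all register groups, with the DECQ/JG and DECQ/JGE counters controlling how many rounds run and how much ciphertext is hashed alongside them; once the rounds complete, the saved initial state at 32(BP), 64(BP) and 96(BP)-192(BP) is added back before the keystream is combined with the input.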