diff --git a/chacha20/chacha_ppc64le.s b/chacha20/chacha_ppc64le.s index 66aebae258..c672ccf698 100644 --- a/chacha20/chacha_ppc64le.s +++ b/chacha20/chacha_ppc64le.s @@ -33,6 +33,9 @@ #define CONSTBASE R16 #define BLOCKS R17 +// for VPERMXOR +#define MASK R18 + DATA consts<>+0x00(SB)/8, $0x3320646e61707865 DATA consts<>+0x08(SB)/8, $0x6b20657479622d32 DATA consts<>+0x10(SB)/8, $0x0000000000000001 @@ -53,7 +56,11 @@ DATA consts<>+0x80(SB)/8, $0x6b2065746b206574 DATA consts<>+0x88(SB)/8, $0x6b2065746b206574 DATA consts<>+0x90(SB)/8, $0x0000000100000000 DATA consts<>+0x98(SB)/8, $0x0000000300000002 -GLOBL consts<>(SB), RODATA, $0xa0 +DATA consts<>+0xa0(SB)/8, $0x5566774411223300 +DATA consts<>+0xa8(SB)/8, $0xddeeffcc99aabb88 +DATA consts<>+0xb0(SB)/8, $0x6677445522330011 +DATA consts<>+0xb8(SB)/8, $0xeeffccddaabb8899 +GLOBL consts<>(SB), RODATA, $0xc0 //func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32) TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40 @@ -70,6 +77,9 @@ TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40 MOVD $48, R10 MOVD $64, R11 SRD $6, LEN, BLOCKS + // for VPERMXOR + MOVD $consts<>+0xa0(SB), MASK + MOVD $16, R20 // V16 LXVW4X (CONSTBASE)(R0), VS48 ADD $80,CONSTBASE @@ -87,6 +97,10 @@ TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40 // V28 LXVW4X (CONSTBASE)(R11), VS60 + // Load mask constants for VPERMXOR + LXVW4X (MASK)(R0), V20 + LXVW4X (MASK)(R20), V21 + // splat slot from V19 -> V26 VSPLTW $0, V19, V26 @@ -97,7 +111,7 @@ TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40 MOVD $10, R14 MOVD R14, CTR - + PCALIGN $16 loop_outer_vsx: // V0, V1, V2, V3 LXVW4X (R0)(CONSTBASE), VS32 @@ -128,22 +142,17 @@ loop_outer_vsx: VSPLTISW $12, V28 VSPLTISW $8, V29 VSPLTISW $7, V30 - + PCALIGN $16 loop_vsx: VADDUWM V0, V4, V0 VADDUWM V1, V5, V1 VADDUWM V2, V6, V2 VADDUWM V3, V7, V3 - VXOR V12, V0, V12 - VXOR V13, V1, V13 - VXOR V14, V2, V14 - VXOR V15, V3, V15 - - VRLW V12, V27, V12 - VRLW V13, V27, V13 - VRLW V14, V27, V14 - VRLW V15, V27, V15 + VPERMXOR V12, V0, V21, V12 + VPERMXOR V13, V1, V21, V13 + VPERMXOR V14, V2, V21, V14 + VPERMXOR V15, V3, V21, V15 VADDUWM V8, V12, V8 VADDUWM V9, V13, V9 @@ -165,15 +174,10 @@ loop_vsx: VADDUWM V2, V6, V2 VADDUWM V3, V7, V3 - VXOR V12, V0, V12 - VXOR V13, V1, V13 - VXOR V14, V2, V14 - VXOR V15, V3, V15 - - VRLW V12, V29, V12 - VRLW V13, V29, V13 - VRLW V14, V29, V14 - VRLW V15, V29, V15 + VPERMXOR V12, V0, V20, V12 + VPERMXOR V13, V1, V20, V13 + VPERMXOR V14, V2, V20, V14 + VPERMXOR V15, V3, V20, V15 VADDUWM V8, V12, V8 VADDUWM V9, V13, V9 @@ -195,15 +199,10 @@ loop_vsx: VADDUWM V2, V7, V2 VADDUWM V3, V4, V3 - VXOR V15, V0, V15 - VXOR V12, V1, V12 - VXOR V13, V2, V13 - VXOR V14, V3, V14 - - VRLW V15, V27, V15 - VRLW V12, V27, V12 - VRLW V13, V27, V13 - VRLW V14, V27, V14 + VPERMXOR V15, V0, V21, V15 + VPERMXOR V12, V1, V21, V12 + VPERMXOR V13, V2, V21, V13 + VPERMXOR V14, V3, V21, V14 VADDUWM V10, V15, V10 VADDUWM V11, V12, V11 @@ -225,15 +224,10 @@ loop_vsx: VADDUWM V2, V7, V2 VADDUWM V3, V4, V3 - VXOR V15, V0, V15 - VXOR V12, V1, V12 - VXOR V13, V2, V13 - VXOR V14, V3, V14 - - VRLW V15, V29, V15 - VRLW V12, V29, V12 - VRLW V13, V29, V13 - VRLW V14, V29, V14 + VPERMXOR V15, V0, V20, V15 + VPERMXOR V12, V1, V20, V12 + VPERMXOR V13, V2, V20, V13 + VPERMXOR V14, V3, V20, V14 VADDUWM V10, V15, V10 VADDUWM V11, V12, V11 @@ -249,48 +243,48 @@ loop_vsx: VRLW V6, V30, V6 VRLW V7, V30, V7 VRLW V4, V30, V4 - BC 16, LT, loop_vsx + BDNZ loop_vsx VADDUWM V12, V26, V12 - WORD $0x13600F8C // VMRGEW V0, V1, V27 - WORD $0x13821F8C // VMRGEW V2, V3, V28 + VMRGEW V0, V1, V27 + VMRGEW V2, V3, V28 - WORD $0x10000E8C // VMRGOW V0, V1, V0 - WORD $0x10421E8C // VMRGOW V2, V3, V2 + VMRGOW V0, V1, V0 + VMRGOW V2, V3, V2 - WORD $0x13A42F8C // VMRGEW V4, V5, V29 - WORD $0x13C63F8C // VMRGEW V6, V7, V30 + VMRGEW V4, V5, V29 + VMRGEW V6, V7, V30 XXPERMDI VS32, VS34, $0, VS33 XXPERMDI VS32, VS34, $3, VS35 XXPERMDI VS59, VS60, $0, VS32 XXPERMDI VS59, VS60, $3, VS34 - WORD $0x10842E8C // VMRGOW V4, V5, V4 - WORD $0x10C63E8C // VMRGOW V6, V7, V6 + VMRGOW V4, V5, V4 + VMRGOW V6, V7, V6 - WORD $0x13684F8C // VMRGEW V8, V9, V27 - WORD $0x138A5F8C // VMRGEW V10, V11, V28 + VMRGEW V8, V9, V27 + VMRGEW V10, V11, V28 XXPERMDI VS36, VS38, $0, VS37 XXPERMDI VS36, VS38, $3, VS39 XXPERMDI VS61, VS62, $0, VS36 XXPERMDI VS61, VS62, $3, VS38 - WORD $0x11084E8C // VMRGOW V8, V9, V8 - WORD $0x114A5E8C // VMRGOW V10, V11, V10 + VMRGOW V8, V9, V8 + VMRGOW V10, V11, V10 - WORD $0x13AC6F8C // VMRGEW V12, V13, V29 - WORD $0x13CE7F8C // VMRGEW V14, V15, V30 + VMRGEW V12, V13, V29 + VMRGEW V14, V15, V30 XXPERMDI VS40, VS42, $0, VS41 XXPERMDI VS40, VS42, $3, VS43 XXPERMDI VS59, VS60, $0, VS40 XXPERMDI VS59, VS60, $3, VS42 - WORD $0x118C6E8C // VMRGOW V12, V13, V12 - WORD $0x11CE7E8C // VMRGOW V14, V15, V14 + VMRGOW V12, V13, V12 + VMRGOW V14, V15, V14 VSPLTISW $4, V27 VADDUWM V26, V27, V26 @@ -431,7 +425,7 @@ tail_vsx: ADD $-1, R11, R12 ADD $-1, INP ADD $-1, OUT - + PCALIGN $16 looptail_vsx: // Copying the result to OUT // in bytes. @@ -439,7 +433,7 @@ looptail_vsx: MOVBZU 1(INP), TMP XOR KEY, TMP, KEY MOVBU KEY, 1(OUT) - BC 16, LT, looptail_vsx + BDNZ looptail_vsx // Clear the stack values STXVW4X VS48, (R11)(R0)