Skip to content

Commit

Permalink
Restore rotated EOR sequence
Browse files Browse the repository at this point in the history
This is a couple of percent faster for small inputs in my benchmarks.
  • Loading branch information
cespare committed Dec 2, 2022
1 parent bbc4786 commit 27bcde0
Showing 1 changed file with 10 additions and 7 deletions.
17 changes: 10 additions & 7 deletions xxhash_arm64.s
Original file line number Diff line number Diff line change
Expand Up @@ -95,31 +95,34 @@ afterLoop:
LDP.P 16(p), (x1, x2)

round0(x1)
EOR x1, h
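// NOTE: here and below, sequencing the EOR after the ROR (using a
// rotated register operand, written x @> r) is worth a small but
// measurable speedup for small inputs. The result is the same as
// XORing first and rotating afterwards, because rotation distributes
// over XOR: ROR(h ^ x, r) == ROR(h, r) ^ ROR(x, r).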
ROR $64-27, h
EOR x1 @> 64-27, h, h
MADD h, prime4, prime1, h

round0(x2)
EOR x2, h
ROR $64-27, h
EOR x2 @> 64-27, h, h
MADD h, prime4, prime1, h

try8:
TBZ $3, n, try4
MOVD.P 8(p), x1

round0(x1)
EOR x1, h
ROR $64-27, h
EOR x1 @> 64-27, h, h
MADD h, prime4, prime1, h

try4:
TBZ $2, n, try2
MOVWU.P 4(p), x2

MUL prime1, x2
EOR x2, h
ROR $64-23, h
EOR x2 @> 64-23, h, h
MADD h, prime3, prime2, h

try2:
Expand All @@ -129,22 +132,22 @@ try2:
LSR $8, x3, x2

MUL prime5, x1
EOR x1, h
ROR $64-11, h
EOR x1 @> 64-11, h, h
MUL prime1, h

MUL prime5, x2
EOR x2, h
ROR $64-11, h
EOR x2 @> 64-11, h, h
MUL prime1, h

try1:
TBZ $0, n, finalize
MOVBU (p), x4

MUL prime5, x4
EOR x4, h
ROR $64-11, h
EOR x4 @> 64-11, h, h
MUL prime1, h

finalize:
Expand Down

0 comments on commit 27bcde0

Please sign in to comment.