-
Notifications
You must be signed in to change notification settings - Fork 12.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fast algorithm for u128 (and i128) divided by small constant #54867
Comments
No point in doing it in rustc, if it should be done in LLVM. |
Do you have benchmarks that the multiply algorithm is actually faster for u128? Note that the 32-bit one does imul rcx, rax
shr rcx, 35 which is using a 64-bit multiply (two-argument form, with So it's possible that the |
I have no proof, but this is worth exploring. The specialized-div-rem shows that we could go much faster than the current div and rem operations. |
@scottmcm I’m fairly sure that it would need at most 3 multiplications and a few additions to calculate the value necessary for shifting. |
The fact that LLVM cannot reduce the division to multiplications by a constant might be caused by #44545, or it might be an independent issue. I extracted the small divisor path from my algorithms, and the compiler was able to reduce the divisions to multiplications by a constant.
I have checked for correctness with my fuzz tester. |
I investigated this recently for #76017 (comment). It is a bug in LLVM, and it isn’t like it is impossible for it to strength-reduce; it’s just that doing so: a) requires the upper half of the multiplication result (i.e. for 128-bit multiplication it requires the upper 128 bits of a 256-bit result); and I think I know an alternative way to resolve it, though, just haven’t had the time to get back to it.
Turns out it isn’t on 32-bit targets! |
I’ve drafted an LLVM differential to fix this https://reviews.llvm.org/D87976. |
In the meantime GCC had implemented it: pub fn reversed(mut n: u128) -> u128 { // In base 10.
let mut reversed = 0;
while n != 0 {
reversed = reversed * 10 + n % 10;
n /= 10;
}
reversed
} Rustc 1.52.0-nightly (152f660 2021-02-17): reversed:
push rbp
push r15
push r14
push r13
push r12
push rbx
sub rsp, 8
mov rax, rdi
or rax, rsi
je .LBB0_1
mov rbx, rsi
mov r14, rdi
xor edx, edx
mov r15d, 10
xor ecx, ecx
.LBB0_4:
mulx rax, r13, r15
lea rcx, [rcx + 4*rcx]
lea r12, [rax + 2*rcx]
mov edx, 10
mov rdi, r14
mov rsi, rbx
xor ecx, ecx
call qword ptr [rip + __udivti3@GOTPCREL]
mov rsi, rdx
mov rdx, rax
mulx rcx, rdi, r15
lea rdx, [rsi + 4*rsi]
lea rbp, [rcx + 2*rdx]
mov rdx, r14
sub rdx, rdi
mov rcx, rbx
sbb rcx, rbp
add rdx, r13
adc rcx, r12
cmp r14, 10
sbb rbx, 0
mov r14, rax
mov rbx, rsi
jae .LBB0_4
jmp .LBB0_2
.LBB0_1:
xor edx, edx
xor ecx, ecx
.LBB0_2:
mov rax, rdx
mov rdx, rcx
add rsp, 8
pop rbx
pop r12
pop r13
pop r14
pop r15
pop rbp
ret __uint128_t reversed(__uint128_t n) {
__uint128_t reversed = 0;
while (n != 0) {
reversed = reversed * 10 + n % 10;
n /= 10;
}
return reversed;
} GCC trunk 11.0.1 20210307 (experimental): reversed(unsigned __int128):
mov r8, rsi
mov r9, rdi
mov rsi, rdi
mov rax, r8
mov rdi, r8
or rax, r9
je .L6
push r15
xor eax, eax
xor edx, edx
mov r15d, 10
push r14
movabs r14, -3689348814741910324
push r13
xor r13d, r13d
push r12
movabs r12, -3689348814741910323
push rbp
push rbx
.L5:
imul rcx, rdx, 10
mul r15
mov r9, rdx
mov r8, rax
add r9, rcx
mov rcx, rsi
add rcx, rdi
adc rcx, 0
xor r11d, r11d
mov rax, rcx
mul r12
mov rax, rdx
and rdx, -4
shr rax, 2
add rdx, rax
mov rax, rsi
sub rcx, rdx
mov rdx, rdi
sub rax, rcx
mov r10, rcx
sbb rdx, r11
mov rbp, rdx
mov rdx, rax
imul rdx, r14
imul rbp, r12
add rbp, rdx
mul r12
mov rcx, rax
mov rbx, rdx
and eax, 1
mov edx, 5
mul rdx
add rbx, rbp
add rax, r10
adc rdx, r11
add rax, r8
mov r8, rdi
mov rdi, rbx
adc rdx, r9
mov r9, rsi
mov rsi, rcx
shr rdi
shrd rsi, rbx, 1
mov ebx, 9
cmp rbx, r9
mov rbx, r13
sbb rbx, r8
jc .L5
pop rbx
pop rbp
pop r12
pop r13
pop r14
pop r15
ret
.L6:
mov rax, r9
mov rdx, r8
ret |
Yes, the primary couple of concerns I've seen blocking the LLVM diff are:
and the regression on codegen quality in certain corner cases. The former can probably be resolved by some sort of a target property. The latter is harder, I suspect. |
Btw, clang is also able to optimize this for C (godbolt): #include "stdint.h"
uint64_t div10(uint64_t num) {
return num / 10;
}
unsigned __int128 div10(unsigned __int128 num) {
return num / 10;
} gives: div10(unsigned long): # @div10(unsigned long)
mov rax, rdi
movabs rcx, -3689348814741910323
mul rcx
mov rax, rdx
shr rax, 3
ret
div10(unsigned __int128): # @div10(unsigned __int128)
shrd rdi, rsi, 1
shr rsi
mov rcx, rdi
add rcx, rsi
adc rcx, 0
movabs r8, -3689348814741910323
mov rax, rcx
mul r8
shr rdx, 2
lea rax, [rdx + 4*rdx]
sub rcx, rax
sub rdi, rcx
sbb rsi, 0
movabs rcx, -3689348814741910324
mov rax, rdi
mul r8
imul rcx, rdi
add rdx, rcx
imul r8, rsi
add rdx, r8
ret Still an issue for Rust though. cc also #103126 where the size reductions of libcore thanks to that PR are probably due to this issue (see also this zulip discussion). |
[citation needed] This should be fixed with the LLVM 16 update in nightly, and as far as I can tell it is: https://rust.godbolt.org/z/E85djEnTY |
Now Rustc nightly is able to do it for u128 (but not for i128). |
Comparing the idiv variant under clang #include "stdint.h"
__int128 idiv10( __int128 num) {
return num / 10;
} and rustc: pub fn idiv10(a: i128) -> i128 {
a / 10
} Both emit almost exactly the same code under opts: example::idiv10:
push rax
mov edx, 10
xor ecx, ecx
call qword ptr [rip + __divti3@GOTPCREL] ; clang: call __divti3@PLT
pop rcx
ret |
@nikic my mistake, I compared stable rustc with clang trunk. Didn't check nightly rustc. I'm very happy that it's now optimized 😆 . |
This is an enhancement request. While u16, u32 and u64 numbers get divided by a small constant divisor using a fast algorithm, the same isn't true for u128 numbers:
Generated asm (with -O):
The faster algorithm is short enough and it could be added to rustc:
http://ridiculousfish.com/blog/posts/labor-of-division-episode-i.html
http://ridiculousfish.com/blog/posts/labor-of-division-episode-ii.html
http://libdivide.com/
http://ridiculousfish.com/blog/posts/labor-of-division-episode-iii.html
The text was updated successfully, but these errors were encountered: