-
Notifications
You must be signed in to change notification settings - Fork 13k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Calling Vec::extend repeatedly in a for loop is faster than calling it once on iter::flatten #79992
Comments
@rustbot label I-slow |
That is likely due to the inner iterators implementing I don't think these two lines do the right thing in your code. |
You are correct in that those lines don't do the right thing in the general case, however for the specific case of how i tend to structure my inputs for the advent of code puzzles it's exactly right. When I'm using this type I'm reading input from a file, typically one character per T, one row per line. The outer iterator is a &[u8].split.map, which pipes through the original text size in size_hint, which is the total input size. Since i'm doing one character per T this ends up being a very good slight overestimation of how much space I need (since it's counting newlines too). |
Assembly (at the time of writing)alloc::raw_vec::finish_grow: # @alloc::raw_vec::finish_grow
# %bb.0:
pushq %r15
pushq %r14
pushq %rbx
movq %rdx, %rbx
movq %rsi, %r15
movq %rdi, %r14
cmpq $0, 8(%rcx)
je .LBB0_3
# %bb.1:
movq 16(%rcx), %rsi
testq %rsi, %rsi
je .LBB0_3
# %bb.2:
movq (%rcx), %rdi
movq %r15, %rdx
movq %rbx, %rcx
callq *__rust_realloc@GOTPCREL(%rip)
jmp .LBB0_7
.LBB0_3:
testq %rbx, %rbx
je .LBB0_4
# %bb.6:
movq __rust_no_alloc_shim_is_unstable@GOTPCREL(%rip), %rax
movzbl (%rax), %eax
movq %rbx, %rdi
movq %r15, %rsi
callq *__rust_alloc@GOTPCREL(%rip)
jmp .LBB0_7
.LBB0_4:
movq %r15, %rax
.LBB0_7:
xorl %ecx, %ecx
testq %rax, %rax
cmovneq %rax, %r15
sete %cl
movq %r15, 8(%r14)
movq %rbx, 16(%r14)
movq %rcx, (%r14)
popq %rbx
popq %r14
popq %r15
retq
# -- End function
alloc::raw_vec::RawVec<T,A>::grow_one: # @"alloc::raw_vec::RawVec<T,A>::grow_one"
# %bb.0:
pushq %r14
pushq %rbx
subq $56, %rsp
movq (%rdi), %rax
cmpq $-1, %rax
je .LBB1_1
# %bb.2:
movq %rdi, %rbx
leaq 1(%rax), %rcx
leaq (%rax,%rax), %rdx
cmpq %rcx, %rdx
cmovaq %rdx, %rcx
cmpq $5, %rcx
movl $4, %r14d
cmovaeq %rcx, %r14
xorl %edi, %edi
shrq $61, %rcx
jne .LBB1_3
# %bb.4:
leaq (,%r14,8), %rdx
movabsq $9223372036854775800, %rcx # imm = 0x7FFFFFFFFFFFFFF8
# implicit-def: $rsi
cmpq %rcx, %rdx
ja .LBB1_10
# %bb.5:
testq %rax, %rax
je .LBB1_6
# %bb.7:
movq 8(%rbx), %rcx
shlq $3, %rax
movq %rcx, 32(%rsp)
movq %rax, 48(%rsp)
movl $8, %eax
jmp .LBB1_8
.LBB1_6:
xorl %eax, %eax
.LBB1_8:
movq %rax, 40(%rsp)
leaq 8(%rsp), %rdi
leaq 32(%rsp), %rcx
movl $8, %esi
callq alloc::raw_vec::finish_grow
cmpl $1, 8(%rsp)
je .LBB1_9
# %bb.11:
movq 16(%rsp), %rax
movq %rax, 8(%rbx)
movq %r14, (%rbx)
addq $56, %rsp
popq %rbx
popq %r14
retq
.LBB1_1:
xorl %edi, %edi
# implicit-def: $rsi
callq *alloc::raw_vec::handle_error@GOTPCREL(%rip)
.LBB1_9:
movq 16(%rsp), %rdi
movq 24(%rsp), %rsi
.LBB1_10:
callq *alloc::raw_vec::handle_error@GOTPCREL(%rip)
.LBB1_3:
# implicit-def: $rsi
callq *alloc::raw_vec::handle_error@GOTPCREL(%rip)
# -- End function
.LCPI2_0:
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 1 # 0x1
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.LCPI2_1:
.quad 2 # 0x2
.quad 2 # 0x2
.LCPI2_2:
.quad 4 # 0x4
.quad 4 # 0x4
playground::fp: # @playground::fp
# %bb.0:
pushq %rbp
pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %rbx
pushq %rax
movabsq $2305843009213693948, %rbp # imm = 0x1FFFFFFFFFFFFFFC
leaq (,%rdx,8), %r12
leaq 3(%rbp), %rax
xorl %r13d, %r13d
cmpq %rax, %rdx
ja .LBB2_14
# %bb.1:
movabsq $9223372036854775800, %rax # imm = 0x7FFFFFFFFFFFFFF8
cmpq %rax, %r12
ja .LBB2_14
# %bb.2:
movq %rdx, %rbx
movq %rsi, %r15
movq %rdi, %r14
testq %r12, %r12
je .LBB2_3
# %bb.4:
movq __rust_no_alloc_shim_is_unstable@GOTPCREL(%rip), %rax
movzbl (%rax), %eax
movl $8, %r13d
movl $8, %esi
movq %r12, %rdi
callq *__rust_alloc@GOTPCREL(%rip)
testq %rax, %rax
je .LBB2_14
# %bb.5:
movq %rbx, %rcx
testq %rbx, %rbx
jne .LBB2_7
jmp .LBB2_13
.LBB2_3:
movl $8, %eax
xorl %ecx, %ecx
testq %rbx, %rbx
je .LBB2_13
.LBB2_7:
cmpq $4, %rbx
jae .LBB2_9
# %bb.8:
xorl %ebp, %ebp
jmp .LBB2_12
.LBB2_9:
andq %rbx, %rbp
movdqa .LCPI2_0(%rip), %xmm0 # xmm0 = [0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0]
xorl %edx, %edx
movq %r15, %xmm1
pshufd $68, %xmm1, %xmm1 # xmm1 = xmm1[0,1,0,1]
movdqa .LCPI2_1(%rip), %xmm2 # xmm2 = [2,2]
movdqa .LCPI2_2(%rip), %xmm3 # xmm3 = [4,4]
.LBB2_10: # =>This Inner Loop Header: Depth=1
movdqa %xmm1, %xmm4
paddq %xmm0, %xmm4
movdqu %xmm4, (%rax,%rdx,8)
paddq %xmm2, %xmm4
movdqu %xmm4, 16(%rax,%rdx,8)
addq $4, %rdx
paddq %xmm3, %xmm0
cmpq %rdx, %rbp
jne .LBB2_10
# %bb.11:
cmpq %rbx, %rbp
je .LBB2_13
.LBB2_12: # =>This Inner Loop Header: Depth=1
leaq (%r15,%rbp), %rdx
movq %rdx, (%rax,%rbp,8)
incq %rbp
cmpq %rbp, %rbx
jne .LBB2_12
.LBB2_13:
movq %rcx, (%r14)
movq %rax, 8(%r14)
movq %rbx, 16(%r14)
movq %r14, %rax
addq $8, %rsp
popq %rbx
popq %r12
popq %r13
popq %r14
popq %r15
popq %rbp
retq
.LBB2_14:
movq %r13, %rdi
movq %r12, %rsi
callq *alloc::raw_vec::handle_error@GOTPCREL(%rip)
# -- End function
playground::proc: # @playground::proc
# %bb.0:
pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %rbx
subq $32, %rsp
leaq (,%rdx,8), %r12
xorl %r13d, %r13d
movq %rdx, %rax
shrq $61, %rax
jne .LBB3_16
# %bb.1:
movabsq $9223372036854775800, %rax # imm = 0x7FFFFFFFFFFFFFF8
cmpq %rax, %r12
ja .LBB3_16
# %bb.2:
movq %rdx, %r14
movq %rsi, %r15
movq %rdi, %rbx
testq %r12, %r12
je .LBB3_3
# %bb.4:
movq __rust_no_alloc_shim_is_unstable@GOTPCREL(%rip), %rax
movzbl (%rax), %eax
movl $8, %r13d
movl $8, %esi
movq %r12, %rdi
callq *__rust_alloc@GOTPCREL(%rip)
testq %rax, %rax
je .LBB3_16
# %bb.5:
movq %r14, %rcx
jmp .LBB3_6
.LBB3_3:
movl $8, %eax
xorl %ecx, %ecx
.LBB3_6:
movq %rcx, 8(%rsp)
movq %rax, 16(%rsp)
movq $0, 24(%rsp)
testq %r14, %r14
je .LBB3_12
# %bb.7:
xorl %r13d, %r13d
leaq 8(%rsp), %r12
jmp .LBB3_8
.LBB3_11: # in Loop: Header=BB3_8 Depth=1
leaq (%r15,%r13), %rcx
movq %rcx, (%rax,%r13,8)
incq %r13
movq %r13, 24(%rsp)
cmpq %r13, %r14
je .LBB3_12
.LBB3_8: # =>This Inner Loop Header: Depth=1
cmpq 8(%rsp), %r13
jne .LBB3_11
# %bb.9: # in Loop: Header=BB3_8 Depth=1
movq %r12, %rdi
callq alloc::raw_vec::RawVec<T,A>::grow_one
# %bb.10: # in Loop: Header=BB3_8 Depth=1
movq 16(%rsp), %rax
jmp .LBB3_11
.LBB3_12:
movq 24(%rsp), %rax
movq %rax, 16(%rbx)
movq 8(%rsp), %rax
movq %rax, (%rbx)
movq 16(%rsp), %rax
movq %rax, 8(%rbx)
movq %rbx, %rax
addq $32, %rsp
popq %rbx
popq %r12
popq %r13
popq %r14
popq %r15
retq
.LBB3_16:
movq %r13, %rdi
movq %r12, %rsi
callq *alloc::raw_vec::handle_error@GOTPCREL(%rip)
movq %rax, %rbx
movq 8(%rsp), %rsi
testq %rsi, %rsi
je .LBB3_15
# %bb.14:
movq 16(%rsp), %rdi
shlq $3, %rsi
movl $8, %edx
callq *__rust_dealloc@GOTPCREL(%rip)
.LBB3_15:
movq %rbx, %rdi
callq _Unwind_Resume@PLT
# -- End function
|
See smmalis37/aoc2020@3fbf5d9#diff-6965df871359c91ee03b5564314f3e131c0877b8603aea626b5d8f9180eb4ee8R28-L29
The highlighted change, when measured in isolation from the other changes in that commit, resulted in a roughly 30-40% speedup of the entire function (i.e. the entire chain of iterators ending in the collect call that calls this FromIter implementation). If measured in isolation I assume the impact would be even greater. I expected the two to be equivalent.
The code being measured is day 11's parser: https://github.com/smmalis37/aoc2020/blob/3fbf5d9b9e89d18c213c0d9df6a08db9364b2604/src/days/day11.rs#L22. To replicate my results just clone that repo and run
cargo run --release 11
and look at the measured time of the parser (the rest of day 11 will be measured too but it isn't calling this code).The text was updated successfully, but these errors were encountered: