-
Notifications
You must be signed in to change notification settings - Fork 12.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Noop-loop is no longer optimized away on Nightly #121239
Comments
I was able to "fix" the nightly codegen to be identical to beta by adding |
cc @nikic |
WG-prioritization assigning priority (Zulip discussion). @rustbot label -I-prioritize +P-medium |
Odd, changing
to
fixes it. But |
I think the the direct cause here is a correctness fix in LLVM 18 which ends up stripping more poison flags during certain loop transformations, and ends up (likely unnecessarily) dropping The reason why offset_from helps is that it allows making use of the But this is not actually required (https://alive2.llvm.org/ce/z/gbbYmE). It works just as well with unsigned exact division. I'll see about fixing that. |
Upstream patch: llvm/llvm-project#82334 |
Now this is just rude. Apparently LoopIdiomRecognize strips the flags -- without actually performing any transform. I think LIR does some speculative expansion and then drops instructions again, but this doesn't recover flag changes. |
Upstream issue for LIR: llvm/llvm-project#82337 |
Update to LLVM 18.1.0 rc 3 Fixes rust-lang#120819. Fixes rust-lang#121180. Fixes rust-lang#121239. Fixes rust-lang#121367.
Update to LLVM 18.1.0 rc 3 Fixes rust-lang#120819. Fixes rust-lang#121180. Fixes rust-lang#121239. Fixes rust-lang#121367.
Update to LLVM 18.1.0 rc 3 Fixes rust-lang#120819. Fixes rust-lang#121180. Fixes rust-lang#121239. Fixes rust-lang#121367.
Update to LLVM 18.1.0 rc 3 Fixes rust-lang#120819. Fixes rust-lang#121180. Fixes rust-lang#121239. Fixes rust-lang#121367.
Update to LLVM 18.1.0 rc 3 Fixes rust-lang#120819. Fixes rust-lang#121180. Fixes rust-lang#121239. Fixes rust-lang#121367.
Update to LLVM 18.1.0 rc 3 Fixes rust-lang#120819. Fixes rust-lang#121180. Fixes rust-lang#121239. Fixes rust-lang#121367.
Looks like my reduction was insufficient. The example on godbolt does optimize now but the original code that I was working on still fails with the I'll extract the IR from a codegen test. |
unoptimized IR, using sub_ptr: sub_ptr.ll.txt optimized IR, using sub_ptr, fails to eliminate loop; ModuleID = 'vec_in_place2.ce5efceb0ce65a83-cgu.0'
source_filename = "vec_in_place2.ce5efceb0ce65a83-cgu.0"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
%"alloc::vec::Vec<Foo>" = type { %"alloc::raw_vec::RawVec<Foo>", i64 }
%"alloc::raw_vec::RawVec<Foo>" = type { i64, ptr, %"alloc::alloc::Global" }
%"alloc::alloc::Global" = type {}
%Foo = type { i64, i64, i64, i64 }
; Function Attrs: nofree norecurse nosync nounwind nonlazybind uwtable
define void @vec_iterator_cast_aggregate(ptr noalias nocapture noundef writeonly sret(%"alloc::vec::Vec<Foo>") align 8 dereferenceable(24) %_0, ptr noalias nocapture noundef readonly align 8 dereferenceable(24) %vec) unnamed_addr #0 personality ptr @rust_eh_personality {
start:
%me.sroa.0.0.copyload.i = load i64, ptr %vec, align 8, !alias.scope !3, !noalias !6
%me.sroa.4.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %vec, i64 8
%me.sroa.4.0.copyload.i = load ptr, ptr %me.sroa.4.0.self.sroa_idx.i, align 8, !alias.scope !3, !noalias !6, !nonnull !8, !noundef !8
%me.sroa.5.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %vec, i64 16
%me.sroa.5.0.copyload.i = load i64, ptr %me.sroa.5.0.self.sroa_idx.i, align 8, !alias.scope !3, !noalias !6
%_19.i = getelementptr inbounds [4 x i64], ptr %me.sroa.4.0.copyload.i, i64 %me.sroa.5.0.copyload.i
tail call void @llvm.experimental.noalias.scope.decl(metadata !9)
%_19.i.idx = and i64 %me.sroa.5.0.copyload.i, 576460752303423487
%0 = icmp eq i64 %me.sroa.5.0.copyload.i, 0
br i1 %0, label %"_ZN5alloc3vec16in_place_collect108_$LT$impl$u20$alloc..vec..spec_from_iter..SpecFromIter$LT$T$C$I$GT$$u20$for$u20$alloc..vec..Vec$LT$T$GT$$GT$9from_iter17h3c7c786b418efdf9E.exit", label %bb4.i.i.preheader
bb4.i.i.preheader: ; preds = %start
%xtraiter = and i64 %me.sroa.5.0.copyload.i, 1
%1 = icmp eq i64 %_19.i.idx, 1
br i1 %1, label %"_ZN5alloc3vec16in_place_collect108_$LT$impl$u20$alloc..vec..spec_from_iter..SpecFromIter$LT$T$C$I$GT$$u20$for$u20$alloc..vec..Vec$LT$T$GT$$GT$9from_iter17h3c7c786b418efdf9E.exit.loopexit.unr-lcssa", label %bb4.i.i.preheader.new
bb4.i.i.preheader.new: ; preds = %bb4.i.i.preheader
%unroll_iter = sub nsw i64 %_19.i.idx, %xtraiter
br label %bb4.i.i
bb4.i.i: ; preds = %bb4.i.i, %bb4.i.i.preheader.new
%state.sroa.5.0.i.i = phi i64 [ 0, %bb4.i.i.preheader.new ], [ %_10.i.i.1, %bb4.i.i ]
%niter = phi i64 [ 0, %bb4.i.i.preheader.new ], [ %niter.next.1, %bb4.i.i ]
%_10.i.i = or disjoint i64 %state.sroa.5.0.i.i, 1
%_14.neg.i.i = sub i64 %state.sroa.5.0.i.i, %_19.i.idx
%_18.i.i.i.i.i = getelementptr inbounds [4 x i64], ptr %_19.i, i64 %_14.neg.i.i
%2 = load <4 x i64>, ptr %_18.i.i.i.i.i, align 8, !noalias !12
%dst.i.i = getelementptr inbounds %Foo, ptr %me.sroa.4.0.copyload.i, i64 %state.sroa.5.0.i.i
store <4 x i64> %2, ptr %dst.i.i, align 8, !noalias !25
%_10.i.i.1 = add nuw i64 %state.sroa.5.0.i.i, 2
%_14.neg.i.i.1 = sub i64 %_10.i.i, %_19.i.idx
%_18.i.i.i.i.i.1 = getelementptr inbounds [4 x i64], ptr %_19.i, i64 %_14.neg.i.i.1
%3 = load <4 x i64>, ptr %_18.i.i.i.i.i.1, align 8, !noalias !12
%dst.i.i.1 = getelementptr inbounds %Foo, ptr %me.sroa.4.0.copyload.i, i64 %_10.i.i
store <4 x i64> %3, ptr %dst.i.i.1, align 8, !noalias !25
%niter.next.1 = add i64 %niter, 2
%niter.ncmp.1 = icmp eq i64 %niter.next.1, %unroll_iter
br i1 %niter.ncmp.1, label %"_ZN5alloc3vec16in_place_collect108_$LT$impl$u20$alloc..vec..spec_from_iter..SpecFromIter$LT$T$C$I$GT$$u20$for$u20$alloc..vec..Vec$LT$T$GT$$GT$9from_iter17h3c7c786b418efdf9E.exit.loopexit.unr-lcssa", label %bb4.i.i
"_ZN5alloc3vec16in_place_collect108_$LT$impl$u20$alloc..vec..spec_from_iter..SpecFromIter$LT$T$C$I$GT$$u20$for$u20$alloc..vec..Vec$LT$T$GT$$GT$9from_iter17h3c7c786b418efdf9E.exit.loopexit.unr-lcssa": ; preds = %bb4.i.i, %bb4.i.i.preheader
%state.sroa.5.0.i.i.unr = phi i64 [ 0, %bb4.i.i.preheader ], [ %_10.i.i.1, %bb4.i.i ]
%lcmp.mod.not = icmp eq i64 %xtraiter, 0
br i1 %lcmp.mod.not, label %"_ZN5alloc3vec16in_place_collect108_$LT$impl$u20$alloc..vec..spec_from_iter..SpecFromIter$LT$T$C$I$GT$$u20$for$u20$alloc..vec..Vec$LT$T$GT$$GT$9from_iter17h3c7c786b418efdf9E.exit", label %bb4.i.i.epil
bb4.i.i.epil: ; preds = %"_ZN5alloc3vec16in_place_collect108_$LT$impl$u20$alloc..vec..spec_from_iter..SpecFromIter$LT$T$C$I$GT$$u20$for$u20$alloc..vec..Vec$LT$T$GT$$GT$9from_iter17h3c7c786b418efdf9E.exit.loopexit.unr-lcssa"
%_14.neg.i.i.epil = sub i64 %state.sroa.5.0.i.i.unr, %_19.i.idx
%_18.i.i.i.i.i.epil = getelementptr inbounds [4 x i64], ptr %_19.i, i64 %_14.neg.i.i.epil
%4 = load <4 x i64>, ptr %_18.i.i.i.i.i.epil, align 8, !noalias !12
%dst.i.i.epil = getelementptr inbounds %Foo, ptr %me.sroa.4.0.copyload.i, i64 %state.sroa.5.0.i.i.unr
store <4 x i64> %4, ptr %dst.i.i.epil, align 8, !noalias !25
br label %"_ZN5alloc3vec16in_place_collect108_$LT$impl$u20$alloc..vec..spec_from_iter..SpecFromIter$LT$T$C$I$GT$$u20$for$u20$alloc..vec..Vec$LT$T$GT$$GT$9from_iter17h3c7c786b418efdf9E.exit"
"_ZN5alloc3vec16in_place_collect108_$LT$impl$u20$alloc..vec..spec_from_iter..SpecFromIter$LT$T$C$I$GT$$u20$for$u20$alloc..vec..Vec$LT$T$GT$$GT$9from_iter17h3c7c786b418efdf9E.exit": ; preds = %bb4.i.i.epil, %"_ZN5alloc3vec16in_place_collect108_$LT$impl$u20$alloc..vec..spec_from_iter..SpecFromIter$LT$T$C$I$GT$$u20$for$u20$alloc..vec..Vec$LT$T$GT$$GT$9from_iter17h3c7c786b418efdf9E.exit.loopexit.unr-lcssa", %start
%_18.i = and i64 %me.sroa.0.0.copyload.i, 576460752303423487
store i64 %_18.i, ptr %_0, align 8, !alias.scope !9, !noalias !26
%vec.sroa.4.0._0.sroa_idx.i = getelementptr inbounds i8, ptr %_0, i64 8
store ptr %me.sroa.4.0.copyload.i, ptr %vec.sroa.4.0._0.sroa_idx.i, align 8, !alias.scope !9, !noalias !26
%vec.sroa.5.0._0.sroa_idx.i = getelementptr inbounds i8, ptr %_0, i64 16
store i64 %_19.i.idx, ptr %vec.sroa.5.0._0.sroa_idx.i, align 8, !alias.scope !9, !noalias !26
ret void
}
; Function Attrs: nonlazybind uwtable
declare noundef i32 @rust_eh_personality(i32 noundef, i32 noundef, i64 noundef, ptr noundef, ptr noundef) unnamed_addr #1
; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite)
declare void @llvm.experimental.noalias.scope.decl(metadata) #2
attributes #0 = { nofree norecurse nosync nounwind nonlazybind uwtable "probe-stack"="inline-asm" "target-cpu"="x86-64" }
attributes #1 = { nonlazybind uwtable "probe-stack"="inline-asm" "target-cpu"="x86-64" }
attributes #2 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) }
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}
!0 = !{i32 8, !"PIC Level", i32 2}
!1 = !{i32 2, !"RtLibUseGOT", i32 1}
!2 = !{!"rustc version 1.78.0-dev"}
!3 = !{!4}
!4 = distinct !{!4, !5, !"_ZN90_$LT$alloc..vec..Vec$LT$T$C$A$GT$$u20$as$u20$core..iter..traits..collect..IntoIterator$GT$9into_iter17he352ce3d58cff200E: %self"}
!5 = distinct !{!5, !"_ZN90_$LT$alloc..vec..Vec$LT$T$C$A$GT$$u20$as$u20$core..iter..traits..collect..IntoIterator$GT$9into_iter17he352ce3d58cff200E"}
!6 = !{!7}
!7 = distinct !{!7, !5, !"_ZN90_$LT$alloc..vec..Vec$LT$T$C$A$GT$$u20$as$u20$core..iter..traits..collect..IntoIterator$GT$9into_iter17he352ce3d58cff200E: %_0"}
!8 = !{}
!9 = !{!10}
!10 = distinct !{!10, !11, !"_ZN5alloc3vec16in_place_collect108_$LT$impl$u20$alloc..vec..spec_from_iter..SpecFromIter$LT$T$C$I$GT$$u20$for$u20$alloc..vec..Vec$LT$T$GT$$GT$9from_iter17h3c7c786b418efdf9E: %_0"}
!11 = distinct !{!11, !"_ZN5alloc3vec16in_place_collect108_$LT$impl$u20$alloc..vec..spec_from_iter..SpecFromIter$LT$T$C$I$GT$$u20$for$u20$alloc..vec..Vec$LT$T$GT$$GT$9from_iter17h3c7c786b418efdf9E"}
!12 = !{!13, !15, !16, !18, !19, !21, !22, !10, !24}
!13 = distinct !{!13, !14, !"_ZN103_$LT$alloc..vec..into_iter..IntoIter$LT$T$C$A$GT$$u20$as$u20$core..iter..traits..iterator..Iterator$GT$24index_from_end_unchecked17h43b8872b965e6d69E: %_0"}
!14 = distinct !{!14, !"_ZN103_$LT$alloc..vec..into_iter..IntoIter$LT$T$C$A$GT$$u20$as$u20$core..iter..traits..iterator..Iterator$GT$24index_from_end_unchecked17h43b8872b965e6d69E"}
!15 = distinct !{!15, !14, !"_ZN103_$LT$alloc..vec..into_iter..IntoIter$LT$T$C$A$GT$$u20$as$u20$core..iter..traits..iterator..Iterator$GT$24index_from_end_unchecked17h43b8872b965e6d69E: %self"}
!16 = distinct !{!16, !17, !"_ZN79_$LT$I$u20$as$u20$core..iter..traits..unchecked_iterator..SpecIndexedAccess$GT$30index_from_end_unchecked_inner17h727d6efa915e51e5E: %_0"}
!17 = distinct !{!17, !"_ZN79_$LT$I$u20$as$u20$core..iter..traits..unchecked_iterator..SpecIndexedAccess$GT$30index_from_end_unchecked_inner17h727d6efa915e51e5E"}
!18 = distinct !{!18, !17, !"_ZN79_$LT$I$u20$as$u20$core..iter..traits..unchecked_iterator..SpecIndexedAccess$GT$30index_from_end_unchecked_inner17h727d6efa915e51e5E: %self"}
!19 = distinct !{!19, !20, !"_ZN102_$LT$core..iter..adapters..map..Map$LT$I$C$F$GT$$u20$as$u20$core..iter..traits..iterator..Iterator$GT$24index_from_end_unchecked17hfc88e8970d8e3af0E: %_0"}
!20 = distinct !{!20, !"_ZN102_$LT$core..iter..adapters..map..Map$LT$I$C$F$GT$$u20$as$u20$core..iter..traits..iterator..Iterator$GT$24index_from_end_unchecked17hfc88e8970d8e3af0E"}
!21 = distinct !{!21, !20, !"_ZN102_$LT$core..iter..adapters..map..Map$LT$I$C$F$GT$$u20$as$u20$core..iter..traits..iterator..Iterator$GT$24index_from_end_unchecked17hfc88e8970d8e3af0E: %self"}
!22 = distinct !{!22, !23, !"_ZN83_$LT$I$u20$as$u20$alloc..vec..in_place_collect..SpecInPlaceCollect$LT$T$C$I$GT$$GT$16collect_in_place17h56b8fd95ce366a3bE: argument 0"}
!23 = distinct !{!23, !"_ZN83_$LT$I$u20$as$u20$alloc..vec..in_place_collect..SpecInPlaceCollect$LT$T$C$I$GT$$GT$16collect_in_place17h56b8fd95ce366a3bE"}
!24 = distinct !{!24, !11, !"_ZN5alloc3vec16in_place_collect108_$LT$impl$u20$alloc..vec..spec_from_iter..SpecFromIter$LT$T$C$I$GT$$u20$for$u20$alloc..vec..Vec$LT$T$GT$$GT$9from_iter17h3c7c786b418efdf9E: %iterator"}
!25 = !{!22, !10, !24}
!26 = !{!24} optimized IR, using offset_from, eliminates the loop
|
@the8472 Can you please provide a Rust test case or unoptimized IR? |
The text file above should contain unoptimized IR. |
Oh sorry, totally missed the first line... |
This optimizes with LLVM 19, presumably due to the fix for llvm/llvm-project#82337. But this is too risky to backport. |
LLVM does not know that the multiplication never overflows, which causes it to generate unnecessary instructions. Use `usize::unchecked_mul`, so that it can fold the `dst_cap` calculation when `size_of::<I::SRC>() == size_of::<T>()`. Running: ``` rustc -C llvm-args=-x86-asm-syntax=intel -O src/lib.rs --emit asm` ``` ```rust pub struct Foo([usize; 3]); pub fn unwrap_copy(v: Vec<Foo>) -> Vec<[usize; 3]> { v.into_iter().map(|f| f.0).collect() } ``` Before this commit: ``` define void @unwrap_copy(ptr noalias nocapture noundef writeonly sret([24 x i8]) align 8 dereferenceable(24) %_0, ptr noalias nocapture noundef readonly align 8 dereferenceable(24) %iter) { start: %me.sroa.0.0.copyload.i = load i64, ptr %iter, align 8 %me.sroa.4.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 8 %me.sroa.4.0.copyload.i = load ptr, ptr %me.sroa.4.0.self.sroa_idx.i, align 8 %me.sroa.5.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 16 %me.sroa.5.0.copyload.i = load i64, ptr %me.sroa.5.0.self.sroa_idx.i, align 8 %_19.i.idx = mul nsw i64 %me.sroa.5.0.copyload.i, 24 %0 = udiv i64 %_19.i.idx, 24 %_16.i.i = mul i64 %me.sroa.0.0.copyload.i, 24 %dst_cap.i.i = udiv i64 %_16.i.i, 24 store i64 %dst_cap.i.i, ptr %_0, align 8 %1 = getelementptr inbounds i8, ptr %_0, i64 8 store ptr %me.sroa.4.0.copyload.i, ptr %1, align 8 %2 = getelementptr inbounds i8, ptr %_0, i64 16 store i64 %0, ptr %2, align 8 ret void } ``` After: ``` define void @unwrap_copy(ptr noalias nocapture noundef writeonly sret([24 x i8]) align 8 dereferenceable(24) %_0, ptr noalias nocapture noundef readonly align 8 dereferenceable(24) %iter) { start: %me.sroa.0.0.copyload.i = load i64, ptr %iter, align 8 %me.sroa.4.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 8 %me.sroa.4.0.copyload.i = load ptr, ptr %me.sroa.4.0.self.sroa_idx.i, align 8 %me.sroa.5.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 16 %me.sroa.5.0.copyload.i = load i64, ptr %me.sroa.5.0.self.sroa_idx.i, align 8 %_19.i.idx = mul nsw i64 %me.sroa.5.0.copyload.i, 24 %0 = udiv i64 %_19.i.idx, 24 store i64 %me.sroa.0.0.copyload.i, ptr %_0, align 8 %1 = getelementptr inbounds i8, ptr %_0, i64 8 store ptr %me.sroa.4.0.copyload.i, ptr %1, align 8 %2 = getelementptr inbounds i8, ptr %_0, i64 16 store i64 %0, ptr %2, align 8, !alias.scope !9, !noalias !14 ret void } ``` Note that there is still one more `mul,udiv` pair that I couldn't get rid of. The root cause is the same issue as rust-lang#121239, the `nuw` gets stripped off of `ptr::sub_ptr`.
optimize inplace collection of Vec This PR has the following changes: 1. Using `usize::unchecked_mul` in https://github.com/rust-lang/rust/blob/79424056b05eaa9563d16dfab9b9a0c8f033f220/library/alloc/src/vec/in_place_collect.rs#L262 as LLVM, does not know that the operation can't wrap, since that's the size of the original allocation. Given the following: ```rust pub struct Foo([usize; 3]); pub fn unwrap_copy(v: Vec<Foo>) -> Vec<[usize; 3]> { v.into_iter().map(|f| f.0).collect() } ``` <details> <summary>Before this commit:</summary> ```llvm define void `@unwrap_copy(ptr` noalias nocapture noundef writeonly sret([24 x i8]) align 8 dereferenceable(24) %_0, ptr noalias nocapture noundef readonly align 8 dereferenceable(24) %iter) { start: %me.sroa.0.0.copyload.i = load i64, ptr %iter, align 8 %me.sroa.4.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 8 %me.sroa.4.0.copyload.i = load ptr, ptr %me.sroa.4.0.self.sroa_idx.i, align 8 %me.sroa.5.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 16 %me.sroa.5.0.copyload.i = load i64, ptr %me.sroa.5.0.self.sroa_idx.i, align 8 %_19.i.idx = mul nsw i64 %me.sroa.5.0.copyload.i, 24 %0 = udiv i64 %_19.i.idx, 24 ; Unnecessary calculation %_16.i.i = mul i64 %me.sroa.0.0.copyload.i, 24 %dst_cap.i.i = udiv i64 %_16.i.i, 24 store i64 %dst_cap.i.i, ptr %_0, align 8 %1 = getelementptr inbounds i8, ptr %_0, i64 8 store ptr %me.sroa.4.0.copyload.i, ptr %1, align 8 %2 = getelementptr inbounds i8, ptr %_0, i64 16 store i64 %0, ptr %2, align 8 ret void } ``` </details> <details> <summary>After:</summary> ```llvm define void `@unwrap_copy(ptr` noalias nocapture noundef writeonly sret([24 x i8]) align 8 dereferenceable(24) %_0, ptr noalias nocapture noundef readonly align 8 dereferenceable(24) %iter) { start: %me.sroa.0.0.copyload.i = load i64, ptr %iter, align 8 %me.sroa.4.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 8 %me.sroa.4.0.copyload.i = load ptr, ptr %me.sroa.4.0.self.sroa_idx.i, align 8 %me.sroa.5.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 16 %me.sroa.5.0.copyload.i = load i64, ptr %me.sroa.5.0.self.sroa_idx.i, align 8 %_19.i.idx = mul nsw i64 %me.sroa.5.0.copyload.i, 24 %0 = udiv i64 %_19.i.idx, 24 store i64 %me.sroa.0.0.copyload.i, ptr %_0, align 8 %1 = getelementptr inbounds i8, ptr %_0, i64 8 store ptr %me.sroa.4.0.copyload.i, ptr %1, align 8 %2 = getelementptr inbounds i8, ptr %_0, i64 16 store i64 %0, ptr %2, align 8, !alias.scope !9, !noalias !14 ret void } ``` </details> Note that there is still one more `mul,udiv` pair that I couldn't get rid of. The root cause is the same issue as rust-lang#121239, the `nuw` gets stripped off of `ptr::sub_ptr`. 2. `Iterator::try_fold` gets called on the underlying Iterator in `SpecInPlaceCollect::collect_in_place` whenever it does not implement `TrustedRandomAccess`. For types that impl `Drop`, LLVM currently can't tell that the drop can never occur, when using the default `Iterator::try_fold` implementation. For example, given the following code from rust-lang#120493 ```rust #[repr(transparent)] struct WrappedClone { inner: String } #[no_mangle] pub fn unwrap_clone(list: Vec<WrappedClone>) -> Vec<String> { list.into_iter().map(|s| s.inner).collect() } ``` <details> <summary>The asm for the `unwrap_clone` method is currently:</summary> ```asm unwrap_clone: push rbp push r15 push r14 push r13 push r12 push rbx push rax mov rbx, rdi mov r12, qword ptr [rsi] mov rdi, qword ptr [rsi + 8] mov rax, qword ptr [rsi + 16] movabs rsi, -6148914691236517205 mov r14, r12 test rax, rax je .LBB0_10 lea rcx, [rax + 2*rax] lea r14, [r12 + 8*rcx] shl rax, 3 lea rax, [rax + 2*rax] xor ecx, ecx .LBB0_2: cmp qword ptr [r12 + rcx], 0 je .LBB0_4 add rcx, 24 cmp rax, rcx jne .LBB0_2 jmp .LBB0_10 .LBB0_4: lea rdx, [rax - 24] lea r14, [r12 + rcx] cmp rdx, rcx je .LBB0_10 mov qword ptr [rsp], rdi sub rax, rcx add rax, -24 mul rsi mov r15, rdx lea rbp, [r12 + rcx] add rbp, 32 shr r15, 4 mov r13, qword ptr [rip + __rust_dealloc@GOTPCREL] jmp .LBB0_6 .LBB0_8: add rbp, 24 dec r15 je .LBB0_9 .LBB0_6: mov rsi, qword ptr [rbp] test rsi, rsi je .LBB0_8 mov rdi, qword ptr [rbp - 8] mov edx, 1 call r13 jmp .LBB0_8 .LBB0_9: mov rdi, qword ptr [rsp] movabs rsi, -6148914691236517205 .LBB0_10: sub r14, r12 mov rax, r14 mul rsi shr rdx, 4 mov qword ptr [rbx], r12 mov qword ptr [rbx + 8], rdi mov qword ptr [rbx + 16], rdx mov rax, rbx add rsp, 8 pop rbx pop r12 pop r13 pop r14 pop r15 pop rbp ret ``` </details> <details> <summary>After this PR:</summary> ```asm unwrap_clone: mov rax, rdi movups xmm0, xmmword ptr [rsi] mov rcx, qword ptr [rsi + 16] movups xmmword ptr [rdi], xmm0 mov qword ptr [rdi + 16], rcx ret ``` </details> Fixes rust-lang#120493
LLVM does not know that the multiplication never overflows, which causes it to generate unnecessary instructions. Use `usize::unchecked_mul`, so that it can fold the `dst_cap` calculation when `size_of::<I::SRC>() == size_of::<T>()`. Running: ``` rustc -C llvm-args=-x86-asm-syntax=intel -O src/lib.rs --emit asm` ``` ```rust pub struct Foo([usize; 3]); pub fn unwrap_copy(v: Vec<Foo>) -> Vec<[usize; 3]> { v.into_iter().map(|f| f.0).collect() } ``` Before this commit: ``` define void @unwrap_copy(ptr noalias nocapture noundef writeonly sret([24 x i8]) align 8 dereferenceable(24) %_0, ptr noalias nocapture noundef readonly align 8 dereferenceable(24) %iter) { start: %me.sroa.0.0.copyload.i = load i64, ptr %iter, align 8 %me.sroa.4.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 8 %me.sroa.4.0.copyload.i = load ptr, ptr %me.sroa.4.0.self.sroa_idx.i, align 8 %me.sroa.5.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 16 %me.sroa.5.0.copyload.i = load i64, ptr %me.sroa.5.0.self.sroa_idx.i, align 8 %_19.i.idx = mul nsw i64 %me.sroa.5.0.copyload.i, 24 %0 = udiv i64 %_19.i.idx, 24 %_16.i.i = mul i64 %me.sroa.0.0.copyload.i, 24 %dst_cap.i.i = udiv i64 %_16.i.i, 24 store i64 %dst_cap.i.i, ptr %_0, align 8 %1 = getelementptr inbounds i8, ptr %_0, i64 8 store ptr %me.sroa.4.0.copyload.i, ptr %1, align 8 %2 = getelementptr inbounds i8, ptr %_0, i64 16 store i64 %0, ptr %2, align 8 ret void } ``` After: ``` define void @unwrap_copy(ptr noalias nocapture noundef writeonly sret([24 x i8]) align 8 dereferenceable(24) %_0, ptr noalias nocapture noundef readonly align 8 dereferenceable(24) %iter) { start: %me.sroa.0.0.copyload.i = load i64, ptr %iter, align 8 %me.sroa.4.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 8 %me.sroa.4.0.copyload.i = load ptr, ptr %me.sroa.4.0.self.sroa_idx.i, align 8 %me.sroa.5.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 16 %me.sroa.5.0.copyload.i = load i64, ptr %me.sroa.5.0.self.sroa_idx.i, align 8 %_19.i.idx = mul nsw i64 %me.sroa.5.0.copyload.i, 24 %0 = udiv i64 %_19.i.idx, 24 store i64 %me.sroa.0.0.copyload.i, ptr %_0, align 8 %1 = getelementptr inbounds i8, ptr %_0, i64 8 store ptr %me.sroa.4.0.copyload.i, ptr %1, align 8 %2 = getelementptr inbounds i8, ptr %_0, i64 16 store i64 %0, ptr %2, align 8, !alias.scope !9, !noalias !14 ret void } ``` Note that there is still one more `mul,udiv` pair that I couldn't get rid of. The root cause is the same issue as rust-lang#121239, the `nuw` gets stripped off of `ptr::sub_ptr`.
optimize inplace collection of Vec This PR has the following changes: 1. Using `usize::unchecked_mul` in https://github.com/rust-lang/rust/blob/79424056b05eaa9563d16dfab9b9a0c8f033f220/library/alloc/src/vec/in_place_collect.rs#L262 as LLVM, does not know that the operation can't wrap, since that's the size of the original allocation. Given the following: ```rust pub struct Foo([usize; 3]); pub fn unwrap_copy(v: Vec<Foo>) -> Vec<[usize; 3]> { v.into_iter().map(|f| f.0).collect() } ``` <details> <summary>Before this commit:</summary> ```llvm define void `@unwrap_copy(ptr` noalias nocapture noundef writeonly sret([24 x i8]) align 8 dereferenceable(24) %_0, ptr noalias nocapture noundef readonly align 8 dereferenceable(24) %iter) { start: %me.sroa.0.0.copyload.i = load i64, ptr %iter, align 8 %me.sroa.4.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 8 %me.sroa.4.0.copyload.i = load ptr, ptr %me.sroa.4.0.self.sroa_idx.i, align 8 %me.sroa.5.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 16 %me.sroa.5.0.copyload.i = load i64, ptr %me.sroa.5.0.self.sroa_idx.i, align 8 %_19.i.idx = mul nsw i64 %me.sroa.5.0.copyload.i, 24 %0 = udiv i64 %_19.i.idx, 24 ; Unnecessary calculation %_16.i.i = mul i64 %me.sroa.0.0.copyload.i, 24 %dst_cap.i.i = udiv i64 %_16.i.i, 24 store i64 %dst_cap.i.i, ptr %_0, align 8 %1 = getelementptr inbounds i8, ptr %_0, i64 8 store ptr %me.sroa.4.0.copyload.i, ptr %1, align 8 %2 = getelementptr inbounds i8, ptr %_0, i64 16 store i64 %0, ptr %2, align 8 ret void } ``` </details> <details> <summary>After:</summary> ```llvm define void `@unwrap_copy(ptr` noalias nocapture noundef writeonly sret([24 x i8]) align 8 dereferenceable(24) %_0, ptr noalias nocapture noundef readonly align 8 dereferenceable(24) %iter) { start: %me.sroa.0.0.copyload.i = load i64, ptr %iter, align 8 %me.sroa.4.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 8 %me.sroa.4.0.copyload.i = load ptr, ptr %me.sroa.4.0.self.sroa_idx.i, align 8 %me.sroa.5.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 16 %me.sroa.5.0.copyload.i = load i64, ptr %me.sroa.5.0.self.sroa_idx.i, align 8 %_19.i.idx = mul nsw i64 %me.sroa.5.0.copyload.i, 24 %0 = udiv i64 %_19.i.idx, 24 store i64 %me.sroa.0.0.copyload.i, ptr %_0, align 8 %1 = getelementptr inbounds i8, ptr %_0, i64 8 store ptr %me.sroa.4.0.copyload.i, ptr %1, align 8 %2 = getelementptr inbounds i8, ptr %_0, i64 16 store i64 %0, ptr %2, align 8, !alias.scope !9, !noalias !14 ret void } ``` </details> Note that there is still one more `mul,udiv` pair that I couldn't get rid of. The root cause is the same issue as rust-lang#121239, the `nuw` gets stripped off of `ptr::sub_ptr`. 2. `Iterator::try_fold` gets called on the underlying Iterator in `SpecInPlaceCollect::collect_in_place` whenever it does not implement `TrustedRandomAccess`. For types that impl `Drop`, LLVM currently can't tell that the drop can never occur, when using the default `Iterator::try_fold` implementation. For example, given the following code from rust-lang#120493 ```rust #[repr(transparent)] struct WrappedClone { inner: String } #[no_mangle] pub fn unwrap_clone(list: Vec<WrappedClone>) -> Vec<String> { list.into_iter().map(|s| s.inner).collect() } ``` <details> <summary>The asm for the `unwrap_clone` method is currently:</summary> ```asm unwrap_clone: push rbp push r15 push r14 push r13 push r12 push rbx push rax mov rbx, rdi mov r12, qword ptr [rsi] mov rdi, qword ptr [rsi + 8] mov rax, qword ptr [rsi + 16] movabs rsi, -6148914691236517205 mov r14, r12 test rax, rax je .LBB0_10 lea rcx, [rax + 2*rax] lea r14, [r12 + 8*rcx] shl rax, 3 lea rax, [rax + 2*rax] xor ecx, ecx .LBB0_2: cmp qword ptr [r12 + rcx], 0 je .LBB0_4 add rcx, 24 cmp rax, rcx jne .LBB0_2 jmp .LBB0_10 .LBB0_4: lea rdx, [rax - 24] lea r14, [r12 + rcx] cmp rdx, rcx je .LBB0_10 mov qword ptr [rsp], rdi sub rax, rcx add rax, -24 mul rsi mov r15, rdx lea rbp, [r12 + rcx] add rbp, 32 shr r15, 4 mov r13, qword ptr [rip + __rust_dealloc@GOTPCREL] jmp .LBB0_6 .LBB0_8: add rbp, 24 dec r15 je .LBB0_9 .LBB0_6: mov rsi, qword ptr [rbp] test rsi, rsi je .LBB0_8 mov rdi, qword ptr [rbp - 8] mov edx, 1 call r13 jmp .LBB0_8 .LBB0_9: mov rdi, qword ptr [rsp] movabs rsi, -6148914691236517205 .LBB0_10: sub r14, r12 mov rax, r14 mul rsi shr rdx, 4 mov qword ptr [rbx], r12 mov qword ptr [rbx + 8], rdi mov qword ptr [rbx + 16], rdx mov rax, rbx add rsp, 8 pop rbx pop r12 pop r13 pop r14 pop r15 pop rbp ret ``` </details> <details> <summary>After this PR:</summary> ```asm unwrap_clone: mov rax, rdi movups xmm0, xmmword ptr [rsi] mov rcx, qword ptr [rsi + 16] movups xmmword ptr [rdi], xmm0 mov qword ptr [rdi + 16], rcx ret ``` </details> Fixes rust-lang#120493
optimize inplace collection of Vec This PR has the following changes: 1. Using `usize::unchecked_mul` in https://github.com/rust-lang/rust/blob/79424056b05eaa9563d16dfab9b9a0c8f033f220/library/alloc/src/vec/in_place_collect.rs#L262 as LLVM, does not know that the operation can't wrap, since that's the size of the original allocation. Given the following: ```rust pub struct Foo([usize; 3]); pub fn unwrap_copy(v: Vec<Foo>) -> Vec<[usize; 3]> { v.into_iter().map(|f| f.0).collect() } ``` <details> <summary>Before this commit:</summary> ```llvm define void `@unwrap_copy(ptr` noalias nocapture noundef writeonly sret([24 x i8]) align 8 dereferenceable(24) %_0, ptr noalias nocapture noundef readonly align 8 dereferenceable(24) %iter) { start: %me.sroa.0.0.copyload.i = load i64, ptr %iter, align 8 %me.sroa.4.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 8 %me.sroa.4.0.copyload.i = load ptr, ptr %me.sroa.4.0.self.sroa_idx.i, align 8 %me.sroa.5.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 16 %me.sroa.5.0.copyload.i = load i64, ptr %me.sroa.5.0.self.sroa_idx.i, align 8 %_19.i.idx = mul nsw i64 %me.sroa.5.0.copyload.i, 24 %0 = udiv i64 %_19.i.idx, 24 ; Unnecessary calculation %_16.i.i = mul i64 %me.sroa.0.0.copyload.i, 24 %dst_cap.i.i = udiv i64 %_16.i.i, 24 store i64 %dst_cap.i.i, ptr %_0, align 8 %1 = getelementptr inbounds i8, ptr %_0, i64 8 store ptr %me.sroa.4.0.copyload.i, ptr %1, align 8 %2 = getelementptr inbounds i8, ptr %_0, i64 16 store i64 %0, ptr %2, align 8 ret void } ``` </details> <details> <summary>After:</summary> ```llvm define void `@unwrap_copy(ptr` noalias nocapture noundef writeonly sret([24 x i8]) align 8 dereferenceable(24) %_0, ptr noalias nocapture noundef readonly align 8 dereferenceable(24) %iter) { start: %me.sroa.0.0.copyload.i = load i64, ptr %iter, align 8 %me.sroa.4.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 8 %me.sroa.4.0.copyload.i = load ptr, ptr %me.sroa.4.0.self.sroa_idx.i, align 8 %me.sroa.5.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 16 %me.sroa.5.0.copyload.i = load i64, ptr %me.sroa.5.0.self.sroa_idx.i, align 8 %_19.i.idx = mul nsw i64 %me.sroa.5.0.copyload.i, 24 %0 = udiv i64 %_19.i.idx, 24 store i64 %me.sroa.0.0.copyload.i, ptr %_0, align 8 %1 = getelementptr inbounds i8, ptr %_0, i64 8 store ptr %me.sroa.4.0.copyload.i, ptr %1, align 8 %2 = getelementptr inbounds i8, ptr %_0, i64 16 store i64 %0, ptr %2, align 8, !alias.scope !9, !noalias !14 ret void } ``` </details> Note that there is still one more `mul,udiv` pair that I couldn't get rid of. The root cause is the same issue as rust-lang#121239, the `nuw` gets stripped off of `ptr::sub_ptr`. 2. `Iterator::try_fold` gets called on the underlying Iterator in `SpecInPlaceCollect::collect_in_place` whenever it does not implement `TrustedRandomAccess`. For types that impl `Drop`, LLVM currently can't tell that the drop can never occur, when using the default `Iterator::try_fold` implementation. For example, given the following code from rust-lang#120493 ```rust #[repr(transparent)] struct WrappedClone { inner: String } #[no_mangle] pub fn unwrap_clone(list: Vec<WrappedClone>) -> Vec<String> { list.into_iter().map(|s| s.inner).collect() } ``` <details> <summary>The asm for the `unwrap_clone` method is currently:</summary> ```asm unwrap_clone: push rbp push r15 push r14 push r13 push r12 push rbx push rax mov rbx, rdi mov r12, qword ptr [rsi] mov rdi, qword ptr [rsi + 8] mov rax, qword ptr [rsi + 16] movabs rsi, -6148914691236517205 mov r14, r12 test rax, rax je .LBB0_10 lea rcx, [rax + 2*rax] lea r14, [r12 + 8*rcx] shl rax, 3 lea rax, [rax + 2*rax] xor ecx, ecx .LBB0_2: cmp qword ptr [r12 + rcx], 0 je .LBB0_4 add rcx, 24 cmp rax, rcx jne .LBB0_2 jmp .LBB0_10 .LBB0_4: lea rdx, [rax - 24] lea r14, [r12 + rcx] cmp rdx, rcx je .LBB0_10 mov qword ptr [rsp], rdi sub rax, rcx add rax, -24 mul rsi mov r15, rdx lea rbp, [r12 + rcx] add rbp, 32 shr r15, 4 mov r13, qword ptr [rip + __rust_dealloc@GOTPCREL] jmp .LBB0_6 .LBB0_8: add rbp, 24 dec r15 je .LBB0_9 .LBB0_6: mov rsi, qword ptr [rbp] test rsi, rsi je .LBB0_8 mov rdi, qword ptr [rbp - 8] mov edx, 1 call r13 jmp .LBB0_8 .LBB0_9: mov rdi, qword ptr [rsp] movabs rsi, -6148914691236517205 .LBB0_10: sub r14, r12 mov rax, r14 mul rsi shr rdx, 4 mov qword ptr [rbx], r12 mov qword ptr [rbx + 8], rdi mov qword ptr [rbx + 16], rdx mov rax, rbx add rsp, 8 pop rbx pop r12 pop r13 pop r14 pop r15 pop rbp ret ``` </details> <details> <summary>After this PR:</summary> ```asm unwrap_clone: mov rax, rdi movups xmm0, xmmword ptr [rsi] mov rcx, qword ptr [rsi + 16] movups xmmword ptr [rdi], xmm0 mov qword ptr [rdi + 16], rcx ret ``` </details> Fixes rust-lang#120493
optimize inplace collection of Vec This PR has the following changes: 1. Using `usize::unchecked_mul` in https://github.com/rust-lang/rust/blob/79424056b05eaa9563d16dfab9b9a0c8f033f220/library/alloc/src/vec/in_place_collect.rs#L262 as LLVM, does not know that the operation can't wrap, since that's the size of the original allocation. Given the following: ```rust pub struct Foo([usize; 3]); pub fn unwrap_copy(v: Vec<Foo>) -> Vec<[usize; 3]> { v.into_iter().map(|f| f.0).collect() } ``` <details> <summary>Before this commit:</summary> ```llvm define void `@unwrap_copy(ptr` noalias nocapture noundef writeonly sret([24 x i8]) align 8 dereferenceable(24) %_0, ptr noalias nocapture noundef readonly align 8 dereferenceable(24) %iter) { start: %me.sroa.0.0.copyload.i = load i64, ptr %iter, align 8 %me.sroa.4.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 8 %me.sroa.4.0.copyload.i = load ptr, ptr %me.sroa.4.0.self.sroa_idx.i, align 8 %me.sroa.5.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 16 %me.sroa.5.0.copyload.i = load i64, ptr %me.sroa.5.0.self.sroa_idx.i, align 8 %_19.i.idx = mul nsw i64 %me.sroa.5.0.copyload.i, 24 %0 = udiv i64 %_19.i.idx, 24 ; Unnecessary calculation %_16.i.i = mul i64 %me.sroa.0.0.copyload.i, 24 %dst_cap.i.i = udiv i64 %_16.i.i, 24 store i64 %dst_cap.i.i, ptr %_0, align 8 %1 = getelementptr inbounds i8, ptr %_0, i64 8 store ptr %me.sroa.4.0.copyload.i, ptr %1, align 8 %2 = getelementptr inbounds i8, ptr %_0, i64 16 store i64 %0, ptr %2, align 8 ret void } ``` </details> <details> <summary>After:</summary> ```llvm define void `@unwrap_copy(ptr` noalias nocapture noundef writeonly sret([24 x i8]) align 8 dereferenceable(24) %_0, ptr noalias nocapture noundef readonly align 8 dereferenceable(24) %iter) { start: %me.sroa.0.0.copyload.i = load i64, ptr %iter, align 8 %me.sroa.4.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 8 %me.sroa.4.0.copyload.i = load ptr, ptr %me.sroa.4.0.self.sroa_idx.i, align 8 %me.sroa.5.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 16 %me.sroa.5.0.copyload.i = load i64, ptr %me.sroa.5.0.self.sroa_idx.i, align 8 %_19.i.idx = mul nsw i64 %me.sroa.5.0.copyload.i, 24 %0 = udiv i64 %_19.i.idx, 24 store i64 %me.sroa.0.0.copyload.i, ptr %_0, align 8 %1 = getelementptr inbounds i8, ptr %_0, i64 8 store ptr %me.sroa.4.0.copyload.i, ptr %1, align 8 %2 = getelementptr inbounds i8, ptr %_0, i64 16 store i64 %0, ptr %2, align 8, !alias.scope !9, !noalias !14 ret void } ``` </details> Note that there is still one more `mul,udiv` pair that I couldn't get rid of. The root cause is the same issue as rust-lang/rust#121239, the `nuw` gets stripped off of `ptr::sub_ptr`. 2. `Iterator::try_fold` gets called on the underlying Iterator in `SpecInPlaceCollect::collect_in_place` whenever it does not implement `TrustedRandomAccess`. For types that impl `Drop`, LLVM currently can't tell that the drop can never occur, when using the default `Iterator::try_fold` implementation. For example, given the following code from #120493 ```rust #[repr(transparent)] struct WrappedClone { inner: String } #[no_mangle] pub fn unwrap_clone(list: Vec<WrappedClone>) -> Vec<String> { list.into_iter().map(|s| s.inner).collect() } ``` <details> <summary>The asm for the `unwrap_clone` method is currently:</summary> ```asm unwrap_clone: push rbp push r15 push r14 push r13 push r12 push rbx push rax mov rbx, rdi mov r12, qword ptr [rsi] mov rdi, qword ptr [rsi + 8] mov rax, qword ptr [rsi + 16] movabs rsi, -6148914691236517205 mov r14, r12 test rax, rax je .LBB0_10 lea rcx, [rax + 2*rax] lea r14, [r12 + 8*rcx] shl rax, 3 lea rax, [rax + 2*rax] xor ecx, ecx .LBB0_2: cmp qword ptr [r12 + rcx], 0 je .LBB0_4 add rcx, 24 cmp rax, rcx jne .LBB0_2 jmp .LBB0_10 .LBB0_4: lea rdx, [rax - 24] lea r14, [r12 + rcx] cmp rdx, rcx je .LBB0_10 mov qword ptr [rsp], rdi sub rax, rcx add rax, -24 mul rsi mov r15, rdx lea rbp, [r12 + rcx] add rbp, 32 shr r15, 4 mov r13, qword ptr [rip + __rust_dealloc@GOTPCREL] jmp .LBB0_6 .LBB0_8: add rbp, 24 dec r15 je .LBB0_9 .LBB0_6: mov rsi, qword ptr [rbp] test rsi, rsi je .LBB0_8 mov rdi, qword ptr [rbp - 8] mov edx, 1 call r13 jmp .LBB0_8 .LBB0_9: mov rdi, qword ptr [rsp] movabs rsi, -6148914691236517205 .LBB0_10: sub r14, r12 mov rax, r14 mul rsi shr rdx, 4 mov qword ptr [rbx], r12 mov qword ptr [rbx + 8], rdi mov qword ptr [rbx + 16], rdx mov rax, rbx add rsp, 8 pop rbx pop r12 pop r13 pop r14 pop r15 pop rbp ret ``` </details> <details> <summary>After this PR:</summary> ```asm unwrap_clone: mov rax, rdi movups xmm0, xmmword ptr [rsi] mov rcx, qword ptr [rsi + 16] movups xmmword ptr [rdi], xmm0 mov qword ptr [rdi + 16], rcx ret ``` </details> Fixes rust-lang/rust#120493
optimize inplace collection of Vec This PR has the following changes: 1. Using `usize::unchecked_mul` in https://github.com/rust-lang/rust/blob/79424056b05eaa9563d16dfab9b9a0c8f033f220/library/alloc/src/vec/in_place_collect.rs#L262 as LLVM, does not know that the operation can't wrap, since that's the size of the original allocation. Given the following: ```rust pub struct Foo([usize; 3]); pub fn unwrap_copy(v: Vec<Foo>) -> Vec<[usize; 3]> { v.into_iter().map(|f| f.0).collect() } ``` <details> <summary>Before this commit:</summary> ```llvm define void `@unwrap_copy(ptr` noalias nocapture noundef writeonly sret([24 x i8]) align 8 dereferenceable(24) %_0, ptr noalias nocapture noundef readonly align 8 dereferenceable(24) %iter) { start: %me.sroa.0.0.copyload.i = load i64, ptr %iter, align 8 %me.sroa.4.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 8 %me.sroa.4.0.copyload.i = load ptr, ptr %me.sroa.4.0.self.sroa_idx.i, align 8 %me.sroa.5.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 16 %me.sroa.5.0.copyload.i = load i64, ptr %me.sroa.5.0.self.sroa_idx.i, align 8 %_19.i.idx = mul nsw i64 %me.sroa.5.0.copyload.i, 24 %0 = udiv i64 %_19.i.idx, 24 ; Unnecessary calculation %_16.i.i = mul i64 %me.sroa.0.0.copyload.i, 24 %dst_cap.i.i = udiv i64 %_16.i.i, 24 store i64 %dst_cap.i.i, ptr %_0, align 8 %1 = getelementptr inbounds i8, ptr %_0, i64 8 store ptr %me.sroa.4.0.copyload.i, ptr %1, align 8 %2 = getelementptr inbounds i8, ptr %_0, i64 16 store i64 %0, ptr %2, align 8 ret void } ``` </details> <details> <summary>After:</summary> ```llvm define void `@unwrap_copy(ptr` noalias nocapture noundef writeonly sret([24 x i8]) align 8 dereferenceable(24) %_0, ptr noalias nocapture noundef readonly align 8 dereferenceable(24) %iter) { start: %me.sroa.0.0.copyload.i = load i64, ptr %iter, align 8 %me.sroa.4.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 8 %me.sroa.4.0.copyload.i = load ptr, ptr %me.sroa.4.0.self.sroa_idx.i, align 8 %me.sroa.5.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 16 %me.sroa.5.0.copyload.i = load i64, ptr %me.sroa.5.0.self.sroa_idx.i, align 8 %_19.i.idx = mul nsw i64 %me.sroa.5.0.copyload.i, 24 %0 = udiv i64 %_19.i.idx, 24 store i64 %me.sroa.0.0.copyload.i, ptr %_0, align 8 %1 = getelementptr inbounds i8, ptr %_0, i64 8 store ptr %me.sroa.4.0.copyload.i, ptr %1, align 8 %2 = getelementptr inbounds i8, ptr %_0, i64 16 store i64 %0, ptr %2, align 8, !alias.scope !9, !noalias !14 ret void } ``` </details> Note that there is still one more `mul,udiv` pair that I couldn't get rid of. The root cause is the same issue as rust-lang/rust#121239, the `nuw` gets stripped off of `ptr::sub_ptr`. 2. `Iterator::try_fold` gets called on the underlying Iterator in `SpecInPlaceCollect::collect_in_place` whenever it does not implement `TrustedRandomAccess`. For types that impl `Drop`, LLVM currently can't tell that the drop can never occur, when using the default `Iterator::try_fold` implementation. For example, given the following code from #120493 ```rust #[repr(transparent)] struct WrappedClone { inner: String } #[no_mangle] pub fn unwrap_clone(list: Vec<WrappedClone>) -> Vec<String> { list.into_iter().map(|s| s.inner).collect() } ``` <details> <summary>The asm for the `unwrap_clone` method is currently:</summary> ```asm unwrap_clone: push rbp push r15 push r14 push r13 push r12 push rbx push rax mov rbx, rdi mov r12, qword ptr [rsi] mov rdi, qword ptr [rsi + 8] mov rax, qword ptr [rsi + 16] movabs rsi, -6148914691236517205 mov r14, r12 test rax, rax je .LBB0_10 lea rcx, [rax + 2*rax] lea r14, [r12 + 8*rcx] shl rax, 3 lea rax, [rax + 2*rax] xor ecx, ecx .LBB0_2: cmp qword ptr [r12 + rcx], 0 je .LBB0_4 add rcx, 24 cmp rax, rcx jne .LBB0_2 jmp .LBB0_10 .LBB0_4: lea rdx, [rax - 24] lea r14, [r12 + rcx] cmp rdx, rcx je .LBB0_10 mov qword ptr [rsp], rdi sub rax, rcx add rax, -24 mul rsi mov r15, rdx lea rbp, [r12 + rcx] add rbp, 32 shr r15, 4 mov r13, qword ptr [rip + __rust_dealloc@GOTPCREL] jmp .LBB0_6 .LBB0_8: add rbp, 24 dec r15 je .LBB0_9 .LBB0_6: mov rsi, qword ptr [rbp] test rsi, rsi je .LBB0_8 mov rdi, qword ptr [rbp - 8] mov edx, 1 call r13 jmp .LBB0_8 .LBB0_9: mov rdi, qword ptr [rsp] movabs rsi, -6148914691236517205 .LBB0_10: sub r14, r12 mov rax, r14 mul rsi shr rdx, 4 mov qword ptr [rbx], r12 mov qword ptr [rbx + 8], rdi mov qword ptr [rbx + 16], rdx mov rax, rbx add rsp, 8 pop rbx pop r12 pop r13 pop r14 pop r15 pop rbp ret ``` </details> <details> <summary>After this PR:</summary> ```asm unwrap_clone: mov rax, rdi movups xmm0, xmmword ptr [rsi] mov rcx, qword ptr [rsi + 16] movups xmmword ptr [rdi], xmm0 mov qword ptr [rdi + 16], rcx ret ``` </details> Fixes rust-lang/rust#120493
optimize inplace collection of Vec This PR has the following changes: 1. Using `usize::unchecked_mul` in https://github.com/rust-lang/rust/blob/79424056b05eaa9563d16dfab9b9a0c8f033f220/library/alloc/src/vec/in_place_collect.rs#L262 as LLVM, does not know that the operation can't wrap, since that's the size of the original allocation. Given the following: ```rust pub struct Foo([usize; 3]); pub fn unwrap_copy(v: Vec<Foo>) -> Vec<[usize; 3]> { v.into_iter().map(|f| f.0).collect() } ``` <details> <summary>Before this commit:</summary> ```llvm define void `@unwrap_copy(ptr` noalias nocapture noundef writeonly sret([24 x i8]) align 8 dereferenceable(24) %_0, ptr noalias nocapture noundef readonly align 8 dereferenceable(24) %iter) { start: %me.sroa.0.0.copyload.i = load i64, ptr %iter, align 8 %me.sroa.4.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 8 %me.sroa.4.0.copyload.i = load ptr, ptr %me.sroa.4.0.self.sroa_idx.i, align 8 %me.sroa.5.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 16 %me.sroa.5.0.copyload.i = load i64, ptr %me.sroa.5.0.self.sroa_idx.i, align 8 %_19.i.idx = mul nsw i64 %me.sroa.5.0.copyload.i, 24 %0 = udiv i64 %_19.i.idx, 24 ; Unnecessary calculation %_16.i.i = mul i64 %me.sroa.0.0.copyload.i, 24 %dst_cap.i.i = udiv i64 %_16.i.i, 24 store i64 %dst_cap.i.i, ptr %_0, align 8 %1 = getelementptr inbounds i8, ptr %_0, i64 8 store ptr %me.sroa.4.0.copyload.i, ptr %1, align 8 %2 = getelementptr inbounds i8, ptr %_0, i64 16 store i64 %0, ptr %2, align 8 ret void } ``` </details> <details> <summary>After:</summary> ```llvm define void `@unwrap_copy(ptr` noalias nocapture noundef writeonly sret([24 x i8]) align 8 dereferenceable(24) %_0, ptr noalias nocapture noundef readonly align 8 dereferenceable(24) %iter) { start: %me.sroa.0.0.copyload.i = load i64, ptr %iter, align 8 %me.sroa.4.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 8 %me.sroa.4.0.copyload.i = load ptr, ptr %me.sroa.4.0.self.sroa_idx.i, align 8 %me.sroa.5.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 16 %me.sroa.5.0.copyload.i = load i64, ptr %me.sroa.5.0.self.sroa_idx.i, align 8 %_19.i.idx = mul nsw i64 %me.sroa.5.0.copyload.i, 24 %0 = udiv i64 %_19.i.idx, 24 store i64 %me.sroa.0.0.copyload.i, ptr %_0, align 8 %1 = getelementptr inbounds i8, ptr %_0, i64 8 store ptr %me.sroa.4.0.copyload.i, ptr %1, align 8 %2 = getelementptr inbounds i8, ptr %_0, i64 16 store i64 %0, ptr %2, align 8, !alias.scope !9, !noalias !14 ret void } ``` </details> Note that there is still one more `mul,udiv` pair that I couldn't get rid of. The root cause is the same issue as rust-lang/rust#121239, the `nuw` gets stripped off of `ptr::sub_ptr`. 2. `Iterator::try_fold` gets called on the underlying Iterator in `SpecInPlaceCollect::collect_in_place` whenever it does not implement `TrustedRandomAccess`. For types that impl `Drop`, LLVM currently can't tell that the drop can never occur, when using the default `Iterator::try_fold` implementation. For example, given the following code from #120493 ```rust #[repr(transparent)] struct WrappedClone { inner: String } #[no_mangle] pub fn unwrap_clone(list: Vec<WrappedClone>) -> Vec<String> { list.into_iter().map(|s| s.inner).collect() } ``` <details> <summary>The asm for the `unwrap_clone` method is currently:</summary> ```asm unwrap_clone: push rbp push r15 push r14 push r13 push r12 push rbx push rax mov rbx, rdi mov r12, qword ptr [rsi] mov rdi, qword ptr [rsi + 8] mov rax, qword ptr [rsi + 16] movabs rsi, -6148914691236517205 mov r14, r12 test rax, rax je .LBB0_10 lea rcx, [rax + 2*rax] lea r14, [r12 + 8*rcx] shl rax, 3 lea rax, [rax + 2*rax] xor ecx, ecx .LBB0_2: cmp qword ptr [r12 + rcx], 0 je .LBB0_4 add rcx, 24 cmp rax, rcx jne .LBB0_2 jmp .LBB0_10 .LBB0_4: lea rdx, [rax - 24] lea r14, [r12 + rcx] cmp rdx, rcx je .LBB0_10 mov qword ptr [rsp], rdi sub rax, rcx add rax, -24 mul rsi mov r15, rdx lea rbp, [r12 + rcx] add rbp, 32 shr r15, 4 mov r13, qword ptr [rip + __rust_dealloc@GOTPCREL] jmp .LBB0_6 .LBB0_8: add rbp, 24 dec r15 je .LBB0_9 .LBB0_6: mov rsi, qword ptr [rbp] test rsi, rsi je .LBB0_8 mov rdi, qword ptr [rbp - 8] mov edx, 1 call r13 jmp .LBB0_8 .LBB0_9: mov rdi, qword ptr [rsp] movabs rsi, -6148914691236517205 .LBB0_10: sub r14, r12 mov rax, r14 mul rsi shr rdx, 4 mov qword ptr [rbx], r12 mov qword ptr [rbx + 8], rdi mov qword ptr [rbx + 16], rdx mov rax, rbx add rsp, 8 pop rbx pop r12 pop r13 pop r14 pop r15 pop rbp ret ``` </details> <details> <summary>After this PR:</summary> ```asm unwrap_clone: mov rax, rdi movups xmm0, xmmword ptr [rsi] mov rcx, qword ptr [rsi + 16] movups xmmword ptr [rdi], xmm0 mov qword ptr [rdi + 16], rcx ret ``` </details> Fixes rust-lang/rust#120493
Not sure how to confirm whether this is really fixed now or not. |
I'll dedust my branch and check it locally. |
triage: original repro indeed fixed, @the8472 any news on that local check? |
|
While working on #120682 I noticed that some optimizations regressed after a rebase. The following should be a reduction.
I tried this code:
godbolt
I expected to see this happen (beta):
Instead, this happened (nightly):
Meta
rustc --version --verbose
:The text was updated successfully, but these errors were encountered: