diff --git a/core/src/slice/sort/mod.rs b/core/src/slice/sort/mod.rs
index 79852708b81ea..81b99a4364b2e 100644
--- a/core/src/slice/sort/mod.rs
+++ b/core/src/slice/sort/mod.rs
@@ -5,4 +5,5 @@ pub mod stable;
 pub mod unstable;
 
 pub(crate) mod select;
+#[cfg(not(feature = "optimize_for_size"))]
 pub(crate) mod shared;
diff --git a/core/src/slice/sort/select.rs b/core/src/slice/sort/select.rs
index f6529f23bcb3f..b60e33b3a2f0d 100644
--- a/core/src/slice/sort/select.rs
+++ b/core/src/slice/sort/select.rs
@@ -6,9 +6,13 @@
 //! for pivot selection. Using this as a fallback ensures O(n) worst case running time with
 //! better performance than one would get using heapsort as fallback.
 
+use crate::intrinsics;
 use crate::mem::{self, SizedTypeProperties};
+#[cfg(not(feature = "optimize_for_size"))]
 use crate::slice::sort::shared::pivot::choose_pivot;
+#[cfg(not(feature = "optimize_for_size"))]
 use crate::slice::sort::shared::smallsort::insertion_sort_shift_left;
+#[cfg(not(feature = "optimize_for_size"))]
 use crate::slice::sort::unstable::quicksort::partition;
 
 /// Reorders the slice such that the element at `index` is at its final sorted position.
@@ -40,7 +44,15 @@ where
         let min_idx = min_index(v, &mut is_less).unwrap();
         v.swap(min_idx, index);
     } else {
-        partition_at_index_loop(v, index, None, &mut is_less);
+        #[cfg(not(feature = "optimize_for_size"))]
+        {
+            partition_at_index_loop(v, index, None, &mut is_less);
+        }
+
+        #[cfg(feature = "optimize_for_size")]
+        {
+            median_of_medians(v, &mut is_less, index);
+        }
     }
 
     let (left, right) = v.split_at_mut(index);
@@ -53,6 +65,7 @@ where
 // most once, it doesn't make sense to use something more sophisticated than insertion-sort.
 const INSERTION_SORT_THRESHOLD: usize = 16;
 
+#[cfg(not(feature = "optimize_for_size"))]
 fn partition_at_index_loop<'a, T, F>(
     mut v: &'a mut [T],
     mut index: usize,
@@ -167,8 +180,17 @@ fn median_of_medians<T, F: FnMut(&T, &T) -> bool>(mut v: &mut [T], is_less: &mut
     loop {
         if v.len() <= INSERTION_SORT_THRESHOLD {
             if v.len() >= 2 {
-                insertion_sort_shift_left(v, 1, is_less);
+                #[cfg(not(feature = "optimize_for_size"))]
+                {
+                    insertion_sort_shift_left(v, 1, is_less);
+                }
+
+                #[cfg(feature = "optimize_for_size")]
+                {
+                    bubble_sort(v, is_less);
+                }
             }
+
             return;
         }
 
@@ -230,7 +252,15 @@ fn median_of_ninthers<T, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F)
 
     median_of_medians(&mut v[lo..lo + frac], is_less, pivot);
 
-    partition(v, lo + pivot, is_less)
+    #[cfg(not(feature = "optimize_for_size"))]
+    {
+        partition(v, lo + pivot, is_less)
+    }
+
+    #[cfg(feature = "optimize_for_size")]
+    {
+        partition_size_opt(v, lo + pivot, is_less)
+    }
 }
 
 /// Moves around the 9 elements at the indices a..i, such that
@@ -298,3 +328,92 @@ fn median_idx<T, F: FnMut(&T, &T) -> bool>(
     }
     b
 }
+
+// It's possible to re-use the insertion sort in the smallsort module, but with optimize_for_size it
+// would clutter that module with cfg statements and make it generally harder to read and develop.
+// So to decouple things and simplify it, we use an even smaller bubble sort.
+#[cfg(feature = "optimize_for_size")]
+fn bubble_sort<T, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F) {
+    let mut n = v.len();
+    let mut did_swap = true;
+
+    while did_swap && n > 1 {
+        did_swap = false;
+        for i in 1..n {
+            // SAFETY: The loop construction implies that `i` and `i - 1` will always be in-bounds.
+            unsafe {
+                if is_less(v.get_unchecked(i), v.get_unchecked(i - 1)) {
+                    v.swap_unchecked(i - 1, i);
+                    did_swap = true;
+                }
+            }
+        }
+        n -= 1;
+    }
+}
+
+#[cfg(feature = "optimize_for_size")]
+fn partition_size_opt<T, F>(v: &mut [T], pivot: usize, is_less: &mut F) -> usize
+where
+    F: FnMut(&T, &T) -> bool,
+{
+    let len = v.len();
+
+    // Allows for panic-free code-gen by proving this property to the compiler.
+    if len == 0 {
+        return 0;
+    }
+
+    if pivot >= len {
+        intrinsics::abort();
+    }
+
+    // SAFETY: We checked that `pivot` is in-bounds.
+    unsafe {
+        // Place the pivot at the beginning of slice.
+        v.swap_unchecked(0, pivot);
+    }
+    let (pivot, v_without_pivot) = v.split_at_mut(1);
+
+    // Assuming that Rust generates noalias LLVM IR we can be sure that a partition function
+    // signature of the form `(v: &mut [T], pivot: &T)` guarantees that pivot and v can't alias.
+    // Having this guarantee is crucial for optimizations. It's possible to copy the pivot value
+    // into a stack value, but this creates issues for types with interior mutability mandating
+    // a drop guard.
+    let pivot = &mut pivot[0];
+
+    let num_lt = partition_lomuto_branchless_simple(v_without_pivot, pivot, is_less);
+
+    if num_lt >= len {
+        intrinsics::abort();
+    }
+
+    // SAFETY: We checked that `num_lt` is in-bounds.
+    unsafe {
+        // Place the pivot between the two partitions.
+        v.swap_unchecked(0, num_lt);
+    }
+
+    num_lt
+}
+
+#[cfg(feature = "optimize_for_size")]
+fn partition_lomuto_branchless_simple<T, F: FnMut(&T, &T) -> bool>(
+    v: &mut [T],
+    pivot: &T,
+    is_less: &mut F,
+) -> usize {
+    let mut left = 0;
+
+    for right in 0..v.len() {
+        // SAFETY: `left` can at max be incremented by 1 each loop iteration, which implies that
+        // left <= right and that both are in-bounds.
+        unsafe {
+            let right_is_lt = is_less(v.get_unchecked(right), pivot);
+            v.swap_unchecked(left, right);
+            left += right_is_lt as usize;
+        }
+    }
+
+    left
+}
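The size-optimized selection path above ends in `partition_lomuto_branchless_simple`: the swap is unconditional and `left` only advances when the element compared less, so the loop body carries no data-dependent branch. As a rough safe-Rust restatement of that scheme (the function name and the separately passed pivot are illustrative only, not part of the patch):

```rust
fn lomuto_partition_demo<T: Ord>(v: &mut [T], pivot: &T) -> usize {
    let mut left = 0;
    for right in 0..v.len() {
        // Compare, swap unconditionally, then advance `left` only if the element
        // was smaller than the pivot; v[..left] stays strictly less than the pivot.
        let right_is_lt = v[right] < *pivot;
        v.swap(left, right);
        left += right_is_lt as usize;
    }
    // e.g. on [3, 7, 1, 5, 2] with pivot 4 this returns 3 and the slice starts [3, 1, 2, ...].
    left
}
```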
diff --git a/core/src/slice/sort/shared/smallsort.rs b/core/src/slice/sort/shared/smallsort.rs
index db0c5c72822c0..5027962ccb415 100644
--- a/core/src/slice/sort/shared/smallsort.rs
+++ b/core/src/slice/sort/shared/smallsort.rs
@@ -378,7 +378,7 @@ where
 
 /// Swap two values in the slice pointed to by `v_base` at the position `a_pos` and `b_pos` if the
 /// value at position `b_pos` is less than the one at position `a_pos`.
-pub unsafe fn swap_if_less<T, F>(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F)
+unsafe fn swap_if_less<T, F>(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F)
 where
     F: FnMut(&T, &T) -> bool,
 {
diff --git a/core/src/slice/sort/stable/mod.rs b/core/src/slice/sort/stable/mod.rs
index a383b0f589ccf..a61a95a225455 100644
--- a/core/src/slice/sort/stable/mod.rs
+++ b/core/src/slice/sort/stable/mod.rs
@@ -1,15 +1,24 @@
 //! This module contains the entry points for `slice::sort`.
 
+#[cfg(not(feature = "optimize_for_size"))]
+use crate::cmp;
+use crate::intrinsics;
 use crate::mem::{self, MaybeUninit, SizedTypeProperties};
+#[cfg(not(feature = "optimize_for_size"))]
 use crate::slice::sort::shared::smallsort::{
     insertion_sort_shift_left, StableSmallSortTypeImpl, SMALL_SORT_GENERAL_SCRATCH_LEN,
 };
-use crate::{cmp, intrinsics};
 
-pub(crate) mod drift;
 pub(crate) mod merge;
+
+#[cfg(not(feature = "optimize_for_size"))]
+pub(crate) mod drift;
+#[cfg(not(feature = "optimize_for_size"))]
 pub(crate) mod quicksort;
 
+#[cfg(feature = "optimize_for_size")]
+pub(crate) mod tiny;
+
 /// Stable sort called driftsort by Orson Peters and Lukas Bergdoll.
 /// Design document:
 /// <https://github.com/Voultapher/sort-research-rs/blob/main/writeup/driftsort_introduction/text.md>
@@ -30,25 +39,48 @@ pub fn sort<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], is_less
         return;
     }
 
-    // More advanced sorting methods than insertion sort are faster if called in
-    // a hot loop for small inputs, but for general-purpose code the small
-    // binary size of insertion sort is more important. The instruction cache in
-    // modern processors is very valuable, and for a single sort call in general
-    // purpose code any gains from an advanced method are cancelled by i-cache
-    // misses during the sort, and thrashing the i-cache for surrounding code.
-    const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
-    if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
-        insertion_sort_shift_left(v, 1, is_less);
-        return;
+    #[cfg(not(feature = "optimize_for_size"))]
+    {
+        // More advanced sorting methods than insertion sort are faster if called in
+        // a hot loop for small inputs, but for general-purpose code the small
+        // binary size of insertion sort is more important. The instruction cache in
+        // modern processors is very valuable, and for a single sort call in general
+        // purpose code any gains from an advanced method are cancelled by i-cache
+        // misses during the sort, and thrashing the i-cache for surrounding code.
+        const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
+        if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
+            insertion_sort_shift_left(v, 1, is_less);
+            return;
+        }
+
+        driftsort_main::<T, F, BufT>(v, is_less);
     }
 
-    driftsort_main::<T, F, BufT>(v, is_less);
+    #[cfg(feature = "optimize_for_size")]
+    {
+        let alloc_len = len / 2;
+
+        // For small inputs 4KiB of stack storage suffices, which allows us to avoid
+        // calling the (de-)allocator. Benchmarks showed this was quite beneficial.
+        let mut stack_buf = AlignedStorage::<T, 4096>::new();
+        let stack_scratch = stack_buf.as_uninit_slice_mut();
+        let mut heap_buf;
+        let scratch = if stack_scratch.len() >= alloc_len {
+            stack_scratch
+        } else {
+            heap_buf = BufT::with_capacity(alloc_len);
+            heap_buf.as_uninit_slice_mut()
+        };
+
+        tiny::mergesort(v, scratch, is_less);
+    }
 }
 
 /// See [`sort`]
 ///
 /// Deliberately don't inline the main sorting routine entrypoint to ensure the
 /// inlined insertion sort i-cache footprint remains minimal.
+#[cfg(not(feature = "optimize_for_size"))]
 #[inline(never)]
 fn driftsort_main<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], is_less: &mut F) {
     // By allocating n elements of memory we can ensure the entire input can
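The `alloc_len = len / 2` choice above works because merging two sorted halves only requires stashing one of them; the other can stay in place. A minimal safe sketch of that idea, using a `Vec` (outside `core`) in place of the `MaybeUninit` scratch the real `merge::merge` receives, with `merge_halves_demo` as a made-up name:

```rust
fn merge_halves_demo<T: Ord + Clone>(v: &mut [T], mid: usize) {
    // Stash only the left run; scratch proportional to half the input is enough.
    let left: Vec<T> = v[..mid].to_vec();
    let (mut i, mut j, mut out) = (0, mid, 0);
    while i < left.len() && j < v.len() {
        // Take from the right run only on strict `<` so equal elements keep their
        // left-first order, which is what makes the merge stable.
        if v[j] < left[i] {
            let x = v[j].clone();
            v[out] = x;
            j += 1;
        } else {
            v[out] = left[i].clone();
            i += 1;
        }
        out += 1;
    }
    // Leftover stashed elements go at the end; leftover right-run elements are
    // already in their final positions.
    while i < left.len() {
        v[out] = left[i].clone();
        i += 1;
        out += 1;
    }
}
```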
diff --git a/core/src/slice/sort/stable/tiny.rs b/core/src/slice/sort/stable/tiny.rs
new file mode 100644
index 0000000000000..307011f3ee8c4
--- /dev/null
+++ b/core/src/slice/sort/stable/tiny.rs
@@ -0,0 +1,75 @@
+//! Binary-size optimized mergesort inspired by https://github.com/voultapher/tiny-sort-rs.
+
+use crate::mem::{ManuallyDrop, MaybeUninit};
+use crate::ptr;
+use crate::slice::sort::stable::merge;
+
+/// Tiny recursive top-down merge sort optimized for binary size. It has no adaptiveness whatsoever,
+/// no run detection, etc.
+#[inline(always)]
+pub fn mergesort<T, F: FnMut(&T, &T) -> bool>(
+    v: &mut [T],
+    scratch: &mut [MaybeUninit<T>],
+    is_less: &mut F,
+) {
+    let len = v.len();
+
+    if len > 2 {
+        let mid = len / 2;
+
+        // SAFETY: mid is in-bounds.
+        unsafe {
+            // Sort the left half recursively.
+            mergesort(v.get_unchecked_mut(..mid), scratch, is_less);
+            // Sort the right half recursively.
+            mergesort(v.get_unchecked_mut(mid..), scratch, is_less);
+        }
+
+        merge::merge(v, scratch, mid, is_less);
+    } else if len == 2 {
+        // Branchless swap the two elements. This reduces the recursion depth and improves
+        // perf significantly at a small binary-size cost. Trades ~10% perf boost for integers
+        // for ~50 bytes in the binary.
+
+        // SAFETY: We checked the len, the pointers we create are valid and don't overlap.
+        unsafe {
+            swap_if_less(v.as_mut_ptr(), 0, 1, is_less);
+        }
+    }
+}
+
+/// Swap two values in the slice pointed to by `v_base` at the position `a_pos` and `b_pos` if the
+/// value at position `b_pos` is less than the one at position `a_pos`.
+unsafe fn swap_if_less<T, F>(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F)
+where
+    F: FnMut(&T, &T) -> bool,
+{
+    // SAFETY: the caller must guarantee that `a` and `b` each added to `v_base` yield valid
+    // pointers into `v_base`, and are properly aligned, and part of the same allocation.
+    unsafe {
+        let v_a = v_base.add(a_pos);
+        let v_b = v_base.add(b_pos);
+
+        // PANIC SAFETY: if is_less panics, no scratch memory was created and the slice should still be
+        // in a well defined state, without duplicates.
+
+        // Important to only swap if it is more and not if it is equal. is_less should return false for
+        // equal, so we don't swap.
+        let should_swap = is_less(&*v_b, &*v_a);
+
+        // This is a branchless version of swap if.
+        // The equivalent code with a branch would be:
+        //
+        // if should_swap {
+        //     ptr::swap(left, right, 1);
+        // }
+        //
+        // The goal is to generate cmov instructions here.
+        let left_swap = if should_swap { v_b } else { v_a };
+        let right_swap = if should_swap { v_a } else { v_b };
+
+        let right_swap_tmp = ManuallyDrop::new(ptr::read(right_swap));
+        ptr::copy(left_swap, v_a, 1);
+        ptr::copy_nonoverlapping(&*right_swap_tmp, v_b, 1);
+    }
+}
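`swap_if_less` above is the pointer-level version of a very small idea: select both outputs with value selects so the hot path can compile to conditional moves rather than a branch. A safe sketch for `Copy` types, with `sort2_demo` as an invented name:

```rust
fn sort2_demo<T: Ord + Copy>(a: T, b: T) -> (T, T) {
    // Only swap on strict `<`, mirroring swap_if_less: equal elements keep their order.
    let swap = b < a;
    let lo = if swap { b } else { a };
    let hi = if swap { a } else { b };
    (lo, hi)
}

// sort2_demo(2, 1) == (1, 2), and sort2_demo(1, 2) == (1, 2) as well.
```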
diff --git a/core/src/slice/sort/unstable/mod.rs b/core/src/slice/sort/unstable/mod.rs
index 932e01f4401e5..43b5cc79fc7f3 100644
--- a/core/src/slice/sort/unstable/mod.rs
+++ b/core/src/slice/sort/unstable/mod.rs
@@ -2,10 +2,13 @@
 
 use crate::intrinsics;
 use crate::mem::SizedTypeProperties;
+#[cfg(not(feature = "optimize_for_size"))]
 use crate::slice::sort::shared::find_existing_run;
+#[cfg(not(feature = "optimize_for_size"))]
 use crate::slice::sort::shared::smallsort::insertion_sort_shift_left;
 
 pub(crate) mod heapsort;
+#[cfg(not(feature = "optimize_for_size"))]
 pub(crate) mod quicksort;
 
 /// Unstable sort called ipnsort by Lukas Bergdoll and Orson Peters.
@@ -28,25 +31,37 @@ pub fn sort<T, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F) {
         return;
     }
 
-    // More advanced sorting methods than insertion sort are faster if called in
-    // a hot loop for small inputs, but for general-purpose code the small
-    // binary size of insertion sort is more important. The instruction cache in
-    // modern processors is very valuable, and for a single sort call in general
-    // purpose code any gains from an advanced method are cancelled by i-cache
-    // misses during the sort, and thrashing the i-cache for surrounding code.
-    const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
-    if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
-        insertion_sort_shift_left(v, 1, is_less);
-        return;
+    #[cfg(not(feature = "optimize_for_size"))]
+    {
+        // More advanced sorting methods than insertion sort are faster if called in
+        // a hot loop for small inputs, but for general-purpose code the small
+        // binary size of insertion sort is more important. The instruction cache in
+        // modern processors is very valuable, and for a single sort call in general
+        // purpose code any gains from an advanced method are cancelled by i-cache
+        // misses during the sort, and thrashing the i-cache for surrounding code.
+        const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
+        if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
+            insertion_sort_shift_left(v, 1, is_less);
+            return;
+        }
+
+        ipnsort(v, is_less);
    }
 
-    ipnsort(v, is_less);
+    #[cfg(feature = "optimize_for_size")]
+    {
+        // SAFETY: We checked that `len >= 2`.
+        unsafe {
+            heapsort::heapsort(v, is_less);
+        }
+    }
 }
 
 /// See [`sort`]
 ///
 /// Deliberately don't inline the main sorting routine entrypoint to ensure the
 /// inlined insertion sort i-cache footprint remains minimal.
+#[cfg(not(feature = "optimize_for_size"))]
 #[inline(never)]
 fn ipnsort<T, F>(v: &mut [T], is_less: &mut F)
 where
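With `optimize_for_size` the unstable entry point above falls back to the heapsort that is already present in the tree: it needs no allocation, keeps an O(n log n) worst case, and compiles to very little code. For orientation only, a safe `Ord`-based stand-in for that shape (the real `heapsort::heapsort` is generic over `is_less` and uses unchecked indexing):

```rust
fn heapsort_demo<T: Ord>(v: &mut [T]) {
    // Build a max-heap over the whole slice.
    let len = v.len();
    for i in (0..len / 2).rev() {
        sift_down(v, i);
    }
    // Repeatedly move the current maximum to the end of the unsorted prefix.
    for end in (1..len).rev() {
        v.swap(0, end);
        sift_down(&mut v[..end], 0);
    }
}

fn sift_down<T: Ord>(v: &mut [T], mut node: usize) {
    loop {
        let mut child = 2 * node + 1;
        if child >= v.len() {
            break;
        }
        // Pick the larger child, then stop once the heap property holds.
        if child + 1 < v.len() && v[child] < v[child + 1] {
            child += 1;
        }
        if v[child] <= v[node] {
            break;
        }
        v.swap(node, child);
        node = child;
    }
}
```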
diff --git a/core/src/slice/sort/unstable/quicksort.rs b/core/src/slice/sort/unstable/quicksort.rs
index cd53656e9b4b8..e55ebe67e1306 100644
--- a/core/src/slice/sort/unstable/quicksort.rs
+++ b/core/src/slice/sort/unstable/quicksort.rs
@@ -98,13 +98,15 @@ where
         return 0;
     }
 
-    // Allows for panic-free code-gen by proving this property to the compiler.
     if pivot >= len {
         intrinsics::abort();
     }
 
-    // Place the pivot at the beginning of slice.
-    v.swap(0, pivot);
+    // SAFETY: We checked that `pivot` is in-bounds.
+    unsafe {
+        // Place the pivot at the beginning of slice.
+        v.swap_unchecked(0, pivot);
+    }
     let (pivot, v_without_pivot) = v.split_at_mut(1);
 
     // Assuming that Rust generates noalias LLVM IR we can be sure that a partition function
@@ -118,8 +120,15 @@ where
     // compile-time by only instantiating the code that is needed. Idea by Frank Steffahn.
     let num_lt = (const { inst_partition::<T, F>() })(v_without_pivot, pivot, is_less);
 
-    // Place the pivot between the two partitions.
-    v.swap(0, num_lt);
+    if num_lt >= len {
+        intrinsics::abort();
+    }
+
+    // SAFETY: We checked that `num_lt` is in-bounds.
+    unsafe {
+        // Place the pivot between the two partitions.
+        v.swap_unchecked(0, num_lt);
+    }
 
     num_lt
 }
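None of the hunks above change the public API; the `optimize_for_size` feature only swaps which implementation backs each entry point at compile time. A small usage sketch of the three affected entry points (the values are arbitrary):

```rust
fn main() {
    // Stable sort: driftsort normally, the tiny mergesort with optimize_for_size.
    let mut v = [5, 3, 9, 1, 4];
    v.sort();
    assert_eq!(v, [1, 3, 4, 5, 9]);

    // Unstable sort: ipnsort normally, heapsort with optimize_for_size.
    let mut w = [8, 2, 7, 4, 6];
    w.sort_unstable();
    assert_eq!(w, [2, 4, 6, 7, 8]);

    // Selection: introselect normally, the median-of-medians fallback with optimize_for_size.
    let mut x = [8, 2, 7, 4, 6];
    let (_lower, median, _upper) = x.select_nth_unstable(2);
    assert_eq!(*median, 6);
}
```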