From 8ec348afdd35752677914ba28146d7b82e901817 Mon Sep 17 00:00:00 2001
From: Ivan Tham <pickfire@riseup.net>
Date: Mon, 20 Jul 2020 23:02:28 +0800
Subject: [PATCH] Remove branch in optimized is_ascii
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Performs slightly better in short or medium bytes by eliminating
the last branch check on `byte_pos == len` and always check the
last byte as it is always at most one `usize`.

Benchmark, before `libcore`, after `libcore_new`. It improves
medium and short by 1ns but regresses unaligned_tail by 2ns,
either way we can get unaligned_tail have a tiny chance of 1/8
on a 64 bit machine. I don't think we should bet on that, the
probability is worse than dice.

test long::case00_libcore                     ... bench:          38 ns/iter (+/- 1) = 183947 MB/s
test long::case00_libcore_new                 ... bench:          38 ns/iter (+/- 1) = 183947 MB/s
test long::case01_iter_all                    ... bench:         227 ns/iter (+/- 6) = 30792 MB/s
test long::case02_align_to                    ... bench:          40 ns/iter (+/- 1) = 174750 MB/s
test long::case03_align_to_unrolled           ... bench:          19 ns/iter (+/- 1) = 367894 MB/s
test medium::case00_libcore                   ... bench:           5 ns/iter (+/- 0) = 6400 MB/s
test medium::case00_libcore_new               ... bench:           4 ns/iter (+/- 0) = 8000 MB/s
test medium::case01_iter_all                  ... bench:          20 ns/iter (+/- 1) = 1600 MB/s
test medium::case02_align_to                  ... bench:           6 ns/iter (+/- 0) = 5333 MB/s
test medium::case03_align_to_unrolled         ... bench:           5 ns/iter (+/- 0) = 6400 MB/s
test short::case00_libcore                    ... bench:           7 ns/iter (+/- 0) = 1000 MB/s
test short::case00_libcore_new                ... bench:           6 ns/iter (+/- 0) = 1166 MB/s
test short::case01_iter_all                   ... bench:           5 ns/iter (+/- 0) = 1400 MB/s
test short::case02_align_to                   ... bench:           5 ns/iter (+/- 0) = 1400 MB/s
test short::case03_align_to_unrolled          ... bench:           5 ns/iter (+/- 1) = 1400 MB/s
test unaligned_both::case00_libcore           ... bench:           4 ns/iter (+/- 0) = 7500 MB/s
test unaligned_both::case00_libcore_new       ... bench:           4 ns/iter (+/- 0) = 7500 MB/s
test unaligned_both::case01_iter_all          ... bench:          26 ns/iter (+/- 0) = 1153 MB/s
test unaligned_both::case02_align_to          ... bench:          13 ns/iter (+/- 2) = 2307 MB/s
test unaligned_both::case03_align_to_unrolled ... bench:          11 ns/iter (+/- 0) = 2727 MB/s
test unaligned_head::case00_libcore           ... bench:           5 ns/iter (+/- 0) = 6200 MB/s
test unaligned_head::case00_libcore_new       ... bench:           5 ns/iter (+/- 0) = 6200 MB/s
test unaligned_head::case01_iter_all          ... bench:          19 ns/iter (+/- 1) = 1631 MB/s
test unaligned_head::case02_align_to          ... bench:          10 ns/iter (+/- 0) = 3100 MB/s
test unaligned_head::case03_align_to_unrolled ... bench:          14 ns/iter (+/- 0) = 2214 MB/s
test unaligned_tail::case00_libcore           ... bench:           3 ns/iter (+/- 0) = 10333 MB/s
test unaligned_tail::case00_libcore_new       ... bench:           5 ns/iter (+/- 0) = 6200 MB/s
test unaligned_tail::case01_iter_all          ... bench:          19 ns/iter (+/- 0) = 1631 MB/s
test unaligned_tail::case02_align_to          ... bench:          10 ns/iter (+/- 0) = 3100 MB/s
test unaligned_tail::case03_align_to_unrolled ... bench:          13 ns/iter (+/- 0) = 2384 MB/s

Rough (unfair) maths on improvements for fun: 1ns * 7/8 - 2ns * 1/8 = 0.625ns

Inspired by fish and zsh clever trick to highlight missing linefeeds (⏎)
and branchless implementation of binary_search in rust.
---
 library/core/src/slice/mod.rs | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/library/core/src/slice/mod.rs b/library/core/src/slice/mod.rs
index 93608a1ce4864..bedf871c024bc 100644
--- a/library/core/src/slice/mod.rs
+++ b/library/core/src/slice/mod.rs
@@ -2902,7 +2902,7 @@ fn contains_nonascii(v: usize) -> bool {
 ///
 /// - Read the first word with an unaligned load.
 /// - Align the pointer, read subsequent words until end with aligned loads.
-/// - If there's a tail, the last `usize` from `s` with an unaligned load.
+/// - Read the last `usize` from `s` with an unaligned load.
 ///
 /// If any of these loads produces something for which `contains_nonascii`
 /// (above) returns true, then we know the answer is false.
@@ -2949,7 +2949,10 @@ fn is_ascii(s: &[u8]) -> bool {
     // `align_offset` though.
     debug_assert_eq!((word_ptr as usize) % mem::align_of::<usize>(), 0);
 
-    while byte_pos <= len - USIZE_SIZE {
+    // Read subsequent words until the last aligned word, excluding the last
+    // aligned word by itself to be done in tail check later, to ensure that
+    // tail is always one `usize` at most to extra branch `byte_pos == len`.
+    while byte_pos < len - USIZE_SIZE {
         debug_assert!(
             // Sanity check that the read is in bounds
             (word_ptr as usize + USIZE_SIZE) <= (start.wrapping_add(len) as usize) &&
@@ -2970,15 +2973,9 @@ fn is_ascii(s: &[u8]) -> bool {
         word_ptr = unsafe { word_ptr.add(1) };
     }
 
-    // If we have anything left over, it should be at-most 1 usize worth of bytes,
-    // which we check with a read_unaligned.
-    if byte_pos == len {
-        return true;
-    }
-
     // Sanity check to ensure there really is only one `usize` left. This should
     // be guaranteed by our loop condition.
-    debug_assert!(byte_pos < len && len - byte_pos < USIZE_SIZE);
+    debug_assert!(byte_pos <= len && len - byte_pos <= USIZE_SIZE);
 
     // SAFETY: This relies on `len >= USIZE_SIZE`, which we check at the start.
     let last_word = unsafe { (start.add(len - USIZE_SIZE) as *const usize).read_unaligned() };