From 1825c7e1a604a6b0d4f4b95aca210e953e467516 Mon Sep 17 00:00:00 2001
From: Lukasz Anforowicz <anforowicz@users.noreply.github.com>
Date: Wed, 1 Nov 2023 18:04:22 -0700
Subject: [PATCH] Using `std::simd` to speed-up `unfilter` for `Paeth` for
 bpp=3 and bpp=6 (#414)

---
 .github/workflows/rust.yml |   2 +-
 benches/unfilter.rs        |   4 +-
 src/filter.rs              | 255 ++++++++++++++++++++++++++++++++-----
 src/lib.rs                 |   1 +
 4 files changed, 225 insertions(+), 37 deletions(-)
diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index e75cf13a..581b46df 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -55,7 +55,7 @@ jobs:
     - run: rustup default stable
     - name: test
       run: >
-        cargo test -v --all-targets --all-features
+        cargo test -v --all-targets
   rustfmt:
     runs-on: ubuntu-latest
     steps:
diff --git a/benches/unfilter.rs b/benches/unfilter.rs
index 2f6e1f2f..4ff5daa8 100644
--- a/benches/unfilter.rs
+++ b/benches/unfilter.rs
@@ -2,9 +2,9 @@
 //!
 //! ```
 //! $ alias bench="rustup run nightly cargo bench"
-//! $ bench --bench=unfilter --features=benchmarks -- --save-baseline my_baseline
+//! $ bench --bench=unfilter --features=benchmarks,unstable -- --save-baseline my_baseline
 //! ... tweak something, say the Sub filter ...
-//! $ bench --bench=unfilter --features=benchmarks -- filter=Sub --baseline my_baseline
+//! $ bench --bench=unfilter --features=benchmarks,unstable -- filter=Sub --baseline my_baseline
 //! ```
 
 use criterion::{criterion_group, criterion_main, Criterion, Throughput};
diff --git a/src/filter.rs b/src/filter.rs
index b561e4e9..22663add 100644
--- a/src/filter.rs
+++ b/src/filter.rs
@@ -2,6 +2,170 @@ use core::convert::TryInto;
 
 use crate::common::BytesPerPixel;
 
+/// SIMD helpers for `fn unfilter`
+///
+/// TODO(https://github.com/rust-lang/rust/issues/86656): Stop gating this module behind the
+/// "unstable" feature of the `png` crate.  This should be possible once the "portable_simd"
+/// feature of Rust gets stabilized.
+#[cfg(feature = "unstable")]
+mod simd {
+    use std::simd::{
+        u8x4, u8x8, LaneCount, Simd, SimdInt, SimdOrd, SimdPartialEq, SimdUint, SupportedLaneCount,
+    };
+
+    /// This is an equivalent of the `PaethPredictor` function from
+    /// [the spec](http://www.libpng.org/pub/png/spec/1.2/PNG-Filters.html#Filter-type-4-Paeth)
+    /// except that it simultaneously calculates the predictor for all SIMD lanes.
+    /// Mapping between parameter names and pixel positions can be found in
+    /// [a diagram here](https://www.w3.org/TR/png/#filter-byte-positions).
+    ///
+    /// Examples of how different pixel types may be represented as multiple SIMD lanes:
+    /// - RGBA => 4 lanes of `i16x4` contain R, G, B, A
+    /// - RGB  => 4 lanes of `i16x4` contain R, G, B, and an ignored 4th value
+    ///
+    /// The SIMD algorithm below is based on [`libpng`](https://github.com/glennrp/libpng/blob/f8e5fa92b0e37ab597616f554bee254157998227/intel/filter_sse2_intrinsics.c#L261-L280).
+    fn paeth_predictor<const N: usize>(
+        a: Simd<i16, N>,
+        b: Simd<i16, N>,
+        c: Simd<i16, N>,
+    ) -> Simd<i16, N>
+    where
+        LaneCount<N>: SupportedLaneCount,
+    {
+        let pa = b - c; // (p-a) == (a+b-c - a) == (b-c)
+        let pb = a - c; // (p-b) == (a+b-c - b) == (a-c)
+        let pc = pa + pb; // (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c)
+
+        let pa = pa.abs();
+        let pb = pb.abs();
+        let pc = pc.abs();
+
+        let smallest = pc.simd_min(pa.simd_min(pb));
+
+        // Paeth algorithm breaks ties favoring a over b over c, so we execute the following
+        // lane-wise selection:
+        //
+        //     if smallest == pa
+        //         then select a
+        //         else select (if smallest == pb then select b else select c)
+        smallest
+            .simd_eq(pa)
+            .select(a, smallest.simd_eq(pb).select(b, c))
+    }
+
+    /// Memory of previous pixels (as needed to unfilter `FilterType::Paeth`).
+    /// See also https://www.w3.org/TR/png/#filter-byte-positions
+    #[derive(Default)]
+    struct PaethState<const N: usize>
+    where
+        LaneCount<N>: SupportedLaneCount,
+    {
+        /// Previous pixel in the previous row.
+        c: Simd<i16, N>,
+
+        /// Previous pixel in the current row.
+        a: Simd<i16, N>,
+    }
+
+    /// Mutates `x` as needed to unfilter `FilterType::Paeth`.
+    ///
+    /// `b` is the current pixel in the previous row.  `x` is the current pixel in the current row.
+    /// See also https://www.w3.org/TR/png/#filter-byte-positions
+    fn paeth_step<const N: usize>(state: &mut PaethState<N>, b: Simd<u8, N>, x: &mut Simd<u8, N>)
+    where
+        LaneCount<N>: SupportedLaneCount,
+    {
+        // Storing the inputs.
+        let b = b.cast::<i16>();
+
+        // Calculating the new value of the current pixel.
+        let predictor = paeth_predictor(state.a, b, state.c);
+        *x += predictor.cast::<u8>();
+
+        // Preparing for the next step.
+        state.c = b;
+        state.a = x.cast::<i16>();
+    }
+
+    fn load3(src: &[u8]) -> u8x4 {
+        u8x4::from_array([src[0], src[1], src[2], 0])
+    }
+
+    fn store3(src: u8x4, dest: &mut [u8]) {
+        dest[0..3].copy_from_slice(&src.to_array()[0..3])
+    }
+
+    /// Undoes `FilterType::Paeth` for `BytesPerPixel::Three`.
+    pub fn unfilter_paeth3(mut prev_row: &[u8], mut curr_row: &mut [u8]) {
+        debug_assert_eq!(prev_row.len(), curr_row.len());
+        debug_assert_eq!(prev_row.len() % 3, 0);
+
+        let mut state = PaethState::<4>::default();
+        while prev_row.len() >= 4 {
+            // `u8x4` requires working with `[u8;4]`, but we can just load and ignore the first
+            // byte from the next triple.  This optimization technique mimics the algorithm found
+            // in
+            // https://github.com/glennrp/libpng/blob/f8e5fa92b0e37ab597616f554bee254157998227/intel/filter_sse2_intrinsics.c#L130-L131
+            let b = u8x4::from_slice(prev_row);
+            let mut x = u8x4::from_slice(curr_row);
+
+            paeth_step(&mut state, b, &mut x);
+
+            // We can speculate that writing 4 bytes might be more efficient (just as with using
+            // `u8x4::from_slice` above), but we can't use that here, because we can't clobber the
+            // first byte of the next pixel in the `curr_row`.
+            store3(x, curr_row);
+
+            prev_row = &prev_row[3..];
+            curr_row = &mut curr_row[3..];
+        }
+        // Can't use `u8x4::from_slice` for the last `[u8;3]`.
+        let b = load3(prev_row);
+        let mut x = load3(curr_row);
+        paeth_step(&mut state, b, &mut x);
+        store3(x, curr_row);
+    }
+
+    fn load6(src: &[u8]) -> u8x8 {
+        u8x8::from_array([src[0], src[1], src[2], src[3], src[4], src[5], 0, 0])
+    }
+
+    fn store6(src: u8x8, dest: &mut [u8]) {
+        dest[0..6].copy_from_slice(&src.to_array()[0..6])
+    }
+
+    /// Undoes `FilterType::Paeth` for `BytesPerPixel::Six`.
+    pub fn unfilter_paeth6(mut prev_row: &[u8], mut curr_row: &mut [u8]) {
+        debug_assert_eq!(prev_row.len(), curr_row.len());
+        debug_assert_eq!(prev_row.len() % 6, 0);
+
+        let mut state = PaethState::<8>::default();
+        while prev_row.len() >= 8 {
+            // `u8x8` requires working with `[u8;8]`, but we can just load and ignore the first two
+            // bytes from the next pixel.  This optimization technique mimics the algorithm found
+            // in
+            // https://github.com/glennrp/libpng/blob/f8e5fa92b0e37ab597616f554bee254157998227/intel/filter_sse2_intrinsics.c#L130-L131
+            let b = u8x8::from_slice(prev_row);
+            let mut x = u8x8::from_slice(curr_row);
+
+            paeth_step(&mut state, b, &mut x);
+
+            // We can speculate that writing 8 bytes might be more efficient (just as with using
+            // `u8x8::from_slice` above), but we can't use that here, because we can't clobber the
+            // first bytes of the next pixel in the `curr_row`.
+            store6(x, curr_row);
+
+            prev_row = &prev_row[6..];
+            curr_row = &mut curr_row[6..];
+        }
+        // Can't use `u8x8::from_slice` for the last `[u8;6]`.
+        let b = load6(prev_row);
+        let mut x = load6(curr_row);
+        paeth_step(&mut state, b, &mut x);
+        store6(x, curr_row);
+    }
+}
+
 /// The byte level filter applied to scanlines to prepare them for compression.
 ///
 /// Compression in general benefits from repetitive data. The filter is a content-aware method of
@@ -401,21 +565,31 @@ pub(crate) fn unfilter(
                     }
                 }
                 BytesPerPixel::Three => {
-                    let mut a_bpp = [0; 3];
-                    let mut c_bpp = [0; 3];
-                    for (chunk, b_bpp) in current.chunks_exact_mut(3).zip(previous.chunks_exact(3))
+                    #[cfg(feature = "unstable")]
+                    simd::unfilter_paeth3(previous, current);
+
+                    #[cfg(not(feature = "unstable"))]
                     {
-                        let new_chunk = [
-                            chunk[0]
-                                .wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])),
-                            chunk[1]
-                                .wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])),
-                            chunk[2]
-                                .wrapping_add(filter_paeth_decode(a_bpp[2], b_bpp[2], c_bpp[2])),
-                        ];
-                        *TryInto::<&mut [u8; 3]>::try_into(chunk).unwrap() = new_chunk;
-                        a_bpp = new_chunk;
-                        c_bpp = b_bpp.try_into().unwrap();
+                        let mut a_bpp = [0; 3];
+                        let mut c_bpp = [0; 3];
+                        for (chunk, b_bpp) in
+                            current.chunks_exact_mut(3).zip(previous.chunks_exact(3))
+                        {
+                            let new_chunk = [
+                                chunk[0].wrapping_add(filter_paeth_decode(
+                                    a_bpp[0], b_bpp[0], c_bpp[0],
+                                )),
+                                chunk[1].wrapping_add(filter_paeth_decode(
+                                    a_bpp[1], b_bpp[1], c_bpp[1],
+                                )),
+                                chunk[2].wrapping_add(filter_paeth_decode(
+                                    a_bpp[2], b_bpp[2], c_bpp[2],
+                                )),
+                            ];
+                            *TryInto::<&mut [u8; 3]>::try_into(chunk).unwrap() = new_chunk;
+                            a_bpp = new_chunk;
+                            c_bpp = b_bpp.try_into().unwrap();
+                        }
                     }
                 }
                 BytesPerPixel::Four => {
@@ -439,27 +613,40 @@ pub(crate) fn unfilter(
                     }
                 }
                 BytesPerPixel::Six => {
-                    let mut a_bpp = [0; 6];
-                    let mut c_bpp = [0; 6];
-                    for (chunk, b_bpp) in current.chunks_exact_mut(6).zip(previous.chunks_exact(6))
+                    #[cfg(feature = "unstable")]
+                    simd::unfilter_paeth6(previous, current);
+
+                    #[cfg(not(feature = "unstable"))]
                     {
-                        let new_chunk = [
-                            chunk[0]
-                                .wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])),
-                            chunk[1]
-                                .wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])),
-                            chunk[2]
-                                .wrapping_add(filter_paeth_decode(a_bpp[2], b_bpp[2], c_bpp[2])),
-                            chunk[3]
-                                .wrapping_add(filter_paeth_decode(a_bpp[3], b_bpp[3], c_bpp[3])),
-                            chunk[4]
-                                .wrapping_add(filter_paeth_decode(a_bpp[4], b_bpp[4], c_bpp[4])),
-                            chunk[5]
-                                .wrapping_add(filter_paeth_decode(a_bpp[5], b_bpp[5], c_bpp[5])),
-                        ];
-                        *TryInto::<&mut [u8; 6]>::try_into(chunk).unwrap() = new_chunk;
-                        a_bpp = new_chunk;
-                        c_bpp = b_bpp.try_into().unwrap();
+                        let mut a_bpp = [0; 6];
+                        let mut c_bpp = [0; 6];
+                        for (chunk, b_bpp) in
+                            current.chunks_exact_mut(6).zip(previous.chunks_exact(6))
+                        {
+                            let new_chunk = [
+                                chunk[0].wrapping_add(filter_paeth_decode(
+                                    a_bpp[0], b_bpp[0], c_bpp[0],
+                                )),
+                                chunk[1].wrapping_add(filter_paeth_decode(
+                                    a_bpp[1], b_bpp[1], c_bpp[1],
+                                )),
+                                chunk[2].wrapping_add(filter_paeth_decode(
+                                    a_bpp[2], b_bpp[2], c_bpp[2],
+                                )),
+                                chunk[3].wrapping_add(filter_paeth_decode(
+                                    a_bpp[3], b_bpp[3], c_bpp[3],
+                                )),
+                                chunk[4].wrapping_add(filter_paeth_decode(
+                                    a_bpp[4], b_bpp[4], c_bpp[4],
+                                )),
+                                chunk[5].wrapping_add(filter_paeth_decode(
+                                    a_bpp[5], b_bpp[5], c_bpp[5],
+                                )),
+                            ];
+                            *TryInto::<&mut [u8; 6]>::try_into(chunk).unwrap() = new_chunk;
+                            a_bpp = new_chunk;
+                            c_bpp = b_bpp.try_into().unwrap();
+                        }
                     }
                 }
                 BytesPerPixel::Eight => {
diff --git a/src/lib.rs b/src/lib.rs
index 1bcfdb99..e71d4c5d 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -58,6 +58,7 @@
 //! ```
 //!
 
+#![cfg_attr(feature = "unstable", feature(portable_simd))]
 #![forbid(unsafe_code)]
 
 #[macro_use]