From 1825c7e1a604a6b0d4f4b95aca210e953e467516 Mon Sep 17 00:00:00 2001 From: Lukasz Anforowicz Date: Wed, 1 Nov 2023 18:04:22 -0700 Subject: [PATCH] Using `std::simd` to speed-up `unfilter` for `Paeth` for bpp=3 and bpp=6 (#414) --- .github/workflows/rust.yml | 2 +- benches/unfilter.rs | 4 +- src/filter.rs | 255 ++++++++++++++++++++++++++++++++----- src/lib.rs | 1 + 4 files changed, 225 insertions(+), 37 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index e75cf13a..581b46df 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -55,7 +55,7 @@ jobs: - run: rustup default stable - name: test run: > - cargo test -v --all-targets --all-features + cargo test -v --all-targets rustfmt: runs-on: ubuntu-latest steps: diff --git a/benches/unfilter.rs b/benches/unfilter.rs index 2f6e1f2f..4ff5daa8 100644 --- a/benches/unfilter.rs +++ b/benches/unfilter.rs @@ -2,9 +2,9 @@ //! //! ``` //! $ alias bench="rustup run nightly cargo bench" -//! $ bench --bench=unfilter --features=benchmarks -- --save-baseline my_baseline +//! $ bench --bench=unfilter --features=benchmarks,unstable -- --save-baseline my_baseline //! ... tweak something, say the Sub filter ... -//! $ bench --bench=unfilter --features=benchmarks -- filter=Sub --baseline my_baseline +//! $ bench --bench=unfilter --features=benchmarks,unstable -- filter=Sub --baseline my_baseline //! ``` use criterion::{criterion_group, criterion_main, Criterion, Throughput}; diff --git a/src/filter.rs b/src/filter.rs index b561e4e9..22663add 100644 --- a/src/filter.rs +++ b/src/filter.rs @@ -2,6 +2,170 @@ use core::convert::TryInto; use crate::common::BytesPerPixel; +/// SIMD helpers for `fn unfilter` +/// +/// TODO(https://github.com/rust-lang/rust/issues/86656): Stop gating this module behind the +/// "unstable" feature of the `png` crate. This should be possible once the "portable_simd" +/// feature of Rust gets stabilized. 
+#[cfg(feature = "unstable")] +mod simd { + use std::simd::{ + u8x4, u8x8, LaneCount, Simd, SimdInt, SimdOrd, SimdPartialEq, SimdUint, SupportedLaneCount, + }; + + /// This is an equivalent of the `PaethPredictor` function from + /// [the spec](http://www.libpng.org/pub/png/spec/1.2/PNG-Filters.html#Filter-type-4-Paeth) + /// except that it simultaneously calculates the predictor for all SIMD lanes. + /// Mapping between parameter names and pixel positions can be found in + /// [a diagram here](https://www.w3.org/TR/png/#filter-byte-positions). + /// + /// Examples of how different pixel types may be represented as multiple SIMD lanes: + /// - RGBA => 4 lanes of `i16x4` contain R, G, B, A + /// - RGB => 4 lanes of `i16x4` contain R, G, B, and an ignored 4th value + /// + /// The SIMD algorithm below is based on [`libpng`](https://github.com/glennrp/libpng/blob/f8e5fa92b0e37ab597616f554bee254157998227/intel/filter_sse2_intrinsics.c#L261-L280). + fn paeth_predictor<const N: usize>( + a: Simd<i16, N>, + b: Simd<i16, N>, + c: Simd<i16, N>, + ) -> Simd<i16, N> + where + LaneCount<N>: SupportedLaneCount, + { + let pa = b - c; // (p-a) == (a+b-c - a) == (b-c) + let pb = a - c; // (p-b) == (a+b-c - b) == (a-c) + let pc = pa + pb; // (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) + + let pa = pa.abs(); + let pb = pb.abs(); + let pc = pc.abs(); + + let smallest = pc.simd_min(pa.simd_min(pb)); + + // Paeth algorithm breaks ties favoring a over b over c, so we execute the following + // lane-wise selection: + // + // if smallest == pa + // then select a + // else select (if smallest == pb then select b else select c) + smallest + .simd_eq(pa) + .select(a, smallest.simd_eq(pb).select(b, c)) + } + + /// Memory of previous pixels (as needed to unfilter `FilterType::Paeth`). + /// See also https://www.w3.org/TR/png/#filter-byte-positions + #[derive(Default)] + struct PaethState<const N: usize> + where + LaneCount<N>: SupportedLaneCount, + { + /// Previous pixel in the previous row. + c: Simd<i16, N>, + + /// Previous pixel in the current row.
+ a: Simd<i16, N>, + } + + /// Mutates `x` as needed to unfilter `FilterType::Paeth`. + /// + /// `b` is the current pixel in the previous row. `x` is the current pixel in the current row. + /// See also https://www.w3.org/TR/png/#filter-byte-positions + fn paeth_step<const N: usize>(state: &mut PaethState<N>, b: Simd<u8, N>, x: &mut Simd<u8, N>) + where + LaneCount<N>: SupportedLaneCount, + { + // Storing the inputs. + let b = b.cast::<i16>(); + + // Calculating the new value of the current pixel. + let predictor = paeth_predictor(state.a, b, state.c); + *x += predictor.cast::<u8>(); + + // Preparing for the next step. + state.c = b; + state.a = x.cast::<i16>(); + } + + fn load3(src: &[u8]) -> u8x4 { + u8x4::from_array([src[0], src[1], src[2], 0]) + } + + fn store3(src: u8x4, dest: &mut [u8]) { + dest[0..3].copy_from_slice(&src.to_array()[0..3]) + } + + /// Undoes `FilterType::Paeth` for `BytesPerPixel::Three`. + pub fn unfilter_paeth3(mut prev_row: &[u8], mut curr_row: &mut [u8]) { + debug_assert_eq!(prev_row.len(), curr_row.len()); + debug_assert_eq!(prev_row.len() % 3, 0); + + let mut state = PaethState::<4>::default(); + while prev_row.len() >= 4 { + // `u8x4` requires working with `[u8;4]`, but we can just load and ignore the first + // byte from the next triple. This optimization technique mimics the algorithm found + // in + // https://github.com/glennrp/libpng/blob/f8e5fa92b0e37ab597616f554bee254157998227/intel/filter_sse2_intrinsics.c#L130-L131 + let b = u8x4::from_slice(prev_row); + let mut x = u8x4::from_slice(curr_row); + + paeth_step(&mut state, b, &mut x); + + // We can speculate that writing 4 bytes might be more efficient (just as with using + // `u8x4::from_slice` above), but we can't use that here, because we can't clobber the + // first byte of the next pixel in the `curr_row`. + store3(x, curr_row); + + prev_row = &prev_row[3..]; + curr_row = &mut curr_row[3..]; + } + // Can't use `u8x4::from_slice` for the last `[u8;3]`.
+ let b = load3(prev_row); + let mut x = load3(curr_row); + paeth_step(&mut state, b, &mut x); + store3(x, curr_row); + } + + fn load6(src: &[u8]) -> u8x8 { + u8x8::from_array([src[0], src[1], src[2], src[3], src[4], src[5], 0, 0]) + } + + fn store6(src: u8x8, dest: &mut [u8]) { + dest[0..6].copy_from_slice(&src.to_array()[0..6]) + } + + /// Undoes `FilterType::Paeth` for `BytesPerPixel::Six`. + pub fn unfilter_paeth6(mut prev_row: &[u8], mut curr_row: &mut [u8]) { + debug_assert_eq!(prev_row.len(), curr_row.len()); + debug_assert_eq!(prev_row.len() % 6, 0); + + let mut state = PaethState::<8>::default(); + while prev_row.len() >= 8 { + // `u8x8` requires working with `[u8;8]`, but we can just load and ignore the first two + // bytes from the next pixel. This optimization technique mimics the algorithm found + // in + // https://github.com/glennrp/libpng/blob/f8e5fa92b0e37ab597616f554bee254157998227/intel/filter_sse2_intrinsics.c#L130-L131 + let b = u8x8::from_slice(prev_row); + let mut x = u8x8::from_slice(curr_row); + + paeth_step(&mut state, b, &mut x); + + // We can speculate that writing 8 bytes might be more efficient (just as with using + // `u8x8::from_slice` above), but we can't use that here, because we can't clobber the + // first bytes of the next pixel in the `curr_row`. + store6(x, curr_row); + + prev_row = &prev_row[6..]; + curr_row = &mut curr_row[6..]; + } + // Can't use `u8x8::from_slice` for the last `[u8;6]`. + let b = load6(prev_row); + let mut x = load6(curr_row); + paeth_step(&mut state, b, &mut x); + store6(x, curr_row); + } +} + /// The byte level filter applied to scanlines to prepare them for compression. /// /// Compression in general benefits from repetitive data. 
The filter is a content-aware method of @@ -401,21 +565,31 @@ pub(crate) fn unfilter( } } BytesPerPixel::Three => { - let mut a_bpp = [0; 3]; - let mut c_bpp = [0; 3]; - for (chunk, b_bpp) in current.chunks_exact_mut(3).zip(previous.chunks_exact(3)) + #[cfg(feature = "unstable")] + simd::unfilter_paeth3(previous, current); + + #[cfg(not(feature = "unstable"))] { - let new_chunk = [ - chunk[0] - .wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])), - chunk[1] - .wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])), - chunk[2] - .wrapping_add(filter_paeth_decode(a_bpp[2], b_bpp[2], c_bpp[2])), - ]; - *TryInto::<&mut [u8; 3]>::try_into(chunk).unwrap() = new_chunk; - a_bpp = new_chunk; - c_bpp = b_bpp.try_into().unwrap(); + let mut a_bpp = [0; 3]; + let mut c_bpp = [0; 3]; + for (chunk, b_bpp) in + current.chunks_exact_mut(3).zip(previous.chunks_exact(3)) + { + let new_chunk = [ + chunk[0].wrapping_add(filter_paeth_decode( + a_bpp[0], b_bpp[0], c_bpp[0], + )), + chunk[1].wrapping_add(filter_paeth_decode( + a_bpp[1], b_bpp[1], c_bpp[1], + )), + chunk[2].wrapping_add(filter_paeth_decode( + a_bpp[2], b_bpp[2], c_bpp[2], + )), + ]; + *TryInto::<&mut [u8; 3]>::try_into(chunk).unwrap() = new_chunk; + a_bpp = new_chunk; + c_bpp = b_bpp.try_into().unwrap(); + } } } BytesPerPixel::Four => { @@ -439,27 +613,40 @@ pub(crate) fn unfilter( } } BytesPerPixel::Six => { - let mut a_bpp = [0; 6]; - let mut c_bpp = [0; 6]; - for (chunk, b_bpp) in current.chunks_exact_mut(6).zip(previous.chunks_exact(6)) + #[cfg(feature = "unstable")] + simd::unfilter_paeth6(previous, current); + + #[cfg(not(feature = "unstable"))] { - let new_chunk = [ - chunk[0] - .wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])), - chunk[1] - .wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])), - chunk[2] - .wrapping_add(filter_paeth_decode(a_bpp[2], b_bpp[2], c_bpp[2])), - chunk[3] - .wrapping_add(filter_paeth_decode(a_bpp[3], b_bpp[3], c_bpp[3])), - chunk[4] - 
.wrapping_add(filter_paeth_decode(a_bpp[4], b_bpp[4], c_bpp[4])), - chunk[5] - .wrapping_add(filter_paeth_decode(a_bpp[5], b_bpp[5], c_bpp[5])), - ]; - *TryInto::<&mut [u8; 6]>::try_into(chunk).unwrap() = new_chunk; - a_bpp = new_chunk; - c_bpp = b_bpp.try_into().unwrap(); + let mut a_bpp = [0; 6]; + let mut c_bpp = [0; 6]; + for (chunk, b_bpp) in + current.chunks_exact_mut(6).zip(previous.chunks_exact(6)) + { + let new_chunk = [ + chunk[0].wrapping_add(filter_paeth_decode( + a_bpp[0], b_bpp[0], c_bpp[0], + )), + chunk[1].wrapping_add(filter_paeth_decode( + a_bpp[1], b_bpp[1], c_bpp[1], + )), + chunk[2].wrapping_add(filter_paeth_decode( + a_bpp[2], b_bpp[2], c_bpp[2], + )), + chunk[3].wrapping_add(filter_paeth_decode( + a_bpp[3], b_bpp[3], c_bpp[3], + )), + chunk[4].wrapping_add(filter_paeth_decode( + a_bpp[4], b_bpp[4], c_bpp[4], + )), + chunk[5].wrapping_add(filter_paeth_decode( + a_bpp[5], b_bpp[5], c_bpp[5], + )), + ]; + *TryInto::<&mut [u8; 6]>::try_into(chunk).unwrap() = new_chunk; + a_bpp = new_chunk; + c_bpp = b_bpp.try_into().unwrap(); + } } } BytesPerPixel::Eight => { diff --git a/src/lib.rs b/src/lib.rs index 1bcfdb99..e71d4c5d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -58,6 +58,7 @@ //! ``` //! +#![cfg_attr(feature = "unstable", feature(portable_simd))] #![forbid(unsafe_code)] #[macro_use]