Skip to content

Commit

Permalink
Using std::simd to speed-up unfilter for Paeth for bpp=3 and bp…
Browse files Browse the repository at this point in the history
…p=6 (#414)
  • Loading branch information
anforowicz authored Nov 2, 2023
1 parent baea055 commit 1825c7e
Show file tree
Hide file tree
Showing 4 changed files with 225 additions and 37 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ jobs:
- run: rustup default stable
- name: test
run: >
cargo test -v --all-targets --all-features
cargo test -v --all-targets
rustfmt:
runs-on: ubuntu-latest
steps:
Expand Down
4 changes: 2 additions & 2 deletions benches/unfilter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
//!
//! ```
//! $ alias bench="rustup run nightly cargo bench"
//! $ bench --bench=unfilter --features=benchmarks -- --save-baseline my_baseline
//! $ bench --bench=unfilter --features=benchmarks,unstable -- --save-baseline my_baseline
//! ... tweak something, say the Sub filter ...
//! $ bench --bench=unfilter --features=benchmarks -- filter=Sub --baseline my_baseline
//! $ bench --bench=unfilter --features=benchmarks,unstable -- filter=Sub --baseline my_baseline
//! ```
use criterion::{criterion_group, criterion_main, Criterion, Throughput};
Expand Down
255 changes: 221 additions & 34 deletions src/filter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,170 @@ use core::convert::TryInto;

use crate::common::BytesPerPixel;

/// SIMD helpers for `fn unfilter`
///
/// TODO(https://github.com/rust-lang/rust/issues/86656): Stop gating this module behind the
/// "unstable" feature of the `png` crate.  This should be possible once the "portable_simd"
/// feature of Rust gets stabilized.
#[cfg(feature = "unstable")]
mod simd {
    use std::simd::{
        u8x4, u8x8, LaneCount, Simd, SimdInt, SimdOrd, SimdPartialEq, SimdUint, SupportedLaneCount,
    };

    /// This is an equivalent of the `PaethPredictor` function from
    /// [the spec](http://www.libpng.org/pub/png/spec/1.2/PNG-Filters.html#Filter-type-4-Paeth)
    /// except that it simultaneously calculates the predictor for all SIMD lanes.
    /// Mapping between parameter names and pixel positions can be found in
    /// [a diagram here](https://www.w3.org/TR/png/#filter-byte-positions).
    ///
    /// Examples of how different pixel types may be represented as multiple SIMD lanes:
    /// - RGBA => 4 lanes of `i16x4` contain R, G, B, A
    /// - RGB  => 4 lanes of `i16x4` contain R, G, B, and an ignored 4th value
    ///
    /// The inputs are expected to be byte values (`0..=255`) widened to `i16`, so that none of
    /// the intermediate differences below can overflow.
    ///
    /// The SIMD algorithm below is based on [`libpng`](https://github.com/glennrp/libpng/blob/f8e5fa92b0e37ab597616f554bee254157998227/intel/filter_sse2_intrinsics.c#L261-L280).
    fn paeth_predictor<const N: usize>(
        a: Simd<i16, N>,
        b: Simd<i16, N>,
        c: Simd<i16, N>,
    ) -> Simd<i16, N>
    where
        LaneCount<N>: SupportedLaneCount,
    {
        // With `p = a + b - c` the three distances simplify so that `p` itself is never needed.
        let pa = b - c; // (p-a) == (a+b-c - a) == (b-c)
        let pb = a - c; // (p-b) == (a+b-c - b) == (a-c)
        let pc = pa + pb; // (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c)

        let pa = pa.abs();
        let pb = pb.abs();
        let pc = pc.abs();

        let smallest = pc.simd_min(pa.simd_min(pb));

        // Paeth algorithm breaks ties favoring a over b over c, so we execute the following
        // lane-wise selection:
        //
        //     if smallest == pa
        //         then select a
        //         else select (if smallest == pb then select b else select c)
        smallest
            .simd_eq(pa)
            .select(a, smallest.simd_eq(pb).select(b, c))
    }

    /// Memory of previous pixels (as needed to unfilter `FilterType::Paeth`).
    /// See also <https://www.w3.org/TR/png/#filter-byte-positions>
    ///
    /// The default (all-zero) state corresponds to the start of a row, where the pixels to the
    /// left of the first pixel are treated as zero.
    #[derive(Default)]
    struct PaethState<const N: usize>
    where
        LaneCount<N>: SupportedLaneCount,
    {
        /// Previous pixel in the previous row.
        c: Simd<i16, N>,

        /// Previous pixel in the current row.
        a: Simd<i16, N>,
    }

    /// Mutates `x` as needed to unfilter `FilterType::Paeth`.
    ///
    /// `b` is the current pixel in the previous row.  `x` is the current pixel in the current row.
    /// `state` carries the `a` and `c` pixels from the previous step and is updated for the next
    /// one.  See also <https://www.w3.org/TR/png/#filter-byte-positions>
    fn paeth_step<const N: usize>(state: &mut PaethState<N>, b: Simd<u8, N>, x: &mut Simd<u8, N>)
    where
        LaneCount<N>: SupportedLaneCount,
    {
        // Storing the inputs (widened, so that the predictor arithmetic cannot overflow).
        let b = b.cast::<i16>();

        // Calculating the new value of the current pixel.  The predictor is always one of
        // `a`, `b`, `c`, so narrowing it back to `u8` is lossless.
        let predictor = paeth_predictor(state.a, b, state.c);
        *x += predictor.cast::<u8>();

        // Preparing for the next step.
        state.c = b;
        state.a = x.cast::<i16>();
    }

    /// Loads the first 3 bytes of `src` into a `u8x4`, zero-filling the unused 4th lane.
    ///
    /// Panics if `src` has fewer than 3 bytes.
    fn load3(src: &[u8]) -> u8x4 {
        u8x4::from_array([src[0], src[1], src[2], 0])
    }

    /// Stores the first 3 lanes of `src` into `dest[0..3]`, leaving the rest of `dest` untouched.
    ///
    /// Panics if `dest` has fewer than 3 bytes.
    fn store3(src: u8x4, dest: &mut [u8]) {
        dest[0..3].copy_from_slice(&src.to_array()[0..3])
    }

    /// Undoes `FilterType::Paeth` for `BytesPerPixel::Three`.
    ///
    /// `prev_row` is the already-unfiltered previous row and `curr_row` is unfiltered in place.
    /// Both rows are expected to have the same length, which should be a multiple of 3 (this is
    /// checked in debug builds).  Like the scalar `chunks_exact`-based fallback, this only
    /// processes complete pixels present in *both* rows: empty rows are a no-op and trailing
    /// partial pixels are left untouched instead of causing an out-of-bounds panic.
    pub fn unfilter_paeth3(mut prev_row: &[u8], mut curr_row: &mut [u8]) {
        debug_assert_eq!(prev_row.len(), curr_row.len());
        debug_assert_eq!(prev_row.len() % 3, 0);

        let mut state = PaethState::<4>::default();
        while prev_row.len() >= 4 && curr_row.len() >= 4 {
            // `u8x4` requires working with `[u8;4]`, but we can just load and ignore the first
            // byte from the next triple.  This optimization technique mimics the algorithm found
            // in
            // https://github.com/glennrp/libpng/blob/f8e5fa92b0e37ab597616f554bee254157998227/intel/filter_sse2_intrinsics.c#L130-L131
            let b = u8x4::from_slice(prev_row);
            let mut x = u8x4::from_slice(curr_row);

            paeth_step(&mut state, b, &mut x);

            // We can speculate that writing 4 bytes might be more efficient (just as with using
            // `u8x4::from_slice` above), but we can't use that here, because we can't clobber the
            // first byte of the next pixel in the `curr_row`.
            store3(x, curr_row);

            prev_row = &prev_row[3..];
            curr_row = &mut curr_row[3..];
        }

        // Can't use `u8x4::from_slice` for the last `[u8;3]`.  The guard keeps empty rows (and
        // rows that end in a partial pixel) from indexing out of bounds in `load3`.
        if prev_row.len() >= 3 && curr_row.len() >= 3 {
            let b = load3(prev_row);
            let mut x = load3(curr_row);
            paeth_step(&mut state, b, &mut x);
            store3(x, curr_row);
        }
    }

    /// Loads the first 6 bytes of `src` into a `u8x8`, zero-filling the 2 unused lanes.
    ///
    /// Panics if `src` has fewer than 6 bytes.
    fn load6(src: &[u8]) -> u8x8 {
        u8x8::from_array([src[0], src[1], src[2], src[3], src[4], src[5], 0, 0])
    }

    /// Stores the first 6 lanes of `src` into `dest[0..6]`, leaving the rest of `dest` untouched.
    ///
    /// Panics if `dest` has fewer than 6 bytes.
    fn store6(src: u8x8, dest: &mut [u8]) {
        dest[0..6].copy_from_slice(&src.to_array()[0..6])
    }

    /// Undoes `FilterType::Paeth` for `BytesPerPixel::Six`.
    ///
    /// `prev_row` is the already-unfiltered previous row and `curr_row` is unfiltered in place.
    /// Both rows are expected to have the same length, which should be a multiple of 6 (this is
    /// checked in debug builds).  Like the scalar `chunks_exact`-based fallback, this only
    /// processes complete pixels present in *both* rows: empty rows are a no-op and trailing
    /// partial pixels are left untouched instead of causing an out-of-bounds panic.
    pub fn unfilter_paeth6(mut prev_row: &[u8], mut curr_row: &mut [u8]) {
        debug_assert_eq!(prev_row.len(), curr_row.len());
        debug_assert_eq!(prev_row.len() % 6, 0);

        let mut state = PaethState::<8>::default();
        while prev_row.len() >= 8 && curr_row.len() >= 8 {
            // `u8x8` requires working with `[u8;8]`, but we can just load and ignore the first two
            // bytes from the next pixel.  This optimization technique mimics the algorithm found
            // in
            // https://github.com/glennrp/libpng/blob/f8e5fa92b0e37ab597616f554bee254157998227/intel/filter_sse2_intrinsics.c#L130-L131
            let b = u8x8::from_slice(prev_row);
            let mut x = u8x8::from_slice(curr_row);

            paeth_step(&mut state, b, &mut x);

            // We can speculate that writing 8 bytes might be more efficient (just as with using
            // `u8x8::from_slice` above), but we can't use that here, because we can't clobber the
            // first bytes of the next pixel in the `curr_row`.
            store6(x, curr_row);

            prev_row = &prev_row[6..];
            curr_row = &mut curr_row[6..];
        }

        // Can't use `u8x8::from_slice` for the last `[u8;6]`.  The guard keeps empty rows (and
        // rows that end in a partial pixel) from indexing out of bounds in `load6`.
        if prev_row.len() >= 6 && curr_row.len() >= 6 {
            let b = load6(prev_row);
            let mut x = load6(curr_row);
            paeth_step(&mut state, b, &mut x);
            store6(x, curr_row);
        }
    }
}

/// The byte level filter applied to scanlines to prepare them for compression.
///
/// Compression in general benefits from repetitive data. The filter is a content-aware method of
Expand Down Expand Up @@ -401,21 +565,31 @@ pub(crate) fn unfilter(
}
}
BytesPerPixel::Three => {
let mut a_bpp = [0; 3];
let mut c_bpp = [0; 3];
for (chunk, b_bpp) in current.chunks_exact_mut(3).zip(previous.chunks_exact(3))
#[cfg(feature = "unstable")]
simd::unfilter_paeth3(previous, current);

#[cfg(not(feature = "unstable"))]
{
let new_chunk = [
chunk[0]
.wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])),
chunk[1]
.wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])),
chunk[2]
.wrapping_add(filter_paeth_decode(a_bpp[2], b_bpp[2], c_bpp[2])),
];
*TryInto::<&mut [u8; 3]>::try_into(chunk).unwrap() = new_chunk;
a_bpp = new_chunk;
c_bpp = b_bpp.try_into().unwrap();
let mut a_bpp = [0; 3];
let mut c_bpp = [0; 3];
for (chunk, b_bpp) in
current.chunks_exact_mut(3).zip(previous.chunks_exact(3))
{
let new_chunk = [
chunk[0].wrapping_add(filter_paeth_decode(
a_bpp[0], b_bpp[0], c_bpp[0],
)),
chunk[1].wrapping_add(filter_paeth_decode(
a_bpp[1], b_bpp[1], c_bpp[1],
)),
chunk[2].wrapping_add(filter_paeth_decode(
a_bpp[2], b_bpp[2], c_bpp[2],
)),
];
*TryInto::<&mut [u8; 3]>::try_into(chunk).unwrap() = new_chunk;
a_bpp = new_chunk;
c_bpp = b_bpp.try_into().unwrap();
}
}
}
BytesPerPixel::Four => {
Expand All @@ -439,27 +613,40 @@ pub(crate) fn unfilter(
}
}
BytesPerPixel::Six => {
let mut a_bpp = [0; 6];
let mut c_bpp = [0; 6];
for (chunk, b_bpp) in current.chunks_exact_mut(6).zip(previous.chunks_exact(6))
#[cfg(feature = "unstable")]
simd::unfilter_paeth6(previous, current);

#[cfg(not(feature = "unstable"))]
{
let new_chunk = [
chunk[0]
.wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])),
chunk[1]
.wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])),
chunk[2]
.wrapping_add(filter_paeth_decode(a_bpp[2], b_bpp[2], c_bpp[2])),
chunk[3]
.wrapping_add(filter_paeth_decode(a_bpp[3], b_bpp[3], c_bpp[3])),
chunk[4]
.wrapping_add(filter_paeth_decode(a_bpp[4], b_bpp[4], c_bpp[4])),
chunk[5]
.wrapping_add(filter_paeth_decode(a_bpp[5], b_bpp[5], c_bpp[5])),
];
*TryInto::<&mut [u8; 6]>::try_into(chunk).unwrap() = new_chunk;
a_bpp = new_chunk;
c_bpp = b_bpp.try_into().unwrap();
let mut a_bpp = [0; 6];
let mut c_bpp = [0; 6];
for (chunk, b_bpp) in
current.chunks_exact_mut(6).zip(previous.chunks_exact(6))
{
let new_chunk = [
chunk[0].wrapping_add(filter_paeth_decode(
a_bpp[0], b_bpp[0], c_bpp[0],
)),
chunk[1].wrapping_add(filter_paeth_decode(
a_bpp[1], b_bpp[1], c_bpp[1],
)),
chunk[2].wrapping_add(filter_paeth_decode(
a_bpp[2], b_bpp[2], c_bpp[2],
)),
chunk[3].wrapping_add(filter_paeth_decode(
a_bpp[3], b_bpp[3], c_bpp[3],
)),
chunk[4].wrapping_add(filter_paeth_decode(
a_bpp[4], b_bpp[4], c_bpp[4],
)),
chunk[5].wrapping_add(filter_paeth_decode(
a_bpp[5], b_bpp[5], c_bpp[5],
)),
];
*TryInto::<&mut [u8; 6]>::try_into(chunk).unwrap() = new_chunk;
a_bpp = new_chunk;
c_bpp = b_bpp.try_into().unwrap();
}
}
}
BytesPerPixel::Eight => {
Expand Down
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
//! ```
//!
#![cfg_attr(feature = "unstable", feature(portable_simd))]
#![forbid(unsafe_code)]

#[macro_use]
Expand Down

0 comments on commit 1825c7e

Please sign in to comment.