Skip to content

Commit

Permalink
Using std::simd to speed-up unfilter for Paeth / Three bpp.
Browse files Browse the repository at this point in the history
  • Loading branch information
anforowicz committed Sep 21, 2023
1 parent bf2c26b commit 2601e49
Show file tree
Hide file tree
Showing 4 changed files with 108 additions and 22 deletions.
4 changes: 2 additions & 2 deletions benches/unfilter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
//!
//! ```
//! $ alias bench="rustup run nightly cargo bench"
//! $ bench --bench=unfilter --features=benchmarks -- --save-baseline my_baseline
//! $ bench --bench=unfilter --features=benchmarks,unstable -- --save-baseline my_baseline
//! ... tweak something, say the Sub filter ...
//! $ bench --bench=unfilter --features=benchmarks -- filter=Sub --baseline my_baseline
//! $ bench --bench=unfilter --features=benchmarks,unstable -- filter=Sub --baseline my_baseline
//! ```
use criterion::{criterion_group, criterion_main, Criterion, Throughput};
Expand Down
7 changes: 1 addition & 6 deletions src/benchable_apis.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,7 @@ use crate::filter::FilterType;

/// Re-exporting `unfilter` to make it easier to benchmark, despite some items being only
/// `pub(crate)`: `fn unfilter`, `enum BytesPerPixel`.
pub fn unfilter(
filter: FilterType,
tbpp: u8,
previous: &[u8],
current: &mut [u8],
) {
pub fn unfilter(filter: FilterType, tbpp: u8, previous: &[u8], current: &mut [u8]) {
    // Callers hand in a plain `u8`; turn it into the crate-internal `BytesPerPixel`
    // value that the real implementation expects (widening u8 -> usize is lossless).
    let bytes_per_pixel = BytesPerPixel::for_prediction(usize::from(tbpp));

    // Forward everything to the `pub(crate)` implementation in `crate::filter`.
    crate::filter::unfilter(filter, bytes_per_pixel, previous, current)
}
118 changes: 104 additions & 14 deletions src/filter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,86 @@ use core::convert::TryInto;

use crate::common::BytesPerPixel;

// TODO(https://github.com/rust-lang/rust/issues/86656): Stop gating this module behind the
// "unstable" feature of the `png` crate. This should be possible once the "portable_simd" feature
// of Rust gets stabilized.
#[cfg(feature = "unstable")]
mod simd {
    use std::simd::{i16x4, u8x4, SimdInt, SimdOrd, SimdPartialEq, SimdUint};

    /// Lane-wise counterpart of the `PaethPredictor` function from the PNG spec:
    /// the predictor is computed simultaneously and independently for every SIMD lane.
    ///
    /// Examples of how different pixel types may be represented as multiple SIMD lanes:
    /// - RGBA => the 4 lanes of `i16x4` contain R, G, B, A
    /// - RGB  => the 4 lanes of `i16x4` contain R, G, B, and an ignored 4th value
    ///
    /// `a` is the pixel to the left, `b` the pixel above and `c` the pixel above-left.
    ///
    /// The SIMD algorithm below is based on
    /// https://github.com/glennrp/libpng/blob/f8e5fa92b0e37ab597616f554bee254157998227/intel/filter_sse2_intrinsics.c#L261-L280
    fn paeth_predictor(a: i16x4, b: i16x4, c: i16x4) -> i16x4 {
        // With the initial estimate p = a + b - c, the signed differences simplify to:
        let delta_a = b - c; // p - a == (a + b - c) - a
        let delta_b = a - c; // p - b == (a + b - c) - b
        let delta_c = delta_a + delta_b; // p - c == (b - c) + (a - c)

        // Only the magnitudes matter when looking for the closest neighbour.
        let dist_a = delta_a.abs();
        let dist_b = delta_b.abs();
        let dist_c = delta_c.abs();

        let nearest = dist_c.simd_min(dist_a.simd_min(dist_b));

        // The Paeth algorithm breaks ties favoring a over b over c, hence this
        // lane-wise selection:
        //
        //     if nearest == dist_a      then a
        //     else if nearest == dist_b then b
        //     else                           c
        let a_is_nearest = nearest.simd_eq(dist_a);
        let b_is_nearest = nearest.simd_eq(dist_b);
        let b_or_c = b_is_nearest.select(b, c);
        a_is_nearest.select(a, b_or_c)
    }

    /// Reads the first three bytes of `row` into lanes 0..=2 of a `u8x4`;
    /// the unused 4th lane is zero.
    ///
    /// Panics if `row` holds fewer than three bytes.
    fn load3(row: &[u8]) -> u8x4 {
        u8x4::from_array([row[0], row[1], row[2], 0])
    }

    /// Writes lanes 0..=2 of `simd` into the first three bytes of `dest`;
    /// the 4th lane is discarded.
    ///
    /// Panics if `dest` holds fewer than three bytes.
    fn store3(simd: u8x4, dest: &mut [u8]) {
        for lane in 0..3 {
            dest[lane] = simd[lane];
        }
    }

    /// Reverses the Paeth filter in place for a row of 3-bytes-per-pixel data.
    ///
    /// `prev_row` is the already-unfiltered previous row; `curr_row` holds the
    /// filtered bytes on entry and the reconstructed bytes on return. Both rows are
    /// walked in whole 3-byte pixels; iteration stops at the end of the shorter row,
    /// and any trailing bytes that do not form a full pixel are left untouched.
    pub fn unfilter_paeth3(prev_row: &[u8], curr_row: &mut [u8]) {
        // Paeth predicts pixel d from the pixel to its left (a) and two pixels from
        // the previous row (b directly above, c above-left):
        //
        //     prev_row:  c b
        //     curr_row:  a d
        //
        // The very first pixel has no left context and therefore behaves like the Up
        // filter (p = b). Starting with a = c = 0 makes p = a + b - c produce exactly
        // that, so the loop needs no special case.
        let mut left = i16x4::default();
        let mut up_left = i16x4::default();

        let pixels = prev_row.chunks_exact(3).zip(curr_row.chunks_exact_mut(3));
        for (above_px, px) in pixels {
            let up = load3(above_px).cast::<i16>();

            // Reconstruct: filtered byte + predictor (lane-wise, wrapping in u8).
            let predicted = paeth_predictor(left, up, up_left).cast::<u8>();
            let decoded = load3(px) + predicted;
            store3(decoded, px);

            // Slide the context window one pixel to the right.
            up_left = up;
            left = decoded.cast::<i16>();
        }
    }
}

/// The byte level filter applied to scanlines to prepare them for compression.
///
/// Compression in general benefits from repetitive data. The filter is a content-aware method of
Expand Down Expand Up @@ -401,21 +481,31 @@ pub(crate) fn unfilter(
}
}
BytesPerPixel::Three => {
let mut a_bpp = [0; 3];
let mut c_bpp = [0; 3];
for (chunk, b_bpp) in current.chunks_exact_mut(3).zip(previous.chunks_exact(3))
#[cfg(feature = "unstable")]
simd::unfilter_paeth3(previous, current);

#[cfg(not(feature = "unstable"))]
{
let new_chunk = [
chunk[0]
.wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])),
chunk[1]
.wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])),
chunk[2]
.wrapping_add(filter_paeth_decode(a_bpp[2], b_bpp[2], c_bpp[2])),
];
*TryInto::<&mut [u8; 3]>::try_into(chunk).unwrap() = new_chunk;
a_bpp = new_chunk;
c_bpp = b_bpp.try_into().unwrap();
let mut a_bpp = [0; 3];
let mut c_bpp = [0; 3];
for (chunk, b_bpp) in
current.chunks_exact_mut(3).zip(previous.chunks_exact(3))
{
let new_chunk = [
chunk[0].wrapping_add(filter_paeth_decode(
a_bpp[0], b_bpp[0], c_bpp[0],
)),
chunk[1].wrapping_add(filter_paeth_decode(
a_bpp[1], b_bpp[1], c_bpp[1],
)),
chunk[2].wrapping_add(filter_paeth_decode(
a_bpp[2], b_bpp[2], c_bpp[2],
)),
];
*TryInto::<&mut [u8; 3]>::try_into(chunk).unwrap() = new_chunk;
a_bpp = new_chunk;
c_bpp = b_bpp.try_into().unwrap();
}
}
}
BytesPerPixel::Four => {
Expand Down
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
//! ```
//!
#![cfg_attr(feature = "unstable", feature(portable_simd))]
#![forbid(unsafe_code)]

#[macro_use]
Expand Down

0 comments on commit 2601e49

Please sign in to comment.