Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Using std::simd to speed-up unfilter for Paeth for bpp=3 and bpp=6 #414

Merged
merged 5 commits into from
Nov 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ jobs:
- run: rustup default stable
- name: test
run: >
cargo test -v --all-targets --all-features
cargo test -v --all-targets
rustfmt:
runs-on: ubuntu-latest
steps:
Expand Down
4 changes: 2 additions & 2 deletions benches/unfilter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
//!
//! ```
//! $ alias bench="rustup run nightly cargo bench"
//! $ bench --bench=unfilter --features=benchmarks -- --save-baseline my_baseline
//! $ bench --bench=unfilter --features=benchmarks,unstable -- --save-baseline my_baseline
//! ... tweak something, say the Sub filter ...
//! $ bench --bench=unfilter --features=benchmarks -- filter=Sub --baseline my_baseline
//! $ bench --bench=unfilter --features=benchmarks,unstable -- filter=Sub --baseline my_baseline
//! ```

use criterion::{criterion_group, criterion_main, Criterion, Throughput};
Expand Down
255 changes: 221 additions & 34 deletions src/filter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,170 @@ use core::convert::TryInto;

use crate::common::BytesPerPixel;

/// SIMD helpers for `fn unfilter`
///
/// TODO(https://github.com/rust-lang/rust/issues/86656): Stop gating this module behind the
/// "unstable" feature of the `png` crate. This should be possible once the "portable_simd"
/// feature of Rust gets stabilized.
#[cfg(feature = "unstable")]
mod simd {
    use std::simd::{
        u8x4, u8x8, LaneCount, Simd, SimdInt, SimdOrd, SimdPartialEq, SimdUint, SupportedLaneCount,
    };

    /// This is an equivalent of the `PaethPredictor` function from
    /// [the spec](http://www.libpng.org/pub/png/spec/1.2/PNG-Filters.html#Filter-type-4-Paeth)
    /// except that it simultaneously calculates the predictor for all SIMD lanes.
    /// Mapping between parameter names and pixel positions can be found in
    /// [a diagram here](https://www.w3.org/TR/png/#filter-byte-positions).
    ///
    /// Examples of how different pixel types may be represented as multiple SIMD lanes:
    /// - RGBA => 4 lanes of `i16x4` contain R, G, B, A
    /// - RGB  => 4 lanes of `i16x4` contain R, G, B, and an ignored 4th value
    ///
    /// The inputs are expected to hold values widened from `u8`, so none of the `i16`
    /// intermediate results below can overflow.
    ///
    /// The SIMD algorithm below is based on [`libpng`](https://github.com/glennrp/libpng/blob/f8e5fa92b0e37ab597616f554bee254157998227/intel/filter_sse2_intrinsics.c#L261-L280).
    fn paeth_predictor<const N: usize>(
        a: Simd<i16, N>,
        b: Simd<i16, N>,
        c: Simd<i16, N>,
    ) -> Simd<i16, N>
    where
        LaneCount<N>: SupportedLaneCount,
    {
        let pa = b - c; // (p-a) == (a+b-c - a) == (b-c)
        let pb = a - c; // (p-b) == (a+b-c - b) == (a-c)
        let pc = pa + pb; // (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c)

        let pa = pa.abs();
        let pb = pb.abs();
        let pc = pc.abs();

        let smallest = pc.simd_min(pa.simd_min(pb));

        // Paeth algorithm breaks ties favoring a over b over c, so we execute the following
        // lane-wise selection:
        //
        //     if smallest == pa
        //         then select a
        //         else select (if smallest == pb then select b else select c)
        smallest
            .simd_eq(pa)
            .select(a, smallest.simd_eq(pb).select(b, c))
    }

    /// Memory of previous pixels (as needed to unfilter `FilterType::Paeth`).
    /// See also <https://www.w3.org/TR/png/#filter-byte-positions>.
    ///
    /// The default state is all zeros, which is what the Paeth filter uses for the
    /// (non-existent) pixels to the left of the first pixel in a row.
    #[derive(Default)]
    struct PaethState<const N: usize>
    where
        LaneCount<N>: SupportedLaneCount,
    {
        /// Previous pixel in the previous row.
        c: Simd<i16, N>,

        /// Previous pixel in the current row.
        a: Simd<i16, N>,
    }

    /// Mutates `x` as needed to unfilter `FilterType::Paeth`.
    ///
    /// `b` is the current pixel in the previous row.  `x` is the current pixel in the current row.
    /// On return `x` holds the unfiltered pixel and `state` has been advanced by one pixel.
    /// See also <https://www.w3.org/TR/png/#filter-byte-positions>.
    fn paeth_step<const N: usize>(state: &mut PaethState<N>, b: Simd<u8, N>, x: &mut Simd<u8, N>)
    where
        LaneCount<N>: SupportedLaneCount,
    {
        // Storing the inputs (widened so that the predictor arithmetic cannot overflow).
        let b = b.cast::<i16>();

        // Calculating the new value of the current pixel.  The predictor is always one of
        // `a`, `b`, `c` (all widened from `u8`), so narrowing it back to `u8` is lossless.
        // The lane-wise `+=` wraps on overflow, as required by the PNG filter definition.
        let predictor = paeth_predictor(state.a, b, state.c);
        *x += predictor.cast::<u8>();

        // Preparing for the next step.
        state.c = b;
        state.a = x.cast::<i16>();
    }

    /// Loads the first 3 bytes of `src` into a `u8x4`, zero-filling the unused 4th lane.
    ///
    /// Panics if `src` has fewer than 3 bytes.
    fn load3(src: &[u8]) -> u8x4 {
        u8x4::from_array([src[0], src[1], src[2], 0])
    }

    /// Stores the first 3 lanes of `src` into `dest[0..3]`; the 4th lane is discarded.
    ///
    /// Panics if `dest` has fewer than 3 bytes.
    fn store3(src: u8x4, dest: &mut [u8]) {
        dest[0..3].copy_from_slice(&src.to_array()[0..3])
    }

    /// Undoes `FilterType::Paeth` for `BytesPerPixel::Three`.
    ///
    /// `prev_row` is the already-unfiltered previous row and `curr_row` is the filtered current
    /// row, which gets unfiltered in place.  Both rows are expected to have the same length,
    /// which has to be a multiple of 3.  Empty rows are accepted and left untouched (just like
    /// in the scalar implementation).
    pub fn unfilter_paeth3(mut prev_row: &[u8], mut curr_row: &mut [u8]) {
        debug_assert_eq!(prev_row.len(), curr_row.len());
        debug_assert_eq!(prev_row.len() % 3, 0);

        let mut state = PaethState::<4>::default();
        while prev_row.len() >= 4 {
            // `u8x4` requires working with `[u8;4]`, but we can just load and ignore the first
            // byte from the next triple.  This optimization technique mimics the algorithm found
            // in
            // https://github.com/glennrp/libpng/blob/f8e5fa92b0e37ab597616f554bee254157998227/intel/filter_sse2_intrinsics.c#L130-L131
            let b = u8x4::from_slice(prev_row);
            let mut x = u8x4::from_slice(curr_row);

            paeth_step(&mut state, b, &mut x);

            // We can speculate that writing 4 bytes might be more efficient (just as with using
            // `u8x4::from_slice` above), but we can't use that here, because we can't clobber the
            // first byte of the next pixel in the `curr_row`.
            store3(x, curr_row);

            prev_row = &prev_row[3..];
            curr_row = &mut curr_row[3..];
        }

        // At this point either nothing is left (the rows were empty) or exactly one pixel is
        // left.  Can't use `u8x4::from_slice` for the last `[u8;3]`.
        if !prev_row.is_empty() {
            let b = load3(prev_row);
            let mut x = load3(curr_row);
            paeth_step(&mut state, b, &mut x);
            store3(x, curr_row);
        }
    }

    /// Loads the first 6 bytes of `src` into a `u8x8`, zero-filling the 2 unused lanes.
    ///
    /// Panics if `src` has fewer than 6 bytes.
    fn load6(src: &[u8]) -> u8x8 {
        u8x8::from_array([src[0], src[1], src[2], src[3], src[4], src[5], 0, 0])
    }

    /// Stores the first 6 lanes of `src` into `dest[0..6]`; the last 2 lanes are discarded.
    ///
    /// Panics if `dest` has fewer than 6 bytes.
    fn store6(src: u8x8, dest: &mut [u8]) {
        dest[0..6].copy_from_slice(&src.to_array()[0..6])
    }

    /// Undoes `FilterType::Paeth` for `BytesPerPixel::Six`.
    ///
    /// `prev_row` is the already-unfiltered previous row and `curr_row` is the filtered current
    /// row, which gets unfiltered in place.  Both rows are expected to have the same length,
    /// which has to be a multiple of 6.  Empty rows are accepted and left untouched (just like
    /// in the scalar implementation).
    pub fn unfilter_paeth6(mut prev_row: &[u8], mut curr_row: &mut [u8]) {
        debug_assert_eq!(prev_row.len(), curr_row.len());
        debug_assert_eq!(prev_row.len() % 6, 0);

        let mut state = PaethState::<8>::default();
        while prev_row.len() >= 8 {
            // `u8x8` requires working with `[u8;8]`, but we can just load and ignore the first two
            // bytes from the next pixel.  This optimization technique mimics the algorithm found
            // in
            // https://github.com/glennrp/libpng/blob/f8e5fa92b0e37ab597616f554bee254157998227/intel/filter_sse2_intrinsics.c#L130-L131
            let b = u8x8::from_slice(prev_row);
            let mut x = u8x8::from_slice(curr_row);

            paeth_step(&mut state, b, &mut x);

            // We can speculate that writing 8 bytes might be more efficient (just as with using
            // `u8x8::from_slice` above), but we can't use that here, because we can't clobber the
            // first bytes of the next pixel in the `curr_row`.
            store6(x, curr_row);

            prev_row = &prev_row[6..];
            curr_row = &mut curr_row[6..];
        }

        // At this point either nothing is left (the rows were empty) or exactly one pixel is
        // left.  Can't use `u8x8::from_slice` for the last `[u8;6]`.
        if !prev_row.is_empty() {
            let b = load6(prev_row);
            let mut x = load6(curr_row);
            paeth_step(&mut state, b, &mut x);
            store6(x, curr_row);
        }
    }
}

/// The byte level filter applied to scanlines to prepare them for compression.
///
/// Compression in general benefits from repetitive data. The filter is a content-aware method of
Expand Down Expand Up @@ -401,21 +565,31 @@ pub(crate) fn unfilter(
}
}
BytesPerPixel::Three => {
let mut a_bpp = [0; 3];
let mut c_bpp = [0; 3];
for (chunk, b_bpp) in current.chunks_exact_mut(3).zip(previous.chunks_exact(3))
#[cfg(feature = "unstable")]
simd::unfilter_paeth3(previous, current);

#[cfg(not(feature = "unstable"))]
{
let new_chunk = [
chunk[0]
.wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])),
chunk[1]
.wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])),
chunk[2]
.wrapping_add(filter_paeth_decode(a_bpp[2], b_bpp[2], c_bpp[2])),
];
*TryInto::<&mut [u8; 3]>::try_into(chunk).unwrap() = new_chunk;
a_bpp = new_chunk;
c_bpp = b_bpp.try_into().unwrap();
let mut a_bpp = [0; 3];
let mut c_bpp = [0; 3];
for (chunk, b_bpp) in
current.chunks_exact_mut(3).zip(previous.chunks_exact(3))
{
let new_chunk = [
chunk[0].wrapping_add(filter_paeth_decode(
a_bpp[0], b_bpp[0], c_bpp[0],
)),
chunk[1].wrapping_add(filter_paeth_decode(
a_bpp[1], b_bpp[1], c_bpp[1],
)),
chunk[2].wrapping_add(filter_paeth_decode(
a_bpp[2], b_bpp[2], c_bpp[2],
)),
];
*TryInto::<&mut [u8; 3]>::try_into(chunk).unwrap() = new_chunk;
a_bpp = new_chunk;
c_bpp = b_bpp.try_into().unwrap();
}
}
}
BytesPerPixel::Four => {
Expand All @@ -439,27 +613,40 @@ pub(crate) fn unfilter(
}
}
BytesPerPixel::Six => {
let mut a_bpp = [0; 6];
let mut c_bpp = [0; 6];
for (chunk, b_bpp) in current.chunks_exact_mut(6).zip(previous.chunks_exact(6))
#[cfg(feature = "unstable")]
simd::unfilter_paeth6(previous, current);

#[cfg(not(feature = "unstable"))]
{
let new_chunk = [
chunk[0]
.wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])),
chunk[1]
.wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])),
chunk[2]
.wrapping_add(filter_paeth_decode(a_bpp[2], b_bpp[2], c_bpp[2])),
chunk[3]
.wrapping_add(filter_paeth_decode(a_bpp[3], b_bpp[3], c_bpp[3])),
chunk[4]
.wrapping_add(filter_paeth_decode(a_bpp[4], b_bpp[4], c_bpp[4])),
chunk[5]
.wrapping_add(filter_paeth_decode(a_bpp[5], b_bpp[5], c_bpp[5])),
];
*TryInto::<&mut [u8; 6]>::try_into(chunk).unwrap() = new_chunk;
a_bpp = new_chunk;
c_bpp = b_bpp.try_into().unwrap();
let mut a_bpp = [0; 6];
let mut c_bpp = [0; 6];
for (chunk, b_bpp) in
current.chunks_exact_mut(6).zip(previous.chunks_exact(6))
{
let new_chunk = [
chunk[0].wrapping_add(filter_paeth_decode(
a_bpp[0], b_bpp[0], c_bpp[0],
)),
chunk[1].wrapping_add(filter_paeth_decode(
a_bpp[1], b_bpp[1], c_bpp[1],
)),
chunk[2].wrapping_add(filter_paeth_decode(
a_bpp[2], b_bpp[2], c_bpp[2],
)),
chunk[3].wrapping_add(filter_paeth_decode(
a_bpp[3], b_bpp[3], c_bpp[3],
)),
chunk[4].wrapping_add(filter_paeth_decode(
a_bpp[4], b_bpp[4], c_bpp[4],
)),
chunk[5].wrapping_add(filter_paeth_decode(
a_bpp[5], b_bpp[5], c_bpp[5],
)),
];
*TryInto::<&mut [u8; 6]>::try_into(chunk).unwrap() = new_chunk;
a_bpp = new_chunk;
c_bpp = b_bpp.try_into().unwrap();
}
}
}
BytesPerPixel::Eight => {
Expand Down
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
//! ```
//!

#![cfg_attr(feature = "unstable", feature(portable_simd))]
#![forbid(unsafe_code)]

#[macro_use]
Expand Down
Loading