Skip to content

Commit

Permalink
Extending std::simd improvements to Avg and Sub filter types.
Browse files Browse the repository at this point in the history
  • Loading branch information
anforowicz committed Sep 21, 2023
1 parent 2601e49 commit cf54e6c
Showing 1 changed file with 121 additions and 39 deletions.
160 changes: 121 additions & 39 deletions src/filter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use crate::common::BytesPerPixel;
// of Rust gets stabilized.
#[cfg(feature = "unstable")]
mod simd {
use std::simd::{i16x4, u8x4, SimdInt, SimdOrd, SimdPartialEq, SimdUint};
use std::simd::{i16x4, u16x4, u8x4, SimdInt, SimdOrd, SimdPartialEq, SimdUint};

/// This is an equivalent of the `PaethPredictor` function from the spec [1],
/// except that it simultaneously calculates the predictor for all SIMD lanes.
Expand Down Expand Up @@ -55,31 +55,100 @@ mod simd {
dest[2] = simd[2];
}

pub fn unfilter_paeth3(prev_row: &[u8], curr_row: &mut [u8]) {
// Paeth tries to predict pixel d using the pixel to the left of it, a,
// and two pixels from the previous row, b and c:
//
// prev_row: c b
// curr_row: a d
//
// The first pixel has no left context, and so uses an Up filter, p = b.
// This works naturally with our main loop's p = a+b-c if we force a and c
// to zero.
let mut a = i16x4::default();
let mut c = i16x4::default();
/// Per-filter state that is carried from one pixel to the next while a row
/// is being unfiltered.
///
/// The `Default` value is the state in effect before the first pixel of a
/// row: `unfilter3` starts every row from `T::default()`.
trait UnfilterState: Default {
/// Reconstructs a single pixel in place and updates the state so that it
/// is ready for the next pixel of the same row.
///
/// * `prev` - the pixel at the same position in the previous row.
/// * `curr` - the pixel being processed; implementations add their
///   predictor to it in place.
fn step(&mut self, prev: u8x4, curr: &mut u8x4);
}

for (prev, curr) in prev_row.chunks_exact(3).zip(curr_row.chunks_exact_mut(3)) {
let b = load3(prev).cast::<i16>();
let mut x = load3(curr);
/// State carried between pixels while undoing the Sub filter.
/// See also https://www.w3.org/TR/png/#filter-byte-positions
#[derive(Default)]
struct SubState {
/// Previous pixel in the current row, stored after it has been
/// reconstructed. The derived `Default` value is what the first pixel of
/// a row is combined with.
a: u8x4,
}

impl UnfilterState for SubState {
fn step(&mut self, _prev: u8x4, curr: &mut u8x4) {
// Calculating the new value of the current pixel.
*curr += self.a;

let predictor = paeth_predictor(a, b, c);
x += predictor.cast::<u8>();
store3(x, curr);
// Preparing for the next step.
self.a = *curr;
}
}

/// State carried between pixels while undoing the Avg filter.
/// See also https://www.w3.org/TR/png/#filter-byte-positions
#[derive(Default)]
struct AvgState {
/// Previous pixel in the current row, stored after it has been
/// reconstructed. Kept widened to `u16` lanes because `step` adds it to
/// the pixel above (also widened to `u16`) before halving the sum.
a: u16x4,
}

c = b;
a = x.cast::<i16>();
impl UnfilterState for AvgState {
    /// Undoes the Avg filter for one pixel.
    ///
    /// The predictor is the per-lane average of the pixel to the left
    /// (`self.a`) and the pixel above (`prev`), rounded down. The sum is
    /// formed in `u16` lanes before it is halved and narrowed back to `u8`.
    fn step(&mut self, prev: u8x4, curr: &mut u8x4) {
        // Widen the pixel above so it can be summed with `self.a` in `u16` lanes.
        let above = prev.cast::<u16>();

        // floor((left + above) / 2), expressed as a right shift by one bit.
        let predictor = (self.a + above) >> u16x4::splat(1);
        *curr += predictor.cast::<u8>();

        // The pixel just reconstructed is the "left" pixel of the next step.
        self.a = curr.cast::<u16>();
    }
}

/// State carried between pixels while undoing the Paeth filter.
/// See also https://www.w3.org/TR/png/#filter-byte-positions
///
/// Both fields are kept as `i16` lanes, which is the lane type that
/// `paeth_predictor` is called with in `step`.
#[derive(Default)]
struct PaethState {
/// Previous pixel in the previous row (the pixel above-left of the one
/// being processed).
c: i16x4,

/// Previous pixel in the current row, stored after it has been
/// reconstructed.
a: i16x4,
}

impl UnfilterState for PaethState {
    /// Undoes the Paeth filter for one pixel.
    ///
    /// The predictor is computed by `paeth_predictor` from the pixel to the
    /// left (`self.a`), the pixel above (`prev`) and the pixel above-left
    /// (`self.c`), and is then added to `curr` in place.
    fn step(&mut self, prev: u8x4, curr: &mut u8x4) {
        // Widen the pixel above to the lane type used by `paeth_predictor`.
        let above = prev.cast::<i16>();

        *curr += paeth_predictor(self.a, above, self.c).cast::<u8>();

        // Slide the window one pixel to the right: the pixel above becomes
        // "above-left", and the pixel just reconstructed becomes "left".
        self.c = above;
        self.a = curr.cast::<i16>();
    }
}

/// Unfilters a row of 3-byte pixels in place, using the filter selected by `T`.
///
/// `prev_row` and `curr_row` are walked together in 3-byte chunks. Iteration
/// ends with the shorter of the two rows, and trailing bytes that do not form
/// a whole chunk are left untouched.
fn unfilter3<T>(prev_row: &[u8], curr_row: &mut [u8])
where
    T: UnfilterState,
{
    let mut state = T::default();

    let pixels_above = prev_row.chunks_exact(3);
    let pixels = curr_row.chunks_exact_mut(3);
    for (above, pixel) in pixels_above.zip(pixels) {
        let above_lanes = load3(above);
        let mut lanes = load3(pixel);
        state.step(above_lanes, &mut lanes);
        store3(lanes, pixel);
    }
}

pub fn unfilter_sub3(prev_row: &[u8], curr_row: &mut [u8]) {
unfilter3::<SubState>(prev_row, curr_row);
}

/// Undoes the Avg filter in place for a row of 3-byte pixels.
///
/// `prev_row` supplies the pixels above those in `curr_row`. Both rows are
/// walked together by `unfilter3`, so processing stops at the end of the
/// shorter row.
pub fn unfilter_avg3(prev_row: &[u8], curr_row: &mut [u8]) {
unfilter3::<AvgState>(prev_row, curr_row);
}

/// Undoes the Paeth filter in place for a row of 3-byte pixels.
///
/// Paeth predicts pixel d from the pixel to the left of it, a, and two
/// pixels from the previous row, b and c:
///
///     prev_row:  c b
///     curr_row:  a d
///
/// `PaethState` starts from its derived `Default` value, which is what the
/// first pixel of the row sees for a and c. Both rows are walked together by
/// `unfilter3`, so processing stops at the end of the shorter row.
pub fn unfilter_paeth3(prev_row: &[u8], curr_row: &mut [u8]) {
unfilter3::<PaethState>(prev_row, curr_row);
}
}

/// The byte level filter applied to scanlines to prepare them for compression.
Expand Down Expand Up @@ -306,15 +375,21 @@ pub(crate) fn unfilter(
}
}
BytesPerPixel::Three => {
let mut prev = [0; 3];
for chunk in current.chunks_exact_mut(3) {
let new_chunk = [
chunk[0].wrapping_add(prev[0]),
chunk[1].wrapping_add(prev[1]),
chunk[2].wrapping_add(prev[2]),
];
*TryInto::<&mut [u8; 3]>::try_into(chunk).unwrap() = new_chunk;
prev = new_chunk;
#[cfg(feature = "unstable")]
simd::unfilter_sub3(previous, current);

#[cfg(not(feature = "unstable"))]
{
let mut prev = [0; 3];
for chunk in current.chunks_exact_mut(3) {
let new_chunk = [
chunk[0].wrapping_add(prev[0]),
chunk[1].wrapping_add(prev[1]),
chunk[2].wrapping_add(prev[2]),
];
*TryInto::<&mut [u8; 3]>::try_into(chunk).unwrap() = new_chunk;
prev = new_chunk;
}
}
}
BytesPerPixel::Four => {
Expand Down Expand Up @@ -390,15 +465,22 @@ pub(crate) fn unfilter(
}
}
BytesPerPixel::Three => {
let mut lprev = [0; 3];
for (chunk, above) in current.chunks_exact_mut(3).zip(previous.chunks_exact(3)) {
let new_chunk = [
chunk[0].wrapping_add(((above[0] as u16 + lprev[0] as u16) / 2) as u8),
chunk[1].wrapping_add(((above[1] as u16 + lprev[1] as u16) / 2) as u8),
chunk[2].wrapping_add(((above[2] as u16 + lprev[2] as u16) / 2) as u8),
];
*TryInto::<&mut [u8; 3]>::try_into(chunk).unwrap() = new_chunk;
lprev = new_chunk;
#[cfg(feature = "unstable")]
simd::unfilter_avg3(previous, current);

#[cfg(not(feature = "unstable"))]
{
let mut lprev = [0; 3];
for (chunk, above) in current.chunks_exact_mut(3).zip(previous.chunks_exact(3))
{
let new_chunk = [
chunk[0].wrapping_add(((above[0] as u16 + lprev[0] as u16) / 2) as u8),
chunk[1].wrapping_add(((above[1] as u16 + lprev[1] as u16) / 2) as u8),
chunk[2].wrapping_add(((above[2] as u16 + lprev[2] as u16) / 2) as u8),
];
*TryInto::<&mut [u8; 3]>::try_into(chunk).unwrap() = new_chunk;
lprev = new_chunk;
}
}
}
BytesPerPixel::Four => {
Expand Down

0 comments on commit cf54e6c

Please sign in to comment.