Commit 7acabce

perf: 4bpp version of the sub filter
Improves performance by around 40% on the Epyc system and by 431% on the Cortex-A520.
1 parent 551f36e
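
For context: the PNG Sub filter stores each byte as its difference from the corresponding byte of the pixel immediately to the left, so unfiltering a row is a wrapping prefix sum: raw[i] = filt[i] + raw[i - bpp] (mod 256). A minimal scalar sketch of that recurrence, for illustration only (not code from this commit):

    /// Scalar reference for undoing the PNG Sub filter:
    /// raw[i] = filt[i] + raw[i - bpp], all arithmetic wrapping mod 256.
    fn sub_unfilter_scalar(row: &mut [u8], bpp: usize) {
        // The first pixel has no left neighbor, so it is already raw.
        for i in bpp..row.len() {
            row[i] = row[i].wrapping_add(row[i - bpp]);
        }
    }

The commit vectorizes this recurrence for 4 bytes per pixel (e.g. RGBA8), mirroring the existing 3bpp routine, and gates it behind the `unstable` feature.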

2 files changed: +144 −68 lines


src/filter/mod.rs

Lines changed: 5 additions & 0 deletions

@@ -154,6 +154,11 @@ pub(crate) fn unfilter(
             }
         }
         BytesPerPixel::Four => {
+            #[cfg(feature = "unstable")]
+            {
+                simd::sub_unfilter_4bpp(current);
+                return;
+            }
             let mut prev = [0; 4];
             for chunk in current.chunks_exact_mut(4) {
                 let new_chunk = [
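
With the `unstable` feature enabled, the new block calls the SIMD routine and returns early; without it, the `#[cfg]` block compiles away and the existing scalar loop below remains the only path. The same shape as a standalone sketch (the function name and the scalar body are simplified stand-ins, not the crate's exact code):

    #[allow(unreachable_code)] // scalar tail is dead code when the fast path is compiled in
    fn unfilter_sub_4bpp(current: &mut [u8]) {
        #[cfg(feature = "unstable")]
        {
            simd::sub_unfilter_4bpp(current); // SIMD fast path
            return;
        }
        // Portable fallback: raw[i] = filt[i] + raw[i - 4], wrapping.
        for i in 4..current.len() {
            current[i] = current[i].wrapping_add(current[i - 4]);
        }
    }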

src/filter/simd.rs

Lines changed: 139 additions & 68 deletions

@@ -313,74 +313,145 @@ pub fn paeth_unfilter_4bpp(row: &mut [u8], prev_row: &[u8]) {
         a_bpp = new_chunk;
         c_bpp = b_bpp.try_into().unwrap();
     }
+}
+
+/// Unfilters a row of pixels using the Sub filter. Implements via a prefix sum.
+pub fn sub_unfilter_3bpp(current: &mut [u8]) {
+    const BPP: usize = 3;
+    const STRIDE_BYTES: usize = 48; // 16 pixels * 3 bytes/pixel
+    type SimdVector = Simd<u8, STRIDE_BYTES>;
+
+    let mut prev_simd_a: Simd<u8, BPP> = Default::default(); // Unfiltered value of the pixel to the left
+
+    const UNROLL_FACTOR: usize = 2;
+    const UNROLLED_STRIDE_BYTES: usize = STRIDE_BYTES * UNROLL_FACTOR;
+
+    let chunks_unrolled = current.len() / UNROLLED_STRIDE_BYTES;
+    let (simd_current_unrolled, mut remainder_current_simd_tail) =
+        current.split_at_mut(chunks_unrolled * UNROLLED_STRIDE_BYTES);
+
+    for unrolled_chunk in simd_current_unrolled.chunks_exact_mut(UNROLLED_STRIDE_BYTES) {
+        let (chunk1_slice, chunk2_slice) = unrolled_chunk.split_at_mut(STRIDE_BYTES);
+
+        // Process chunk 1
+        let mut x_vec1: SimdVector = SimdVector::from_slice(chunk1_slice);
+        let carry_in_vec1 = prev_simd_a.resize::<STRIDE_BYTES>(0u8);
+        x_vec1 = x_vec1 + carry_in_vec1;
+        x_vec1 = x_vec1 + x_vec1.shift_elements_right::<BPP>(0u8);
+        x_vec1 = x_vec1 + x_vec1.shift_elements_right::<{ 2 * BPP }>(0u8);
+        x_vec1 = x_vec1 + x_vec1.shift_elements_right::<{ 4 * BPP }>(0u8);
+        x_vec1 = x_vec1 + x_vec1.shift_elements_right::<{ 8 * BPP }>(0u8);
+        let prev_simd_a_for_chunk2 = x_vec1.extract::<{ STRIDE_BYTES - BPP }, BPP>();
+        x_vec1.copy_to_slice(chunk1_slice);
+
+        // Process chunk 2
+        let mut x_vec2: SimdVector = SimdVector::from_slice(chunk2_slice);
+        let carry_in_vec2 = prev_simd_a_for_chunk2.resize::<STRIDE_BYTES>(0u8);
+        x_vec2 = x_vec2 + carry_in_vec2;
+        x_vec2 = x_vec2 + x_vec2.shift_elements_right::<BPP>(0u8);
+        x_vec2 = x_vec2 + x_vec2.shift_elements_right::<{ 2 * BPP }>(0u8);
+        x_vec2 = x_vec2 + x_vec2.shift_elements_right::<{ 4 * BPP }>(0u8);
+        x_vec2 = x_vec2 + x_vec2.shift_elements_right::<{ 8 * BPP }>(0u8);
+        prev_simd_a = x_vec2.extract::<{ STRIDE_BYTES - BPP }, BPP>();
+        x_vec2.copy_to_slice(chunk2_slice);
+    }
+
+    // Process any remaining single STRIDE_BYTES chunk
+    if remainder_current_simd_tail.len() >= STRIDE_BYTES {
+        let (chunk_single_slice, scalar_remainder_slice) =
+            remainder_current_simd_tail.split_at_mut(STRIDE_BYTES);
+        let mut x_vec: SimdVector = SimdVector::from_slice(chunk_single_slice);
+        let carry_in_vec = prev_simd_a.resize::<STRIDE_BYTES>(0u8);
+        x_vec = x_vec + carry_in_vec;
+        x_vec = x_vec + x_vec.shift_elements_right::<BPP>(0u8);
+        x_vec = x_vec + x_vec.shift_elements_right::<{ 2 * BPP }>(0u8);
+        x_vec = x_vec + x_vec.shift_elements_right::<{ 4 * BPP }>(0u8);
+        x_vec = x_vec + x_vec.shift_elements_right::<{ 8 * BPP }>(0u8);
+        prev_simd_a = x_vec.extract::<{ STRIDE_BYTES - BPP }, BPP>();
+        x_vec.copy_to_slice(chunk_single_slice);
+        remainder_current_simd_tail = scalar_remainder_slice;
+    }
+
+    // Scalar remainder processing
+    let mut prev_scalar_a = prev_simd_a.to_array();
+    for chunk in remainder_current_simd_tail.chunks_exact_mut(BPP) {
+        let new_chunk = [
+            chunk[0].wrapping_add(prev_scalar_a[0]),
+            chunk[1].wrapping_add(prev_scalar_a[1]),
+            chunk[2].wrapping_add(prev_scalar_a[2]),
+        ];
+        *TryInto::<&mut [u8; BPP]>::try_into(chunk).unwrap() = new_chunk;
+        prev_scalar_a = new_chunk;
+    }
+}
+
+/// Unfilters a row of pixels using the Sub filter. Implements via a prefix sum.
+pub fn sub_unfilter_4bpp(current: &mut [u8]) {
+    const BPP: usize = 4;
+    const STRIDE_BYTES: usize = 64; // 16 pixels * 4 bytes/pixel
+    type SimdVector = Simd<u8, STRIDE_BYTES>;
+
+    let mut prev_pixel_val: Simd<u8, BPP> = Simd::splat(0); // Unfiltered value of the pixel to the left
+
+    const UNROLL_FACTOR: usize = 2;
+    const UNROLLED_STRIDE_BYTES: usize = STRIDE_BYTES * UNROLL_FACTOR;
+
+    let chunks_unrolled = current.len() / UNROLLED_STRIDE_BYTES;
+    let (simd_current_unrolled, mut remainder_current_simd_tail) =
+        current.split_at_mut(chunks_unrolled * UNROLLED_STRIDE_BYTES);
+
+    for unrolled_chunk in simd_current_unrolled.chunks_exact_mut(UNROLLED_STRIDE_BYTES) {
+        let (chunk1_slice, chunk2_slice) = unrolled_chunk.split_at_mut(STRIDE_BYTES);
+
+        // Process chunk 1
+        let mut x_vec1: SimdVector = SimdVector::from_slice(chunk1_slice);
+        let carry_in_vec1 = prev_pixel_val.resize::<STRIDE_BYTES>(0u8);
+        x_vec1 = x_vec1 + carry_in_vec1;
+        x_vec1 = x_vec1 + x_vec1.shift_elements_right::<BPP>(0u8);
+        x_vec1 = x_vec1 + x_vec1.shift_elements_right::<{ 2 * BPP }>(0u8);
+        x_vec1 = x_vec1 + x_vec1.shift_elements_right::<{ 4 * BPP }>(0u8);
+        x_vec1 = x_vec1 + x_vec1.shift_elements_right::<{ 8 * BPP }>(0u8);
+        let prev_pixel_val_for_chunk2 = x_vec1.extract::<{ STRIDE_BYTES - BPP }, BPP>();
+        x_vec1.copy_to_slice(chunk1_slice);
+
+        // Process chunk 2
+        let mut x_vec2: SimdVector = SimdVector::from_slice(chunk2_slice);
+        let carry_in_vec2 = prev_pixel_val_for_chunk2.resize::<STRIDE_BYTES>(0u8);
+        x_vec2 = x_vec2 + carry_in_vec2;
+        x_vec2 = x_vec2 + x_vec2.shift_elements_right::<BPP>(0u8);
+        x_vec2 = x_vec2 + x_vec2.shift_elements_right::<{ 2 * BPP }>(0u8);
+        x_vec2 = x_vec2 + x_vec2.shift_elements_right::<{ 4 * BPP }>(0u8);
+        x_vec2 = x_vec2 + x_vec2.shift_elements_right::<{ 8 * BPP }>(0u8);
+        prev_pixel_val = x_vec2.extract::<{ STRIDE_BYTES - BPP }, BPP>();
+        x_vec2.copy_to_slice(chunk2_slice);
+    }
+
+    // Process any remaining single STRIDE_BYTES chunk
+    if remainder_current_simd_tail.len() >= STRIDE_BYTES {
+        let (chunk_single_slice, scalar_remainder_slice) =
+            remainder_current_simd_tail.split_at_mut(STRIDE_BYTES);
+        let mut x_out: SimdVector = SimdVector::from_slice(chunk_single_slice);
+        let carry_in_vec = prev_pixel_val.resize::<STRIDE_BYTES>(0u8);
+        x_out = x_out + carry_in_vec;
+        x_out = x_out + x_out.shift_elements_right::<BPP>(0u8);
+        x_out = x_out + x_out.shift_elements_right::<{ 2 * BPP }>(0u8);
+        x_out = x_out + x_out.shift_elements_right::<{ 4 * BPP }>(0u8);
+        x_out = x_out + x_out.shift_elements_right::<{ 8 * BPP }>(0u8);
+        prev_pixel_val = x_out.extract::<{ STRIDE_BYTES - BPP }, BPP>();
+        x_out.copy_to_slice(chunk_single_slice);
+        remainder_current_simd_tail = scalar_remainder_slice;
+    }
 
-/// Unfilters a row of pixels using the Sub filter. Implements via a prefix sum.
-pub fn sub_unfilter_3bpp(current: &mut [u8]) {
-    const BPP: usize = 3;
-    const STRIDE_BYTES: usize = 48; // 16 pixels * 3 bytes/pixel
-    type SimdVector = Simd<u8, STRIDE_BYTES>;
-
-    let mut prev_simd_a: Simd<u8, BPP> = Default::default(); // Unfiltered value of the pixel to the left
-
-    const UNROLL_FACTOR: usize = 2;
-    const UNROLLED_STRIDE_BYTES: usize = STRIDE_BYTES * UNROLL_FACTOR;
-
-    let chunks_unrolled = current.len() / UNROLLED_STRIDE_BYTES;
-    let (simd_current_unrolled, mut remainder_current_simd_tail) =
-        current.split_at_mut(chunks_unrolled * UNROLLED_STRIDE_BYTES);
-
-    for unrolled_chunk in simd_current_unrolled.chunks_exact_mut(UNROLLED_STRIDE_BYTES) {
-        let (chunk1_slice, chunk2_slice) = unrolled_chunk.split_at_mut(STRIDE_BYTES);
-
-        // Process chunk 1
-        let mut x_vec1: SimdVector = SimdVector::from_slice(chunk1_slice);
-        let carry_in_vec1 = prev_simd_a.resize::<STRIDE_BYTES>(0u8);
-        x_vec1 = x_vec1 + carry_in_vec1;
-        x_vec1 = x_vec1 + x_vec1.shift_elements_right::<BPP>(0u8);
-        x_vec1 = x_vec1 + x_vec1.shift_elements_right::<{ 2 * BPP }>(0u8);
-        x_vec1 = x_vec1 + x_vec1.shift_elements_right::<{ 4 * BPP }>(0u8);
-        x_vec1 = x_vec1 + x_vec1.shift_elements_right::<{ 8 * BPP }>(0u8);
-        let prev_simd_a_for_chunk2 = x_vec1.extract::<{ STRIDE_BYTES - BPP }, BPP>();
-        x_vec1.copy_to_slice(chunk1_slice);
-
-        // Process chunk 2
-        let mut x_vec2: SimdVector = SimdVector::from_slice(chunk2_slice);
-        let carry_in_vec2 = prev_simd_a_for_chunk2.resize::<STRIDE_BYTES>(0u8);
-        x_vec2 = x_vec2 + carry_in_vec2;
-        x_vec2 = x_vec2 + x_vec2.shift_elements_right::<BPP>(0u8);
-        x_vec2 = x_vec2 + x_vec2.shift_elements_right::<{ 2 * BPP }>(0u8);
-        x_vec2 = x_vec2 + x_vec2.shift_elements_right::<{ 4 * BPP }>(0u8);
-        x_vec2 = x_vec2 + x_vec2.shift_elements_right::<{ 8 * BPP }>(0u8);
-        prev_simd_a = x_vec2.extract::<{ STRIDE_BYTES - BPP }, BPP>();
-        x_vec2.copy_to_slice(chunk2_slice);
-    }
-
-    // Process any remaining single STRIDE_BYTES chunk
-    if remainder_current_simd_tail.len() >= STRIDE_BYTES {
-        let (chunk_single_slice, scalar_remainder_slice) =
-            remainder_current_simd_tail.split_at_mut(STRIDE_BYTES);
-        let mut x_vec: SimdVector = SimdVector::from_slice(chunk_single_slice);
-        let carry_in_vec = prev_simd_a.resize::<STRIDE_BYTES>(0u8);
-        x_vec = x_vec + carry_in_vec;
-        x_vec = x_vec + x_vec.shift_elements_right::<BPP>(0u8);
-        x_vec = x_vec + x_vec.shift_elements_right::<{ 2 * BPP }>(0u8);
-        x_vec = x_vec + x_vec.shift_elements_right::<{ 4 * BPP }>(0u8);
-        x_vec = x_vec + x_vec.shift_elements_right::<{ 8 * BPP }>(0u8);
-        prev_simd_a = x_vec.extract::<{ STRIDE_BYTES - BPP }, BPP>();
-        x_vec.copy_to_slice(chunk_single_slice);
-        remainder_current_simd_tail = scalar_remainder_slice;
-    }
-
-    // Scalar remainder processing
-    let mut prev_scalar_a = prev_simd_a.to_array();
-    for chunk in remainder_current_simd_tail.chunks_exact_mut(BPP) {
-        let new_chunk = [
-            chunk[0].wrapping_add(prev_scalar_a[0]),
-            chunk[1].wrapping_add(prev_scalar_a[1]),
-            chunk[2].wrapping_add(prev_scalar_a[2]),
-        ];
-        *TryInto::<&mut [u8; BPP]>::try_into(chunk).unwrap() = new_chunk;
-        prev_scalar_a = new_chunk;
-    }
+    // Scalar remainder processing
+    let mut prev_scalar = prev_pixel_val.to_array();
+    for chunk in remainder_current_simd_tail.chunks_exact_mut(BPP) {
+        let new_chunk = [
+            chunk[0].wrapping_add(prev_scalar[0]),
+            chunk[1].wrapping_add(prev_scalar[1]),
+            chunk[2].wrapping_add(prev_scalar[2]),
+            chunk[3].wrapping_add(prev_scalar[3]),
+        ];
+        *core::convert::TryInto::<&mut [u8; BPP]>::try_into(chunk).unwrap() = new_chunk;
+        prev_scalar = new_chunk;
     }
 }
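
Both sub_unfilter_3bpp and sub_unfilter_4bpp follow the same plan: load 16 pixels into one vector, add the carry-in (the previous chunk's last unfiltered pixel, placed in the first BPP lanes by resize), then run a logarithmic inclusive scan by adding the vector shifted right by 1, 2, 4, and 8 pixels. Because every shift is a multiple of BPP, lanes only ever combine bytes of the same channel, and after log2(16) = 4 shift-add steps each lane holds the wrapping sum of its own filtered byte and all earlier same-channel bytes in the chunk, which is exactly the unfiltered value. The two-chunk unrolling overlaps the scans of adjacent 16-pixel blocks to hide latency. A quick equivalence check against the scalar recurrence, as a hypothetical test (assumes a nightly toolchain with portable_simd and the crate's `unstable` feature):

    #[test]
    fn sub_unfilter_4bpp_matches_scalar() {
        // Long enough to exercise the unrolled 128-byte loop, the single
        // 64-byte chunk, and the scalar tail: 1017 pixels = 4068 bytes.
        let filtered: Vec<u8> = (0u8..=255).cycle().take(4 * 1017).collect();

        let mut simd_row = filtered.clone();
        sub_unfilter_4bpp(&mut simd_row);

        // Scalar reference: raw[i] = filt[i] + raw[i - 4], wrapping.
        let mut scalar_row = filtered;
        for i in 4..scalar_row.len() {
            scalar_row[i] = scalar_row[i].wrapping_add(scalar_row[i - 4]);
        }

        assert_eq!(simd_row, scalar_row);
    }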
