Commit 7acabce

perf: 4bpp version of the sub filter
Improves performance by around 40% on the Epyc system and by 431% on the Cortex-A520.
1 parent 551f36e
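
For context: the PNG Sub filter stores each byte as its difference from the corresponding byte of the pixel immediately to the left, so unfiltering a row is a wrapping prefix sum: raw[i] = filt[i] + raw[i - bpp] (mod 256). A minimal scalar sketch of that recurrence, for illustration only (not code from this commit):

    /// Scalar reference for undoing the PNG Sub filter:
    /// raw[i] = filt[i] + raw[i - bpp], all arithmetic wrapping mod 256.
    fn sub_unfilter_scalar(row: &mut [u8], bpp: usize) {
        // The first pixel has no left neighbor, so it is already raw.
        for i in bpp..row.len() {
            row[i] = row[i].wrapping_add(row[i - bpp]);
        }
    }

The commit vectorizes this recurrence for 4 bytes per pixel (e.g. RGBA8), mirroring the existing 3bpp routine, and gates it behind the `unstable` feature.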

2 files changed: +144 −68 lines


src/filter/mod.rs

Lines changed: 5 additions & 0 deletions

@@ -154,6 +154,11 @@ pub(crate) fn unfilter(
             }
         }
         BytesPerPixel::Four => {
+            #[cfg(feature = "unstable")]
+            {
+                simd::sub_unfilter_4bpp(current);
+                return;
+            }
             let mut prev = [0; 4];
             for chunk in current.chunks_exact_mut(4) {
                 let new_chunk = [
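
With the `unstable` feature enabled, the new block calls the SIMD routine and returns early; without it, the `#[cfg]` block compiles away and the existing scalar loop below remains the only path. The same shape as a standalone sketch (the function name and the scalar body are simplified stand-ins, not the crate's exact code):

    #[allow(unreachable_code)] // scalar tail is dead code when the fast path is compiled in
    fn unfilter_sub_4bpp(current: &mut [u8]) {
        #[cfg(feature = "unstable")]
        {
            simd::sub_unfilter_4bpp(current); // SIMD fast path
            return;
        }
        // Portable fallback: raw[i] = filt[i] + raw[i - 4], wrapping.
        for i in 4..current.len() {
            current[i] = current[i].wrapping_add(current[i - 4]);
        }
    }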

src/filter/simd.rs

Lines changed: 139 additions & 68 deletions

@@ -313,74 +313,145 @@ pub fn paeth_unfilter_4bpp(row: &mut [u8], prev_row: &[u8]) {
         a_bpp = new_chunk;
         c_bpp = b_bpp.try_into().unwrap();
     }
+}
+
+/// Unfilters a row of pixels using the Sub filter. Implements via a prefix sum.
+pub fn sub_unfilter_3bpp(current: &mut [u8]) {
+    const BPP: usize = 3;
+    const STRIDE_BYTES: usize = 48; // 16 pixels * 3 bytes/pixel
+    type SimdVector = Simd<u8, STRIDE_BYTES>;
+
+    let mut prev_simd_a: Simd<u8, BPP> = Default::default(); // Unfiltered value of the pixel to the left
+
+    const UNROLL_FACTOR: usize = 2;
+    const UNROLLED_STRIDE_BYTES: usize = STRIDE_BYTES * UNROLL_FACTOR;
+
+    let chunks_unrolled = current.len() / UNROLLED_STRIDE_BYTES;
+    let (simd_current_unrolled, mut remainder_current_simd_tail) =
+        current.split_at_mut(chunks_unrolled * UNROLLED_STRIDE_BYTES);
+
+    for unrolled_chunk in simd_current_unrolled.chunks_exact_mut(UNROLLED_STRIDE_BYTES) {
+        let (chunk1_slice, chunk2_slice) = unrolled_chunk.split_at_mut(STRIDE_BYTES);
+
+        // Process chunk 1
+        let mut x_vec1: SimdVector = SimdVector::from_slice(chunk1_slice);
+        let carry_in_vec1 = prev_simd_a.resize::<STRIDE_BYTES>(0u8);
+        x_vec1 = x_vec1 + carry_in_vec1;
+        x_vec1 = x_vec1 + x_vec1.shift_elements_right::<BPP>(0u8);
+        x_vec1 = x_vec1 + x_vec1.shift_elements_right::<{ 2 * BPP }>(0u8);
+        x_vec1 = x_vec1 + x_vec1.shift_elements_right::<{ 4 * BPP }>(0u8);
+        x_vec1 = x_vec1 + x_vec1.shift_elements_right::<{ 8 * BPP }>(0u8);
+        let prev_simd_a_for_chunk2 = x_vec1.extract::<{ STRIDE_BYTES - BPP }, BPP>();
+        x_vec1.copy_to_slice(chunk1_slice);
+
+        // Process chunk 2
+        let mut x_vec2: SimdVector = SimdVector::from_slice(chunk2_slice);
+        let carry_in_vec2 = prev_simd_a_for_chunk2.resize::<STRIDE_BYTES>(0u8);
+        x_vec2 = x_vec2 + carry_in_vec2;
+        x_vec2 = x_vec2 + x_vec2.shift_elements_right::<BPP>(0u8);
+        x_vec2 = x_vec2 + x_vec2.shift_elements_right::<{ 2 * BPP }>(0u8);
+        x_vec2 = x_vec2 + x_vec2.shift_elements_right::<{ 4 * BPP }>(0u8);
+        x_vec2 = x_vec2 + x_vec2.shift_elements_right::<{ 8 * BPP }>(0u8);
+        prev_simd_a = x_vec2.extract::<{ STRIDE_BYTES - BPP }, BPP>();
+        x_vec2.copy_to_slice(chunk2_slice);
+    }
+
+    // Process any remaining single STRIDE_BYTES chunk
+    if remainder_current_simd_tail.len() >= STRIDE_BYTES {
+        let (chunk_single_slice, scalar_remainder_slice) =
+            remainder_current_simd_tail.split_at_mut(STRIDE_BYTES);
+        let mut x_vec: SimdVector = SimdVector::from_slice(chunk_single_slice);
+        let carry_in_vec = prev_simd_a.resize::<STRIDE_BYTES>(0u8);
+        x_vec = x_vec + carry_in_vec;
+        x_vec = x_vec + x_vec.shift_elements_right::<BPP>(0u8);
+        x_vec = x_vec + x_vec.shift_elements_right::<{ 2 * BPP }>(0u8);
+        x_vec = x_vec + x_vec.shift_elements_right::<{ 4 * BPP }>(0u8);
+        x_vec = x_vec + x_vec.shift_elements_right::<{ 8 * BPP }>(0u8);
+        prev_simd_a = x_vec.extract::<{ STRIDE_BYTES - BPP }, BPP>();
+        x_vec.copy_to_slice(chunk_single_slice);
+        remainder_current_simd_tail = scalar_remainder_slice;
+    }
+
+    // Scalar remainder processing
+    let mut prev_scalar_a = prev_simd_a.to_array();
+    for chunk in remainder_current_simd_tail.chunks_exact_mut(BPP) {
+        let new_chunk = [
+            chunk[0].wrapping_add(prev_scalar_a[0]),
+            chunk[1].wrapping_add(prev_scalar_a[1]),
+            chunk[2].wrapping_add(prev_scalar_a[2]),
+        ];
+        *TryInto::<&mut [u8; BPP]>::try_into(chunk).unwrap() = new_chunk;
+        prev_scalar_a = new_chunk;
+    }
+}
+
+/// Unfilters a row of pixels using the Sub filter. Implements via a prefix sum.
+pub fn sub_unfilter_4bpp(current: &mut [u8]) {
+    const BPP: usize = 4;
+    const STRIDE_BYTES: usize = 64; // 16 pixels * 4 bytes/pixel
+    type SimdVector = Simd<u8, STRIDE_BYTES>;
+
+    let mut prev_pixel_val: Simd<u8, BPP> = Simd::splat(0); // Unfiltered value of the pixel to the left
+
+    const UNROLL_FACTOR: usize = 2;
+    const UNROLLED_STRIDE_BYTES: usize = STRIDE_BYTES * UNROLL_FACTOR;
+
+    let chunks_unrolled = current.len() / UNROLLED_STRIDE_BYTES;
+    let (simd_current_unrolled, mut remainder_current_simd_tail) =
+        current.split_at_mut(chunks_unrolled * UNROLLED_STRIDE_BYTES);
+
+    for unrolled_chunk in simd_current_unrolled.chunks_exact_mut(UNROLLED_STRIDE_BYTES) {
+        let (chunk1_slice, chunk2_slice) = unrolled_chunk.split_at_mut(STRIDE_BYTES);
+
+        // Process chunk 1
+        let mut x_vec1: SimdVector = SimdVector::from_slice(chunk1_slice);
+        let carry_in_vec1 = prev_pixel_val.resize::<STRIDE_BYTES>(0u8);
+        x_vec1 = x_vec1 + carry_in_vec1;
+        x_vec1 = x_vec1 + x_vec1.shift_elements_right::<BPP>(0u8);
+        x_vec1 = x_vec1 + x_vec1.shift_elements_right::<{ 2 * BPP }>(0u8);
+        x_vec1 = x_vec1 + x_vec1.shift_elements_right::<{ 4 * BPP }>(0u8);
+        x_vec1 = x_vec1 + x_vec1.shift_elements_right::<{ 8 * BPP }>(0u8);
+        let prev_pixel_val_for_chunk2 = x_vec1.extract::<{ STRIDE_BYTES - BPP }, BPP>();
+        x_vec1.copy_to_slice(chunk1_slice);
+
+        // Process chunk 2
+        let mut x_vec2: SimdVector = SimdVector::from_slice(chunk2_slice);
+        let carry_in_vec2 = prev_pixel_val_for_chunk2.resize::<STRIDE_BYTES>(0u8);
+        x_vec2 = x_vec2 + carry_in_vec2;
+        x_vec2 = x_vec2 + x_vec2.shift_elements_right::<BPP>(0u8);
+        x_vec2 = x_vec2 + x_vec2.shift_elements_right::<{ 2 * BPP }>(0u8);
+        x_vec2 = x_vec2 + x_vec2.shift_elements_right::<{ 4 * BPP }>(0u8);
+        x_vec2 = x_vec2 + x_vec2.shift_elements_right::<{ 8 * BPP }>(0u8);
+        prev_pixel_val = x_vec2.extract::<{ STRIDE_BYTES - BPP }, BPP>();
+        x_vec2.copy_to_slice(chunk2_slice);
+    }
+
+    // Process any remaining single STRIDE_BYTES chunk
+    if remainder_current_simd_tail.len() >= STRIDE_BYTES {
+        let (chunk_single_slice, scalar_remainder_slice) =
+            remainder_current_simd_tail.split_at_mut(STRIDE_BYTES);
+        let mut x_out: SimdVector = SimdVector::from_slice(chunk_single_slice);
+        let carry_in_vec = prev_pixel_val.resize::<STRIDE_BYTES>(0u8);
+        x_out = x_out + carry_in_vec;
+        x_out = x_out + x_out.shift_elements_right::<BPP>(0u8);
+        x_out = x_out + x_out.shift_elements_right::<{ 2 * BPP }>(0u8);
+        x_out = x_out + x_out.shift_elements_right::<{ 4 * BPP }>(0u8);
+        x_out = x_out + x_out.shift_elements_right::<{ 8 * BPP }>(0u8);
+        prev_pixel_val = x_out.extract::<{ STRIDE_BYTES - BPP }, BPP>();
+        x_out.copy_to_slice(chunk_single_slice);
+        remainder_current_simd_tail = scalar_remainder_slice;
+    }
 
-/// Unfilters a row of pixels using the Sub filter. Implements via a prefix sum.
-pub fn sub_unfilter_3bpp(current: &mut [u8]) {
-    const BPP: usize = 3;
-    const STRIDE_BYTES: usize = 48; // 16 pixels * 3 bytes/pixel
-    type SimdVector = Simd<u8, STRIDE_BYTES>;
-
-    let mut prev_simd_a: Simd<u8, BPP> = Default::default(); // Unfiltered value of the pixel to the left
-
-    const UNROLL_FACTOR: usize = 2;
-    const UNROLLED_STRIDE_BYTES: usize = STRIDE_BYTES * UNROLL_FACTOR;
-
-    let chunks_unrolled = current.len() / UNROLLED_STRIDE_BYTES;
-    let (simd_current_unrolled, mut remainder_current_simd_tail) =
-        current.split_at_mut(chunks_unrolled * UNROLLED_STRIDE_BYTES);
-
-    for unrolled_chunk in simd_current_unrolled.chunks_exact_mut(UNROLLED_STRIDE_BYTES) {
-        let (chunk1_slice, chunk2_slice) = unrolled_chunk.split_at_mut(STRIDE_BYTES);
-
-        // Process chunk 1
-        let mut x_vec1: SimdVector = SimdVector::from_slice(chunk1_slice);
-        let carry_in_vec1 = prev_simd_a.resize::<STRIDE_BYTES>(0u8);
-        x_vec1 = x_vec1 + carry_in_vec1;
-        x_vec1 = x_vec1 + x_vec1.shift_elements_right::<BPP>(0u8);
-        x_vec1 = x_vec1 + x_vec1.shift_elements_right::<{ 2 * BPP }>(0u8);
-        x_vec1 = x_vec1 + x_vec1.shift_elements_right::<{ 4 * BPP }>(0u8);
-        x_vec1 = x_vec1 + x_vec1.shift_elements_right::<{ 8 * BPP }>(0u8);
-        let prev_simd_a_for_chunk2 = x_vec1.extract::<{ STRIDE_BYTES - BPP }, BPP>();
-        x_vec1.copy_to_slice(chunk1_slice);
-
-        // Process chunk 2
-        let mut x_vec2: SimdVector = SimdVector::from_slice(chunk2_slice);
-        let carry_in_vec2 = prev_simd_a_for_chunk2.resize::<STRIDE_BYTES>(0u8);
-        x_vec2 = x_vec2 + carry_in_vec2;
-        x_vec2 = x_vec2 + x_vec2.shift_elements_right::<BPP>(0u8);
-        x_vec2 = x_vec2 + x_vec2.shift_elements_right::<{ 2 * BPP }>(0u8);
-        x_vec2 = x_vec2 + x_vec2.shift_elements_right::<{ 4 * BPP }>(0u8);
-        x_vec2 = x_vec2 + x_vec2.shift_elements_right::<{ 8 * BPP }>(0u8);
-        prev_simd_a = x_vec2.extract::<{ STRIDE_BYTES - BPP }, BPP>();
-        x_vec2.copy_to_slice(chunk2_slice);
-    }
-
-    // Process any remaining single STRIDE_BYTES chunk
-    if remainder_current_simd_tail.len() >= STRIDE_BYTES {
-        let (chunk_single_slice, scalar_remainder_slice) =
-            remainder_current_simd_tail.split_at_mut(STRIDE_BYTES);
-        let mut x_vec: SimdVector = SimdVector::from_slice(chunk_single_slice);
-        let carry_in_vec = prev_simd_a.resize::<STRIDE_BYTES>(0u8);
-        x_vec = x_vec + carry_in_vec;
-        x_vec = x_vec + x_vec.shift_elements_right::<BPP>(0u8);
-        x_vec = x_vec + x_vec.shift_elements_right::<{ 2 * BPP }>(0u8);
-        x_vec = x_vec + x_vec.shift_elements_right::<{ 4 * BPP }>(0u8);
-        x_vec = x_vec + x_vec.shift_elements_right::<{ 8 * BPP }>(0u8);
-        prev_simd_a = x_vec.extract::<{ STRIDE_BYTES - BPP }, BPP>();
-        x_vec.copy_to_slice(chunk_single_slice);
-        remainder_current_simd_tail = scalar_remainder_slice;
-    }
-
-    // Scalar remainder processing
-    let mut prev_scalar_a = prev_simd_a.to_array();
-    for chunk in remainder_current_simd_tail.chunks_exact_mut(BPP) {
-        let new_chunk = [
-            chunk[0].wrapping_add(prev_scalar_a[0]),
-            chunk[1].wrapping_add(prev_scalar_a[1]),
-            chunk[2].wrapping_add(prev_scalar_a[2]),
-        ];
-        *TryInto::<&mut [u8; BPP]>::try_into(chunk).unwrap() = new_chunk;
-        prev_scalar_a = new_chunk;
-    }
+    // Scalar remainder processing
+    let mut prev_scalar = prev_pixel_val.to_array();
+    for chunk in remainder_current_simd_tail.chunks_exact_mut(BPP) {
+        let new_chunk = [
+            chunk[0].wrapping_add(prev_scalar[0]),
+            chunk[1].wrapping_add(prev_scalar[1]),
+            chunk[2].wrapping_add(prev_scalar[2]),
+            chunk[3].wrapping_add(prev_scalar[3]),
+        ];
+        *core::convert::TryInto::<&mut [u8; BPP]>::try_into(chunk).unwrap() = new_chunk;
+        prev_scalar = new_chunk;
     }
 }
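
Both sub_unfilter_3bpp and sub_unfilter_4bpp follow the same plan: load 16 pixels into one vector, add the carry-in (the previous chunk's last unfiltered pixel, placed in the first BPP lanes by resize), then run a logarithmic inclusive scan by adding the vector shifted right by 1, 2, 4, and 8 pixels. Because every shift is a multiple of BPP, lanes only ever combine bytes of the same channel, and after log2(16) = 4 shift-add steps each lane holds the wrapping sum of its own filtered byte and all earlier same-channel bytes in the chunk, which is exactly the unfiltered value. The two-chunk unrolling overlaps the scans of adjacent 16-pixel blocks to hide latency. A quick equivalence check against the scalar recurrence, as a hypothetical test (assumes a nightly toolchain with portable_simd and the crate's `unstable` feature):

    #[test]
    fn sub_unfilter_4bpp_matches_scalar() {
        // Long enough to exercise the unrolled 128-byte loop, the single
        // 64-byte chunk, and the scalar tail: 1017 pixels = 4068 bytes.
        let filtered: Vec<u8> = (0u8..=255).cycle().take(4 * 1017).collect();

        let mut simd_row = filtered.clone();
        sub_unfilter_4bpp(&mut simd_row);

        // Scalar reference: raw[i] = filt[i] + raw[i - 4], wrapping.
        let mut scalar_row = filtered;
        for i in 4..scalar_row.len() {
            scalar_row[i] = scalar_row[i].wrapping_add(scalar_row[i - 4]);
        }

        assert_eq!(simd_row, scalar_row);
    }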
