@@ -313,74 +313,145 @@ pub fn paeth_unfilter_4bpp(row: &mut [u8], prev_row: &[u8]) {
         a_bpp = new_chunk;
         c_bpp = b_bpp.try_into().unwrap();
     }
+}
317+
+/// Unfilters a row of pixels using the Sub filter. Implemented via a SIMD prefix sum.
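+///
+/// For the Sub filter, each reconstructed byte is `Recon(x) = Filt(x) + Recon(a)`
+/// (wrapping), where `a` is the byte `BPP` positions earlier in the row, so a whole
+/// row can be unfiltered with one wrapping prefix sum per channel.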
+pub fn sub_unfilter_3bpp(current: &mut [u8]) {
+    const BPP: usize = 3;
+    const STRIDE_BYTES: usize = 48; // 16 pixels * 3 bytes/pixel
+    type SimdVector = Simd<u8, STRIDE_BYTES>;
+
+    let mut prev_simd_a: Simd<u8, BPP> = Default::default(); // Unfiltered value of the pixel to the left
+
+    const UNROLL_FACTOR: usize = 2;
+    const UNROLLED_STRIDE_BYTES: usize = STRIDE_BYTES * UNROLL_FACTOR;
+
+    let chunks_unrolled = current.len() / UNROLLED_STRIDE_BYTES;
+    let (simd_current_unrolled, mut remainder_current_simd_tail) =
+        current.split_at_mut(chunks_unrolled * UNROLLED_STRIDE_BYTES);
+
+    for unrolled_chunk in simd_current_unrolled.chunks_exact_mut(UNROLLED_STRIDE_BYTES) {
+        let (chunk1_slice, chunk2_slice) = unrolled_chunk.split_at_mut(STRIDE_BYTES);
+
+        // Process chunk 1
+        let mut x_vec1: SimdVector = SimdVector::from_slice(chunk1_slice);
+        let carry_in_vec1 = prev_simd_a.resize::<STRIDE_BYTES>(0u8);
+        x_vec1 = x_vec1 + carry_in_vec1;
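+        // Hillis-Steele inclusive scan: each shift-add doubles the number of
+        // preceding pixels folded into every lane (1, 2, 4, then 8 pixels), so
+        // after four steps lane i holds the wrapping sum of pixels 0..=i.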
+        x_vec1 = x_vec1 + x_vec1.shift_elements_right::<BPP>(0u8);
+        x_vec1 = x_vec1 + x_vec1.shift_elements_right::<{ 2 * BPP }>(0u8);
+        x_vec1 = x_vec1 + x_vec1.shift_elements_right::<{ 4 * BPP }>(0u8);
+        x_vec1 = x_vec1 + x_vec1.shift_elements_right::<{ 8 * BPP }>(0u8);
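+        // The last pixel of chunk 1 is now fully unfiltered; extract it as the
+        // carry-in for chunk 2.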
+        let prev_simd_a_for_chunk2 = x_vec1.extract::<{ STRIDE_BYTES - BPP }, BPP>();
+        x_vec1.copy_to_slice(chunk1_slice);
+
+        // Process chunk 2
+        let mut x_vec2: SimdVector = SimdVector::from_slice(chunk2_slice);
+        let carry_in_vec2 = prev_simd_a_for_chunk2.resize::<STRIDE_BYTES>(0u8);
+        x_vec2 = x_vec2 + carry_in_vec2;
+        x_vec2 = x_vec2 + x_vec2.shift_elements_right::<BPP>(0u8);
+        x_vec2 = x_vec2 + x_vec2.shift_elements_right::<{ 2 * BPP }>(0u8);
+        x_vec2 = x_vec2 + x_vec2.shift_elements_right::<{ 4 * BPP }>(0u8);
+        x_vec2 = x_vec2 + x_vec2.shift_elements_right::<{ 8 * BPP }>(0u8);
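+        // Carry the last unfiltered pixel of chunk 2 into the next iteration.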
+        prev_simd_a = x_vec2.extract::<{ STRIDE_BYTES - BPP }, BPP>();
+        x_vec2.copy_to_slice(chunk2_slice);
+    }
+
+    // Process any remaining single STRIDE_BYTES chunk
+    if remainder_current_simd_tail.len() >= STRIDE_BYTES {
+        let (chunk_single_slice, scalar_remainder_slice) =
+            remainder_current_simd_tail.split_at_mut(STRIDE_BYTES);
+        let mut x_vec: SimdVector = SimdVector::from_slice(chunk_single_slice);
+        let carry_in_vec = prev_simd_a.resize::<STRIDE_BYTES>(0u8);
+        x_vec = x_vec + carry_in_vec;
+        x_vec = x_vec + x_vec.shift_elements_right::<BPP>(0u8);
+        x_vec = x_vec + x_vec.shift_elements_right::<{ 2 * BPP }>(0u8);
+        x_vec = x_vec + x_vec.shift_elements_right::<{ 4 * BPP }>(0u8);
+        x_vec = x_vec + x_vec.shift_elements_right::<{ 8 * BPP }>(0u8);
+        prev_simd_a = x_vec.extract::<{ STRIDE_BYTES - BPP }, BPP>();
+        x_vec.copy_to_slice(chunk_single_slice);
+        remainder_current_simd_tail = scalar_remainder_slice;
+    }
+
+    // Scalar remainder processing
+    let mut prev_scalar_a = prev_simd_a.to_array();
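+    // Handle the tail (fewer than 16 pixels) one pixel at a time.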
+    for chunk in remainder_current_simd_tail.chunks_exact_mut(BPP) {
+        let new_chunk = [
+            chunk[0].wrapping_add(prev_scalar_a[0]),
+            chunk[1].wrapping_add(prev_scalar_a[1]),
+            chunk[2].wrapping_add(prev_scalar_a[2]),
+        ];
+        *TryInto::<&mut [u8; BPP]>::try_into(chunk).unwrap() = new_chunk;
+        prev_scalar_a = new_chunk;
+    }
+}
+
+/// Unfilters a row of pixels using the Sub filter. Implemented via a SIMD prefix sum.
+pub fn sub_unfilter_4bpp(current: &mut [u8]) {
+    const BPP: usize = 4;
+    const STRIDE_BYTES: usize = 64; // 16 pixels * 4 bytes/pixel
+    type SimdVector = Simd<u8, STRIDE_BYTES>;
+
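+    // Same scan structure as sub_unfilter_3bpp, with 4-byte pixels
+    // (16 pixels = 64 bytes per SIMD chunk).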
+    let mut prev_pixel_val: Simd<u8, BPP> = Simd::splat(0); // Unfiltered value of the pixel to the left
+
+    const UNROLL_FACTOR: usize = 2;
+    const UNROLLED_STRIDE_BYTES: usize = STRIDE_BYTES * UNROLL_FACTOR;
+
+    let chunks_unrolled = current.len() / UNROLLED_STRIDE_BYTES;
+    let (simd_current_unrolled, mut remainder_current_simd_tail) =
+        current.split_at_mut(chunks_unrolled * UNROLLED_STRIDE_BYTES);
+
+    for unrolled_chunk in simd_current_unrolled.chunks_exact_mut(UNROLLED_STRIDE_BYTES) {
+        let (chunk1_slice, chunk2_slice) = unrolled_chunk.split_at_mut(STRIDE_BYTES);
+
+        // Process chunk 1
+        let mut x_vec1: SimdVector = SimdVector::from_slice(chunk1_slice);
+        let carry_in_vec1 = prev_pixel_val.resize::<STRIDE_BYTES>(0u8);
+        x_vec1 = x_vec1 + carry_in_vec1;
+        x_vec1 = x_vec1 + x_vec1.shift_elements_right::<BPP>(0u8);
+        x_vec1 = x_vec1 + x_vec1.shift_elements_right::<{ 2 * BPP }>(0u8);
+        x_vec1 = x_vec1 + x_vec1.shift_elements_right::<{ 4 * BPP }>(0u8);
+        x_vec1 = x_vec1 + x_vec1.shift_elements_right::<{ 8 * BPP }>(0u8);
+        let prev_pixel_val_for_chunk2 = x_vec1.extract::<{ STRIDE_BYTES - BPP }, BPP>();
+        x_vec1.copy_to_slice(chunk1_slice);
+
+        // Process chunk 2
+        let mut x_vec2: SimdVector = SimdVector::from_slice(chunk2_slice);
+        let carry_in_vec2 = prev_pixel_val_for_chunk2.resize::<STRIDE_BYTES>(0u8);
+        x_vec2 = x_vec2 + carry_in_vec2;
+        x_vec2 = x_vec2 + x_vec2.shift_elements_right::<BPP>(0u8);
+        x_vec2 = x_vec2 + x_vec2.shift_elements_right::<{ 2 * BPP }>(0u8);
+        x_vec2 = x_vec2 + x_vec2.shift_elements_right::<{ 4 * BPP }>(0u8);
+        x_vec2 = x_vec2 + x_vec2.shift_elements_right::<{ 8 * BPP }>(0u8);
+        prev_pixel_val = x_vec2.extract::<{ STRIDE_BYTES - BPP }, BPP>();
+        x_vec2.copy_to_slice(chunk2_slice);
+    }
+
+    // Process any remaining single STRIDE_BYTES chunk
+    if remainder_current_simd_tail.len() >= STRIDE_BYTES {
+        let (chunk_single_slice, scalar_remainder_slice) =
+            remainder_current_simd_tail.split_at_mut(STRIDE_BYTES);
+        let mut x_out: SimdVector = SimdVector::from_slice(chunk_single_slice);
+        let carry_in_vec = prev_pixel_val.resize::<STRIDE_BYTES>(0u8);
+        x_out = x_out + carry_in_vec;
+        x_out = x_out + x_out.shift_elements_right::<BPP>(0u8);
+        x_out = x_out + x_out.shift_elements_right::<{ 2 * BPP }>(0u8);
+        x_out = x_out + x_out.shift_elements_right::<{ 4 * BPP }>(0u8);
+        x_out = x_out + x_out.shift_elements_right::<{ 8 * BPP }>(0u8);
+        prev_pixel_val = x_out.extract::<{ STRIDE_BYTES - BPP }, BPP>();
+        x_out.copy_to_slice(chunk_single_slice);
+        remainder_current_simd_tail = scalar_remainder_slice;
+    }
 
-/// Unfilters a row of pixels using the Sub filter. Implemented via a SIMD prefix sum.
-pub fn sub_unfilter_3bpp(current: &mut [u8]) {
-    const BPP: usize = 3;
-    const STRIDE_BYTES: usize = 48; // 16 pixels * 3 bytes/pixel
-    type SimdVector = Simd<u8, STRIDE_BYTES>;
-
-    let mut prev_simd_a: Simd<u8, BPP> = Default::default(); // Unfiltered value of the pixel to the left
-
-    const UNROLL_FACTOR: usize = 2;
-    const UNROLLED_STRIDE_BYTES: usize = STRIDE_BYTES * UNROLL_FACTOR;
-
-    let chunks_unrolled = current.len() / UNROLLED_STRIDE_BYTES;
-    let (simd_current_unrolled, mut remainder_current_simd_tail) =
-        current.split_at_mut(chunks_unrolled * UNROLLED_STRIDE_BYTES);
-
-    for unrolled_chunk in simd_current_unrolled.chunks_exact_mut(UNROLLED_STRIDE_BYTES) {
-        let (chunk1_slice, chunk2_slice) = unrolled_chunk.split_at_mut(STRIDE_BYTES);
-
-        // Process chunk 1
-        let mut x_vec1: SimdVector = SimdVector::from_slice(chunk1_slice);
-        let carry_in_vec1 = prev_simd_a.resize::<STRIDE_BYTES>(0u8);
-        x_vec1 = x_vec1 + carry_in_vec1;
-        x_vec1 = x_vec1 + x_vec1.shift_elements_right::<BPP>(0u8);
-        x_vec1 = x_vec1 + x_vec1.shift_elements_right::<{ 2 * BPP }>(0u8);
-        x_vec1 = x_vec1 + x_vec1.shift_elements_right::<{ 4 * BPP }>(0u8);
-        x_vec1 = x_vec1 + x_vec1.shift_elements_right::<{ 8 * BPP }>(0u8);
-        let prev_simd_a_for_chunk2 = x_vec1.extract::<{ STRIDE_BYTES - BPP }, BPP>();
-        x_vec1.copy_to_slice(chunk1_slice);
-
-        // Process chunk 2
-        let mut x_vec2: SimdVector = SimdVector::from_slice(chunk2_slice);
-        let carry_in_vec2 = prev_simd_a_for_chunk2.resize::<STRIDE_BYTES>(0u8);
-        x_vec2 = x_vec2 + carry_in_vec2;
-        x_vec2 = x_vec2 + x_vec2.shift_elements_right::<BPP>(0u8);
-        x_vec2 = x_vec2 + x_vec2.shift_elements_right::<{ 2 * BPP }>(0u8);
-        x_vec2 = x_vec2 + x_vec2.shift_elements_right::<{ 4 * BPP }>(0u8);
-        x_vec2 = x_vec2 + x_vec2.shift_elements_right::<{ 8 * BPP }>(0u8);
-        prev_simd_a = x_vec2.extract::<{ STRIDE_BYTES - BPP }, BPP>();
-        x_vec2.copy_to_slice(chunk2_slice);
-    }
-
-    // Process any remaining single STRIDE_BYTES chunk
-    if remainder_current_simd_tail.len() >= STRIDE_BYTES {
-        let (chunk_single_slice, scalar_remainder_slice) =
-            remainder_current_simd_tail.split_at_mut(STRIDE_BYTES);
-        let mut x_vec: SimdVector = SimdVector::from_slice(chunk_single_slice);
-        let carry_in_vec = prev_simd_a.resize::<STRIDE_BYTES>(0u8);
-        x_vec = x_vec + carry_in_vec;
-        x_vec = x_vec + x_vec.shift_elements_right::<BPP>(0u8);
-        x_vec = x_vec + x_vec.shift_elements_right::<{ 2 * BPP }>(0u8);
-        x_vec = x_vec + x_vec.shift_elements_right::<{ 4 * BPP }>(0u8);
-        x_vec = x_vec + x_vec.shift_elements_right::<{ 8 * BPP }>(0u8);
-        prev_simd_a = x_vec.extract::<{ STRIDE_BYTES - BPP }, BPP>();
-        x_vec.copy_to_slice(chunk_single_slice);
-        remainder_current_simd_tail = scalar_remainder_slice;
-    }
-
-    // Scalar remainder processing
-    let mut prev_scalar_a = prev_simd_a.to_array();
-    for chunk in remainder_current_simd_tail.chunks_exact_mut(BPP) {
-        let new_chunk = [
-            chunk[0].wrapping_add(prev_scalar_a[0]),
-            chunk[1].wrapping_add(prev_scalar_a[1]),
-            chunk[2].wrapping_add(prev_scalar_a[2]),
-        ];
-        *TryInto::<&mut [u8; BPP]>::try_into(chunk).unwrap() = new_chunk;
-        prev_scalar_a = new_chunk;
-    }
+    // Scalar remainder processing
+    let mut prev_scalar = prev_pixel_val.to_array();
+    for chunk in remainder_current_simd_tail.chunks_exact_mut(BPP) {
+        let new_chunk = [
+            chunk[0].wrapping_add(prev_scalar[0]),
+            chunk[1].wrapping_add(prev_scalar[1]),
+            chunk[2].wrapping_add(prev_scalar[2]),
+            chunk[3].wrapping_add(prev_scalar[3]),
+        ];
+        *core::convert::TryInto::<&mut [u8; BPP]>::try_into(chunk).unwrap() = new_chunk;
+        prev_scalar = new_chunk;
     }
 }
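
A quick way to sanity-check these routines is to compare them against the obvious scalar Sub unfilter on row lengths that exercise every path: empty, scalar-only tail, one SIMD chunk, an unrolled pair, and a ragged length. A minimal sketch, assuming the crate builds with the nightly `portable_simd` feature; the test name, the `sub_unfilter_scalar` helper, and the row lengths below are illustrative, not part of this commit:

    // Scalar reference: Recon(x) = Filt(x) + Recon(a), wrapping, per byte.
    fn sub_unfilter_scalar(bpp: usize, row: &mut [u8]) {
        for i in bpp..row.len() {
            row[i] = row[i].wrapping_add(row[i - bpp]);
        }
    }

    #[test]
    fn sub_unfilter_3bpp_matches_scalar() {
        // Lengths are multiples of BPP = 3: the SIMD routine leaves a trailing
        // partial pixel untouched, which a full PNG row never has.
        for len in [0usize, 9, 48, 96, 150] {
            let filtered: Vec<u8> = (0..len).map(|i| (i * 31 + 7) as u8).collect();
            let mut expected = filtered.clone();
            sub_unfilter_scalar(3, &mut expected);
            let mut actual = filtered;
            sub_unfilter_3bpp(&mut actual);
            assert_eq!(expected, actual);
        }
    }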