@@ -679,6 +679,19 @@ struct VECTOR_SUB
679679 // src1/src2.
680680 e.vpsubd (e.xmm1 , src1, src2);
681681
682+ if (e.IsFeatureEnabled (kX64EmitAVX512Ortho )) {
683+ // If the result is less than or equal to the first operand,
684+ // then we did not underflow.
685+ Opmask not_underflow = e.k1 ;
686+ // _mm_cmple_epu32_mask
687+ e.vpcmpud (not_underflow, e.xmm1 , src1, 0x2 );
688+
689+ // Copy over values that did not underflow, write zero
690+ // everywhere else
691+ e.vmovdqa32 (dest | not_underflow | e.T_z , e.xmm1 );
692+ return ;
693+ }
694+
682695 // If result is greater than either of the inputs, we've
683696 // underflowed (only need to check one input)
684697 // if (res > src1) then underflowed
@@ -690,6 +703,21 @@ struct VECTOR_SUB
690703 } else {
691704 e.vpsubd (e.xmm1 , src1, src2);
692705
706+ if (e.IsFeatureEnabled (kX64EmitAVX512Ortho |
707+ kX64EmitAVX512DQ )) {
708+ e.vmovdqa32 (e.xmm3 , src1);
709+ e.vpternlogd (e.xmm3 , e.xmm1 , src2, 0b00011000 );
710+
711+ const Opmask saturate = e.k1 ;
712+ e.vpmovd2m (saturate, e.xmm3 );
713+
714+ e.vpsrad (e.xmm2 , e.xmm1 , 31 );
715+ e.vpxord (e.xmm2 , e.xmm2 , e.GetXmmConstPtr (XMMSignMaskI32));
716+
717+ e.vpblendmd (dest | saturate, e.xmm1 , e.xmm2 );
718+ return ;
719+ }
720+
693721 // We can only overflow if the signs of the operands are
694722 // opposite. If signs are opposite and result sign isn't the
695723 // same as src1's sign, we've overflowed. if ((s32b)((src1 ^
0 commit comments