Skip to content

Commit c873049

Browse files
committed
[x64] Add AVX512 optimization for OPCODE_VECTOR_SUB (saturated)
Passes the `vsubuws` and `vsubsws` unit tests from #1348
1 parent 45050b2 commit c873049

File tree

1 file changed

+28
-0
lines changed

1 file changed

+28
-0
lines changed

src/xenia/cpu/backend/x64/x64_seq_vector.cc

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -679,6 +679,19 @@ struct VECTOR_SUB
679679
// src1/src2.
680680
e.vpsubd(e.xmm1, src1, src2);
681681

682+
if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) {
683+
// If the result is less than or equal to the first operand then
684+
// we did not underflow
685+
Opmask not_underflow = e.k1;
686+
// _mm_cmple_epu32_mask
687+
e.vpcmpud(not_underflow, e.xmm1, src1, 0x2);
688+
689+
// Copy over values that did not underflow, write zero
690+
// everywhere else
691+
e.vmovdqa32(dest | not_underflow | e.T_z, e.xmm1);
692+
return;
693+
}
694+
682695
// If result is greater than either of the inputs, we've
683696
// underflowed (only need to check one input)
684697
// if (res > src1) then underflowed
@@ -690,6 +703,21 @@ struct VECTOR_SUB
690703
} else {
691704
e.vpsubd(e.xmm1, src1, src2);
692705

706+
if (e.IsFeatureEnabled(kX64EmitAVX512Ortho |
707+
kX64EmitAVX512DQ)) {
708+
e.vmovdqa32(e.xmm3, src1);
709+
e.vpternlogd(e.xmm3, e.xmm1, src2, 0b00011000);
710+
711+
const Opmask saturate = e.k1;
712+
e.vpmovd2m(saturate, e.xmm3);
713+
714+
e.vpsrad(e.xmm2, e.xmm1, 31);
715+
e.vpxord(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMSignMaskI32));
716+
717+
e.vpblendmd(dest | saturate, e.xmm1, e.xmm2);
718+
return;
719+
}
720+
693721
// We can only overflow if the signs of the operands are
694722
// opposite. If signs are opposite and result sign isn't the
695723
// same as src1's sign, we've overflowed. if ((s32b)((src1 ^

0 commit comments

Comments
 (0)