[DAGCombiner] fold bit-hack form of usubsat

(i8 X ^ 128) & (i8 X s>> 7) --> usubsat X, 128

I haven't found a generalization of this identity:
https://alive2.llvm.org/ce/z/_sriEQ
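
As a sanity check (not part of the patch), a brute-force verification of the i8
form of the identity in plain C++; the usubsat8 helper is just an illustrative
model of ISD::USUBSAT:

#include <cassert>
#include <cstdint>

// Reference model of unsigned saturating subtraction: clamp at 0, no wrap.
static uint8_t usubsat8(uint8_t X, uint8_t Y) {
  return X > Y ? uint8_t(X - Y) : uint8_t(0);
}

int main() {
  for (unsigned I = 0; I < 256; ++I) {
    uint8_t X = uint8_t(I);
    // Models "i8 X s>> 7": all-zeros when the sign bit is clear,
    // all-ones when it is set.
    uint8_t SignSplat = (X & 0x80) ? 0xff : 0x00;
    // (i8 X ^ 128) & (i8 X s>> 7) == usubsat X, 128 for every X.
    assert(uint8_t((X ^ 0x80) & SignSplat) == usubsat8(X, 0x80));
  }
  return 0;
}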

Note: I was actually looking at the first form of the pattern in that link,
but that's part of a long chain of potential missed transforms in codegen
and IR... that I hope ends here!

The predicates for when this is profitable are a bit tricky. This version of
the patch excludes multi-use patterns but allows custom lowering (as opposed
to legal-only).

On x86, for example, we have custom lowering for some vector types, and that
uses umax and sub. So to enable that fold, we need to add use checks to avoid
regressions. Even with legal-only lowering, we could see code with extra
register-move instructions for extra uses, so that constraint would have to be
eased very carefully to avoid penalties.
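
For reference, a scalar sketch (not from the patch) of why the umax+sub custom
expansion is still profitable in the single-use case: it matches usubsat in two
operations versus the three (ashr, xor, and) of the bit-hack. The helper names
below are made up for illustration; the umax-based form is what the x86
vpmaxud+vpsubd output in the tests corresponds to.

#include <algorithm>
#include <cassert>
#include <cstdint>

// Reference model of unsigned saturating subtraction.
static uint32_t usubsat32(uint32_t X, uint32_t C) {
  return X > C ? X - C : 0u;
}

// The umax+sub form a custom lowering can use: max(X, C) - C never wraps
// and gives the same result as usubsat(X, C).
static uint32_t usubsatViaUmax(uint32_t X, uint32_t C) {
  return std::max(X, C) - C;
}

int main() {
  const uint32_t SignMask = 0x80000000u;
  // Spot-check values on both sides of the sign mask.
  for (uint64_t I = 0; I <= UINT32_MAX; I += 65521)
    assert(usubsatViaUmax(uint32_t(I), SignMask) == usubsat32(uint32_t(I), SignMask));
  return 0;
}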

Differential Revision: https://reviews.llvm.org/D112085
Sanjay Patel 2021-10-21 09:06:37 -04:00
parent fa111d3085
commit d2198771e9
3 changed files with 75 additions and 65 deletions


@@ -5627,6 +5627,35 @@ static SDValue combineShiftAnd1ToBitTest(SDNode *And, SelectionDAG &DAG) {
  return DAG.getZExtOrTrunc(Setcc, DL, VT);
}
/// For targets that support usubsat, match a bit-hack form of that operation
/// that ends in 'and' and convert it.
static SDValue foldAndToUsubsat(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N1.getValueType();

  // Canonicalize xor as operand 0.
  if (N1.getOpcode() == ISD::XOR)
    std::swap(N0, N1);

  if (N0.getOpcode() != ISD::XOR || N1.getOpcode() != ISD::SRA ||
      !N0.hasOneUse() || !N1.hasOneUse() ||
      N0.getOperand(0) != N1.getOperand(0))
    return SDValue();

  unsigned BitWidth = VT.getScalarSizeInBits();
  ConstantSDNode *XorC = isConstOrConstSplat(N0.getOperand(1), true);
  ConstantSDNode *SraC = isConstOrConstSplat(N1.getOperand(1), true);
  if (!XorC || !XorC->getAPIntValue().isSignMask() ||
      !SraC || SraC->getAPIntValue() != BitWidth - 1)
    return SDValue();

  // (i8 X ^ 128) & (i8 X s>> 7) --> usubsat X, 128
  SDLoc DL(N);
  SDValue SignMask = DAG.getConstant(XorC->getAPIntValue(), DL, VT);
  return DAG.getNode(ISD::USUBSAT, DL, VT, N0.getOperand(0), SignMask);
}
SDValue DAGCombiner::visitAND(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
@@ -5989,6 +6018,10 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
  if (IsAndZeroExtMask(N0, N1))
    return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, N0.getOperand(0));

  if (hasOperation(ISD::USUBSAT, VT))
    if (SDValue V = foldAndToUsubsat(N, DAG))
      return V;

  return SDValue();
}


@@ -86,26 +86,22 @@ define i16 @usubsat_as_bithack_i16(i16 %x) {
; GFX8-LABEL: usubsat_as_bithack_i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_ashrrev_i16_e32 v1, 15, v0
; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
; GFX8-NEXT: v_and_b32_e32 v0, v1, v0
; GFX8-NEXT: s_movk_i32 s4, 0x8000
; GFX8-NEXT: v_sub_u16_e64 v0, v0, s4 clamp
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: usubsat_as_bithack_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_ashrrev_i16_e32 v1, 15, v0
; GFX9-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
; GFX9-NEXT: v_and_b32_e32 v0, v1, v0
; GFX9-NEXT: s_movk_i32 s4, 0x8000
; GFX9-NEXT: v_sub_u16_e64 v0, v0, s4 clamp
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: usubsat_as_bithack_i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_ashrrev_i16 v1, 15, v0
; GFX10-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
; GFX10-NEXT: v_and_b32_e32 v0, v1, v0
; GFX10-NEXT: v_sub_nc_u16 v0, v0, 0x8000 clamp
; GFX10-NEXT: s_setpc_b64 s[30:31]
%signsplat = ashr i16 %x, 15
%flipsign = xor i16 %x, 32768


@@ -34,37 +34,21 @@ vector.ph:
define <8 x i16> @ashr_xor_and(<8 x i16> %x) nounwind {
; SSE-LABEL: ashr_xor_and:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psraw $15, %xmm1
; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: ashr_xor_and:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsraw $15, %xmm0, %xmm1
; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ashr_xor_and:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsraw $15, %xmm0, %xmm1
; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: ashr_xor_and:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsraw $15, %xmm0, %xmm1
; AVX512-NEXT: vpternlogq $72, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
; AVX512-NEXT: retq
; AVX-LABEL: ashr_xor_and:
; AVX: # %bb.0:
; AVX-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
%signsplat = ashr <8 x i16> %x, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
%flipsign = xor <8 x i16> %x, <i16 undef, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768>
%res = and <8 x i16> %signsplat, %flipsign
ret <8 x i16> %res
}
; negative test - extra uses may lead to extra instructions when custom-lowered
define <16 x i8> @ashr_xor_and_commute_uses(<16 x i8> %x, <16 x i8>* %p1, <16 x i8>* %p2) nounwind {
; SSE-LABEL: ashr_xor_and_commute_uses:
; SSE: # %bb.0:
@@ -94,27 +78,33 @@ define <16 x i8> @ashr_xor_and_commute_uses(<16 x i8> %x, <16 x i8>* %p1, <16 x
}
define <4 x i32> @ashr_xor_and_custom(<4 x i32> %x) nounwind {
; SSE-LABEL: ashr_xor_and_custom:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrad $31, %xmm1
; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: retq
; SSE2OR3-LABEL: ashr_xor_and_custom:
; SSE2OR3: # %bb.0:
; SSE2OR3-NEXT: movdqa %xmm0, %xmm1
; SSE2OR3-NEXT: psrad $31, %xmm1
; SSE2OR3-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2OR3-NEXT: pand %xmm1, %xmm0
; SSE2OR3-NEXT: retq
;
; SSE41-LABEL: ashr_xor_and_custom:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; SSE41-NEXT: pmaxud %xmm1, %xmm0
; SSE41-NEXT: psubd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: ashr_xor_and_custom:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ashr_xor_and_custom:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrad $31, %xmm0, %xmm1
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: ashr_xor_and_custom:
@@ -349,37 +339,28 @@ vector.ph:
define <16 x i16> @ashr_xor_and_v16i16(<16 x i16> %x) nounwind {
; SSE-LABEL: ashr_xor_and_v16i16:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: psraw $15, %xmm2
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: psraw $15, %xmm3
; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: pand %xmm3, %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; SSE-NEXT: psubusw %xmm2, %xmm0
; SSE-NEXT: psubusw %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: ashr_xor_and_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsraw $15, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsraw $15, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; AVX1-NEXT: vpsubusw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ashr_xor_and_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsraw $15, %ymm0, %ymm1
; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: ashr_xor_and_v16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsraw $15, %ymm0, %ymm1
; AVX512-NEXT: vpternlogq $72, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
; AVX512-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
%signsplat = ashr <16 x i16> %x, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
%flipsign = xor <16 x i16> %x, <i16 undef, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768>