[TargetLowering] try harder to determine undef elements of vector binops

This might be the start of tracking all vector element constants generally if we take it to its logical conclusion, but let's stop here and make sure this is correct/beneficial so far. The affected tests require a convoluted path before they get simplified currently because we don't call SimplifyDemandedVectorElts() from binops directly and don't modify the binop operands directly in SimplifyDemandedVectorElts(). That's why the tests all have a trailing shuffle to induce a chain reaction of transforms. So something like this is happening: 1. Improve the knowledge of undefs in the binop via a SimplifyDemandedVectorElts() call that originates from a shuffle. 2. Transfer that undef knowledge back to the shuffle mask user as more undef lanes. 3. Combine the modified shuffle by calling SimplifyDemandedVectorElts() again. 4. Translate the improved shuffle mask as undemanded lanes of build vector constants causing those to become full undef constants. 5. Simplify the binop now that it has a full undef operand. As we can see from the unchanged 'and' and 'or' tests, tracking undefs alone isn't a full solution. We would need to track zero and all-ones constants to improve those opcodes. We'd probably need to track NaN for FP ops too (assuming we don't have fast-math-flags set). Differential Revision: https://reviews.llvm.org/D57066 llvm-svn: 352880
2019-02-01 15:35:12 +00:00 · 2019-02-01 15:35:12 +00:00 · 0279b5b0b8
parent 1a529f58f9
commit 0279b5b0b8
2 changed files with 61 additions and 30 deletions
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@ -1433,6 +1433,53 @@ bool TargetLowering::SimplifyDemandedVectorElts(SDValue Op,
  return Simplified;
 }

+/// Given a vector binary operation and known undefined elements for each input
+/// operand, compute whether each element of the output is undefined.
+static APInt getKnownUndefForVectorBinop(SDValue BO, SelectionDAG &DAG,
+                                         const APInt &UndefOp0,
+                                         const APInt &UndefOp1) {
+  EVT VT = BO.getValueType();
+  assert(ISD::isBinaryOp(BO.getNode()) && VT.isVector() && "Vector binop only");
+
+  EVT EltVT = VT.getVectorElementType();
+  unsigned NumElts = VT.getVectorNumElements();
+  assert(UndefOp0.getBitWidth() == NumElts &&
+         UndefOp1.getBitWidth() == NumElts && "Bad type for undef analysis");
+
+  auto getUndefOrConstantElt = [&](SDValue V, unsigned Index,
+                                   const APInt &UndefVals) {
+    if (UndefVals[Index])
+      return DAG.getUNDEF(EltVT);
+
+    if (auto *BV = dyn_cast<BuildVectorSDNode>(V)) {
+      // Try hard to make sure that the getNode() call is not creating temporary
+      // nodes. Ignore opaque integers because they do not constant fold.
+      SDValue Elt = BV->getOperand(Index);
+      auto *C = dyn_cast<ConstantSDNode>(Elt);
+      if (isa<ConstantFPSDNode>(Elt) || Elt.isUndef() || (C && !C->isOpaque()))
+        return Elt;
+    }
+
+    return SDValue();
+  };
+
+  APInt KnownUndef = APInt::getNullValue(NumElts);
+  for (unsigned i = 0; i != NumElts; ++i) {
+    // If both inputs for this element are either constant or undef and match
+    // the element type, compute the constant/undef result for this element of
+    // the vector.
+    // TODO: Ideally we would use FoldConstantArithmetic() here, but that does
+    // not handle FP constants. The code within getNode() should be refactored
+    // to avoid the danger of creating a bogus temporary node here.
+    SDValue C0 = getUndefOrConstantElt(BO.getOperand(0), i, UndefOp0);
+    SDValue C1 = getUndefOrConstantElt(BO.getOperand(1), i, UndefOp1);
+    if (C0 && C1 && C0.getValueType() == EltVT && C1.getValueType() == EltVT)
+      if (DAG.getNode(BO.getOpcode(), SDLoc(BO), EltVT, C0, C1).isUndef())
+        KnownUndef.setBit(i);
+  }
+  return KnownUndef;
+}
+
 bool TargetLowering::SimplifyDemandedVectorElts(
    SDValue Op, const APInt &DemandedEltMask, APInt &KnownUndef,
    APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth,
@ -1805,6 +1852,9 @@ bool TargetLowering::SimplifyDemandedVectorElts(
    }
    break;
  }
+
+  // TODO: There are more binop opcodes that could be handled here - MUL, MIN,
+  // MAX, saturated math, etc.
  case ISD::OR:
  case ISD::XOR:
  case ISD::ADD:
@ -1814,15 +1864,17 @@ bool TargetLowering::SimplifyDemandedVectorElts(
  case ISD::FMUL:
  case ISD::FDIV:
  case ISD::FREM: {
-    APInt SrcUndef, SrcZero;
-    if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, SrcUndef,
-                                   SrcZero, TLO, Depth + 1))
+    APInt UndefRHS, ZeroRHS;
+    if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, UndefRHS,
+                                   ZeroRHS, TLO, Depth + 1))
      return true;
-    if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, KnownUndef,
-                                   KnownZero, TLO, Depth + 1))
+    APInt UndefLHS, ZeroLHS;
+    if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, UndefLHS,
+                                   ZeroLHS, TLO, Depth + 1))
      return true;
-    KnownZero &= SrcZero;
-    KnownUndef &= SrcUndef;
+
+    KnownZero = ZeroLHS & ZeroRHS;
+    KnownUndef = getKnownUndefForVectorBinop(Op, TLO.DAG, UndefLHS, UndefRHS);
    break;
  }
  case ISD::AND: {
@ -1836,6 +1888,8 @@ bool TargetLowering::SimplifyDemandedVectorElts(

    // If either side has a zero element, then the result element is zero, even
    // if the other is an UNDEF.
+    // TODO: Extend getKnownUndefForVectorBinop to also deal with known zeros
+    // and then handle 'and' nodes with the rest of the binop opcodes.
    KnownZero |= SrcZero;
    KnownUndef &= SrcUndef;
    KnownUndef &= ~KnownZero;
--- a/llvm/test/CodeGen/X86/vector-partial-undef.ll
+++ b/llvm/test/CodeGen/X86/vector-partial-undef.ll
@ -51,10 +51,6 @@ define <8 x i32> @add_undef_elts(<4 x i32> %x) {
 ;
 ; AVX-LABEL: add_undef_elts:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
-; AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,0,5,4,3,2,1,7]
-; AVX-NEXT:    vpermd %ymm0, %ymm1, %ymm0
 ; AVX-NEXT:    retq
  %extend = shufflevector <4 x i32> %x, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %bogus_bo = add <8 x i32> %extend, <i32 undef, i32 undef, i32 undef, i32 undef, i32 42, i32 43, i32 44, i32 12>
@ -71,11 +67,6 @@ define <8 x i32> @sub_undef_elts(<4 x i32> %x) {
 ;
 ; AVX-LABEL: sub_undef_elts:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,u,u,42,43,44,12>
-; AVX-NEXT:    vpsubd %ymm0, %ymm1, %ymm0
-; AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,0,5,4,3,2,6,7]
-; AVX-NEXT:    vpermd %ymm0, %ymm1, %ymm0
 ; AVX-NEXT:    retq
  %extend = shufflevector <4 x i32> %x, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %bogus_bo = sub <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 42, i32 43, i32 44, i32 12>, %extend
@ -130,24 +121,10 @@ define <4 x i64> @or_undef_elts(<2 x i64> %x) {
 define <8 x i32> @xor_undef_elts(<4 x i32> %x) {
 ; SSE-LABEL: xor_undef_elts:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
-; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
-; SSE-NEXT:    pxor {{.*}}(%rip), %xmm2
-; SSE-NEXT:    pxor {{.*}}(%rip), %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm0
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[2,0]
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[1,0]
-; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,0],xmm1[0,0]
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,2],xmm2[2,0]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: xor_undef_elts:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,3,0,2]
-; AVX-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3]
-; AVX-NEXT:    vxorps {{.*}}(%rip), %ymm0, %ymm0
-; AVX-NEXT:    vmovaps {{.*#+}} ymm1 = [6,1,5,4,3,2,0,7]
-; AVX-NEXT:    vpermps %ymm0, %ymm1, %ymm0
 ; AVX-NEXT:    retq
  %extend = shufflevector <4 x i32> %x, <4 x i32> undef, <8 x i32> <i32 undef, i32 undef, i32 1, i32 3, i32 0, i32 2, i32 undef, i32 undef>
  %bogus_bo = xor <8 x i32> %extend, <i32 42, i32 43, i32 undef, i32 undef, i32 undef, i32 undef, i32 44, i32 12>