[X86] combineOr - don't demand operand elements if the other operand element is 'allones'

If either operand has an element with all bits set, then we don't need the equivalent element from the other operand, as all bits of the result element are guaranteed to be set anyway.
Simon Pilgrim 2021-12-14 14:52:38 +00:00
parent a9d811405f
commit 4f2e183229
3 changed files with 38 additions and 20 deletions
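As a quick illustration of the property the commit relies on (a minimal standalone C++ sketch, not LLVM code; the helper name lane_or and all values are made up): if a lane of one OR operand is all-ones, the result lane is all-ones no matter what the other operand holds, so that lane of the other operand is never demanded.

#include <array>
#include <cassert>
#include <cstdint>

// Lane-wise OR of two 4 x i32 vectors.
static std::array<uint32_t, 4> lane_or(const std::array<uint32_t, 4> &A,
                                       const std::array<uint32_t, 4> &B) {
  std::array<uint32_t, 4> R{};
  for (int I = 0; I != 4; ++I)
    R[I] = A[I] | B[I];
  return R;
}

int main() {
  // Constant operand with lanes 0 and 2 all-ones.
  const std::array<uint32_t, 4> Mask = {0xFFFFFFFFu, 0x000000FFu,
                                        0xFFFFFFFFu, 0x0000FF00u};
  // Two "other" operands that differ only in lanes 0 and 2.
  const std::array<uint32_t, 4> X = {0x12345678u, 0x00000001u,
                                     0xDEADBEEFu, 0x00000002u};
  const std::array<uint32_t, 4> Y = {0x00000000u, 0x00000001u,
                                     0x00000000u, 0x00000002u};
  // The OR results are identical, so lanes 0 and 2 of the other operand are
  // not demanded and may be simplified freely.
  assert(lane_or(Mask, X) == lane_or(Mask, Y));
  return 0;
}

The combine below exploits exactly this by calling SimplifyDemandedVectorElts on the non-constant operand with those lanes cleared from the demanded mask.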


@@ -46647,6 +46647,7 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  SDLoc dl(N);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  // If this is SSE1 only convert to FOR to avoid scalarization.
  if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
@@ -46663,7 +46664,6 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
    SmallVector<APInt, 2> SrcPartials;
    if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
        SrcOps.size() == 1) {
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
      EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
      SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
@@ -46724,11 +46724,36 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
    }
  }
  // Attempt to recursively combine an OR of shuffles.
  if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
    // Attempt to recursively combine an OR of shuffles.
    SDValue Op(N, 0);
    if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
      return Res;
    // If either operand is a constant mask, then only the elements that aren't
    // allones are actually demanded by the other operand.
    auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) {
      APInt UndefElts;
      SmallVector<APInt> EltBits;
      int NumElts = VT.getVectorNumElements();
      int EltSizeInBits = VT.getScalarSizeInBits();
      if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
        return false;
      APInt DemandedElts = APInt::getZero(NumElts);
      for (int I = 0; I != NumElts; ++I)
        if (!EltBits[I].isAllOnes())
          DemandedElts.setBit(I);
      APInt KnownUndef, KnownZero;
      return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, KnownUndef,
                                            KnownZero, DCI);
    };
    if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
      if (N->getOpcode() != ISD::DELETED_NODE)
        DCI.AddToWorklist(N);
      return SDValue(N, 0);
    }
  }
  // We should fold "masked merge" patterns when `andn` is not available.

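For reference, a rough standalone model of the demanded-elements mask built by the SimplifyUndemandedElts lambda above (plain C++ with fixed 32-bit elements and a hypothetical buildDemandedElts helper, not the LLVM APInt/SDValue machinery): an element of the other operand stays demanded only if the constant operand's element is not all-ones.

#include <cstdint>
#include <vector>

// Stand-in for the constant element bits extracted from one OR operand
// (one entry per vector element, 32-bit elements assumed).
// Returns a bitmask with bit I set iff element I of the *other* operand is
// still demanded, i.e. the constant element is not all-ones.
static uint64_t buildDemandedElts(const std::vector<uint32_t> &EltBits) {
  uint64_t DemandedElts = 0;
  for (size_t I = 0; I != EltBits.size(); ++I)
    if (EltBits[I] != 0xFFFFFFFFu) // mirrors !EltBits[I].isAllOnes()
      DemandedElts |= uint64_t{1} << I;
  return DemandedElts;
}

int main() {
  // Constant operand <-1, 255, -1, 0>: only elements 1 and 3 of the other
  // operand remain demanded, so the mask is 0b1010.
  std::vector<uint32_t> EltBits = {0xFFFFFFFFu, 0x000000FFu,
                                   0xFFFFFFFFu, 0x00000000u};
  return buildDemandedElts(EltBits) == 0b1010 ? 0 : 1;
}

The test diffs that follow show the effect: lanes covered by all-ones constant elements no longer need to be computed, which is what drops the low-half compare in pr25080 and collapses the pandn/por merge in do_not_crash to a single por.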

@@ -9,12 +9,11 @@
define <8 x i16> @pr25080(<8 x i32> %a) {
; AVX-LABEL: pr25080:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq


@@ -30,12 +30,9 @@ define <16 x i8> @do_not_crash(i8*, i32*, i64*, i32, i64, i8) {
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movb %al, (%ecx)
; X86-NEXT: movd %eax, %xmm0
; X86-NEXT: psllq $56, %xmm0
; X86-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
; X86-NEXT: movdqa %xmm2, %xmm1
; X86-NEXT: pandn %xmm0, %xmm1
; X86-NEXT: por %xmm2, %xmm1
; X86-NEXT: movd %eax, %xmm1
; X86-NEXT: psllq $56, %xmm1
; X86-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-NEXT: pcmpeqd %xmm3, %xmm3
; X86-NEXT: psllw $5, %xmm1
; X86-NEXT: pxor %xmm2, %xmm2
@@ -65,12 +62,9 @@ define <16 x i8> @do_not_crash(i8*, i32*, i64*, i32, i64, i8) {
; X64-LABEL: do_not_crash:
; X64: # %bb.0: # %entry
; X64-NEXT: movb %r9b, (%rdi)
; X64-NEXT: movd %r9d, %xmm0
; X64-NEXT: psllq $56, %xmm0
; X64-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
; X64-NEXT: movdqa %xmm2, %xmm1
; X64-NEXT: pandn %xmm0, %xmm1
; X64-NEXT: por %xmm2, %xmm1
; X64-NEXT: movd %r9d, %xmm1
; X64-NEXT: psllq $56, %xmm1
; X64-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-NEXT: pcmpeqd %xmm2, %xmm2
; X64-NEXT: psllw $5, %xmm1
; X64-NEXT: pxor %xmm3, %xmm3