[DAGCombiner] don't group bswap with casts in logic hoisting fold

This was probably organized as it was because bswap is a unary op.
But that's where the similarity to the other opcodes ends. We should
not limit this transform to scalars, and we should not try it if
either input has other uses. This is another step towards cleaning up
this whole function to prevent it from causing infinite loops and
memory explosions.
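
For example, in a reduced form of the or_bswap test updated below
(the function name here is just illustrative), both bswaps are
single-use, so the logic op can be hoisted and only one bswap emitted,
even for a vector type:

  declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>)

  define <4 x i32> @or_of_bswaps(<4 x i32> %x, <4 x i32> %y) {
    ; Each bswap feeds only the 'or', so the combiner can produce
    ; bswap (or x, y) instead of two bswaps plus an or.
    %xt = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %x)
    %yt = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %y)
    %r = or <4 x i32> %xt, %yt
    ret <4 x i32> %r
  }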

Earlier commits in this series:
rL348501
rL348508
rL348518

llvm-svn: 348534
Author: Sanjay Patel
Date:   2018-12-06 22:10:44 +00:00
Commit: 70af85b0ac (parent: b0b61955a1)
2 changed files with 26 additions and 31 deletions


@@ -3718,13 +3718,13 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
// FIXME: We should check number of uses of the operands to not increase
// the instruction count for all transforms.
// Handle size-changing casts.
EVT Op0VT = N0.getOperand(0).getValueType();
switch (HandOpcode) {
case ISD::ANY_EXTEND:
case ISD::TRUNCATE:
case ISD::ZERO_EXTEND:
case ISD::SIGN_EXTEND:
case ISD::BSWAP:
// If both operands have other uses, this transform would create extra
// instructions without eliminating anything.
if (!N0.hasOneUse() && !N1.hasOneUse())
@@ -3732,7 +3732,7 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
// We need matching integer source types.
// Do not hoist logic op inside of a vector extend, since it may combine
// into a vsetcc.
// TODO: Should the vector check apply to truncate and bswap though?
// TODO: Should the vector check apply to truncate though?
if (VT.isVector() || Op0VT != N1.getOperand(0).getValueType())
return SDValue();
// Don't create an illegal op during or after legalization.
@@ -3757,10 +3757,8 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
return DAG.getNode(HandOpcode, SDLoc(N), VT, Logic);
}
// For each of OP in SHL/SRL/SRA/AND...
// fold (and (OP x, z), (OP y, z)) -> (OP (and x, y), z)
// fold (or (OP x, z), (OP y, z)) -> (OP (or x, y), z)
// fold (xor (OP x, z), (OP y, z)) -> (OP (xor x, y), z)
// For binops SHL/SRL/SRA/AND:
// logic_op (OP x, z), (OP y, z) --> OP (logic_op x, y), z
if ((HandOpcode == ISD::SHL || HandOpcode == ISD::SRL ||
HandOpcode == ISD::SRA || HandOpcode == ISD::AND) &&
N0.getOperand(1) == N1.getOperand(1)) {
@@ -3773,6 +3771,17 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
return DAG.getNode(HandOpcode, SDLoc(N), VT, Logic, N0.getOperand(1));
}
// Unary ops: logic_op (bswap x), (bswap y) --> bswap (logic_op x, y)
if (HandOpcode == ISD::BSWAP) {
// If either operand has other uses, this transform is not an improvement.
if (!N0.hasOneUse() || !N1.hasOneUse())
return SDValue();
SDValue Logic = DAG.getNode(LogicOpcode, SDLoc(N0), Op0VT,
N0.getOperand(0), N1.getOperand(0));
AddToWorklist(Logic.getNode());
return DAG.getNode(HandOpcode, SDLoc(N), VT, Logic);
}
// Simplify xor/and/or (bitcast(A), bitcast(B)) -> bitcast(op (A,B))
// Only perform this optimization up until type legalization, before
// LegalizeVectorOprs. LegalizeVectorOprs promotes vector operations by
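
For the size-changing casts that remain in the switch above, the hoist
still applies only to scalars with matching source types. A minimal
sketch of that case (illustrative function name; assumes both casts
are single-use):

  define i32 @and_of_zexts(i8 %x, i8 %y) {
    ; Both zexts feed only the 'and', so it is profitable to hoist the
    ; logic op to i8 and emit a single zext of the result.
    %xe = zext i8 %x to i32
    %ye = zext i8 %y to i32
    %r = and i32 %xe, %ye
    ret i32 %r
  }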


@@ -76,48 +76,34 @@ define <4 x i32> @test2(<4 x i32> %v) {
define <4 x i32> @or_bswap(<4 x i32> %x, <4 x i32> %y, <4 x i32>* %p1, <4 x i32>* %p2) {
; CHECK-NOSSSE3-LABEL: or_bswap:
; CHECK-NOSSSE3: # %bb.0:
; CHECK-NOSSSE3-NEXT: pxor %xmm2, %xmm2
; CHECK-NOSSSE3-NEXT: movdqa %xmm0, %xmm3
; CHECK-NOSSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
; CHECK-NOSSSE3-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
; CHECK-NOSSSE3-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
; CHECK-NOSSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; CHECK-NOSSSE3-NEXT: por %xmm1, %xmm0
; CHECK-NOSSSE3-NEXT: pxor %xmm1, %xmm1
; CHECK-NOSSSE3-NEXT: movdqa %xmm0, %xmm2
; CHECK-NOSSSE3-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; CHECK-NOSSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; CHECK-NOSSSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; CHECK-NOSSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; CHECK-NOSSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; CHECK-NOSSSE3-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,7,6,5,4]
; CHECK-NOSSSE3-NEXT: packuswb %xmm3, %xmm4
; CHECK-NOSSSE3-NEXT: movdqa %xmm1, %xmm0
; CHECK-NOSSSE3-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; CHECK-NOSSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; CHECK-NOSSSE3-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,7,6,5,4]
; CHECK-NOSSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; CHECK-NOSSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[3,2,1,0,4,5,6,7]
; CHECK-NOSSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; CHECK-NOSSSE3-NEXT: packuswb %xmm3, %xmm0
; CHECK-NOSSSE3-NEXT: por %xmm4, %xmm0
; CHECK-NOSSSE3-NEXT: packuswb %xmm2, %xmm0
; CHECK-NOSSSE3-NEXT: retq
;
; CHECK-SSSE3-LABEL: or_bswap:
; CHECK-SSSE3: # %bb.0:
; CHECK-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; CHECK-SSSE3-NEXT: pshufb %xmm2, %xmm0
; CHECK-SSSE3-NEXT: pshufb %xmm2, %xmm1
; CHECK-SSSE3-NEXT: por %xmm1, %xmm0
; CHECK-SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; CHECK-SSSE3-NEXT: retq
;
; CHECK-AVX-LABEL: or_bswap:
; CHECK-AVX: # %bb.0:
; CHECK-AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; CHECK-AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; CHECK-AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; CHECK-AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; CHECK-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; CHECK-AVX-NEXT: retq
;
; CHECK-WIDE-AVX-LABEL: or_bswap:
; CHECK-WIDE-AVX: # %bb.0:
; CHECK-WIDE-AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; CHECK-WIDE-AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; CHECK-WIDE-AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; CHECK-WIDE-AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; CHECK-WIDE-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; CHECK-WIDE-AVX-NEXT: retq
%xt = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %x)
%yt = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %y)