[X86] Teach combineTruncatedArithmetic to push truncate through subtracts where only one of the inputs is free to truncate.

Fix combineSubToSubus to handle the new DAG to avoid a regression. There are still regressions in test14/test15/test16, where it looks like we were trying to set up cases we could match to umin+trunc+subus, but the handling was never finished. The regression here isn't unique to sub. It's a lost opportunity for taking an AND with two truncated inputs and producing a larger AND with a single truncate. The same thing could happen with any other node we handle in combineTruncatedArithmetic since we are moving the truncate up the DAG. Differential Revision: https://reviews.llvm.org/D80483
2020-05-25 11:34:09 -07:00 · 2020-05-25 11:34:09 -07:00 · 51a276c759
parent 37ef15143a
commit 51a276c759
2 changed files with 470 additions and 461 deletions
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@ -43588,7 +43588,8 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
  case ISD::AND:
  case ISD::XOR:
  case ISD::OR:
-  case ISD::ADD: {
+  case ISD::ADD:
+  case ISD::SUB: {
    SDValue Op0 = Src.getOperand(0);
    SDValue Op1 = Src.getOperand(1);
    if (TLI.isOperationLegal(SrcOpcode, VT) &&
@ -43596,16 +43597,6 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
      return TruncateArithmetic(Op0, Op1);
    break;
  }
-  case ISD::SUB: {
-    // TODO: ISD::SUB We are conservative and require both sides to be freely
-    // truncatable to avoid interfering with combineSubToSubus.
-    SDValue Op0 = Src.getOperand(0);
-    SDValue Op1 = Src.getOperand(1);
-    if (TLI.isOperationLegal(SrcOpcode, VT) &&
-        (Op0 == Op1 || (IsFreeTruncation(Op0) && IsFreeTruncation(Op1))))
-      return TruncateArithmetic(Op0, Op1);
-    break;
-  }
  }

  return SDValue();
@ -46698,6 +46689,38 @@ static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
      SubusRHS = MinLHS;
    else
      return SDValue();
+  } else if (Op1.getOpcode() == ISD::TRUNCATE &&
+             Op1.getOperand(0).getOpcode() == ISD::UMIN &&
+             (EltVT == MVT::i8 || EltVT == MVT::i16)) {
+    // Special case where the UMIN has been truncated. Try to push the truncate
+    // further up. This is similar to the i32/i64 special processing.
+    SubusLHS = Op0;
+    SDValue MinLHS = Op1.getOperand(0).getOperand(0);
+    SDValue MinRHS = Op1.getOperand(0).getOperand(1);
+    EVT TruncVT = Op1.getOperand(0).getValueType();
+    if (!(Subtarget.hasSSSE3() && (TruncVT == MVT::v8i32 ||
+                                   TruncVT == MVT::v8i64)) &&
+        !(Subtarget.useBWIRegs() && (TruncVT == MVT::v16i32)))
+      return SDValue();
+    SDValue OpToSaturate;
+    if (MinLHS.getOpcode() == ISD::ZERO_EXTEND &&
+        MinLHS.getOperand(0) == Op0)
+      OpToSaturate = MinRHS;
+    else if (MinRHS.getOpcode() == ISD::ZERO_EXTEND &&
+             MinRHS.getOperand(0) == Op0)
+      OpToSaturate = MinLHS;
+    else
+      return SDValue();
+
+    // Saturate the non-extended input and then truncate it.
+    SDLoc DL(N);
+    SDValue SaturationConst =
+        DAG.getConstant(APInt::getLowBitsSet(TruncVT.getScalarSizeInBits(),
+                                             VT.getScalarSizeInBits()),
+                        DL, TruncVT);
+    SDValue UMin = DAG.getNode(ISD::UMIN, DL, TruncVT, OpToSaturate,
+                               SaturationConst);
+    SubusRHS = DAG.getNode(ISD::TRUNCATE, DL, VT, UMin);
  } else
    return SDValue();

--- a/llvm/test/CodeGen/X86/psubus.ll
+++ b/llvm/test/CodeGen/X86/psubus.ll