[X86] Teach combineTruncatedArithmetic to push truncate through subtracts where only one of the inputs is free to truncate.

Fix combineSubToSubus to handle the new DAG to avoid a regression. There are still regressions in test14/test15/test16, where it looks like we were trying to set up cases we could match to umin+trunc+subus but the handling was never finished. The regression here isn't unique to sub. It's a lost opportunity for taking an AND with two truncated inputs and producing a larger AND with a single truncate. The same thing could happen with any other node we handle in combineTruncatedArithmetic since we are moving the truncate up the DAG. Differential Revision: https://reviews.llvm.org/D80483
2020-05-25 11:34:09 -07:00 · 2020-05-25 11:34:09 -07:00 · 51a276c759
parent 37ef15143a
commit 51a276c759
2 changed files with 470 additions and 461 deletions
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@ -43588,7 +43588,8 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
  case ISD::AND:
  case ISD::XOR:
  case ISD::OR:
-  case ISD::ADD: {
+  case ISD::ADD:
  case ISD::SUB: {
    SDValue Op0 = Src.getOperand(0);
    SDValue Op1 = Src.getOperand(1);
    if (TLI.isOperationLegal(SrcOpcode, VT) &&
@ -43596,16 +43597,6 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
      return TruncateArithmetic(Op0, Op1);
    break;
  }
  case ISD::SUB: {
    // TODO: ISD::SUB We are conservative and require both sides to be freely
    // truncatable to avoid interfering with combineSubToSubus.
    SDValue Op0 = Src.getOperand(0);
    SDValue Op1 = Src.getOperand(1);
    if (TLI.isOperationLegal(SrcOpcode, VT) &&
        (Op0 == Op1 || (IsFreeTruncation(Op0) && IsFreeTruncation(Op1))))
      return TruncateArithmetic(Op0, Op1);
    break;
  }
  }
  return SDValue();
@ -46698,6 +46689,38 @@ static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
      SubusRHS = MinLHS;
    else
      return SDValue();
  } else if (Op1.getOpcode() == ISD::TRUNCATE &&
             Op1.getOperand(0).getOpcode() == ISD::UMIN &&
             (EltVT == MVT::i8 || EltVT == MVT::i16)) {
    // Special case where the UMIN has been truncated. Try to push the truncate
    // further up. This is similar to the i32/i64 special processing.
    SubusLHS = Op0;
    SDValue MinLHS = Op1.getOperand(0).getOperand(0);
    SDValue MinRHS = Op1.getOperand(0).getOperand(1);
    EVT TruncVT = Op1.getOperand(0).getValueType();
    if (!(Subtarget.hasSSSE3() && (TruncVT == MVT::v8i32 ||
                                   TruncVT == MVT::v8i64)) &&
        !(Subtarget.useBWIRegs() && (TruncVT == MVT::v16i32)))
      return SDValue();
    SDValue OpToSaturate;
    if (MinLHS.getOpcode() == ISD::ZERO_EXTEND &&
        MinLHS.getOperand(0) == Op0)
      OpToSaturate = MinRHS;
    else if (MinRHS.getOpcode() == ISD::ZERO_EXTEND &&
             MinRHS.getOperand(0) == Op0)
      OpToSaturate = MinLHS;
    else
      return SDValue();
    // Saturate the non-extended input and then truncate it.
    SDLoc DL(N);
    SDValue SaturationConst =
        DAG.getConstant(APInt::getLowBitsSet(TruncVT.getScalarSizeInBits(),
                                             VT.getScalarSizeInBits()),
                        DL, TruncVT);
    SDValue UMin = DAG.getNode(ISD::UMIN, DL, TruncVT, OpToSaturate,
                               SaturationConst);
    SubusRHS = DAG.getNode(ISD::TRUNCATE, DL, VT, UMin);
  } else
    return SDValue();
--- a/llvm/test/CodeGen/X86/psubus.ll
+++ b/llvm/test/CodeGen/X86/psubus.ll