[X86] Teach combineTruncatedArithmetic to push truncate through subtracts where only one of the inputs is free to truncate.

Fix combineSubToSubus to handle the new DAG to avoid a regression.

There are still regressions in test14/test15/test16, where it
looks like we were trying to set up cases we could match to
umin+trunc+subus but the handling was never finished. The
regression here isn't unique to sub. It's a lost opportunity for
taking an AND with two truncated inputs and producing a larger
AND with a single truncate. The same thing could happen with
any other node we handle in combineTruncatedArithmetic since we
are moving the truncate up the DAG.

Differential Revision: https://reviews.llvm.org/D80483
This commit is contained in:
Craig Topper 2020-05-25 11:34:09 -07:00
parent 37ef15143a
commit 51a276c759
2 changed files with 470 additions and 461 deletions

View File

@ -43588,7 +43588,8 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
case ISD::AND:
case ISD::XOR:
case ISD::OR:
case ISD::ADD: {
case ISD::ADD:
case ISD::SUB: {
SDValue Op0 = Src.getOperand(0);
SDValue Op1 = Src.getOperand(1);
if (TLI.isOperationLegal(SrcOpcode, VT) &&
@ -43596,16 +43597,6 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
return TruncateArithmetic(Op0, Op1);
break;
}
case ISD::SUB: {
// TODO: ISD::SUB We are conservative and require both sides to be freely
// truncatable to avoid interfering with combineSubToSubus.
SDValue Op0 = Src.getOperand(0);
SDValue Op1 = Src.getOperand(1);
if (TLI.isOperationLegal(SrcOpcode, VT) &&
(Op0 == Op1 || (IsFreeTruncation(Op0) && IsFreeTruncation(Op1))))
return TruncateArithmetic(Op0, Op1);
break;
}
}
return SDValue();
@ -46698,6 +46689,38 @@ static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
SubusRHS = MinLHS;
else
return SDValue();
} else if (Op1.getOpcode() == ISD::TRUNCATE &&
Op1.getOperand(0).getOpcode() == ISD::UMIN &&
(EltVT == MVT::i8 || EltVT == MVT::i16)) {
// Special case where the UMIN has been truncated. Try to push the truncate
// further up. This is similar to the i32/i64 special processing.
SubusLHS = Op0;
SDValue MinLHS = Op1.getOperand(0).getOperand(0);
SDValue MinRHS = Op1.getOperand(0).getOperand(1);
EVT TruncVT = Op1.getOperand(0).getValueType();
if (!(Subtarget.hasSSSE3() && (TruncVT == MVT::v8i32 ||
TruncVT == MVT::v8i64)) &&
!(Subtarget.useBWIRegs() && (TruncVT == MVT::v16i32)))
return SDValue();
SDValue OpToSaturate;
if (MinLHS.getOpcode() == ISD::ZERO_EXTEND &&
MinLHS.getOperand(0) == Op0)
OpToSaturate = MinRHS;
else if (MinRHS.getOpcode() == ISD::ZERO_EXTEND &&
MinRHS.getOperand(0) == Op0)
OpToSaturate = MinLHS;
else
return SDValue();
// Saturate the non-extended input and then truncate it.
SDLoc DL(N);
SDValue SaturationConst =
DAG.getConstant(APInt::getLowBitsSet(TruncVT.getScalarSizeInBits(),
VT.getScalarSizeInBits()),
DL, TruncVT);
SDValue UMin = DAG.getNode(ISD::UMIN, DL, TruncVT, OpToSaturate,
SaturationConst);
SubusRHS = DAG.getNode(ISD::TRUNCATE, DL, VT, UMin);
} else
return SDValue();

File diff suppressed because it is too large Load Diff