[DAGCombine] Move the remaining X86 funnel shift patterns to DAGCombine

X86 matches several 'shift+xor' funnel shift patterns:

  fold (or (srl (srl x1, 1), (xor y, 31)), (shl x0, y))  -> (fshl x0, x1, y)
  fold (or (shl (shl x0, 1), (xor y, 31)), (srl x1, y))  -> (fshr x0, x1, y)
  fold (or (shl (add x0, x0), (xor y, 31)), (srl x1, y)) -> (fshr x0, x1, y)

These patterns are also what we end up with after the proposed expansion changes in D77301.
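
The equivalences hold because (y ^ 31) is the same as (31 - y) for any y in [0, 31], so the extra srl/shl by 1 keeps every shift amount strictly below the bit width and the y == 0 case stays well defined. A minimal standalone sketch (not part of the patch) that brute-forces the three identities for i32; FunnelShl and FunnelShr are hypothetical helpers mirroring the ISD::FSHL/ISD::FSHR semantics:

  #include <cassert>
  #include <cstdint>

  // Reference semantics of fshl/fshr for i32: the shift amount is taken
  // modulo the bit width, and an amount of 0 returns the untouched operand.
  static uint32_t FunnelShl(uint32_t X0, uint32_t X1, unsigned Y) {
    Y &= 31;
    return Y ? (X0 << Y) | (X1 >> (32 - Y)) : X0;
  }
  static uint32_t FunnelShr(uint32_t X0, uint32_t X1, unsigned Y) {
    Y &= 31;
    return Y ? (X0 << (32 - Y)) | (X1 >> Y) : X1;
  }

  int main() {
    uint32_t X0 = 0x12345678, X1 = 0x9ABCDEF0;
    for (unsigned Y = 0; Y != 32; ++Y) {
      // (or (srl (srl x1, 1), (xor y, 31)), (shl x0, y)) == (fshl x0, x1, y)
      assert((((X1 >> 1) >> (Y ^ 31)) | (X0 << Y)) == FunnelShl(X0, X1, Y));
      // (or (shl (shl x0, 1), (xor y, 31)), (srl x1, y)) == (fshr x0, x1, y)
      assert((((X0 << 1) << (Y ^ 31)) | (X1 >> Y)) == FunnelShr(X0, X1, Y));
      // (or (shl (add x0, x0), (xor y, 31)), (srl x1, y)) == (fshr x0, x1, y)
      assert((((X0 + X0) << (Y ^ 31)) | (X1 >> Y)) == FunnelShr(X0, X1, Y));
    }
    return 0;
  }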

This patch moves these to DAGCombine's generic MatchFunnelPosNeg.

All existing X86 test cases still pass; the only codegen change is a minor register/operand swap in pr32282.ll.

Reviewed By: @spatel

Differential Revision: https://reviews.llvm.org/D78935
Simon Pilgrim 2020-04-30 12:53:05 +01:00
parent 6547a5ceb2
commit 96238486ed
3 changed files with 53 additions and 122 deletions

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

@@ -6339,6 +6339,9 @@ SDValue DAGCombiner::MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos,
SDValue Neg, SDValue InnerPos,
SDValue InnerNeg, unsigned PosOpcode,
unsigned NegOpcode, const SDLoc &DL) {
EVT VT = N0.getValueType();
unsigned EltBits = VT.getScalarSizeInBits();
// fold (or (shl x0, (*ext y)),
// (srl x1, (*ext (sub 32, y)))) ->
// (fshl x0, x1, y) or (fshr x0, x1, (sub 32, y))
@@ -6346,13 +6349,52 @@ SDValue DAGCombiner::MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos,
// fold (or (shl x0, (*ext (sub 32, y))),
// (srl x1, (*ext y))) ->
// (fshr x0, x1, y) or (fshl x0, x1, (sub 32, y))
EVT VT = N0.getValueType();
if (matchRotateSub(InnerPos, InnerNeg, VT.getScalarSizeInBits(), DAG)) {
if (matchRotateSub(InnerPos, InnerNeg, EltBits, DAG)) {
bool HasPos = TLI.isOperationLegalOrCustom(PosOpcode, VT);
return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, N0, N1,
HasPos ? Pos : Neg);
}
// Matching the shift+xor cases, we can't easily use the xor'd shift amount
// so for now just use the PosOpcode case if it's legal.
// TODO: When can we use the NegOpcode case?
if (PosOpcode == ISD::FSHL && isPowerOf2_32(EltBits)) {
auto IsBinOpImm = [](SDValue Op, unsigned BinOpc, unsigned Imm) {
if (Op.getOpcode() != BinOpc)
return false;
ConstantSDNode *Cst = isConstOrConstSplat(Op.getOperand(1));
return Cst && (Cst->getAPIntValue() == Imm);
};
// fold (or (shl x0, y), (srl (srl x1, 1), (xor y, 31)))
// -> (fshl x0, x1, y)
if (IsBinOpImm(N1, ISD::SRL, 1) &&
IsBinOpImm(InnerNeg, ISD::XOR, EltBits - 1) &&
InnerPos == InnerNeg.getOperand(0) &&
TLI.isOperationLegalOrCustom(ISD::FSHL, VT)) {
return DAG.getNode(ISD::FSHL, DL, VT, N0, N1.getOperand(0), Pos);
}
// fold (or (shl (shl x0, 1), (xor y, 31)), (srl x1, y))
// -> (fshr x0, x1, y)
if (IsBinOpImm(N0, ISD::SHL, 1) &&
IsBinOpImm(InnerPos, ISD::XOR, EltBits - 1) &&
InnerNeg == InnerPos.getOperand(0) &&
TLI.isOperationLegalOrCustom(ISD::FSHR, VT)) {
return DAG.getNode(ISD::FSHR, DL, VT, N0.getOperand(0), N1, Neg);
}
// fold (or (shl (add x0, x0), (xor y, 31)), (srl x1, y))
// -> (fshr x0, x1, y)
// TODO: Should add(x,x) -> shl(x,1) be a general DAG canonicalization?
if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N0.getOperand(1) &&
IsBinOpImm(InnerPos, ISD::XOR, EltBits - 1) &&
InnerNeg == InnerPos.getOperand(0) &&
TLI.isOperationLegalOrCustom(ISD::FSHR, VT)) {
return DAG.getNode(ISD::FSHR, DL, VT, N0.getOperand(0), N1, Neg);
}
}
return SDValue();
}
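
With the fold living in the generic combiner, any target with legal or custom FSHL/FSHR can now pick these patterns up, not just X86. As a hedged illustration (not taken from the patch or its tests), this is the kind of UB-free double-word shift idiom that lowers to the or(shl, srl(srl, xor)) shape matched above, assuming earlier IR passes leave the or-of-shifts intact:

  #include <cstdint>

  // Hypothetical example, not from the patch: shift the 128-bit value Hi:Lo
  // left by Amt (precondition: Amt < 64) and return the high 64 bits, i.e. a
  // 64-bit funnel shift left. The fixed srl-by-1 combined with the (Amt ^ 63)
  // amount keeps every individual shift strictly below 64, so Amt == 0 has no
  // undefined behaviour.
  uint64_t ConcatShiftLeft(uint64_t Hi, uint64_t Lo, unsigned Amt) {
    return (Hi << Amt) | ((Lo >> 1) >> (Amt ^ 63));
  }

On typical x86-64 subtargets the resulting variable-amount ISD::FSHL is normally selected as SHLD.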

llvm/lib/Target/X86/X86ISelLowering.cpp

@@ -42045,114 +42045,6 @@ static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
return Ret;
}
static SDValue combineOrShiftToFunnelShift(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(N->getOpcode() == ISD::OR && "Expected ISD::OR node");
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!TLI.isOperationLegalOrCustom(ISD::FSHL, VT) ||
!TLI.isOperationLegalOrCustom(ISD::FSHR, VT))
return SDValue();
// fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
bool OptForSize = DAG.shouldOptForSize();
unsigned Bits = VT.getScalarSizeInBits();
// SHLD/SHRD instructions have lower register pressure, but on some
// platforms they have higher latency than the equivalent
// series of shifts/or that would otherwise be generated.
// Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
// have higher latencies and we are not optimizing for size.
if (!OptForSize && Subtarget.isSHLDSlow())
return SDValue();
if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
std::swap(N0, N1);
if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
return SDValue();
if (!N0.hasOneUse() || !N1.hasOneUse())
return SDValue();
EVT ShiftVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
SDValue ShAmt0 = N0.getOperand(1);
if (ShAmt0.getValueType() != ShiftVT)
return SDValue();
SDValue ShAmt1 = N1.getOperand(1);
if (ShAmt1.getValueType() != ShiftVT)
return SDValue();
// Peek through any modulo shift masks.
SDValue ShMsk0;
if (ShAmt0.getOpcode() == ISD::AND &&
isa<ConstantSDNode>(ShAmt0.getOperand(1)) &&
ShAmt0.getConstantOperandAPInt(1) == (Bits - 1)) {
ShMsk0 = ShAmt0;
ShAmt0 = ShAmt0.getOperand(0);
}
SDValue ShMsk1;
if (ShAmt1.getOpcode() == ISD::AND &&
isa<ConstantSDNode>(ShAmt1.getOperand(1)) &&
ShAmt1.getConstantOperandAPInt(1) == (Bits - 1)) {
ShMsk1 = ShAmt1;
ShAmt1 = ShAmt1.getOperand(0);
}
if (ShAmt0.getOpcode() == ISD::TRUNCATE)
ShAmt0 = ShAmt0.getOperand(0);
if (ShAmt1.getOpcode() == ISD::TRUNCATE)
ShAmt1 = ShAmt1.getOperand(0);
SDLoc DL(N);
unsigned Opc = ISD::FSHL;
SDValue Op0 = N0.getOperand(0);
SDValue Op1 = N1.getOperand(0);
if (ShAmt0.getOpcode() == ISD::SUB || ShAmt0.getOpcode() == ISD::XOR) {
Opc = ISD::FSHR;
std::swap(Op0, Op1);
std::swap(ShAmt0, ShAmt1);
std::swap(ShMsk0, ShMsk1);
}
auto GetFunnelShift = [&DAG, &DL, VT, Opc, &ShiftVT](SDValue Op0, SDValue Op1,
SDValue Amt) {
if (Opc == ISD::FSHR)
std::swap(Op0, Op1);
return DAG.getNode(Opc, DL, VT, Op0, Op1,
DAG.getNode(ISD::TRUNCATE, DL, ShiftVT, Amt));
};
// OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> FSHL( X, Y, C )
// OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> FSHR( Y, X, C )
if (ShAmt1.getOpcode() == ISD::XOR) {
SDValue Mask = ShAmt1.getOperand(1);
if (auto *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
unsigned InnerShift = (ISD::FSHL == Opc ? ISD::SRL : ISD::SHL);
SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
ShAmt1Op0 = ShAmt1Op0.getOperand(0);
if (MaskC->getSExtValue() == (Bits - 1) &&
(ShAmt1Op0 == ShAmt0 || ShAmt1Op0 == ShMsk0)) {
if (Op1.getOpcode() == InnerShift &&
isa<ConstantSDNode>(Op1.getOperand(1)) &&
Op1.getConstantOperandAPInt(1).isOneValue()) {
return GetFunnelShift(Op0, Op1.getOperand(0), ShAmt0);
}
// Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
Op1.getOperand(0) == Op1.getOperand(1)) {
return GetFunnelShift(Op0, Op1.getOperand(0), ShAmt0);
}
}
}
}
return SDValue();
}
static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -42208,9 +42100,6 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
return R;
if (SDValue R = combineOrShiftToFunnelShift(N, DAG, Subtarget))
return R;
// Attempt to recursively combine an OR of shuffles.
if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
SDValue Op(N, 0);

llvm/test/CodeGen/X86/pr32282.ll

@@ -13,18 +13,18 @@ define void @foo(i64 %x) nounwind {
; X86-LABEL: foo:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
; X86-NEXT: movl d+4, %eax
; X86-NEXT: movl d, %eax
; X86-NEXT: notl %eax
; X86-NEXT: movl d, %ecx
; X86-NEXT: movl d+4, %ecx
; X86-NEXT: notl %ecx
; X86-NEXT: andl $-566231040, %ecx # imm = 0xDE400000
; X86-NEXT: andl $701685459, %eax # imm = 0x29D2DED3
; X86-NEXT: shrdl $21, %eax, %ecx
; X86-NEXT: shrl $21, %eax
; X86-NEXT: addl $7, %ecx
; X86-NEXT: adcl $0, %eax
; X86-NEXT: pushl %eax
; X86-NEXT: andl $701685459, %ecx # imm = 0x29D2DED3
; X86-NEXT: andl $-566231040, %eax # imm = 0xDE400000
; X86-NEXT: shrdl $21, %ecx, %eax
; X86-NEXT: shrl $21, %ecx
; X86-NEXT: addl $7, %eax
; X86-NEXT: adcl $0, %ecx
; X86-NEXT: pushl %ecx
; X86-NEXT: pushl %eax
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll __divdi3