[X86] Restore X86ISelDAGToDAG::matchBEXTRFromAnd. Teach address matching to create a BEXTR pattern from a (shl (and X, mask >> C1) if C1 can be folded into addressing mode.

This is an alternative to D53080 since I think using a BEXTR for a shifted mask is definitely an improvement when the shl can be absorbed into addressing mode. The other cases I'm less sure about.

We already have several tricks for handling an and of a shift in address matching. This adds a new case for BEXTR.

I've moved the BEXTR matching code back to X86ISelDAGToDAG to allow it to match. I suppose alternatively we could directly emit a X86ISD::BEXTR node that isel could pattern match. But I'm trying to view BEXTR matching as an isel concern so DAG combine can see 'and' and 'shift' operations that are well understood. We did lose a couple cases from tbm_patterns.ll, but I think there are ways to recover that.

I've also put back the manual load folding code in matchBEXTRFromAnd that I removed a few months ago in r324939. This gives us some more freedom to make decisions based on the ability to fold a load. I haven't done anything with that yet.

Differential Revision: https://reviews.llvm.org/D53126

llvm-svn: 344270
This commit is contained in:
Craig Topper 2018-10-11 18:06:07 +00:00
parent e502f8b315
commit fb2ac8969e
5 changed files with 218 additions and 98 deletions

View File

@ -451,6 +451,7 @@ namespace {
}
bool foldLoadStoreIntoMemOperand(SDNode *Node);
bool matchBEXTRFromAndImm(SDNode *Node);
bool matchBEXTR(SDNode *Node);
bool shrinkAndImmediate(SDNode *N);
bool isMaskZeroExtended(SDNode *N) const;
@ -1340,6 +1341,64 @@ static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
return false;
}
// Transform "(X >> SHIFT) & (MASK << C1)" to
// "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
// matched to a BEXTR later. Returns false if the simplification is performed.
static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
uint64_t Mask,
SDValue Shift, SDValue X,
X86ISelAddressMode &AM,
const X86Subtarget &Subtarget) {
if (Shift.getOpcode() != ISD::SRL ||
!isa<ConstantSDNode>(Shift.getOperand(1)) ||
!Shift.hasOneUse() || !N.hasOneUse())
return true;
// Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
if (!Subtarget.hasTBM() &&
!(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
return true;
// We need to ensure that mask is a continuous run of bits.
if (!isShiftedMask_64(Mask)) return true;
unsigned ShiftAmt = Shift.getConstantOperandVal(1);
// The amount of shift we're trying to fit into the addressing mode is taken
// from the trailing zeros of the mask.
unsigned AMShiftAmt = countTrailingZeros(Mask);
// There is nothing we can do here unless the mask is removing some bits.
// Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true;
MVT VT = N.getSimpleValueType();
SDLoc DL(N);
SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, VT);
SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, NewSRL, NewMask);
SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewAnd, NewSHLAmt);
// Insert the new nodes into the topological ordering. We must do this in
// a valid topological ordering as nothing is going to go back and re-sort
// these nodes. We continually insert before 'N' in sequence as this is
// essentially a pre-flattened and pre-sorted sequence of nodes. There is no
// hierarchy left to express.
insertDAGNode(DAG, N, NewSRLAmt);
insertDAGNode(DAG, N, NewSRL);
insertDAGNode(DAG, N, NewMask);
insertDAGNode(DAG, N, NewAnd);
insertDAGNode(DAG, N, NewSHLAmt);
insertDAGNode(DAG, N, NewSHL);
DAG.ReplaceAllUsesWith(N, NewSHL);
AM.Scale = 1 << AMShiftAmt;
AM.IndexReg = NewAnd;
return false;
}
bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
unsigned Depth) {
SDLoc dl(N);
@ -1620,6 +1679,11 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
// a scale on the outside of the mask.
if (!foldMaskedShiftToScaledMask(*CurDAG, N, Mask, Shift, X, AM))
return false;
// Try to fold the mask and shift into BEXTR and scale.
if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget))
return false;
break;
}
}
@ -2646,6 +2710,92 @@ bool X86DAGToDAGISel::matchBEXTR(SDNode *Node) {
return true;
}
// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
bool X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
MVT NVT = Node->getSimpleValueType(0);
SDLoc dl(Node);
SDValue N0 = Node->getOperand(0);
SDValue N1 = Node->getOperand(1);
// If we have TBM we can use an immediate for the control. If we have BMI
// we should only do this if the BEXTR instruction is implemented well.
// Otherwise moving the control into a register makes this more costly.
// TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
// hoisting the move immediate would make it worthwhile with a less optimal
// BEXTR?
if (!Subtarget->hasTBM() &&
!(Subtarget->hasBMI() && Subtarget->hasFastBEXTR()))
return false;
// Must have a shift right.
if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
return false;
// Shift can't have additional users.
if (!N0->hasOneUse())
return false;
// Only supported for 32 and 64 bits.
if (NVT != MVT::i32 && NVT != MVT::i64)
return false;
// Shift amount and RHS of and must be constant.
ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(N1);
ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
if (!MaskCst || !ShiftCst)
return false;
// And RHS must be a mask.
uint64_t Mask = MaskCst->getZExtValue();
if (!isMask_64(Mask))
return false;
uint64_t Shift = ShiftCst->getZExtValue();
uint64_t MaskSize = countPopulation(Mask);
// Don't interfere with something that can be handled by extracting AH.
// TODO: If we are able to fold a load, BEXTR might still be better than AH.
if (Shift == 8 && MaskSize == 8)
return false;
// Make sure we are only using bits that were in the original value, not
// shifted in.
if (Shift + MaskSize > NVT.getSizeInBits())
return false;
SDValue New = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
unsigned ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
unsigned MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
// BMI requires the immediate to placed in a register.
if (!Subtarget->hasTBM()) {
ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr;
MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm;
unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
New = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, New), 0);
}
MachineSDNode *NewNode;
SDValue Input = N0->getOperand(0);
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, New, Input.getOperand(0) };
SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
// Update the chain.
ReplaceUses(Input.getValue(1), SDValue(NewNode, 1));
// Record the mem-refs
CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
} else {
NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, Input, New);
}
ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
CurDAG->RemoveDeadNode(Node);
return true;
}
// Emit a PCMISTR(I/M) instruction.
MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
bool MayFoldLoad, const SDLoc &dl,
@ -2953,6 +3103,8 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
break;
case ISD::AND:
if (matchBEXTRFromAndImm(Node))
return;
if (matchBEXTR(Node))
return;
if (AndImmShrink && shrinkAndImmediate(Node))

View File

@ -35278,69 +35278,6 @@ static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
return SDValue();
}
static bool hasBEXTR(const X86Subtarget &Subtarget, EVT VT) {
// If we have TBM we can use an immediate for the control. If we have BMI
// we should only do this if the BEXTR instruction is implemented well.
// Otherwise moving the control into a register makes this more costly.
// TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
// hoisting the move immediate would make it worthwhile with a less optimal
// BEXTR?
if (!Subtarget.hasTBM() && !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
return false;
return (VT == MVT::i32 || (VT == MVT::i64 && Subtarget.is64Bit()));
}
// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
static SDValue combineAndIntoBEXTR(SDNode *Node, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT NVT = Node->getValueType(0);
SDLoc dl(Node);
SDValue N0 = Node->getOperand(0);
SDValue N1 = Node->getOperand(1);
// Check if subtarget has BEXTR instruction for the node's type
if (!hasBEXTR(Subtarget, NVT))
return SDValue();
// Must have a shift right.
if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
return SDValue();
// Shift can't have additional users.
if (!N0->hasOneUse())
return SDValue();
// Shift amount and RHS of and must be constant.
ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(N1);
ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
if (!MaskCst || !ShiftCst)
return SDValue();
// And RHS must be a mask.
uint64_t Mask = MaskCst->getZExtValue();
if (!isMask_64(Mask))
return SDValue();
uint64_t Shift = ShiftCst->getZExtValue();
uint64_t MaskSize = countPopulation(Mask);
// Don't interfere with something that can be handled by extracting AH.
// TODO: If we are able to fold a load, BEXTR might still be better than AH.
if (Shift == 8 && MaskSize == 8)
return SDValue();
// Make sure we are only using bits that were in the original value, not
// shifted in.
if (Shift + MaskSize > NVT.getSizeInBits())
return SDValue();
// Create a BEXTR node.
SDValue C = DAG.getConstant(Shift | (MaskSize << 8), dl, NVT);
SDValue New = DAG.getNode(X86ISD::BEXTR, dl, NVT, N0->getOperand(0), C);
return New;
}
// Look for (and (ctpop X), 1) which is the IR form of __builtin_parity.
// Turn it into series of XORs and a setnp.
static SDValue combineParity(SDNode *N, SelectionDAG &DAG,
@ -35442,9 +35379,6 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
if (DCI.isBeforeLegalizeOps())
return SDValue();
if (SDValue R = combineAndIntoBEXTR(N, DAG, Subtarget))
return R;
if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
return R;

View File

@ -2135,17 +2135,3 @@ def : Pat<(cttz_zero_undef (loadi64 addr:$src)), (BSF64rm addr:$src)>;
let Predicates = [HasMOVBE] in {
def : Pat<(bswap GR16:$src), (ROL16ri GR16:$src, (i8 8))>;
}
// These patterns are selected by some custom code in X86ISelDAGToDAG.cpp that
// custom combines and+srl into BEXTR. We use these patterns to avoid a bunch
// of manual code for folding loads.
let Predicates = [HasBMI, NoTBM] in {
def : Pat<(X86bextr GR32:$src1, (i32 imm:$src2)),
(BEXTR32rr GR32:$src1, (MOV32ri imm:$src2))>;
def : Pat<(X86bextr (loadi32 addr:$src1), (i32 imm:$src2)),
(BEXTR32rm addr:$src1, (MOV32ri imm:$src2))>;
def : Pat<(X86bextr GR64:$src1, mov64imm32:$src2),
(BEXTR64rr GR64:$src1, (MOV32ri64 mov64imm32:$src2))>;
def : Pat<(X86bextr (loadi64 addr:$src1), mov64imm32:$src2),
(BEXTR64rm addr:$src1, (MOV32ri64 mov64imm32:$src2))>;
} // HasBMI, NoTBM

View File

@ -5568,23 +5568,69 @@ define i64 @bextr64_d5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
; https://bugs.llvm.org/show_bug.cgi?id=38938
define void @pr38938(i32* %a0, i64* %a1) {
; X86-LABEL: pr38938:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl (%ecx), %ecx
; X86-NEXT: shrl $19, %ecx
; X86-NEXT: andl $4092, %ecx # imm = 0xFFC
; X86-NEXT: incl (%eax,%ecx)
; X86-NEXT: retl
; X86-NOBMI-LABEL: pr38938:
; X86-NOBMI: # %bb.0:
; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NOBMI-NEXT: movl (%ecx), %ecx
; X86-NOBMI-NEXT: shrl $19, %ecx
; X86-NOBMI-NEXT: andl $4092, %ecx # imm = 0xFFC
; X86-NOBMI-NEXT: incl (%eax,%ecx)
; X86-NOBMI-NEXT: retl
;
; X64-LABEL: pr38938:
; X64: # %bb.0:
; X64-NEXT: movq (%rsi), %rax
; X64-NEXT: shrq $19, %rax
; X64-NEXT: andl $4092, %eax # imm = 0xFFC
; X64-NEXT: incl (%rdi,%rax)
; X64-NEXT: retq
; X86-BMI1NOTBM-LABEL: pr38938:
; X86-BMI1NOTBM: # %bb.0:
; X86-BMI1NOTBM-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-BMI1NOTBM-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-BMI1NOTBM-NEXT: movl $2581, %edx # imm = 0xA15
; X86-BMI1NOTBM-NEXT: bextrl %edx, (%ecx), %ecx
; X86-BMI1NOTBM-NEXT: incl (%eax,%ecx,4)
; X86-BMI1NOTBM-NEXT: retl
;
; X86-BMI1TBM-LABEL: pr38938:
; X86-BMI1TBM: # %bb.0:
; X86-BMI1TBM-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-BMI1TBM-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-BMI1TBM-NEXT: bextrl $2581, (%ecx), %ecx # imm = 0xA15
; X86-BMI1TBM-NEXT: incl (%eax,%ecx,4)
; X86-BMI1TBM-NEXT: retl
;
; X86-BMI1NOTBMBMI2-LABEL: pr38938:
; X86-BMI1NOTBMBMI2: # %bb.0:
; X86-BMI1NOTBMBMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-BMI1NOTBMBMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-BMI1NOTBMBMI2-NEXT: movl $2581, %edx # imm = 0xA15
; X86-BMI1NOTBMBMI2-NEXT: bextrl %edx, (%ecx), %ecx
; X86-BMI1NOTBMBMI2-NEXT: incl (%eax,%ecx,4)
; X86-BMI1NOTBMBMI2-NEXT: retl
;
; X64-NOBMI-LABEL: pr38938:
; X64-NOBMI: # %bb.0:
; X64-NOBMI-NEXT: movq (%rsi), %rax
; X64-NOBMI-NEXT: shrq $19, %rax
; X64-NOBMI-NEXT: andl $4092, %eax # imm = 0xFFC
; X64-NOBMI-NEXT: incl (%rdi,%rax)
; X64-NOBMI-NEXT: retq
;
; X64-BMI1NOTBM-LABEL: pr38938:
; X64-BMI1NOTBM: # %bb.0:
; X64-BMI1NOTBM-NEXT: movl $2581, %eax # imm = 0xA15
; X64-BMI1NOTBM-NEXT: bextrq %rax, (%rsi), %rax
; X64-BMI1NOTBM-NEXT: incl (%rdi,%rax,4)
; X64-BMI1NOTBM-NEXT: retq
;
; X64-BMI1TBM-LABEL: pr38938:
; X64-BMI1TBM: # %bb.0:
; X64-BMI1TBM-NEXT: bextrq $2581, (%rsi), %rax # imm = 0xA15
; X64-BMI1TBM-NEXT: incl (%rdi,%rax,4)
; X64-BMI1TBM-NEXT: retq
;
; X64-BMI1NOTBMBMI2-LABEL: pr38938:
; X64-BMI1NOTBMBMI2: # %bb.0:
; X64-BMI1NOTBMBMI2-NEXT: movl $2581, %eax # imm = 0xA15
; X64-BMI1NOTBMBMI2-NEXT: bextrq %rax, (%rsi), %rax
; X64-BMI1NOTBMBMI2-NEXT: incl (%rdi,%rax,4)
; X64-BMI1NOTBMBMI2-NEXT: retq
%tmp = load i64, i64* %a1, align 8
%tmp1 = lshr i64 %tmp, 21
%tmp2 = and i64 %tmp1, 1023

View File

@ -53,7 +53,8 @@ define i32 @test_x86_tbm_bextri_u32_z2(i32 %a, i32 %b, i32 %c) nounwind {
; CHECK-LABEL: test_x86_tbm_bextri_u32_z2:
; CHECK: # %bb.0:
; CHECK-NEXT: movl %esi, %eax
; CHECK-NEXT: bextrl $3076, %edi, %ecx # imm = 0xC04
; CHECK-NEXT: shrl $4, %edi
; CHECK-NEXT: testl $4095, %edi # imm = 0xFFF
; CHECK-NEXT: cmovnel %edx, %eax
; CHECK-NEXT: retq
%t0 = lshr i32 %a, 4
@ -113,7 +114,8 @@ define i64 @test_x86_tbm_bextri_u64_z2(i64 %a, i64 %b, i64 %c) nounwind {
; CHECK-LABEL: test_x86_tbm_bextri_u64_z2:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %rax
; CHECK-NEXT: bextrl $3076, %edi, %ecx # imm = 0xC04
; CHECK-NEXT: shrl $4, %edi
; CHECK-NEXT: testl $4095, %edi # imm = 0xFFF
; CHECK-NEXT: cmovneq %rdx, %rax
; CHECK-NEXT: retq
%t0 = lshr i64 %a, 4