X86: enable CSE between CMP and SUB

We perform the following:
1> Use SUB instead of CMP for i8,i16,i32 and i64 in ISel lowering.
2> Modify MachineCSE to correctly handle implicit defs.
3> Convert SUB back to CMP if possible at peephole.

Removed pattern matching of (a>b) ? (a-b):0 and like, since they are handled
by peephole now.

rdar://11873276

llvm-svn: 161462
This commit is contained in:
Manman Ren 2012-08-08 00:51:41 +00:00
parent 3b9a442841
commit 1be131ba27
6 changed files with 139 additions and 49 deletions

View File

@ -417,6 +417,7 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) {
bool Changed = false;
SmallVector<std::pair<unsigned, unsigned>, 8> CSEPairs;
SmallVector<unsigned, 2> ImplicitDefsToUpdate;
for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; ) {
MachineInstr *MI = &*I;
++I;
@ -486,15 +487,24 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) {
// Check if it's profitable to perform this CSE.
bool DoCSE = true;
unsigned NumDefs = MI->getDesc().getNumDefs();
unsigned NumDefs = MI->getDesc().getNumDefs() +
MI->getDesc().getNumImplicitDefs();
for (unsigned i = 0, e = MI->getNumOperands(); NumDefs && i != e; ++i) {
MachineOperand &MO = MI->getOperand(i);
if (!MO.isReg() || !MO.isDef())
continue;
unsigned OldReg = MO.getReg();
unsigned NewReg = CSMI->getOperand(i).getReg();
if (OldReg == NewReg)
// Go through implicit defs of CSMI and MI, if a def is not dead at MI,
// we should make sure it is not dead at CSMI.
if (MO.isImplicit() && !MO.isDead() && CSMI->getOperand(i).isDead())
ImplicitDefsToUpdate.push_back(i);
if (OldReg == NewReg) {
--NumDefs;
continue;
}
assert(TargetRegisterInfo::isVirtualRegister(OldReg) &&
TargetRegisterInfo::isVirtualRegister(NewReg) &&
@ -526,6 +536,11 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) {
MRI->clearKillFlags(CSEPairs[i].second);
}
// Go through implicit defs of CSMI and MI, if a def is not dead at MI,
// we should make sure it is not dead at CSMI.
for (unsigned i = 0, e = ImplicitDefsToUpdate.size(); i != e; ++i)
CSMI->getOperand(ImplicitDefsToUpdate[i]).setIsDead(false);
if (CrossMBBPhysDef) {
// Add physical register defs now coming in from a predecessor to MBB
// livein list.
@ -549,6 +564,7 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) {
Exps.push_back(MI);
}
CSEPairs.clear();
ImplicitDefsToUpdate.clear();
}
return Changed;

View File

@ -2438,7 +2438,12 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
return NULL;
}
case X86ISD::CMP: {
case X86ISD::CMP:
case X86ISD::SUB: {
// Sometimes a SUB is used to perform comparison.
if (Opcode == X86ISD::SUB && Node->hasAnyUseOfValue(0))
// This node is not a CMP.
break;
SDValue N0 = Node->getOperand(0);
SDValue N1 = Node->getOperand(1);

View File

@ -8333,11 +8333,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
switch (Op.getNode()->getOpcode()) {
default: llvm_unreachable("unexpected operator!");
case ISD::SUB:
// If the only use of SUB is EFLAGS, use CMP instead.
if (Op.hasOneUse())
Opcode = X86ISD::CMP;
else
Opcode = X86ISD::SUB;
Opcode = X86ISD::SUB;
break;
case ISD::OR: Opcode = X86ISD::OR; break;
case ISD::XOR: Opcode = X86ISD::XOR; break;
@ -8391,6 +8387,14 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
return EmitTest(Op0, X86CC, DAG);
DebugLoc dl = Op0.getDebugLoc();
if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
// Use SUB instead of CMP to enable CSE between SUB and CMP.
SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
Op0, Op1);
return SDValue(Sub.getNode(), 1);
}
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
}
@ -8765,46 +8769,6 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
Cond = NewCond;
}
// Handle the following cases related to max and min:
// (a > b) ? (a-b) : 0
// (a >= b) ? (a-b) : 0
// (b < a) ? (a-b) : 0
// (b <= a) ? (a-b) : 0
// Comparison is removed to use EFLAGS from SUB.
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op2))
if (Cond.getOpcode() == X86ISD::SETCC &&
Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
(Op1.getOpcode() == ISD::SUB || Op1.getOpcode() == X86ISD::SUB) &&
C->getAPIntValue() == 0) {
SDValue Cmp = Cond.getOperand(1);
unsigned CC = cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
if ((DAG.isEqualTo(Op1.getOperand(0), Cmp.getOperand(0)) &&
DAG.isEqualTo(Op1.getOperand(1), Cmp.getOperand(1)) &&
(CC == X86::COND_G || CC == X86::COND_GE ||
CC == X86::COND_A || CC == X86::COND_AE)) ||
(DAG.isEqualTo(Op1.getOperand(0), Cmp.getOperand(1)) &&
DAG.isEqualTo(Op1.getOperand(1), Cmp.getOperand(0)) &&
(CC == X86::COND_L || CC == X86::COND_LE ||
CC == X86::COND_B || CC == X86::COND_BE))) {
if (Op1.getOpcode() == ISD::SUB) {
SDVTList VTs = DAG.getVTList(Op1.getValueType(), MVT::i32);
SDValue New = DAG.getNode(X86ISD::SUB, DL, VTs,
Op1.getOperand(0), Op1.getOperand(1));
DAG.ReplaceAllUsesWith(Op1, New);
Op1 = New;
}
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
unsigned NewCC = (CC == X86::COND_G || CC == X86::COND_GE ||
CC == X86::COND_L ||
CC == X86::COND_LE) ? X86::COND_GE : X86::COND_AE;
SDValue Ops[] = { Op2, Op1, DAG.getConstant(NewCC, MVT::i8),
SDValue(Op1.getNode(), 1) };
return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops));
}
}
// (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
// (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
// (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
@ -8945,7 +8909,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
// a < b ? 0 : -1 -> RES = setcc_carry
// a >= b ? -1 : 0 -> RES = setcc_carry
// a >= b ? 0 : -1 -> RES = ~setcc_carry
if (Cond.getOpcode() == X86ISD::CMP) {
if (Cond.getOpcode() == X86ISD::SUB) {
Cond = ConvertCmpIfNecessary(Cond, DAG);
unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();

View File

@ -1132,8 +1132,10 @@ defm XOR : ArithBinOp_RF<0x30, 0x32, 0x34, "xor", MRM6r, MRM6m,
X86xor_flag, xor, 1, 0>;
defm ADD : ArithBinOp_RF<0x00, 0x02, 0x04, "add", MRM0r, MRM0m,
X86add_flag, add, 1, 1>;
let isCompare = 1 in {
defm SUB : ArithBinOp_RF<0x28, 0x2A, 0x2C, "sub", MRM5r, MRM5m,
X86sub_flag, sub, 0, 0>;
}
// Arithmetic.
let Uses = [EFLAGS] in {

View File

@ -3031,6 +3031,37 @@ analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, unsigned &SrcReg2,
CmpMask = ~0;
CmpValue = MI->getOperand(1).getImm();
return true;
// A SUB can be used to perform comparison.
case X86::SUB64rm:
case X86::SUB32rm:
case X86::SUB16rm:
case X86::SUB8rm:
SrcReg = MI->getOperand(1).getReg();
SrcReg2 = 0;
CmpMask = ~0;
CmpValue = 0;
return true;
case X86::SUB64rr:
case X86::SUB32rr:
case X86::SUB16rr:
case X86::SUB8rr:
SrcReg = MI->getOperand(1).getReg();
SrcReg2 = MI->getOperand(2).getReg();
CmpMask = ~0;
CmpValue = 0;
return true;
case X86::SUB64ri32:
case X86::SUB64ri8:
case X86::SUB32ri:
case X86::SUB32ri8:
case X86::SUB16ri:
case X86::SUB16ri8:
case X86::SUB8ri:
SrcReg = MI->getOperand(1).getReg();
SrcReg2 = 0;
CmpMask = ~0;
CmpValue = MI->getOperand(2).getImm();
return true;
case X86::CMP64rr:
case X86::CMP32rr:
case X86::CMP16rr:
@ -3139,6 +3170,55 @@ bool X86InstrInfo::
optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
int CmpMask, int CmpValue,
const MachineRegisterInfo *MRI) const {
// Check whether we can replace SUB with CMP.
unsigned NewOpcode = 0;
switch (CmpInstr->getOpcode()) {
default: break;
case X86::SUB64ri32:
case X86::SUB64ri8:
case X86::SUB32ri:
case X86::SUB32ri8:
case X86::SUB16ri:
case X86::SUB16ri8:
case X86::SUB8ri:
case X86::SUB64rm:
case X86::SUB32rm:
case X86::SUB16rm:
case X86::SUB8rm:
case X86::SUB64rr:
case X86::SUB32rr:
case X86::SUB16rr:
case X86::SUB8rr: {
if (!MRI->use_nodbg_empty(CmpInstr->getOperand(0).getReg()))
return false;
// There is no use of the destination register, we can replace SUB with CMP.
switch (CmpInstr->getOpcode()) {
default: llvm_unreachable(0);
case X86::SUB64rm: NewOpcode = X86::CMP64rm; break;
case X86::SUB32rm: NewOpcode = X86::CMP32rm; break;
case X86::SUB16rm: NewOpcode = X86::CMP16rm; break;
case X86::SUB8rm: NewOpcode = X86::CMP8rm; break;
case X86::SUB64rr: NewOpcode = X86::CMP64rr; break;
case X86::SUB32rr: NewOpcode = X86::CMP32rr; break;
case X86::SUB16rr: NewOpcode = X86::CMP16rr; break;
case X86::SUB8rr: NewOpcode = X86::CMP8rr; break;
case X86::SUB64ri32: NewOpcode = X86::CMP64ri32; break;
case X86::SUB64ri8: NewOpcode = X86::CMP64ri8; break;
case X86::SUB32ri: NewOpcode = X86::CMP32ri; break;
case X86::SUB32ri8: NewOpcode = X86::CMP32ri8; break;
case X86::SUB16ri: NewOpcode = X86::CMP16ri; break;
case X86::SUB16ri8: NewOpcode = X86::CMP16ri8; break;
case X86::SUB8ri: NewOpcode = X86::CMP8ri; break;
}
CmpInstr->setDesc(get(NewOpcode));
CmpInstr->RemoveOperand(0);
// Fall through to optimize Cmp if Cmp is CMPrr or CMPri.
if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm ||
NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm)
return false;
}
}
// Get the unique definition of SrcReg.
MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
if (!MI) return false;

View File

@ -230,3 +230,26 @@ define i32 @q(i32 %j.4, i32 %w, i32 %el) {
%j.5 = select i1 %or.cond, i32 %tmp535, i32 %j.4
ret i32 %j.5
}
; rdar://11873276
define i8* @r(i8* %base, i32* nocapture %offset, i32 %size) nounwind {
entry:
; CHECK: r:
; CHECK: sub
; CHECK-NOT: cmp
; CHECK: j
; CHECK-NOT: sub
; CHECK: ret
%0 = load i32* %offset, align 8
%cmp = icmp slt i32 %0, %size
br i1 %cmp, label %return, label %if.end
if.end:
%sub = sub nsw i32 %0, %size
store i32 %sub, i32* %offset, align 8
%add.ptr = getelementptr inbounds i8* %base, i32 %sub
br label %return
return:
%retval.0 = phi i8* [ %add.ptr, %if.end ], [ null, %entry ]
ret i8* %retval.0
}