From f4807a19e8107f68504bcf5965c940caa353a161 Mon Sep 17 00:00:00 2001
From: Evan Cheng <evan.cheng@apple.com>
Date: Thu, 27 Oct 2011 21:21:05 +0000
Subject: [PATCH] Avoid partial CPSR dependency from loop backedges.
 rdar://10357570

llvm-svn: 143145
---
 llvm/lib/Target/ARM/Thumb2SizeReduction.cpp | 67 +++++++++++++--------
 llvm/test/CodeGen/ARM/avoid-cpsr-rmw.ll     | 38 +++++++++++-
 2 files changed, 79 insertions(+), 26 deletions(-)

diff --git a/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp b/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp
index 89a155c5a7f5..e5fc8b4fdd5c 100644
--- a/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp
+++ b/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -146,7 +146,8 @@ namespace {
     /// ReduceOpcodeMap - Maps wide opcode to index of entry in ReduceTable.
     DenseMap<unsigned, unsigned> ReduceOpcodeMap;
 
-    bool canAddPseudoFlagDep(MachineInstr *Def, MachineInstr *Use);
+    bool canAddPseudoFlagDep(MachineInstr *Def, MachineInstr *Use,
+                             bool IsSelfLoop);
 
     bool VerifyPredAndCC(MachineInstr *MI, const ReduceEntry &Entry,
                          bool is2Addr, ARMCC::CondCodes Pred,
@@ -157,19 +158,21 @@
     bool ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
                        const ReduceEntry &Entry, bool LiveCPSR,
-                       MachineInstr *CPSRDef);
+                       MachineInstr *CPSRDef, bool IsSelfLoop);
 
     /// ReduceTo2Addr - Reduce a 32-bit instruction to a 16-bit two-address
     /// instruction.
     bool ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
                        const ReduceEntry &Entry,
-                       bool LiveCPSR, MachineInstr *CPSRDef);
+                       bool LiveCPSR, MachineInstr *CPSRDef,
+                       bool IsSelfLoop);
 
     /// ReduceToNarrow - Reduce a 32-bit instruction to a 16-bit
     /// non-two-address instruction.
     bool ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
                         const ReduceEntry &Entry,
-                        bool LiveCPSR, MachineInstr *CPSRDef);
+                        bool LiveCPSR, MachineInstr *CPSRDef,
+                        bool IsSelfLoop);
 
     /// ReduceMBB - Reduce width of instructions in the specified basic block.
     bool ReduceMBB(MachineBasicBlock &MBB);
 
@@ -210,10 +213,17 @@ static bool HasImplicitCPSRDef(const MCInstrDesc &MCID) {
 /// In this case it would have been ok to narrow the mul.w to muls since there
 /// are indirect RAW dependency between the muls and the mul.w
 bool
-Thumb2SizeReduce::canAddPseudoFlagDep(MachineInstr *Def, MachineInstr *Use) {
-  if (!Def || !STI->avoidCPSRPartialUpdate())
+Thumb2SizeReduce::canAddPseudoFlagDep(MachineInstr *Def, MachineInstr *Use,
+                                      bool FirstInSelfLoop) {
+  // FIXME: Disable check for -Oz (aka OptimizeForSizeHarder).
+  if (!STI->avoidCPSRPartialUpdate())
     return false;
 
+  if (!Def)
+    // If this BB loops back to itself, conservatively avoid narrowing the
+    // first instruction that does partial flag update.
+    return FirstInSelfLoop;
+
   SmallSet<unsigned, 2> Defs;
   for (unsigned i = 0, e = Def->getNumOperands(); i != e; ++i) {
     const MachineOperand &MO = Def->getOperand(i);
@@ -476,15 +486,16 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
 bool
 Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
                                 const ReduceEntry &Entry,
-                                bool LiveCPSR, MachineInstr *CPSRDef) {
+                                bool LiveCPSR, MachineInstr *CPSRDef,
+                                bool IsSelfLoop) {
   unsigned Opc = MI->getOpcode();
   if (Opc == ARM::t2ADDri) {
     // If the source register is SP, try to reduce to tADDrSPi, otherwise
     // it's a normal reduce.
     if (MI->getOperand(1).getReg() != ARM::SP) {
-      if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, CPSRDef))
+      if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop))
         return true;
-      return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef);
+      return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop);
     }
     // Try to reduce to tADDrSPi.
     unsigned Imm = MI->getOperand(2).getImm();
@@ -535,12 +546,12 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
       switch (Opc) {
       default: break;
       case ARM::t2ADDSri: {
-        if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, CPSRDef))
+        if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop))
          return true;
        // fallthrough
      }
      case ARM::t2ADDSrr:
-       return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef);
+       return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop);
      }
    }
    break;
@@ -552,13 +563,13 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
   case ARM::t2UXTB:
   case ARM::t2UXTH:
     if (MI->getOperand(2).getImm() == 0)
-      return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef);
+      return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop);
     break;
   case ARM::t2MOVi16:
     // Can convert only 'pure' immediate operands, not immediates obtained as
     // globals' addresses.
     if (MI->getOperand(1).isImm())
-      return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef);
+      return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop);
     break;
   case ARM::t2CMPrr: {
     // Try to reduce to the lo-reg only version first. Why there are two
@@ -568,9 +579,9 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
     // source insn opcode. So for now, we hack a local entry record to use.
     static const ReduceEntry NarrowEntry =
       { ARM::t2CMPrr,ARM::tCMPr, 0, 0, 0, 1, 1,2, 0, 0,1 };
-    if (ReduceToNarrow(MBB, MI, NarrowEntry, LiveCPSR, CPSRDef))
+    if (ReduceToNarrow(MBB, MI, NarrowEntry, LiveCPSR, CPSRDef, IsSelfLoop))
       return true;
-    return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef);
+    return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop);
   }
   }
   return false;
@@ -579,7 +590,8 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
 bool
 Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
                                 const ReduceEntry &Entry,
-                                bool LiveCPSR, MachineInstr *CPSRDef) {
+                                bool LiveCPSR, MachineInstr *CPSRDef,
+                                bool IsSelfLoop) {
 
   if (ReduceLimit2Addr != -1 && ((int)Num2Addrs >= ReduceLimit2Addr))
     return false;
@@ -637,7 +649,7 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
   // Avoid adding a false dependency on partial flag update by some 16-bit
   // instructions which has the 's' bit set.
   if (Entry.PartFlag && NewMCID.hasOptionalDef() && HasCC &&
-      canAddPseudoFlagDep(CPSRDef, MI))
+      canAddPseudoFlagDep(CPSRDef, MI, IsSelfLoop))
     return false;
 
   // Add the 16-bit instruction.
@@ -674,7 +686,8 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
 bool
 Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
                                  const ReduceEntry &Entry,
-                                 bool LiveCPSR, MachineInstr *CPSRDef) {
+                                 bool LiveCPSR, MachineInstr *CPSRDef,
+                                 bool IsSelfLoop) {
   if (ReduceLimit != -1 && ((int)NumNarrows >= ReduceLimit))
     return false;
 
@@ -727,7 +740,7 @@ Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
   // Avoid adding a false dependency on partial flag update by some 16-bit
   // instructions which has the 's' bit set.
   if (Entry.PartFlag && NewMCID.hasOptionalDef() && HasCC &&
-      canAddPseudoFlagDep(CPSRDef, MI))
+      canAddPseudoFlagDep(CPSRDef, MI, IsSelfLoop))
     return false;
 
   // Add the 16-bit instruction.
@@ -818,6 +831,9 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) {
   bool LiveCPSR = MBB.isLiveIn(ARM::CPSR);
   MachineInstr *CPSRDef = 0;
+  // If this BB loops back to itself, conservatively avoid narrowing the
+  // first instruction that does partial flag update.
+  bool IsSelfLoop = MBB.isSuccessor(&MBB);
 
   MachineBasicBlock::iterator MII = MBB.begin(), E = MBB.end();
   MachineBasicBlock::iterator NextMII;
   for (; MII != E; MII = NextMII) {
@@ -832,7 +848,7 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) {
       const ReduceEntry &Entry = ReduceTable[OPI->second];
       // Ignore "special" cases for now.
       if (Entry.Special) {
-        if (ReduceSpecial(MBB, MI, Entry, LiveCPSR, CPSRDef)) {
+        if (ReduceSpecial(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop)) {
           Modified = true;
           MachineBasicBlock::iterator I = prior(NextMII);
           MI = &*I;
@@ -842,7 +858,7 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) {
 
       // Try to transform to a 16-bit two-address instruction.
       if (Entry.NarrowOpc2 &&
-          ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, CPSRDef)) {
+          ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop)) {
         Modified = true;
         MachineBasicBlock::iterator I = prior(NextMII);
         MI = &*I;
@@ -851,7 +867,7 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) {
 
       // Try to transform to a 16-bit non-two-address instruction.
       if (Entry.NarrowOpc1 &&
-          ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef)) {
+          ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop)) {
         Modified = true;
         MachineBasicBlock::iterator I = prior(NextMII);
         MI = &*I;
@@ -861,12 +877,15 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) {
   ProcessNext:
     bool DefCPSR = false;
     LiveCPSR = UpdateCPSRDef(*MI, LiveCPSR, DefCPSR);
-    if (MI->getDesc().isCall())
+    if (MI->getDesc().isCall()) {
       // Calls don't really set CPSR.
       CPSRDef = 0;
-    else if (DefCPSR)
+      IsSelfLoop = false;
+    } else if (DefCPSR) {
       // This is the last CPSR defining instruction.
       CPSRDef = MI;
+      IsSelfLoop = false;
+    }
   }
 
   return Modified;
diff --git a/llvm/test/CodeGen/ARM/avoid-cpsr-rmw.ll b/llvm/test/CodeGen/ARM/avoid-cpsr-rmw.ll
index 92aff7007f23..877ec1806261 100644
--- a/llvm/test/CodeGen/ARM/avoid-cpsr-rmw.ll
+++ b/llvm/test/CodeGen/ARM/avoid-cpsr-rmw.ll
@@ -3,9 +3,9 @@
 ; dependency) when it isn't dependent on last CPSR defining instruction.
 ; rdar://8928208
 
-define i32 @t(i32 %a, i32 %b, i32 %c, i32 %d) nounwind readnone {
+define i32 @t1(i32 %a, i32 %b, i32 %c, i32 %d) nounwind readnone {
 entry:
-; CHECK: t:
+; CHECK: t1:
 ; CHECK: muls [[REG:(r[0-9]+)]], r2, r3
 ; CHECK-NEXT: mul [[REG2:(r[0-9]+)]], r0, r1
 ; CHECK-NEXT: muls r0, [[REG2]], [[REG]]
@@ -14,3 +14,37 @@ define i32 @t(i32 %a, i32 %b, i32 %c, i32 %d) nounwind readnone {
   %2 = mul nsw i32 %0, %1
   ret i32 %2
 }
+
+; Avoid partial CPSR dependency via loop backedge.
+; rdar://10357570
+define void @t2(i32* nocapture %ptr1, i32* %ptr2, i32 %c) nounwind {
+entry:
+; CHECK: t2:
+  %tobool7 = icmp eq i32* %ptr2, null
+  br i1 %tobool7, label %while.end, label %while.body
+
+while.body:
+; CHECK: while.body
+; CHECK: mul r{{[0-9]+}}
+; CHECK-NOT: muls
+  %ptr1.addr.09 = phi i32* [ %add.ptr, %while.body ], [ %ptr1, %entry ]
+  %ptr2.addr.08 = phi i32* [ %incdec.ptr, %while.body ], [ %ptr2, %entry ]
+  %0 = load i32* %ptr1.addr.09, align 4
+  %arrayidx1 = getelementptr inbounds i32* %ptr1.addr.09, i32 1
+  %1 = load i32* %arrayidx1, align 4
+  %arrayidx3 = getelementptr inbounds i32* %ptr1.addr.09, i32 2
+  %2 = load i32* %arrayidx3, align 4
+  %arrayidx4 = getelementptr inbounds i32* %ptr1.addr.09, i32 3
+  %3 = load i32* %arrayidx4, align 4
+  %add.ptr = getelementptr inbounds i32* %ptr1.addr.09, i32 4
+  %mul = mul i32 %1, %0
+  %mul5 = mul i32 %mul, %2
+  %mul6 = mul i32 %mul5, %3
+  store i32 %mul6, i32* %ptr2.addr.08, align 4
+  %incdec.ptr = getelementptr inbounds i32* %ptr2.addr.08, i32 -1
+  %tobool = icmp eq i32* %incdec.ptr, null
+  br i1 %tobool, label %while.end, label %while.body
+
+while.end:
+  ret void
+}