diff --git a/llvm/include/llvm/CodeGen/ScheduleDAG.h b/llvm/include/llvm/CodeGen/ScheduleDAG.h
index 076268b99c20..a86ba831853c 100644
--- a/llvm/include/llvm/CodeGen/ScheduleDAG.h
+++ b/llvm/include/llvm/CodeGen/ScheduleDAG.h
@@ -247,6 +247,7 @@ namespace llvm {
     unsigned NumSuccs;      // # of SDep::Data sucss.
     unsigned NumPredsLeft;  // # of preds not scheduled.
     unsigned NumSuccsLeft;  // # of succs not scheduled.
+    bool isCall : 1;        // Is a function call.
     bool isTwoAddress : 1;  // Is a two-address instruction.
    bool isCommutable : 1;  // Is a commutable instruction.
     bool hasPhysRegDefs : 1; // Has physreg defs that are being used.
@@ -273,7 +274,8 @@ namespace llvm {
     SUnit(SDNode *node, unsigned nodenum)
       : Node(node), Instr(0), OrigNode(0), NodeNum(nodenum), NodeQueueId(0),
         Latency(0), NumPreds(0), NumSuccs(0), NumPredsLeft(0),
-        NumSuccsLeft(0), isTwoAddress(false), isCommutable(false),
+        NumSuccsLeft(0),
+        isCall(false), isTwoAddress(false), isCommutable(false),
         hasPhysRegDefs(false), hasPhysRegClobbers(false),
         isPending(false), isAvailable(false), isScheduled(false),
         isScheduleHigh(false), isCloned(false),
@@ -286,7 +288,8 @@ namespace llvm {
     SUnit(MachineInstr *instr, unsigned nodenum)
       : Node(0), Instr(instr), OrigNode(0), NodeNum(nodenum), NodeQueueId(0),
         Latency(0), NumPreds(0), NumSuccs(0), NumPredsLeft(0),
-        NumSuccsLeft(0), isTwoAddress(false), isCommutable(false),
+        NumSuccsLeft(0),
+        isCall(false), isTwoAddress(false), isCommutable(false),
         hasPhysRegDefs(false), hasPhysRegClobbers(false),
         isPending(false), isAvailable(false), isScheduled(false),
         isScheduleHigh(false), isCloned(false),
@@ -298,7 +301,8 @@ namespace llvm {
     SUnit()
       : Node(0), Instr(0), OrigNode(0), NodeNum(~0u), NodeQueueId(0),
         Latency(0), NumPreds(0), NumSuccs(0), NumPredsLeft(0),
-        NumSuccsLeft(0), isTwoAddress(false), isCommutable(false),
+        NumSuccsLeft(0),
+        isCall(false), isTwoAddress(false), isCommutable(false),
         hasPhysRegDefs(false), hasPhysRegClobbers(false),
         isPending(false), isAvailable(false), isScheduled(false),
         isScheduleHigh(false), isCloned(false),
diff --git a/llvm/include/llvm/Target/TargetInstrInfo.h b/llvm/include/llvm/Target/TargetInstrInfo.h
index 0ecfc79c6b99..7e77dba761f0 100644
--- a/llvm/include/llvm/Target/TargetInstrInfo.h
+++ b/llvm/include/llvm/Target/TargetInstrInfo.h
@@ -304,12 +304,14 @@ public:
     return true;
   }

-  /// isProfitableToIfCvt - Return true if it's profitable to first "NumInstrs"
+  /// isProfitableToIfCvt - Return true if it's profitable to predicate
+  /// instructions with accumulated instruction latency of "NumCycles"
   /// of the specified basic block, where the probability of the instructions
   /// being executed is given by Probability, and Confidence is a measure
   /// of our confidence that it will be properly predicted.
   virtual
-  bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumInstrs,
+  bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
+                           unsigned ExtraPredCycles,
                            float Probability, float Confidence) const {
     return false;
   }
@@ -321,19 +323,22 @@ public:
   /// by Probability, and Confidence is a measure of our confidence that it
   /// will be properly predicted.
   virtual bool
-  isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumTInstrs,
-                      MachineBasicBlock &FMBB, unsigned NumFInstrs,
+  isProfitableToIfCvt(MachineBasicBlock &TMBB,
+                      unsigned NumTCycles, unsigned ExtraTCycles,
+                      MachineBasicBlock &FMBB,
+                      unsigned NumFCycles, unsigned ExtraFCycles,
                       float Probability, float Confidence) const {
     return false;
   }

   /// isProfitableToDupForIfCvt - Return true if it's profitable for
-  /// if-converter to duplicate a specific number of instructions in the
-  /// specified MBB to enable if-conversion, where the probability of the
-  /// instructions being executed is given by Probability, and Confidence is
-  /// a measure of our confidence that it will be properly predicted.
+  /// if-converter to duplicate instructions of specified accumulated
+  /// instruction latencies in the specified MBB to enable if-conversion.
+  /// The probability of the instructions being executed is given by
+  /// Probability, and Confidence is a measure of our confidence that it
+  /// will be properly predicted.
   virtual bool
-  isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumInstrs,
+  isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
                             float Probability, float Confidence) const {
     return false;
   }
@@ -608,24 +613,31 @@ public:

   /// getNumMicroOps - Return the number of u-operations the given machine
   /// instruction will be decoded to on the target cpu.
-  virtual unsigned getNumMicroOps(const MachineInstr *MI,
-                                  const InstrItineraryData *ItinData) const;
+  virtual unsigned getNumMicroOps(const InstrItineraryData *ItinData,
+                                  const MachineInstr *MI) const;

   /// getOperandLatency - Compute and return the use operand latency of a given
-  /// itinerary class and operand index if the value is produced by an
-  /// instruction of the specified itinerary class and def operand index.
+  /// pair of def and use.
   /// In most cases, the static scheduling itinerary was enough to determine the
   /// operand latency. But it may not be possible for instructions with variable
   /// number of defs / uses.
-  virtual
-  int getOperandLatency(const InstrItineraryData *ItinData,
-                        const MachineInstr *DefMI, unsigned DefIdx,
-                        const MachineInstr *UseMI, unsigned UseIdx) const;
+  virtual int getOperandLatency(const InstrItineraryData *ItinData,
+                                const MachineInstr *DefMI, unsigned DefIdx,
+                                const MachineInstr *UseMI, unsigned UseIdx) const;

-  virtual
-  int getOperandLatency(const InstrItineraryData *ItinData,
-                        SDNode *DefNode, unsigned DefIdx,
-                        SDNode *UseNode, unsigned UseIdx) const;
+  virtual int getOperandLatency(const InstrItineraryData *ItinData,
+                                SDNode *DefNode, unsigned DefIdx,
+                                SDNode *UseNode, unsigned UseIdx) const;
+
+  /// getInstrLatency - Compute the instruction latency of a given instruction.
+  /// If the instruction has higher cost when predicated, it's returned via
+  /// PredCost.
+  virtual int getInstrLatency(const InstrItineraryData *ItinData,
+                              const MachineInstr *MI,
+                              unsigned *PredCost = 0) const;
+
+  virtual int getInstrLatency(const InstrItineraryData *ItinData,
+                              SDNode *Node) const;

   /// hasHighOperandLatency - Compute operand latency between a def of 'Reg'
   /// and an use in the current loop, return true if the target considered
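
The replacement hooks ask the profitability question in cycles rather than instruction counts. Below is a minimal standalone sketch of the comparison a target can implement behind the single-block hook; it mirrors the ARM override later in this patch, with MispredictPenalty standing in for Subtarget.getMispredictionPenalty() and the numbers in main() invented for illustration:

    #include <cassert>

    // Sketch only: models the cycle-based test, not the actual
    // TargetInstrInfo API.
    static bool sketchIsProfitableToIfCvt(unsigned NumCycles,
                                          unsigned ExtraPredCycles,
                                          float Probability, float Confidence,
                                          float MispredictPenalty) {
      if (NumCycles == 0)
        return false;
      // Expected cost of keeping the branch: block latency weighted by the
      // chance the block executes, plus the branch and its expected
      // misprediction cost.
      float UnpredCost = Probability * NumCycles;
      UnpredCost += 1.0f;                                   // the branch itself
      UnpredCost += (1.0f - Confidence) * MispredictPenalty;
      // Predicated cost: the block's cycles are always paid, plus any extra
      // cost the instructions incur when predicated.
      return float(NumCycles + ExtraPredCycles) < UnpredCost;
    }

    int main() {
      // A 3-cycle block taken 50% of the time, confidence 0.8, 10-cycle
      // mispredict penalty: UnpredCost = 1.5 + 1.0 + 2.0 = 4.5 > 3.0,
      // so predication wins.
      assert(sketchIsProfitableToIfCvt(3, 0, 0.5f, 0.8f, 10.0f));
      return 0;
    }
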
diff --git a/llvm/lib/CodeGen/IfConversion.cpp b/llvm/lib/CodeGen/IfConversion.cpp
index e90985bf8994..c05aa40b6742 100644
--- a/llvm/lib/CodeGen/IfConversion.cpp
+++ b/llvm/lib/CodeGen/IfConversion.cpp
@@ -93,7 +93,8 @@ namespace {
     /// ClobbersPred - True if BB could modify predicates (e.g. has
     ///                cmp, call, etc.)
     /// NonPredSize - Number of non-predicated instructions.
-    /// ExtraCost   - Extra cost for microcoded instructions.
+    /// ExtraCost   - Extra cost for multi-cycle instructions.
+    /// ExtraCost2  - Extra cost for instructions that are slower when predicated.
     /// BB          - Corresponding MachineBasicBlock.
     /// TrueBB / FalseBB- See AnalyzeBranch().
     /// BrCond      - Conditions for end of block conditional branches.
@@ -110,6 +111,7 @@ namespace {
      bool ClobbersPred : 1;
      unsigned NonPredSize;
      unsigned ExtraCost;
+     unsigned ExtraCost2;
      MachineBasicBlock *BB;
      MachineBasicBlock *TrueBB;
      MachineBasicBlock *FalseBB;
@@ -119,7 +121,7 @@ namespace {
        IsAnalyzed(false), IsEnqueued(false), IsBrAnalyzable(false),
        HasFallThrough(false), IsUnpredicable(false), CannotBeCopied(false),
        ClobbersPred(false), NonPredSize(0),
-       ExtraCost(0), BB(0), TrueBB(0), FalseBB(0) {}
+       ExtraCost(0), ExtraCost2(0), BB(0), TrueBB(0), FalseBB(0) {}
    };

    /// IfcvtToken - Record information about pending if-conversions to attempt:
@@ -203,17 +205,20 @@ namespace {
                        bool IgnoreBr = false);
     void MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges = true);

-    bool MeetIfcvtSizeLimit(MachineBasicBlock &BB, unsigned Size,
+    bool MeetIfcvtSizeLimit(MachineBasicBlock &BB,
+                            unsigned Cycle, unsigned Extra,
                             float Prediction, float Confidence) const {
-      return Size > 0 && TII->isProfitableToIfCvt(BB, Size,
-                                                  Prediction, Confidence);
+      return Cycle > 0 && TII->isProfitableToIfCvt(BB, Cycle, Extra,
+                                                   Prediction, Confidence);
     }

-    bool MeetIfcvtSizeLimit(MachineBasicBlock &TBB, unsigned TSize,
-                            MachineBasicBlock &FBB, unsigned FSize,
+    bool MeetIfcvtSizeLimit(MachineBasicBlock &TBB,
+                            unsigned TCycle, unsigned TExtra,
+                            MachineBasicBlock &FBB,
+                            unsigned FCycle, unsigned FExtra,
                             float Prediction, float Confidence) const {
-      return TSize > 0 && FSize > 0 &&
-        TII->isProfitableToIfCvt(TBB, TSize, FBB, FSize,
+      return TCycle > 0 && FCycle > 0 &&
+        TII->isProfitableToIfCvt(TBB, TCycle, TExtra, FBB, FCycle, FExtra,
                                  Prediction, Confidence);
     }
@@ -649,6 +654,7 @@ void IfConverter::ScanInstructions(BBInfo &BBI) {
   // Then scan all the instructions.
   BBI.NonPredSize = 0;
   BBI.ExtraCost = 0;
+  BBI.ExtraCost2 = 0;
   BBI.ClobbersPred = false;
   for (MachineBasicBlock::iterator I = BBI.BB->begin(), E = BBI.BB->end();
        I != E; ++I) {
@@ -665,9 +671,12 @@ void IfConverter::ScanInstructions(BBInfo &BBI) {
     if (!isCondBr) {
       if (!isPredicated) {
         BBI.NonPredSize++;
-        unsigned NumOps = TII->getNumMicroOps(&*I, InstrItins);
-        if (NumOps > 1)
-          BBI.ExtraCost += NumOps-1;
+        unsigned ExtraPredCost = 0;
+        unsigned NumCycles = TII->getInstrLatency(InstrItins, &*I,
+                                                  &ExtraPredCost);
+        if (NumCycles > 1)
+          BBI.ExtraCost += NumCycles-1;
+        BBI.ExtraCost2 += ExtraPredCost;
       } else if (!AlreadyPredicated) {
         // FIXME: This instruction is already predicated before the
         // if-conversion pass. It's probably something like a conditional move.
@@ -815,9 +824,9 @@ IfConverter::BBInfo &IfConverter::AnalyzeBlock(MachineBasicBlock *BB,
   if (CanRevCond && ValidDiamond(TrueBBI, FalseBBI, Dups, Dups2) &&
       MeetIfcvtSizeLimit(*TrueBBI.BB, (TrueBBI.NonPredSize - (Dups + Dups2) +
-                                       TrueBBI.ExtraCost),
+                                       TrueBBI.ExtraCost), TrueBBI.ExtraCost2,
                          *FalseBBI.BB, (FalseBBI.NonPredSize - (Dups + Dups2) +
-                                       FalseBBI.ExtraCost),
+                                       FalseBBI.ExtraCost), FalseBBI.ExtraCost2,
                          Prediction, Confidence) &&
       FeasibilityAnalysis(TrueBBI, BBI.BrCond) &&
       FeasibilityAnalysis(FalseBBI, RevCond)) {
@@ -836,7 +845,7 @@ IfConverter::BBInfo &IfConverter::AnalyzeBlock(MachineBasicBlock *BB,
   if (ValidTriangle(TrueBBI, FalseBBI, false, Dups, Prediction, Confidence) &&
       MeetIfcvtSizeLimit(*TrueBBI.BB, TrueBBI.NonPredSize + TrueBBI.ExtraCost,
-                         Prediction, Confidence) &&
+                         TrueBBI.ExtraCost2, Prediction, Confidence) &&
       FeasibilityAnalysis(TrueBBI, BBI.BrCond, true)) {
     // Triangle:
     //   EBB
@@ -851,7 +860,7 @@ IfConverter::BBInfo &IfConverter::AnalyzeBlock(MachineBasicBlock *BB,
   if (ValidTriangle(TrueBBI, FalseBBI, true, Dups, Prediction, Confidence) &&
       MeetIfcvtSizeLimit(*TrueBBI.BB, TrueBBI.NonPredSize + TrueBBI.ExtraCost,
-                         Prediction, Confidence) &&
+                         TrueBBI.ExtraCost2, Prediction, Confidence) &&
       FeasibilityAnalysis(TrueBBI, BBI.BrCond, true, true)) {
     Tokens.push_back(new IfcvtToken(BBI, ICTriangleRev, TNeedSub, Dups));
     Enqueued = true;
@@ -859,7 +868,7 @@ IfConverter::BBInfo &IfConverter::AnalyzeBlock(MachineBasicBlock *BB,
   if (ValidSimple(TrueBBI, Dups, Prediction, Confidence) &&
       MeetIfcvtSizeLimit(*TrueBBI.BB, TrueBBI.NonPredSize + TrueBBI.ExtraCost,
-                         Prediction, Confidence) &&
+                         TrueBBI.ExtraCost2, Prediction, Confidence) &&
       FeasibilityAnalysis(TrueBBI, BBI.BrCond)) {
     // Simple (split, no rejoin):
     //   EBB
@@ -878,7 +887,7 @@ IfConverter::BBInfo &IfConverter::AnalyzeBlock(MachineBasicBlock *BB,
                     1.0-Prediction, Confidence) &&
       MeetIfcvtSizeLimit(*FalseBBI.BB,
                          FalseBBI.NonPredSize + FalseBBI.ExtraCost,
-                         1.0-Prediction, Confidence) &&
+                         FalseBBI.ExtraCost2, 1.0-Prediction, Confidence) &&
       FeasibilityAnalysis(FalseBBI, RevCond, true)) {
     Tokens.push_back(new IfcvtToken(BBI, ICTriangleFalse, FNeedSub, Dups));
     Enqueued = true;
@@ -888,7 +897,7 @@ IfConverter::BBInfo &IfConverter::AnalyzeBlock(MachineBasicBlock *BB,
                     1.0-Prediction, Confidence) &&
       MeetIfcvtSizeLimit(*FalseBBI.BB,
                          FalseBBI.NonPredSize + FalseBBI.ExtraCost,
-                         1.0-Prediction, Confidence) &&
+                         FalseBBI.ExtraCost2, 1.0-Prediction, Confidence) &&
       FeasibilityAnalysis(FalseBBI, RevCond, true, true)) {
     Tokens.push_back(new IfcvtToken(BBI, ICTriangleFRev, FNeedSub, Dups));
     Enqueued = true;
@@ -897,7 +906,7 @@ IfConverter::BBInfo &IfConverter::AnalyzeBlock(MachineBasicBlock *BB,
   if (ValidSimple(FalseBBI, Dups, 1.0-Prediction, Confidence) &&
       MeetIfcvtSizeLimit(*FalseBBI.BB,
                          FalseBBI.NonPredSize + FalseBBI.ExtraCost,
-                         1.0-Prediction, Confidence) &&
+                         FalseBBI.ExtraCost2, 1.0-Prediction, Confidence) &&
       FeasibilityAnalysis(FalseBBI, RevCond)) {
     Tokens.push_back(new IfcvtToken(BBI, ICSimpleFalse, FNeedSub, Dups));
     Enqueued = true;
@@ -1427,9 +1436,11 @@ void IfConverter::CopyAndPredicateBlock(BBInfo &ToBBI, BBInfo &FromBBI,
     MachineInstr *MI = MF.CloneMachineInstr(I);
     ToBBI.BB->insert(ToBBI.BB->end(), MI);
     ToBBI.NonPredSize++;
-    unsigned NumOps = TII->getNumMicroOps(MI, InstrItins);
-    if (NumOps > 1)
-      ToBBI.ExtraCost += NumOps-1;
+    unsigned ExtraPredCost = 0;
+    unsigned NumCycles = TII->getInstrLatency(InstrItins, &*I, &ExtraPredCost);
+    if (NumCycles > 1)
+      ToBBI.ExtraCost += NumCycles-1;
+    ToBBI.ExtraCost2 += ExtraPredCost;

     if (!TII->isPredicated(I) && !MI->isDebugValue()) {
       if (!TII->PredicateInstruction(MI, Cond)) {
@@ -1504,8 +1515,10 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) {

   ToBBI.NonPredSize += FromBBI.NonPredSize;
   ToBBI.ExtraCost += FromBBI.ExtraCost;
+  ToBBI.ExtraCost2 += FromBBI.ExtraCost2;
   FromBBI.NonPredSize = 0;
   FromBBI.ExtraCost = 0;
+  FromBBI.ExtraCost2 = 0;

   ToBBI.ClobbersPred |= FromBBI.ClobbersPred;
   ToBBI.HasFallThrough = FromBBI.HasFallThrough;
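
To see what the if-converter now accumulates per block, here is a standalone sketch of the ScanInstructions bookkeeping above. The three latency/penalty pairs are invented; in the pass they come from TII->getInstrLatency(InstrItins, &*I, &ExtraPredCost):

    #include <cstdio>
    #include <vector>

    struct InstrCost { unsigned Latency, ExtraPredCost; };

    int main() {
      // Hypothetical block: a 1-cycle ALU op, a 2-cycle load, and a 3-cycle
      // multiply that costs one extra cycle when predicated.
      std::vector<InstrCost> Block = {{1, 0}, {2, 0}, {3, 1}};

      unsigned NonPredSize = 0, ExtraCost = 0, ExtraCost2 = 0;
      for (const InstrCost &IC : Block) {
        ++NonPredSize;                  // one per non-predicated instruction
        if (IC.Latency > 1)
          ExtraCost += IC.Latency - 1;  // cycles beyond the first
        ExtraCost2 += IC.ExtraPredCost; // predication penalties
      }
      // NonPredSize = 3, ExtraCost = (2-1) + (3-1) = 3, ExtraCost2 = 1.
      // The size-limit query then becomes
      //   MeetIfcvtSizeLimit(BB, NonPredSize + ExtraCost /* 6 cycles */,
      //                      ExtraCost2 /* 1 */, Prediction, Confidence).
      std::printf("%u %u %u\n", NonPredSize, ExtraCost, ExtraCost2);
      return 0;
    }
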
diff --git a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
index abd68caf1224..e86a78c69195 100644
--- a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -238,6 +238,8 @@ void ScheduleDAGInstrs::BuildSchedGraph(AliasAnalysis *AA) {
            "Cannot schedule terminators or labels!");
     // Create the SUnit for this MI.
     SUnit *SU = NewSUnit(MI);
+    SU->isCall = TID.isCall();
+    SU->isCommutable = TID.isCommutable();

     // Assign the Latency field of SU using target-provided information.
     if (UnitLatencies)
@@ -564,9 +566,9 @@ void ScheduleDAGInstrs::ComputeLatency(SUnit *SU) {
     // extra time.
     if (SU->getInstr()->getDesc().mayLoad())
       SU->Latency += 2;
-  } else
-    SU->Latency =
-      InstrItins->getStageLatency(SU->getInstr()->getDesc().getSchedClass());
+  } else {
+    SU->Latency = TII->getInstrLatency(InstrItins, SU->getInstr());
+  }
 }

 void ScheduleDAGInstrs::ComputeOperandLatency(SUnit *Def, SUnit *Use,
diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
index ea1aaa1e05c7..9978d00f20f0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
@@ -1589,6 +1589,10 @@ bool src_ls_rr_sort::operator()(const SUnit *left, const SUnit *right) const {
 }

 bool hybrid_ls_rr_sort::operator()(const SUnit *left, const SUnit *right) const{
+  if (left->isCall || right->isCall)
+    // No way to compute latency of calls.
+    return BURRSort(left, right, SPQ);
+
   bool LHigh = SPQ->HighRegPressure(left);
   bool RHigh = SPQ->HighRegPressure(right);
   // Avoid causing spills. If register pressure is high, schedule for
@@ -1648,6 +1652,10 @@ bool hybrid_ls_rr_sort::operator()(const SUnit *left, const SUnit *right) const{

 bool ilp_ls_rr_sort::operator()(const SUnit *left, const SUnit *right) const {
+  if (left->isCall || right->isCall)
+    // No way to compute latency of calls.
+    return BURRSort(left, right, SPQ);
+
   bool LHigh = SPQ->HighRegPressure(left);
   bool RHigh = SPQ->HighRegPressure(right);
   // Avoid causing spills. If register pressure is high, schedule for
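
Both priority functions now begin with the same early out. A reduced sketch of the pattern; FakeSU and both orderings are illustrative stand-ins, not the real SUnit, BURRSort, or scheduling policies:

    // Route any pair involving a call to the register-reduction order;
    // the latency-based policy would otherwise compare garbage, since no
    // itinerary models a call's latency.
    struct FakeSU { bool isCall; int SethiUllman; int Height; };

    static bool regReductionLess(const FakeSU &L, const FakeSU &R) {
      return L.SethiUllman < R.SethiUllman;  // stands in for BURRSort
    }

    static bool hybridLess(const FakeSU &L, const FakeSU &R) {
      if (L.isCall || R.isCall)
        return regReductionLess(L, R);  // no way to compute call latency
      return L.Height < R.Height;       // latency-aware policy otherwise
    }
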
diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index 7d01bd31b960..429b1152b076 100644
--- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -72,6 +72,7 @@ SUnit *ScheduleDAGSDNodes::Clone(SUnit *Old) {
   SUnit *SU = NewSUnit(Old->getNode());
   SU->OrigNode = Old->OrigNode;
   SU->Latency = Old->Latency;
+  SU->isCall = Old->isCall;
   SU->isTwoAddress = Old->isTwoAddress;
   SU->isCommutable = Old->isCommutable;
   SU->hasPhysRegDefs = Old->hasPhysRegDefs;
@@ -300,6 +301,8 @@ void ScheduleDAGSDNodes::BuildSchedUnits() {
       N = N->getOperand(N->getNumOperands()-1).getNode();
       assert(N->getNodeId() == -1 && "Node already inserted!");
       N->setNodeId(NodeSUnit->NodeNum);
+      if (N->isMachineOpcode() && TII->get(N->getMachineOpcode()).isCall())
+        NodeSUnit->isCall = true;
     }

     // Scan down to find any flagged succs.
@@ -316,6 +319,8 @@ void ScheduleDAGSDNodes::BuildSchedUnits() {
           assert(N->getNodeId() == -1 && "Node already inserted!");
           N->setNodeId(NodeSUnit->NodeNum);
           N = *UI;
+          if (N->isMachineOpcode() && TII->get(N->getMachineOpcode()).isCall())
+            NodeSUnit->isCall = true;
           break;
         }
       if (!HasFlagUse) break;
@@ -438,10 +443,8 @@ void ScheduleDAGSDNodes::ComputeLatency(SUnit *SU) {
   // all nodes flagged together into this SUnit.
   SU->Latency = 0;
   for (SDNode *N = SU->getNode(); N; N = N->getFlaggedNode())
-    if (N->isMachineOpcode()) {
-      SU->Latency += InstrItins->
-        getStageLatency(TII->get(N->getMachineOpcode()).getSchedClass());
-    }
+    if (N->isMachineOpcode())
+      SU->Latency += TII->getInstrLatency(InstrItins, N);
 }

 void ScheduleDAGSDNodes::ComputeOperandLatency(SDNode *Def, SDNode *Use,
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 325706500865..b023379e7ba8 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -40,10 +40,6 @@ static cl::opt<bool>
 EnableARM3Addr("enable-arm-3-addr-conv", cl::Hidden,
                cl::desc("Enable ARM 2-addr to 3-addr conv"));

-static cl::opt<bool>
-OldARMIfCvt("old-arm-ifcvt", cl::Hidden,
-            cl::desc("Use old-style ARM if-conversion heuristics"));
-
 ARMBaseInstrInfo::ARMBaseInstrInfo(const ARMSubtarget& STI)
   : TargetInstrInfoImpl(ARMInsts, array_lengthof(ARMInsts)),
     Subtarget(STI) {
@@ -1205,53 +1201,36 @@ bool ARMBaseInstrInfo::isSchedulingBoundary(const MachineInstr *MI,
 }

 bool ARMBaseInstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB,
-                                           unsigned NumInstrs,
+                                           unsigned NumCycles,
+                                           unsigned ExtraPredCycles,
                                            float Probability,
                                            float Confidence) const {
-  if (!NumInstrs)
+  if (!NumCycles)
     return false;

-  // Use old-style heuristics
-  if (OldARMIfCvt) {
-    if (Subtarget.getCPUString() == "generic")
-      // Generic (and overly aggressive) if-conversion limits for testing.
-      return NumInstrs <= 10;
-    if (Subtarget.hasV7Ops())
-      return NumInstrs <= 3;
-    return NumInstrs <= 2;
-  }
-
   // Attempt to estimate the relative costs of predication versus branching.
-  float UnpredCost = Probability * NumInstrs;
+  float UnpredCost = Probability * NumCycles;
   UnpredCost += 1.0; // The branch itself
   UnpredCost += (1.0 - Confidence) * Subtarget.getMispredictionPenalty();

-  float PredCost = NumInstrs;
-
-  return PredCost < UnpredCost;
-
+  return (float)(NumCycles + ExtraPredCycles) < UnpredCost;
 }

 bool ARMBaseInstrInfo::
-isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumT,
-                    MachineBasicBlock &FMBB, unsigned NumF,
+isProfitableToIfCvt(MachineBasicBlock &TMBB,
+                    unsigned TCycles, unsigned TExtra,
+                    MachineBasicBlock &FMBB,
+                    unsigned FCycles, unsigned FExtra,
                     float Probability, float Confidence) const {
-  // Use old-style if-conversion heuristics
-  if (OldARMIfCvt) {
-    return NumT && NumF && NumT <= 2 && NumF <= 2;
-  }
-
-  if (!NumT || !NumF)
+  if (!TCycles || !FCycles)
     return false;

   // Attempt to estimate the relative costs of predication versus branching.
-  float UnpredCost = Probability * NumT + (1.0 - Probability) * NumF;
+  float UnpredCost = Probability * TCycles + (1.0 - Probability) * FCycles;
   UnpredCost += 1.0; // The branch itself
   UnpredCost += (1.0 - Confidence) * Subtarget.getMispredictionPenalty();

-  float PredCost = NumT + NumF;
-
-  return PredCost < UnpredCost;
+  return (float)(TCycles + FCycles + TExtra + FExtra) < UnpredCost;
 }

 /// getInstrPredicate - If instruction is predicated, returns its predicate
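
Plugging invented numbers into the two-sided test above shows how the balance tips; assume Probability = 0.7, Confidence = 0.9, and an 8-cycle misprediction penalty (all illustrative):

    UnpredCost = 0.7*TCycles + 0.3*FCycles + 1.0 + (1.0 - 0.9)*8.0

    TCycles = 4, FCycles = 2:  UnpredCost = 2.8 + 0.6 + 1.0 + 0.8 = 5.2
                               PredCost   = (4 + 2) + (0 + 0)     = 6.0  -> keep the branch
    TCycles = 1, FCycles = 1:  UnpredCost = 0.7 + 0.3 + 1.0 + 0.8 = 2.8
                               PredCost   = (1 + 1) + (0 + 0)     = 2.0  -> if-convert

Predication pays for both halves of the diamond unconditionally, so once real latencies are counted only short, cheap diamonds pass.
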
@@ -1591,8 +1570,8 @@ OptimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, int CmpMask,
 }

 unsigned
-ARMBaseInstrInfo::getNumMicroOps(const MachineInstr *MI,
-                                 const InstrItineraryData *ItinData) const {
+ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
+                                 const MachineInstr *MI) const {
   if (!ItinData || ItinData->isEmpty())
     return 1;

@@ -1649,9 +1628,14 @@ ARMBaseInstrInfo::getNumMicroOps(const MachineInstr *MI,
   case ARM::t2STM_UPD: {
     unsigned NumRegs = MI->getNumOperands() - Desc.getNumOperands() + 1;
     if (Subtarget.isCortexA8()) {
-      // 4 registers would be issued: 1, 2, 1.
-      // 5 registers would be issued: 1, 2, 2.
-      return 1 + (NumRegs / 2);
+      if (NumRegs < 4)
+        return 2;
+      // 4 registers would be issued: 2, 2.
+      // 5 registers would be issued: 2, 2, 1.
+      UOps = (NumRegs / 2);
+      if (NumRegs % 2)
+        ++UOps;
+      return UOps;
     } else if (Subtarget.isCortexA9()) {
       UOps = (NumRegs / 2);
       // If there are odd number of registers or if it's not 64-bit aligned,
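
The Cortex-A8 load/store-multiple change is easiest to sanity-check restated on its own (illustration only; NumRegs is the length of the register list):

    // Two micro-ops minimum; otherwise one per register pair, plus one for
    // an odd leftover register: 2..3 regs -> 2, 4 -> 2, 5 -> 3, 8 -> 4.
    static unsigned cortexA8LdStMultipleUOps(unsigned NumRegs) {
      if (NumRegs < 4)
        return 2;
      unsigned UOps = NumRegs / 2;
      if (NumRegs % 2)
        ++UOps;
      return UOps;
    }

The old formula, 1 + (NumRegs / 2), charged the even cases one micro-op too many, e.g. three instead of two for a four-register LDM.
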
@@ -2025,6 +2009,46 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
   return Latency;
 }

+int ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
+                                      const MachineInstr *MI,
+                                      unsigned *PredCost) const {
+  if (MI->isCopyLike() || MI->isInsertSubreg() ||
+      MI->isRegSequence() || MI->isImplicitDef())
+    return 1;
+
+  if (!ItinData || ItinData->isEmpty())
+    return 1;
+
+  const TargetInstrDesc &TID = MI->getDesc();
+  unsigned Class = TID.getSchedClass();
+  unsigned UOps = ItinData->Itineraries[Class].NumMicroOps;
+  if (PredCost && TID.hasImplicitDefOfPhysReg(ARM::CPSR))
+    // When predicated, CPSR is an additional source operand for CPSR updating
+    // instructions; this apparently increases their latencies.
+    *PredCost = 1;
+  if (UOps)
+    return ItinData->getStageLatency(Class);
+  return getNumMicroOps(ItinData, MI);
+}
+
+int ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
+                                      SDNode *Node) const {
+  if (!Node->isMachineOpcode())
+    return 1;
+
+  if (!ItinData || ItinData->isEmpty())
+    return 1;
+
+  unsigned Opcode = Node->getMachineOpcode();
+  switch (Opcode) {
+  default:
+    return ItinData->getStageLatency(get(Opcode).getSchedClass());
+  case ARM::VLDMQ:
+  case ARM::VSTMQ:
+    return 2;
+  }
+}
+
 bool ARMBaseInstrInfo::
 hasHighOperandLatency(const InstrItineraryData *ItinData,
                       const MachineRegisterInfo *MRI,
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
index e19bd9c566d9..c11f02ccb109 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -318,18 +318,20 @@ public:
                                 const MachineFunction &MF) const;

   virtual bool isProfitableToIfCvt(MachineBasicBlock &MBB,
-                                   unsigned NumInstrs,
+                                   unsigned NumCycles, unsigned ExtraPredCycles,
                                    float Prob, float Confidence) const;

-  virtual bool isProfitableToIfCvt(MachineBasicBlock &TMBB,unsigned NumT,
-                                   MachineBasicBlock &FMBB,unsigned NumF,
+  virtual bool isProfitableToIfCvt(MachineBasicBlock &TMBB,
+                                   unsigned NumT, unsigned ExtraT,
+                                   MachineBasicBlock &FMBB,
+                                   unsigned NumF, unsigned ExtraF,
                                    float Probability, float Confidence) const;

   virtual bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB,
-                                         unsigned NumInstrs,
+                                         unsigned NumCycles,
                                          float Probability,
                                          float Confidence) const {
-    return NumInstrs == 1;
+    return NumCycles == 1;
   }

   /// AnalyzeCompare - For a comparison instruction, return the source register
@@ -345,8 +347,8 @@ public:
                                     const MachineRegisterInfo *MRI,
                                     MachineBasicBlock::iterator &MII) const;

-  virtual unsigned getNumMicroOps(const MachineInstr *MI,
-                                  const InstrItineraryData *ItinData) const;
+  virtual unsigned getNumMicroOps(const InstrItineraryData *ItinData,
+                                  const MachineInstr *MI) const;

   virtual
   int getOperandLatency(const InstrItineraryData *ItinData,
@@ -379,6 +381,12 @@ private:
                         const TargetInstrDesc &UseTID,
                         unsigned UseIdx, unsigned UseAlign) const;

+  int getInstrLatency(const InstrItineraryData *ItinData,
+                      const MachineInstr *MI, unsigned *PredCost = 0) const;
+
+  int getInstrLatency(const InstrItineraryData *ItinData,
+                      SDNode *Node) const;
+
   bool hasHighOperandLatency(const InstrItineraryData *ItinData,
                              const MachineRegisterInfo *MRI,
                              const MachineInstr *DefMI, unsigned DefIdx,
diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
index 0a0f3146efdc..719b140ce9fa 100644
--- a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -42,33 +42,6 @@ unsigned Thumb2InstrInfo::getUnindexedOpcode(unsigned Opc) const {
   return 0;
 }

-bool Thumb2InstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB,
-                                          unsigned NumInstrs,
-                                          float Prediction,
-                                          float Confidence) const {
-  if (!OldT2IfCvt)
-    return ARMBaseInstrInfo::isProfitableToIfCvt(MBB, NumInstrs,
-                                                 Prediction, Confidence);
-  return NumInstrs && NumInstrs <= 3;
-}
-
-bool Thumb2InstrInfo::
-isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumT,
-                    MachineBasicBlock &FMBB, unsigned NumF,
-                    float Prediction, float Confidence) const {
-  if (!OldT2IfCvt)
-    return ARMBaseInstrInfo::isProfitableToIfCvt(TMBB, NumT,
-                                                 FMBB, NumF,
-                                                 Prediction, Confidence);
-
-  // FIXME: Catch optimization such as:
-  //          r0 = movne
-  //          r0 = moveq
-  return NumT && NumF &&
-    NumT <= 3 && NumF <= 3;
-}
-
-
 void
 Thumb2InstrInfo::ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail,
                                          MachineBasicBlock *NewDest) const {
diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.h b/llvm/lib/Target/ARM/Thumb2InstrInfo.h
index b348ad01911a..9ed7eea7e2db 100644
--- a/llvm/lib/Target/ARM/Thumb2InstrInfo.h
+++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.h
@@ -38,12 +38,6 @@ public:
   bool isLegalToSplitMBBAt(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator MBBI) const;

-  bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumInstrs,
-                           float Prediction, float Confidence) const;
-  bool isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumTInstrs,
-                           MachineBasicBlock &FMBB, unsigned NumFInstrs,
-                           float Prediction, float Confidence) const;
-
   void copyPhysReg(MachineBasicBlock &MBB,
                    MachineBasicBlock::iterator I, DebugLoc DL,
                    unsigned DestReg, unsigned SrcReg,
diff --git a/llvm/lib/Target/TargetInstrInfo.cpp b/llvm/lib/Target/TargetInstrInfo.cpp
index 62818d0cae9d..eca97ab09669 100644
--- a/llvm/lib/Target/TargetInstrInfo.cpp
+++ b/llvm/lib/Target/TargetInstrInfo.cpp
@@ -50,8 +50,8 @@ TargetInstrInfo::~TargetInstrInfo() {
 }

 unsigned
-TargetInstrInfo::getNumMicroOps(const MachineInstr *MI,
-                                const InstrItineraryData *ItinData) const {
+TargetInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
+                                const MachineInstr *MI) const {
   if (!ItinData || ItinData->isEmpty())
     return 1;

@@ -94,6 +94,26 @@ TargetInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
   return ItinData->getOperandLatency(DefClass, DefIdx, UseClass, UseIdx);
 }

+int TargetInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
+                                     const MachineInstr *MI,
+                                     unsigned *PredCost) const {
+  if (!ItinData || ItinData->isEmpty())
+    return 1;
+
+  return ItinData->getStageLatency(MI->getDesc().getSchedClass());
+}
+
+int TargetInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
+                                     SDNode *N) const {
+  if (!ItinData || ItinData->isEmpty())
+    return 1;
+
+  if (!N->isMachineOpcode())
+    return 1;
+
+  return ItinData->getStageLatency(get(N->getMachineOpcode()).getSchedClass());
+}
+
 bool TargetInstrInfo::hasLowDefLatency(const InstrItineraryData *ItinData,
                                        const MachineInstr *DefMI,
                                        unsigned DefIdx) const {
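
The defaults preserve the old behaviour for targets that do not override the new hook: unit latency when there is no itinerary, otherwise the stage latency of the instruction's scheduling class, with PredCost left untouched. A standalone restatement under invented types (FakeItineraries is not the real InstrItineraryData):

    #include <map>

    struct FakeItineraries {
      std::map<unsigned, int> StageLatency;  // sched class -> cycles
      bool isEmpty() const { return StageLatency.empty(); }
    };

    static int defaultInstrLatency(const FakeItineraries *Itins,
                                   unsigned SchedClass) {
      if (!Itins || Itins->isEmpty())
        return 1;  // no machine model: assume a single cycle
      std::map<unsigned, int>::const_iterator I =
          Itins->StageLatency.find(SchedClass);
      return I == Itins->StageLatency.end() ? 1 : I->second;
    }
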
diff --git a/llvm/test/CodeGen/ARM/2007-01-19-InfiniteLoop.ll b/llvm/test/CodeGen/ARM/2007-01-19-InfiniteLoop.ll
index 7f299aa1ceb2..0198908f0649 100644
--- a/llvm/test/CodeGen/ARM/2007-01-19-InfiniteLoop.ll
+++ b/llvm/test/CodeGen/ARM/2007-01-19-InfiniteLoop.ll
@@ -8,8 +8,9 @@
 define fastcc i32 @dct_luma_sp(i32 %block_x, i32 %block_y, i32* %coeff_cost) {
 entry:
 ; Make sure to use base-updating stores for saving callee-saved registers.
+; CHECK: push
 ; CHECK-NOT: sub sp
-; CHECK: vpush
+; CHECK: push
   %predicted_block = alloca [4 x [4 x i32]], align 4 ; <[4 x [4 x i32]]*> [#uses=1]
   br label %cond_next489
diff --git a/llvm/test/CodeGen/ARM/ifcvt10.ll b/llvm/test/CodeGen/ARM/ifcvt10.ll
index 707f0510b918..75428ac21655 100644
--- a/llvm/test/CodeGen/ARM/ifcvt10.ll
+++ b/llvm/test/CodeGen/ARM/ifcvt10.ll
@@ -4,27 +4,40 @@
 ; micro-coded and would have long issue latency even if predicated on
 ; false predicate.

-%0 = type { float, float, float, float }
-%pln = type { %vec, float }
-%vec = type { [4 x float] }
-
-define arm_aapcs_vfpcc float @aaa(%vec* nocapture %ustart, %vec* nocapture %udir, %vec* nocapture %vstart, %vec* nocapture %vdir, %vec* %upoint, %vec* %vpoint) {
-; CHECK: aaa:
-; CHECK: vldr.32
-; CHECK-NOT: vldrne
-; CHECK-NOT: vpopne
-; CHECK-NOT: popne
-; CHECK: vpop
-; CHECK: pop
+define void @t(double %a, double %b, double %c, double %d, i32* nocapture %solutions, double* nocapture %x) nounwind {
 entry:
-  br i1 undef, label %bb81, label %bb48
+; CHECK: t:
+; CHECK: vpop {d8}
+; CHECK-NOT: vpopne
+; CHECK: ldmia sp!, {r7, pc}
+; CHECK: vpop {d8}
+; CHECK: ldmia sp!, {r7, pc}
+  br i1 undef, label %if.else, label %if.then

-bb48:                                             ; preds = %entry
-  %0 = call arm_aapcs_vfpcc %0 @bbb(%pln* undef, %vec* %vstart, %vec* undef) nounwind ; <%0> [#uses=0]
-  ret float 0.000000e+00
+if.then:                                          ; preds = %entry
+  %mul73 = fmul double undef, 0.000000e+00
+  %sub76 = fsub double %mul73, undef
+  store double %sub76, double* undef, align 4
+  %call88 = tail call double @cos(double 0.000000e+00) nounwind
+  %mul89 = fmul double undef, %call88
+  %sub92 = fsub double %mul89, undef
+  store double %sub92, double* undef, align 4
+  ret void

-bb81:                                             ; preds = %entry
-  ret float 0.000000e+00
+if.else:                                          ; preds = %entry
+  %tmp101 = tail call double @llvm.pow.f64(double undef, double 0x3FD5555555555555)
+  %add112 = fadd double %tmp101, undef
+  %mul118 = fmul double %add112, undef
+  store double 0.000000e+00, double* %x, align 4
+  ret void
 }

-declare arm_aapcs_vfpcc %0 @bbb(%pln* nocapture, %vec* nocapture, %vec* nocapture) nounwind
+declare double @acos(double)
+
+declare double @sqrt(double) readnone
+
+declare double @cos(double) readnone
+
+declare double @fabs(double)
+
+declare double @llvm.pow.f64(double, double) nounwind readonly
diff --git a/llvm/test/CodeGen/ARM/ifcvt11.ll b/llvm/test/CodeGen/ARM/ifcvt11.ll
new file mode 100644
index 000000000000..63f8557d555b
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/ifcvt11.ll
@@ -0,0 +1,59 @@
+; RUN: llc < %s -mtriple=arm-apple-darwin -mcpu=cortex-a8 | FileCheck %s
+; rdar://8598427
+; Adjust if-converter heuristics to avoid predicating vmrs which can cause
+; significant regression.
+
+%struct.xyz_t = type { double, double, double }
+
+define i32 @effie(i32 %tsets, %struct.xyz_t* nocapture %p, i32 %a, i32 %b, i32 %c) nounwind readonly noinline {
+; CHECK: effie:
+entry:
+  %0 = icmp sgt i32 %tsets, 0
+  br i1 %0, label %bb.nph, label %bb6
+
+bb.nph:                                           ; preds = %entry
+  %1 = add nsw i32 %b, %a
+  %2 = add nsw i32 %1, %c
+  br label %bb
+
+bb:                                               ; preds = %bb4, %bb.nph
+; CHECK: vcmpe.f64
+; CHECK: vmrs apsr_nzcv, fpscr
+  %r.19 = phi i32 [ 0, %bb.nph ], [ %r.0, %bb4 ]
+  %n.08 = phi i32 [ 0, %bb.nph ], [ %10, %bb4 ]
+  %scevgep10 = getelementptr inbounds %struct.xyz_t* %p, i32 %n.08, i32 0
+  %scevgep11 = getelementptr %struct.xyz_t* %p, i32 %n.08, i32 1
+  %3 = load double* %scevgep10, align 4
+  %4 = load double* %scevgep11, align 4
+  %5 = fcmp uge double %3, %4
+  br i1 %5, label %bb3, label %bb1
+
+bb1:                                              ; preds = %bb
+; CHECK-NOT: it
+; CHECK-NOT: vcmpemi
+; CHECK-NOT: vmrsmi
+; CHECK: vcmpe.f64
+; CHECK: vmrs apsr_nzcv, fpscr
+  %scevgep12 = getelementptr %struct.xyz_t* %p, i32 %n.08, i32 2
+  %6 = load double* %scevgep12, align 4
+  %7 = fcmp uge double %3, %6
+  br i1 %7, label %bb3, label %bb2
+
+bb2:                                              ; preds = %bb1
+  %8 = add nsw i32 %2, %r.19
+  br label %bb4
+
+bb3:                                              ; preds = %bb1, %bb
+  %9 = add nsw i32 %r.19, 1
+  br label %bb4
+
+bb4:                                              ; preds = %bb3, %bb2
+  %r.0 = phi i32 [ %9, %bb3 ], [ %8, %bb2 ]
+  %10 = add nsw i32 %n.08, 1
+  %exitcond = icmp eq i32 %10, %tsets
+  br i1 %exitcond, label %bb6, label %bb
+
+bb6:                                              ; preds = %bb4, %entry
+  %r.1.lcssa = phi i32 [ 0, %entry ], [ %r.0, %bb4 ]
+  ret i32 %r.1.lcssa
+}
diff --git a/llvm/test/CodeGen/ARM/lsr-on-unrolled-loops.ll b/llvm/test/CodeGen/ARM/lsr-on-unrolled-loops.ll
index 3bc8fee6d339..52e40b234ba8 100644
--- a/llvm/test/CodeGen/ARM/lsr-on-unrolled-loops.ll
+++ b/llvm/test/CodeGen/ARM/lsr-on-unrolled-loops.ll
@@ -4,14 +4,14 @@
 ; constant offset addressing, so that each of the following stores
 ; uses the same register.

-; CHECK: vstr.32 s{{.*}}, [r{{.*}}, #-128]
-; CHECK: vstr.32 s{{.*}}, [r{{.*}}, #-96]
-; CHECK: vstr.32 s{{.*}}, [r{{.*}}, #-64]
-; CHECK: vstr.32 s{{.*}}, [r{{.*}}, #-32]
-; CHECK: vstr.32 s{{.*}}, [r{{.*}}]
-; CHECK: vstr.32 s{{.*}}, [r{{.*}}, #32]
-; CHECK: vstr.32 s{{.*}}, [r{{.*}}, #64]
-; CHECK: vstr.32 s{{.*}}, [r{{.*}}, #96]
+; CHECK: vstr.32 s{{.*}}, [lr, #-128]
+; CHECK: vstr.32 s{{.*}}, [lr, #-96]
+; CHECK: vstr.32 s{{.*}}, [lr, #-64]
+; CHECK: vstr.32 s{{.*}}, [lr, #-32]
+; CHECK: vstr.32 s{{.*}}, [lr]
+; CHECK: vstr.32 s{{.*}}, [lr, #32]
+; CHECK: vstr.32 s{{.*}}, [lr, #64]
+; CHECK: vstr.32 s{{.*}}, [lr, #96]

 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32-n32"
diff --git a/llvm/test/CodeGen/ARM/reg_sequence.ll b/llvm/test/CodeGen/ARM/reg_sequence.ll
index 390955472ec0..24eb3a88fb04 100644
--- a/llvm/test/CodeGen/ARM/reg_sequence.ll
+++ b/llvm/test/CodeGen/ARM/reg_sequence.ll
@@ -271,7 +271,6 @@ define arm_aapcs_vfpcc i32 @t10() nounwind {
 entry:
 ; CHECK: t10:
 ; CHECK: vmov.i32 q9, #0x3F000000
-; CHECK: vmov d0, d17
 ; CHECK: vmla.f32 q8, q8, d0[0]
   %0 = shufflevector <4 x float> zeroinitializer, <4 x float> undef, <4 x i32> zeroinitializer ; <<4 x float>> [#uses=1]
   %1 = insertelement <4 x float> %0, float undef, i32 1 ; <<4 x float>> [#uses=1]
diff --git a/llvm/test/CodeGen/Thumb2/2010-06-14-NEONCoalescer.ll b/llvm/test/CodeGen/Thumb2/2010-06-14-NEONCoalescer.ll
index 080341c8df41..9ed6a01255f8 100644
--- a/llvm/test/CodeGen/Thumb2/2010-06-14-NEONCoalescer.ll
+++ b/llvm/test/CodeGen/Thumb2/2010-06-14-NEONCoalescer.ll
@@ -23,8 +23,6 @@ entry:
   %4 = insertelement <2 x double> %2, double %V.0.ph, i32 1 ; <<2 x double>> [#uses=2]
 ; Constant pool load followed by add.
 ; Then clobber the loaded register, not the sum.
-; CHECK: vldr.64
-; CHECK: vadd.f64
 ; CHECK: vldr.64 [[LDR:d.*]],
 ; CHECK: LPC0_0:
 ; CHECK: vadd.f64 [[ADD:d.*]], [[LDR]], [[LDR]]