From 4a010fd1eac964a9d60fa87dfc1841dcb666335b Mon Sep 17 00:00:00 2001 From: Evan Cheng Date: Wed, 29 Sep 2010 22:42:35 +0000 Subject: [PATCH] Model Cortex-a9 load to SUB, RSB, ADD, ADC, SBC, RSC, CMN, MVN, or CMP pipeline forwarding path. llvm-svn: 115098 --- .../llvm/Target/TargetInstrItineraries.h | 55 ++++++++++++++- llvm/lib/CodeGen/ScheduleDAGInstrs.cpp | 33 ++++----- .../SelectionDAG/ScheduleDAGSDNodes.cpp | 34 ++++----- llvm/lib/Target/ARM/ARMInstrThumb2.td | 4 +- llvm/lib/Target/ARM/ARMSchedule.td | 1 + llvm/lib/Target/ARM/ARMScheduleA8.td | 1 + llvm/lib/Target/ARM/ARMScheduleA9.td | 69 +++++++++++++------ llvm/utils/TableGen/SubtargetEmitter.cpp | 37 +++++----- 8 files changed, 157 insertions(+), 77 deletions(-) diff --git a/llvm/include/llvm/Target/TargetInstrItineraries.h b/llvm/include/llvm/Target/TargetInstrItineraries.h index fd193f7318f1..380147c650b4 100644 --- a/llvm/include/llvm/Target/TargetInstrItineraries.h +++ b/llvm/include/llvm/Target/TargetInstrItineraries.h @@ -111,14 +111,16 @@ class InstrItineraryData { public: const InstrStage *Stages; ///< Array of stages selected const unsigned *OperandCycles; ///< Array of operand cycles selected + const unsigned *Forwardings; ///< Array of pipeline forwarding paths const InstrItinerary *Itineraries; ///< Array of itineraries selected /// Ctors. /// - InstrItineraryData() : Stages(0), OperandCycles(0), Itineraries(0) {} + InstrItineraryData() : Stages(0), OperandCycles(0), Forwardings(0), + Itineraries(0) {} InstrItineraryData(const InstrStage *S, const unsigned *OS, - const InstrItinerary *I) - : Stages(S), OperandCycles(OS), Itineraries(I) {} + const unsigned *F, const InstrItinerary *I) + : Stages(S), OperandCycles(OS), Forwardings(F), Itineraries(I) {} /// isEmpty - Returns true if there are no itineraries. 
/// @@ -182,6 +184,53 @@ public: return (int)OperandCycles[FirstIdx + OperandIdx]; } + /// hasPipelineForwarding - Return true if there is a pipeline forwarding + /// between instructions of itinerary classes DefClass and UseClass so that + /// value produced by an instruction of itinerary class DefClass, operand + /// index DefIdx can be bypassed when it's read by an instruction of + /// itinerary class UseClass, operand index UseIdx. + bool hasPipelineForwarding(unsigned DefClass, unsigned DefIdx, + unsigned UseClass, unsigned UseIdx) const { + unsigned FirstDefIdx = Itineraries[DefClass].FirstOperandCycle; + unsigned LastDefIdx = Itineraries[DefClass].LastOperandCycle; + if ((FirstDefIdx + DefIdx) >= LastDefIdx) + return false; + if (Forwardings[FirstDefIdx + DefIdx] == 0) + return false; + + unsigned FirstUseIdx = Itineraries[UseClass].FirstOperandCycle; + unsigned LastUseIdx = Itineraries[UseClass].LastOperandCycle; + if ((FirstUseIdx + UseIdx) >= LastUseIdx) + return false; + + return Forwardings[FirstDefIdx + DefIdx] == + Forwardings[FirstUseIdx + UseIdx]; + } + + /// getOperandLatency - Compute and return the use operand latency of a given + /// itinerary class and operand index if the value is produced by an + /// instruction of the specified itinerary class and def operand index. + int getOperandLatency(unsigned DefClass, unsigned DefIdx, + unsigned UseClass, unsigned UseIdx) const { + if (isEmpty()) + return -1; + + int DefCycle = getOperandCycle(DefClass, DefIdx); + if (DefCycle == -1) + return -1; + + int UseCycle = getOperandCycle(UseClass, UseIdx); + if (UseCycle == -1) + return -1; + + UseCycle = DefCycle - UseCycle + 1; + if (UseCycle > 0 && + hasPipelineForwarding(DefClass, DefIdx, UseClass, UseIdx)) + // FIXME: This assumes one cycle benefit for every pipeline forwarding. + --UseCycle; + return UseCycle; + } + /// isMicroCoded - Return true if the instructions in the given class decode /// to more than one micro-ops. 
bool isMicroCoded(unsigned ItinClassIndx) const { diff --git a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp index da0b0562e120..3d2565dd18c5 100644 --- a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp +++ b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp @@ -527,26 +527,23 @@ void ScheduleDAGInstrs::ComputeOperandLatency(SUnit *Def, SUnit *Use, MachineInstr *DefMI = Def->getInstr(); int DefIdx = DefMI->findRegisterDefOperandIdx(Reg); if (DefIdx != -1) { - int DefCycle = InstrItins->getOperandCycle(DefMI->getDesc().getSchedClass(), - DefIdx); - if (DefCycle >= 0) { - MachineInstr *UseMI = Use->getInstr(); - const unsigned UseClass = UseMI->getDesc().getSchedClass(); + unsigned DefClass = DefMI->getDesc().getSchedClass(); + MachineInstr *UseMI = Use->getInstr(); + unsigned UseClass = UseMI->getDesc().getSchedClass(); - // For all uses of the register, calculate the maxmimum latency - int Latency = -1; - for (unsigned i = 0, e = UseMI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = UseMI->getOperand(i); - if (!MO.isReg() || !MO.isUse()) - continue; - unsigned MOReg = MO.getReg(); - if (MOReg != Reg) - continue; + // For all uses of the register, calculate the maxmimum latency + int Latency = -1; + for (unsigned i = 0, e = UseMI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = UseMI->getOperand(i); + if (!MO.isReg() || !MO.isUse()) + continue; + unsigned MOReg = MO.getReg(); + if (MOReg != Reg) + continue; - int UseCycle = InstrItins->getOperandCycle(UseClass, i); - if (UseCycle >= 0) - Latency = std::max(Latency, DefCycle - UseCycle + 1); - } + int UseCycle = InstrItins->getOperandLatency(DefClass, DefIdx, + UseClass, i); + Latency = std::max(Latency, UseCycle); // If we found a latency, then replace the existing dependence latency. 
if (Latency >= 0) diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp index fbf621d0bb4b..23ff9c5807cc 100644 --- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp @@ -457,24 +457,24 @@ void ScheduleDAGSDNodes::ComputeOperandLatency(SDNode *Def, SDNode *Use, return; unsigned DefIdx = Use->getOperand(OpIdx).getResNo(); - if (Def->isMachineOpcode()) { - const TargetInstrDesc &II = TII->get(Def->getMachineOpcode()); - if (DefIdx >= II.getNumDefs()) - return; - int DefCycle = InstrItins->getOperandCycle(II.getSchedClass(), DefIdx); - if (DefCycle < 0) - return; - int UseCycle = 1; - if (Use->isMachineOpcode()) { - const unsigned UseClass = TII->get(Use->getMachineOpcode()).getSchedClass(); - UseCycle = InstrItins->getOperandCycle(UseClass, OpIdx); - } - if (UseCycle >= 0) { - int Latency = DefCycle - UseCycle + 1; - if (Latency >= 0) - dep.setLatency(Latency); - } + if (!Def->isMachineOpcode()) + return; + + const TargetInstrDesc &II = TII->get(Def->getMachineOpcode()); + if (DefIdx >= II.getNumDefs()) + return; + + int Latency = 0; + if (!Use->isMachineOpcode()) { + Latency = InstrItins->getOperandCycle(II.getSchedClass(), DefIdx); + } else { + unsigned DefClass = II.getSchedClass(); + unsigned UseClass = TII->get(Use->getMachineOpcode()).getSchedClass(); + Latency = InstrItins->getOperandLatency(DefClass, DefIdx, UseClass, OpIdx); } + + if (Latency >= 0) + dep.setLatency(Latency); } void ScheduleDAGSDNodes::dumpNode(const SUnit *SU) const { diff --git a/llvm/lib/Target/ARM/ARMInstrThumb2.td b/llvm/lib/Target/ARM/ARMInstrThumb2.td index 5ca21aa91973..25eca70d38fa 100644 --- a/llvm/lib/Target/ARM/ARMInstrThumb2.td +++ b/llvm/lib/Target/ARM/ARMInstrThumb2.td @@ -285,7 +285,7 @@ multiclass T2I_rbin_irs opcod, string opc, PatFrag opnode> { let Inst{5-4} = 0b00; // type } // shifted register - def rs : T2sI<(outs rGPR:$dst), (ins rGPR:$rhs, 
t2_so_reg:$lhs), IIC_iALUsi, + def rs : T2sI<(outs rGPR:$dst), (ins rGPR:$rhs, t2_so_reg:$lhs), IIC_iALUsir, opc, "\t$dst, $rhs, $lhs", [(set rGPR:$dst, (opnode t2_so_reg:$lhs, rGPR:$rhs))]> { let Inst{31-27} = 0b11101; @@ -1698,7 +1698,7 @@ defm t2ORN : T2I_bin_irs<0b0011, "orn", // Prefer over of t2EORri ra, rb, -1 because mvn has 16-bit version let AddedComplexity = 1 in defm t2MVN : T2I_un_irs <0b0011, "mvn", - IIC_iMOVi, IIC_iMOVr, IIC_iMOVsi, + IIC_iMVNi, IIC_iMVNr, IIC_iMVNsi, UnOpFrag<(not node:$Src)>, 1, 1>; diff --git a/llvm/lib/Target/ARM/ARMSchedule.td b/llvm/lib/Target/ARM/ARMSchedule.td index 133a81b195d8..00d148b8eda1 100644 --- a/llvm/lib/Target/ARM/ARMSchedule.td +++ b/llvm/lib/Target/ARM/ARMSchedule.td @@ -14,6 +14,7 @@ def IIC_iALUx : InstrItinClass; def IIC_iALUi : InstrItinClass; def IIC_iALUr : InstrItinClass; def IIC_iALUsi : InstrItinClass; +def IIC_iALUsir : InstrItinClass; def IIC_iALUsr : InstrItinClass; def IIC_iBITi : InstrItinClass; def IIC_iBITr : InstrItinClass; diff --git a/llvm/lib/Target/ARM/ARMScheduleA8.td b/llvm/lib/Target/ARM/ARMScheduleA8.td index e6b2beae0380..8962ec93efa5 100644 --- a/llvm/lib/Target/ARM/ARMScheduleA8.td +++ b/llvm/lib/Target/ARM/ARMScheduleA8.td @@ -36,6 +36,7 @@ def CortexA8Itineraries : ProcessorItineraries< InstrItinData], [2, 2]>, InstrItinData], [2, 2, 2]>, InstrItinData], [2, 2, 1]>, + InstrItinData], [2, 1, 2]>, InstrItinData], [2, 2, 1, 1]>, // // Bitwise Instructions that produce a result diff --git a/llvm/lib/Target/ARM/ARMScheduleA9.td b/llvm/lib/Target/ARM/ARMScheduleA9.td index 14197c824da8..1f4b8d1ab079 100644 --- a/llvm/lib/Target/ARM/ARMScheduleA9.td +++ b/llvm/lib/Target/ARM/ARMScheduleA9.td @@ -23,10 +23,14 @@ def A9_NPipe : FuncUnit; // NEON ALU/MUL pipe def A9_DRegsVFP: FuncUnit; // FP register set, VFP side def A9_DRegsN : FuncUnit; // FP register set, NEON side +// Bypasses +def A9_LdBypass : Bypass; + // Dual issue pipeline represented by A9_Pipe0 | A9_Pipe1 // def 
CortexA9Itineraries : ProcessorItineraries< - [A9_NPipe, A9_DRegsN, A9_DRegsVFP, A9_LSPipe, A9_Pipe0, A9_Pipe1], [], [ + [A9_NPipe, A9_DRegsN, A9_DRegsVFP, A9_LSPipe, A9_Pipe0, A9_Pipe1], + [A9_LdBypass], [ // Two fully-pipelined integer ALU pipelines // @@ -39,19 +43,30 @@ def CortexA9Itineraries : ProcessorItineraries< InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2]>, // // MVN instructions - InstrItinData], [1]>, - InstrItinData], [1, 1]>, - InstrItinData], [1, 1]>, - InstrItinData], [2, 2, 1]>, + InstrItinData], + [1]>, + InstrItinData], + [1, 1], [NoBypass, A9_LdBypass]>, + InstrItinData], + [1, 1]>, + InstrItinData], + [2, 2, 1]>, // // No operand cycles InstrItinData]>, // // Binary Instructions that produce a result - InstrItinData], [2, 2]>, - InstrItinData], [2, 2, 2]>, - InstrItinData], [2, 2, 1]>, - InstrItinData], [2, 2, 1, 1]>, + InstrItinData], + [2, 2], [NoBypass, A9_LdBypass]>, + InstrItinData], + [2, 2, 2], [NoBypass, A9_LdBypass, A9_LdBypass]>, + InstrItinData], + [2, 2, 1], [NoBypass, A9_LdBypass, NoBypass]>, + InstrItinData], + [2, 1, 2], [NoBypass, NoBypass, A9_LdBypass]>, + InstrItinData], + [2, 2, 1, 1], + [NoBypass, A9_LdBypass, NoBypass, NoBypass]>, // // Bitwise Instructions that produce a result InstrItinData], [2, 2]>, @@ -69,10 +84,14 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData],[3, 1, 1, 1]>, // // Compare instructions - InstrItinData], [2]>, - InstrItinData], [2, 2]>, - InstrItinData], [2, 1]>, - InstrItinData], [2, 1, 1]>, + InstrItinData], + [2], [A9_LdBypass]>, + InstrItinData], + [2, 2], [A9_LdBypass, A9_LdBypass]>, + InstrItinData], + [2, 1], [A9_LdBypass, NoBypass]>, + InstrItinData], + [2, 1, 1], [A9_LdBypass, NoBypass, NoBypass]>, // // Test instructions InstrItinData], [2]>, @@ -105,31 +124,38 @@ def CortexA9Itineraries : ProcessorItineraries< // // Immediate offset InstrItinData, - InstrStage<1, [A9_LSPipe]>], [3, 1]>, + InstrStage<1, [A9_LSPipe]>], + [3, 1], [A9_LdBypass]>, // // Register offset 
InstrItinData, - InstrStage<1, [A9_LSPipe]>], [3, 1, 1]>, + InstrStage<1, [A9_LSPipe]>], + [3, 1, 1], [A9_LdBypass]>, // // Scaled register offset InstrItinData, - InstrStage<2, [A9_LSPipe]>], [4, 1, 1]>, + InstrStage<2, [A9_LSPipe]>], + [4, 1, 1], [A9_LdBypass]>, // // Immediate offset with update InstrItinData, - InstrStage<2, [A9_LSPipe]>], [3, 2, 1]>, + InstrStage<2, [A9_LSPipe]>], + [3, 2, 1], [A9_LdBypass]>, // // Register offset with update InstrItinData, - InstrStage<2, [A9_LSPipe]>], [3, 2, 1, 1]>, + InstrStage<2, [A9_LSPipe]>], + [3, 2, 1, 1], [A9_LdBypass]>, // // Scaled register offset with update InstrItinData, - InstrStage<2, [A9_LSPipe]>], [4, 3, 1, 1]>, + InstrStage<2, [A9_LSPipe]>], + [4, 3, 1, 1], [A9_LdBypass]>, // // Load multiple InstrItinData, - InstrStage<1, [A9_LSPipe]>]>, + InstrStage<1, [A9_LSPipe]>], + [3], [A9_LdBypass]>, // // Load multiple plus branch @@ -141,7 +167,8 @@ def CortexA9Itineraries : ProcessorItineraries< // iLoadi + iALUr for t2LDRpci_pic. InstrItinData, InstrStage<1, [A9_LSPipe]>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [4, 1]>, + InstrStage<1, [A9_Pipe0, A9_Pipe1]>], + [2, 1]>, // Integer store pipeline /// diff --git a/llvm/utils/TableGen/SubtargetEmitter.cpp b/llvm/utils/TableGen/SubtargetEmitter.cpp index 03a813d939d0..754635b3fb78 100644 --- a/llvm/utils/TableGen/SubtargetEmitter.cpp +++ b/llvm/utils/TableGen/SubtargetEmitter.cpp @@ -269,14 +269,14 @@ void SubtargetEmitter::FormItineraryBypassString(const std::string &Name, const std::vector &BypassList = ItinData->getValueAsListOfDefs("Bypasses"); unsigned N = BypassList.size(); - for (unsigned i = 0; i < N;) { + unsigned i = 0; + for (; i < N;) { ItinString += Name + "Bypass::" + BypassList[i]->getName(); - if (++i < N) ItinString += ", "; + if (++i < NOperandCycles) ItinString += ", "; } - - for (; N < NOperandCycles;) { + for (; i < NOperandCycles;) { ItinString += " 0"; - if (++N < NOperandCycles) ItinString += ", "; + if (++i < NOperandCycles) ItinString += ", 
"; } } @@ -316,14 +316,17 @@ void SubtargetEmitter::EmitStageAndOperandCycleData(raw_ostream &OS, OS << "}\n"; std::vector BPs = Proc->getValueAsListOfDefs("BP"); - OS << "\n// Pipeline bypasses for itineraries \"" << Name << "\"\n" - << "namespace " << Name << "Bypass {\n"; + if (BPs.size()) { + OS << "\n// Pipeline forwarding pathes for itineraries \"" << Name + << "\"\n" << "namespace " << Name << "Bypass {\n"; - for (unsigned j = 0, BPN = BPs.size(); j < BPN; ++j) - OS << " const unsigned " << BPs[j]->getName() - << " = 1 << " << j << ";\n"; + OS << " const unsigned NoBypass = 0;\n"; + for (unsigned j = 0, BPN = BPs.size(); j < BPN; ++j) + OS << " const unsigned " << BPs[j]->getName() + << " = 1 << " << j << ";\n"; - OS << "}\n"; + OS << "}\n"; + } } // Begin stages table @@ -335,12 +338,12 @@ void SubtargetEmitter::EmitStageAndOperandCycleData(raw_ostream &OS, OperandCycleTable += " 0, // No itinerary\n"; // Begin pipeline bypass table - std::string BypassTable = "static const unsigned Bypasses[] = {\n"; + std::string BypassTable = "static const unsigned ForwardingPathes[] = {\n"; BypassTable += " 0, // No itinerary\n"; unsigned StageCount = 1, OperandCycleCount = 1; unsigned ItinStageEnum = 1, ItinOperandCycleEnum = 1; - std::map ItinStageMap, ItinOperandCycleMap; + std::map ItinStageMap, ItinOperandMap; for (unsigned i = 0, N = ProcItinList.size(); i < N; i++) { // Next record Record *Proc = ProcItinList[i]; @@ -395,13 +398,14 @@ void SubtargetEmitter::EmitStageAndOperandCycleData(raw_ostream &OS, // Check to see if operand cycle already exists and create if it doesn't unsigned FindOperandCycle = 0; if (NOperandCycles > 0) { - FindOperandCycle = ItinOperandCycleMap[ItinOperandCycleString]; + std::string ItinOperandString = ItinOperandCycleString+ItinBypassString; + FindOperandCycle = ItinOperandMap[ItinOperandString]; if (FindOperandCycle == 0) { // Emit as cycle, // index OperandCycleTable += ItinOperandCycleString + ", // " + itostr(ItinOperandCycleEnum) + 
"\n"; // Record Itin class number. - ItinOperandCycleMap[ItinOperandCycleString] = + ItinOperandMap[ItinOperandString] = FindOperandCycle = OperandCycleCount; // Emit as bypass, // index @@ -622,7 +626,8 @@ void SubtargetEmitter::ParseFeaturesFunction(raw_ostream &OS) { OS << "\n" << " InstrItinerary *Itinerary = (InstrItinerary *)" << "Features.getInfo(ProcItinKV, ProcItinKVSize);\n" - << " InstrItins = InstrItineraryData(Stages, OperandCycles, Itinerary);\n"; + << " InstrItins = InstrItineraryData(Stages, OperandCycles, " + << "ForwardingPathes, Itinerary);\n"; } OS << " return Features.getCPU();\n"