[PowerPC] Enable MachinePipeliner for P9 with -ppc-enable-pipeliner

Implement the necessary target hooks to enable the MachinePipeliner pass for P9 only.
The pass is off by default and can be enabled with -ppc-enable-pipeliner for P9.
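
For example (the input file name here is hypothetical; the CPU and option match the RUN line of the new test at the end of this patch):

    llc -mcpu=pwr9 -ppc-enable-pipeliner loop.ll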

Differential Revision: https://reviews.llvm.org/D62164

llvm-svn: 363085
Author: Jinsong Ji
Date:   2019-06-11 17:40:39 +00:00
Parent: 10c0855542
Commit: ef2d6d99c0

12 changed files with 227 additions and 19 deletions

llvm/include/llvm/CodeGen/MachinePipeliner.h

@@ -318,9 +318,9 @@ private:
MBBVectorTy &EpilogBBs);
void splitLifetimes(MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs,
SMSchedule &Schedule);
-  void addBranches(MBBVectorTy &PrologBBs, MachineBasicBlock *KernelBB,
-                   MBBVectorTy &EpilogBBs, SMSchedule &Schedule,
-                   ValueMapTy *VRMap);
+  void addBranches(MachineBasicBlock &PreheaderBB, MBBVectorTy &PrologBBs,
+                   MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs,
+                   SMSchedule &Schedule, ValueMapTy *VRMap);
bool computeDelta(MachineInstr &MI, unsigned &Delta);
void updateMemOperands(MachineInstr &NewMI, MachineInstr &OldMI,
unsigned Num);

llvm/include/llvm/CodeGen/TargetInstrInfo.h

@@ -670,8 +670,9 @@ public:
/// is finished. Return the value/register of the new loop count. We need
/// this function when peeling off one or more iterations of a loop. This
/// function assumes the nth iteration is peeled first.
-  virtual unsigned reduceLoopCount(MachineBasicBlock &MBB, MachineInstr *IndVar,
-                                   MachineInstr &Cmp,
+  virtual unsigned reduceLoopCount(MachineBasicBlock &MBB,
+                                   MachineBasicBlock &PreHeader,
+                                   MachineInstr *IndVar, MachineInstr &Cmp,
                                    SmallVectorImpl<MachineOperand> &Cond,
                                    SmallVectorImpl<MachineInstr *> &PrevInsts,
                                    unsigned Iter, unsigned MaxIter) const {

llvm/include/llvm/CodeGen/TargetSubtargetInfo.h

@@ -193,6 +193,9 @@ public:
/// for preRA scheduling with the source level scheduler.
virtual bool enableMachineSchedDefaultSched() const { return true; }
+  /// True if the subtarget should run MachinePipeliner
+  virtual bool enableMachinePipeliner() const { return true; };
+
/// True if the subtarget should enable joining global copies.
///
/// By default this is enabled if the machine scheduler is enabled, but

llvm/lib/CodeGen/MachinePipeliner.cpp

@@ -187,6 +187,9 @@ bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
!EnableSWPOptSize.getPosition())
return false;
+  if (!mf.getSubtarget().enableMachinePipeliner())
+    return false;
+
// Cannot pipeline loops without instruction itineraries if we are using
// DFA for the pipeliner.
if (mf.getSubtarget().useDFAforSMS() &&
@@ -2026,6 +2029,10 @@ void SwingSchedulerDAG::generatePipelinedLoop(SMSchedule &Schedule) {
InstrMapTy InstrMap;
SmallVector<MachineBasicBlock *, 4> PrologBBs;
+  MachineBasicBlock *PreheaderBB = MLI->getLoopFor(BB)->getLoopPreheader();
+  assert(PreheaderBB != nullptr &&
+         "Need to add code to handle loops w/o preheader");
+
// Generate the prolog instructions that set up the pipeline.
generateProlog(Schedule, MaxStageCount, KernelBB, VRMap, PrologBBs);
MF.insert(BB->getIterator(), KernelBB);
@@ -2082,7 +2089,7 @@ void SwingSchedulerDAG::generatePipelinedLoop(SMSchedule &Schedule) {
removeDeadInstructions(KernelBB, EpilogBBs);
// Add branches between prolog and epilog blocks.
-  addBranches(PrologBBs, KernelBB, EpilogBBs, Schedule, VRMap);
+  addBranches(*PreheaderBB, PrologBBs, KernelBB, EpilogBBs, Schedule, VRMap);
// Remove the original loop since it's no longer referenced.
for (auto &I : *BB)
@@ -2767,7 +2774,8 @@ static void removePhis(MachineBasicBlock *BB, MachineBasicBlock *Incoming) {
/// Create branches from each prolog basic block to the appropriate epilog
/// block. These edges are needed if the loop ends before reaching the
/// kernel.
-void SwingSchedulerDAG::addBranches(MBBVectorTy &PrologBBs,
+void SwingSchedulerDAG::addBranches(MachineBasicBlock &PreheaderBB,
+                                    MBBVectorTy &PrologBBs,
                                     MachineBasicBlock *KernelBB,
                                     MBBVectorTy &EpilogBBs,
                                     SMSchedule &Schedule, ValueMapTy *VRMap) {
@@ -2794,8 +2802,8 @@ void SwingSchedulerDAG::addBranches(MBBVectorTy &PrologBBs,
// Check if the LOOP0 has already been removed. If so, then there is no need
// to reduce the trip count.
if (LC != 0)
-    LC = TII->reduceLoopCount(*Prolog, IndVar, *Cmp, Cond, PrevInsts, j,
-                              MaxIter);
+    LC = TII->reduceLoopCount(*Prolog, PreheaderBB, IndVar, *Cmp, Cond,
+                              PrevInsts, j, MaxIter);
// Record the value of the first trip count, which is used to determine if
// branches and blocks can be removed for constant trip counts.

llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp

@@ -697,11 +697,11 @@ bool HexagonInstrInfo::analyzeLoop(MachineLoop &L,
/// Generate code to reduce the loop iteration by one and check if the loop is
/// finished. Return the value/register of the new loop count. This function
/// assumes the nth iteration is peeled first.
-unsigned HexagonInstrInfo::reduceLoopCount(MachineBasicBlock &MBB,
-      MachineInstr *IndVar, MachineInstr &Cmp,
-      SmallVectorImpl<MachineOperand> &Cond,
-      SmallVectorImpl<MachineInstr *> &PrevInsts,
-      unsigned Iter, unsigned MaxIter) const {
+unsigned HexagonInstrInfo::reduceLoopCount(
+    MachineBasicBlock &MBB, MachineBasicBlock &PreHeader, MachineInstr *IndVar,
+    MachineInstr &Cmp, SmallVectorImpl<MachineOperand> &Cond,
+    SmallVectorImpl<MachineInstr *> &PrevInsts, unsigned Iter,
+    unsigned MaxIter) const {
// We expect a hardware loop currently. This means that IndVar is set
// to null, and the compare is the ENDLOOP instruction.
assert((!IndVar) && isEndLoopN(Cmp.getOpcode())

llvm/lib/Target/Hexagon/HexagonInstrInfo.h

@@ -139,7 +139,7 @@ public:
/// is finished. Return the value/register of the new loop count. We need
/// this function when peeling off one or more iterations of a loop. This
/// function assumes the nth iteration is peeled first.
-  unsigned reduceLoopCount(MachineBasicBlock &MBB,
+  unsigned reduceLoopCount(MachineBasicBlock &MBB, MachineBasicBlock &PreHeader,
                            MachineInstr *IndVar, MachineInstr &Cmp,
                            SmallVectorImpl<MachineOperand> &Cond,
                            SmallVectorImpl<MachineInstr *> &PrevInsts,

llvm/lib/Target/PowerPC/PPCInstrInfo.cpp

@@ -3922,3 +3922,77 @@ PPCInstrInfo::isSignOrZeroExtended(const MachineInstr &MI, bool SignExt,
  }
  return false;
}

bool PPCInstrInfo::isBDNZ(unsigned Opcode) const {
  return (Opcode == (Subtarget.isPPC64() ? PPC::BDNZ8 : PPC::BDNZ));
}

bool PPCInstrInfo::analyzeLoop(MachineLoop &L, MachineInstr *&IndVarInst,
                               MachineInstr *&CmpInst) const {
  MachineBasicBlock *LoopEnd = L.getBottomBlock();
  MachineBasicBlock::iterator I = LoopEnd->getFirstTerminator();
  // We really "analyze" only CTR loops right now.
  if (I != LoopEnd->end() && isBDNZ(I->getOpcode())) {
    IndVarInst = nullptr;
    CmpInst = &*I;
    return false;
  }
  return true;
}

MachineInstr *PPCInstrInfo::findLoopInstr(MachineBasicBlock &PreHeader) const {
  unsigned LOOPi = (Subtarget.isPPC64() ? PPC::MTCTR8loop : PPC::MTCTRloop);

  // The loop set-up instruction should be in the preheader.
  for (auto &I : PreHeader.instrs())
    if (I.getOpcode() == LOOPi)
      return &I;
  return nullptr;
}

unsigned PPCInstrInfo::reduceLoopCount(
    MachineBasicBlock &MBB, MachineBasicBlock &PreHeader, MachineInstr *IndVar,
    MachineInstr &Cmp, SmallVectorImpl<MachineOperand> &Cond,
    SmallVectorImpl<MachineInstr *> &PrevInsts, unsigned Iter,
    unsigned MaxIter) const {
  // We expect a hardware loop currently. This means that IndVar is set
  // to null, and the compare is the ENDLOOP instruction.
  assert((!IndVar) && isBDNZ(Cmp.getOpcode()) && "Expecting a CTR loop");
  MachineFunction *MF = MBB.getParent();
  DebugLoc DL = Cmp.getDebugLoc();
  MachineInstr *Loop = findLoopInstr(PreHeader);
  if (!Loop)
    return 0;
  unsigned LoopCountReg = Loop->getOperand(0).getReg();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineInstr *LoopCount = MRI.getUniqueVRegDef(LoopCountReg);
  if (!LoopCount)
    return 0;
  // If the loop trip count is a compile-time value, then just change the
  // value.
  if (LoopCount->getOpcode() == PPC::LI8 || LoopCount->getOpcode() == PPC::LI) {
    int64_t Offset = LoopCount->getOperand(1).getImm();
    if (Offset <= 1) {
      LoopCount->eraseFromParent();
      Loop->eraseFromParent();
      return 0;
    }
    LoopCount->getOperand(1).setImm(Offset - 1);
    return Offset - 1;
  }
  // The loop trip count is a run-time value. We need to subtract one from
  // the trip count and insert a branch later to check if we're done with
  // the loop. Since the BDZ/BDZ8 we will insert also decrements the CTR by
  // 1, we don't need to generate anything else here.
  Cond.push_back(MachineOperand::CreateImm(0));
  Cond.push_back(MachineOperand::CreateReg(
      Subtarget.isPPC64() ? PPC::CTR8 : PPC::CTR, true));
  return LoopCountReg;
}
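
The compile-time path above just rewrites the LI/LI8 immediate. A standalone model of that decision (a hypothetical helper, not part of the patch):

    // Hypothetical model of the compile-time branch of reduceLoopCount:
    // peeling one iteration either decrements the trip count or, at a
    // count of <= 1, returns 0 after the LI/MTCTR set-up has been erased.
    #include <cstdint>

    int64_t reducedTripCount(int64_t TripCount) {
      if (TripCount <= 1)
        return 0; // the pipelined kernel never runs
      return TripCount - 1;
    }

On the run-time path no subtraction is emitted at all: the BDZ/BDZ8 inserted later decrements the CTR itself, so reduceLoopCount only records the branch condition (an immediate 0 plus the CTR/CTR8 register).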

llvm/lib/Target/PowerPC/PPCInstrInfo.h

@@ -457,6 +457,34 @@ public:
}
return Reg;
}
  /// Check \p Opcode is BDNZ (Decrement CTR and branch if it is still nonzero).
  bool isBDNZ(unsigned Opcode) const;

  /// Find the hardware loop instruction used to set up the specified loop.
  /// On PPC, we have two instructions used to set up the hardware loop
  /// (MTCTRloop, MTCTR8loop) with corresponding endloop (BDNZ, BDNZ8)
  /// instructions to indicate the end of a loop.
  MachineInstr *findLoopInstr(MachineBasicBlock &PreHeader) const;

  /// Analyze the loop code to find the loop induction variable and compare
  /// used to compute the number of iterations. Currently, we analyze only
  /// loops that are controlled using hardware loops. In this case, the
  /// induction variable instruction is null. For all other cases, this
  /// function returns true, which means we're unable to analyze it.
  /// \p IndVarInst and \p CmpInst return new values when we can analyze the
  /// read-only loop \p L; otherwise, nothing is changed.
  bool analyzeLoop(MachineLoop &L, MachineInstr *&IndVarInst,
                   MachineInstr *&CmpInst) const override;

  /// Generate code to reduce the loop iteration by one and check if the loop
  /// is finished. Return the value/register of the new loop count. We need
  /// this function when peeling off one or more iterations of a loop. This
  /// function assumes the last iteration is peeled first.
  unsigned reduceLoopCount(MachineBasicBlock &MBB, MachineBasicBlock &PreHeader,
                           MachineInstr *IndVar, MachineInstr &Cmp,
                           SmallVectorImpl<MachineOperand> &Cond,
                           SmallVectorImpl<MachineInstr *> &PrevInsts,
                           unsigned Iter, unsigned MaxIter) const override;
};
}
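
Note that the analyzeLoop contract is inverted relative to its name: returning false means the loop was recognized. A standalone model of the CTR-loop case (hypothetical types, for illustration only):

    // Hypothetical model of the analyzeLoop contract: false == recognized.
    // For a CTR loop ending in BDNZ/BDNZ8 the induction variable stays
    // null, because the count lives in the CTR register, not in a GPR.
    struct LoopModel { bool EndsInBDNZ; };

    bool analyzeLoopModel(const LoopModel &L, bool &IsCtrLoop) {
      if (L.EndsInBDNZ) {
        IsCtrLoop = true;
        return false; // analyzable hardware loop
      }
      return true; // anything else: the pipeliner gives up
    }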

llvm/lib/Target/PowerPC/PPCSubtarget.cpp

@@ -39,6 +39,11 @@ static cl::opt<bool> QPXStackUnaligned("qpx-stack-unaligned",
cl::desc("Even when QPX is enabled the stack is not 32-byte aligned"),
cl::Hidden);
+static cl::opt<bool>
+    EnableMachinePipeliner("ppc-enable-pipeliner",
+                           cl::desc("Enable Machine Pipeliner for PPC"),
+                           cl::init(false), cl::Hidden);
+
PPCSubtarget &PPCSubtarget::initializeSubtargetDependencies(StringRef CPU,
StringRef FS) {
initializeEnvironment();
@@ -181,10 +186,14 @@ bool PPCSubtarget::hasLazyResolverStub(const GlobalValue *GV) const {
return false;
}
-bool PPCSubtarget::enableMachineScheduler() const {
-  return true;
+bool PPCSubtarget::enableMachineScheduler() const { return true; }
+
+bool PPCSubtarget::enableMachinePipeliner() const {
+  return (DarwinDirective == PPC::DIR_PWR9) && EnableMachinePipeliner;
}
+
+bool PPCSubtarget::useDFAforSMS() const { return false; }

// This overrides the PostRAScheduler bit in the SchedModel for each CPU.
bool PPCSubtarget::enablePostRAScheduler() const { return true; }

llvm/lib/Target/PowerPC/PPCSubtarget.h

@@ -322,9 +322,13 @@ public:
/// but may expand the ISEL instruction later.
bool enableEarlyIfConversion() const override { return true; }
-  // Scheduling customization.
+  /// Scheduling customization.
bool enableMachineScheduler() const override;
-  // This overrides the PostRAScheduler bit in the SchedModel for each CPU.
+
+  /// Pipeliner customization.
+  bool enableMachinePipeliner() const override;
+  /// Machine Pipeliner customization
+  bool useDFAforSMS() const override;
+
+  /// This overrides the PostRAScheduler bit in the SchedModel for each CPU.
bool enablePostRAScheduler() const override;
AntiDepBreakMode getAntiDepBreakMode() const override;
void getCriticalPathRCs(RegClassVector &CriticalPathRCs) const override;

llvm/lib/Target/PowerPC/PPCTargetMachine.cpp

@@ -488,6 +488,9 @@ void PPCPassConfig::addPreRegAlloc() {
}
if (EnableExtraTOCRegDeps)
addPass(createPPCTOCRegDepsPass());
+
+  if (getOptLevel() != CodeGenOpt::None)
+    addPass(&MachinePipelinerID);
}
void PPCPassConfig::addPreSched2() {
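
Taken together with the subtarget hook above, three gates must all hold before the pipeliner actually runs on PPC. A condensed, hypothetical predicate (for illustration; not code from the patch):

    // The pass is only scheduled above -O0 (PPCTargetMachine.cpp) and then
    // bails out early unless the subtarget hook fires, which on PPC means
    // a pwr9 CPU plus the -ppc-enable-pipeliner flag (PPCSubtarget.cpp).
    bool pipelinerRunsOnPPC(bool OptNone, bool IsPwr9, bool FlagSet) {
      return !OptNone && IsPwr9 && FlagSet;
    }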

llvm/test/CodeGen/PowerPC/sms-simple.ll (new file)

@@ -0,0 +1,78 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -ppc-vsr-nums-as-vr -mtriple=powerpc64-unknown-linux-gnu \
; RUN: -verify-machineinstrs -ppc-asm-full-reg-names -mcpu=pwr9 --ppc-enable-pipeliner \
; RUN: | FileCheck %s

@x = dso_local local_unnamed_addr global <{ i32, i32, i32, i32, [1020 x i32] }> <{ i32 1, i32 2, i32 3, i32 4, [1020 x i32] zeroinitializer }>, align 4
@y = common dso_local global [1024 x i32] zeroinitializer, align 4

; Function Attrs: norecurse nounwind
define dso_local i32* @foo() local_unnamed_addr #0 {
; CHECK-LABEL: foo:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: addis r5, r2, x@toc@ha
; CHECK-NEXT: addi r5, r5, x@toc@l
; CHECK-NEXT: addis r6, r2, y@toc@ha
; CHECK-NEXT: li r7, 340
; CHECK-NEXT: addi r3, r6, y@toc@l
; CHECK-NEXT: lwz r6, y@toc@l(r6)
; CHECK-NEXT: mtctr r7
; CHECK-NEXT: addi r5, r5, -8
; CHECK-NEXT: lwzu r7, 12(r5)
; CHECK-NEXT: maddld r6, r7, r7, r6
; CHECK-NEXT: lwz r7, 4(r5)
; CHECK-NEXT: addi r4, r3, -8
; CHECK-NEXT: stwu r6, 12(r4)
; CHECK-NEXT: maddld r6, r7, r7, r6
; CHECK-NEXT: lwz r7, 8(r5)
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_1: # %for.body
; CHECK: maddld r7, r7, r7, r6
; CHECK-NEXT: lwzu r8, 12(r5)
; CHECK-NEXT: maddld r8, r8, r8, r7
; CHECK-NEXT: stw r6, 4(r4)
; CHECK-NEXT: lwz r6, 4(r5)
; CHECK-NEXT: maddld r6, r6, r6, r8
; CHECK-NEXT: stw r7, 8(r4)
; CHECK-NEXT: lwz r7, 8(r5)
; CHECK-NEXT: stwu r8, 12(r4)
; CHECK-NEXT: bdnz .LBB0_1
; CHECK-NEXT: # %bb.2:
; CHECK-NEXT: maddld r5, r7, r7, r6
; CHECK-NEXT: stw r6, 4(r4)
; CHECK-NEXT: stw r5, 8(r4)
; CHECK-NEXT: blr
entry:
  %.pre = load i32, i32* getelementptr inbounds ([1024 x i32], [1024 x i32]* @y, i64 0, i64 0), align 4
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ret i32* getelementptr inbounds ([1024 x i32], [1024 x i32]* @y, i64 0, i64 0)

for.body:                                         ; preds = %for.body, %entry
  %0 = phi i32 [ %.pre, %entry ], [ %add.2, %for.body ]
  %indvars.iv = phi i64 [ 1, %entry ], [ %indvars.iv.next.2, %for.body ]
  %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* bitcast (<{ i32, i32, i32, i32, [1020 x i32] }>* @x to [1024 x i32]*), i64 0, i64 %indvars.iv
  %1 = load i32, i32* %arrayidx2, align 4
  %mul = mul nsw i32 %1, %1
  %add = add nsw i32 %mul, %0
  %arrayidx6 = getelementptr inbounds [1024 x i32], [1024 x i32]* @y, i64 0, i64 %indvars.iv
  store i32 %add, i32* %arrayidx6, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %arrayidx2.1 = getelementptr inbounds [1024 x i32], [1024 x i32]* bitcast (<{ i32, i32, i32, i32, [1020 x i32] }>* @x to [1024 x i32]*), i64 0, i64 %indvars.iv.next
  %2 = load i32, i32* %arrayidx2.1, align 4
  %mul.1 = mul nsw i32 %2, %2
  %add.1 = add nsw i32 %mul.1, %add
  %arrayidx6.1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @y, i64 0, i64 %indvars.iv.next
  store i32 %add.1, i32* %arrayidx6.1, align 4
  %indvars.iv.next.1 = add nuw nsw i64 %indvars.iv, 2
  %arrayidx2.2 = getelementptr inbounds [1024 x i32], [1024 x i32]* bitcast (<{ i32, i32, i32, i32, [1020 x i32] }>* @x to [1024 x i32]*), i64 0, i64 %indvars.iv.next.1
  %3 = load i32, i32* %arrayidx2.2, align 4
  %mul.2 = mul nsw i32 %3, %3
  %add.2 = add nsw i32 %mul.2, %add.1
  %arrayidx6.2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @y, i64 0, i64 %indvars.iv.next.1
  store i32 %add.2, i32* %arrayidx6.2, align 4
  %indvars.iv.next.2 = add nuw nsw i64 %indvars.iv, 3
  %exitcond.2 = icmp eq i64 %indvars.iv.next.2, 1024
  br i1 %exitcond.2, label %for.cond.cleanup, label %for.body
}