forked from OSchip/llvm-project
[AMDGPU][NewPM] Port amdgpu-promote-alloca(-to-vector)
Also add these passes to the AMDGPU opt pipeline. Don't pin an opt run to the legacy PM when -enable-new-pm=1 if these passes (or passes introduced in https://reviews.llvm.org/D93863) are in the list of passes. Reviewed By: arsenm Differential Revision: https://reviews.llvm.org/D93875
This commit is contained in:
parent
94427af60c
commit
0e9abcfc19
|
@ -208,6 +208,23 @@ FunctionPass *createAMDGPUPromoteAllocaToVector();
|
|||
void initializeAMDGPUPromoteAllocaToVectorPass(PassRegistry&);
|
||||
extern char &AMDGPUPromoteAllocaToVectorID;
|
||||
|
||||
/// New pass manager function pass for "amdgpu-promote-alloca" (the name it is
/// registered under in AMDGPUTargetMachine::registerPassBuilderCallbacks).
struct AMDGPUPromoteAllocaPass : PassInfoMixin<AMDGPUPromoteAllocaPass> {
|
||||
/// \param TM target machine the pass queries; kept as a reference, not copied.
AMDGPUPromoteAllocaPass(TargetMachine &TM) : TM(TM) {}
|
||||
/// Runs the promotion on \p F and reports which analyses remain valid.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
|
||||
|
||||
private:
|
||||
// Reference to the target machine supplied at construction.
TargetMachine &TM;
|
||||
};
|
||||
|
||||
/// New pass manager function pass for "amdgpu-promote-alloca-to-vector" (the
/// name it is registered under in
/// AMDGPUTargetMachine::registerPassBuilderCallbacks).
struct AMDGPUPromoteAllocaToVectorPass
|
||||
: PassInfoMixin<AMDGPUPromoteAllocaToVectorPass> {
|
||||
/// \param TM target machine the pass queries; kept as a reference, not copied.
AMDGPUPromoteAllocaToVectorPass(TargetMachine &TM) : TM(TM) {}
|
||||
/// Runs the promotion on \p F and reports which analyses remain valid.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
|
||||
|
||||
private:
|
||||
// Reference to the target machine supplied at construction.
TargetMachine &TM;
|
||||
};
|
||||
|
||||
Pass *createAMDGPUStructurizeCFGPass();
|
||||
FunctionPass *createAMDGPUISelDag(
|
||||
TargetMachine *TM = nullptr,
|
||||
|
|
|
@ -42,6 +42,7 @@
|
|||
#include "llvm/IR/LLVMContext.h"
|
||||
#include "llvm/IR/Metadata.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/IR/PassManager.h"
|
||||
#include "llvm/IR/Type.h"
|
||||
#include "llvm/IR/User.h"
|
||||
#include "llvm/IR/Value.h"
|
||||
|
@ -83,8 +84,26 @@ static cl::opt<unsigned> PromoteAllocaToVectorLimit(
|
|||
|
||||
// FIXME: This can create globals so should be a module pass.
|
||||
/// Legacy pass manager FunctionPass wrapper for the "AMDGPU Promote Alloca"
/// transformation.
class AMDGPUPromoteAlloca : public FunctionPass {
|
||||
public:
|
||||
// Pass identification for the legacy pass manager.
static char ID;
|
||||
|
||||
AMDGPUPromoteAlloca() : FunctionPass(ID) {}
|
||||
|
||||
/// Legacy entry point; returns whether \p F was modified.
bool runOnFunction(Function &F) override;
|
||||
|
||||
StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
|
||||
|
||||
bool handleAlloca(AllocaInst &I, bool SufficientLDS);
|
||||
|
||||
/// Declares that this pass preserves the CFG, then defers to the base class
/// for the remaining analysis usage.
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
||||
AU.setPreservesCFG();
|
||||
FunctionPass::getAnalysisUsage(AU);
|
||||
}
|
||||
};
|
||||
|
||||
class AMDGPUPromoteAllocaImpl {
|
||||
private:
|
||||
const TargetMachine *TM;
|
||||
const TargetMachine &TM;
|
||||
Module *Mod = nullptr;
|
||||
const DataLayout *DL = nullptr;
|
||||
|
||||
|
@ -116,28 +135,14 @@ private:
|
|||
/// Check whether we have enough local memory for promotion.
|
||||
bool hasSufficientLocalMem(const Function &F);
|
||||
|
||||
public:
|
||||
static char ID;
|
||||
|
||||
AMDGPUPromoteAlloca() : FunctionPass(ID) {}
|
||||
|
||||
bool doInitialization(Module &M) override;
|
||||
bool runOnFunction(Function &F) override;
|
||||
|
||||
StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
|
||||
|
||||
bool handleAlloca(AllocaInst &I, bool SufficientLDS);
|
||||
|
||||
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
||||
AU.setPreservesCFG();
|
||||
FunctionPass::getAnalysisUsage(AU);
|
||||
}
|
||||
public:
|
||||
AMDGPUPromoteAllocaImpl(TargetMachine &TM) : TM(TM) {}
|
||||
bool run(Function &F);
|
||||
};
|
||||
|
||||
class AMDGPUPromoteAllocaToVector : public FunctionPass {
|
||||
private:
|
||||
unsigned MaxVGPRs;
|
||||
|
||||
public:
|
||||
static char ID;
|
||||
|
||||
|
@ -149,8 +154,6 @@ public:
|
|||
return "AMDGPU Promote Alloca to vector";
|
||||
}
|
||||
|
||||
bool handleAlloca(AllocaInst &I);
|
||||
|
||||
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
||||
AU.setPreservesCFG();
|
||||
FunctionPass::getAnalysisUsage(AU);
|
||||
|
@ -171,32 +174,41 @@ INITIALIZE_PASS(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
|
|||
char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
|
||||
char &llvm::AMDGPUPromoteAllocaToVectorID = AMDGPUPromoteAllocaToVector::ID;
|
||||
|
||||
/// Records the module and its data layout in Mod and DL for later use.
/// Always returns false.
bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
|
||||
Mod = &M;
|
||||
DL = &Mod->getDataLayout();
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
|
||||
if (skipFunction(F))
|
||||
return false;
|
||||
|
||||
if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
|
||||
TM = &TPC->getTM<TargetMachine>();
|
||||
else
|
||||
return false;
|
||||
if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) {
|
||||
return AMDGPUPromoteAllocaImpl(TPC->getTM<TargetMachine>()).run(F);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
const Triple &TT = TM->getTargetTriple();
|
||||
/// New pass manager entry point: runs AMDGPUPromoteAllocaImpl on \p F.
/// \p AM is not used in this body.
/// \returns all analyses preserved when nothing changed; otherwise only the
/// CFG analyses set.
PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
|
||||
FunctionAnalysisManager &AM) {
|
||||
// A fresh implementation object is built for every function run.
bool Changed = AMDGPUPromoteAllocaImpl(TM).run(F);
|
||||
if (Changed) {
|
||||
// Only the CFG analyses set is marked as preserved after a change.
PreservedAnalyses PA;
|
||||
PA.preserveSet<CFGAnalyses>();
|
||||
return PA;
|
||||
}
|
||||
return PreservedAnalyses::all();
|
||||
}
|
||||
|
||||
bool AMDGPUPromoteAllocaImpl::run(Function &F) {
|
||||
Mod = F.getParent();
|
||||
DL = &Mod->getDataLayout();
|
||||
|
||||
const Triple &TT = TM.getTargetTriple();
|
||||
IsAMDGCN = TT.getArch() == Triple::amdgcn;
|
||||
IsAMDHSA = TT.getOS() == Triple::AMDHSA;
|
||||
|
||||
const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
|
||||
const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
|
||||
if (!ST.isPromoteAllocaEnabled())
|
||||
return false;
|
||||
|
||||
if (IsAMDGCN) {
|
||||
const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
|
||||
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
|
||||
MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
|
||||
} else {
|
||||
MaxVGPRs = 128;
|
||||
|
@ -221,9 +233,9 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
|
|||
}
|
||||
|
||||
std::pair<Value *, Value *>
|
||||
AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
|
||||
AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) {
|
||||
const Function &F = *Builder.GetInsertBlock()->getParent();
|
||||
const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
|
||||
const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
|
||||
|
||||
if (!IsAMDHSA) {
|
||||
Function *LocalSizeYFn
|
||||
|
@ -308,9 +320,10 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
|
|||
return std::make_pair(Y, LoadZU);
|
||||
}
|
||||
|
||||
Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
|
||||
Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder,
|
||||
unsigned N) {
|
||||
const AMDGPUSubtarget &ST =
|
||||
AMDGPUSubtarget::get(*TM, *Builder.GetInsertBlock()->getParent());
|
||||
AMDGPUSubtarget::get(TM, *Builder.GetInsertBlock()->getParent());
|
||||
Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
|
||||
|
||||
switch (N) {
|
||||
|
@ -592,11 +605,9 @@ static bool isCallPromotable(CallInst *CI) {
|
|||
}
|
||||
}
|
||||
|
||||
bool AMDGPUPromoteAlloca::binaryOpIsDerivedFromSameAlloca(Value *BaseAlloca,
|
||||
Value *Val,
|
||||
Instruction *Inst,
|
||||
int OpIdx0,
|
||||
int OpIdx1) const {
|
||||
bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca(
|
||||
Value *BaseAlloca, Value *Val, Instruction *Inst, int OpIdx0,
|
||||
int OpIdx1) const {
|
||||
// Figure out which operand is the one we might not be promoting.
|
||||
Value *OtherOp = Inst->getOperand(OpIdx0);
|
||||
if (Val == OtherOp)
|
||||
|
@ -624,10 +635,8 @@ bool AMDGPUPromoteAlloca::binaryOpIsDerivedFromSameAlloca(Value *BaseAlloca,
|
|||
return true;
|
||||
}
|
||||
|
||||
bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
|
||||
Value *BaseAlloca,
|
||||
Value *Val,
|
||||
std::vector<Value*> &WorkList) const {
|
||||
bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes(
|
||||
Value *BaseAlloca, Value *Val, std::vector<Value *> &WorkList) const {
|
||||
|
||||
for (User *User : Val->users()) {
|
||||
if (is_contained(WorkList, User))
|
||||
|
@ -727,10 +736,10 @@ bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
|
|||
return true;
|
||||
}
|
||||
|
||||
bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) {
|
||||
bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
|
||||
|
||||
FunctionType *FTy = F.getFunctionType();
|
||||
const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
|
||||
const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
|
||||
|
||||
// If the function has any arguments in the local address space, then it's
|
||||
// possible these arguments require the entire local memory space, so
|
||||
|
@ -863,7 +872,7 @@ bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) {
|
|||
}
|
||||
|
||||
// FIXME: Should try to pick the most likely to be profitable allocas first.
|
||||
bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
|
||||
bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) {
|
||||
// Array allocations are probably not worth handling, since an allocation of
|
||||
// the array type is the canonical form.
|
||||
if (!I.isStaticAlloca() || I.isArrayAllocation())
|
||||
|
@ -904,7 +913,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
|
|||
if (!SufficientLDS)
|
||||
return false;
|
||||
|
||||
const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, ContainingFunction);
|
||||
const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, ContainingFunction);
|
||||
unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
|
||||
|
||||
Align Alignment =
|
||||
|
@ -1083,22 +1092,29 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
|
|||
return true;
|
||||
}
|
||||
|
||||
bool AMDGPUPromoteAllocaToVector::runOnFunction(Function &F) {
|
||||
if (skipFunction(F) || DisablePromoteAllocaToVector)
|
||||
bool handlePromoteAllocaToVector(AllocaInst &I, unsigned MaxVGPRs) {
|
||||
// Array allocations are probably not worth handling, since an allocation of
|
||||
// the array type is the canonical form.
|
||||
if (!I.isStaticAlloca() || I.isArrayAllocation())
|
||||
return false;
|
||||
|
||||
const TargetMachine *TM;
|
||||
if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
|
||||
TM = &TPC->getTM<TargetMachine>();
|
||||
else
|
||||
LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');
|
||||
|
||||
Module *Mod = I.getParent()->getParent()->getParent();
|
||||
return tryPromoteAllocaToVector(&I, Mod->getDataLayout(), MaxVGPRs);
|
||||
}
|
||||
|
||||
bool promoteAllocasToVector(Function &F, TargetMachine &TM) {
|
||||
if (DisablePromoteAllocaToVector)
|
||||
return false;
|
||||
|
||||
const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
|
||||
const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
|
||||
if (!ST.isPromoteAllocaEnabled())
|
||||
return false;
|
||||
|
||||
if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
|
||||
const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
|
||||
unsigned MaxVGPRs;
|
||||
if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
|
||||
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
|
||||
MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
|
||||
} else {
|
||||
MaxVGPRs = 128;
|
||||
|
@ -1114,23 +1130,31 @@ bool AMDGPUPromoteAllocaToVector::runOnFunction(Function &F) {
|
|||
}
|
||||
|
||||
for (AllocaInst *AI : Allocas) {
|
||||
if (handleAlloca(*AI))
|
||||
if (handlePromoteAllocaToVector(*AI, MaxVGPRs))
|
||||
Changed = true;
|
||||
}
|
||||
|
||||
return Changed;
|
||||
}
|
||||
|
||||
bool AMDGPUPromoteAllocaToVector::handleAlloca(AllocaInst &I) {
|
||||
// Array allocations are probably not worth handling, since an allocation of
|
||||
// the array type is the canonical form.
|
||||
if (!I.isStaticAlloca() || I.isArrayAllocation())
|
||||
/// Legacy pass manager entry point: forwards to promoteAllocasToVector().
/// \returns false when the function is skipped or no TargetPassConfig is
/// available; otherwise the result of promoteAllocasToVector().
bool AMDGPUPromoteAllocaToVector::runOnFunction(Function &F) {
|
||||
if (skipFunction(F))
|
||||
return false;
|
||||
// The target machine is obtained from TargetPassConfig; without it the pass
// does nothing.
if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) {
|
||||
return promoteAllocasToVector(F, TPC->getTM<TargetMachine>());
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');
|
||||
|
||||
Module *Mod = I.getParent()->getParent()->getParent();
|
||||
return tryPromoteAllocaToVector(&I, Mod->getDataLayout(), MaxVGPRs);
|
||||
/// New pass manager entry point: runs promoteAllocasToVector() on \p F with
/// the stored target machine. \p AM is not used in this body.
/// \returns all analyses preserved when nothing changed; otherwise only the
/// CFG analyses set.
PreservedAnalyses
|
||||
AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
|
||||
bool Changed = promoteAllocasToVector(F, TM);
|
||||
if (Changed) {
|
||||
// Only the CFG analyses set is marked as preserved after a change.
PreservedAnalyses PA;
|
||||
PA.preserveSet<CFGAnalyses>();
|
||||
return PA;
|
||||
}
|
||||
return PreservedAnalyses::all();
|
||||
}
|
||||
|
||||
FunctionPass *llvm::createAMDGPUPromoteAlloca() {
|
||||
|
|
|
@ -29,6 +29,7 @@
|
|||
#include "SIMachineFunctionInfo.h"
|
||||
#include "SIMachineScheduler.h"
|
||||
#include "TargetInfo/AMDGPUTargetInfo.h"
|
||||
#include "llvm/Analysis/CGSCCPassManager.h"
|
||||
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
|
||||
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
|
||||
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
|
||||
|
@ -488,8 +489,8 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
|
|||
void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB,
|
||||
bool DebugPassManager) {
|
||||
PB.registerPipelineParsingCallback(
|
||||
[](StringRef PassName, FunctionPassManager &PM,
|
||||
ArrayRef<PassBuilder::PipelineElement>) {
|
||||
[this](StringRef PassName, FunctionPassManager &PM,
|
||||
ArrayRef<PassBuilder::PipelineElement>) {
|
||||
if (PassName == "amdgpu-simplifylib") {
|
||||
PM.addPass(AMDGPUSimplifyLibCallsPass());
|
||||
return true;
|
||||
|
@ -498,6 +499,14 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB,
|
|||
PM.addPass(AMDGPUUseNativeCallsPass());
|
||||
return true;
|
||||
}
|
||||
if (PassName == "amdgpu-promote-alloca") {
|
||||
PM.addPass(AMDGPUPromoteAllocaPass(*this));
|
||||
return true;
|
||||
}
|
||||
if (PassName == "amdgpu-promote-alloca-to-vector") {
|
||||
PM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
});
|
||||
|
||||
|
@ -510,6 +519,18 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB,
|
|||
FPM.addPass(AMDGPUSimplifyLibCallsPass());
|
||||
PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
|
||||
});
|
||||
|
||||
PB.registerCGSCCOptimizerLateEPCallback(
|
||||
[this, DebugPassManager](CGSCCPassManager &PM,
|
||||
PassBuilder::OptimizationLevel Level) {
|
||||
if (Level != PassBuilder::OptimizationLevel::O0) {
|
||||
FunctionPassManager FPM(DebugPassManager);
|
||||
// Promote alloca to vector before SROA and loop unroll. If we manage
|
||||
// to eliminate allocas before unroll we may choose to unroll less.
|
||||
FPM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
|
||||
PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
; RUN: opt -mtriple=amdgcn-- -O1 -S < %s | FileCheck %s --check-prefixes=FUNC,LOOP
|
||||
; RUN: opt -mtriple=amdgcn-- -passes='default<O1>' -S < %s | FileCheck %s --check-prefixes=FUNC,LOOP
|
||||
; RUN: opt -mtriple=amdgcn-- -O1 -S -disable-promote-alloca-to-vector < %s | FileCheck %s --check-prefixes=FUNC,FULL-UNROLL
|
||||
; RUN: opt -mtriple=amdgcn-- -passes='default<O1>' -S -disable-promote-alloca-to-vector < %s | FileCheck %s --check-prefixes=FUNC,FULL-UNROLL
|
||||
|
||||
target datalayout = "A5"
|
||||
|
||||
|
|
|
@ -4,6 +4,7 @@
|
|||
; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
|
||||
; RUN: llc -march=r600 -mtriple=r600-- -mcpu=redwood < %s | FileCheck --check-prefix=EG -check-prefix=FUNC %s
|
||||
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-promote-alloca -sroa -instcombine < %s | FileCheck -check-prefix=OPT %s
|
||||
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-promote-alloca,sroa,instcombine < %s | FileCheck -check-prefix=OPT %s
|
||||
target datalayout = "A5"
|
||||
|
||||
; OPT-LABEL: @vector_read(
|
||||
|
|
|
@ -462,6 +462,13 @@ struct TimeTracerRAII {
|
|||
// TODO: use a codegen version of PassRegistry.def/PassBuilder::is*Pass() once
|
||||
// it exists.
|
||||
static bool shouldPinPassToLegacyPM(StringRef Pass) {
|
||||
std::vector<StringRef> PassNameExactToIgnore = {
|
||||
"amdgpu-simplifylib", "amdgpu-usenative", "amdgpu-promote-alloca",
|
||||
"amdgpu-promote-alloca-to-vector"};
|
||||
for (const auto &P : PassNameExactToIgnore)
|
||||
if (Pass == P)
|
||||
return false;
|
||||
|
||||
std::vector<StringRef> PassNamePrefix = {
|
||||
"x86-", "xcore-", "wasm-", "systemz-", "ppc-", "nvvm-", "nvptx-",
|
||||
"mips-", "lanai-", "hexagon-", "bpf-", "avr-", "thumb2-", "arm-",
|
||||
|
|
Loading…
Reference in New Issue