forked from OSchip/llvm-project
459 lines
17 KiB
C++
459 lines
17 KiB
C++
//===-- AMDGPUReplaceLDSUseWithPointer.cpp --------------------------------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
// This pass replaces all the uses of LDS within non-kernel functions by
|
|
// corresponding pointer counterparts.
|
|
//
|
|
// The main motivation behind this pass is to *prevent* the subsequent LDS lowering
|
|
// pass from directly packing LDS (assume large LDS) into a struct type which
|
|
// would otherwise cause allocating huge memory for struct instance within every
|
|
// kernel.
|
|
//
|
|
// Brief sketch of the algorithm implemented in this pass is as below:
|
|
//
|
|
// 1. Collect all the LDS defined in the module which qualify for pointer
|
|
// replacement, say it is, LDSGlobals set.
|
|
//
|
|
// 2. Collect all the reachable callees for each kernel defined in the module,
|
|
// say it is, KernelToCallees map.
|
|
//
|
|
// 3. FOR (each global GV from LDSGlobals set) DO
|
|
// LDSUsedNonKernels = Collect all non-kernel functions which use GV.
|
|
// FOR (each kernel K in KernelToCallees map) DO
|
|
// ReachableCallees = KernelToCallees[K]
|
|
// ReachableAndLDSUsedCallees =
|
|
// SetIntersect(LDSUsedNonKernels, ReachableCallees)
|
|
// IF (ReachableAndLDSUsedCallees is not empty) THEN
|
|
// Pointer = Create a pointer to point-to GV if not created.
|
|
// Initialize Pointer to point-to GV within kernel K.
|
|
// ENDIF
|
|
// ENDFOR
|
|
// Replace all uses of GV within non kernel functions by Pointer.
|
|
// ENDFOR
|
|
//
|
|
// LLVM IR example:
|
|
//
|
|
// Input IR:
|
|
//
|
|
// @lds = internal addrspace(3) global [4 x i32] undef, align 16
|
|
//
|
|
// define internal void @f0() {
|
|
// entry:
|
|
// %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds,
|
|
// i32 0, i32 0
|
|
// ret void
|
|
// }
|
|
//
|
|
// define protected amdgpu_kernel void @k0() {
|
|
// entry:
|
|
// call void @f0()
|
|
// ret void
|
|
// }
|
|
//
|
|
// Output IR:
|
|
//
|
|
// @lds = internal addrspace(3) global [4 x i32] undef, align 16
|
|
// @lds.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2
|
|
//
|
|
// define internal void @f0() {
|
|
// entry:
|
|
// %0 = load i16, i16 addrspace(3)* @lds.ptr, align 2
|
|
// %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0
|
|
// %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)*
|
|
// %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2,
|
|
// i32 0, i32 0
|
|
// ret void
|
|
// }
|
|
//
|
|
// define protected amdgpu_kernel void @k0() {
|
|
// entry:
|
|
// store i16 ptrtoint ([4 x i32] addrspace(3)* @lds to i16),
|
|
// i16 addrspace(3)* @lds.ptr, align 2
|
|
// call void @f0()
|
|
// ret void
|
|
// }
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "AMDGPU.h"
|
|
#include "GCNSubtarget.h"
|
|
#include "Utils/AMDGPUBaseInfo.h"
|
|
#include "Utils/AMDGPULDSUtils.h"
|
|
#include "llvm/ADT/DenseMap.h"
|
|
#include "llvm/ADT/STLExtras.h"
|
|
#include "llvm/ADT/SetOperations.h"
|
|
#include "llvm/CodeGen/TargetPassConfig.h"
|
|
#include "llvm/IR/Constants.h"
|
|
#include "llvm/IR/DerivedTypes.h"
|
|
#include "llvm/IR/IRBuilder.h"
|
|
#include "llvm/IR/InlineAsm.h"
|
|
#include "llvm/IR/Instructions.h"
|
|
#include "llvm/IR/IntrinsicsAMDGPU.h"
|
|
#include "llvm/IR/ReplaceConstant.h"
|
|
#include "llvm/InitializePasses.h"
|
|
#include "llvm/Pass.h"
|
|
#include "llvm/Support/Debug.h"
|
|
#include "llvm/Target/TargetMachine.h"
|
|
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
|
|
#include "llvm/Transforms/Utils/ModuleUtils.h"
|
|
#include <algorithm>
|
|
#include <vector>
|
|
|
|
#define DEBUG_TYPE "amdgpu-replace-lds-use-with-pointer"
|
|
|
|
using namespace llvm;
|
|
|
|
namespace {
|
|
|
|
class ReplaceLDSUseImpl {
|
|
Module &M;
|
|
LLVMContext &Ctx;
|
|
const DataLayout &DL;
|
|
Constant *LDSMemBaseAddr;
|
|
|
|
DenseMap<GlobalVariable *, GlobalVariable *> LDSToPointer;
|
|
DenseMap<GlobalVariable *, SmallPtrSet<Function *, 8>> LDSToNonKernels;
|
|
DenseMap<Function *, SmallPtrSet<Function *, 8>> KernelToCallees;
|
|
DenseMap<Function *, SmallPtrSet<GlobalVariable *, 8>> KernelToLDSPointers;
|
|
DenseMap<Function *, BasicBlock *> KernelToInitBB;
|
|
DenseMap<Function *, DenseMap<GlobalVariable *, Value *>>
|
|
FunctionToLDSToReplaceInst;
|
|
|
|
// Collect LDS which requires their uses to be replaced by pointer.
|
|
std::vector<GlobalVariable *> collectLDSRequiringPointerReplace() {
|
|
// Collect LDS which requires module lowering.
|
|
std::vector<GlobalVariable *> LDSGlobals = AMDGPU::findVariablesToLower(M);
|
|
|
|
// Remove LDS which don't qualify for replacement.
|
|
llvm::erase_if(LDSGlobals, [&](GlobalVariable *GV) {
|
|
return shouldIgnorePointerReplacement(GV);
|
|
});
|
|
|
|
return LDSGlobals;
|
|
}
|
|
|
|
// Returns true if uses of given LDS global within non-kernel functions should
|
|
// be keep as it is without pointer replacement.
|
|
bool shouldIgnorePointerReplacement(GlobalVariable *GV) {
|
|
// LDS whose size is very small and doesn't exceed pointer size is not worth
|
|
// replacing.
|
|
if (DL.getTypeAllocSize(GV->getValueType()) <= 2)
|
|
return true;
|
|
|
|
// LDS which is not used from non-kernel function scope or it is used from
|
|
// global scope does not qualify for replacement.
|
|
LDSToNonKernels[GV] = AMDGPU::collectNonKernelAccessorsOfLDS(GV);
|
|
return LDSToNonKernels[GV].empty();
|
|
|
|
// FIXME: When GV is used within all (or within most of the kernels), then
|
|
// it does not make sense to create a pointer for it.
|
|
}
|
|
|
|
// Insert new global LDS pointer which points to LDS.
|
|
GlobalVariable *createLDSPointer(GlobalVariable *GV) {
|
|
// LDS pointer which points to LDS is already created? Return it.
|
|
auto PointerEntry = LDSToPointer.insert(std::make_pair(GV, nullptr));
|
|
if (!PointerEntry.second)
|
|
return PointerEntry.first->second;
|
|
|
|
// We need to create new LDS pointer which points to LDS.
|
|
//
|
|
// Each CU owns at max 64K of LDS memory, so LDS address ranges from 0 to
|
|
// 2^16 - 1. Hence 16 bit pointer is enough to hold the LDS address.
|
|
auto *I16Ty = Type::getInt16Ty(Ctx);
|
|
GlobalVariable *LDSPointer = new GlobalVariable(
|
|
M, I16Ty, false, GlobalValue::InternalLinkage, UndefValue::get(I16Ty),
|
|
GV->getName() + Twine(".ptr"), nullptr, GlobalVariable::NotThreadLocal,
|
|
AMDGPUAS::LOCAL_ADDRESS);
|
|
|
|
LDSPointer->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
|
|
LDSPointer->setAlignment(AMDGPU::getAlign(DL, LDSPointer));
|
|
|
|
// Mark that an associated LDS pointer is created for LDS.
|
|
LDSToPointer[GV] = LDSPointer;
|
|
|
|
return LDSPointer;
|
|
}
|
|
|
|
// Split entry basic block in such a way that only lane 0 of each wave does
|
|
// the LDS pointer initialization, and return newly created basic block.
|
|
BasicBlock *activateLaneZero(Function *K) {
|
|
// If the entry basic block of kernel K is already split, then return
|
|
// newly created basic block.
|
|
auto BasicBlockEntry = KernelToInitBB.insert(std::make_pair(K, nullptr));
|
|
if (!BasicBlockEntry.second)
|
|
return BasicBlockEntry.first->second;
|
|
|
|
// Split entry basic block of kernel K.
|
|
auto *EI = &(*(K->getEntryBlock().getFirstInsertionPt()));
|
|
IRBuilder<> Builder(EI);
|
|
|
|
Value *Mbcnt =
|
|
Builder.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
|
|
{Builder.getInt32(-1), Builder.getInt32(0)});
|
|
Value *Cond = Builder.CreateICmpEQ(Mbcnt, Builder.getInt32(0));
|
|
Instruction *WB = cast<Instruction>(
|
|
Builder.CreateIntrinsic(Intrinsic::amdgcn_wave_barrier, {}, {}));
|
|
|
|
BasicBlock *NBB = SplitBlockAndInsertIfThen(Cond, WB, false)->getParent();
|
|
|
|
// Mark that the entry basic block of kernel K is split.
|
|
KernelToInitBB[K] = NBB;
|
|
|
|
return NBB;
|
|
}
|
|
|
|
// Within given kernel, initialize given LDS pointer to point to given LDS.
|
|
void initializeLDSPointer(Function *K, GlobalVariable *GV,
|
|
GlobalVariable *LDSPointer) {
|
|
// If LDS pointer is already initialized within K, then nothing to do.
|
|
auto PointerEntry = KernelToLDSPointers.insert(
|
|
std::make_pair(K, SmallPtrSet<GlobalVariable *, 8>()));
|
|
if (!PointerEntry.second)
|
|
if (PointerEntry.first->second.contains(LDSPointer))
|
|
return;
|
|
|
|
// Insert instructions at EI which initialize LDS pointer to point-to LDS
|
|
// within kernel K.
|
|
//
|
|
// That is, convert pointer type of GV to i16, and then store this converted
|
|
// i16 value within LDSPointer which is of type i16*.
|
|
auto *EI = &(*(activateLaneZero(K)->getFirstInsertionPt()));
|
|
IRBuilder<> Builder(EI);
|
|
Builder.CreateStore(Builder.CreatePtrToInt(GV, Type::getInt16Ty(Ctx)),
|
|
LDSPointer);
|
|
|
|
// Mark that LDS pointer is initialized within kernel K.
|
|
KernelToLDSPointers[K].insert(LDSPointer);
|
|
}
|
|
|
|
// We have created an LDS pointer for LDS, and initialized it to point-to LDS
|
|
// within all relevant kernels. Now replace all the uses of LDS within
|
|
// non-kernel functions by LDS pointer.
|
|
void replaceLDSUseByPointer(GlobalVariable *GV, GlobalVariable *LDSPointer) {
|
|
SmallVector<User *, 8> LDSUsers(GV->users());
|
|
for (auto *U : LDSUsers) {
|
|
// When `U` is a constant expression, it is possible that same constant
|
|
// expression exists within multiple instructions, and within multiple
|
|
// non-kernel functions. Collect all those non-kernel functions and all
|
|
// those instructions within which `U` exist.
|
|
auto FunctionToInsts =
|
|
AMDGPU::getFunctionToInstsMap(U, false /*=CollectKernelInsts*/);
|
|
|
|
for (auto FI = FunctionToInsts.begin(), FE = FunctionToInsts.end();
|
|
FI != FE; ++FI) {
|
|
Function *F = FI->first;
|
|
auto &Insts = FI->second;
|
|
for (auto *I : Insts) {
|
|
// If `U` is a constant expression, then we need to break the
|
|
// associated instruction into a set of separate instructions by
|
|
// converting constant expressions into instructions.
|
|
SmallPtrSet<Instruction *, 8> UserInsts;
|
|
|
|
if (U == I) {
|
|
// `U` is an instruction, conversion from constant expression to
|
|
// set of instructions is *not* required.
|
|
UserInsts.insert(I);
|
|
} else {
|
|
// `U` is a constant expression, convert it into corresponding set
|
|
// of instructions.
|
|
auto *CE = cast<ConstantExpr>(U);
|
|
convertConstantExprsToInstructions(I, CE, &UserInsts);
|
|
}
|
|
|
|
// Go through all the user instructions, if LDS exist within them as
|
|
// an operand, then replace it by replace instruction.
|
|
for (auto *II : UserInsts) {
|
|
auto *ReplaceInst = getReplacementInst(F, GV, LDSPointer);
|
|
II->replaceUsesOfWith(GV, ReplaceInst);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Create a set of replacement instructions which together replace LDS within
|
|
// non-kernel function F by accessing LDS indirectly using LDS pointer.
|
|
Value *getReplacementInst(Function *F, GlobalVariable *GV,
|
|
GlobalVariable *LDSPointer) {
|
|
// If the instruction which replaces LDS within F is already created, then
|
|
// return it.
|
|
auto LDSEntry = FunctionToLDSToReplaceInst.insert(
|
|
std::make_pair(F, DenseMap<GlobalVariable *, Value *>()));
|
|
if (!LDSEntry.second) {
|
|
auto ReplaceInstEntry =
|
|
LDSEntry.first->second.insert(std::make_pair(GV, nullptr));
|
|
if (!ReplaceInstEntry.second)
|
|
return ReplaceInstEntry.first->second;
|
|
}
|
|
|
|
// Get the instruction insertion point within the beginning of the entry
|
|
// block of current non-kernel function.
|
|
auto *EI = &(*(F->getEntryBlock().getFirstInsertionPt()));
|
|
IRBuilder<> Builder(EI);
|
|
|
|
// Insert required set of instructions which replace LDS within F.
|
|
auto *V = Builder.CreateBitCast(
|
|
Builder.CreateGEP(
|
|
Builder.getInt8Ty(), LDSMemBaseAddr,
|
|
Builder.CreateLoad(LDSPointer->getValueType(), LDSPointer)),
|
|
GV->getType());
|
|
|
|
// Mark that the replacement instruction which replace LDS within F is
|
|
// created.
|
|
FunctionToLDSToReplaceInst[F][GV] = V;
|
|
|
|
return V;
|
|
}
|
|
|
|
public:
|
|
ReplaceLDSUseImpl(Module &M)
|
|
: M(M), Ctx(M.getContext()), DL(M.getDataLayout()) {
|
|
LDSMemBaseAddr = Constant::getIntegerValue(
|
|
PointerType::get(Type::getInt8Ty(M.getContext()),
|
|
AMDGPUAS::LOCAL_ADDRESS),
|
|
APInt(32, 0));
|
|
}
|
|
|
|
// Entry-point function which interface ReplaceLDSUseImpl with outside of the
|
|
// class.
|
|
bool replaceLDSUse();
|
|
|
|
private:
|
|
// For a given LDS from collected LDS globals set, replace its non-kernel
|
|
// function scope uses by pointer.
|
|
bool replaceLDSUse(GlobalVariable *GV);
|
|
};
|
|
|
|
// For given LDS from collected LDS globals set, replace its non-kernel function
|
|
// scope uses by pointer.
|
|
bool ReplaceLDSUseImpl::replaceLDSUse(GlobalVariable *GV) {
|
|
// Holds all those non-kernel functions within which LDS is being accessed.
|
|
SmallPtrSet<Function *, 8> &LDSAccessors = LDSToNonKernels[GV];
|
|
|
|
// The LDS pointer which points to LDS and replaces all the uses of LDS.
|
|
GlobalVariable *LDSPointer = nullptr;
|
|
|
|
// Traverse through each kernel K, check and if required, initialize the
|
|
// LDS pointer to point to LDS within K.
|
|
for (auto KI = KernelToCallees.begin(), KE = KernelToCallees.end(); KI != KE;
|
|
++KI) {
|
|
Function *K = KI->first;
|
|
SmallPtrSet<Function *, 8> Callees = KI->second;
|
|
|
|
// Compute reachable and LDS used callees for kernel K.
|
|
set_intersect(Callees, LDSAccessors);
|
|
|
|
// None of the LDS accessing non-kernel functions are reachable from
|
|
// kernel K. Hence, no need to initialize LDS pointer within kernel K.
|
|
if (Callees.empty())
|
|
continue;
|
|
|
|
// We have found reachable and LDS used callees for kernel K, and we need to
|
|
// initialize LDS pointer within kernel K, and we need to replace LDS use
|
|
// within those callees by LDS pointer.
|
|
//
|
|
// But, first check if LDS pointer is already created, if not create one.
|
|
LDSPointer = createLDSPointer(GV);
|
|
|
|
// Initialize LDS pointer to point to LDS within kernel K.
|
|
initializeLDSPointer(K, GV, LDSPointer);
|
|
}
|
|
|
|
// We have not found reachable and LDS used callees for any of the kernels,
|
|
// and hence we have not created LDS pointer.
|
|
if (!LDSPointer)
|
|
return false;
|
|
|
|
// We have created an LDS pointer for LDS, and initialized it to point-to LDS
|
|
// within all relevant kernels. Now replace all the uses of LDS within
|
|
// non-kernel functions by LDS pointer.
|
|
replaceLDSUseByPointer(GV, LDSPointer);
|
|
|
|
return true;
|
|
}
|
|
|
|
// Entry-point function which interface ReplaceLDSUseImpl with outside of the
|
|
// class.
|
|
bool ReplaceLDSUseImpl::replaceLDSUse() {
|
|
// Collect LDS which requires their uses to be replaced by pointer.
|
|
std::vector<GlobalVariable *> LDSGlobals =
|
|
collectLDSRequiringPointerReplace();
|
|
|
|
// No LDS to pointer-replace. Nothing to do.
|
|
if (LDSGlobals.empty())
|
|
return false;
|
|
|
|
// Collect reachable callee set for each kernel defined in the module.
|
|
AMDGPU::collectReachableCallees(M, KernelToCallees);
|
|
|
|
if (KernelToCallees.empty()) {
|
|
// Either module does not have any kernel definitions, or none of the kernel
|
|
// has a call to non-kernel functions, or we could not resolve any of the
|
|
// call sites to proper non-kernel functions, because of the situations like
|
|
// inline asm calls. Nothing to replace.
|
|
return false;
|
|
}
|
|
|
|
// For every LDS from collected LDS globals set, replace its non-kernel
|
|
// function scope use by pointer.
|
|
bool Changed = false;
|
|
for (auto *GV : LDSGlobals)
|
|
Changed |= replaceLDSUse(GV);
|
|
|
|
return Changed;
|
|
}
|
|
|
|
class AMDGPUReplaceLDSUseWithPointer : public ModulePass {
|
|
public:
|
|
static char ID;
|
|
|
|
AMDGPUReplaceLDSUseWithPointer() : ModulePass(ID) {
|
|
initializeAMDGPUReplaceLDSUseWithPointerPass(
|
|
*PassRegistry::getPassRegistry());
|
|
}
|
|
|
|
bool runOnModule(Module &M) override;
|
|
|
|
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
|
AU.addRequired<TargetPassConfig>();
|
|
}
|
|
};
|
|
|
|
} // namespace
|
|
|
|
char AMDGPUReplaceLDSUseWithPointer::ID = 0;
|
|
char &llvm::AMDGPUReplaceLDSUseWithPointerID =
|
|
AMDGPUReplaceLDSUseWithPointer::ID;
|
|
|
|
INITIALIZE_PASS_BEGIN(
|
|
AMDGPUReplaceLDSUseWithPointer, DEBUG_TYPE,
|
|
"Replace within non-kernel function use of LDS with pointer",
|
|
false /*only look at the cfg*/, false /*analysis pass*/)
|
|
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
|
|
INITIALIZE_PASS_END(
|
|
AMDGPUReplaceLDSUseWithPointer, DEBUG_TYPE,
|
|
"Replace within non-kernel function use of LDS with pointer",
|
|
false /*only look at the cfg*/, false /*analysis pass*/)
|
|
|
|
// Legacy pass manager entry point: runs the replacement over `M` and reports
// whether the module was modified.
bool AMDGPUReplaceLDSUseWithPointer::runOnModule(Module &M) {
  ReplaceLDSUseImpl Impl(M);
  const bool Changed = Impl.replaceLDSUse();
  return Changed;
}
|
|
|
|
// Factory for the legacy pass. Returns a newly allocated pass instance.
ModulePass *llvm::createAMDGPUReplaceLDSUseWithPointerPass() {
  ModulePass *Pass = new AMDGPUReplaceLDSUseWithPointer();
  return Pass;
}
|
|
|
|
// New pass manager entry point: runs the replacement over `M`.
//
// Returns PreservedAnalyses::all() when the module is left untouched. When a
// replacement happened the IR was modified (the implementation adds pointer
// globals, splits kernel entry blocks and rewrites uses of LDS globals), so
// no analysis is reported as preserved in that case. `AM` is not used.
PreservedAnalyses
AMDGPUReplaceLDSUseWithPointerPass::run(Module &M, ModuleAnalysisManager &AM) {
  ReplaceLDSUseImpl LDSUseReplacer{M};
  if (!LDSUseReplacer.replaceLDSUse())
    return PreservedAnalyses::all();

  // The module changed: conservatively invalidate all cached analyses.
  return PreservedAnalyses::none();
}
|