[AMDGPU] Handle functions in llvm's global ctors and dtors list

This patch introduces a new code object metadata field, ".kind"
which is used to add support for init and fini kernels.

HSAStreamer will use function attributes, "device-init" and
"device-fini" to distinguish between init and fini kernels from
the regular kernels and will emit metadata with ".kind" set to
"init" and "fini" respectively.

To reduce the number of init and fini kernels, the ctors and
dtors present in the llvm's global.ctors and global.dtors lists
are called from a single init and fini kernel respectively.

Reviewed by: yaxunl

Differential Revision: https://reviews.llvm.org/D105682
This commit is contained in:
Reshabh Sharma 2021-08-04 19:47:07 +05:30
parent 35c0848b57
commit d42e70b3d3
11 changed files with 230 additions and 0 deletions

View File

@ -3142,6 +3142,37 @@ same *vendor-name*.
a register allocator
created spill
location.
".kind" string The kind of the kernel
with the following
values:
"normal"
Regular kernels.
"init"
These kernels must be
invoked after loading
the containing code
object and must
complete before any
normal and fini
kernels in the same
code object are
invoked.
"fini"
These kernels must be
invoked before
unloading the
containing code object
and after all init and
normal kernels in the
same code object have
been invoked and
completed.
If omitted, "normal" is
assumed.
=================================== ============== ========= ================================
..

View File

@ -114,6 +114,10 @@ ModulePass *createAMDGPUFixFunctionBitcastsPass();
void initializeAMDGPUFixFunctionBitcastsPass(PassRegistry &);
extern char &AMDGPUFixFunctionBitcastsID;
ModulePass *createAMDGPUCtorDtorLoweringPass();
void initializeAMDGPUCtorDtorLoweringPass(PassRegistry &);
extern char &AMDGPUCtorDtorLoweringID;
FunctionPass *createAMDGPULowerKernelArgumentsPass();
void initializeAMDGPULowerKernelArgumentsPass(PassRegistry &);
extern char &AMDGPULowerKernelArgumentsID;

View File

@ -0,0 +1,91 @@
//===-- AMDGPUCtorDtorLowering.cpp - Fix function bitcasts -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This pass creates a unified init and fini kernel with the required metadata
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
using namespace llvm;
#define DEBUG_TYPE "amdgpu-lower-ctor-dtor"
namespace {
class AMDGPUCtorDtorLowering final : public ModulePass {
bool runOnModule(Module &M) override;
public:
Function *createInitOrFiniKernelFunction(Module &M, bool IsCtor) {
StringRef InitOrFiniKernelName = "amdgcn.device.init";
if (!IsCtor)
InitOrFiniKernelName = "amdgcn.device.fini";
Function *InitOrFiniKernel = Function::createWithDefaultAttr(
FunctionType::get(Type::getVoidTy(M.getContext()), false),
GlobalValue::InternalLinkage, 0, InitOrFiniKernelName, &M);
BasicBlock *InitOrFiniKernelBB =
BasicBlock::Create(M.getContext(), "", InitOrFiniKernel);
ReturnInst::Create(M.getContext(), InitOrFiniKernelBB);
InitOrFiniKernel->setCallingConv(CallingConv::AMDGPU_KERNEL);
if (IsCtor)
InitOrFiniKernel->addFnAttr("device-init");
else
InitOrFiniKernel->addFnAttr("device-fini");
return InitOrFiniKernel;
}
void createInitOrFiniKernel(Module &M, GlobalVariable *GV, bool IsCtor) {
if (!GV)
return;
ConstantArray *GA = cast<ConstantArray>(GV->getInitializer());
if (GA->getNumOperands() == 0)
return;
Function *InitOrFiniKernel = createInitOrFiniKernelFunction(M, IsCtor);
IRBuilder<> IRB(InitOrFiniKernel->getEntryBlock().getTerminator());
for (Value *V : GA->operands()) {
auto *CS = cast<ConstantStruct>(V);
if (Function *F = dyn_cast<Function>(CS->getOperand(1))) {
FunctionCallee Ctor =
M.getOrInsertFunction(F->getName(), IRB.getVoidTy());
IRB.CreateCall(Ctor);
}
}
appendToUsed(M, {InitOrFiniKernel});
}
static char ID;
AMDGPUCtorDtorLowering() : ModulePass(ID) {}
};
} // End anonymous namespace
char AMDGPUCtorDtorLowering::ID = 0;
char &llvm::AMDGPUCtorDtorLoweringID = AMDGPUCtorDtorLowering::ID;
INITIALIZE_PASS(AMDGPUCtorDtorLowering, DEBUG_TYPE,
"Lower ctors and dtors for AMDGPU", false, false)
ModulePass *llvm::createAMDGPUCtorDtorLoweringPass() {
return new AMDGPUCtorDtorLowering();
}
bool AMDGPUCtorDtorLowering::runOnModule(Module &M) {
createInitOrFiniKernel(M, M.getGlobalVariable("llvm.global_ctors"),
/*IsCtor =*/true);
createInitOrFiniKernel(M, M.getGlobalVariable("llvm.global_dtors"),
/*IsCtor =*/false);
return false;
}

View File

@ -665,6 +665,10 @@ void MetadataStreamerV3::emitKernelAttrs(const Function &Func,
Func.getFnAttribute("runtime-handle").getValueAsString().str(),
/*Copy=*/true);
}
if(Func.hasFnAttribute("device-init"))
Kern[".kind"] = Kern.getDocument()->getNode("init");
else if(Func.hasFnAttribute("device-fini"))
Kern[".kind"] = Kern.getDocument()->getNode("fini");
}
void MetadataStreamerV3::emitKernelArgs(const Function &Func,

View File

@ -349,6 +349,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeSIOptimizeVGPRLiveRangePass(*PR);
initializeSILoadStoreOptimizerPass(*PR);
initializeAMDGPUFixFunctionBitcastsPass(*PR);
initializeAMDGPUCtorDtorLoweringPass(*PR);
initializeAMDGPUAlwaysInlinePass(*PR);
initializeAMDGPUAttributorPass(*PR);
initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
@ -1014,6 +1015,7 @@ void AMDGPUPassConfig::addIRPasses() {
disablePass(&PatchableFunctionID);
addPass(createAMDGPUPrintfRuntimeBinding());
addPass(createAMDGPUCtorDtorLoweringPass());
// This must occur before inlining, as the inliner will not look through
// bitcast calls.

View File

@ -53,6 +53,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUCodeGenPrepare.cpp
AMDGPUExportClustering.cpp
AMDGPUFixFunctionBitcasts.cpp
AMDGPUCtorDtorLowering.cpp
AMDGPUFrameLowering.cpp
AMDGPUHSAMetadataStreamer.cpp
AMDGPUInstCombineIntrinsic.cpp

View File

@ -0,0 +1,39 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=3 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 --amdhsa-code-object-version=3 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=3 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=3 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 --amdhsa-code-object-version=3 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=3 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
@llvm.global_ctors = appending addrspace(1) global [2 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 1, void ()* @foo, i8* null }, { i32, void ()*, i8* } { i32 1, void ()* @foo.5, i8* null }]
define internal void @foo() {
ret void
}
define internal void @foo.5() {
ret void
}
; CHECK: ---
; CHECK: .kind: init
; CHECK: .name: amdgcn.device.init
@llvm.global_dtors = appending addrspace(1) global [2 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 1, void ()* @bar, i8* null }, { i32, void ()*, i8* } { i32 1, void ()* @bar.5, i8* null }]
define internal void @bar() {
ret void
}
define internal void @bar.5() {
ret void
}
; CHECK: .kind: fini
; CHECK: .name: amdgcn.device.fini
; PARSER: AMDGPU HSA Metadata Parser Test: PASS

View File

@ -31,6 +31,7 @@
; GCN-O0-NEXT: AMDGPU Printf lowering
; GCN-O0-NEXT: FunctionPass Manager
; GCN-O0-NEXT: Dominator Tree Construction
; GCN-O0-NEXT: Lower ctors and dtors for AMDGPU
; GCN-O0-NEXT: Fix function bitcasts for AMDGPU
; GCN-O0-NEXT: FunctionPass Manager
; GCN-O0-NEXT: Early propagate attributes from kernels to functions
@ -165,6 +166,7 @@
; GCN-O1-NEXT: AMDGPU Printf lowering
; GCN-O1-NEXT: FunctionPass Manager
; GCN-O1-NEXT: Dominator Tree Construction
; GCN-O1-NEXT: Lower ctors and dtors for AMDGPU
; GCN-O1-NEXT: Fix function bitcasts for AMDGPU
; GCN-O1-NEXT: FunctionPass Manager
; GCN-O1-NEXT: Early propagate attributes from kernels to functions
@ -415,6 +417,7 @@
; GCN-O1-OPTS-NEXT: AMDGPU Printf lowering
; GCN-O1-OPTS-NEXT: FunctionPass Manager
; GCN-O1-OPTS-NEXT: Dominator Tree Construction
; GCN-O1-OPTS-NEXT: Lower ctors and dtors for AMDGPU
; GCN-O1-OPTS-NEXT: Fix function bitcasts for AMDGPU
; GCN-O1-OPTS-NEXT: FunctionPass Manager
; GCN-O1-OPTS-NEXT: Early propagate attributes from kernels to functions
@ -698,6 +701,7 @@
; GCN-O2-NEXT: AMDGPU Printf lowering
; GCN-O2-NEXT: FunctionPass Manager
; GCN-O2-NEXT: Dominator Tree Construction
; GCN-O2-NEXT: Lower ctors and dtors for AMDGPU
; GCN-O2-NEXT: Fix function bitcasts for AMDGPU
; GCN-O2-NEXT: FunctionPass Manager
; GCN-O2-NEXT: Early propagate attributes from kernels to functions
@ -983,6 +987,7 @@
; GCN-O3-NEXT: AMDGPU Printf lowering
; GCN-O3-NEXT: FunctionPass Manager
; GCN-O3-NEXT: Dominator Tree Construction
; GCN-O3-NEXT: Lower ctors and dtors for AMDGPU
; GCN-O3-NEXT: Fix function bitcasts for AMDGPU
; GCN-O3-NEXT: FunctionPass Manager
; GCN-O3-NEXT: Early propagate attributes from kernels to functions

View File

@ -0,0 +1,21 @@
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-ctor-dtor < %s | FileCheck %s
@llvm.global_ctors = appending addrspace(1) global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 1, void ()* @foo, i8* null }]
@llvm.global_dtors = appending addrspace(1) global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 1, void ()* @bar, i8* null }]
; CHECK-LABEL: amdgpu_kernel void @amdgcn.device.init() #0
; CHECK-NEXT: call void @foo
; CHECK-LABEL: amdgpu_kernel void @amdgcn.device.fini() #1
; CHECK-NEXT: call void @bar
define internal void @foo() {
ret void
}
define internal void @bar() {
ret void
}
; CHECK: attributes #0 = { "device-init" }
; CHECK: attributes #1 = { "device-fini" }

View File

@ -0,0 +1,31 @@
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-ctor-dtor < %s | FileCheck %s
@llvm.global_ctors = appending addrspace(1) global [2 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 1, void ()* @foo, i8* null }, { i32, void ()*, i8* } { i32 1, void ()* @foo.5, i8* null }]
@llvm.global_dtors = appending addrspace(1) global [2 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 1, void ()* @bar, i8* null }, { i32, void ()*, i8* } { i32 1, void ()* @bar.5, i8* null }]
; CHECK-LABEL: amdgpu_kernel void @amdgcn.device.init() #0
; CHECK-NEXT: call void @foo
; CHECK-NEXT: call void @foo.5
; CHECK-LABEL: amdgpu_kernel void @amdgcn.device.fini() #1
; CHECK-NEXT: call void @bar
; CHECK-NEXT: call void @bar.5
define internal void @foo() {
ret void
}
define internal void @bar() {
ret void
}
define internal void @foo.5() {
ret void
}
define internal void @bar.5() {
ret void
}
; CHECK: attributes #0 = { "device-init" }
; CHECK: attributes #1 = { "device-fini" }

View File

@ -133,6 +133,7 @@ static_library("LLVMAMDGPUCodeGen") {
"AMDGPUCodeGenPrepare.cpp",
"AMDGPUExportClustering.cpp",
"AMDGPUFixFunctionBitcasts.cpp",
"AMDGPUCtorDtorLowering.cpp",
"AMDGPUFrameLowering.cpp",
"AMDGPUGlobalISelUtils.cpp",
"AMDGPUHSAMetadataStreamer.cpp",