forked from OSchip/llvm-project
[AMDGPU] Handle functions in llvm's global ctors and dtors list
This patch introduces a new code object metadata field, ".kind", which is used to add support for init and fini kernels. The HSA metadata streamer uses the function attributes "device-init" and "device-fini" to distinguish init and fini kernels from regular kernels, and emits metadata with ".kind" set to "init" and "fini" respectively. To reduce the number of init and fini kernels, the ctors and dtors present in LLVM's llvm.global_ctors and llvm.global_dtors lists are called from a single init kernel and a single fini kernel respectively. Reviewed by: yaxunl Differential Revision: https://reviews.llvm.org/D105682
This commit is contained in:
parent
35c0848b57
commit
d42e70b3d3
|
@ -3142,6 +3142,37 @@ same *vendor-name*.
|
|||
a register allocator
|
||||
created spill
|
||||
location.
|
||||
".kind" string The kind of the kernel
|
||||
with the following
|
||||
values:
|
||||
|
||||
"normal"
|
||||
Regular kernels.
|
||||
|
||||
"init"
|
||||
These kernels must be
|
||||
invoked after loading
|
||||
the containing code
|
||||
object and must
|
||||
complete before any
|
||||
normal and fini
|
||||
kernels in the same
|
||||
code object are
|
||||
invoked.
|
||||
|
||||
"fini"
|
||||
These kernels must be
|
||||
invoked before
|
||||
unloading the
|
||||
containing code object
|
||||
and after all init and
|
||||
normal kernels in the
|
||||
same code object have
|
||||
been invoked and
|
||||
completed.
|
||||
|
||||
If omitted, "normal" is
|
||||
assumed.
|
||||
=================================== ============== ========= ================================
|
||||
|
||||
..
|
||||
|
|
|
@ -114,6 +114,10 @@ ModulePass *createAMDGPUFixFunctionBitcastsPass();
|
|||
void initializeAMDGPUFixFunctionBitcastsPass(PassRegistry &);
|
||||
extern char &AMDGPUFixFunctionBitcastsID;
|
||||
|
||||
ModulePass *createAMDGPUCtorDtorLoweringPass();
|
||||
void initializeAMDGPUCtorDtorLoweringPass(PassRegistry &);
|
||||
extern char &AMDGPUCtorDtorLoweringID;
|
||||
|
||||
FunctionPass *createAMDGPULowerKernelArgumentsPass();
|
||||
void initializeAMDGPULowerKernelArgumentsPass(PassRegistry &);
|
||||
extern char &AMDGPULowerKernelArgumentsID;
|
||||
|
|
|
@ -0,0 +1,91 @@
|
|||
//===-- AMDGPUCtorDtorLowering.cpp - Handle global ctors and dtors --------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
///
|
||||
/// \file
|
||||
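/// This pass creates a unified init kernel and a unified fini kernel that call
/// the functions listed in llvm.global_ctors and llvm.global_dtors
/// respectively, and tags each kernel with the function attribute
/// ("device-init" / "device-fini") used to emit the required metadata.
///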
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "AMDGPU.h"
|
||||
#include "llvm/IR/Constants.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/GlobalVariable.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/IR/Value.h"
|
||||
#include "llvm/Pass.h"
|
||||
#include "llvm/Transforms/Utils/ModuleUtils.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
#define DEBUG_TYPE "amdgpu-lower-ctor-dtor"
|
||||
|
||||
namespace {
|
||||
class AMDGPUCtorDtorLowering final : public ModulePass {
|
||||
bool runOnModule(Module &M) override;
|
||||
|
||||
public:
|
||||
Function *createInitOrFiniKernelFunction(Module &M, bool IsCtor) {
|
||||
StringRef InitOrFiniKernelName = "amdgcn.device.init";
|
||||
if (!IsCtor)
|
||||
InitOrFiniKernelName = "amdgcn.device.fini";
|
||||
|
||||
Function *InitOrFiniKernel = Function::createWithDefaultAttr(
|
||||
FunctionType::get(Type::getVoidTy(M.getContext()), false),
|
||||
GlobalValue::InternalLinkage, 0, InitOrFiniKernelName, &M);
|
||||
BasicBlock *InitOrFiniKernelBB =
|
||||
BasicBlock::Create(M.getContext(), "", InitOrFiniKernel);
|
||||
ReturnInst::Create(M.getContext(), InitOrFiniKernelBB);
|
||||
|
||||
InitOrFiniKernel->setCallingConv(CallingConv::AMDGPU_KERNEL);
|
||||
if (IsCtor)
|
||||
InitOrFiniKernel->addFnAttr("device-init");
|
||||
else
|
||||
InitOrFiniKernel->addFnAttr("device-fini");
|
||||
return InitOrFiniKernel;
|
||||
}
|
||||
|
||||
void createInitOrFiniKernel(Module &M, GlobalVariable *GV, bool IsCtor) {
|
||||
if (!GV)
|
||||
return;
|
||||
ConstantArray *GA = cast<ConstantArray>(GV->getInitializer());
|
||||
if (GA->getNumOperands() == 0)
|
||||
return;
|
||||
Function *InitOrFiniKernel = createInitOrFiniKernelFunction(M, IsCtor);
|
||||
IRBuilder<> IRB(InitOrFiniKernel->getEntryBlock().getTerminator());
|
||||
for (Value *V : GA->operands()) {
|
||||
auto *CS = cast<ConstantStruct>(V);
|
||||
if (Function *F = dyn_cast<Function>(CS->getOperand(1))) {
|
||||
FunctionCallee Ctor =
|
||||
M.getOrInsertFunction(F->getName(), IRB.getVoidTy());
|
||||
IRB.CreateCall(Ctor);
|
||||
}
|
||||
}
|
||||
appendToUsed(M, {InitOrFiniKernel});
|
||||
}
|
||||
|
||||
static char ID;
|
||||
AMDGPUCtorDtorLowering() : ModulePass(ID) {}
|
||||
};
|
||||
} // End anonymous namespace
|
||||
|
||||
char AMDGPUCtorDtorLowering::ID = 0;
|
||||
char &llvm::AMDGPUCtorDtorLoweringID = AMDGPUCtorDtorLowering::ID;
|
||||
INITIALIZE_PASS(AMDGPUCtorDtorLowering, DEBUG_TYPE,
|
||||
"Lower ctors and dtors for AMDGPU", false, false)
|
||||
|
||||
ModulePass *llvm::createAMDGPUCtorDtorLoweringPass() {
|
||||
return new AMDGPUCtorDtorLowering();
|
||||
}
|
||||
|
||||
bool AMDGPUCtorDtorLowering::runOnModule(Module &M) {
|
||||
createInitOrFiniKernel(M, M.getGlobalVariable("llvm.global_ctors"),
|
||||
/*IsCtor =*/true);
|
||||
createInitOrFiniKernel(M, M.getGlobalVariable("llvm.global_dtors"),
|
||||
/*IsCtor =*/false);
|
||||
return false;
|
||||
}
|
|
@ -665,6 +665,10 @@ void MetadataStreamerV3::emitKernelAttrs(const Function &Func,
|
|||
Func.getFnAttribute("runtime-handle").getValueAsString().str(),
|
||||
/*Copy=*/true);
|
||||
}
|
||||
if(Func.hasFnAttribute("device-init"))
|
||||
Kern[".kind"] = Kern.getDocument()->getNode("init");
|
||||
else if(Func.hasFnAttribute("device-fini"))
|
||||
Kern[".kind"] = Kern.getDocument()->getNode("fini");
|
||||
}
|
||||
|
||||
void MetadataStreamerV3::emitKernelArgs(const Function &Func,
|
||||
|
|
|
@ -349,6 +349,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
|
|||
initializeSIOptimizeVGPRLiveRangePass(*PR);
|
||||
initializeSILoadStoreOptimizerPass(*PR);
|
||||
initializeAMDGPUFixFunctionBitcastsPass(*PR);
|
||||
initializeAMDGPUCtorDtorLoweringPass(*PR);
|
||||
initializeAMDGPUAlwaysInlinePass(*PR);
|
||||
initializeAMDGPUAttributorPass(*PR);
|
||||
initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
|
||||
|
@ -1014,6 +1015,7 @@ void AMDGPUPassConfig::addIRPasses() {
|
|||
disablePass(&PatchableFunctionID);
|
||||
|
||||
addPass(createAMDGPUPrintfRuntimeBinding());
|
||||
addPass(createAMDGPUCtorDtorLoweringPass());
|
||||
|
||||
// This must occur before inlining, as the inliner will not look through
|
||||
// bitcast calls.
|
||||
|
|
|
@ -53,6 +53,7 @@ add_llvm_target(AMDGPUCodeGen
|
|||
AMDGPUCodeGenPrepare.cpp
|
||||
AMDGPUExportClustering.cpp
|
||||
AMDGPUFixFunctionBitcasts.cpp
|
||||
AMDGPUCtorDtorLowering.cpp
|
||||
AMDGPUFrameLowering.cpp
|
||||
AMDGPUHSAMetadataStreamer.cpp
|
||||
AMDGPUInstCombineIntrinsic.cpp
|
||||
|
|
|
@ -0,0 +1,39 @@
|
|||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=3 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck %s
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 --amdhsa-code-object-version=3 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck %s
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=3 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck %s
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=3 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 --amdhsa-code-object-version=3 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=3 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
|
||||
|
||||
@llvm.global_ctors = appending addrspace(1) global [2 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 1, void ()* @foo, i8* null }, { i32, void ()*, i8* } { i32 1, void ()* @foo.5, i8* null }]
|
||||
|
||||
define internal void @foo() {
|
||||
ret void
|
||||
|
||||
}
|
||||
|
||||
define internal void @foo.5() {
|
||||
ret void
|
||||
|
||||
}
|
||||
|
||||
; CHECK: ---
|
||||
; CHECK: .kind: init
|
||||
; CHECK: .name: amdgcn.device.init
|
||||
|
||||
@llvm.global_dtors = appending addrspace(1) global [2 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 1, void ()* @bar, i8* null }, { i32, void ()*, i8* } { i32 1, void ()* @bar.5, i8* null }]
|
||||
|
||||
define internal void @bar() {
|
||||
ret void
|
||||
|
||||
}
|
||||
|
||||
define internal void @bar.5() {
|
||||
ret void
|
||||
|
||||
}
|
||||
|
||||
; CHECK: .kind: fini
|
||||
; CHECK: .name: amdgcn.device.fini
|
||||
|
||||
; PARSER: AMDGPU HSA Metadata Parser Test: PASS
|
|
@ -31,6 +31,7 @@
|
|||
; GCN-O0-NEXT: AMDGPU Printf lowering
|
||||
; GCN-O0-NEXT: FunctionPass Manager
|
||||
; GCN-O0-NEXT: Dominator Tree Construction
|
||||
; GCN-O0-NEXT: Lower ctors and dtors for AMDGPU
|
||||
; GCN-O0-NEXT: Fix function bitcasts for AMDGPU
|
||||
; GCN-O0-NEXT: FunctionPass Manager
|
||||
; GCN-O0-NEXT: Early propagate attributes from kernels to functions
|
||||
|
@ -165,6 +166,7 @@
|
|||
; GCN-O1-NEXT: AMDGPU Printf lowering
|
||||
; GCN-O1-NEXT: FunctionPass Manager
|
||||
; GCN-O1-NEXT: Dominator Tree Construction
|
||||
; GCN-O1-NEXT: Lower ctors and dtors for AMDGPU
|
||||
; GCN-O1-NEXT: Fix function bitcasts for AMDGPU
|
||||
; GCN-O1-NEXT: FunctionPass Manager
|
||||
; GCN-O1-NEXT: Early propagate attributes from kernels to functions
|
||||
|
@ -415,6 +417,7 @@
|
|||
; GCN-O1-OPTS-NEXT: AMDGPU Printf lowering
|
||||
; GCN-O1-OPTS-NEXT: FunctionPass Manager
|
||||
; GCN-O1-OPTS-NEXT: Dominator Tree Construction
|
||||
; GCN-O1-OPTS-NEXT: Lower ctors and dtors for AMDGPU
|
||||
; GCN-O1-OPTS-NEXT: Fix function bitcasts for AMDGPU
|
||||
; GCN-O1-OPTS-NEXT: FunctionPass Manager
|
||||
; GCN-O1-OPTS-NEXT: Early propagate attributes from kernels to functions
|
||||
|
@ -698,6 +701,7 @@
|
|||
; GCN-O2-NEXT: AMDGPU Printf lowering
|
||||
; GCN-O2-NEXT: FunctionPass Manager
|
||||
; GCN-O2-NEXT: Dominator Tree Construction
|
||||
; GCN-O2-NEXT: Lower ctors and dtors for AMDGPU
|
||||
; GCN-O2-NEXT: Fix function bitcasts for AMDGPU
|
||||
; GCN-O2-NEXT: FunctionPass Manager
|
||||
; GCN-O2-NEXT: Early propagate attributes from kernels to functions
|
||||
|
@ -983,6 +987,7 @@
|
|||
; GCN-O3-NEXT: AMDGPU Printf lowering
|
||||
; GCN-O3-NEXT: FunctionPass Manager
|
||||
; GCN-O3-NEXT: Dominator Tree Construction
|
||||
; GCN-O3-NEXT: Lower ctors and dtors for AMDGPU
|
||||
; GCN-O3-NEXT: Fix function bitcasts for AMDGPU
|
||||
; GCN-O3-NEXT: FunctionPass Manager
|
||||
; GCN-O3-NEXT: Early propagate attributes from kernels to functions
|
||||
|
|
|
@ -0,0 +1,21 @@
|
|||
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-ctor-dtor < %s | FileCheck %s
|
||||
|
||||
@llvm.global_ctors = appending addrspace(1) global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 1, void ()* @foo, i8* null }]
|
||||
@llvm.global_dtors = appending addrspace(1) global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 1, void ()* @bar, i8* null }]
|
||||
|
||||
; CHECK-LABEL: amdgpu_kernel void @amdgcn.device.init() #0
|
||||
; CHECK-NEXT: call void @foo
|
||||
|
||||
; CHECK-LABEL: amdgpu_kernel void @amdgcn.device.fini() #1
|
||||
; CHECK-NEXT: call void @bar
|
||||
|
||||
define internal void @foo() {
|
||||
ret void
|
||||
}
|
||||
|
||||
define internal void @bar() {
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: attributes #0 = { "device-init" }
|
||||
; CHECK: attributes #1 = { "device-fini" }
|
|
@ -0,0 +1,31 @@
|
|||
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-ctor-dtor < %s | FileCheck %s
|
||||
|
||||
@llvm.global_ctors = appending addrspace(1) global [2 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 1, void ()* @foo, i8* null }, { i32, void ()*, i8* } { i32 1, void ()* @foo.5, i8* null }]
|
||||
@llvm.global_dtors = appending addrspace(1) global [2 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 1, void ()* @bar, i8* null }, { i32, void ()*, i8* } { i32 1, void ()* @bar.5, i8* null }]
|
||||
|
||||
; CHECK-LABEL: amdgpu_kernel void @amdgcn.device.init() #0
|
||||
; CHECK-NEXT: call void @foo
|
||||
; CHECK-NEXT: call void @foo.5
|
||||
|
||||
; CHECK-LABEL: amdgpu_kernel void @amdgcn.device.fini() #1
|
||||
; CHECK-NEXT: call void @bar
|
||||
; CHECK-NEXT: call void @bar.5
|
||||
|
||||
define internal void @foo() {
|
||||
ret void
|
||||
}
|
||||
|
||||
define internal void @bar() {
|
||||
ret void
|
||||
}
|
||||
|
||||
define internal void @foo.5() {
|
||||
ret void
|
||||
}
|
||||
|
||||
define internal void @bar.5() {
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: attributes #0 = { "device-init" }
|
||||
; CHECK: attributes #1 = { "device-fini" }
|
|
@ -133,6 +133,7 @@ static_library("LLVMAMDGPUCodeGen") {
|
|||
"AMDGPUCodeGenPrepare.cpp",
|
||||
"AMDGPUExportClustering.cpp",
|
||||
"AMDGPUFixFunctionBitcasts.cpp",
|
||||
"AMDGPUCtorDtorLowering.cpp",
|
||||
"AMDGPUFrameLowering.cpp",
|
||||
"AMDGPUGlobalISelUtils.cpp",
|
||||
"AMDGPUHSAMetadataStreamer.cpp",
|
||||
|
|
Loading…
Reference in New Issue