diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp index f68c7cf5f799..102a88074dfd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp @@ -37,6 +37,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/Mangler.h" #include "llvm/IR/Module.h" #include "llvm/IR/User.h" #include "llvm/Pass.h" @@ -94,18 +95,25 @@ bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) { bool Changed = false; for (auto &F : M.functions()) { if (F.hasFnAttribute("enqueued-block")) { + if (!F.hasName()) { + SmallString<64> Name; + Mangler::getNameWithPrefix(Name, "__amdgpu_enqueued_kernel", + M.getDataLayout()); + F.setName(Name); + } + auto RuntimeHandle = (F.getName() + ".runtime_handle").str(); + auto *GV = new GlobalVariable( + M, Type::getInt8Ty(C)->getPointerTo(AMDGPUAS::GLOBAL_ADDRESS), + /*IsConstant=*/true, GlobalValue::ExternalLinkage, + /*Initializer=*/nullptr, RuntimeHandle, /*InsertBefore=*/nullptr, + GlobalValue::NotThreadLocal, AMDGPUAS::GLOBAL_ADDRESS, + /*IsExternallyInitialized=*/true); + DEBUG(dbgs() << "runtime handle created: " << *GV << '\n'); + for (auto U : F.users()) { if (!isa(&*U)) continue; auto *BitCast = cast(&*U); - auto RuntimeHandle = (F.getName() + "_runtime_handle").str(); - auto *GV = new GlobalVariable( - M, Type::getInt8Ty(C)->getPointerTo(AMDGPUAS::GLOBAL_ADDRESS), - /*IsConstant=*/true, GlobalValue::ExternalLinkage, - /*Initializer=*/nullptr, RuntimeHandle, /*InsertBefore=*/nullptr, - GlobalValue::NotThreadLocal, AMDGPUAS::GLOBAL_ADDRESS, - /*IsExternallyInitialized=*/true); - DEBUG(dbgs() << "runtime handle created: " << *GV << '\n'); auto *NewPtr = ConstantExpr::getPointerCast(GV, BitCast->getType()); BitCast->replaceAllUsesWith(NewPtr); F.addFnAttr("runtime-handle", RuntimeHandle); diff --git a/llvm/test/CodeGen/AMDGPU/enqueue-kernel.ll b/llvm/test/CodeGen/AMDGPU/enqueue-kernel.ll index a09c31784fe8..67864ed90da4 100644 --- a/llvm/test/CodeGen/AMDGPU/enqueue-kernel.ll +++ b/llvm/test/CodeGen/AMDGPU/enqueue-kernel.ll @@ -1,25 +1,41 @@ ; RUN: opt -data-layout=A5 -amdgpu-lower-enqueued-block -S < %s | FileCheck %s -; CHECK: @__test_block_invoke_kernel_runtime_handle = external addrspace(1) externally_initialized constant i8 addrspace(1)* -; CHECK: @__test_block_invoke_2_kernel_runtime_handle = external addrspace(1) externally_initialized constant i8 addrspace(1)* +; CHECK: @__test_block_invoke_kernel.runtime_handle = external addrspace(1) externally_initialized constant i8 addrspace(1)* +; CHECK: @__test_block_invoke_2_kernel.runtime_handle = external addrspace(1) externally_initialized constant i8 addrspace(1)* +; CHECK: @__amdgpu_enqueued_kernel.runtime_handle = external addrspace(1) externally_initialized constant i8 addrspace(1)* +; CHECK: @__amdgpu_enqueued_kernel.1.runtime_handle = external addrspace(1) externally_initialized constant i8 addrspace(1)* %struct.ndrange_t = type { i32 } %opencl.queue_t = type opaque -; CHECK: define amdgpu_kernel void @non_caller(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr !kernel_arg_addr_space +; CHECK-LABEL: define amdgpu_kernel void @non_caller +; CHECK-NOT: #{{[0-9]+}} define amdgpu_kernel void @non_caller(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 { ret void } -; CHECK: define amdgpu_kernel void @caller_indirect(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr #[[AT_CALLER:[0-9]+]] +; CHECK-LABEL: define amdgpu_kernel void @caller_indirect +; CHECK-SAME: #[[AT_CALLER:[0-9]+]] define amdgpu_kernel void @caller_indirect(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 { call void @caller(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) ret void } -; CHECK: define amdgpu_kernel void @caller(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr #[[AT_CALLER]] +; CHECK-LABEL: define amdgpu_kernel void @caller +; CHECK-SAME: #[[AT_CALLER]] +; CHECK-NOT: @__test_block_invoke_kernel +; CHECK-NOT: @__test_block_invoke_2_kernel +; CHECK-NOT: @__amdgpu_enqueued_kernel +; CHECK-NOT: @__amdgpu_enqueued_kernel.1 +; CHECK-NOT: @0 +; CHECK-NOT: @1 +; CHECK: call i32 @__enqueue_kernel_basic({{.*}}@__test_block_invoke_kernel.runtime_handle +; CHECK: call i32 @__enqueue_kernel_basic({{.*}}@__test_block_invoke_kernel.runtime_handle +; CHECK: call i32 @__enqueue_kernel_basic({{.*}}@__amdgpu_enqueued_kernel.runtime_handle +; CHECK: call i32 @__enqueue_kernel_basic({{.*}}@__amdgpu_enqueued_kernel.1.runtime_handle +; CHECK: call i32 @__enqueue_kernel_basic({{.*}}@__test_block_invoke_2_kernel.runtime_handle define amdgpu_kernel void @caller(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 { entry: @@ -41,6 +57,10 @@ entry: i8* bitcast (void (<{ i32, i32, i8 addrspace(1)*, i8 }>)* @__test_block_invoke_kernel to i8*), i8* nonnull %tmp4) #2 %tmp10 = call i32 @__enqueue_kernel_basic(%opencl.queue_t addrspace(1)* undef, i32 0, %struct.ndrange_t addrspace(5)* byval nonnull %tmp, i8* bitcast (void (<{ i32, i32, i8 addrspace(1)*, i8 }>)* @__test_block_invoke_kernel to i8*), i8* nonnull %tmp4) #2 + %tmp11 = call i32 @__enqueue_kernel_basic(%opencl.queue_t addrspace(1)* undef, i32 0, %struct.ndrange_t addrspace(5)* byval nonnull %tmp, + i8* bitcast (void (<{ i32, i32, i8 addrspace(1)*, i8 }>)* @0 to i8*), i8* nonnull %tmp4) #2 + %tmp12 = call i32 @__enqueue_kernel_basic(%opencl.queue_t addrspace(1)* undef, i32 0, %struct.ndrange_t addrspace(5)* byval nonnull %tmp, + i8* bitcast (void (<{ i32, i32, i8 addrspace(1)*, i8 }>)* @1 to i8*), i8* nonnull %tmp4) #2 %block.size4 = getelementptr inbounds <{ i32, i32, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }> addrspace(5)* %block2, i32 0, i32 0 store i32 41, i32 addrspace(5)* %block.size4, align 8 %block.align5 = getelementptr inbounds <{ i32, i32, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }> addrspace(5)* %block2, i32 0, i32 1 @@ -60,7 +80,8 @@ entry: ret void } -; CHECK: define dso_local amdgpu_kernel void @__test_block_invoke_kernel({{.*}}) #[[AT1:[0-9]+]] +; CHECK-LABEL: define dso_local amdgpu_kernel void @__test_block_invoke_kernel +; CHECK-SAME: #[[AT1:[0-9]+]] define internal amdgpu_kernel void @__test_block_invoke_kernel(<{ i32, i32, i8 addrspace(1)*, i8 }> %arg) #0 !kernel_arg_addr_space !14 !kernel_arg_access_qual !15 !kernel_arg_type !16 !kernel_arg_base_type !16 !kernel_arg_type_qual !17 { entry: @@ -72,7 +93,8 @@ entry: declare i32 @__enqueue_kernel_basic(%opencl.queue_t addrspace(1)*, i32, %struct.ndrange_t addrspace(5)*, i8*, i8*) local_unnamed_addr -; CHECK: define dso_local amdgpu_kernel void @__test_block_invoke_2_kernel({{.*}}) #[[AT2:[0-9]+]] +; CHECK-LABEL: define dso_local amdgpu_kernel void @__test_block_invoke_2_kernel +; CHECK-SAME: #[[AT2:[0-9]+]] define internal amdgpu_kernel void @__test_block_invoke_2_kernel(<{ i32, i32, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }> %arg) #0 !kernel_arg_addr_space !14 !kernel_arg_access_qual !15 !kernel_arg_type !16 !kernel_arg_base_type !16 !kernel_arg_type_qual !17 { @@ -86,9 +108,25 @@ entry: ret void } +; CHECK-LABEL: define dso_local amdgpu_kernel void @__amdgpu_enqueued_kernel +; CHECK-SAME: #[[AT3:[0-9]+]] +define internal amdgpu_kernel void @0(<{ i32, i32, i8 addrspace(1)*, i8 }> %arg) #0 + !kernel_arg_addr_space !14 !kernel_arg_access_qual !15 !kernel_arg_type !16 !kernel_arg_base_type !16 !kernel_arg_type_qual !17 { + ret void +} + +; CHECK-LABEL: define dso_local amdgpu_kernel void @__amdgpu_enqueued_kernel.1 +; CHECK-SAME: #[[AT4:[0-9]+]] +define internal amdgpu_kernel void @1(<{ i32, i32, i8 addrspace(1)*, i8 }> %arg) #0 + !kernel_arg_addr_space !14 !kernel_arg_access_qual !15 !kernel_arg_type !16 !kernel_arg_base_type !16 !kernel_arg_type_qual !17 { + ret void +} + ; CHECK: attributes #[[AT_CALLER]] = { "calls-enqueue-kernel" } -; CHECK: attributes #[[AT1]] = {{.*}}"runtime-handle"="__test_block_invoke_kernel_runtime_handle" -; CHECK: attributes #[[AT2]] = {{.*}}"runtime-handle"="__test_block_invoke_2_kernel_runtime_handle" +; CHECK: attributes #[[AT1]] = {{.*}}"runtime-handle"="__test_block_invoke_kernel.runtime_handle" +; CHECK: attributes #[[AT2]] = {{.*}}"runtime-handle"="__test_block_invoke_2_kernel.runtime_handle" +; CHECK: attributes #[[AT3]] = {{.*}}"runtime-handle"="__amdgpu_enqueued_kernel.runtime_handle" +; CHECK: attributes #[[AT4]] = {{.*}}"runtime-handle"="__amdgpu_enqueued_kernel.1.runtime_handle" attributes #0 = { "enqueued-block" }