AMDGPU: Optimize out implicit kernarg argument allocation if unused

We already annotate functions in which llvm.amdgcn.implicitarg.ptr is known
to be unused (the "amdgpu-no-implicitarg-ptr" attribute). Start using that
annotation to avoid allocating the implicit kernel arguments when they are
not needed.
This commit is contained in:
Matt Arsenault 2021-10-25 15:30:55 -04:00
parent ee691970a9
commit ae0ba7dedd
16 changed files with 70 additions and 66 deletions

View File

@ -280,11 +280,12 @@ void MetadataStreamerV2::emitKernelAttrs(const Function &Func) {
}
}
void MetadataStreamerV2::emitKernelArgs(const Function &Func) {
void MetadataStreamerV2::emitKernelArgs(const Function &Func,
const GCNSubtarget &ST) {
for (auto &Arg : Func.args())
emitKernelArg(Arg);
emitHiddenKernelArgs(Func);
emitHiddenKernelArgs(Func, ST);
}
void MetadataStreamerV2::emitKernelArg(const Argument &Arg) {
@ -381,10 +382,9 @@ void MetadataStreamerV2::emitKernelArg(const DataLayout &DL, Type *Ty,
}
}
void MetadataStreamerV2::emitHiddenKernelArgs(const Function &Func) {
int HiddenArgNumBytes =
getIntegerAttribute(Func, "amdgpu-implicitarg-num-bytes", 0);
void MetadataStreamerV2::emitHiddenKernelArgs(const Function &Func,
const GCNSubtarget &ST) {
unsigned HiddenArgNumBytes = ST.getImplicitArgNumBytes(Func);
if (!HiddenArgNumBytes)
return;
@ -465,11 +465,12 @@ void MetadataStreamerV2::emitKernel(const MachineFunction &MF,
HSAMetadata.mKernels.push_back(Kernel::Metadata());
auto &Kernel = HSAMetadata.mKernels.back();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
Kernel.mName = std::string(Func.getName());
Kernel.mSymbolName = (Twine(Func.getName()) + Twine("@kd")).str();
emitKernelLanguage(Func);
emitKernelAttrs(Func);
emitKernelArgs(Func);
emitKernelArgs(Func, ST);
HSAMetadata.mKernels.back().mCodeProps = CodeProps;
HSAMetadata.mKernels.back().mDebugProps = DebugProps;
}
@ -673,13 +674,14 @@ void MetadataStreamerV3::emitKernelAttrs(const Function &Func,
}
void MetadataStreamerV3::emitKernelArgs(const Function &Func,
const GCNSubtarget &ST,
msgpack::MapDocNode Kern) {
unsigned Offset = 0;
auto Args = HSAMetadataDoc->getArrayNode();
for (auto &Arg : Func.args())
emitKernelArg(Arg, Offset, Args);
emitHiddenKernelArgs(Func, Offset, Args);
emitHiddenKernelArgs(Func, ST, Offset, Args);
Kern[".args"] = Args;
}
@ -791,11 +793,10 @@ void MetadataStreamerV3::emitKernelArg(
}
void MetadataStreamerV3::emitHiddenKernelArgs(const Function &Func,
const GCNSubtarget &ST,
unsigned &Offset,
msgpack::ArrayDocNode Args) {
int HiddenArgNumBytes =
getIntegerAttribute(Func, "amdgpu-implicitarg-num-bytes", 0);
unsigned HiddenArgNumBytes = ST.getImplicitArgNumBytes(Func);
if (!HiddenArgNumBytes)
return;
@ -912,6 +913,7 @@ void MetadataStreamerV3::emitKernel(const MachineFunction &MF,
const SIProgramInfo &ProgramInfo) {
auto &Func = MF.getFunction();
auto Kern = getHSAKernelProps(MF, ProgramInfo);
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
assert(Func.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
Func.getCallingConv() == CallingConv::SPIR_KERNEL);
@ -925,7 +927,7 @@ void MetadataStreamerV3::emitKernel(const MachineFunction &MF,
(Twine(Func.getName()) + Twine(".kd")).str(), /*Copy=*/true);
emitKernelLanguage(Func, Kern);
emitKernelAttrs(Func, Kern);
emitKernelArgs(Func, Kern);
emitKernelArgs(Func, ST, Kern);
}
Kernels.push_back(Kern);

View File

@ -30,6 +30,7 @@ class MDNode;
class Module;
struct SIProgramInfo;
class Type;
class GCNSubtarget;
namespace AMDGPU {
@ -86,7 +87,8 @@ protected:
void emitKernelAttrs(const Function &Func, msgpack::MapDocNode Kern);
void emitKernelArgs(const Function &Func, msgpack::MapDocNode Kern);
void emitKernelArgs(const Function &Func, const GCNSubtarget &ST,
msgpack::MapDocNode Kern);
void emitKernelArg(const Argument &Arg, unsigned &Offset,
msgpack::ArrayDocNode Args);
@ -98,8 +100,8 @@ protected:
StringRef BaseTypeName = "", StringRef AccQual = "",
StringRef TypeQual = "");
void emitHiddenKernelArgs(const Function &Func, unsigned &Offset,
msgpack::ArrayDocNode Args);
void emitHiddenKernelArgs(const Function &Func, const GCNSubtarget &ST,
unsigned &Offset, msgpack::ArrayDocNode Args);
msgpack::DocNode &getRootMetadata(StringRef Key) {
return HSAMetadataDoc->getRoot().getMap(/*Convert=*/true)[Key];
@ -173,7 +175,7 @@ private:
void emitKernelAttrs(const Function &Func);
void emitKernelArgs(const Function &Func);
void emitKernelArgs(const Function &Func, const GCNSubtarget &ST);
void emitKernelArg(const Argument &Arg);
@ -183,7 +185,7 @@ private:
StringRef BaseTypeName = "", StringRef AccQual = "",
StringRef TypeQual = "");
void emitHiddenKernelArgs(const Function &Func);
void emitHiddenKernelArgs(const Function &Func, const GCNSubtarget &ST);
const Metadata &getHSAMetadata() const {
return HSAMetadata;

View File

@ -648,6 +648,11 @@ bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
}
unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
// We don't allocate the segment if we know the implicit arguments weren't
// used, even if the ABI implies we need them.
if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
return 0;
if (isMesaKernel(F))
return 16;
return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);

View File

@ -2730,7 +2730,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(double addrspace(1)* %out, i32
; GPRIDX-NEXT: workitem_private_segment_byte_size = 0
; GPRIDX-NEXT: workgroup_group_segment_byte_size = 0
; GPRIDX-NEXT: gds_segment_byte_size = 0
; GPRIDX-NEXT: kernarg_segment_byte_size = 28
; GPRIDX-NEXT: kernarg_segment_byte_size = 12
; GPRIDX-NEXT: workgroup_fbarrier_count = 0
; GPRIDX-NEXT: wavefront_sgpr_count = 9
; GPRIDX-NEXT: workitem_vgpr_count = 3
@ -2821,7 +2821,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(double addrspace(1)* %out, i32
; MOVREL-NEXT: workitem_private_segment_byte_size = 0
; MOVREL-NEXT: workgroup_group_segment_byte_size = 0
; MOVREL-NEXT: gds_segment_byte_size = 0
; MOVREL-NEXT: kernarg_segment_byte_size = 28
; MOVREL-NEXT: kernarg_segment_byte_size = 12
; MOVREL-NEXT: workgroup_fbarrier_count = 0
; MOVREL-NEXT: wavefront_sgpr_count = 9
; MOVREL-NEXT: workitem_vgpr_count = 4
@ -2913,7 +2913,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(double addrspace(1)* %out, i32
; GFX10-NEXT: workitem_private_segment_byte_size = 0
; GFX10-NEXT: workgroup_group_segment_byte_size = 0
; GFX10-NEXT: gds_segment_byte_size = 0
; GFX10-NEXT: kernarg_segment_byte_size = 28
; GFX10-NEXT: kernarg_segment_byte_size = 12
; GFX10-NEXT: workgroup_fbarrier_count = 0
; GFX10-NEXT: wavefront_sgpr_count = 9
; GFX10-NEXT: workitem_vgpr_count = 3
@ -3559,7 +3559,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(float addrspace(1)* %out, i32
; GPRIDX-NEXT: workitem_private_segment_byte_size = 0
; GPRIDX-NEXT: workgroup_group_segment_byte_size = 0
; GPRIDX-NEXT: gds_segment_byte_size = 0
; GPRIDX-NEXT: kernarg_segment_byte_size = 28
; GPRIDX-NEXT: kernarg_segment_byte_size = 12
; GPRIDX-NEXT: workgroup_fbarrier_count = 0
; GPRIDX-NEXT: wavefront_sgpr_count = 6
; GPRIDX-NEXT: workitem_vgpr_count = 2
@ -3643,7 +3643,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(float addrspace(1)* %out, i32
; MOVREL-NEXT: workitem_private_segment_byte_size = 0
; MOVREL-NEXT: workgroup_group_segment_byte_size = 0
; MOVREL-NEXT: gds_segment_byte_size = 0
; MOVREL-NEXT: kernarg_segment_byte_size = 28
; MOVREL-NEXT: kernarg_segment_byte_size = 12
; MOVREL-NEXT: workgroup_fbarrier_count = 0
; MOVREL-NEXT: wavefront_sgpr_count = 6
; MOVREL-NEXT: workitem_vgpr_count = 3
@ -3728,7 +3728,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(float addrspace(1)* %out, i32
; GFX10-NEXT: workitem_private_segment_byte_size = 0
; GFX10-NEXT: workgroup_group_segment_byte_size = 0
; GFX10-NEXT: gds_segment_byte_size = 0
; GFX10-NEXT: kernarg_segment_byte_size = 28
; GFX10-NEXT: kernarg_segment_byte_size = 12
; GFX10-NEXT: workgroup_fbarrier_count = 0
; GFX10-NEXT: wavefront_sgpr_count = 6
; GFX10-NEXT: workitem_vgpr_count = 2
@ -3819,7 +3819,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(double addrspace(1)* %out, i3
; GPRIDX-NEXT: workitem_private_segment_byte_size = 0
; GPRIDX-NEXT: workgroup_group_segment_byte_size = 0
; GPRIDX-NEXT: gds_segment_byte_size = 0
; GPRIDX-NEXT: kernarg_segment_byte_size = 28
; GPRIDX-NEXT: kernarg_segment_byte_size = 12
; GPRIDX-NEXT: workgroup_fbarrier_count = 0
; GPRIDX-NEXT: wavefront_sgpr_count = 7
; GPRIDX-NEXT: workitem_vgpr_count = 3
@ -3906,7 +3906,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(double addrspace(1)* %out, i3
; MOVREL-NEXT: workitem_private_segment_byte_size = 0
; MOVREL-NEXT: workgroup_group_segment_byte_size = 0
; MOVREL-NEXT: gds_segment_byte_size = 0
; MOVREL-NEXT: kernarg_segment_byte_size = 28
; MOVREL-NEXT: kernarg_segment_byte_size = 12
; MOVREL-NEXT: workgroup_fbarrier_count = 0
; MOVREL-NEXT: wavefront_sgpr_count = 7
; MOVREL-NEXT: workitem_vgpr_count = 4
@ -3994,7 +3994,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(double addrspace(1)* %out, i3
; GFX10-NEXT: workitem_private_segment_byte_size = 0
; GFX10-NEXT: workgroup_group_segment_byte_size = 0
; GFX10-NEXT: gds_segment_byte_size = 0
; GFX10-NEXT: kernarg_segment_byte_size = 28
; GFX10-NEXT: kernarg_segment_byte_size = 12
; GFX10-NEXT: workgroup_fbarrier_count = 0
; GFX10-NEXT: wavefront_sgpr_count = 7
; GFX10-NEXT: workitem_vgpr_count = 3

View File

@ -74,14 +74,9 @@ define amdgpu_kernel void @opencl_test_implicit_alignment(i32 addrspace(1)* %out
ret void
}
; Mesa implies 16-bytes are always allocated, hsa requires the
; attribute for the additional space.
; ALL-LABEL: {{^}}test_no_kernargs:
; HSA: enable_sgpr_kernarg_segment_ptr = 0
; HSA: kernarg_segment_byte_size = 0
; OS-MESA3D: enable_sgpr_kernarg_segment_ptr = 1
; OS-MESA3D: kernarg_segment_byte_size = 16
; CO-V2: enable_sgpr_kernarg_segment_ptr = 0
; CO-V2: kernarg_segment_byte_size = 0
; CO-V2: kernarg_segment_alignment = 4
; HSA: s_mov_b64 [[OFFSET_NULL:s\[[0-9]+:[0-9]+\]]], 40{{$}}
@ -97,7 +92,7 @@ define amdgpu_kernel void @test_no_kernargs() #1 {
; ALL-LABEL: {{^}}opencl_test_implicit_alignment_no_explicit_kernargs:
; HSA: kernarg_segment_byte_size = 48
; OS-MESA3d: kernarg_segment_byte_size = 16
; OS-MESA3D: kernarg_segment_byte_size = 16
; CO-V2: kernarg_segment_alignment = 4
define amdgpu_kernel void @opencl_test_implicit_alignment_no_explicit_kernargs() #2 {
%implicitarg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()

View File

@ -76,8 +76,8 @@ define amdgpu_kernel void @test_enqueue_kernel_caller(i8 %a) #1
; CHECK-NEXT: - 0
; CHECK-NOT: amdhsa.printf:
attributes #0 = { "amdgpu-implicitarg-num-bytes"="48" }
attributes #1 = { "calls-enqueue-kernel" "amdgpu-implicitarg-num-bytes"="48" }
attributes #0 = { optnone noinline "amdgpu-implicitarg-num-bytes"="48" }
attributes #1 = { optnone noinline "calls-enqueue-kernel" "amdgpu-implicitarg-num-bytes"="48" }
!1 = !{i32 0}
!2 = !{!"none"}

View File

@ -72,8 +72,8 @@ define amdgpu_kernel void @test_enqueue_kernel_caller(i8 %a) #1
ret void
}
attributes #0 = { "amdgpu-implicitarg-num-bytes"="48" }
attributes #1 = { "calls-enqueue-kernel" "amdgpu-implicitarg-num-bytes"="48" }
attributes #0 = { optnone noinline "amdgpu-implicitarg-num-bytes"="48" }
attributes #1 = { optnone noinline "calls-enqueue-kernel" "amdgpu-implicitarg-num-bytes"="48" }
!1 = !{i32 0}
!2 = !{!"none"}

View File

@ -1894,9 +1894,9 @@ define amdgpu_kernel void @unknown_addrspace_kernarg(i32 addrspace(12345)* %ptr)
; CHECK-NEXT: - 1
; CHECK-NEXT: - 0
attributes #0 = { "amdgpu-implicitarg-num-bytes"="56" }
attributes #1 = { "amdgpu-implicitarg-num-bytes"="56" "runtime-handle"="__test_block_invoke_kernel_runtime_handle" }
attributes #2 = { "amdgpu-implicitarg-num-bytes"="56" "calls-enqueue-kernel" }
attributes #0 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" }
attributes #1 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" "runtime-handle"="__test_block_invoke_kernel_runtime_handle" }
attributes #2 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" "calls-enqueue-kernel" }
!llvm.printf.fmts = !{!100, !101}

View File

@ -1866,9 +1866,9 @@ define amdgpu_kernel void @unknown_addrspace_kernarg(i32 addrspace(12345)* %ptr)
ret void
}
attributes #0 = { "amdgpu-implicitarg-num-bytes"="56" }
attributes #1 = { "amdgpu-implicitarg-num-bytes"="56" "runtime-handle"="__test_block_invoke_kernel_runtime_handle" }
attributes #2 = { "amdgpu-implicitarg-num-bytes"="56" "calls-enqueue-kernel" }
attributes #0 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" }
attributes #1 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" "runtime-handle"="__test_block_invoke_kernel_runtime_handle" }
attributes #2 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" "calls-enqueue-kernel" }
!llvm.printf.fmts = !{!100, !101}

View File

@ -296,9 +296,11 @@ entry:
; CHECK-NEXT: - 1
; CHECK-NEXT: - 0
attributes #0 = { "amdgpu-implicitarg-num-bytes"="8" }
attributes #1 = { "amdgpu-implicitarg-num-bytes"="16" }
attributes #2 = { "amdgpu-implicitarg-num-bytes"="24" }
attributes #3 = { "amdgpu-implicitarg-num-bytes"="32" }
attributes #4 = { "amdgpu-implicitarg-num-bytes"="48" }
attributes #5 = { "amdgpu-implicitarg-num-bytes"="56" }
; These kernels have no use of llvm.amdgcn.implicitarg.ptr, so mark them
; optnone to avoid optimizing out the implicit argument allocation.
attributes #0 = { optnone noinline "amdgpu-implicitarg-num-bytes"="8" }
attributes #1 = { optnone noinline "amdgpu-implicitarg-num-bytes"="16" }
attributes #2 = { optnone noinline "amdgpu-implicitarg-num-bytes"="24" }
attributes #3 = { optnone noinline "amdgpu-implicitarg-num-bytes"="32" }
attributes #4 = { optnone noinline "amdgpu-implicitarg-num-bytes"="48" }
attributes #5 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" }

View File

@ -300,9 +300,11 @@ entry:
ret void
}
attributes #0 = { "amdgpu-implicitarg-num-bytes"="8" }
attributes #1 = { "amdgpu-implicitarg-num-bytes"="16" }
attributes #2 = { "amdgpu-implicitarg-num-bytes"="24" }
attributes #3 = { "amdgpu-implicitarg-num-bytes"="32" }
attributes #4 = { "amdgpu-implicitarg-num-bytes"="48" }
attributes #5 = { "amdgpu-implicitarg-num-bytes"="56" }
; These kernels have no use of llvm.amdgcn.implicitarg.ptr, so mark them
; optnone to avoid optimizing out the implicit argument allocation.
attributes #0 = { optnone noinline "amdgpu-implicitarg-num-bytes"="8" }
attributes #1 = { optnone noinline "amdgpu-implicitarg-num-bytes"="16" }
attributes #2 = { optnone noinline "amdgpu-implicitarg-num-bytes"="24" }
attributes #3 = { optnone noinline "amdgpu-implicitarg-num-bytes"="32" }
attributes #4 = { optnone noinline "amdgpu-implicitarg-num-bytes"="48" }
attributes #5 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" }

View File

@ -38,7 +38,7 @@ define amdgpu_kernel void @test_kernel(i8 %a) #0
; CHECK-NEXT: - 1
; CHECK-NEXT: - 0
attributes #0 = { "amdgpu-implicitarg-num-bytes"="48" }
attributes #0 = { optnone noinline "amdgpu-implicitarg-num-bytes"="48" }
!1 = !{i32 0}
!2 = !{!"none"}

View File

@ -35,7 +35,7 @@ define amdgpu_kernel void @test_kernel(i8 %a) #0
ret void
}
attributes #0 = { "amdgpu-implicitarg-num-bytes"="48" }
attributes #0 = { optnone noinline "amdgpu-implicitarg-num-bytes"="48" }
!1 = !{i32 0}
!2 = !{!"none"}

View File

@ -39,7 +39,7 @@ define amdgpu_kernel void @test_kernel(i8 %a) #0
; CHECK-NEXT: - 1
; CHECK-NEXT: - 0
attributes #0 = { "amdgpu-implicitarg-num-bytes"="48" }
attributes #0 = { optnone noinline "amdgpu-implicitarg-num-bytes"="48" }
!1 = !{i32 0}
!2 = !{!"none"}

View File

@ -40,7 +40,7 @@ define amdgpu_kernel void @test_kernel(i8 %a) #0
ret void
}
attributes #0 = { "amdgpu-implicitarg-num-bytes"="48" }
attributes #0 = { optnone noinline "amdgpu-implicitarg-num-bytes"="48" }
!1 = !{i32 0}
!2 = !{!"none"}

View File

@ -75,14 +75,10 @@ define amdgpu_kernel void @opencl_test_implicit_alignment(i32 addrspace(1)* %out
ret void
}
; Mesa implies 16-bytes are always allocated, hsa requires the
; attribute for the additional space.
; ALL-LABEL: {{^}}test_no_kernargs:
; HSA: enable_sgpr_kernarg_segment_ptr = 0
; HSA: kernarg_segment_byte_size = 0
; CO-V2: enable_sgpr_kernarg_segment_ptr = 0
; CO-V2: kernarg_segment_byte_size = 0
; OS-MESA3D: enable_sgpr_kernarg_segment_ptr = 1
; OS-MESA3D: kernarg_segment_byte_size = 16
; CO-V2: kernarg_segment_alignment = 4
; HSA: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0{{$}}