forked from OSchip/llvm-project
AMDGPU: Optimize out implicit kernarg argument allocation if unused
We already annotate whether llvm.amdgcn.implicitarg.ptr is known to be unused. Start using that annotation to avoid allocating the implicit arguments when they are not needed.
This commit is contained in:
parent
ee691970a9
commit
ae0ba7dedd
|
@ -280,11 +280,12 @@ void MetadataStreamerV2::emitKernelAttrs(const Function &Func) {
|
|||
}
|
||||
}
|
||||
|
||||
void MetadataStreamerV2::emitKernelArgs(const Function &Func) {
|
||||
void MetadataStreamerV2::emitKernelArgs(const Function &Func,
|
||||
const GCNSubtarget &ST) {
|
||||
for (auto &Arg : Func.args())
|
||||
emitKernelArg(Arg);
|
||||
|
||||
emitHiddenKernelArgs(Func);
|
||||
emitHiddenKernelArgs(Func, ST);
|
||||
}
|
||||
|
||||
void MetadataStreamerV2::emitKernelArg(const Argument &Arg) {
|
||||
|
@ -381,10 +382,9 @@ void MetadataStreamerV2::emitKernelArg(const DataLayout &DL, Type *Ty,
|
|||
}
|
||||
}
|
||||
|
||||
void MetadataStreamerV2::emitHiddenKernelArgs(const Function &Func) {
|
||||
int HiddenArgNumBytes =
|
||||
getIntegerAttribute(Func, "amdgpu-implicitarg-num-bytes", 0);
|
||||
|
||||
void MetadataStreamerV2::emitHiddenKernelArgs(const Function &Func,
|
||||
const GCNSubtarget &ST) {
|
||||
unsigned HiddenArgNumBytes = ST.getImplicitArgNumBytes(Func);
|
||||
if (!HiddenArgNumBytes)
|
||||
return;
|
||||
|
||||
|
@ -465,11 +465,12 @@ void MetadataStreamerV2::emitKernel(const MachineFunction &MF,
|
|||
HSAMetadata.mKernels.push_back(Kernel::Metadata());
|
||||
auto &Kernel = HSAMetadata.mKernels.back();
|
||||
|
||||
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
|
||||
Kernel.mName = std::string(Func.getName());
|
||||
Kernel.mSymbolName = (Twine(Func.getName()) + Twine("@kd")).str();
|
||||
emitKernelLanguage(Func);
|
||||
emitKernelAttrs(Func);
|
||||
emitKernelArgs(Func);
|
||||
emitKernelArgs(Func, ST);
|
||||
HSAMetadata.mKernels.back().mCodeProps = CodeProps;
|
||||
HSAMetadata.mKernels.back().mDebugProps = DebugProps;
|
||||
}
|
||||
|
@ -673,13 +674,14 @@ void MetadataStreamerV3::emitKernelAttrs(const Function &Func,
|
|||
}
|
||||
|
||||
void MetadataStreamerV3::emitKernelArgs(const Function &Func,
|
||||
const GCNSubtarget &ST,
|
||||
msgpack::MapDocNode Kern) {
|
||||
unsigned Offset = 0;
|
||||
auto Args = HSAMetadataDoc->getArrayNode();
|
||||
for (auto &Arg : Func.args())
|
||||
emitKernelArg(Arg, Offset, Args);
|
||||
|
||||
emitHiddenKernelArgs(Func, Offset, Args);
|
||||
emitHiddenKernelArgs(Func, ST, Offset, Args);
|
||||
|
||||
Kern[".args"] = Args;
|
||||
}
|
||||
|
@ -791,11 +793,10 @@ void MetadataStreamerV3::emitKernelArg(
|
|||
}
|
||||
|
||||
void MetadataStreamerV3::emitHiddenKernelArgs(const Function &Func,
|
||||
const GCNSubtarget &ST,
|
||||
unsigned &Offset,
|
||||
msgpack::ArrayDocNode Args) {
|
||||
int HiddenArgNumBytes =
|
||||
getIntegerAttribute(Func, "amdgpu-implicitarg-num-bytes", 0);
|
||||
|
||||
unsigned HiddenArgNumBytes = ST.getImplicitArgNumBytes(Func);
|
||||
if (!HiddenArgNumBytes)
|
||||
return;
|
||||
|
||||
|
@ -912,6 +913,7 @@ void MetadataStreamerV3::emitKernel(const MachineFunction &MF,
|
|||
const SIProgramInfo &ProgramInfo) {
|
||||
auto &Func = MF.getFunction();
|
||||
auto Kern = getHSAKernelProps(MF, ProgramInfo);
|
||||
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
|
||||
|
||||
assert(Func.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
|
||||
Func.getCallingConv() == CallingConv::SPIR_KERNEL);
|
||||
|
@ -925,7 +927,7 @@ void MetadataStreamerV3::emitKernel(const MachineFunction &MF,
|
|||
(Twine(Func.getName()) + Twine(".kd")).str(), /*Copy=*/true);
|
||||
emitKernelLanguage(Func, Kern);
|
||||
emitKernelAttrs(Func, Kern);
|
||||
emitKernelArgs(Func, Kern);
|
||||
emitKernelArgs(Func, ST, Kern);
|
||||
}
|
||||
|
||||
Kernels.push_back(Kern);
|
||||
|
|
|
@ -30,6 +30,7 @@ class MDNode;
|
|||
class Module;
|
||||
struct SIProgramInfo;
|
||||
class Type;
|
||||
class GCNSubtarget;
|
||||
|
||||
namespace AMDGPU {
|
||||
|
||||
|
@ -86,7 +87,8 @@ protected:
|
|||
|
||||
void emitKernelAttrs(const Function &Func, msgpack::MapDocNode Kern);
|
||||
|
||||
void emitKernelArgs(const Function &Func, msgpack::MapDocNode Kern);
|
||||
void emitKernelArgs(const Function &Func, const GCNSubtarget &ST,
|
||||
msgpack::MapDocNode Kern);
|
||||
|
||||
void emitKernelArg(const Argument &Arg, unsigned &Offset,
|
||||
msgpack::ArrayDocNode Args);
|
||||
|
@ -98,8 +100,8 @@ protected:
|
|||
StringRef BaseTypeName = "", StringRef AccQual = "",
|
||||
StringRef TypeQual = "");
|
||||
|
||||
void emitHiddenKernelArgs(const Function &Func, unsigned &Offset,
|
||||
msgpack::ArrayDocNode Args);
|
||||
void emitHiddenKernelArgs(const Function &Func, const GCNSubtarget &ST,
|
||||
unsigned &Offset, msgpack::ArrayDocNode Args);
|
||||
|
||||
msgpack::DocNode &getRootMetadata(StringRef Key) {
|
||||
return HSAMetadataDoc->getRoot().getMap(/*Convert=*/true)[Key];
|
||||
|
@ -173,7 +175,7 @@ private:
|
|||
|
||||
void emitKernelAttrs(const Function &Func);
|
||||
|
||||
void emitKernelArgs(const Function &Func);
|
||||
void emitKernelArgs(const Function &Func, const GCNSubtarget &ST);
|
||||
|
||||
void emitKernelArg(const Argument &Arg);
|
||||
|
||||
|
@ -183,7 +185,7 @@ private:
|
|||
StringRef BaseTypeName = "", StringRef AccQual = "",
|
||||
StringRef TypeQual = "");
|
||||
|
||||
void emitHiddenKernelArgs(const Function &Func);
|
||||
void emitHiddenKernelArgs(const Function &Func, const GCNSubtarget &ST);
|
||||
|
||||
const Metadata &getHSAMetadata() const {
|
||||
return HSAMetadata;
|
||||
|
|
|
@ -648,6 +648,11 @@ bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
|
|||
}
|
||||
|
||||
unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
|
||||
// We don't allocate the segment if we know the implicit arguments weren't
|
||||
// used, even if the ABI implies we need them.
|
||||
if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
|
||||
return 0;
|
||||
|
||||
if (isMesaKernel(F))
|
||||
return 16;
|
||||
return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
|
||||
|
|
|
@ -2730,7 +2730,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(double addrspace(1)* %out, i32
|
|||
; GPRIDX-NEXT: workitem_private_segment_byte_size = 0
|
||||
; GPRIDX-NEXT: workgroup_group_segment_byte_size = 0
|
||||
; GPRIDX-NEXT: gds_segment_byte_size = 0
|
||||
; GPRIDX-NEXT: kernarg_segment_byte_size = 28
|
||||
; GPRIDX-NEXT: kernarg_segment_byte_size = 12
|
||||
; GPRIDX-NEXT: workgroup_fbarrier_count = 0
|
||||
; GPRIDX-NEXT: wavefront_sgpr_count = 9
|
||||
; GPRIDX-NEXT: workitem_vgpr_count = 3
|
||||
|
@ -2821,7 +2821,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(double addrspace(1)* %out, i32
|
|||
; MOVREL-NEXT: workitem_private_segment_byte_size = 0
|
||||
; MOVREL-NEXT: workgroup_group_segment_byte_size = 0
|
||||
; MOVREL-NEXT: gds_segment_byte_size = 0
|
||||
; MOVREL-NEXT: kernarg_segment_byte_size = 28
|
||||
; MOVREL-NEXT: kernarg_segment_byte_size = 12
|
||||
; MOVREL-NEXT: workgroup_fbarrier_count = 0
|
||||
; MOVREL-NEXT: wavefront_sgpr_count = 9
|
||||
; MOVREL-NEXT: workitem_vgpr_count = 4
|
||||
|
@ -2913,7 +2913,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(double addrspace(1)* %out, i32
|
|||
; GFX10-NEXT: workitem_private_segment_byte_size = 0
|
||||
; GFX10-NEXT: workgroup_group_segment_byte_size = 0
|
||||
; GFX10-NEXT: gds_segment_byte_size = 0
|
||||
; GFX10-NEXT: kernarg_segment_byte_size = 28
|
||||
; GFX10-NEXT: kernarg_segment_byte_size = 12
|
||||
; GFX10-NEXT: workgroup_fbarrier_count = 0
|
||||
; GFX10-NEXT: wavefront_sgpr_count = 9
|
||||
; GFX10-NEXT: workitem_vgpr_count = 3
|
||||
|
@ -3559,7 +3559,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(float addrspace(1)* %out, i32
|
|||
; GPRIDX-NEXT: workitem_private_segment_byte_size = 0
|
||||
; GPRIDX-NEXT: workgroup_group_segment_byte_size = 0
|
||||
; GPRIDX-NEXT: gds_segment_byte_size = 0
|
||||
; GPRIDX-NEXT: kernarg_segment_byte_size = 28
|
||||
; GPRIDX-NEXT: kernarg_segment_byte_size = 12
|
||||
; GPRIDX-NEXT: workgroup_fbarrier_count = 0
|
||||
; GPRIDX-NEXT: wavefront_sgpr_count = 6
|
||||
; GPRIDX-NEXT: workitem_vgpr_count = 2
|
||||
|
@ -3643,7 +3643,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(float addrspace(1)* %out, i32
|
|||
; MOVREL-NEXT: workitem_private_segment_byte_size = 0
|
||||
; MOVREL-NEXT: workgroup_group_segment_byte_size = 0
|
||||
; MOVREL-NEXT: gds_segment_byte_size = 0
|
||||
; MOVREL-NEXT: kernarg_segment_byte_size = 28
|
||||
; MOVREL-NEXT: kernarg_segment_byte_size = 12
|
||||
; MOVREL-NEXT: workgroup_fbarrier_count = 0
|
||||
; MOVREL-NEXT: wavefront_sgpr_count = 6
|
||||
; MOVREL-NEXT: workitem_vgpr_count = 3
|
||||
|
@ -3728,7 +3728,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(float addrspace(1)* %out, i32
|
|||
; GFX10-NEXT: workitem_private_segment_byte_size = 0
|
||||
; GFX10-NEXT: workgroup_group_segment_byte_size = 0
|
||||
; GFX10-NEXT: gds_segment_byte_size = 0
|
||||
; GFX10-NEXT: kernarg_segment_byte_size = 28
|
||||
; GFX10-NEXT: kernarg_segment_byte_size = 12
|
||||
; GFX10-NEXT: workgroup_fbarrier_count = 0
|
||||
; GFX10-NEXT: wavefront_sgpr_count = 6
|
||||
; GFX10-NEXT: workitem_vgpr_count = 2
|
||||
|
@ -3819,7 +3819,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(double addrspace(1)* %out, i3
|
|||
; GPRIDX-NEXT: workitem_private_segment_byte_size = 0
|
||||
; GPRIDX-NEXT: workgroup_group_segment_byte_size = 0
|
||||
; GPRIDX-NEXT: gds_segment_byte_size = 0
|
||||
; GPRIDX-NEXT: kernarg_segment_byte_size = 28
|
||||
; GPRIDX-NEXT: kernarg_segment_byte_size = 12
|
||||
; GPRIDX-NEXT: workgroup_fbarrier_count = 0
|
||||
; GPRIDX-NEXT: wavefront_sgpr_count = 7
|
||||
; GPRIDX-NEXT: workitem_vgpr_count = 3
|
||||
|
@ -3906,7 +3906,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(double addrspace(1)* %out, i3
|
|||
; MOVREL-NEXT: workitem_private_segment_byte_size = 0
|
||||
; MOVREL-NEXT: workgroup_group_segment_byte_size = 0
|
||||
; MOVREL-NEXT: gds_segment_byte_size = 0
|
||||
; MOVREL-NEXT: kernarg_segment_byte_size = 28
|
||||
; MOVREL-NEXT: kernarg_segment_byte_size = 12
|
||||
; MOVREL-NEXT: workgroup_fbarrier_count = 0
|
||||
; MOVREL-NEXT: wavefront_sgpr_count = 7
|
||||
; MOVREL-NEXT: workitem_vgpr_count = 4
|
||||
|
@ -3994,7 +3994,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(double addrspace(1)* %out, i3
|
|||
; GFX10-NEXT: workitem_private_segment_byte_size = 0
|
||||
; GFX10-NEXT: workgroup_group_segment_byte_size = 0
|
||||
; GFX10-NEXT: gds_segment_byte_size = 0
|
||||
; GFX10-NEXT: kernarg_segment_byte_size = 28
|
||||
; GFX10-NEXT: kernarg_segment_byte_size = 12
|
||||
; GFX10-NEXT: workgroup_fbarrier_count = 0
|
||||
; GFX10-NEXT: wavefront_sgpr_count = 7
|
||||
; GFX10-NEXT: workitem_vgpr_count = 3
|
||||
|
|
|
@ -74,14 +74,9 @@ define amdgpu_kernel void @opencl_test_implicit_alignment(i32 addrspace(1)* %out
|
|||
ret void
|
||||
}
|
||||
|
||||
; Mesa implies 16 bytes are always allocated; HSA requires the
|
||||
; attribute for the additional space.
|
||||
; ALL-LABEL: {{^}}test_no_kernargs:
|
||||
; HSA: enable_sgpr_kernarg_segment_ptr = 0
|
||||
; HSA: kernarg_segment_byte_size = 0
|
||||
|
||||
; OS-MESA3D: enable_sgpr_kernarg_segment_ptr = 1
|
||||
; OS-MESA3D: kernarg_segment_byte_size = 16
|
||||
; CO-V2: enable_sgpr_kernarg_segment_ptr = 0
|
||||
; CO-V2: kernarg_segment_byte_size = 0
|
||||
; CO-V2: kernarg_segment_alignment = 4
|
||||
|
||||
; HSA: s_mov_b64 [[OFFSET_NULL:s\[[0-9]+:[0-9]+\]]], 40{{$}}
|
||||
|
@ -97,7 +92,7 @@ define amdgpu_kernel void @test_no_kernargs() #1 {
|
|||
|
||||
; ALL-LABEL: {{^}}opencl_test_implicit_alignment_no_explicit_kernargs:
|
||||
; HSA: kernarg_segment_byte_size = 48
|
||||
; OS-MESA3d: kernarg_segment_byte_size = 16
|
||||
; OS-MESA3D: kernarg_segment_byte_size = 16
|
||||
; CO-V2: kernarg_segment_alignment = 4
|
||||
define amdgpu_kernel void @opencl_test_implicit_alignment_no_explicit_kernargs() #2 {
|
||||
%implicitarg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
|
||||
|
|
|
@ -76,8 +76,8 @@ define amdgpu_kernel void @test_enqueue_kernel_caller(i8 %a) #1
|
|||
; CHECK-NEXT: - 0
|
||||
; CHECK-NOT: amdhsa.printf:
|
||||
|
||||
attributes #0 = { "amdgpu-implicitarg-num-bytes"="48" }
|
||||
attributes #1 = { "calls-enqueue-kernel" "amdgpu-implicitarg-num-bytes"="48" }
|
||||
attributes #0 = { optnone noinline "amdgpu-implicitarg-num-bytes"="48" }
|
||||
attributes #1 = { optnone noinline "calls-enqueue-kernel" "amdgpu-implicitarg-num-bytes"="48" }
|
||||
|
||||
!1 = !{i32 0}
|
||||
!2 = !{!"none"}
|
||||
|
|
|
@ -72,8 +72,8 @@ define amdgpu_kernel void @test_enqueue_kernel_caller(i8 %a) #1
|
|||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-implicitarg-num-bytes"="48" }
|
||||
attributes #1 = { "calls-enqueue-kernel" "amdgpu-implicitarg-num-bytes"="48" }
|
||||
attributes #0 = { optnone noinline "amdgpu-implicitarg-num-bytes"="48" }
|
||||
attributes #1 = { optnone noinline "calls-enqueue-kernel" "amdgpu-implicitarg-num-bytes"="48" }
|
||||
|
||||
!1 = !{i32 0}
|
||||
!2 = !{!"none"}
|
||||
|
|
|
@ -1894,9 +1894,9 @@ define amdgpu_kernel void @unknown_addrspace_kernarg(i32 addrspace(12345)* %ptr)
|
|||
; CHECK-NEXT: - 1
|
||||
; CHECK-NEXT: - 0
|
||||
|
||||
attributes #0 = { "amdgpu-implicitarg-num-bytes"="56" }
|
||||
attributes #1 = { "amdgpu-implicitarg-num-bytes"="56" "runtime-handle"="__test_block_invoke_kernel_runtime_handle" }
|
||||
attributes #2 = { "amdgpu-implicitarg-num-bytes"="56" "calls-enqueue-kernel" }
|
||||
attributes #0 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" }
|
||||
attributes #1 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" "runtime-handle"="__test_block_invoke_kernel_runtime_handle" }
|
||||
attributes #2 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" "calls-enqueue-kernel" }
|
||||
|
||||
!llvm.printf.fmts = !{!100, !101}
|
||||
|
||||
|
|
|
@ -1866,9 +1866,9 @@ define amdgpu_kernel void @unknown_addrspace_kernarg(i32 addrspace(12345)* %ptr)
|
|||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-implicitarg-num-bytes"="56" }
|
||||
attributes #1 = { "amdgpu-implicitarg-num-bytes"="56" "runtime-handle"="__test_block_invoke_kernel_runtime_handle" }
|
||||
attributes #2 = { "amdgpu-implicitarg-num-bytes"="56" "calls-enqueue-kernel" }
|
||||
attributes #0 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" }
|
||||
attributes #1 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" "runtime-handle"="__test_block_invoke_kernel_runtime_handle" }
|
||||
attributes #2 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" "calls-enqueue-kernel" }
|
||||
|
||||
!llvm.printf.fmts = !{!100, !101}
|
||||
|
||||
|
|
|
@ -296,9 +296,11 @@ entry:
|
|||
; CHECK-NEXT: - 1
|
||||
; CHECK-NEXT: - 0
|
||||
|
||||
attributes #0 = { "amdgpu-implicitarg-num-bytes"="8" }
|
||||
attributes #1 = { "amdgpu-implicitarg-num-bytes"="16" }
|
||||
attributes #2 = { "amdgpu-implicitarg-num-bytes"="24" }
|
||||
attributes #3 = { "amdgpu-implicitarg-num-bytes"="32" }
|
||||
attributes #4 = { "amdgpu-implicitarg-num-bytes"="48" }
|
||||
attributes #5 = { "amdgpu-implicitarg-num-bytes"="56" }
|
||||
; We don't have a use of llvm.amdgcn.implicitarg.ptr, so use optnone to
|
||||
; avoid optimizing out the implicit argument allocation.
|
||||
attributes #0 = { optnone noinline "amdgpu-implicitarg-num-bytes"="8" }
|
||||
attributes #1 = { optnone noinline "amdgpu-implicitarg-num-bytes"="16" }
|
||||
attributes #2 = { optnone noinline "amdgpu-implicitarg-num-bytes"="24" }
|
||||
attributes #3 = { optnone noinline "amdgpu-implicitarg-num-bytes"="32" }
|
||||
attributes #4 = { optnone noinline "amdgpu-implicitarg-num-bytes"="48" }
|
||||
attributes #5 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" }
|
||||
|
|
|
@ -300,9 +300,11 @@ entry:
|
|||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-implicitarg-num-bytes"="8" }
|
||||
attributes #1 = { "amdgpu-implicitarg-num-bytes"="16" }
|
||||
attributes #2 = { "amdgpu-implicitarg-num-bytes"="24" }
|
||||
attributes #3 = { "amdgpu-implicitarg-num-bytes"="32" }
|
||||
attributes #4 = { "amdgpu-implicitarg-num-bytes"="48" }
|
||||
attributes #5 = { "amdgpu-implicitarg-num-bytes"="56" }
|
||||
; We don't have a use of llvm.amdgcn.implicitarg.ptr, so use optnone to
|
||||
; avoid optimizing out the implicit argument allocation.
|
||||
attributes #0 = { optnone noinline "amdgpu-implicitarg-num-bytes"="8" }
|
||||
attributes #1 = { optnone noinline "amdgpu-implicitarg-num-bytes"="16" }
|
||||
attributes #2 = { optnone noinline "amdgpu-implicitarg-num-bytes"="24" }
|
||||
attributes #3 = { optnone noinline "amdgpu-implicitarg-num-bytes"="32" }
|
||||
attributes #4 = { optnone noinline "amdgpu-implicitarg-num-bytes"="48" }
|
||||
attributes #5 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" }
|
||||
|
|
|
@ -38,7 +38,7 @@ define amdgpu_kernel void @test_kernel(i8 %a) #0
|
|||
; CHECK-NEXT: - 1
|
||||
; CHECK-NEXT: - 0
|
||||
|
||||
attributes #0 = { "amdgpu-implicitarg-num-bytes"="48" }
|
||||
attributes #0 = { optnone noinline "amdgpu-implicitarg-num-bytes"="48" }
|
||||
|
||||
!1 = !{i32 0}
|
||||
!2 = !{!"none"}
|
||||
|
|
|
@ -35,7 +35,7 @@ define amdgpu_kernel void @test_kernel(i8 %a) #0
|
|||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-implicitarg-num-bytes"="48" }
|
||||
attributes #0 = { optnone noinline "amdgpu-implicitarg-num-bytes"="48" }
|
||||
|
||||
!1 = !{i32 0}
|
||||
!2 = !{!"none"}
|
||||
|
|
|
@ -39,7 +39,7 @@ define amdgpu_kernel void @test_kernel(i8 %a) #0
|
|||
; CHECK-NEXT: - 1
|
||||
; CHECK-NEXT: - 0
|
||||
|
||||
attributes #0 = { "amdgpu-implicitarg-num-bytes"="48" }
|
||||
attributes #0 = { optnone noinline "amdgpu-implicitarg-num-bytes"="48" }
|
||||
|
||||
!1 = !{i32 0}
|
||||
!2 = !{!"none"}
|
||||
|
|
|
@ -40,7 +40,7 @@ define amdgpu_kernel void @test_kernel(i8 %a) #0
|
|||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-implicitarg-num-bytes"="48" }
|
||||
attributes #0 = { optnone noinline "amdgpu-implicitarg-num-bytes"="48" }
|
||||
|
||||
!1 = !{i32 0}
|
||||
!2 = !{!"none"}
|
||||
|
|
|
@ -75,14 +75,10 @@ define amdgpu_kernel void @opencl_test_implicit_alignment(i32 addrspace(1)* %out
|
|||
ret void
|
||||
}
|
||||
|
||||
; Mesa implies 16 bytes are always allocated; HSA requires the
|
||||
; attribute for the additional space.
|
||||
; ALL-LABEL: {{^}}test_no_kernargs:
|
||||
; HSA: enable_sgpr_kernarg_segment_ptr = 0
|
||||
; HSA: kernarg_segment_byte_size = 0
|
||||
; CO-V2: enable_sgpr_kernarg_segment_ptr = 0
|
||||
; CO-V2: kernarg_segment_byte_size = 0
|
||||
|
||||
; OS-MESA3D: enable_sgpr_kernarg_segment_ptr = 1
|
||||
; OS-MESA3D: kernarg_segment_byte_size = 16
|
||||
; CO-V2: kernarg_segment_alignment = 4
|
||||
|
||||
; HSA: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0{{$}}
|
||||
|
|
Loading…
Reference in New Issue