AMDGPU/GlobalISel: Handle more input argument intrinsics

llvm-svn: 364836
Matt Arsenault 2019-07-01 18:50:50 +00:00
parent 9e8e8c60fa
commit bae3636f96
9 changed files with 155 additions and 52 deletions
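
The intrinsics picked up here are llvm.amdgcn.dispatch.ptr, llvm.amdgcn.queue.ptr, llvm.amdgcn.dispatch.id, and llvm.amdgcn.implicit.buffer.ptr, each of which just reads a value the hardware preloads into user SGPRs. As a minimal sketch of the kind of input that now survives the GlobalISel path instead of falling back to SelectionDAG (an illustrative kernel, not taken from this commit):

; Illustrative only: store the 64-bit dispatch id to memory.
declare i64 @llvm.amdgcn.dispatch.id()

define amdgpu_kernel void @example(i64 addrspace(1)* %out) {
  %id = call i64 @llvm.amdgcn.dispatch.id()
  store i64 %id, i64 addrspace(1)* %out
  ret void
}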


@@ -198,6 +198,58 @@ static void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
   }
 }
 
+// Allocate special inputs passed in user SGPRs.
+static void allocateHSAUserSGPRs(CCState &CCInfo,
+                                 MachineIRBuilder &MIRBuilder,
+                                 MachineFunction &MF,
+                                 const SIRegisterInfo &TRI,
+                                 SIMachineFunctionInfo &Info) {
+  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
+  if (Info.hasPrivateSegmentBuffer()) {
+    unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
+    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
+    CCInfo.AllocateReg(PrivateSegmentBufferReg);
+  }
+
+  if (Info.hasDispatchPtr()) {
+    unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
+    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
+    CCInfo.AllocateReg(DispatchPtrReg);
+  }
+
+  if (Info.hasQueuePtr()) {
+    unsigned QueuePtrReg = Info.addQueuePtr(TRI);
+    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
+    CCInfo.AllocateReg(QueuePtrReg);
+  }
+
+  if (Info.hasKernargSegmentPtr()) {
+    MachineRegisterInfo &MRI = MF.getRegInfo();
+    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
+    const LLT P4 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
+    Register VReg = MRI.createGenericVirtualRegister(P4);
+    MRI.addLiveIn(InputPtrReg, VReg);
+    MIRBuilder.getMBB().addLiveIn(InputPtrReg);
+    MIRBuilder.buildCopy(VReg, InputPtrReg);
+    CCInfo.AllocateReg(InputPtrReg);
+  }
+
+  if (Info.hasDispatchID()) {
+    unsigned DispatchIDReg = Info.addDispatchID(TRI);
+    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
+    CCInfo.AllocateReg(DispatchIDReg);
+  }
+
+  if (Info.hasFlatScratchInit()) {
+    unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
+    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
+    CCInfo.AllocateReg(FlatScratchInitReg);
+  }
+
+  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
+  // these from the dispatch pointer.
+}
+
 static void allocateSystemSGPRs(CCState &CCInfo,
                                 MachineFunction &MF,
                                 SIMachineFunctionInfo &Info,
@@ -272,51 +324,12 @@ bool AMDGPUCallLowering::lowerFormalArguments(
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
 
-  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
-  if (Info->hasPrivateSegmentBuffer()) {
-    Register PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI);
-    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass);
-    CCInfo.AllocateReg(PrivateSegmentBufferReg);
-  }
-
-  if (Info->hasDispatchPtr()) {
-    Register DispatchPtrReg = Info->addDispatchPtr(*TRI);
-    // FIXME: Need to add reg as live-in
-    CCInfo.AllocateReg(DispatchPtrReg);
-  }
-
-  if (Info->hasQueuePtr()) {
-    Register QueuePtrReg = Info->addQueuePtr(*TRI);
-    // FIXME: Need to add reg as live-in
-    CCInfo.AllocateReg(QueuePtrReg);
-  }
-
-  if (Info->hasKernargSegmentPtr()) {
-    Register InputPtrReg = Info->addKernargSegmentPtr(*TRI);
-    const LLT P2 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
-    Register VReg = MRI.createGenericVirtualRegister(P2);
-    MRI.addLiveIn(InputPtrReg, VReg);
-    MIRBuilder.getMBB().addLiveIn(InputPtrReg);
-    MIRBuilder.buildCopy(VReg, InputPtrReg);
-    CCInfo.AllocateReg(InputPtrReg);
-  }
-
-  if (Info->hasDispatchID()) {
-    unsigned DispatchIDReg = Info->addDispatchID(*TRI);
-    // FIXME: Need to add reg as live-in
-    CCInfo.AllocateReg(DispatchIDReg);
-  }
-
-  if (Info->hasFlatScratchInit()) {
-    unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI);
-    // FIXME: Need to add reg as live-in
-    CCInfo.AllocateReg(FlatScratchInitReg);
-  }
-
   // The infrastructure for normal calling convention lowering is essentially
   // useless for kernels. We want to avoid any kind of legalization or argument
   // splitting.
   if (F.getCallingConv() == CallingConv::AMDGPU_KERNEL) {
+    allocateHSAUserSGPRs(CCInfo, MIRBuilder, MF, *TRI, *Info);
+
     unsigned i = 0;
     const unsigned KernArgBaseAlign = 16;
     const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F);
@@ -352,6 +365,12 @@ bool AMDGPUCallLowering::lowerFormalArguments(
     return true;
   }
 
+  if (Info->hasImplicitBufferPtr()) {
+    unsigned ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
+    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
+    CCInfo.AllocateReg(ImplicitBufferPtrReg);
+  }
+
   unsigned NumArgs = F.arg_size();
   Function::const_arg_iterator CurOrigArg = F.arg_begin();
   const AMDGPUTargetLowering &TLI = *getTLI<AMDGPUTargetLowering>();

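Two details of the call-lowering change above are easy to miss. The removed inline code allocated the dispatch pointer, queue pointer, dispatch id, and flat scratch init without ever adding them as live-ins (the repeated "FIXME: Need to add reg as live-in"); the new allocateHSAUserSGPRs helper registers each one with MF.addLiveIn. The implicit buffer pointer is also now allocated on the non-kernel path, so for Mesa shaders it appears to occupy $sgpr0_sgpr1 and inreg user arguments start at $sgpr2, which is what the updated CHECK lines in the tests below reflect. A minimal sketch in the style of those tests (the function name is made up, and the register is an inference from the updated CHECK lines, not stated by the commit):

; CHECK-LABEL: name: example_vs
; CHECK: %{{[0-9]+}}:_(s32) = COPY $sgpr2
define amdgpu_vs void @example_vs(float inreg %arg0) {
  ret void
}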

@@ -1237,6 +1237,18 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
   case Intrinsic::amdgcn_workgroup_id_z:
     return legalizePreloadedArgIntrin(MI, MRI, B,
                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
+  case Intrinsic::amdgcn_dispatch_ptr:
+    return legalizePreloadedArgIntrin(MI, MRI, B,
+                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
+  case Intrinsic::amdgcn_queue_ptr:
+    return legalizePreloadedArgIntrin(MI, MRI, B,
+                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
+  case Intrinsic::amdgcn_implicit_buffer_ptr:
+    return legalizePreloadedArgIntrin(
+        MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
+  case Intrinsic::amdgcn_dispatch_id:
+    return legalizePreloadedArgIntrin(MI, MRI, B,
+                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
   default:
     return true;
   }

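All of the new cases route through legalizePreloadedArgIntrin, which replaces the generic intrinsic with a copy from the SGPRs holding the preloaded value. A rough before/after sketch for the dispatch pointer, in the MIR notation the tests use (the register pair is assumed from the kaveri test below, where the dispatch pointer is read from s[4:5]; the real assignment depends on which user SGPRs are enabled):

; Before legalization (sketch):
;   %0:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.dispatch.ptr)
; After legalizePreloadedArgIntrin (sketch):
;   %0:_(p4) = COPY $sgpr4_sgpr5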

@@ -1,7 +1,7 @@
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -stop-after=irtranslator -global-isel %s -o - | FileCheck %s
 
 ; Check that we correctly skip over disabled inputs
-; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr0
+; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr2
 ; CHECK: [[V0:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.exp), %{{[0-9]+}}(s32), %{{[0-9]+}}(s32), [[S0]](s32), [[S0]](s32), [[S0]](s32), [[V0]](s32)
 define amdgpu_ps void @ps0(float inreg %arg0, float %psinput0, float %psinput1) #1 {


@@ -2,7 +2,7 @@
 
 ; CHECK-LABEL: name: test_f32_inreg
-; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr0
+; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr2
 ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.exp), %{{[0-9]+}}(s32), %{{[0-9]+}}(s32), [[S0]]
 define amdgpu_vs void @test_f32_inreg(float inreg %arg0) {
   call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %arg0, float undef, float undef, float undef, i1 false, i1 false) #0
@@ -18,7 +18,7 @@ define amdgpu_vs void @test_f32(float %arg0) {
 }
 
 ; CHECK-LABEL: name: test_ptr2_byval
-; CHECK: [[S01:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1
+; CHECK: [[S01:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3
 ; CHECK: G_LOAD [[S01]]
 define amdgpu_vs void @test_ptr2_byval(i32 addrspace(4)* byval %arg0) {
   %tmp0 = load volatile i32, i32 addrspace(4)* %arg0
@@ -26,7 +26,7 @@ define amdgpu_vs void @test_ptr2_byval(i32 addrspace(4)* byval %arg0) {
 }
 
 ; CHECK-LABEL: name: test_ptr2_inreg
-; CHECK: [[S01:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1
+; CHECK: [[S01:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3
 ; CHECK: G_LOAD [[S01]]
 define amdgpu_vs void @test_ptr2_inreg(i32 addrspace(4)* inreg %arg0) {
   %tmp0 = load volatile i32, i32 addrspace(4)* %arg0
@@ -34,8 +34,8 @@ define amdgpu_vs void @test_ptr2_inreg(i32 addrspace(4)* inreg %arg0) {
 }
 
 ; CHECK-LABEL: name: test_sgpr_alignment0
-; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr0
-; CHECK: [[S23:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3
+; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr2
+; CHECK: [[S23:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
 ; CHECK: G_LOAD [[S23]]
 ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.exp), %{{[0-9]+}}(s32), %{{[0-9]+}}(s32), [[S0]]
 define amdgpu_vs void @test_sgpr_alignment0(float inreg %arg0, i32 addrspace(4)* inreg %arg1) {
@@ -45,8 +45,8 @@ define amdgpu_vs void @test_sgpr_alignment0(float inreg %arg0, i32 addrspace(4)*
 }
 
 ; CHECK-LABEL: name: test_order
-; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr0
-; CHECK: [[S1:%[0-9]+]]:_(s32) = COPY $sgpr1
+; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr2
+; CHECK: [[S1:%[0-9]+]]:_(s32) = COPY $sgpr3
 ; CHECK: [[V0:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; CHECK: [[V1:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.exp), %{{[0-9]+}}(s32), %{{[0-9]+}}(s32), [[V0]](s32), [[S0]](s32), [[V1]](s32), [[S1]](s32)
@@ -56,8 +56,8 @@ define amdgpu_vs void @test_order(float inreg %arg0, float inreg %arg1, float %a
 }
 
 ; CHECK-LABEL: name: ret_struct
-; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr0
-; CHECK: [[S1:%[0-9]+]]:_(s32) = COPY $sgpr1
+; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr2
+; CHECK: [[S1:%[0-9]+]]:_(s32) = COPY $sgpr3
 ; CHECK: $sgpr0 = COPY [[S0]]
 ; CHECK: $sgpr1 = COPY [[S1]]
 ; CHECK: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1


@@ -0,0 +1,19 @@
+; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+declare i64 @llvm.amdgcn.dispatch.id() #1
+
+; GCN-LABEL: {{^}}dispatch_id:
+; GCN: .amd_kernel_code_t
+; GCN: enable_sgpr_dispatch_id = 1
+
+; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s6
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s7
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+define amdgpu_kernel void @dispatch_id(i64 addrspace(1)* %out) #0 {
+  %tmp0 = call i64 @llvm.amdgcn.dispatch.id()
+  store i64 %tmp0, i64 addrspace(1)* %out
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }


@@ -0,0 +1,18 @@
+; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; FIXME: Error on non-HSA target
+
+; GCN-LABEL: {{^}}test:
+; GCN: enable_sgpr_dispatch_ptr = 1
+; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+define amdgpu_kernel void @test(i32 addrspace(1)* %out) {
+  %dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
+  %header_ptr = bitcast i8 addrspace(4)* %dispatch_ptr to i32 addrspace(4)*
+  %value = load i32, i32 addrspace(4)* %header_ptr
+  store i32 %value, i32 addrspace(1)* %out
+  ret void
+}
+
+declare noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
+
+attributes #0 = { readnone }


@@ -0,0 +1,17 @@
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; FIXME: Dropped parts from original test
+
+; GCN-LABEL: {{^}}test_ps:
+; GCN: s_load_dword s{{[0-9]+}}, s[0:1], 0x0
+define amdgpu_ps i32 @test_ps() #1 {
+  %implicit_buffer_ptr = call i8 addrspace(4)* @llvm.amdgcn.implicit.buffer.ptr()
+  %buffer_ptr = bitcast i8 addrspace(4)* %implicit_buffer_ptr to i32 addrspace(4)*
+  %value = load volatile i32, i32 addrspace(4)* %buffer_ptr
+  ret i32 %value
+}
+
+declare i8 addrspace(4)* @llvm.amdgcn.implicit.buffer.ptr() #0
+
+attributes #0 = { nounwind readnone speculatable }
+attributes #1 = { nounwind }


@@ -0,0 +1,18 @@
+; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; FIXME: Error on non-HSA target
+
+; GCN-LABEL: {{^}}test:
+; GCN: enable_sgpr_queue_ptr = 1
+; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+define amdgpu_kernel void @test(i32 addrspace(1)* %out) {
+  %queue_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0
+  %header_ptr = bitcast i8 addrspace(4)* %queue_ptr to i32 addrspace(4)*
+  %value = load i32, i32 addrspace(4)* %header_ptr
+  store i32 %value, i32 addrspace(1)* %out
+  ret void
+}
+
+declare noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0
+
+attributes #0 = { nounwind readnone }


@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 ; FIXME: Requires stack object to not assert
 ; GCN-LABEL: {{^}}test_ps: