AMDGPU/GlobalISel: Handle more input argument intrinsics

llvm-svn: 364836
Matt Arsenault 2019-07-01 18:50:50 +00:00
parent 9e8e8c60fa
commit bae3636f96
9 changed files with 155 additions and 52 deletions
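
The intrinsics picked up here are llvm.amdgcn.dispatch.ptr, llvm.amdgcn.queue.ptr, llvm.amdgcn.dispatch.id, and llvm.amdgcn.implicit.buffer.ptr, each of which just reads a value the hardware preloads into user SGPRs. As a minimal sketch of the kind of input that now survives the GlobalISel path instead of falling back to SelectionDAG (an illustrative kernel, not taken from this commit):

; Illustrative only: store the 64-bit dispatch id to memory.
declare i64 @llvm.amdgcn.dispatch.id()

define amdgpu_kernel void @example(i64 addrspace(1)* %out) {
  %id = call i64 @llvm.amdgcn.dispatch.id()
  store i64 %id, i64 addrspace(1)* %out
  ret void
}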


@@ -198,6 +198,58 @@ static void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
   }
 }
 
+// Allocate special inputs passed in user SGPRs.
+static void allocateHSAUserSGPRs(CCState &CCInfo,
+                                 MachineIRBuilder &MIRBuilder,
+                                 MachineFunction &MF,
+                                 const SIRegisterInfo &TRI,
+                                 SIMachineFunctionInfo &Info) {
+  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
+  if (Info.hasPrivateSegmentBuffer()) {
+    unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
+    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
+    CCInfo.AllocateReg(PrivateSegmentBufferReg);
+  }
+
+  if (Info.hasDispatchPtr()) {
+    unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
+    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
+    CCInfo.AllocateReg(DispatchPtrReg);
+  }
+
+  if (Info.hasQueuePtr()) {
+    unsigned QueuePtrReg = Info.addQueuePtr(TRI);
+    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
+    CCInfo.AllocateReg(QueuePtrReg);
+  }
+
+  if (Info.hasKernargSegmentPtr()) {
+    MachineRegisterInfo &MRI = MF.getRegInfo();
+    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
+    const LLT P4 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
+    Register VReg = MRI.createGenericVirtualRegister(P4);
+    MRI.addLiveIn(InputPtrReg, VReg);
+    MIRBuilder.getMBB().addLiveIn(InputPtrReg);
+    MIRBuilder.buildCopy(VReg, InputPtrReg);
+    CCInfo.AllocateReg(InputPtrReg);
+  }
+
+  if (Info.hasDispatchID()) {
+    unsigned DispatchIDReg = Info.addDispatchID(TRI);
+    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
+    CCInfo.AllocateReg(DispatchIDReg);
+  }
+
+  if (Info.hasFlatScratchInit()) {
+    unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
+    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
+    CCInfo.AllocateReg(FlatScratchInitReg);
+  }
+
+  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
+  // these from the dispatch pointer.
+}
+
 static void allocateSystemSGPRs(CCState &CCInfo,
                                 MachineFunction &MF,
                                 SIMachineFunctionInfo &Info,
@@ -272,51 +324,12 @@ bool AMDGPUCallLowering::lowerFormalArguments(
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
 
-  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
-  if (Info->hasPrivateSegmentBuffer()) {
-    Register PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI);
-    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass);
-    CCInfo.AllocateReg(PrivateSegmentBufferReg);
-  }
-
-  if (Info->hasDispatchPtr()) {
-    Register DispatchPtrReg = Info->addDispatchPtr(*TRI);
-    // FIXME: Need to add reg as live-in
-    CCInfo.AllocateReg(DispatchPtrReg);
-  }
-
-  if (Info->hasQueuePtr()) {
-    Register QueuePtrReg = Info->addQueuePtr(*TRI);
-    // FIXME: Need to add reg as live-in
-    CCInfo.AllocateReg(QueuePtrReg);
-  }
-
-  if (Info->hasKernargSegmentPtr()) {
-    Register InputPtrReg = Info->addKernargSegmentPtr(*TRI);
-    const LLT P2 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
-    Register VReg = MRI.createGenericVirtualRegister(P2);
-    MRI.addLiveIn(InputPtrReg, VReg);
-    MIRBuilder.getMBB().addLiveIn(InputPtrReg);
-    MIRBuilder.buildCopy(VReg, InputPtrReg);
-    CCInfo.AllocateReg(InputPtrReg);
-  }
-
-  if (Info->hasDispatchID()) {
-    unsigned DispatchIDReg = Info->addDispatchID(*TRI);
-    // FIXME: Need to add reg as live-in
-    CCInfo.AllocateReg(DispatchIDReg);
-  }
-
-  if (Info->hasFlatScratchInit()) {
-    unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI);
-    // FIXME: Need to add reg as live-in
-    CCInfo.AllocateReg(FlatScratchInitReg);
-  }
-
   // The infrastructure for normal calling convention lowering is essentially
   // useless for kernels. We want to avoid any kind of legalization or argument
   // splitting.
   if (F.getCallingConv() == CallingConv::AMDGPU_KERNEL) {
+    allocateHSAUserSGPRs(CCInfo, MIRBuilder, MF, *TRI, *Info);
+
     unsigned i = 0;
     const unsigned KernArgBaseAlign = 16;
     const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F);
@@ -352,6 +365,12 @@ bool AMDGPUCallLowering::lowerFormalArguments(
     return true;
   }
 
+  if (Info->hasImplicitBufferPtr()) {
+    unsigned ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
+    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
+    CCInfo.AllocateReg(ImplicitBufferPtrReg);
+  }
+
   unsigned NumArgs = F.arg_size();
   Function::const_arg_iterator CurOrigArg = F.arg_begin();
   const AMDGPUTargetLowering &TLI = *getTLI<AMDGPUTargetLowering>();

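Two details of the call-lowering change above are easy to miss. The removed inline code allocated the dispatch pointer, queue pointer, dispatch id, and flat scratch init without ever adding them as live-ins (the repeated "FIXME: Need to add reg as live-in"); the new allocateHSAUserSGPRs helper registers each one with MF.addLiveIn. The implicit buffer pointer is also now allocated on the non-kernel path, so for Mesa shaders it appears to occupy $sgpr0_sgpr1 and inreg user arguments start at $sgpr2, which is what the updated CHECK lines in the tests below reflect. A minimal sketch in the style of those tests (the function name is made up, and the register is an inference from the updated CHECK lines, not stated by the commit):

; CHECK-LABEL: name: example_vs
; CHECK: %{{[0-9]+}}:_(s32) = COPY $sgpr2
define amdgpu_vs void @example_vs(float inreg %arg0) {
  ret void
}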

@@ -1237,6 +1237,18 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
   case Intrinsic::amdgcn_workgroup_id_z:
     return legalizePreloadedArgIntrin(MI, MRI, B,
                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
+  case Intrinsic::amdgcn_dispatch_ptr:
+    return legalizePreloadedArgIntrin(MI, MRI, B,
+                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
+  case Intrinsic::amdgcn_queue_ptr:
+    return legalizePreloadedArgIntrin(MI, MRI, B,
+                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
+  case Intrinsic::amdgcn_implicit_buffer_ptr:
+    return legalizePreloadedArgIntrin(
+        MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
+  case Intrinsic::amdgcn_dispatch_id:
+    return legalizePreloadedArgIntrin(MI, MRI, B,
+                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
   default:
     return true;
   }

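All of the new cases route through legalizePreloadedArgIntrin, which replaces the generic intrinsic with a copy from the SGPRs holding the preloaded value. A rough before/after sketch for the dispatch pointer, in the MIR notation the tests use (the register pair is assumed from the kaveri test below, where the dispatch pointer is read from s[4:5]; the real assignment depends on which user SGPRs are enabled):

; Before legalization (sketch):
;   %0:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.dispatch.ptr)
; After legalizePreloadedArgIntrin (sketch):
;   %0:_(p4) = COPY $sgpr4_sgpr5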

@@ -1,7 +1,7 @@
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -stop-after=irtranslator -global-isel %s -o - | FileCheck %s
 
 ; Check that we correctly skip over disabled inputs
-; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr0
+; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr2
 ; CHECK: [[V0:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.exp), %{{[0-9]+}}(s32), %{{[0-9]+}}(s32), [[S0]](s32), [[S0]](s32), [[S0]](s32), [[V0]](s32)
 define amdgpu_ps void @ps0(float inreg %arg0, float %psinput0, float %psinput1) #1 {


@@ -2,7 +2,7 @@
 
 ; CHECK-LABEL: name: test_f32_inreg
-; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr0
+; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr2
 ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.exp), %{{[0-9]+}}(s32), %{{[0-9]+}}(s32), [[S0]]
 define amdgpu_vs void @test_f32_inreg(float inreg %arg0) {
   call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %arg0, float undef, float undef, float undef, i1 false, i1 false) #0
@@ -18,7 +18,7 @@ define amdgpu_vs void @test_f32(float %arg0) {
 }
 
 ; CHECK-LABEL: name: test_ptr2_byval
-; CHECK: [[S01:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1
+; CHECK: [[S01:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3
 ; CHECK: G_LOAD [[S01]]
 define amdgpu_vs void @test_ptr2_byval(i32 addrspace(4)* byval %arg0) {
   %tmp0 = load volatile i32, i32 addrspace(4)* %arg0
@@ -26,7 +26,7 @@ define amdgpu_vs void @test_ptr2_byval(i32 addrspace(4)* byval %arg0) {
 }
 
 ; CHECK-LABEL: name: test_ptr2_inreg
-; CHECK: [[S01:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1
+; CHECK: [[S01:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3
 ; CHECK: G_LOAD [[S01]]
 define amdgpu_vs void @test_ptr2_inreg(i32 addrspace(4)* inreg %arg0) {
   %tmp0 = load volatile i32, i32 addrspace(4)* %arg0
@@ -34,8 +34,8 @@ define amdgpu_vs void @test_ptr2_inreg(i32 addrspace(4)* inreg %arg0) {
 }
 
 ; CHECK-LABEL: name: test_sgpr_alignment0
-; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr0
-; CHECK: [[S23:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3
+; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr2
+; CHECK: [[S23:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
 ; CHECK: G_LOAD [[S23]]
 ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.exp), %{{[0-9]+}}(s32), %{{[0-9]+}}(s32), [[S0]]
 define amdgpu_vs void @test_sgpr_alignment0(float inreg %arg0, i32 addrspace(4)* inreg %arg1) {
@@ -45,8 +45,8 @@ define amdgpu_vs void @test_sgpr_alignment0(float inreg %arg0, i32 addrspace(4)*
 }
 
 ; CHECK-LABEL: name: test_order
-; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr0
-; CHECK: [[S1:%[0-9]+]]:_(s32) = COPY $sgpr1
+; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr2
+; CHECK: [[S1:%[0-9]+]]:_(s32) = COPY $sgpr3
 ; CHECK: [[V0:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; CHECK: [[V1:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.exp), %{{[0-9]+}}(s32), %{{[0-9]+}}(s32), [[V0]](s32), [[S0]](s32), [[V1]](s32), [[S1]](s32)
@@ -56,8 +56,8 @@ define amdgpu_vs void @test_order(float inreg %arg0, float inreg %arg1, float %a
 }
 
 ; CHECK-LABEL: name: ret_struct
-; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr0
-; CHECK: [[S1:%[0-9]+]]:_(s32) = COPY $sgpr1
+; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr2
+; CHECK: [[S1:%[0-9]+]]:_(s32) = COPY $sgpr3
 ; CHECK: $sgpr0 = COPY [[S0]]
 ; CHECK: $sgpr1 = COPY [[S1]]
 ; CHECK: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1


@@ -0,0 +1,19 @@
+; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+declare i64 @llvm.amdgcn.dispatch.id() #1
+
+; GCN-LABEL: {{^}}dispatch_id:
+; GCN: .amd_kernel_code_t
+; GCN: enable_sgpr_dispatch_id = 1
+
+; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s6
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s7
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+define amdgpu_kernel void @dispatch_id(i64 addrspace(1)* %out) #0 {
+  %tmp0 = call i64 @llvm.amdgcn.dispatch.id()
+  store i64 %tmp0, i64 addrspace(1)* %out
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }


@@ -0,0 +1,18 @@
+; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; FIXME: Error on non-HSA target
+
+; GCN-LABEL: {{^}}test:
+; GCN: enable_sgpr_dispatch_ptr = 1
+; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+define amdgpu_kernel void @test(i32 addrspace(1)* %out) {
+  %dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
+  %header_ptr = bitcast i8 addrspace(4)* %dispatch_ptr to i32 addrspace(4)*
+  %value = load i32, i32 addrspace(4)* %header_ptr
+  store i32 %value, i32 addrspace(1)* %out
+  ret void
+}
+
+declare noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
+
+attributes #0 = { readnone }


@@ -0,0 +1,17 @@
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; FIXME: Dropped parts from original test
+
+; GCN-LABEL: {{^}}test_ps:
+; GCN: s_load_dword s{{[0-9]+}}, s[0:1], 0x0
+define amdgpu_ps i32 @test_ps() #1 {
+  %implicit_buffer_ptr = call i8 addrspace(4)* @llvm.amdgcn.implicit.buffer.ptr()
+  %buffer_ptr = bitcast i8 addrspace(4)* %implicit_buffer_ptr to i32 addrspace(4)*
+  %value = load volatile i32, i32 addrspace(4)* %buffer_ptr
+  ret i32 %value
+}
+
+declare i8 addrspace(4)* @llvm.amdgcn.implicit.buffer.ptr() #0
+
+attributes #0 = { nounwind readnone speculatable }
+attributes #1 = { nounwind }


@@ -0,0 +1,18 @@
+; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; FIXME: Error on non-HSA target
+
+; GCN-LABEL: {{^}}test:
+; GCN: enable_sgpr_queue_ptr = 1
+; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+define amdgpu_kernel void @test(i32 addrspace(1)* %out) {
+  %queue_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0
+  %header_ptr = bitcast i8 addrspace(4)* %queue_ptr to i32 addrspace(4)*
+  %value = load i32, i32 addrspace(4)* %header_ptr
+  store i32 %value, i32 addrspace(1)* %out
+  ret void
+}
+
+declare noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0
+
+attributes #0 = { nounwind readnone }


@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 ; FIXME: Requires stack object to not assert
 ; GCN-LABEL: {{^}}test_ps: