[AMDGPU] Add GFX11 llvm.amdgcn.permlane64 intrinsic

Compared to permlane16, permlane64 has no BC input because it has no
boundary conditions, no FI input because the instruction acts as if FI
were always enabled, and no OLD input because it always writes to every
active lane.
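
For reference, the IR-level signatures compare as follows (the permlane16
signature follows the existing llvm.amdgcn.permlane16 definition; the operand
names in the comments are illustrative only):

  ; operands: <old>, <src0>, <src1>, <src2>, <fi>, <bound_ctrl>
  declare i32 @llvm.amdgcn.permlane16(i32, i32, i32, i32, i1 immarg, i1 immarg)
  ; operand: <src0>
  declare i32 @llvm.amdgcn.permlane64(i32)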

Also use the new intrinsic in the atomic optimizer pass.
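
In IR terms, the final wave64 reduction step becomes a sketch like the
following, where %v holds the value already reduced within each 32-lane half
and the value names are illustrative:

  %swapped = call i32 @llvm.amdgcn.permlane64(i32 %v) ; swap the 32-lane halves
  %reduced = add i32 %v, %swapped ; every lane now holds the full 64-lane result

This replaces the old sequence that used readlane on lanes 0 and 32 and
combined the two results with a scalar add, as the test diff below shows.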

Differential Revision: https://reviews.llvm.org/D127662
Author: Jay Foad
Date:   2022-06-13 16:35:44 +01:00
Parent: be232979bc
Commit: bfcfd53b92

8 changed files with 115 additions and 19 deletions


@@ -1886,6 +1886,15 @@ def int_amdgcn_image_bvh_intersect_ray :
              LLVMMatchType<1>, llvm_v4i32_ty],
             [IntrReadMem, IntrWillReturn]>;
 
+//===----------------------------------------------------------------------===//
+// GFX11 Intrinsics
+//===----------------------------------------------------------------------===//
+
+// llvm.amdgcn.permlane64 <src0>
+def int_amdgcn_permlane64 :
+  Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
+            [IntrNoMem, IntrConvergent, IntrWillReturn]>;
+
 //===----------------------------------------------------------------------===//
 // Deep learning intrinsics.
 //===----------------------------------------------------------------------===//


@@ -311,6 +311,12 @@ Value *AMDGPUAtomicOptimizer::buildReduction(IRBuilder<> &B,
   if (ST->isWave32())
     return V;
 
+  if (ST->hasPermLane64()) {
+    // Reduce across the upper and lower 32 lanes.
+    return buildNonAtomicBinOp(
+        B, Op, V, B.CreateIntrinsic(Intrinsic::amdgcn_permlane64, {}, V));
+  }
+
   // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
   // combine them with a scalar operation.
   Function *ReadLane =


@@ -919,6 +919,12 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
     return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
   }
+  case Intrinsic::amdgcn_permlane64:
+    // A constant value is trivially uniform.
+    if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
+      return IC.replaceInstUsesWith(II, C);
+    }
+    break;
   case Intrinsic::amdgcn_readfirstlane:
   case Intrinsic::amdgcn_readlane: {
     // A constant value is trivially uniform.


@@ -4267,6 +4267,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   case Intrinsic::amdgcn_wqm:
   case Intrinsic::amdgcn_softwqm:
   case Intrinsic::amdgcn_set_inactive:
+  case Intrinsic::amdgcn_permlane64:
     return getDefaultMappingAllVGPR(MI);
   case Intrinsic::amdgcn_kernarg_segment_ptr:
   case Intrinsic::amdgcn_s_getpc:


@@ -518,7 +518,8 @@ def V_ACCVGPR_MOV_B32 : VOP1_Pseudo<"v_accvgpr_mov_b32", VOPProfileAccMov, [], 1
 let SubtargetPredicate = isGFX11Plus in {
   // Restrict src0 to be VGPR
   def V_PERMLANE64_B32 : VOP1_Pseudo<"v_permlane64_b32", VOP_MOVRELS,
-                                     [],
+                                     getVOP1Pat64<int_amdgcn_permlane64,
+                                                  VOP_MOVRELS>.ret,
                                      /*VOP1Only=*/ 1>;
   defm V_NOT_B16 : VOP1Inst<"v_not_b16", VOP_I16_I16>;
   defm V_CVT_I32_I16 : VOP1Inst<"v_cvt_i32_i16", VOP_I32_I16>;


@@ -969,23 +969,22 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
 ; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1164-NEXT:    v_add_nc_u32_e32 v1, v1, v2
+; GFX1164-NEXT:    v_permlane64_b32 v2, v1
 ; GFX1164-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-NEXT:    v_readlane_b32 s2, v1, 0
-; GFX1164-NEXT:    v_readlane_b32 s3, v1, 32
+; GFX1164-NEXT:    v_add_nc_u32_e32 v1, v1, v2
 ; GFX1164-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-NEXT:    s_add_i32 s0, s2, s3
-; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
-; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v0
+; GFX1164-NEXT:    v_mov_b32_e32 v0, v1
+; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v3
 ; GFX1164-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX1164-NEXT:  ; %bb.1:
-; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v3, s0
+; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1164-NEXT:    ds_add_u32 v0, v3
+; GFX1164-NEXT:    ds_add_u32 v3, v0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    buffer_gl0_inv
 ; GFX1164-NEXT:  .LBB3_2:
@@ -2578,23 +2577,22 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
 ; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1164-NEXT:    v_add_nc_u32_e32 v1, v1, v2
+; GFX1164-NEXT:    v_permlane64_b32 v2, v1
 ; GFX1164-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-NEXT:    v_readlane_b32 s2, v1, 0
-; GFX1164-NEXT:    v_readlane_b32 s3, v1, 32
+; GFX1164-NEXT:    v_add_nc_u32_e32 v1, v1, v2
 ; GFX1164-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-NEXT:    s_add_i32 s0, s2, s3
-; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
-; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v0
+; GFX1164-NEXT:    v_mov_b32_e32 v0, v1
+; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v3
 ; GFX1164-NEXT:    s_cbranch_execz .LBB10_2
 ; GFX1164-NEXT:  ; %bb.1:
-; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v3, s0
+; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1164-NEXT:    ds_sub_u32 v0, v3
+; GFX1164-NEXT:    ds_sub_u32 v3, v0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    buffer_gl0_inv
 ; GFX1164-NEXT:  .LBB10_2:


@@ -0,0 +1,62 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s

declare i32 @llvm.amdgcn.permlane64(i32)
declare i32 @llvm.amdgcn.workitem.id.x()

define amdgpu_kernel void @test_s(i32 addrspace(1)* %out, i32 %src0) {
; GFX11-LABEL: test_s:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: v_permlane64_b32 v0, v0
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
%v = call i32 @llvm.amdgcn.permlane64(i32 %src0)
store i32 %v, i32 addrspace(1)* %out
ret void
}

define amdgpu_kernel void @test_i(i32 addrspace(1)* %out) {
; GFX11-LABEL: test_i:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0x63
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: v_permlane64_b32 v0, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
%v = call i32 @llvm.amdgcn.permlane64(i32 99)
store i32 %v, i32 addrspace(1)* %out
ret void
}

define amdgpu_kernel void @test_v(i32 addrspace(1)* %out, i32 %src0) #1 {
; GFX11-SDAG-LABEL: test_v:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: test_v:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-GISEL-NEXT: s_endpgm
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%v = call i32 @llvm.amdgcn.permlane64(i32 %tidx)
store i32 %v, i32 addrspace(1)* %out
ret void
}


@@ -0,0 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -mtriple=amdgcn-amd-amdhsa -instcombine -S | FileCheck %s

; Optimize the intrinsic away if the argument is uniform.
define i32 @test_constant() {
; CHECK-LABEL: @test_constant(
; CHECK-NEXT: ret i32 99
;
%call = call i32 @llvm.amdgcn.permlane64(i32 99)
ret i32 %call
}

declare i32 @llvm.amdgcn.permlane64(i32)