[AMDGPU] Make ds fp atomics overloadable

Differential Revision: https://reviews.llvm.org/D87947
2020-09-18 13:20:00 -07:00 · 2020-09-18 13:20:00 -07:00 · 59691dc874
parent 3726ac41e9
commit 59691dc874
5 changed files with 49 additions and 24 deletions
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@ -14746,6 +14746,32 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
    Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
    return Builder.CreateCall(F, { Src0, Builder.getFalse() });
  }
+  case AMDGPU::BI__builtin_amdgcn_ds_faddf:
+  case AMDGPU::BI__builtin_amdgcn_ds_fminf:
+  case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: {
+    Intrinsic::ID Intrin;
+    switch (BuiltinID) {
+    case AMDGPU::BI__builtin_amdgcn_ds_faddf:
+      Intrin = Intrinsic::amdgcn_ds_fadd;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_ds_fminf:
+      Intrin = Intrinsic::amdgcn_ds_fmin;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
+      Intrin = Intrinsic::amdgcn_ds_fmax;
+      break;
+    }
+    llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
+    llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
+    llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
+    llvm::Value *Src3 = EmitScalarExpr(E->getArg(3));
+    llvm::Value *Src4 = EmitScalarExpr(E->getArg(4));
+    llvm::Function *F = CGM.getIntrinsic(Intrin, { Src1->getType() });
+    llvm::FunctionType *FTy = F->getFunctionType();
+    llvm::Type *PTy = FTy->getParamType(0);
+    Src0 = Builder.CreatePointerBitCastOrAddrSpaceCast(Src0, PTy);
+    return Builder.CreateCall(F, { Src0, Src1, Src2, Src3, Src4 });
+  }
  case AMDGPU::BI__builtin_amdgcn_read_exec: {
    CallInst *CI = cast<CallInst>(
      EmitSpecialRegisterBuiltin(*this, E, Int64Ty, Int64Ty, NormalRead, "exec"));
--- a/clang/test/CodeGenCUDA/builtins-amdgcn.cu
+++ b/clang/test/CodeGenCUDA/builtins-amdgcn.cu
@ -10,7 +10,7 @@ __global__ void use_dispatch_ptr(int* out) {
 }

 // CHECK-LABEL: @_Z12test_ds_fmaxf(
-// CHECK: call contract float @llvm.amdgcn.ds.fmax(float addrspace(3)* @_ZZ12test_ds_fmaxfE6shared, float %{{[^,]*}}, i32 0, i32 0, i1 false)
+// CHECK: call contract float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* @_ZZ12test_ds_fmaxfE6shared, float %{{[^,]*}}, i32 0, i32 0, i1 false)
 __global__
 void test_ds_fmax(float src) {
  __shared__ float shared;
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
@ -114,19 +114,19 @@ void test_update_dpp(global int* out, int arg1, int arg2)
 }

 // CHECK-LABEL: @test_ds_fadd
-// CHECK: call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %out, float %src, i32 0, i32 0, i1 false)
+// CHECK: call float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* %out, float %src, i32 0, i32 0, i1 false)
 void test_ds_faddf(local float *out, float src) {
  *out = __builtin_amdgcn_ds_faddf(out, src, 0, 0, false);
 }

 // CHECK-LABEL: @test_ds_fmin
-// CHECK: call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %out, float %src, i32 0, i32 0, i1 false)
+// CHECK: call float @llvm.amdgcn.ds.fmin.f32(float addrspace(3)* %out, float %src, i32 0, i32 0, i1 false)
 void test_ds_fminf(local float *out, float src) {
  *out = __builtin_amdgcn_ds_fminf(out, src, 0, 0, false);
 }

 // CHECK-LABEL: @test_ds_fmax
-// CHECK: call float @llvm.amdgcn.ds.fmax(float addrspace(3)* %out, float %src, i32 0, i32 0, i1 false)
+// CHECK: call float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* %out, float %src, i32 0, i32 0, i1 false)
 void test_ds_fmaxf(local float *out, float src) {
  *out = __builtin_amdgcn_ds_fmaxf(out, src, 0, 0, false);
 }
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@ -397,11 +397,10 @@ class AMDGPUAtomicIncIntrin : Intrinsic<[llvm_anyint_ty],
 def int_amdgcn_atomic_inc : AMDGPUAtomicIncIntrin;
 def int_amdgcn_atomic_dec : AMDGPUAtomicIncIntrin;

-class AMDGPULDSF32Intrin<string clang_builtin> :
-  GCCBuiltin<clang_builtin>,
-  Intrinsic<[llvm_float_ty],
-    [LLVMQualPointerType<llvm_float_ty, 3>,
-    llvm_float_ty,
+class AMDGPULDSIntrin :
+  Intrinsic<[llvm_any_ty],
+    [LLVMQualPointerType<LLVMMatchType<0>, 3>,
+    LLVMMatchType<0>,
    llvm_i32_ty, // ordering
    llvm_i32_ty, // scope
    llvm_i1_ty], // isVolatile
@ -446,9 +445,9 @@ def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic;
 def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic;
 def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic;

-def int_amdgcn_ds_fadd : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_faddf">;
-def int_amdgcn_ds_fmin : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_fminf">;
-def int_amdgcn_ds_fmax : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_fmaxf">;
+def int_amdgcn_ds_fadd : AMDGPULDSIntrin;
+def int_amdgcn_ds_fmin : AMDGPULDSIntrin;
+def int_amdgcn_ds_fmax : AMDGPULDSIntrin;

 } // TargetPrefix = "amdgcn"

--- a/llvm/test/CodeGen/AMDGPU/lds_atomic_f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds_atomic_f32.ll
@ -1,9 +1,9 @@
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s

-declare float @llvm.amdgcn.ds.fadd(float addrspace(3)* nocapture, float, i32, i32, i1)
-declare float @llvm.amdgcn.ds.fmin(float addrspace(3)* nocapture, float, i32, i32, i1)
-declare float @llvm.amdgcn.ds.fmax(float addrspace(3)* nocapture, float, i32, i32, i1)
+declare float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* nocapture, float, i32, i32, i1)
+declare float @llvm.amdgcn.ds.fmin.f32(float addrspace(3)* nocapture, float, i32, i32, i1)
+declare float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* nocapture, float, i32, i32, i1)

 ; GCN-LABEL: {{^}}lds_ds_fadd:
 ; VI-DAG: s_mov_b32 m0
@ -19,9 +19,9 @@ define amdgpu_kernel void @lds_ds_fadd(float addrspace(1)* %out, float addrspace
  %shl1 = shl i32 %idx.add, 4
  %ptr0 = inttoptr i32 %shl0 to float addrspace(3)*
  %ptr1 = inttoptr i32 %shl1 to float addrspace(3)*
-  %a1 = call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false)
-  %a2 = call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false)
-  %a3 = call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false)
+  %a1 = call float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false)
+  %a2 = call float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false)
+  %a3 = call float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false)
  store float %a3, float addrspace(1)* %out
  ret void
 }
@ -40,9 +40,9 @@ define amdgpu_kernel void @lds_ds_fmin(float addrspace(1)* %out, float addrspace
  %shl1 = shl i32 %idx.add, 4
  %ptr0 = inttoptr i32 %shl0 to float addrspace(3)*
  %ptr1 = inttoptr i32 %shl1 to float addrspace(3)*
-  %a1 = call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false)
-  %a2 = call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false)
-  %a3 = call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false)
+  %a1 = call float @llvm.amdgcn.ds.fmin.f32(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false)
+  %a2 = call float @llvm.amdgcn.ds.fmin.f32(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false)
+  %a3 = call float @llvm.amdgcn.ds.fmin.f32(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false)
  store float %a3, float addrspace(1)* %out
  ret void
 }
@ -61,9 +61,9 @@ define amdgpu_kernel void @lds_ds_fmax(float addrspace(1)* %out, float addrspace
  %shl1 = shl i32 %idx.add, 4
  %ptr0 = inttoptr i32 %shl0 to float addrspace(3)*
  %ptr1 = inttoptr i32 %shl1 to float addrspace(3)*
-  %a1 = call float @llvm.amdgcn.ds.fmax(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false)
-  %a2 = call float @llvm.amdgcn.ds.fmax(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false)
-  %a3 = call float @llvm.amdgcn.ds.fmax(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false)
+  %a1 = call float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false)
+  %a2 = call float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false)
+  %a3 = call float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false)
  store float %a3, float addrspace(1)* %out
  ret void
 }