forked from OSchip/llvm-project
[AMDGPU] Make ds fp atomics overloadable
Differential Revision: https://reviews.llvm.org/D87947
This commit is contained in:
parent
3726ac41e9
commit
59691dc874
|
@ -14746,6 +14746,32 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
|
|||
Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
|
||||
return Builder.CreateCall(F, { Src0, Builder.getFalse() });
|
||||
}
|
||||
case AMDGPU::BI__builtin_amdgcn_ds_faddf:
|
||||
case AMDGPU::BI__builtin_amdgcn_ds_fminf:
|
||||
case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: {
|
||||
Intrinsic::ID Intrin;
|
||||
switch (BuiltinID) {
|
||||
case AMDGPU::BI__builtin_amdgcn_ds_faddf:
|
||||
Intrin = Intrinsic::amdgcn_ds_fadd;
|
||||
break;
|
||||
case AMDGPU::BI__builtin_amdgcn_ds_fminf:
|
||||
Intrin = Intrinsic::amdgcn_ds_fmin;
|
||||
break;
|
||||
case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
|
||||
Intrin = Intrinsic::amdgcn_ds_fmax;
|
||||
break;
|
||||
}
|
||||
llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
|
||||
llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
|
||||
llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
|
||||
llvm::Value *Src3 = EmitScalarExpr(E->getArg(3));
|
||||
llvm::Value *Src4 = EmitScalarExpr(E->getArg(4));
|
||||
llvm::Function *F = CGM.getIntrinsic(Intrin, { Src1->getType() });
|
||||
llvm::FunctionType *FTy = F->getFunctionType();
|
||||
llvm::Type *PTy = FTy->getParamType(0);
|
||||
Src0 = Builder.CreatePointerBitCastOrAddrSpaceCast(Src0, PTy);
|
||||
return Builder.CreateCall(F, { Src0, Src1, Src2, Src3, Src4 });
|
||||
}
|
||||
case AMDGPU::BI__builtin_amdgcn_read_exec: {
|
||||
CallInst *CI = cast<CallInst>(
|
||||
EmitSpecialRegisterBuiltin(*this, E, Int64Ty, Int64Ty, NormalRead, "exec"));
|
||||
|
|
|
@ -10,7 +10,7 @@ __global__ void use_dispatch_ptr(int* out) {
|
|||
}
|
||||
|
||||
// CHECK-LABEL: @_Z12test_ds_fmaxf(
|
||||
// CHECK: call contract float @llvm.amdgcn.ds.fmax(float addrspace(3)* @_ZZ12test_ds_fmaxfE6shared, float %{{[^,]*}}, i32 0, i32 0, i1 false)
|
||||
// CHECK: call contract float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* @_ZZ12test_ds_fmaxfE6shared, float %{{[^,]*}}, i32 0, i32 0, i1 false)
|
||||
__global__
|
||||
void test_ds_fmax(float src) {
|
||||
__shared__ float shared;
|
||||
|
|
|
@ -114,19 +114,19 @@ void test_update_dpp(global int* out, int arg1, int arg2)
|
|||
}
|
||||
|
||||
// CHECK-LABEL: @test_ds_fadd
|
||||
// CHECK: call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %out, float %src, i32 0, i32 0, i1 false)
|
||||
// CHECK: call float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* %out, float %src, i32 0, i32 0, i1 false)
|
||||
void test_ds_faddf(local float *out, float src) {
|
||||
*out = __builtin_amdgcn_ds_faddf(out, src, 0, 0, false);
|
||||
}
|
||||
|
||||
// CHECK-LABEL: @test_ds_fmin
|
||||
// CHECK: call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %out, float %src, i32 0, i32 0, i1 false)
|
||||
// CHECK: call float @llvm.amdgcn.ds.fmin.f32(float addrspace(3)* %out, float %src, i32 0, i32 0, i1 false)
|
||||
void test_ds_fminf(local float *out, float src) {
|
||||
*out = __builtin_amdgcn_ds_fminf(out, src, 0, 0, false);
|
||||
}
|
||||
|
||||
// CHECK-LABEL: @test_ds_fmax
|
||||
// CHECK: call float @llvm.amdgcn.ds.fmax(float addrspace(3)* %out, float %src, i32 0, i32 0, i1 false)
|
||||
// CHECK: call float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* %out, float %src, i32 0, i32 0, i1 false)
|
||||
void test_ds_fmaxf(local float *out, float src) {
|
||||
*out = __builtin_amdgcn_ds_fmaxf(out, src, 0, 0, false);
|
||||
}
|
||||
|
|
|
@ -397,11 +397,10 @@ class AMDGPUAtomicIncIntrin : Intrinsic<[llvm_anyint_ty],
|
|||
def int_amdgcn_atomic_inc : AMDGPUAtomicIncIntrin;
|
||||
def int_amdgcn_atomic_dec : AMDGPUAtomicIncIntrin;
|
||||
|
||||
class AMDGPULDSF32Intrin<string clang_builtin> :
|
||||
GCCBuiltin<clang_builtin>,
|
||||
Intrinsic<[llvm_float_ty],
|
||||
[LLVMQualPointerType<llvm_float_ty, 3>,
|
||||
llvm_float_ty,
|
||||
class AMDGPULDSIntrin :
|
||||
Intrinsic<[llvm_any_ty],
|
||||
[LLVMQualPointerType<LLVMMatchType<0>, 3>,
|
||||
LLVMMatchType<0>,
|
||||
llvm_i32_ty, // ordering
|
||||
llvm_i32_ty, // scope
|
||||
llvm_i1_ty], // isVolatile
|
||||
|
@ -446,9 +445,9 @@ def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic;
|
|||
def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic;
|
||||
def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic;
|
||||
|
||||
def int_amdgcn_ds_fadd : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_faddf">;
|
||||
def int_amdgcn_ds_fmin : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_fminf">;
|
||||
def int_amdgcn_ds_fmax : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_fmaxf">;
|
||||
def int_amdgcn_ds_fadd : AMDGPULDSIntrin;
|
||||
def int_amdgcn_ds_fmin : AMDGPULDSIntrin;
|
||||
def int_amdgcn_ds_fmax : AMDGPULDSIntrin;
|
||||
|
||||
} // TargetPrefix = "amdgcn"
|
||||
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
|
||||
|
||||
declare float @llvm.amdgcn.ds.fadd(float addrspace(3)* nocapture, float, i32, i32, i1)
|
||||
declare float @llvm.amdgcn.ds.fmin(float addrspace(3)* nocapture, float, i32, i32, i1)
|
||||
declare float @llvm.amdgcn.ds.fmax(float addrspace(3)* nocapture, float, i32, i32, i1)
|
||||
declare float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* nocapture, float, i32, i32, i1)
|
||||
declare float @llvm.amdgcn.ds.fmin.f32(float addrspace(3)* nocapture, float, i32, i32, i1)
|
||||
declare float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* nocapture, float, i32, i32, i1)
|
||||
|
||||
; GCN-LABEL: {{^}}lds_ds_fadd:
|
||||
; VI-DAG: s_mov_b32 m0
|
||||
|
@ -19,9 +19,9 @@ define amdgpu_kernel void @lds_ds_fadd(float addrspace(1)* %out, float addrspace
|
|||
%shl1 = shl i32 %idx.add, 4
|
||||
%ptr0 = inttoptr i32 %shl0 to float addrspace(3)*
|
||||
%ptr1 = inttoptr i32 %shl1 to float addrspace(3)*
|
||||
%a1 = call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false)
|
||||
%a2 = call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false)
|
||||
%a3 = call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false)
|
||||
%a1 = call float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false)
|
||||
%a2 = call float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false)
|
||||
%a3 = call float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false)
|
||||
store float %a3, float addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
@ -40,9 +40,9 @@ define amdgpu_kernel void @lds_ds_fmin(float addrspace(1)* %out, float addrspace
|
|||
%shl1 = shl i32 %idx.add, 4
|
||||
%ptr0 = inttoptr i32 %shl0 to float addrspace(3)*
|
||||
%ptr1 = inttoptr i32 %shl1 to float addrspace(3)*
|
||||
%a1 = call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false)
|
||||
%a2 = call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false)
|
||||
%a3 = call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false)
|
||||
%a1 = call float @llvm.amdgcn.ds.fmin.f32(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false)
|
||||
%a2 = call float @llvm.amdgcn.ds.fmin.f32(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false)
|
||||
%a3 = call float @llvm.amdgcn.ds.fmin.f32(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false)
|
||||
store float %a3, float addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
@ -61,9 +61,9 @@ define amdgpu_kernel void @lds_ds_fmax(float addrspace(1)* %out, float addrspace
|
|||
%shl1 = shl i32 %idx.add, 4
|
||||
%ptr0 = inttoptr i32 %shl0 to float addrspace(3)*
|
||||
%ptr1 = inttoptr i32 %shl1 to float addrspace(3)*
|
||||
%a1 = call float @llvm.amdgcn.ds.fmax(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false)
|
||||
%a2 = call float @llvm.amdgcn.ds.fmax(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false)
|
||||
%a3 = call float @llvm.amdgcn.ds.fmax(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false)
|
||||
%a1 = call float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false)
|
||||
%a2 = call float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false)
|
||||
%a3 = call float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false)
|
||||
store float %a3, float addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue