forked from OSchip/llvm-project
[AMDGPU] add LDS f32 intrinsics
Added the llvm.amdgcn.atomic.{fadd|fmin|fmax}.f32 intrinsics to allow generating the ds_{add|min|max}[_rtn]_f32 instructions needed for OpenCL float atomics in LDS. Reviewed by: arsenm. Differential Revision: https://reviews.llvm.org/D37985 llvm-svn: 322656
This commit is contained in:
parent
6b65f7c380
commit
d5fca554e2
|
@ -295,6 +295,21 @@ class AMDGPUAtomicIncIntrin : Intrinsic<[llvm_anyint_ty],
|
||||||
def int_amdgcn_atomic_inc : AMDGPUAtomicIncIntrin;
|
def int_amdgcn_atomic_inc : AMDGPUAtomicIncIntrin;
|
||||||
def int_amdgcn_atomic_dec : AMDGPUAtomicIncIntrin;
|
def int_amdgcn_atomic_dec : AMDGPUAtomicIncIntrin;
|
||||||
|
|
||||||
|
class AMDGPUAtomicF32Intrin<string clang_builtin> :
|
||||||
|
GCCBuiltin<clang_builtin>,
|
||||||
|
Intrinsic<[llvm_float_ty],
|
||||||
|
[LLVMAnyPointerType<llvm_float_ty>,
|
||||||
|
llvm_float_ty,
|
||||||
|
llvm_i32_ty, // ordering
|
||||||
|
llvm_i32_ty, // scope
|
||||||
|
llvm_i1_ty], // isVolatile
|
||||||
|
[IntrArgMemOnly, NoCapture<0>]
|
||||||
|
>;
|
||||||
|
|
||||||
|
def int_amdgcn_atomic_fadd : AMDGPUAtomicF32Intrin<"__builtin_amdgcn_ds_fadd">;
|
||||||
|
def int_amdgcn_atomic_fmin : AMDGPUAtomicF32Intrin<"__builtin_amdgcn_ds_fmin">;
|
||||||
|
def int_amdgcn_atomic_fmax : AMDGPUAtomicF32Intrin<"__builtin_amdgcn_ds_fmax">;
|
||||||
|
|
||||||
class AMDGPUImageLoad<bit NoMem = 0> : Intrinsic <
|
class AMDGPUImageLoad<bit NoMem = 0> : Intrinsic <
|
||||||
[llvm_anyfloat_ty], // vdata(VGPR)
|
[llvm_anyfloat_ty], // vdata(VGPR)
|
||||||
[llvm_anyint_ty, // vaddr(VGPR)
|
[llvm_anyint_ty, // vaddr(VGPR)
|
||||||
|
|
|
@ -450,7 +450,10 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (isa<AtomicSDNode>(N) ||
|
if (isa<AtomicSDNode>(N) ||
|
||||||
(Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC))
|
(Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC ||
|
||||||
|
Opc == AMDGPUISD::ATOMIC_LOAD_FADD ||
|
||||||
|
Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
|
||||||
|
Opc == AMDGPUISD::ATOMIC_LOAD_FMAX))
|
||||||
N = glueCopyToM0(N);
|
N = glueCopyToM0(N);
|
||||||
|
|
||||||
switch (Opc) {
|
switch (Opc) {
|
||||||
|
|
|
@ -3982,6 +3982,9 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
|
||||||
NODE_NAME_CASE(ATOMIC_CMP_SWAP)
|
NODE_NAME_CASE(ATOMIC_CMP_SWAP)
|
||||||
NODE_NAME_CASE(ATOMIC_INC)
|
NODE_NAME_CASE(ATOMIC_INC)
|
||||||
NODE_NAME_CASE(ATOMIC_DEC)
|
NODE_NAME_CASE(ATOMIC_DEC)
|
||||||
|
NODE_NAME_CASE(ATOMIC_LOAD_FADD)
|
||||||
|
NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
|
||||||
|
NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
|
||||||
NODE_NAME_CASE(BUFFER_LOAD)
|
NODE_NAME_CASE(BUFFER_LOAD)
|
||||||
NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
|
NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
|
||||||
NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
|
NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
|
||||||
|
|
|
@ -457,6 +457,9 @@ enum NodeType : unsigned {
|
||||||
ATOMIC_CMP_SWAP,
|
ATOMIC_CMP_SWAP,
|
||||||
ATOMIC_INC,
|
ATOMIC_INC,
|
||||||
ATOMIC_DEC,
|
ATOMIC_DEC,
|
||||||
|
ATOMIC_LOAD_FADD,
|
||||||
|
ATOMIC_LOAD_FMIN,
|
||||||
|
ATOMIC_LOAD_FMAX,
|
||||||
BUFFER_LOAD,
|
BUFFER_LOAD,
|
||||||
BUFFER_LOAD_FORMAT,
|
BUFFER_LOAD_FORMAT,
|
||||||
BUFFER_LOAD_FORMAT_D16,
|
BUFFER_LOAD_FORMAT_D16,
|
||||||
|
|
|
@ -475,6 +475,9 @@ static bool isIntrinsicSourceOfDivergence(const IntrinsicInst *I) {
|
||||||
case Intrinsic::r600_read_tidig_z:
|
case Intrinsic::r600_read_tidig_z:
|
||||||
case Intrinsic::amdgcn_atomic_inc:
|
case Intrinsic::amdgcn_atomic_inc:
|
||||||
case Intrinsic::amdgcn_atomic_dec:
|
case Intrinsic::amdgcn_atomic_dec:
|
||||||
|
case Intrinsic::amdgcn_atomic_fadd:
|
||||||
|
case Intrinsic::amdgcn_atomic_fmin:
|
||||||
|
case Intrinsic::amdgcn_atomic_fmax:
|
||||||
case Intrinsic::amdgcn_image_atomic_swap:
|
case Intrinsic::amdgcn_image_atomic_swap:
|
||||||
case Intrinsic::amdgcn_image_atomic_add:
|
case Intrinsic::amdgcn_image_atomic_add:
|
||||||
case Intrinsic::amdgcn_image_atomic_sub:
|
case Intrinsic::amdgcn_image_atomic_sub:
|
||||||
|
|
|
@ -440,7 +440,7 @@ defm DS_XOR_RTN_B32 : DS_1A1D_RET_mc<"ds_xor_rtn_b32", VGPR_32, "ds_xor_b32">;
|
||||||
defm DS_MSKOR_RTN_B32 : DS_1A2D_RET_mc<"ds_mskor_rtn_b32", VGPR_32, "ds_mskor_b32">;
|
defm DS_MSKOR_RTN_B32 : DS_1A2D_RET_mc<"ds_mskor_rtn_b32", VGPR_32, "ds_mskor_b32">;
|
||||||
defm DS_CMPST_RTN_B32 : DS_1A2D_RET_mc<"ds_cmpst_rtn_b32", VGPR_32, "ds_cmpst_b32">;
|
defm DS_CMPST_RTN_B32 : DS_1A2D_RET_mc<"ds_cmpst_rtn_b32", VGPR_32, "ds_cmpst_b32">;
|
||||||
defm DS_CMPST_RTN_F32 : DS_1A2D_RET_mc<"ds_cmpst_rtn_f32", VGPR_32, "ds_cmpst_f32">;
|
defm DS_CMPST_RTN_F32 : DS_1A2D_RET_mc<"ds_cmpst_rtn_f32", VGPR_32, "ds_cmpst_f32">;
|
||||||
defm DS_MIN_RTN_F32 : DS_1A1D_RET_mc <"ds_min_rtn_f32", VGPR_32, "ds_min_f32">;
|
defm DS_MIN_RTN_F32 : DS_1A1D_RET_mc<"ds_min_rtn_f32", VGPR_32, "ds_min_f32">;
|
||||||
defm DS_MAX_RTN_F32 : DS_1A1D_RET_mc<"ds_max_rtn_f32", VGPR_32, "ds_max_f32">;
|
defm DS_MAX_RTN_F32 : DS_1A1D_RET_mc<"ds_max_rtn_f32", VGPR_32, "ds_max_f32">;
|
||||||
|
|
||||||
defm DS_WRXCHG_RTN_B32 : DS_1A1D_RET_mc<"ds_wrxchg_rtn_b32">;
|
defm DS_WRXCHG_RTN_B32 : DS_1A1D_RET_mc<"ds_wrxchg_rtn_b32">;
|
||||||
|
@ -769,6 +769,9 @@ defm : DSAtomicRetPat_mc<DS_MAX_RTN_I32, i32, "atomic_load_max_local">;
|
||||||
defm : DSAtomicRetPat_mc<DS_MIN_RTN_U32, i32, "atomic_load_umin_local">;
|
defm : DSAtomicRetPat_mc<DS_MIN_RTN_U32, i32, "atomic_load_umin_local">;
|
||||||
defm : DSAtomicRetPat_mc<DS_MAX_RTN_U32, i32, "atomic_load_umax_local">;
|
defm : DSAtomicRetPat_mc<DS_MAX_RTN_U32, i32, "atomic_load_umax_local">;
|
||||||
defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B32, i32, "atomic_cmp_swap_local">;
|
defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B32, i32, "atomic_cmp_swap_local">;
|
||||||
|
defm : DSAtomicRetPat_mc<DS_MIN_RTN_F32, f32, "atomic_load_fmin_local">;
|
||||||
|
defm : DSAtomicRetPat_mc<DS_MAX_RTN_F32, f32, "atomic_load_fmax_local">;
|
||||||
|
defm : DSAtomicRetPat_mc<DS_ADD_RTN_F32, f32, "atomic_load_fadd_local">;
|
||||||
|
|
||||||
// 64-bit atomics.
|
// 64-bit atomics.
|
||||||
defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B64, i64, "atomic_swap_local">;
|
defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B64, i64, "atomic_swap_local">;
|
||||||
|
|
|
@ -565,7 +565,10 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
|
||||||
unsigned IntrID) const {
|
unsigned IntrID) const {
|
||||||
switch (IntrID) {
|
switch (IntrID) {
|
||||||
case Intrinsic::amdgcn_atomic_inc:
|
case Intrinsic::amdgcn_atomic_inc:
|
||||||
case Intrinsic::amdgcn_atomic_dec: {
|
case Intrinsic::amdgcn_atomic_dec:
|
||||||
|
case Intrinsic::amdgcn_atomic_fadd:
|
||||||
|
case Intrinsic::amdgcn_atomic_fmin:
|
||||||
|
case Intrinsic::amdgcn_atomic_fmax: {
|
||||||
Info.opc = ISD::INTRINSIC_W_CHAIN;
|
Info.opc = ISD::INTRINSIC_W_CHAIN;
|
||||||
Info.memVT = MVT::getVT(CI.getType());
|
Info.memVT = MVT::getVT(CI.getType());
|
||||||
Info.ptrVal = CI.getOperand(0);
|
Info.ptrVal = CI.getOperand(0);
|
||||||
|
@ -803,7 +806,10 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
|
||||||
Type *&AccessTy) const {
|
Type *&AccessTy) const {
|
||||||
switch (II->getIntrinsicID()) {
|
switch (II->getIntrinsicID()) {
|
||||||
case Intrinsic::amdgcn_atomic_inc:
|
case Intrinsic::amdgcn_atomic_inc:
|
||||||
case Intrinsic::amdgcn_atomic_dec: {
|
case Intrinsic::amdgcn_atomic_dec:
|
||||||
|
case Intrinsic::amdgcn_atomic_fadd:
|
||||||
|
case Intrinsic::amdgcn_atomic_fmin:
|
||||||
|
case Intrinsic::amdgcn_atomic_fmax: {
|
||||||
Value *Ptr = II->getArgOperand(0);
|
Value *Ptr = II->getArgOperand(0);
|
||||||
AccessTy = II->getType();
|
AccessTy = II->getType();
|
||||||
Ops.push_back(Ptr);
|
Ops.push_back(Ptr);
|
||||||
|
@ -4548,10 +4554,31 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
|
||||||
|
|
||||||
switch (IntrID) {
|
switch (IntrID) {
|
||||||
case Intrinsic::amdgcn_atomic_inc:
|
case Intrinsic::amdgcn_atomic_inc:
|
||||||
case Intrinsic::amdgcn_atomic_dec: {
|
case Intrinsic::amdgcn_atomic_dec:
|
||||||
|
case Intrinsic::amdgcn_atomic_fadd:
|
||||||
|
case Intrinsic::amdgcn_atomic_fmin:
|
||||||
|
case Intrinsic::amdgcn_atomic_fmax: {
|
||||||
MemSDNode *M = cast<MemSDNode>(Op);
|
MemSDNode *M = cast<MemSDNode>(Op);
|
||||||
unsigned Opc = (IntrID == Intrinsic::amdgcn_atomic_inc) ?
|
unsigned Opc;
|
||||||
AMDGPUISD::ATOMIC_INC : AMDGPUISD::ATOMIC_DEC;
|
switch (IntrID) {
|
||||||
|
case Intrinsic::amdgcn_atomic_inc:
|
||||||
|
Opc = AMDGPUISD::ATOMIC_INC;
|
||||||
|
break;
|
||||||
|
case Intrinsic::amdgcn_atomic_dec:
|
||||||
|
Opc = AMDGPUISD::ATOMIC_DEC;
|
||||||
|
break;
|
||||||
|
case Intrinsic::amdgcn_atomic_fadd:
|
||||||
|
Opc = AMDGPUISD::ATOMIC_LOAD_FADD;
|
||||||
|
break;
|
||||||
|
case Intrinsic::amdgcn_atomic_fmin:
|
||||||
|
Opc = AMDGPUISD::ATOMIC_LOAD_FMIN;
|
||||||
|
break;
|
||||||
|
case Intrinsic::amdgcn_atomic_fmax:
|
||||||
|
Opc = AMDGPUISD::ATOMIC_LOAD_FMAX;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
llvm_unreachable("Unknown intrinsic!");
|
||||||
|
}
|
||||||
SDValue Ops[] = {
|
SDValue Ops[] = {
|
||||||
M->getOperand(0), // Chain
|
M->getOperand(0), // Chain
|
||||||
M->getOperand(2), // Ptr
|
M->getOperand(2), // Ptr
|
||||||
|
@ -6817,7 +6844,10 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
|
||||||
case ISD::ATOMIC_LOAD_UMIN:
|
case ISD::ATOMIC_LOAD_UMIN:
|
||||||
case ISD::ATOMIC_LOAD_UMAX:
|
case ISD::ATOMIC_LOAD_UMAX:
|
||||||
case AMDGPUISD::ATOMIC_INC:
|
case AMDGPUISD::ATOMIC_INC:
|
||||||
case AMDGPUISD::ATOMIC_DEC: // TODO: Target mem intrinsics.
|
case AMDGPUISD::ATOMIC_DEC:
|
||||||
|
case AMDGPUISD::ATOMIC_LOAD_FADD:
|
||||||
|
case AMDGPUISD::ATOMIC_LOAD_FMIN:
|
||||||
|
case AMDGPUISD::ATOMIC_LOAD_FMAX: // TODO: Target mem intrinsics.
|
||||||
if (DCI.isBeforeLegalize())
|
if (DCI.isBeforeLegalize())
|
||||||
break;
|
break;
|
||||||
return performMemSDNodeCombine(cast<MemSDNode>(N), DCI);
|
return performMemSDNodeCombine(cast<MemSDNode>(N), DCI);
|
||||||
|
|
|
@ -46,6 +46,22 @@ def SIatomic_dec : SDNode<"AMDGPUISD::ATOMIC_DEC", SDTAtomic2,
|
||||||
[SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
|
[SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
|
||||||
>;
|
>;
|
||||||
|
|
||||||
|
def SDTAtomic2_f32 : SDTypeProfile<1, 2, [
|
||||||
|
SDTCisSameAs<0,2>, SDTCisFP<0>, SDTCisPtrTy<1>
|
||||||
|
]>;
|
||||||
|
|
||||||
|
def SIatomic_fadd : SDNode<"AMDGPUISD::ATOMIC_LOAD_FADD", SDTAtomic2_f32,
|
||||||
|
[SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
|
||||||
|
>;
|
||||||
|
|
||||||
|
def SIatomic_fmin : SDNode<"AMDGPUISD::ATOMIC_LOAD_FMIN", SDTAtomic2_f32,
|
||||||
|
[SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
|
||||||
|
>;
|
||||||
|
|
||||||
|
def SIatomic_fmax : SDNode<"AMDGPUISD::ATOMIC_LOAD_FMAX", SDTAtomic2_f32,
|
||||||
|
[SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
|
||||||
|
>;
|
||||||
|
|
||||||
def SDTbuffer_load : SDTypeProfile<1, 9,
|
def SDTbuffer_load : SDTypeProfile<1, 9,
|
||||||
[ // vdata
|
[ // vdata
|
||||||
SDTCisVT<1, v4i32>, // rsrc
|
SDTCisVT<1, v4i32>, // rsrc
|
||||||
|
@ -207,6 +223,9 @@ defm atomic_dec_global : global_binary_atomic_op<SIatomic_dec>;
|
||||||
|
|
||||||
def atomic_inc_local : local_binary_atomic_op<SIatomic_inc>;
|
def atomic_inc_local : local_binary_atomic_op<SIatomic_inc>;
|
||||||
def atomic_dec_local : local_binary_atomic_op<SIatomic_dec>;
|
def atomic_dec_local : local_binary_atomic_op<SIatomic_dec>;
|
||||||
|
def atomic_load_fadd_local : local_binary_atomic_op<SIatomic_fadd>;
|
||||||
|
def atomic_load_fmin_local : local_binary_atomic_op<SIatomic_fmin>;
|
||||||
|
def atomic_load_fmax_local : local_binary_atomic_op<SIatomic_fmax>;
|
||||||
|
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
// SDNodes PatFrags for loads/stores with a glue input.
|
// SDNodes PatFrags for loads/stores with a glue input.
|
||||||
|
@ -341,10 +360,11 @@ def lshl_rev : PatFrag <
|
||||||
(shl $src0, $src1)
|
(shl $src0, $src1)
|
||||||
>;
|
>;
|
||||||
|
|
||||||
multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0> {
|
multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0,
|
||||||
|
SDTypeProfile tc = SDTAtomic2> {
|
||||||
|
|
||||||
def _glue : SDNode <
|
def _glue : SDNode <
|
||||||
!if(is_amdgpu, "AMDGPUISD", "ISD")#"::ATOMIC_"#op_name, SDTAtomic2,
|
!if(is_amdgpu, "AMDGPUISD", "ISD")#"::ATOMIC_"#op_name, tc,
|
||||||
[SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
|
[SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
|
||||||
>;
|
>;
|
||||||
|
|
||||||
|
@ -363,6 +383,9 @@ defm atomic_load_xor : SIAtomicM0Glue2 <"LOAD_XOR">;
|
||||||
defm atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">;
|
defm atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">;
|
||||||
defm atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">;
|
defm atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">;
|
||||||
defm atomic_swap : SIAtomicM0Glue2 <"SWAP">;
|
defm atomic_swap : SIAtomicM0Glue2 <"SWAP">;
|
||||||
|
defm atomic_load_fadd : SIAtomicM0Glue2 <"LOAD_FADD", 1, SDTAtomic2_f32>;
|
||||||
|
defm atomic_load_fmin : SIAtomicM0Glue2 <"LOAD_FMIN", 1, SDTAtomic2_f32>;
|
||||||
|
defm atomic_load_fmax : SIAtomicM0Glue2 <"LOAD_FMAX", 1, SDTAtomic2_f32>;
|
||||||
|
|
||||||
def atomic_cmp_swap_glue : SDNode <"ISD::ATOMIC_CMP_SWAP", SDTAtomic3,
|
def atomic_cmp_swap_glue : SDNode <"ISD::ATOMIC_CMP_SWAP", SDTAtomic3,
|
||||||
[SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
|
[SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
|
||||||
|
|
|
@ -260,7 +260,10 @@ bool InferAddressSpaces::rewriteIntrinsicOperands(IntrinsicInst *II,
|
||||||
|
|
||||||
switch (II->getIntrinsicID()) {
|
switch (II->getIntrinsicID()) {
|
||||||
case Intrinsic::amdgcn_atomic_inc:
|
case Intrinsic::amdgcn_atomic_inc:
|
||||||
case Intrinsic::amdgcn_atomic_dec:{
|
case Intrinsic::amdgcn_atomic_dec:
|
||||||
|
case Intrinsic::amdgcn_atomic_fadd:
|
||||||
|
case Intrinsic::amdgcn_atomic_fmin:
|
||||||
|
case Intrinsic::amdgcn_atomic_fmax: {
|
||||||
const ConstantInt *IsVolatile = dyn_cast<ConstantInt>(II->getArgOperand(4));
|
const ConstantInt *IsVolatile = dyn_cast<ConstantInt>(II->getArgOperand(4));
|
||||||
if (!IsVolatile || !IsVolatile->isZero())
|
if (!IsVolatile || !IsVolatile->isZero())
|
||||||
return false;
|
return false;
|
||||||
|
@ -289,6 +292,9 @@ void InferAddressSpaces::collectRewritableIntrinsicOperands(
|
||||||
case Intrinsic::objectsize:
|
case Intrinsic::objectsize:
|
||||||
case Intrinsic::amdgcn_atomic_inc:
|
case Intrinsic::amdgcn_atomic_inc:
|
||||||
case Intrinsic::amdgcn_atomic_dec:
|
case Intrinsic::amdgcn_atomic_dec:
|
||||||
|
case Intrinsic::amdgcn_atomic_fadd:
|
||||||
|
case Intrinsic::amdgcn_atomic_fmin:
|
||||||
|
case Intrinsic::amdgcn_atomic_fmax:
|
||||||
appendsFlatAddressExpressionToPostorderStack(II->getArgOperand(0),
|
appendsFlatAddressExpressionToPostorderStack(II->getArgOperand(0),
|
||||||
PostorderStack, Visited);
|
PostorderStack, Visited);
|
||||||
break;
|
break;
|
||||||
|
|
|
@ -0,0 +1,69 @@
|
||||||
|
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
|
||||||
|
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
|
||||||
|
|
||||||
|
declare float @llvm.amdgcn.atomic.fadd.f32(float addrspace(3)* nocapture, float, i32, i32, i1)
|
||||||
|
declare float @llvm.amdgcn.atomic.fmin.f32(float addrspace(3)* nocapture, float, i32, i32, i1)
|
||||||
|
declare float @llvm.amdgcn.atomic.fmax.f32(float addrspace(3)* nocapture, float, i32, i32, i1)
|
||||||
|
|
||||||
|
; GCN-LABEL: {{^}}lds_atomic_fadd_f32:
|
||||||
|
; VI-DAG: s_mov_b32 m0
|
||||||
|
; GFX9-NOT: m0
|
||||||
|
; GCN-DAG: v_mov_b32_e32 [[V0:v[0-9]+]], 0x42280000
|
||||||
|
; GCN: ds_add_rtn_f32 [[V2:v[0-9]+]], [[V1:v[0-9]+]], [[V0]] offset:32
|
||||||
|
; GCN: ds_add_f32 [[V3:v[0-9]+]], [[V0]] offset:64
|
||||||
|
; GCN: s_waitcnt lgkmcnt(1)
|
||||||
|
; GCN: ds_add_rtn_f32 {{v[0-9]+}}, {{v[0-9]+}}, [[V2]]
|
||||||
|
define amdgpu_kernel void @lds_atomic_fadd_f32(float addrspace(1)* %out, float addrspace(3)* %ptrf, i32 %idx) {
|
||||||
|
%idx.add = add nuw i32 %idx, 4
|
||||||
|
%shl0 = shl i32 %idx.add, 3
|
||||||
|
%shl1 = shl i32 %idx.add, 4
|
||||||
|
%ptr0 = inttoptr i32 %shl0 to float addrspace(3)*
|
||||||
|
%ptr1 = inttoptr i32 %shl1 to float addrspace(3)*
|
||||||
|
%a1 = call float @llvm.amdgcn.atomic.fadd.f32(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false)
|
||||||
|
%a2 = call float @llvm.amdgcn.atomic.fadd.f32(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false)
|
||||||
|
%a3 = call float @llvm.amdgcn.atomic.fadd.f32(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false)
|
||||||
|
store float %a3, float addrspace(1)* %out
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; GCN-LABEL: {{^}}lds_atomic_fmin_f32:
|
||||||
|
; VI-DAG: s_mov_b32 m0
|
||||||
|
; GFX9-NOT: m0
|
||||||
|
; GCN-DAG: v_mov_b32_e32 [[V0:v[0-9]+]], 0x42280000
|
||||||
|
; GCN: ds_min_rtn_f32 [[V2:v[0-9]+]], [[V1:v[0-9]+]], [[V0]] offset:32
|
||||||
|
; GCN: ds_min_f32 [[V3:v[0-9]+]], [[V0]] offset:64
|
||||||
|
; GCN: s_waitcnt lgkmcnt(1)
|
||||||
|
; GCN: ds_min_rtn_f32 {{v[0-9]+}}, {{v[0-9]+}}, [[V2]]
|
||||||
|
define amdgpu_kernel void @lds_atomic_fmin_f32(float addrspace(1)* %out, float addrspace(3)* %ptrf, i32 %idx) {
|
||||||
|
%idx.add = add nuw i32 %idx, 4
|
||||||
|
%shl0 = shl i32 %idx.add, 3
|
||||||
|
%shl1 = shl i32 %idx.add, 4
|
||||||
|
%ptr0 = inttoptr i32 %shl0 to float addrspace(3)*
|
||||||
|
%ptr1 = inttoptr i32 %shl1 to float addrspace(3)*
|
||||||
|
%a1 = call float @llvm.amdgcn.atomic.fmin.f32(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false)
|
||||||
|
%a2 = call float @llvm.amdgcn.atomic.fmin.f32(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false)
|
||||||
|
%a3 = call float @llvm.amdgcn.atomic.fmin.f32(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false)
|
||||||
|
store float %a3, float addrspace(1)* %out
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; GCN-LABEL: {{^}}lds_atomic_fmax_f32:
|
||||||
|
; VI-DAG: s_mov_b32 m0
|
||||||
|
; GFX9-NOT: m0
|
||||||
|
; GCN-DAG: v_mov_b32_e32 [[V0:v[0-9]+]], 0x42280000
|
||||||
|
; GCN: ds_max_rtn_f32 [[V2:v[0-9]+]], [[V1:v[0-9]+]], [[V0]] offset:32
|
||||||
|
; GCN: ds_max_f32 [[V3:v[0-9]+]], [[V0]] offset:64
|
||||||
|
; GCN: s_waitcnt lgkmcnt(1)
|
||||||
|
; GCN: ds_max_rtn_f32 {{v[0-9]+}}, {{v[0-9]+}}, [[V2]]
|
||||||
|
define amdgpu_kernel void @lds_atomic_fmax_f32(float addrspace(1)* %out, float addrspace(3)* %ptrf, i32 %idx) {
|
||||||
|
%idx.add = add nuw i32 %idx, 4
|
||||||
|
%shl0 = shl i32 %idx.add, 3
|
||||||
|
%shl1 = shl i32 %idx.add, 4
|
||||||
|
%ptr0 = inttoptr i32 %shl0 to float addrspace(3)*
|
||||||
|
%ptr1 = inttoptr i32 %shl1 to float addrspace(3)*
|
||||||
|
%a1 = call float @llvm.amdgcn.atomic.fmax.f32(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false)
|
||||||
|
%a2 = call float @llvm.amdgcn.atomic.fmax.f32(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false)
|
||||||
|
%a3 = call float @llvm.amdgcn.atomic.fmax.f32(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false)
|
||||||
|
store float %a3, float addrspace(1)* %out
|
||||||
|
ret void
|
||||||
|
}
|
Loading…
Reference in New Issue