forked from OSchip/llvm-project
AMDGPU/GlobalISel: Fix selection of gfx90a FP atomics
The struct/raw forms of the buffer atomics now work as expected. However, we are still incorrectly handling the legacy form (which we probably shouldn't handle at all), and we are not yet diagnosing use of the return value on gfx908. Both issues will be addressed separately.
This commit is contained in:
parent
89c447e4e6
commit
8ff3c9e0be
|
@ -1296,6 +1296,18 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
|
|||
if (ST.hasAtomicFaddInsts())
|
||||
Atomic.legalFor({{S32, GlobalPtr}});
|
||||
|
||||
if (ST.hasGFX90AInsts()) {
|
||||
// These are legal with some caveats, and should have undergone expansion in
|
||||
// the IR in most situations
|
||||
// TODO: Move atomic expansion into legalizer
|
||||
// TODO: Also supports <2 x f16>
|
||||
Atomic.legalFor({
|
||||
{S32, GlobalPtr},
|
||||
{S64, GlobalPtr},
|
||||
{S64, FlatPtr}
|
||||
});
|
||||
}
|
||||
|
||||
// BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
|
||||
// demarshalling
|
||||
getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
|
||||
|
|
|
@ -915,7 +915,7 @@ class FlatSignedAtomicPatNoRtn <FLAT_Pseudo inst, SDPatternOperator node, ValueT
|
|||
class FlatSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt,
|
||||
ValueType data_vt = vt> : GCNPat <
|
||||
(vt (node (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)),
|
||||
(inst $vaddr, $data, $offset)
|
||||
(inst VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)
|
||||
>;
|
||||
|
||||
class ScratchLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -global-isel < %s -march=amdgcn -mcpu=gfx90a -verify-machineinstrs | FileCheck %s -check-prefix=GFX90A
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX90A
|
||||
|
||||
declare double @llvm.amdgcn.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i1)
|
||||
declare double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i32, i32 immarg)
|
||||
|
@ -408,6 +408,102 @@ main_body:
|
|||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(double addrspace(1)* %ptr) #1 {
|
||||
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat:
|
||||
; GFX90A: ; %bb.0: ; %main_body
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX90A-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
|
||||
; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
|
||||
; GFX90A-NEXT: buffer_wbl2
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_invl2
|
||||
; GFX90A-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
||||
; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB24_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_endpgm
|
||||
main_body:
|
||||
%ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 seq_cst
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(double addrspace(1)* %ptr) #1 {
|
||||
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent:
|
||||
; GFX90A: ; %bb.0: ; %main_body
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v2, v[0:1], s[0:1] glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NEXT: s_endpgm
|
||||
main_body:
|
||||
%ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 syncscope("agent") seq_cst
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(double addrspace(1)* %ptr) #1 {
|
||||
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_system:
|
||||
; GFX90A: ; %bb.0: ; %main_body
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX90A-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
|
||||
; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
|
||||
; GFX90A-NEXT: buffer_wbl2
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_invl2
|
||||
; GFX90A-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
||||
; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB26_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_endpgm
|
||||
main_body:
|
||||
%ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 syncscope("one-as") seq_cst
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(double addrspace(1)* %ptr) #0 {
|
||||
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_flush:
|
||||
; GFX90A: ; %bb.0: ; %main_body
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v2, v[0:1], s[0:1] glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NEXT: s_endpgm
|
||||
main_body:
|
||||
%ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 syncscope("agent") seq_cst
|
||||
ret void
|
||||
}
|
||||
|
||||
define double @global_atomic_fadd_f64_rtn(double addrspace(1)* %ptr, double %data) {
|
||||
; GFX90A-LABEL: global_atomic_fadd_f64_rtn:
|
||||
; GFX90A: ; %bb.0: ; %main_body
|
||||
|
@ -420,6 +516,84 @@ main_body:
|
|||
ret double %ret
|
||||
}
|
||||
|
||||
define double @global_atomic_fadd_f64_rtn_pat(double addrspace(1)* %ptr, double %data) #1 {
|
||||
; GFX90A-LABEL: global_atomic_fadd_f64_rtn_pat:
|
||||
; GFX90A: ; %bb.0: ; %main_body
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
|
||||
; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0
|
||||
; GFX90A-NEXT: buffer_wbl2
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_invl2
|
||||
; GFX90A-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB29_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, v3
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
main_body:
|
||||
%ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 seq_cst
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
define double @global_atomic_fadd_f64_rtn_pat_agent(double addrspace(1)* %ptr, double %data) #1 {
|
||||
; GFX90A-LABEL: global_atomic_fadd_f64_rtn_pat_agent:
|
||||
; GFX90A: ; %bb.0: ; %main_body
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
main_body:
|
||||
%ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 syncscope("agent") seq_cst
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
define double @global_atomic_fadd_f64_rtn_pat_system(double addrspace(1)* %ptr, double %data) #1 {
|
||||
; GFX90A-LABEL: global_atomic_fadd_f64_rtn_pat_system:
|
||||
; GFX90A: ; %bb.0: ; %main_body
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
|
||||
; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0
|
||||
; GFX90A-NEXT: buffer_wbl2
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_invl2
|
||||
; GFX90A-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB31_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, v3
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
main_body:
|
||||
%ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 syncscope("one-as") seq_cst
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
define double @global_atomic_fmax_f64_rtn(double addrspace(1)* %ptr, double %data) {
|
||||
; GFX90A-LABEL: global_atomic_fmax_f64_rtn:
|
||||
; GFX90A: ; %bb.0: ; %main_body
|
||||
|
@ -444,6 +618,195 @@ main_body:
|
|||
ret double %ret
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(double addrspace(1)* %ptr) {
|
||||
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe:
|
||||
; GFX90A: ; %bb.0: ; %main_body
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX90A-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
|
||||
; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
||||
; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB34_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_endpgm
|
||||
main_body:
|
||||
%ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 syncscope("agent") seq_cst
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(double* %ptr) #1 {
|
||||
; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat:
|
||||
; GFX90A: ; %bb.0: ; %main_body
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX90A-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
||||
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
|
||||
; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1]
|
||||
; GFX90A-NEXT: buffer_wbl2
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_invl2
|
||||
; GFX90A-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
||||
; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB35_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_endpgm
|
||||
main_body:
|
||||
%ret = atomicrmw fadd double* %ptr, double 4.0 seq_cst
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(double* %ptr) #1 {
|
||||
; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent:
|
||||
; GFX90A: ; %bb.0: ; %main_body
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NEXT: s_endpgm
|
||||
main_body:
|
||||
%ret = atomicrmw fadd double* %ptr, double 4.0 syncscope("agent") seq_cst
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(double* %ptr) #1 {
|
||||
; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_system:
|
||||
; GFX90A: ; %bb.0: ; %main_body
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX90A-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
||||
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
|
||||
; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1]
|
||||
; GFX90A-NEXT: buffer_wbl2
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_invl2
|
||||
; GFX90A-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
||||
; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB37_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_endpgm
|
||||
main_body:
|
||||
%ret = atomicrmw fadd double* %ptr, double 4.0 syncscope("one-as") seq_cst
|
||||
ret void
|
||||
}
|
||||
|
||||
define double @flat_atomic_fadd_f64_rtn_pat(double* %ptr) #1 {
|
||||
; GFX90A-LABEL: flat_atomic_fadd_f64_rtn_pat:
|
||||
; GFX90A: ; %bb.0: ; %main_body
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
|
||||
; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0
|
||||
; GFX90A-NEXT: buffer_wbl2
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_invl2
|
||||
; GFX90A-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB38_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, v3
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
main_body:
|
||||
%ret = atomicrmw fadd double* %ptr, double 4.0 seq_cst
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
define double @flat_atomic_fadd_f64_rtn_pat_agent(double* %ptr) #1 {
|
||||
; GFX90A-LABEL: flat_atomic_fadd_f64_rtn_pat_agent:
|
||||
; GFX90A: ; %bb.0: ; %main_body
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
main_body:
|
||||
%ret = atomicrmw fadd double* %ptr, double 4.0 syncscope("agent") seq_cst
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
define double @flat_atomic_fadd_f64_rtn_pat_system(double* %ptr) #1 {
|
||||
; GFX90A-LABEL: flat_atomic_fadd_f64_rtn_pat_system:
|
||||
; GFX90A: ; %bb.0: ; %main_body
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX90A-NEXT: .LBB40_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
|
||||
; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0
|
||||
; GFX90A-NEXT: buffer_wbl2
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_invl2
|
||||
; GFX90A-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB40_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, v3
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
main_body:
|
||||
%ret = atomicrmw fadd double* %ptr, double 4.0 syncscope("one-as") seq_cst
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @flat_atomic_fadd_f64_noret(double* %ptr, double %data) {
|
||||
; GFX90A-LABEL: flat_atomic_fadd_f64_noret:
|
||||
; GFX90A: ; %bb.0: ; %main_body
|
||||
|
@ -470,6 +833,35 @@ main_body:
|
|||
ret double %ret
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(double* %ptr) {
|
||||
; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe:
|
||||
; GFX90A: ; %bb.0: ; %main_body
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX90A-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
||||
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
|
||||
; GFX90A-NEXT: .LBB43_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1]
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
||||
; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB43_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_endpgm
|
||||
main_body:
|
||||
%ret = atomicrmw fadd double* %ptr, double 4.0 syncscope("agent") seq_cst
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @flat_atomic_fmin_f64_noret(double* %ptr, double %data) {
|
||||
; GFX90A-LABEL: flat_atomic_fmin_f64_noret:
|
||||
; GFX90A: ; %bb.0: ; %main_body
|
||||
|
@ -551,7 +943,7 @@ main_body:
|
|||
ret double %ret
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(double addrspace(3)* %ptr) {
|
||||
define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(double addrspace(3)* %ptr) #1 {
|
||||
; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat:
|
||||
; GFX90A: ; %bb.0: ; %main_body
|
||||
; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24
|
||||
|
@ -568,7 +960,52 @@ main_body:
|
|||
ret void
|
||||
}
|
||||
|
||||
define double @local_atomic_fadd_f64_rtn_pat(double addrspace(3)* %ptr, double %data) {
|
||||
define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(double addrspace(3)* %ptr) #0 {
|
||||
; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush:
|
||||
; GFX90A: ; %bb.0: ; %main_body
|
||||
; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v2, v[0:1]
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: s_endpgm
|
||||
main_body:
|
||||
%ret = atomicrmw fadd double addrspace(3)* %ptr, double 4.0 seq_cst
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(double addrspace(3)* %ptr) #4 {
|
||||
; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe:
|
||||
; GFX90A: ; %bb.0: ; %main_body
|
||||
; GFX90A-NEXT: s_load_dword s2, s[0:1], 0x24
|
||||
; GFX90A-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX90A-NEXT: ds_read_b64 v[0:1], v0
|
||||
; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], 4.0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v4, s2
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: ds_cmpst_rtn_b64 v[2:3], v4, v[0:1], v[2:3]
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[0:1]
|
||||
; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[0,1]
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
||||
; GFX90A-NEXT: s_cbranch_execnz .LBB52_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_endpgm
|
||||
main_body:
|
||||
%ret = atomicrmw fadd double addrspace(3)* %ptr, double 4.0 seq_cst
|
||||
ret void
|
||||
}
|
||||
|
||||
define double @local_atomic_fadd_f64_rtn_pat(double addrspace(3)* %ptr, double %data) #1 {
|
||||
; GFX90A-LABEL: local_atomic_fadd_f64_rtn_pat:
|
||||
; GFX90A: ; %bb.0: ; %main_body
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
|
@ -582,3 +1019,37 @@ main_body:
|
|||
%ret = atomicrmw fadd double addrspace(3)* %ptr, double 4.0 seq_cst
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
define double @local_atomic_fadd_f64_rtn_ieee_unsafe(double addrspace(3)* %ptr, double %data) #2 {
|
||||
; GFX90A-LABEL: local_atomic_fadd_f64_rtn_ieee_unsafe:
|
||||
; GFX90A: ; %bb.0: ; %main_body
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v4, v1
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v2
|
||||
; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5]
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
main_body:
|
||||
%ret = call double @llvm.amdgcn.ds.fadd.f64(double addrspace(3)* %ptr, double %data, i32 0, i32 0, i1 0)
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
define double @local_atomic_fadd_f64_rtn_ieee_safe(double addrspace(3)* %ptr, double %data) #3 {
|
||||
; GFX90A-LABEL: local_atomic_fadd_f64_rtn_ieee_safe:
|
||||
; GFX90A: ; %bb.0: ; %main_body
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v4, v1
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v5, v2
|
||||
; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5]
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
main_body:
|
||||
%ret = call double @llvm.amdgcn.ds.fadd.f64(double addrspace(3)* %ptr, double %data, i32 0, i32 0, i1 0)
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
attributes #0 = { "denormal-fp-math"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
|
||||
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" }
|
||||
attributes #2 = { "denormal-fp-math"="ieee,ieee" "amdgpu-unsafe-fp-atomics"="true" }
|
||||
attributes #3 = { "denormal-fp-math"="ieee,ieee" }
|
||||
attributes #4 = { "denormal-fp-math"="preserve-sign,preserve-sign" }
|
||||
|
|
Loading…
Reference in New Issue