forked from OSchip/llvm-project
[AMDGPU] Allow -amdgpu-unsafe-fp-atomics to ignore denorm mode
Fixes: SWDEV-274276
Differential Revision: https://reviews.llvm.org/D100072
This commit is contained in:
parent
d508561798
commit
189310a140
|
@ -2997,6 +2997,10 @@ Enable threadgroup split execution mode (AMDGPU only)
|
|||
|
||||
Specify XNACK mode (AMDGPU only)
|
||||
|
||||
.. option:: -munsafe-fp-atomics, -mno-unsafe-fp-atomics
|
||||
|
||||
Enable generation of unsafe floating-point atomic instructions. This may generate more efficient code, but the resulting instructions may not respect rounding and denormal modes, and may give incorrect results for certain memory destinations. (AMDGPU only)
|
||||
|
||||
ARM
|
||||
---
|
||||
.. option:: -faapcs-bitfield-load
|
||||
|
|
|
@ -12055,9 +12055,14 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
|
|||
|
||||
if ((AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) &&
|
||||
Subtarget->hasAtomicFaddInsts()) {
|
||||
if (!fpModeMatchesGlobalFPAtomicMode(RMW) ||
|
||||
RMW->getFunction()->getFnAttribute("amdgpu-unsafe-fp-atomics")
|
||||
.getValueAsString() != "true")
|
||||
// The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
|
||||
// floating point atomic instructions. May generate more efficient code,
|
||||
// but may not respect rounding and denormal modes, and may give incorrect
|
||||
// results for certain memory destinations.
|
||||
if (!fpModeMatchesGlobalFPAtomicMode(RMW) &&
|
||||
RMW->getFunction()
|
||||
->getFnAttribute("amdgpu-unsafe-fp-atomics")
|
||||
.getValueAsString() != "true")
|
||||
return AtomicExpansionKind::CmpXChg;
|
||||
|
||||
if (Subtarget->hasGFX90AInsts()) {
|
||||
|
|
|
@ -490,25 +490,13 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(double addrspa
|
|||
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_flush:
|
||||
; GFX90A: ; %bb.0: ; %main_body
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX90A-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
|
||||
; GFX90A-NEXT: BB27_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
|
||||
; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
||||
; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
||||
; GFX90A-NEXT: s_cbranch_execnz BB27_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_endpgm
|
||||
main_body:
|
||||
%ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 syncscope("agent") seq_cst
|
||||
|
|
|
@ -171,26 +171,12 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_ieee(float addrspace(1)* %
|
|||
; GFX90A-LABEL: global_atomic_fadd_ret_f32_ieee:
|
||||
; GFX90A: ; %bb.0:
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX90A-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX90A-NEXT: BB1_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX90A-NEXT: v_add_f32_e32 v0, 4.0, v1
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
|
||||
; GFX90A-NEXT: global_atomic_add_f32 v0, v0, v1, s[0:1] glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
||||
; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
||||
; GFX90A-NEXT: s_cbranch_execnz BB1_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3]
|
||||
; GFX90A-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX90A-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -330,49 +316,23 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_ieee(float addrspace(1)*
|
|||
; GFX908-LABEL: global_atomic_fadd_noret_f32_ieee:
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX908-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX908-NEXT: s_load_dword s4, s[0:1], 0x0
|
||||
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX908-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX908-NEXT: BB3_1: ; %atomicrmw.start
|
||||
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX908-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX908-NEXT: v_add_f32_e32 v0, 4.0, v1
|
||||
; GFX908-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX908-NEXT: v_mov_b32_e32 v1, 4.0
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX908-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
|
||||
; GFX908-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX908-NEXT: buffer_wbinvl1_vol
|
||||
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
||||
; GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
||||
; GFX908-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
||||
; GFX908-NEXT: s_cbranch_execnz BB3_1
|
||||
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX908-NEXT: s_endpgm
|
||||
;
|
||||
; GFX90A-LABEL: global_atomic_fadd_noret_f32_ieee:
|
||||
; GFX90A: ; %bb.0:
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX90A-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX90A-NEXT: BB3_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX90A-NEXT: v_add_f32_e32 v0, 4.0, v1
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
|
||||
; GFX90A-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
||||
; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
||||
; GFX90A-NEXT: s_cbranch_execnz BB3_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: global_atomic_fadd_noret_f32_ieee:
|
||||
|
|
Loading…
Reference in New Issue