forked from OSchip/llvm-project
[AMDGPU] MachineLICM cannot hoist VALU
MachineLoop::isLoopInvariant() returns false for all VALU instructions because of their exec use. Check TII::isIgnorableUse() to allow hoisting. Unfortunately, that results in higher register consumption since MachineLICM does not adequately estimate register pressure. Therefore I think it should only be enabled after D107677, even though it does not depend on it. Differential Revision: https://reviews.llvm.org/D107859
This commit is contained in:
parent
6185835656
commit
c80d8a8cea
|
@ -18,6 +18,7 @@
|
|||
#include "llvm/CodeGen/MachineDominators.h"
|
||||
#include "llvm/CodeGen/MachineRegisterInfo.h"
|
||||
#include "llvm/CodeGen/Passes.h"
|
||||
#include "llvm/CodeGen/TargetInstrInfo.h"
|
||||
#include "llvm/CodeGen/TargetSubtargetInfo.h"
|
||||
#include "llvm/Config/llvm-config.h"
|
||||
#include "llvm/InitializePasses.h"
|
||||
|
@ -154,7 +155,9 @@ MachineLoopInfo::findLoopPreheader(MachineLoop *L, bool SpeculativePreheader,
|
|||
bool MachineLoop::isLoopInvariant(MachineInstr &I) const {
|
||||
MachineFunction *MF = I.getParent()->getParent();
|
||||
MachineRegisterInfo *MRI = &MF->getRegInfo();
|
||||
const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
|
||||
const TargetSubtargetInfo &ST = MF->getSubtarget();
|
||||
const TargetRegisterInfo *TRI = ST.getRegisterInfo();
|
||||
const TargetInstrInfo *TII = ST.getInstrInfo();
|
||||
|
||||
// The instruction is loop invariant if all of its operands are.
|
||||
for (const MachineOperand &MO : I.operands()) {
|
||||
|
@ -174,7 +177,8 @@ bool MachineLoop::isLoopInvariant(MachineInstr &I) const {
|
|||
// However, if the physreg is known to always be caller saved/restored
|
||||
// then this use is safe to hoist.
|
||||
if (!MRI->isConstantPhysReg(Reg) &&
|
||||
!(TRI->isCallerPreservedPhysReg(Reg.asMCReg(), *I.getMF())))
|
||||
!(TRI->isCallerPreservedPhysReg(Reg.asMCReg(), *I.getMF())) &&
|
||||
!TII->isIgnorableUse(MO))
|
||||
return false;
|
||||
// Otherwise it's safe to move.
|
||||
continue;
|
||||
|
|
|
@ -120,25 +120,25 @@ define protected amdgpu_kernel void @nand(i32 addrspace(1)* %p, %S addrspace(1)*
|
|||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; CHECK-NEXT: s_mov_b64 s[4:5], 0
|
||||
; CHECK-NEXT: v_mov_b32_e32 v0, 0
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: v_mov_b32_e32 v0, s6
|
||||
; CHECK-NEXT: v_mov_b32_e32 v1, s6
|
||||
; CHECK-NEXT: BB5_1: ; %atomicrmw.start
|
||||
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: v_mov_b32_e32 v1, v0
|
||||
; CHECK-NEXT: v_not_b32_e32 v0, v1
|
||||
; CHECK-NEXT: v_mov_b32_e32 v2, 0
|
||||
; CHECK-NEXT: v_or_b32_e32 v0, -2, v0
|
||||
; CHECK-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
|
||||
; CHECK-NEXT: v_mov_b32_e32 v3, v1
|
||||
; CHECK-NEXT: v_not_b32_e32 v1, v3
|
||||
; CHECK-NEXT: v_or_b32_e32 v2, -2, v1
|
||||
; CHECK-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] glc
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
||||
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
|
||||
; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; CHECK-NEXT: s_cbranch_execnz BB5_1
|
||||
; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
|
||||
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, 12, s[2:3]
|
||||
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
|
||||
; CHECK-NEXT: global_store_dword v[0:1], v2, off
|
||||
; CHECK-NEXT: s_endpgm
|
||||
|
@ -330,24 +330,24 @@ define protected amdgpu_kernel void @fadd(float addrspace(1)* %p, %S addrspace(1
|
|||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; CHECK-NEXT: s_mov_b64 s[4:5], 0
|
||||
; CHECK-NEXT: v_mov_b32_e32 v0, 0
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: v_mov_b32_e32 v0, s6
|
||||
; CHECK-NEXT: v_mov_b32_e32 v1, s6
|
||||
; CHECK-NEXT: BB14_1: ; %atomicrmw.start
|
||||
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: v_mov_b32_e32 v1, v0
|
||||
; CHECK-NEXT: v_mov_b32_e32 v2, 0
|
||||
; CHECK-NEXT: v_add_f32_e32 v0, 1.0, v1
|
||||
; CHECK-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
|
||||
; CHECK-NEXT: v_mov_b32_e32 v3, v1
|
||||
; CHECK-NEXT: v_add_f32_e32 v2, 1.0, v3
|
||||
; CHECK-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] glc
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
||||
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
|
||||
; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; CHECK-NEXT: s_cbranch_execnz BB14_1
|
||||
; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0
|
||||
; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v1
|
||||
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
|
||||
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
|
||||
; CHECK-NEXT: global_store_dword v[0:1], v2, off
|
||||
|
@ -365,24 +365,24 @@ define protected amdgpu_kernel void @fsub(float addrspace(1)* %p, %S addrspace(1
|
|||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; CHECK-NEXT: s_mov_b64 s[4:5], 0
|
||||
; CHECK-NEXT: v_mov_b32_e32 v0, 0
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: v_mov_b32_e32 v0, s6
|
||||
; CHECK-NEXT: v_mov_b32_e32 v1, s6
|
||||
; CHECK-NEXT: BB15_1: ; %atomicrmw.start
|
||||
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: v_mov_b32_e32 v1, v0
|
||||
; CHECK-NEXT: v_mov_b32_e32 v2, 0
|
||||
; CHECK-NEXT: v_add_f32_e32 v0, -1.0, v1
|
||||
; CHECK-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
|
||||
; CHECK-NEXT: v_mov_b32_e32 v3, v1
|
||||
; CHECK-NEXT: v_add_f32_e32 v2, -1.0, v3
|
||||
; CHECK-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] glc
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
||||
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
|
||||
; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; CHECK-NEXT: s_cbranch_execnz BB15_1
|
||||
; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0
|
||||
; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v1
|
||||
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
|
||||
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
|
||||
; CHECK-NEXT: global_store_dword v[0:1], v2, off
|
||||
|
|
|
@ -416,13 +416,13 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(double addrspace(1)*
|
|||
; GFX90A: ; %bb.0: ; %main_body
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX90A-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
|
||||
; GFX90A-NEXT: BB24_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
|
||||
; GFX90A-NEXT: buffer_wbl2
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
|
@ -464,13 +464,13 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(double addrsp
|
|||
; GFX90A: ; %bb.0: ; %main_body
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX90A-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
|
||||
; GFX90A-NEXT: BB26_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
|
||||
; GFX90A-NEXT: buffer_wbl2
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
|
@ -626,13 +626,13 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(double ad
|
|||
; GFX90A: ; %bb.0: ; %main_body
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX90A-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
|
||||
; GFX90A-NEXT: BB34_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
|
||||
|
|
|
@ -9,108 +9,108 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32(float addrspace(1)* %ptr)
|
|||
; GFX900: ; %bb.0:
|
||||
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX900-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX900-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0
|
||||
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX900-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX900-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX900-NEXT: BB0_1: ; %atomicrmw.start
|
||||
; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX900-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX900-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1
|
||||
; GFX900-NEXT: v_mov_b32_e32 v2, v1
|
||||
; GFX900-NEXT: v_add_f32_e32 v1, 4.0, v2
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
|
||||
; GFX900-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-NEXT: buffer_wbinvl1_vol
|
||||
; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
||||
; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
|
||||
; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
||||
; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
||||
; GFX900-NEXT: s_cbranch_execnz BB0_1
|
||||
; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX900-NEXT: s_or_b64 exec, exec, s[2:3]
|
||||
; GFX900-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX900-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX900-NEXT: s_endpgm
|
||||
;
|
||||
; GFX908-LABEL: global_atomic_fadd_ret_f32:
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX908-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX908-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX908-NEXT: s_load_dword s4, s[0:1], 0x0
|
||||
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX908-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX908-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX908-NEXT: BB0_1: ; %atomicrmw.start
|
||||
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX908-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX908-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX908-NEXT: v_add_f32_e32 v0, 4.0, v1
|
||||
; GFX908-NEXT: v_mov_b32_e32 v2, v1
|
||||
; GFX908-NEXT: v_add_f32_e32 v1, 4.0, v2
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX908-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
|
||||
; GFX908-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX908-NEXT: buffer_wbinvl1_vol
|
||||
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
||||
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
|
||||
; GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
||||
; GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
||||
; GFX908-NEXT: s_cbranch_execnz BB0_1
|
||||
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX908-NEXT: s_or_b64 exec, exec, s[2:3]
|
||||
; GFX908-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX908-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX908-NEXT: s_endpgm
|
||||
;
|
||||
; GFX90A-LABEL: global_atomic_fadd_ret_f32:
|
||||
; GFX90A: ; %bb.0:
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX90A-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX90A-NEXT: BB0_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX90A-NEXT: v_add_f32_e32 v0, 4.0, v1
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, v1
|
||||
; GFX90A-NEXT: v_add_f32_e32 v2, 4.0, v3
|
||||
; GFX90A-NEXT: buffer_wbl2
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_invl2
|
||||
; GFX90A-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
|
||||
; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
||||
; GFX90A-NEXT: s_cbranch_execnz BB0_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3]
|
||||
; GFX90A-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX90A-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX90A-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: global_atomic_fadd_ret_f32:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX10-NEXT: s_mov_b32 s2, 0
|
||||
; GFX10-NEXT: BB0_1: ; %atomicrmw.start
|
||||
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX10-NEXT: v_add_f32_e32 v0, 4.0, v1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, v1
|
||||
; GFX10-NEXT: v_add_f32_e32 v1, 4.0, v2
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
|
||||
; GFX10-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: buffer_gl0_inv
|
||||
; GFX10-NEXT: buffer_gl1_inv
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
|
||||
; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2
|
||||
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
|
||||
; GFX10-NEXT: s_cbranch_execnz BB0_1
|
||||
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2
|
||||
; GFX10-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX10-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX10-NEXT: s_endpgm
|
||||
%result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst
|
||||
store float %result, float addrspace(1)* undef
|
||||
|
@ -122,52 +122,52 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_ieee(float addrspace(1)* %
|
|||
; GFX900: ; %bb.0:
|
||||
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX900-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX900-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0
|
||||
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX900-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX900-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX900-NEXT: BB1_1: ; %atomicrmw.start
|
||||
; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX900-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX900-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1
|
||||
; GFX900-NEXT: v_mov_b32_e32 v2, v1
|
||||
; GFX900-NEXT: v_add_f32_e32 v1, 4.0, v2
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
|
||||
; GFX900-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-NEXT: buffer_wbinvl1_vol
|
||||
; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
||||
; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
|
||||
; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
||||
; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
||||
; GFX900-NEXT: s_cbranch_execnz BB1_1
|
||||
; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX900-NEXT: s_or_b64 exec, exec, s[2:3]
|
||||
; GFX900-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX900-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX900-NEXT: s_endpgm
|
||||
;
|
||||
; GFX908-LABEL: global_atomic_fadd_ret_f32_ieee:
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX908-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX908-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX908-NEXT: s_load_dword s4, s[0:1], 0x0
|
||||
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX908-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX908-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX908-NEXT: BB1_1: ; %atomicrmw.start
|
||||
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX908-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX908-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX908-NEXT: v_add_f32_e32 v0, 4.0, v1
|
||||
; GFX908-NEXT: v_mov_b32_e32 v2, v1
|
||||
; GFX908-NEXT: v_add_f32_e32 v1, 4.0, v2
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX908-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
|
||||
; GFX908-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX908-NEXT: buffer_wbinvl1_vol
|
||||
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
||||
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
|
||||
; GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
||||
; GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
||||
; GFX908-NEXT: s_cbranch_execnz BB1_1
|
||||
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX908-NEXT: s_or_b64 exec, exec, s[2:3]
|
||||
; GFX908-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX908-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX908-NEXT: s_endpgm
|
||||
;
|
||||
; GFX90A-LABEL: global_atomic_fadd_ret_f32_ieee:
|
||||
|
@ -185,29 +185,29 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_ieee(float addrspace(1)* %
|
|||
; GFX10-LABEL: global_atomic_fadd_ret_f32_ieee:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX10-NEXT: s_mov_b32 s2, 0
|
||||
; GFX10-NEXT: BB1_1: ; %atomicrmw.start
|
||||
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX10-NEXT: v_add_f32_e32 v0, 4.0, v1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, v1
|
||||
; GFX10-NEXT: v_add_f32_e32 v1, 4.0, v2
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
|
||||
; GFX10-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: buffer_gl0_inv
|
||||
; GFX10-NEXT: buffer_gl1_inv
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
|
||||
; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2
|
||||
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
|
||||
; GFX10-NEXT: s_cbranch_execnz BB1_1
|
||||
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2
|
||||
; GFX10-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX10-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX10-NEXT: s_endpgm
|
||||
%result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst
|
||||
store float %result, float addrspace(1)* undef
|
||||
|
@ -219,13 +219,13 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32(float addrspace(1)* %ptr
|
|||
; GFX900: ; %bb.0:
|
||||
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX900-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX900-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0
|
||||
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX900-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX900-NEXT: BB2_1: ; %atomicrmw.start
|
||||
; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX900-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
|
||||
|
@ -264,6 +264,7 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32(float addrspace(1)* %ptr
|
|||
; GFX10-LABEL: global_atomic_fadd_noret_f32:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
|
@ -271,7 +272,6 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32(float addrspace(1)* %ptr
|
|||
; GFX10-NEXT: s_mov_b32 s2, 0
|
||||
; GFX10-NEXT: BB2_1: ; %atomicrmw.start
|
||||
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX10-NEXT: v_add_f32_e32 v0, 4.0, v1
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
|
@ -295,13 +295,13 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_ieee(float addrspace(1)*
|
|||
; GFX900: ; %bb.0:
|
||||
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX900-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX900-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0
|
||||
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX900-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX900-NEXT: BB3_1: ; %atomicrmw.start
|
||||
; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX900-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
|
||||
|
@ -340,6 +340,7 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_ieee(float addrspace(1)*
|
|||
; GFX10-LABEL: global_atomic_fadd_noret_f32_ieee:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
|
@ -347,7 +348,6 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_ieee(float addrspace(1)*
|
|||
; GFX10-NEXT: s_mov_b32 s2, 0
|
||||
; GFX10-NEXT: BB3_1: ; %atomicrmw.start
|
||||
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX10-NEXT: v_add_f32_e32 v0, 4.0, v1
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
|
@ -371,52 +371,52 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_agent(float addrspace(1)*
|
|||
; GFX900: ; %bb.0:
|
||||
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX900-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX900-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0
|
||||
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX900-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX900-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX900-NEXT: BB4_1: ; %atomicrmw.start
|
||||
; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX900-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX900-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1
|
||||
; GFX900-NEXT: v_mov_b32_e32 v2, v1
|
||||
; GFX900-NEXT: v_add_f32_e32 v1, 4.0, v2
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
|
||||
; GFX900-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-NEXT: buffer_wbinvl1_vol
|
||||
; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
||||
; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
|
||||
; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
||||
; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
||||
; GFX900-NEXT: s_cbranch_execnz BB4_1
|
||||
; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX900-NEXT: s_or_b64 exec, exec, s[2:3]
|
||||
; GFX900-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX900-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX900-NEXT: s_endpgm
|
||||
;
|
||||
; GFX908-LABEL: global_atomic_fadd_ret_f32_agent:
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX908-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX908-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX908-NEXT: s_load_dword s4, s[0:1], 0x0
|
||||
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX908-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX908-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX908-NEXT: BB4_1: ; %atomicrmw.start
|
||||
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX908-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX908-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX908-NEXT: v_add_f32_e32 v0, 4.0, v1
|
||||
; GFX908-NEXT: v_mov_b32_e32 v2, v1
|
||||
; GFX908-NEXT: v_add_f32_e32 v1, 4.0, v2
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX908-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
|
||||
; GFX908-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX908-NEXT: buffer_wbinvl1_vol
|
||||
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
||||
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
|
||||
; GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
||||
; GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
||||
; GFX908-NEXT: s_cbranch_execnz BB4_1
|
||||
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX908-NEXT: s_or_b64 exec, exec, s[2:3]
|
||||
; GFX908-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX908-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX908-NEXT: s_endpgm
|
||||
;
|
||||
; GFX90A-LABEL: global_atomic_fadd_ret_f32_agent:
|
||||
|
@ -434,29 +434,29 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_agent(float addrspace(1)*
|
|||
; GFX10-LABEL: global_atomic_fadd_ret_f32_agent:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX10-NEXT: s_mov_b32 s2, 0
|
||||
; GFX10-NEXT: BB4_1: ; %atomicrmw.start
|
||||
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX10-NEXT: v_add_f32_e32 v0, 4.0, v1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, v1
|
||||
; GFX10-NEXT: v_add_f32_e32 v1, 4.0, v2
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
|
||||
; GFX10-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: buffer_gl0_inv
|
||||
; GFX10-NEXT: buffer_gl1_inv
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
|
||||
; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2
|
||||
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
|
||||
; GFX10-NEXT: s_cbranch_execnz BB4_1
|
||||
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2
|
||||
; GFX10-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX10-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX10-NEXT: s_endpgm
|
||||
%result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst
|
||||
store float %result, float addrspace(1)* undef
|
||||
|
@ -468,108 +468,108 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_system(float addrspace(1)*
|
|||
; GFX900: ; %bb.0:
|
||||
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX900-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX900-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0
|
||||
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX900-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX900-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX900-NEXT: BB5_1: ; %atomicrmw.start
|
||||
; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX900-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX900-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1
|
||||
; GFX900-NEXT: v_mov_b32_e32 v2, v1
|
||||
; GFX900-NEXT: v_add_f32_e32 v1, 4.0, v2
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
|
||||
; GFX900-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-NEXT: buffer_wbinvl1_vol
|
||||
; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
||||
; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
|
||||
; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
||||
; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
||||
; GFX900-NEXT: s_cbranch_execnz BB5_1
|
||||
; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX900-NEXT: s_or_b64 exec, exec, s[2:3]
|
||||
; GFX900-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX900-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX900-NEXT: s_endpgm
|
||||
;
|
||||
; GFX908-LABEL: global_atomic_fadd_ret_f32_system:
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX908-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX908-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX908-NEXT: s_load_dword s4, s[0:1], 0x0
|
||||
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX908-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX908-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX908-NEXT: BB5_1: ; %atomicrmw.start
|
||||
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX908-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX908-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX908-NEXT: v_add_f32_e32 v0, 4.0, v1
|
||||
; GFX908-NEXT: v_mov_b32_e32 v2, v1
|
||||
; GFX908-NEXT: v_add_f32_e32 v1, 4.0, v2
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX908-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
|
||||
; GFX908-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX908-NEXT: buffer_wbinvl1_vol
|
||||
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
||||
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
|
||||
; GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
||||
; GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
||||
; GFX908-NEXT: s_cbranch_execnz BB5_1
|
||||
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX908-NEXT: s_or_b64 exec, exec, s[2:3]
|
||||
; GFX908-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX908-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX908-NEXT: s_endpgm
|
||||
;
|
||||
; GFX90A-LABEL: global_atomic_fadd_ret_f32_system:
|
||||
; GFX90A: ; %bb.0:
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX90A-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX90A-NEXT: BB5_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX90A-NEXT: v_add_f32_e32 v0, 4.0, v1
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v3, v1
|
||||
; GFX90A-NEXT: v_add_f32_e32 v2, 4.0, v3
|
||||
; GFX90A-NEXT: buffer_wbl2
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: buffer_invl2
|
||||
; GFX90A-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
||||
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
|
||||
; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
||||
; GFX90A-NEXT: s_cbranch_execnz BB5_1
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3]
|
||||
; GFX90A-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX90A-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX90A-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: global_atomic_fadd_ret_f32_system:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX10-NEXT: s_mov_b32 s2, 0
|
||||
; GFX10-NEXT: BB5_1: ; %atomicrmw.start
|
||||
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX10-NEXT: v_add_f32_e32 v0, 4.0, v1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, v1
|
||||
; GFX10-NEXT: v_add_f32_e32 v1, 4.0, v2
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
|
||||
; GFX10-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: buffer_gl0_inv
|
||||
; GFX10-NEXT: buffer_gl1_inv
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
|
||||
; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2
|
||||
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
|
||||
; GFX10-NEXT: s_cbranch_execnz BB5_1
|
||||
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2
|
||||
; GFX10-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX10-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX10-NEXT: s_endpgm
|
||||
%result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("one-as") seq_cst
|
||||
store float %result, float addrspace(1)* undef
|
||||
|
@ -581,26 +581,26 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_wrong_subtarget(float addr
|
|||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GCN-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_load_dword s4, s[0:1], 0x0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GCN-NEXT: BB6_1: ; %atomicrmw.start
|
||||
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GCN-NEXT: v_add_f32_e32 v0, 4.0, v1
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, v1
|
||||
; GCN-NEXT: v_add_f32_e32 v1, 4.0, v2
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
|
||||
; GCN-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: buffer_wbinvl1_vol
|
||||
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
||||
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
|
||||
; GCN-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
||||
; GCN-NEXT: s_andn2_b64 exec, exec, s[2:3]
|
||||
; GCN-NEXT: s_cbranch_execnz BB6_1
|
||||
; GCN-NEXT: ; %bb.2: ; %atomicrmw.end
|
||||
; GCN-NEXT: s_or_b64 exec, exec, s[2:3]
|
||||
; GCN-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GCN-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GCN-NEXT: s_endpgm
|
||||
%result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst
|
||||
store float %result, float addrspace(1)* undef
|
||||
|
@ -627,13 +627,13 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_safe(float addrspace(1)*
|
|||
; GFX900: ; %bb.0:
|
||||
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX900-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX900-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0
|
||||
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX900-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX900-NEXT: BB8_1: ; %atomicrmw.start
|
||||
; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX900-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
|
||||
|
@ -651,13 +651,13 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_safe(float addrspace(1)*
|
|||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX908-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX908-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX908-NEXT: s_load_dword s4, s[0:1], 0x0
|
||||
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX908-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX908-NEXT: BB8_1: ; %atomicrmw.start
|
||||
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX908-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX908-NEXT: v_add_f32_e32 v0, 4.0, v1
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX908-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
|
||||
|
@ -675,13 +675,13 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_safe(float addrspace(1)*
|
|||
; GFX90A: ; %bb.0:
|
||||
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX90A-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x0
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX90A-NEXT: BB8_1: ; %atomicrmw.start
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX90A-NEXT: v_add_f32_e32 v0, 4.0, v1
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
|
||||
|
@ -698,6 +698,7 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_safe(float addrspace(1)*
|
|||
; GFX10-LABEL: global_atomic_fadd_noret_f32_safe:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
|
@ -705,7 +706,6 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_safe(float addrspace(1)*
|
|||
; GFX10-NEXT: s_mov_b32 s2, 0
|
||||
; GFX10-NEXT: BB8_1: ; %atomicrmw.start
|
||||
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX10-NEXT: v_add_f32_e32 v0, 4.0, v1
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
|
@ -729,6 +729,7 @@ define amdgpu_kernel void @infer_as_before_atomic(float* addrspace(4)* %arg) #0
|
|||
; GFX900: ; %bb.0:
|
||||
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX900-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX900-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
|
||||
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
|
||||
|
@ -738,7 +739,6 @@ define amdgpu_kernel void @infer_as_before_atomic(float* addrspace(4)* %arg) #0
|
|||
; GFX900-NEXT: BB9_1: ; %atomicrmw.start
|
||||
; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX900-NEXT: v_add_f32_e32 v0, 1.0, v1
|
||||
; GFX900-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
||||
|
@ -774,6 +774,7 @@ define amdgpu_kernel void @infer_as_before_atomic(float* addrspace(4)* %arg) #0
|
|||
; GFX10-LABEL: infer_as_before_atomic:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
|
@ -784,7 +785,6 @@ define amdgpu_kernel void @infer_as_before_atomic(float* addrspace(4)* %arg) #0
|
|||
; GFX10-NEXT: BB9_1: ; %atomicrmw.start
|
||||
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX10-NEXT: v_add_f32_e32 v0, 1.0, v1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX10-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
|
||||
|
|
|
@ -318,12 +318,12 @@ define void @test_indirect_call_vgpr_ptr_arg(void(i32)* %fptr) {
|
|||
; GCN-NEXT: s_mov_b64 s[38:39], s[6:7]
|
||||
; GCN-NEXT: s_mov_b64 s[40:41], s[4:5]
|
||||
; GCN-NEXT: s_mov_b64 s[46:47], exec
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, 0x7b
|
||||
; GCN-NEXT: BB3_1: ; =>This Inner Loop Header: Depth=1
|
||||
; GCN-NEXT: v_readfirstlane_b32 s16, v0
|
||||
; GCN-NEXT: v_readfirstlane_b32 s17, v1
|
||||
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1]
|
||||
; GCN-NEXT: s_and_saveexec_b64 s[48:49], vcc
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0x7b
|
||||
; GCN-NEXT: s_mov_b64 s[4:5], s[40:41]
|
||||
; GCN-NEXT: s_mov_b64 s[6:7], s[38:39]
|
||||
; GCN-NEXT: s_mov_b64 s[8:9], s[36:37]
|
||||
|
@ -331,9 +331,11 @@ define void @test_indirect_call_vgpr_ptr_arg(void(i32)* %fptr) {
|
|||
; GCN-NEXT: s_mov_b32 s12, s44
|
||||
; GCN-NEXT: s_mov_b32 s13, s43
|
||||
; GCN-NEXT: s_mov_b32 s14, s42
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
|
||||
; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1
|
||||
; GCN-NEXT: ; implicit-def: $vgpr31
|
||||
; GCN-NEXT: ; implicit-def: $vgpr2
|
||||
; GCN-NEXT: s_xor_b64 exec, exec, s[48:49]
|
||||
; GCN-NEXT: s_cbranch_execnz BB3_1
|
||||
; GCN-NEXT: ; %bb.2:
|
||||
|
|
|
@ -147,6 +147,7 @@ define amdgpu_kernel void @infinite_loop_nest_ret(i32 addrspace(1)* %out) {
|
|||
; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], 3, v0
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: v_mov_b32_e32 v0, 0x3e7
|
||||
; SI-NEXT: BB3_2: ; %outer_loop
|
||||
; SI-NEXT: ; =>This Loop Header: Depth=1
|
||||
; SI-NEXT: ; Child Loop BB3_3 Depth 2
|
||||
|
@ -156,8 +157,6 @@ define amdgpu_kernel void @infinite_loop_nest_ret(i32 addrspace(1)* %out) {
|
|||
; SI-NEXT: ; => This Inner Loop Header: Depth=2
|
||||
; SI-NEXT: s_and_b64 s[8:9], exec, s[0:1]
|
||||
; SI-NEXT: s_or_b64 s[2:3], s[8:9], s[2:3]
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v0, 0x3e7
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
|
|
|
@ -61,43 +61,44 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, float addrspace(3
|
|||
; GFX9-NEXT: s_cbranch_execz BB1_3
|
||||
; GFX9-NEXT: ; %bb.1: ; %bb19
|
||||
; GFX9-NEXT: v_cvt_f32_u32_e32 v7, v6
|
||||
; GFX9-NEXT: v_add_u32_e32 v4, v4, v0
|
||||
; GFX9-NEXT: v_and_b32_e32 v5, 0xffffff, v6
|
||||
; GFX9-NEXT: v_add_u32_e32 v6, v4, v0
|
||||
; GFX9-NEXT: v_lshl_add_u32 v3, v6, 2, v3
|
||||
; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v7
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v6, 2, v2
|
||||
; GFX9-NEXT: v_add_u32_e32 v7, v17, v12
|
||||
; GFX9-NEXT: v_lshl_add_u32 v6, v4, 2, v3
|
||||
; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v7
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v8, 2, v2
|
||||
; GFX9-NEXT: v_add_u32_e32 v9, v17, v12
|
||||
; GFX9-NEXT: s_mov_b64 s[10:11], 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, 0x3727c5ac
|
||||
; GFX9-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GFX9-NEXT: BB1_2: ; %bb23
|
||||
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v0
|
||||
; GFX9-NEXT: v_add_u32_e32 v9, v17, v0
|
||||
; GFX9-NEXT: v_add_u32_e32 v12, v7, v0
|
||||
; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v0
|
||||
; GFX9-NEXT: v_add_u32_e32 v12, v17, v0
|
||||
; GFX9-NEXT: v_add_u32_e32 v19, v9, v0
|
||||
; GFX9-NEXT: v_add_u32_e32 v0, v0, v2
|
||||
; GFX9-NEXT: v_madak_f32 v8, v8, v4, 0x3727c5ac
|
||||
; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v8
|
||||
; GFX9-NEXT: v_mul_u32_u24_e32 v18, v8, v5
|
||||
; GFX9-NEXT: v_add_u32_e32 v8, v8, v16
|
||||
; GFX9-NEXT: v_cmp_lt_u32_e64 s[4:5], v8, v13
|
||||
; GFX9-NEXT: v_mul_lo_u32 v8, v8, v15
|
||||
; GFX9-NEXT: v_sub_u32_e32 v19, v9, v18
|
||||
; GFX9-NEXT: v_cmp_lt_u32_e64 s[6:7], v19, v14
|
||||
; GFX9-NEXT: v_madak_f32 v3, v3, v7, 0x3727c5ac
|
||||
; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3
|
||||
; GFX9-NEXT: v_mul_u32_u24_e32 v18, v3, v5
|
||||
; GFX9-NEXT: v_add_u32_e32 v3, v3, v16
|
||||
; GFX9-NEXT: v_cmp_lt_u32_e64 s[4:5], v3, v13
|
||||
; GFX9-NEXT: v_mul_lo_u32 v3, v3, v15
|
||||
; GFX9-NEXT: v_sub_u32_e32 v12, v12, v18
|
||||
; GFX9-NEXT: v_cmp_lt_u32_e64 s[6:7], v12, v14
|
||||
; GFX9-NEXT: v_sub_u32_e32 v18, v19, v18
|
||||
; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
|
||||
; GFX9-NEXT: v_add_u32_e32 v8, v12, v8
|
||||
; GFX9-NEXT: v_add_u32_e32 v3, v18, v3
|
||||
; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], vcc
|
||||
; GFX9-NEXT: v_mov_b32_e32 v9, 0
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
|
||||
; GFX9-NEXT: v_lshlrev_b64 v[8:9], 2, v[8:9]
|
||||
; GFX9-NEXT: v_add_co_u32_e64 v8, s[6:7], v10, v8
|
||||
; GFX9-NEXT: v_addc_co_u32_e64 v9, s[6:7], v11, v9, s[6:7]
|
||||
; GFX9-NEXT: global_load_dword v8, v[8:9], off
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[4:5]
|
||||
; GFX9-NEXT: v_lshlrev_b64 v[18:19], 2, v[3:4]
|
||||
; GFX9-NEXT: v_add_co_u32_e64 v18, s[6:7], v10, v18
|
||||
; GFX9-NEXT: v_addc_co_u32_e64 v19, s[6:7], v11, v19, s[6:7]
|
||||
; GFX9-NEXT: global_load_dword v3, v[18:19], off
|
||||
; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v0, v1
|
||||
; GFX9-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
|
||||
; GFX9-NEXT: ds_write_b32 v3, v8
|
||||
; GFX9-NEXT: v_add_u32_e32 v3, v3, v6
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[4:5]
|
||||
; GFX9-NEXT: ds_write_b32 v6, v3
|
||||
; GFX9-NEXT: v_add_u32_e32 v6, v6, v8
|
||||
; GFX9-NEXT: s_andn2_b64 exec, exec, s[10:11]
|
||||
; GFX9-NEXT: s_cbranch_execnz BB1_2
|
||||
; GFX9-NEXT: BB1_3: ; %Flow3
|
||||
|
|
|
@ -384,22 +384,22 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
|
|||
; GCN-IR-NEXT: v_xor_b32_e32 v0, v4, v0
|
||||
; GCN-IR-NEXT: v_ashrrev_i32_e32 v5, 31, v3
|
||||
; GCN-IR-NEXT: v_xor_b32_e32 v1, v4, v1
|
||||
; GCN-IR-NEXT: v_sub_i32_e32 v9, vcc, v0, v4
|
||||
; GCN-IR-NEXT: v_subb_u32_e32 v10, vcc, v1, v4, vcc
|
||||
; GCN-IR-NEXT: v_sub_i32_e32 v11, vcc, v0, v4
|
||||
; GCN-IR-NEXT: v_subb_u32_e32 v12, vcc, v1, v4, vcc
|
||||
; GCN-IR-NEXT: v_xor_b32_e32 v1, v5, v2
|
||||
; GCN-IR-NEXT: v_xor_b32_e32 v0, v5, v3
|
||||
; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, v1, v5
|
||||
; GCN-IR-NEXT: v_subb_u32_e32 v3, vcc, v0, v5, vcc
|
||||
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
|
||||
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[9:10]
|
||||
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[11:12]
|
||||
; GCN-IR-NEXT: v_ffbh_u32_e32 v0, v2
|
||||
; GCN-IR-NEXT: s_or_b64 s[6:7], vcc, s[4:5]
|
||||
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 32, v0
|
||||
; GCN-IR-NEXT: v_ffbh_u32_e32 v7, v3
|
||||
; GCN-IR-NEXT: v_min_u32_e32 v0, v0, v7
|
||||
; GCN-IR-NEXT: v_ffbh_u32_e32 v7, v9
|
||||
; GCN-IR-NEXT: v_ffbh_u32_e32 v7, v11
|
||||
; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 32, v7
|
||||
; GCN-IR-NEXT: v_ffbh_u32_e32 v8, v10
|
||||
; GCN-IR-NEXT: v_ffbh_u32_e32 v8, v12
|
||||
; GCN-IR-NEXT: v_min_u32_e32 v13, v7, v8
|
||||
; GCN-IR-NEXT: v_sub_i32_e32 v7, vcc, v0, v13
|
||||
; GCN-IR-NEXT: v_subb_u32_e64 v8, s[4:5], 0, 0, vcc
|
||||
|
@ -410,59 +410,59 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
|
|||
; GCN-IR-NEXT: v_mov_b32_e32 v17, 0
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v6, v4
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v1, v5
|
||||
; GCN-IR-NEXT: v_cndmask_b32_e64 v12, v10, 0, s[6:7]
|
||||
; GCN-IR-NEXT: v_cndmask_b32_e64 v10, v12, 0, s[6:7]
|
||||
; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], s[4:5]
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v14, v17
|
||||
; GCN-IR-NEXT: v_cndmask_b32_e64 v11, v9, 0, s[6:7]
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v16, v17
|
||||
; GCN-IR-NEXT: v_cndmask_b32_e64 v9, v11, 0, s[6:7]
|
||||
; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
|
||||
; GCN-IR-NEXT: s_cbranch_execz BB1_6
|
||||
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
|
||||
; GCN-IR-NEXT: v_add_i32_e32 v15, vcc, 1, v7
|
||||
; GCN-IR-NEXT: v_addc_u32_e32 v16, vcc, 0, v8, vcc
|
||||
; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[15:16], v[7:8]
|
||||
; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, 1, v7
|
||||
; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, 0, v8, vcc
|
||||
; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[14:15], v[7:8]
|
||||
; GCN-IR-NEXT: v_sub_i32_e64 v7, s[4:5], 63, v7
|
||||
; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[9:10], v7
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v12, 0
|
||||
; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[11:12], v7
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
|
||||
; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
|
||||
; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
||||
; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
|
||||
; GCN-IR-NEXT: s_cbranch_execz BB1_5
|
||||
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
|
||||
; GCN-IR-NEXT: v_add_i32_e32 v19, vcc, -1, v2
|
||||
; GCN-IR-NEXT: v_addc_u32_e32 v20, vcc, -1, v3, vcc
|
||||
; GCN-IR-NEXT: v_add_i32_e32 v18, vcc, -1, v2
|
||||
; GCN-IR-NEXT: v_addc_u32_e32 v19, vcc, -1, v3, vcc
|
||||
; GCN-IR-NEXT: v_not_b32_e32 v0, v0
|
||||
; GCN-IR-NEXT: v_lshr_b64 v[15:16], v[9:10], v15
|
||||
; GCN-IR-NEXT: v_not_b32_e32 v10, v17
|
||||
; GCN-IR-NEXT: v_lshr_b64 v[14:15], v[11:12], v14
|
||||
; GCN-IR-NEXT: v_not_b32_e32 v9, v17
|
||||
; GCN-IR-NEXT: v_add_i32_e32 v11, vcc, v0, v13
|
||||
; GCN-IR-NEXT: v_addc_u32_e32 v12, vcc, v9, v16, vcc
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v16, 0
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v17, 0
|
||||
; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, v0, v13
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v18, 0
|
||||
; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, v10, v14, vcc
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
|
||||
; GCN-IR-NEXT: BB1_3: ; %udiv-do-while
|
||||
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GCN-IR-NEXT: v_lshl_b64 v[13:14], v[15:16], 1
|
||||
; GCN-IR-NEXT: v_lshl_b64 v[14:15], v[14:15], 1
|
||||
; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 31, v8
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v0, v13, v0
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v0, v14, v0
|
||||
; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1
|
||||
; GCN-IR-NEXT: v_sub_i32_e32 v11, vcc, v19, v0
|
||||
; GCN-IR-NEXT: v_subb_u32_e32 v11, vcc, v20, v14, vcc
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v7, v17, v7
|
||||
; GCN-IR-NEXT: v_add_i32_e32 v17, vcc, 1, v9
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v8, v18, v8
|
||||
; GCN-IR-NEXT: v_ashrrev_i32_e32 v13, 31, v11
|
||||
; GCN-IR-NEXT: v_addc_u32_e32 v18, vcc, 0, v10, vcc
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v12, 0
|
||||
; GCN-IR-NEXT: v_and_b32_e32 v11, 1, v13
|
||||
; GCN-IR-NEXT: v_and_b32_e32 v16, v13, v3
|
||||
; GCN-IR-NEXT: v_sub_i32_e32 v9, vcc, v18, v0
|
||||
; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, v19, v15, vcc
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v7, v16, v7
|
||||
; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, 1, v11
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v8, v17, v8
|
||||
; GCN-IR-NEXT: v_ashrrev_i32_e32 v13, 31, v9
|
||||
; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, 0, v12, vcc
|
||||
; GCN-IR-NEXT: v_and_b32_e32 v9, 1, v13
|
||||
; GCN-IR-NEXT: v_and_b32_e32 v20, v13, v3
|
||||
; GCN-IR-NEXT: v_and_b32_e32 v13, v13, v2
|
||||
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[17:18], v[9:10]
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v9, v17
|
||||
; GCN-IR-NEXT: v_sub_i32_e64 v15, s[4:5], v0, v13
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v10, v18
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v18, v12
|
||||
; GCN-IR-NEXT: v_subb_u32_e64 v16, s[4:5], v14, v16, s[4:5]
|
||||
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[16:17], v[11:12]
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v11, v16
|
||||
; GCN-IR-NEXT: v_sub_i32_e64 v14, s[4:5], v0, v13
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v12, v17
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v17, v10
|
||||
; GCN-IR-NEXT: v_subb_u32_e64 v15, s[4:5], v15, v20, s[4:5]
|
||||
; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v17, v11
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v16, v9
|
||||
; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
|
||||
; GCN-IR-NEXT: s_cbranch_execnz BB1_3
|
||||
; GCN-IR-NEXT: ; %bb.4: ; %Flow
|
||||
|
@ -470,14 +470,14 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
|
|||
; GCN-IR-NEXT: BB1_5: ; %Flow3
|
||||
; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
|
||||
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[7:8], 1
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v12, v12, v3
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v11, v11, v2
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v3
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v9, v9, v2
|
||||
; GCN-IR-NEXT: BB1_6: ; %Flow4
|
||||
; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7]
|
||||
; GCN-IR-NEXT: v_xor_b32_e32 v0, v5, v4
|
||||
; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v6
|
||||
; GCN-IR-NEXT: v_xor_b32_e32 v3, v11, v0
|
||||
; GCN-IR-NEXT: v_xor_b32_e32 v2, v12, v1
|
||||
; GCN-IR-NEXT: v_xor_b32_e32 v3, v9, v0
|
||||
; GCN-IR-NEXT: v_xor_b32_e32 v2, v10, v1
|
||||
; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v3, v0
|
||||
; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
|
||||
; GCN-IR-NEXT: s_setpc_b64 s[30:31]
|
||||
|
@ -1474,10 +1474,11 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) {
|
|||
; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v0
|
||||
; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v1, vcc
|
||||
; GCN-IR-NEXT: v_lshr_b64 v[10:11], 24, v10
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v12, 0
|
||||
; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 58, v8
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v13, 0
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v12, 0
|
||||
; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, 0, v9, vcc
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v13, 0
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
|
||||
; GCN-IR-NEXT: BB11_3: ; %udiv-do-while
|
||||
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
|
||||
|
@ -1494,7 +1495,6 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) {
|
|||
; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v8
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v5, v13, v5
|
||||
; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v9, vcc
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
|
||||
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[8:9]
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v8, v12
|
||||
; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v17
|
||||
|
@ -1647,21 +1647,21 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
|
|||
; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v0
|
||||
; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4
|
||||
; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1
|
||||
; GCN-IR-NEXT: v_min_u32_e32 v6, v4, v5
|
||||
; GCN-IR-NEXT: v_min_u32_e32 v8, v4, v5
|
||||
; GCN-IR-NEXT: s_movk_i32 s6, 0xffd0
|
||||
; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, s6, v6
|
||||
; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, s6, v8
|
||||
; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc
|
||||
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
|
||||
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5]
|
||||
; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v8, s8
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v6, s8
|
||||
; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc
|
||||
; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5]
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
|
||||
; GCN-IR-NEXT: v_cndmask_b32_e64 v8, v8, 0, s[4:5]
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
|
||||
; GCN-IR-NEXT: v_cndmask_b32_e64 v6, v6, 0, s[4:5]
|
||||
; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v9, v7
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v7, v9
|
||||
; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc
|
||||
; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
|
||||
; GCN-IR-NEXT: s_cbranch_execz BB12_6
|
||||
|
@ -1671,8 +1671,8 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
|
|||
; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[10:11], v[4:5]
|
||||
; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v4
|
||||
; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[8:9], v4
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v6, 0
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
|
||||
; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
|
||||
; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
||||
; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
|
||||
|
@ -1682,35 +1682,35 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
|
|||
; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000
|
||||
; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v1, vcc
|
||||
; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[4:5], v10
|
||||
; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 47, v8
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v12, 0
|
||||
; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v6
|
||||
; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, 0, v9, vcc
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v13, 0
|
||||
; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, 0, v7, vcc
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
|
||||
; GCN-IR-NEXT: BB12_3: ; %udiv-do-while
|
||||
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
|
||||
; GCN-IR-NEXT: v_lshrrev_b32_e32 v8, 31, v5
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v8
|
||||
; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v6
|
||||
; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
|
||||
; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, v14, v10
|
||||
; GCN-IR-NEXT: v_subb_u32_e32 v8, vcc, v15, v11, vcc
|
||||
; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v14, v10
|
||||
; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v15, v11, vcc
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v4, v12, v4
|
||||
; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v8
|
||||
; GCN-IR-NEXT: v_and_b32_e32 v8, 1, v12
|
||||
; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6
|
||||
; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12
|
||||
; GCN-IR-NEXT: v_and_b32_e32 v16, v12, v1
|
||||
; GCN-IR-NEXT: v_and_b32_e32 v17, v12, v0
|
||||
; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v6
|
||||
; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v8
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v5, v13, v5
|
||||
; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v7, vcc
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
|
||||
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[6:7]
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v6, v12
|
||||
; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v9, vcc
|
||||
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[8:9]
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v8, v12
|
||||
; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v17
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v7, v13
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v13, v9
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v9, v13
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v13, v7
|
||||
; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v16, s[4:5]
|
||||
; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v12, v8
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v12, v6
|
||||
; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
|
||||
; GCN-IR-NEXT: s_cbranch_execnz BB12_3
|
||||
; GCN-IR-NEXT: ; %bb.4: ; %Flow
|
||||
|
@ -1718,12 +1718,12 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
|
|||
; GCN-IR-NEXT: BB12_5: ; %Flow3
|
||||
; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
|
||||
; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v9, v9, v1
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v0
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v1
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v0
|
||||
; GCN-IR-NEXT: BB12_6: ; %Flow4
|
||||
; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7]
|
||||
; GCN-IR-NEXT: v_xor_b32_e32 v0, v8, v2
|
||||
; GCN-IR-NEXT: v_xor_b32_e32 v1, v9, v3
|
||||
; GCN-IR-NEXT: v_xor_b32_e32 v0, v6, v2
|
||||
; GCN-IR-NEXT: v_xor_b32_e32 v1, v7, v3
|
||||
; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
|
||||
; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
|
||||
; GCN-IR-NEXT: s_setpc_b64 s[30:31]
|
||||
|
@ -1785,6 +1785,7 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) {
|
|||
; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
|
||||
; GCN-IR-NEXT: v_addc_u32_e64 v8, s[4:5], 0, -1, vcc
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v12, 0
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v6, 0
|
||||
; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff
|
||||
; GCN-IR-NEXT: BB13_3: ; %udiv-do-while
|
||||
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
|
@ -1799,7 +1800,6 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) {
|
|||
; GCN-IR-NEXT: v_or_b32_e32 v4, v12, v4
|
||||
; GCN-IR-NEXT: v_ashrrev_i32_e32 v9, 31, v5
|
||||
; GCN-IR-NEXT: v_addc_u32_e32 v12, vcc, 0, v8, vcc
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v6, 0
|
||||
; GCN-IR-NEXT: v_and_b32_e32 v5, 1, v9
|
||||
; GCN-IR-NEXT: v_and_b32_e32 v9, 0x8000, v9
|
||||
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[11:12], v[7:8]
|
||||
|
|
|
@ -409,10 +409,11 @@ define i64 @v_test_srem(i64 %x, i64 %y) {
|
|||
; GCN-IR-NEXT: v_not_b32_e32 v3, v3
|
||||
; GCN-IR-NEXT: v_lshr_b64 v[14:15], v[0:1], v14
|
||||
; GCN-IR-NEXT: v_not_b32_e32 v9, v11
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v16, 0
|
||||
; GCN-IR-NEXT: v_add_i32_e32 v11, vcc, v3, v12
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v17, 0
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v16, 0
|
||||
; GCN-IR-NEXT: v_addc_u32_e32 v12, vcc, v9, v13, vcc
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v17, 0
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
|
||||
; GCN-IR-NEXT: BB1_3: ; %udiv-do-while
|
||||
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GCN-IR-NEXT: v_lshl_b64 v[14:15], v[14:15], 1
|
||||
|
@ -426,7 +427,6 @@ define i64 @v_test_srem(i64 %x, i64 %y) {
|
|||
; GCN-IR-NEXT: v_or_b32_e32 v8, v17, v8
|
||||
; GCN-IR-NEXT: v_ashrrev_i32_e32 v13, 31, v9
|
||||
; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, 0, v12, vcc
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
|
||||
; GCN-IR-NEXT: v_and_b32_e32 v9, 1, v13
|
||||
; GCN-IR-NEXT: v_and_b32_e32 v20, v13, v6
|
||||
; GCN-IR-NEXT: v_and_b32_e32 v13, v13, v5
|
||||
|
@ -1645,10 +1645,11 @@ define i64 @v_test_srem_k_num_i64(i64 %x) {
|
|||
; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0
|
||||
; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc
|
||||
; GCN-IR-NEXT: v_lshr_b64 v[8:9], 24, v8
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
|
||||
; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 58, v6
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
|
||||
; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, 0, v7, vcc
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
|
||||
; GCN-IR-NEXT: BB11_3: ; %udiv-do-while
|
||||
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
|
||||
|
@ -1665,7 +1666,6 @@ define i64 @v_test_srem_k_num_i64(i64 %x) {
|
|||
; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v6
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3
|
||||
; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v7, vcc
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
|
||||
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[6:7]
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v6, v10
|
||||
; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v15
|
||||
|
@ -1817,20 +1817,20 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
|
|||
; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0
|
||||
; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2
|
||||
; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1
|
||||
; GCN-IR-NEXT: v_min_u32_e32 v4, v2, v3
|
||||
; GCN-IR-NEXT: v_min_u32_e32 v6, v2, v3
|
||||
; GCN-IR-NEXT: s_movk_i32 s6, 0xffd0
|
||||
; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v4
|
||||
; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v6
|
||||
; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc
|
||||
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
|
||||
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3]
|
||||
; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v6, s8
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v4, s8
|
||||
; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc
|
||||
; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[2:3]
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
|
||||
; GCN-IR-NEXT: v_cndmask_b32_e64 v6, v6, 0, s[4:5]
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
|
||||
; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5]
|
||||
; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v7, v5
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v5, v7
|
||||
; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc
|
||||
; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
|
||||
; GCN-IR-NEXT: s_cbranch_execz BB12_6
|
||||
|
@ -1840,8 +1840,8 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
|
|||
; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[8:9], v[2:3]
|
||||
; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2
|
||||
; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v6, 0
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
|
||||
; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
|
||||
; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
||||
; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
|
||||
|
@ -1851,35 +1851,35 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
|
|||
; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000
|
||||
; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc
|
||||
; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v8
|
||||
; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v6
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
|
||||
; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 47, v4
|
||||
; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, 0, v7, vcc
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
|
||||
; GCN-IR-NEXT: v_subb_u32_e32 v5, vcc, 0, v5, vcc
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
|
||||
; GCN-IR-NEXT: BB12_3: ; %udiv-do-while
|
||||
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
|
||||
; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v3
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v6
|
||||
; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4
|
||||
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
|
||||
; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v12, v8
|
||||
; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v13, v9, vcc
|
||||
; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v8
|
||||
; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v9, vcc
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2
|
||||
; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v6
|
||||
; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v10
|
||||
; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4
|
||||
; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10
|
||||
; GCN-IR-NEXT: v_and_b32_e32 v14, v10, v1
|
||||
; GCN-IR-NEXT: v_and_b32_e32 v15, v10, v0
|
||||
; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v4
|
||||
; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v6
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3
|
||||
; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
|
||||
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[4:5]
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v4, v10
|
||||
; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v7, vcc
|
||||
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[6:7]
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v6, v10
|
||||
; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v15
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v5, v11
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v11, v7
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v7, v11
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v11, v5
|
||||
; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v14, s[4:5]
|
||||
; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v10, v6
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v10, v4
|
||||
; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
|
||||
; GCN-IR-NEXT: s_cbranch_execnz BB12_3
|
||||
; GCN-IR-NEXT: ; %bb.4: ; %Flow
|
||||
|
@ -1887,14 +1887,14 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
|
|||
; GCN-IR-NEXT: BB12_5: ; %Flow3
|
||||
; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
|
||||
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v3
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v2
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2
|
||||
; GCN-IR-NEXT: BB12_6: ; %Flow4
|
||||
; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7]
|
||||
; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v7
|
||||
; GCN-IR-NEXT: v_mul_hi_u32 v3, v0, v6
|
||||
; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v6
|
||||
; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v6
|
||||
; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v5
|
||||
; GCN-IR-NEXT: v_mul_hi_u32 v3, v0, v4
|
||||
; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v4
|
||||
; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v4
|
||||
; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v3, v2
|
||||
; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1
|
||||
; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 0x8000, v0
|
||||
|
@ -1960,6 +1960,7 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) {
|
|||
; GCN-IR-NEXT: v_mov_b32_e32 v12, 0
|
||||
; GCN-IR-NEXT: v_addc_u32_e64 v9, s[4:5], 0, -1, vcc
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v13, 0
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
|
||||
; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff
|
||||
; GCN-IR-NEXT: BB13_3: ; %udiv-do-while
|
||||
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
|
@ -1976,7 +1977,6 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) {
|
|||
; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v8
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v5, v13, v5
|
||||
; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v9, vcc
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
|
||||
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[8:9]
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v8, v12
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v14, 0
|
||||
|
|
|
@ -375,10 +375,11 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) {
|
|||
; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v3, vcc
|
||||
; GCN-IR-NEXT: v_not_b32_e32 v0, v8
|
||||
; GCN-IR-NEXT: v_not_b32_e32 v1, v9
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v10
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, v1, v11, vcc
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
|
||||
; GCN-IR-NEXT: BB1_3: ; %udiv-do-while
|
||||
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[12:13], 1
|
||||
|
@ -395,7 +396,6 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) {
|
|||
; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v0
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v5, v9, v5
|
||||
; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
|
||||
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[0:1]
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v0, v8
|
||||
; GCN-IR-NEXT: v_sub_i32_e64 v12, s[4:5], v10, v12
|
||||
|
@ -1215,30 +1215,30 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
|
|||
; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0
|
||||
; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2
|
||||
; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1
|
||||
; GCN-IR-NEXT: v_min_u32_e32 v4, v2, v3
|
||||
; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 0xffffffd0, v4
|
||||
; GCN-IR-NEXT: v_addc_u32_e64 v7, s[6:7], 0, -1, vcc
|
||||
; GCN-IR-NEXT: v_min_u32_e32 v6, v2, v3
|
||||
; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 0xffffffd0, v6
|
||||
; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc
|
||||
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
|
||||
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[6:7]
|
||||
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5]
|
||||
; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v2, s8
|
||||
; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc
|
||||
; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[6:7]
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
|
||||
; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5]
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
|
||||
; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v2, 0, s[4:5]
|
||||
; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v3, v5
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v3, v7
|
||||
; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc
|
||||
; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
|
||||
; GCN-IR-NEXT: s_cbranch_execz BB9_6
|
||||
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
|
||||
; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v6
|
||||
; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v7, vcc
|
||||
; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v6
|
||||
; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[8:9], v[6:7]
|
||||
; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4
|
||||
; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc
|
||||
; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v4
|
||||
; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[8:9], v[4:5]
|
||||
; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v6, 0
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
|
||||
; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
|
||||
; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
||||
; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
|
||||
|
@ -1248,35 +1248,35 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
|
|||
; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000
|
||||
; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc
|
||||
; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v8
|
||||
; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v6
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
|
||||
; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 47, v4
|
||||
; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, 0, v7, vcc
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
|
||||
; GCN-IR-NEXT: v_subb_u32_e32 v5, vcc, 0, v5, vcc
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
|
||||
; GCN-IR-NEXT: BB9_3: ; %udiv-do-while
|
||||
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
|
||||
; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v3
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v6
|
||||
; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4
|
||||
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
|
||||
; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v12, v8
|
||||
; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v13, v9, vcc
|
||||
; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v8
|
||||
; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v9, vcc
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2
|
||||
; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v6
|
||||
; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v10
|
||||
; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4
|
||||
; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10
|
||||
; GCN-IR-NEXT: v_and_b32_e32 v14, v10, v1
|
||||
; GCN-IR-NEXT: v_and_b32_e32 v15, v10, v0
|
||||
; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v4
|
||||
; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v6
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3
|
||||
; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
|
||||
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[4:5]
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v4, v10
|
||||
; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v7, vcc
|
||||
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[6:7]
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v6, v10
|
||||
; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v15
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v5, v11
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v11, v7
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v7, v11
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v11, v5
|
||||
; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v14, s[4:5]
|
||||
; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v10, v6
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v10, v4
|
||||
; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
|
||||
; GCN-IR-NEXT: s_cbranch_execnz BB9_3
|
||||
; GCN-IR-NEXT: ; %bb.4: ; %Flow
|
||||
|
@ -1284,8 +1284,8 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
|
|||
; GCN-IR-NEXT: BB9_5: ; %Flow3
|
||||
; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
|
||||
; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v3, v7, v1
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v2, v6, v0
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v3, v5, v1
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v2, v4, v0
|
||||
; GCN-IR-NEXT: BB9_6: ; %Flow4
|
||||
; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7]
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v0, v2
|
||||
|
@ -1340,6 +1340,7 @@ define i64 @v_test_udiv_pow2_k_den_i64(i64 %x) {
|
|||
; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
|
||||
; GCN-IR-NEXT: v_addc_u32_e64 v1, s[4:5], 0, -1, vcc
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
|
||||
; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff
|
||||
; GCN-IR-NEXT: BB10_3: ; %udiv-do-while
|
||||
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
|
@ -1356,7 +1357,6 @@ define i64 @v_test_udiv_pow2_k_den_i64(i64 %x) {
|
|||
; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v0
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v3, v10, v3
|
||||
; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v1, vcc
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
|
||||
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[9:10], v[0:1]
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v0, v9
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v6, 0
|
||||
|
@ -1708,8 +1708,9 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) {
|
|||
; GCN-IR-NEXT: v_lshr_b64 v[7:8], v[0:1], v7
|
||||
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 0xffffffc4, v6
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
|
||||
; GCN-IR-NEXT: v_addc_u32_e64 v1, s[4:5], 0, -1, vcc
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
|
||||
; GCN-IR-NEXT: BB12_3: ; %udiv-do-while
|
||||
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1
|
||||
|
@ -1723,7 +1724,6 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) {
|
|||
; GCN-IR-NEXT: v_or_b32_e32 v3, v10, v3
|
||||
; GCN-IR-NEXT: v_ashrrev_i32_e32 v7, 31, v4
|
||||
; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v1, vcc
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
|
||||
; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v7
|
||||
; GCN-IR-NEXT: v_and_b32_e32 v7, 24, v7
|
||||
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[9:10], v[0:1]
|
||||
|
|
|
@ -186,14 +186,15 @@ define hidden void @blam() {
|
|||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GCN-NEXT: v_writelane_b32 v40, s33, 4
|
||||
; GCN-NEXT: s_mov_b32 s33, s32
|
||||
; GCN-NEXT: s_addk_i32 s32, 0x800
|
||||
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: v_writelane_b32 v40, s34, 0
|
||||
; GCN-NEXT: v_writelane_b32 v40, s35, 1
|
||||
; GCN-NEXT: v_writelane_b32 v40, s36, 2
|
||||
|
@ -202,8 +203,9 @@ define hidden void @blam() {
|
|||
; GCN-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; GCN-NEXT: flat_load_dword v43, v[1:2]
|
||||
; GCN-NEXT: v_mov_b32_e32 v42, 0
|
||||
; GCN-NEXT: flat_load_dword v43, v[1:2]
|
||||
; GCN-NEXT: v_mov_b32_e32 v44, 0x7fc00000
|
||||
; GCN-NEXT: s_getpc_b64 s[36:37]
|
||||
; GCN-NEXT: s_add_u32 s36, s36, spam@rel32@lo+4
|
||||
; GCN-NEXT: s_addc_u32 s37, s37, spam@rel32@hi+12
|
||||
|
@ -214,13 +216,11 @@ define hidden void @blam() {
|
|||
; GCN-NEXT: BB1_1: ; %bb10
|
||||
; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1
|
||||
; GCN-NEXT: s_or_b64 exec, exec, s[6:7]
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0x7fc00000
|
||||
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0
|
||||
; GCN-NEXT: BB1_2: ; %bb18
|
||||
; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0x7fc00000
|
||||
; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0
|
||||
; GCN-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GCN-NEXT: BB1_3: ; %bb2
|
||||
; GCN-NEXT: ; =>This Loop Header: Depth=1
|
||||
; GCN-NEXT: ; Child Loop BB1_4 Depth 2
|
||||
|
@ -229,8 +229,7 @@ define hidden void @blam() {
|
|||
; GCN-NEXT: ; Parent Loop BB1_3 Depth=1
|
||||
; GCN-NEXT: ; => This Inner Loop Header: Depth=2
|
||||
; GCN-NEXT: flat_load_dword v0, v[41:42]
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
|
||||
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], 0
|
||||
; GCN-NEXT: s_waitcnt vmcnt(1)
|
||||
; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 3, v0
|
||||
; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc
|
||||
|
@ -268,8 +267,7 @@ define hidden void @blam() {
|
|||
; GCN-NEXT: ; %bb.9: ; %bb16
|
||||
; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1
|
||||
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0x7fc00000
|
||||
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0
|
||||
; GCN-NEXT: BB1_10: ; %bb17
|
||||
; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1
|
||||
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], 0
|
||||
|
|
|
@ -379,40 +379,40 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) {
|
|||
; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
|
||||
; GCN-IR-NEXT: s_cbranch_execz BB1_5
|
||||
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
|
||||
; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, -1, v2
|
||||
; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, -1, v3, vcc
|
||||
; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v2
|
||||
; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v3, vcc
|
||||
; GCN-IR-NEXT: v_not_b32_e32 v6, v8
|
||||
; GCN-IR-NEXT: v_lshr_b64 v[12:13], v[0:1], v12
|
||||
; GCN-IR-NEXT: v_not_b32_e32 v7, v9
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v14, 0
|
||||
; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, v6, v10
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v15, 0
|
||||
; GCN-IR-NEXT: v_lshr_b64 v[12:13], v[0:1], v12
|
||||
; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, v7, v11, vcc
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
|
||||
; GCN-IR-NEXT: BB1_3: ; %udiv-do-while
|
||||
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[12:13], 1
|
||||
; GCN-IR-NEXT: v_lshl_b64 v[12:13], v[12:13], 1
|
||||
; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v6
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v12, v12, v6
|
||||
; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
|
||||
; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v16, v10
|
||||
; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v17, v11, vcc
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v4, v14, v4
|
||||
; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, 1, v8
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v5, v15, v5
|
||||
; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6
|
||||
; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, 0, v9, vcc
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
|
||||
; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12
|
||||
; GCN-IR-NEXT: v_and_b32_e32 v13, v12, v3
|
||||
; GCN-IR-NEXT: v_and_b32_e32 v12, v12, v2
|
||||
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[14:15], v[8:9]
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v8, v14
|
||||
; GCN-IR-NEXT: v_sub_i32_e64 v12, s[4:5], v10, v12
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v9, v15
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v15, v7
|
||||
; GCN-IR-NEXT: v_subb_u32_e64 v13, s[4:5], v11, v13, s[4:5]
|
||||
; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v14, v12
|
||||
; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v15, v13, vcc
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v4, v10, v4
|
||||
; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v6
|
||||
; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v10
|
||||
; GCN-IR-NEXT: v_and_b32_e32 v16, v10, v3
|
||||
; GCN-IR-NEXT: v_and_b32_e32 v17, v10, v2
|
||||
; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v8
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v5, v11, v5
|
||||
; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc
|
||||
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[8:9]
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v8, v10
|
||||
; GCN-IR-NEXT: v_sub_i32_e64 v12, s[4:5], v12, v17
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v9, v11
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v11, v7
|
||||
; GCN-IR-NEXT: v_subb_u32_e64 v13, s[4:5], v13, v16, s[4:5]
|
||||
; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v14, v6
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v10, v6
|
||||
; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
|
||||
; GCN-IR-NEXT: s_cbranch_execnz BB1_3
|
||||
; GCN-IR-NEXT: ; %bb.4: ; %Flow
|
||||
|
@ -1239,19 +1239,19 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
|
|||
; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0
|
||||
; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2
|
||||
; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1
|
||||
; GCN-IR-NEXT: v_min_u32_e32 v4, v2, v3
|
||||
; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 0xffffffd0, v4
|
||||
; GCN-IR-NEXT: v_min_u32_e32 v6, v2, v3
|
||||
; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 0xffffffd0, v6
|
||||
; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc
|
||||
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
|
||||
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3]
|
||||
; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v6, s8
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v4, s8
|
||||
; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc
|
||||
; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[2:3]
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
|
||||
; GCN-IR-NEXT: v_cndmask_b32_e64 v6, v6, 0, s[4:5]
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
|
||||
; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5]
|
||||
; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v7, v5
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v5, v7
|
||||
; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc
|
||||
; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
|
||||
; GCN-IR-NEXT: s_cbranch_execz BB8_6
|
||||
|
@ -1261,8 +1261,8 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
|
|||
; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[8:9], v[2:3]
|
||||
; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2
|
||||
; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v6, 0
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
|
||||
; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
|
||||
; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
||||
; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
|
||||
|
@ -1272,35 +1272,35 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
|
|||
; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000
|
||||
; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc
|
||||
; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v8
|
||||
; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v6
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
|
||||
; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 47, v4
|
||||
; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, 0, v7, vcc
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
|
||||
; GCN-IR-NEXT: v_subb_u32_e32 v5, vcc, 0, v5, vcc
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
|
||||
; GCN-IR-NEXT: BB8_3: ; %udiv-do-while
|
||||
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
|
||||
; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v3
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v6
|
||||
; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4
|
||||
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
|
||||
; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v12, v8
|
||||
; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v13, v9, vcc
|
||||
; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v8
|
||||
; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v9, vcc
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2
|
||||
; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v6
|
||||
; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v10
|
||||
; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4
|
||||
; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10
|
||||
; GCN-IR-NEXT: v_and_b32_e32 v14, v10, v1
|
||||
; GCN-IR-NEXT: v_and_b32_e32 v15, v10, v0
|
||||
; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v4
|
||||
; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v6
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3
|
||||
; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
|
||||
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[4:5]
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v4, v10
|
||||
; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v7, vcc
|
||||
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[6:7]
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v6, v10
|
||||
; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v15
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v5, v11
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v11, v7
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v7, v11
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v11, v5
|
||||
; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v14, s[4:5]
|
||||
; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v10, v6
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v10, v4
|
||||
; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
|
||||
; GCN-IR-NEXT: s_cbranch_execnz BB8_3
|
||||
; GCN-IR-NEXT: ; %bb.4: ; %Flow
|
||||
|
@ -1308,14 +1308,14 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
|
|||
; GCN-IR-NEXT: BB8_5: ; %Flow3
|
||||
; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
|
||||
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v3
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v2
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2
|
||||
; GCN-IR-NEXT: BB8_6: ; %Flow4
|
||||
; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7]
|
||||
; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v7
|
||||
; GCN-IR-NEXT: v_mul_hi_u32 v3, v0, v6
|
||||
; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v6
|
||||
; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v6
|
||||
; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v5
|
||||
; GCN-IR-NEXT: v_mul_hi_u32 v3, v0, v4
|
||||
; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v4
|
||||
; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v4
|
||||
; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v3, v2
|
||||
; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1
|
||||
; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 0x8000, v0
|
||||
|
@ -1370,6 +1370,7 @@ define i64 @v_test_urem_pow2_k_den_i64(i64 %x) {
|
|||
; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
|
||||
; GCN-IR-NEXT: v_addc_u32_e64 v7, s[4:5], 0, -1, vcc
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
|
||||
; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff
|
||||
; GCN-IR-NEXT: BB9_3: ; %udiv-do-while
|
||||
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
|
@ -1386,7 +1387,6 @@ define i64 @v_test_urem_pow2_k_den_i64(i64 %x) {
|
|||
; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v6
|
||||
; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3
|
||||
; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v7, vcc
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
|
||||
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[6:7]
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v6, v10
|
||||
; GCN-IR-NEXT: v_mov_b32_e32 v12, 0
|
||||
|
|
|
@ -6,15 +6,15 @@ define void @vgpr_descriptor_waterfall_loop_idom_update(<4 x i32>* %arg) #0 {
|
|||
; GCN: ; %bb.0: ; %entry
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GCN-NEXT: v_add_co_u32 v6, vcc_lo, v0, 8
|
||||
; GCN-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
|
||||
; GCN-NEXT: BB0_1: ; %bb0
|
||||
; GCN-NEXT: ; =>This Loop Header: Depth=1
|
||||
; GCN-NEXT: ; Child Loop BB0_2 Depth 2
|
||||
; GCN-NEXT: v_add_co_u32 v6, vcc_lo, v0, 8
|
||||
; GCN-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
|
||||
; GCN-NEXT: s_mov_b32 s5, exec_lo
|
||||
; GCN-NEXT: s_clause 0x1
|
||||
; GCN-NEXT: flat_load_dwordx2 v[4:5], v[6:7]
|
||||
; GCN-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
|
||||
; GCN-NEXT: s_mov_b32 s5, exec_lo
|
||||
; GCN-NEXT: BB0_2: ; Parent Loop BB0_1 Depth=1
|
||||
; GCN-NEXT: ; => This Inner Loop Header: Depth=2
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
|
|
|
@ -250,9 +250,9 @@ bb13:
|
|||
; GCN: s_cbranch_execz
|
||||
; GCN: BB{{.*}}:
|
||||
|
||||
; GFX1032: s_or_b32 [[MASK1]], [[MASK1]], exec_lo
|
||||
; GFX1064: s_or_b64 [[MASK1]], [[MASK1]], exec
|
||||
; GCN: global_load_dword [[LOAD:v[0-9]+]]
|
||||
; GFX1032-DAG: s_or_b32 [[MASK1]], [[MASK1]], exec_lo
|
||||
; GFX1064-DAG: s_or_b64 [[MASK1]], [[MASK1]], exec
|
||||
; GCN-DAG: global_load_dword [[LOAD:v[0-9]+]]
|
||||
; GFX1032: v_cmp_gt_i32_e32 vcc_lo, 11, [[LOAD]]
|
||||
; GFX1064: v_cmp_gt_i32_e32 vcc, 11, [[LOAD]]
|
||||
define amdgpu_kernel void @test_loop_with_if_else_break(i32 addrspace(1)* %arg) #0 {
|
||||
|
|
Loading…
Reference in New Issue