forked from OSchip/llvm-project
[AMDGPU] Fix whole wavefront mode
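Whole wavefront mode (WWM) regions must not be extended over copies of EXEC, because a copy of the exec register has to read the exact exec mask rather than the all-ones mask that is active inside WWM. Differential Revision: https://reviews.llvm.org/D76232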
This commit is contained in:
parent
1f93b162fc
commit
6e29846b29
|
@ -171,8 +171,6 @@ private:
|
|||
void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
|
||||
char analyzeFunction(MachineFunction &MF);
|
||||
|
||||
bool requiresCorrectState(const MachineInstr &MI) const;
|
||||
|
||||
MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
|
||||
MachineBasicBlock::iterator Before);
|
||||
MachineBasicBlock::iterator
|
||||
|
@ -526,36 +524,6 @@ char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
|
|||
return GlobalFlags;
|
||||
}
|
||||
|
||||
/// Whether \p MI really requires the exec state computed during analysis.
|
||||
///
|
||||
/// Scalar instructions must occasionally be marked WQM for correct propagation
|
||||
/// (e.g. thread masks leading up to branches), but when it comes to actual
|
||||
/// execution, they don't care about EXEC.
|
||||
bool SIWholeQuadMode::requiresCorrectState(const MachineInstr &MI) const {
|
||||
if (MI.isTerminator())
|
||||
return true;
|
||||
|
||||
// Skip instructions that are not affected by EXEC
|
||||
if (TII->isScalarUnit(MI))
|
||||
return false;
|
||||
|
||||
// Generic instructions such as COPY will either disappear by register
|
||||
// coalescing or be lowered to SALU or VALU instructions.
|
||||
if (MI.isTransient()) {
|
||||
if (MI.getNumExplicitOperands() >= 1) {
|
||||
const MachineOperand &Op = MI.getOperand(0);
|
||||
if (Op.isReg()) {
|
||||
if (TRI->isSGPRReg(*MRI, Op.getReg())) {
|
||||
// SGPR instructions are not affected by EXEC
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
MachineBasicBlock::iterator
|
||||
SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
|
||||
MachineBasicBlock::iterator Before) {
|
||||
|
@ -742,7 +710,7 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
|
|||
if (II != IE) {
|
||||
MachineInstr &MI = *II;
|
||||
|
||||
if (requiresCorrectState(MI)) {
|
||||
if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
|
||||
auto III = Instructions.find(&MI);
|
||||
if (III != Instructions.end()) {
|
||||
if (III->second.Needs & StateWWM)
|
||||
|
|
|
@ -375,9 +375,9 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
|
|||
;
|
||||
; GFX8-LABEL: add_i32_varying:
|
||||
; GFX8: ; %bb.0: ; %entry
|
||||
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
|
||||
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
|
||||
; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
|
||||
|
@ -428,9 +428,9 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
|
|||
;
|
||||
; GFX9-LABEL: add_i32_varying:
|
||||
; GFX9: ; %bb.0: ; %entry
|
||||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
|
||||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
|
||||
; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
|
||||
|
@ -480,9 +480,9 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
|
|||
;
|
||||
; GFX1064-LABEL: add_i32_varying:
|
||||
; GFX1064: ; %bb.0: ; %entry
|
||||
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
|
||||
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
|
||||
; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
|
||||
|
@ -539,10 +539,10 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
|
|||
;
|
||||
; GFX1032-LABEL: add_i32_varying:
|
||||
; GFX1032: ; %bb.0: ; %entry
|
||||
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX1032-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
|
||||
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
|
||||
; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0
|
||||
|
@ -614,9 +614,9 @@ define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) {
|
|||
;
|
||||
; GFX8-LABEL: add_i32_varying_gfx1032:
|
||||
; GFX8: ; %bb.0: ; %entry
|
||||
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
|
||||
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
|
||||
; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
|
||||
|
@ -667,9 +667,9 @@ define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) {
|
|||
;
|
||||
; GFX9-LABEL: add_i32_varying_gfx1032:
|
||||
; GFX9: ; %bb.0: ; %entry
|
||||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
|
||||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
|
||||
; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
|
||||
|
@ -719,9 +719,9 @@ define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) {
|
|||
;
|
||||
; GFX1064-LABEL: add_i32_varying_gfx1032:
|
||||
; GFX1064: ; %bb.0: ; %entry
|
||||
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
|
||||
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
|
||||
; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
|
||||
|
@ -778,10 +778,10 @@ define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) {
|
|||
;
|
||||
; GFX1032-LABEL: add_i32_varying_gfx1032:
|
||||
; GFX1032: ; %bb.0: ; %entry
|
||||
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX1032-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
|
||||
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
|
||||
; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0
|
||||
|
@ -853,9 +853,9 @@ define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) {
|
|||
;
|
||||
; GFX8-LABEL: add_i32_varying_gfx1064:
|
||||
; GFX8: ; %bb.0: ; %entry
|
||||
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
|
||||
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
|
||||
; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
|
||||
|
@ -906,9 +906,9 @@ define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) {
|
|||
;
|
||||
; GFX9-LABEL: add_i32_varying_gfx1064:
|
||||
; GFX9: ; %bb.0: ; %entry
|
||||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
|
||||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
|
||||
; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
|
||||
|
@ -958,9 +958,9 @@ define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) {
|
|||
;
|
||||
; GFX1064-LABEL: add_i32_varying_gfx1064:
|
||||
; GFX1064: ; %bb.0: ; %entry
|
||||
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
|
||||
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
|
||||
; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
|
||||
|
@ -1017,10 +1017,10 @@ define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) {
|
|||
;
|
||||
; GFX1032-LABEL: add_i32_varying_gfx1064:
|
||||
; GFX1032: ; %bb.0: ; %entry
|
||||
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX1032-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
|
||||
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
|
||||
; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0
|
||||
|
@ -1934,9 +1934,9 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
|
|||
;
|
||||
; GFX8-LABEL: sub_i32_varying:
|
||||
; GFX8: ; %bb.0: ; %entry
|
||||
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
|
||||
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
|
||||
; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
|
||||
|
@ -1987,9 +1987,9 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
|
|||
;
|
||||
; GFX9-LABEL: sub_i32_varying:
|
||||
; GFX9: ; %bb.0: ; %entry
|
||||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
|
||||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
|
||||
; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
|
||||
|
@ -2039,9 +2039,9 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
|
|||
;
|
||||
; GFX1064-LABEL: sub_i32_varying:
|
||||
; GFX1064: ; %bb.0: ; %entry
|
||||
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
|
||||
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
|
||||
; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
|
||||
|
@ -2098,10 +2098,10 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
|
|||
;
|
||||
; GFX1032-LABEL: sub_i32_varying:
|
||||
; GFX1032: ; %bb.0: ; %entry
|
||||
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX1032-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
|
||||
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
|
||||
; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0
|
||||
|
@ -2917,9 +2917,9 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
|
|||
;
|
||||
; GFX8-LABEL: or_i32_varying:
|
||||
; GFX8: ; %bb.0: ; %entry
|
||||
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
|
||||
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
|
||||
; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
|
||||
|
@ -2970,9 +2970,9 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
|
|||
;
|
||||
; GFX9-LABEL: or_i32_varying:
|
||||
; GFX9: ; %bb.0: ; %entry
|
||||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
|
||||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
|
||||
; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
|
||||
|
@ -3022,9 +3022,9 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
|
|||
;
|
||||
; GFX1064-LABEL: or_i32_varying:
|
||||
; GFX1064: ; %bb.0: ; %entry
|
||||
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
|
||||
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
|
||||
; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
|
||||
|
@ -3081,10 +3081,10 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
|
|||
;
|
||||
; GFX1032-LABEL: or_i32_varying:
|
||||
; GFX1032: ; %bb.0: ; %entry
|
||||
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX1032-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
|
||||
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
|
||||
; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0
|
||||
|
@ -3159,9 +3159,9 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
|
|||
;
|
||||
; GFX8-LABEL: xor_i32_varying:
|
||||
; GFX8: ; %bb.0: ; %entry
|
||||
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
|
||||
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
|
||||
; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
|
||||
|
@ -3212,9 +3212,9 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
|
|||
;
|
||||
; GFX9-LABEL: xor_i32_varying:
|
||||
; GFX9: ; %bb.0: ; %entry
|
||||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
|
||||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
|
||||
; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
|
||||
|
@ -3264,9 +3264,9 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
|
|||
;
|
||||
; GFX1064-LABEL: xor_i32_varying:
|
||||
; GFX1064: ; %bb.0: ; %entry
|
||||
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
|
||||
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
|
||||
; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
|
||||
|
@ -3323,10 +3323,10 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
|
|||
;
|
||||
; GFX1032-LABEL: xor_i32_varying:
|
||||
; GFX1032: ; %bb.0: ; %entry
|
||||
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX1032-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
|
||||
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
|
||||
; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0
|
||||
|
@ -4265,9 +4265,9 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
|
|||
;
|
||||
; GFX8-LABEL: umax_i32_varying:
|
||||
; GFX8: ; %bb.0: ; %entry
|
||||
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
|
||||
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
|
||||
; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
|
||||
|
@ -4318,9 +4318,9 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
|
|||
;
|
||||
; GFX9-LABEL: umax_i32_varying:
|
||||
; GFX9: ; %bb.0: ; %entry
|
||||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
|
||||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
|
||||
; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
|
||||
|
@ -4370,9 +4370,9 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
|
|||
;
|
||||
; GFX1064-LABEL: umax_i32_varying:
|
||||
; GFX1064: ; %bb.0: ; %entry
|
||||
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
|
||||
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
|
||||
; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
|
||||
|
@ -4429,10 +4429,10 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
|
|||
;
|
||||
; GFX1032-LABEL: umax_i32_varying:
|
||||
; GFX1032: ; %bb.0: ; %entry
|
||||
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX1032-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
|
||||
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
|
||||
; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0
|
||||
|
|
|
@ -107,3 +107,38 @@ body: |
|
|||
S_ENDPGM 0
|
||||
|
||||
...
|
||||
|
||||
---
|
||||
# Ensure that wwm is not put around an EXEC copy
|
||||
#CHECK-LABEL: name: copy_exec
|
||||
#CHECK: %7:sreg_64 = COPY $exec
|
||||
#CHECK-NEXT: %14:sreg_64 = ENTER_WWM -1, implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
#CHECK-NEXT: %8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
|
||||
#CHECK-NEXT: $exec = EXIT_WWM %14
|
||||
#CHECK-NEXT: %9:vgpr_32 = V_MBCNT_LO_U32_B32_e64 %7.sub0, 0, implicit $exec
|
||||
name: copy_exec
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3
|
||||
|
||||
%3:sgpr_32 = COPY $sgpr3
|
||||
%2:sgpr_32 = COPY $sgpr2
|
||||
%1:sgpr_32 = COPY $sgpr1
|
||||
%0:sgpr_32 = COPY $sgpr0
|
||||
%4:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3
|
||||
%5:sreg_32 = S_MOV_B32 0
|
||||
%6:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %4, %5, 0, 0, 0, 0, 0, 0, implicit $exec
|
||||
|
||||
%8:sreg_64 = COPY $exec
|
||||
%9:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
|
||||
%10:vgpr_32 = V_MBCNT_LO_U32_B32_e64 %8.sub0:sreg_64, 0, implicit $exec
|
||||
%11:vgpr_32 = V_MOV_B32_dpp %9:vgpr_32, %10:vgpr_32, 312, 15, 15, 0, implicit $exec
|
||||
%12:sreg_32 = V_READLANE_B32 %11:vgpr_32, 63
|
||||
early-clobber %13:sreg_32 = WWM %9:vgpr_32, implicit $exec
|
||||
|
||||
%14:vgpr_32 = COPY %13
|
||||
BUFFER_STORE_DWORD_OFFSET_exact killed %14, %4, %5, 4, 0, 0, 0, 0, 0, implicit $exec
|
||||
S_ENDPGM 0
|
||||
|
||||
...
|
||||
|
|
Loading…
Reference in New Issue