[AMDGPU] Revise handling of preexisting waitcnt

Preexisting waitcnt may not update the scoreboard if the instruction
being examined needed to wait on fewer counters than what was encoded in
the old waitcnt instruction. Fixing this results in the elimination of
some redudnat waitcnt.

These changes also enable combining consecutive waitcnt into a single
S_WAITCNT or S_WAITCNT_VSCNT instruction.

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D100281
This commit is contained in:
Austin Kerbow 2021-04-09 10:54:21 -07:00
parent 9ba5238c28
commit f5199d7ae0
14 changed files with 464 additions and 248 deletions

View File

@ -245,8 +245,8 @@ public:
const SIRegisterInfo *TRI, unsigned OpNo) const;
bool counterOutOfOrder(InstCounterType T) const;
bool simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
bool simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
void determineWait(InstCounterType T, unsigned ScoreToWait,
AMDGPU::Waitcnt &Wait) const;
void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
@ -418,7 +418,7 @@ public:
}
if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
DebugCounter::shouldExecute(ForceLgkmCounter)) {
DebugCounter::shouldExecute(ForceLgkmCounter)) {
ForceEmitWaitcnt[LGKM_CNT] = true;
} else {
ForceEmitWaitcnt[LGKM_CNT] = false;
@ -442,6 +442,9 @@ public:
WaitcntBrackets *ScoreBrackets);
bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
WaitcntBrackets &ScoreBrackets);
bool applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
MachineInstr &OldWaitcntInstr,
AMDGPU::Waitcnt &Wait, const MachineInstr *MI);
};
} // end anonymous namespace
@ -708,22 +711,23 @@ void WaitcntBrackets::print(raw_ostream &OS) {
/// Simplify the waitcnt, in the sense of removing redundant counts, and return
/// whether a waitcnt instruction is needed at all.
bool WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
return simplifyWaitcnt(VM_CNT, Wait.VmCnt) |
simplifyWaitcnt(EXP_CNT, Wait.ExpCnt) |
simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt) |
simplifyWaitcnt(VS_CNT, Wait.VsCnt);
void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
simplifyWaitcnt(VM_CNT, Wait.VmCnt);
simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
simplifyWaitcnt(VS_CNT, Wait.VsCnt);
}
bool WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
unsigned &Count) const {
const unsigned LB = getScoreLB(T);
const unsigned UB = getScoreUB(T);
if (Count < UB && UB - Count > LB)
return true;
Count = ~0u;
return false;
// The number of outstanding events for this type, T, can be calculated
// as (UB - LB). If the current Count is greater than or equal to the number
// of outstanding events, then the wait for this counter is redundant.
if (Count >= UB - LB)
Count = ~0u;
}
void WaitcntBrackets::determineWait(InstCounterType T, unsigned ScoreToWait,
@ -798,6 +802,107 @@ FunctionPass *llvm::createSIInsertWaitcntsPass() {
return new SIInsertWaitcnts();
}
/// Combine consecutive waitcnt instructions that precede \p MI and follow
/// \p OldWaitcntInstr and apply any extra wait from waitcnt that were added
/// by previous passes. Currently this pass conservatively assumes that these
/// preexisting waitcnt are required for correctness.
bool SIInsertWaitcnts::applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
MachineInstr &OldWaitcntInstr,
AMDGPU::Waitcnt &Wait,
const MachineInstr *MI) {
bool Modified = false;
MachineInstr *WaitcntInstr = nullptr;
MachineInstr *WaitcntVsCntInstr = nullptr;
for (auto II = OldWaitcntInstr.getIterator(), NextI = std::next(II);
&*II != MI; II = NextI, ++NextI) {
if (II->isMetaInstruction())
continue;
if (II->getOpcode() == AMDGPU::S_WAITCNT) {
// Conservatively update required wait if this waitcnt was added in an
// earlier pass. In this case it will not exist in the tracked waitcnt
// set.
if (!TrackedWaitcntSet.count(&*II)) {
unsigned IEnc = II->getOperand(0).getImm();
AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
Wait = Wait.combined(OldWait);
}
// Merge consecutive waitcnt of the same type by erasing multiples.
if (!WaitcntInstr) {
WaitcntInstr = &*II;
} else {
II->eraseFromParent();
Modified = true;
}
} else {
assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
if (!TrackedWaitcntSet.count(&*II)) {
unsigned OldVSCnt =
TII->getNamedOperand(*II, AMDGPU::OpName::simm16)->getImm();
Wait.VsCnt = std::min(Wait.VsCnt, OldVSCnt);
}
if (!WaitcntVsCntInstr) {
WaitcntVsCntInstr = &*II;
} else {
II->eraseFromParent();
Modified = true;
}
}
}
// Updated encoding of merged waitcnt with the required wait.
if (WaitcntInstr) {
if (Wait.hasWaitExceptVsCnt()) {
unsigned NewEnc = AMDGPU::encodeWaitcnt(IV, Wait);
unsigned OldEnc = WaitcntInstr->getOperand(0).getImm();
if (OldEnc != NewEnc) {
WaitcntInstr->getOperand(0).setImm(NewEnc);
Modified = true;
}
ScoreBrackets.applyWaitcnt(Wait);
Wait.VmCnt = ~0u;
Wait.LgkmCnt = ~0u;
Wait.ExpCnt = ~0u;
LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
<< "Old Instr: " << MI << "New Instr: " << *WaitcntInstr
<< '\n');
} else {
WaitcntInstr->eraseFromParent();
Modified = true;
}
}
if (WaitcntVsCntInstr) {
if (Wait.hasWaitVsCnt()) {
assert(ST->hasVscnt());
unsigned OldVSCnt =
TII->getNamedOperand(*WaitcntVsCntInstr, AMDGPU::OpName::simm16)
->getImm();
if (Wait.VsCnt != OldVSCnt) {
TII->getNamedOperand(*WaitcntVsCntInstr, AMDGPU::OpName::simm16)
->setImm(Wait.VsCnt);
Modified = true;
}
ScoreBrackets.applyWaitcnt(Wait);
Wait.VsCnt = ~0u;
LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
<< "Old Instr: " << MI
<< "New Instr: " << *WaitcntVsCntInstr << '\n');
} else {
WaitcntVsCntInstr->eraseFromParent();
Modified = true;
}
}
return Modified;
}
static bool readsVCCZ(const MachineInstr &MI) {
unsigned Opc = MI.getOpcode();
return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
@ -833,12 +938,12 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
MachineInstr *OldWaitcntInstr) {
setForceEmitWaitcnt();
bool IsForceEmitWaitcnt = isForceEmitWaitcnt();
if (MI.isMetaInstruction())
return false;
AMDGPU::Waitcnt Wait;
bool Modified = false;
// See if this instruction has a forced S_WAITCNT VM.
// TODO: Handle other cases of NeedsWaitcntVmBefore()
@ -1053,32 +1158,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
}
}
// Early-out if no wait is indicated.
if (!ScoreBrackets.simplifyWaitcnt(Wait) && !IsForceEmitWaitcnt) {
bool Modified = false;
if (OldWaitcntInstr) {
for (auto II = OldWaitcntInstr->getIterator(), NextI = std::next(II);
&*II != &MI; II = NextI, ++NextI) {
if (II->isDebugInstr())
continue;
if (TrackedWaitcntSet.count(&*II)) {
TrackedWaitcntSet.erase(&*II);
II->eraseFromParent();
Modified = true;
} else if (II->getOpcode() == AMDGPU::S_WAITCNT) {
int64_t Imm = II->getOperand(0).getImm();
ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
} else {
assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
auto W = TII->getNamedOperand(*II, AMDGPU::OpName::simm16)->getImm();
ScoreBrackets.applyWaitcnt(AMDGPU::Waitcnt(~0u, ~0u, ~0u, W));
}
}
}
return Modified;
}
// Verify that the wait is actually needed.
ScoreBrackets.simplifyWaitcnt(Wait);
if (ForceEmitZeroWaitcnts)
Wait = AMDGPU::Waitcnt::allZero(ST->hasVscnt());
@ -1092,57 +1173,19 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
if (ForceEmitWaitcnt[VS_CNT])
Wait.VsCnt = 0;
ScoreBrackets.applyWaitcnt(Wait);
AMDGPU::Waitcnt OldWait;
bool Modified = false;
if (OldWaitcntInstr) {
for (auto II = OldWaitcntInstr->getIterator(), NextI = std::next(II);
&*II != &MI; II = NextI, NextI++) {
if (II->isDebugInstr())
continue;
if (II->getOpcode() == AMDGPU::S_WAITCNT) {
unsigned IEnc = II->getOperand(0).getImm();
AMDGPU::Waitcnt IWait = AMDGPU::decodeWaitcnt(IV, IEnc);
OldWait = OldWait.combined(IWait);
if (!TrackedWaitcntSet.count(&*II))
Wait = Wait.combined(IWait);
unsigned NewEnc = AMDGPU::encodeWaitcnt(IV, Wait);
if (IEnc != NewEnc) {
II->getOperand(0).setImm(NewEnc);
Modified = true;
}
Wait.VmCnt = ~0u;
Wait.LgkmCnt = ~0u;
Wait.ExpCnt = ~0u;
} else {
assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
unsigned ICnt = TII->getNamedOperand(*II, AMDGPU::OpName::simm16)
->getImm();
OldWait.VsCnt = std::min(OldWait.VsCnt, ICnt);
if (!TrackedWaitcntSet.count(&*II))
Wait.VsCnt = std::min(Wait.VsCnt, ICnt);
if (Wait.VsCnt != ICnt) {
TII->getNamedOperand(*II, AMDGPU::OpName::simm16)->setImm(Wait.VsCnt);
Modified = true;
}
Wait.VsCnt = ~0u;
}
LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
<< "Old Instr: " << MI
<< "New Instr: " << *II << '\n');
if (!Wait.hasWait())
return Modified;
}
// Try to merge the required wait with preexisting waitcnt instructions.
// Also erase redundant waitcnt.
Modified =
applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, &MI);
} else {
// Update waitcnt brackets after determining the required wait.
ScoreBrackets.applyWaitcnt(Wait);
}
if (Wait.VmCnt != ~0u || Wait.LgkmCnt != ~0u || Wait.ExpCnt != ~0u) {
// Build new waitcnt instructions unless no wait is needed or the old waitcnt
// instruction was modified to handle the required wait.
if (Wait.hasWaitExceptVsCnt()) {
unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
@ -1155,7 +1198,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
<< "New Instr: " << *SWaitInst << '\n');
}
if (Wait.VsCnt != ~0u) {
if (Wait.hasWaitVsCnt()) {
assert(ST->hasVscnt());
auto SWaitInst =
@ -1430,7 +1473,8 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
Iter != E;) {
MachineInstr &Inst = *Iter;
// Track pre-existing waitcnts from earlier iterations.
// Track pre-existing waitcnts that were added in earlier iterations or by
// the memory legalizer.
if (Inst.getOpcode() == AMDGPU::S_WAITCNT ||
(Inst.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
Inst.getOperand(0).isReg() &&

View File

@ -477,6 +477,14 @@ struct Waitcnt {
return VmCnt != ~0u || ExpCnt != ~0u || LgkmCnt != ~0u || VsCnt != ~0u;
}
bool hasWaitExceptVsCnt() const {
return VmCnt != ~0u || ExpCnt != ~0u || LgkmCnt != ~0u;
}
bool hasWaitVsCnt() const {
return VsCnt != ~0u;
}
bool dominates(const Waitcnt &Other) const {
return VmCnt <= Other.VmCnt && ExpCnt <= Other.ExpCnt &&
LgkmCnt <= Other.LgkmCnt && VsCnt <= Other.VsCnt;

View File

@ -184,7 +184,6 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX9-NEXT: s_lshl_b32 s1, s0, 2
; GFX9-NEXT: s_and_b32 s0, s0, 15
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 15
; GFX9-NEXT: s_add_u32 s1, 0x104, s1
; GFX9-NEXT: scratch_store_dword off, v0, s1
@ -357,7 +356,6 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
; GFX9-NEXT: s_lshl_b32 s1, s0, 2
; GFX9-NEXT: s_and_b32 s0, s0, 15
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 15
; GFX9-NEXT: s_add_u32 s1, 0x4004, s1
; GFX9-NEXT: scratch_store_dword off, v0, s1

View File

@ -72,7 +72,6 @@ define i32 @atomic_nand_i32_flat(i32* %ptr) nounwind {
; GCN-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_wbinvl1_vol
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5]

View File

@ -788,7 +788,6 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX9-NEXT: s_lshl_b32 s1, s0, 2
; GFX9-NEXT: s_and_b32 s0, s0, 15
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 15
; GFX9-NEXT: s_add_u32 s1, 0x104, s1
; GFX9-NEXT: scratch_store_dword off, v0, s1
@ -1419,7 +1418,6 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
; GFX9-NEXT: s_lshl_b32 s1, s0, 2
; GFX9-NEXT: s_and_b32 s0, s0, 15
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 15
; GFX9-NEXT: s_add_u32 s1, 0x4004, s1
; GFX9-NEXT: scratch_store_dword off, v0, s1

View File

@ -630,7 +630,6 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(double* %ptr) #1 {
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
@ -706,7 +705,6 @@ define double @flat_atomic_fadd_f64_rtn_pat(double* %ptr) #1 {
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
@ -731,7 +729,6 @@ define double @flat_atomic_fadd_f64_rtn_pat_agent(double* %ptr) #1 {
; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
main_body:
%ret = atomicrmw fadd double* %ptr, double 4.0 syncscope("agent") seq_cst

View File

@ -192,7 +192,6 @@ define amdgpu_kernel void @gws_barrier_fence_before(i32 %val, i32 addrspace(1)*
; NOLOOP: s_mov_b32 m0, 0{{$}}
; NOLOOP: ds_gws_barrier v{{[0-9]+}} offset:7 gds
; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; NOLOOP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; NOLOOP-NEXT: load_dword
define amdgpu_kernel void @gws_barrier_fence_after(i32 %val, i32 addrspace(1)* %ptr) #0 {
@ -220,7 +219,6 @@ define amdgpu_kernel void @gws_init_barrier(i32 %val) #0 {
; NOLOOP: s_mov_b32 m0, 0
; NOLOOP: ds_gws_init v0 offset:7 gds
; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; NOLOOP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; NOLOOP-NEXT: ds_gws_barrier v0 offset:7 gds
; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)

View File

@ -188,7 +188,6 @@ define amdgpu_kernel void @flat_agent_acquire_load(
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[2:3], v0
; GFX7-NEXT: s_endpgm
;
@ -204,7 +203,6 @@ define amdgpu_kernel void @flat_agent_acquire_load(
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@ -220,7 +218,6 @@ define amdgpu_kernel void @flat_agent_acquire_load(
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@ -248,7 +245,6 @@ define amdgpu_kernel void @flat_agent_acquire_load(
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@ -285,7 +281,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[2:3], v0
; GFX7-NEXT: s_endpgm
;
@ -303,7 +298,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@ -321,7 +315,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@ -351,7 +344,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@ -1121,7 +1113,6 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
@ -1138,7 +1129,6 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@ -1155,7 +1145,6 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@ -1182,7 +1171,6 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@ -1218,7 +1206,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
@ -1237,7 +1224,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@ -1256,7 +1242,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@ -1285,7 +1270,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@ -1322,7 +1306,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
@ -1341,7 +1324,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@ -1360,7 +1342,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@ -1389,7 +1370,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@ -2456,7 +2436,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
@ -2478,7 +2457,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@ -2500,7 +2478,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@ -2532,7 +2509,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@ -2575,7 +2551,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
@ -2599,7 +2574,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@ -2623,7 +2597,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@ -2657,7 +2630,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@ -2701,7 +2673,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
@ -2725,7 +2696,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@ -2749,7 +2719,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@ -2783,7 +2752,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@ -2826,7 +2794,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
@ -2848,7 +2815,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@ -2870,7 +2836,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@ -2902,7 +2867,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@ -2945,7 +2909,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
@ -2969,7 +2932,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@ -2993,7 +2955,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@ -3027,7 +2988,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@ -3071,7 +3031,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
@ -3095,7 +3054,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@ -3119,7 +3077,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@ -3153,7 +3110,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@ -3197,7 +3153,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
@ -3221,7 +3176,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@ -3245,7 +3199,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@ -3279,7 +3232,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@ -3323,7 +3275,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
@ -3347,7 +3298,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@ -3371,7 +3321,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@ -3405,7 +3354,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;

View File

@ -188,7 +188,6 @@ define amdgpu_kernel void @flat_system_acquire_load(
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[2:3], v0
; GFX7-NEXT: s_endpgm
;
@ -204,7 +203,6 @@ define amdgpu_kernel void @flat_system_acquire_load(
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@ -220,7 +218,6 @@ define amdgpu_kernel void @flat_system_acquire_load(
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@ -248,7 +245,6 @@ define amdgpu_kernel void @flat_system_acquire_load(
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@ -285,7 +281,6 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[2:3], v0
; GFX7-NEXT: s_endpgm
;
@ -303,7 +298,6 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@ -321,7 +315,6 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@ -351,7 +344,6 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@ -1121,7 +1113,6 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
@ -1138,7 +1129,6 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@ -1155,7 +1145,6 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@ -1182,7 +1171,6 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@ -1218,7 +1206,6 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
@ -1237,7 +1224,6 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@ -1256,7 +1242,6 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@ -1285,7 +1270,6 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@ -1322,7 +1306,6 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
@ -1341,7 +1324,6 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@ -1360,7 +1342,6 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@ -1389,7 +1370,6 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@ -2456,7 +2436,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
@ -2478,7 +2457,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@ -2500,7 +2478,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@ -2532,7 +2509,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@ -2575,7 +2551,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
@ -2599,7 +2574,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@ -2623,7 +2597,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@ -2657,7 +2630,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@ -2701,7 +2673,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
@ -2725,7 +2696,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@ -2749,7 +2719,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@ -2783,7 +2752,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@ -2826,7 +2794,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
@ -2848,7 +2815,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@ -2870,7 +2836,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@ -2902,7 +2867,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@ -2945,7 +2909,6 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
@ -2969,7 +2932,6 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@ -2993,7 +2955,6 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@ -3027,7 +2988,6 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@ -3071,7 +3031,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
@ -3095,7 +3054,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@ -3119,7 +3077,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@ -3153,7 +3110,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@ -3197,7 +3153,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
@ -3221,7 +3176,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@ -3245,7 +3199,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@ -3279,7 +3232,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
@ -3323,7 +3275,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
@ -3347,7 +3298,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@ -3371,7 +3321,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
@ -3405,7 +3354,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;

View File

@ -298,7 +298,6 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;

View File

@ -202,7 +202,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@ -297,7 +296,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@ -1103,7 +1101,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@ -1194,7 +1191,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@ -1289,7 +1285,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@ -2363,7 +2358,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@ -2479,7 +2473,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@ -2599,7 +2592,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@ -2716,7 +2708,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@ -2832,7 +2823,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@ -2952,7 +2942,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@ -3072,7 +3061,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
@ -3192,7 +3180,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;

View File

@ -25,6 +25,7 @@
# VM-NEXT: S_NOP 0
# ZERO: S_WAITCNT 0
# ZERO-NEXT: S_NOP 0
# ZERO-NEXT: S_WAITCNT 0
# ZERO-NEXT: S_NOP 0
# ZERO-NEXT: S_WAITCNT 0
@ -32,6 +33,8 @@
name: waitcnt-debug
liveins:
machineFunctionInfo:
isEntryFunction: true
body: |
bb.0:
S_NOP 0

View File

@ -0,0 +1,131 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX10 %s
---
name: test_waitcnt_preexisting_vscnt_unmodified
body: |
bb.0:
liveins: $vgpr0_vgpr1, $vgpr2
; GFX10-LABEL: name: test_waitcnt_preexisting_vscnt_unmodified
; GFX10: S_WAITCNT 0
; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
; GFX10: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
; GFX10: S_BARRIER
; GFX10: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
; GFX10: S_WAITCNT 112
; GFX10: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
; GFX10: S_ENDPGM 0
GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
S_WAITCNT_VSCNT undef $sgpr_null, 0
S_BARRIER
$vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
S_ENDPGM 0
...
---
name: test_waitcnt_preexisting_vscnt_needs_vscnt
body: |
bb.0:
liveins: $vgpr0_vgpr1, $vgpr2
; GFX10-LABEL: name: test_waitcnt_preexisting_vscnt_needs_vscnt
; GFX10: S_WAITCNT 0
; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
; GFX10: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
; GFX10: S_BARRIER
; GFX10: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
; GFX10: S_WAITCNT 112
; GFX10: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
; GFX10: S_ENDPGM 0
GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
S_WAITCNT_VSCNT undef $sgpr_null, 1
S_BARRIER
$vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
S_ENDPGM 0
...
---
name: test_waitcnt_preexisting_vscnt_with_other_waitcnt
body: |
bb.0:
liveins: $vgpr0_vgpr1, $vgpr2
; GFX10-LABEL: name: test_waitcnt_preexisting_vscnt_with_other_waitcnt
; GFX10: S_WAITCNT 0
; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
; GFX10: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
; GFX10: S_WAITCNT 112
; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
; GFX10: S_BARRIER
; GFX10: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
; GFX10: S_WAITCNT 112
; GFX10: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
; GFX10: S_ENDPGM 0
GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
S_WAITCNT 112
S_WAITCNT_VSCNT undef $sgpr_null, 0
S_BARRIER
$vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
S_ENDPGM 0
...
---
name: test_waitcnt_preexisting_vscnt_combined
body: |
bb.0:
liveins: $vgpr0_vgpr1, $vgpr2
; GFX10-LABEL: name: test_waitcnt_preexisting_vscnt_combined
; GFX10: S_WAITCNT 0
; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
; GFX10: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
; GFX10: S_BARRIER
; GFX10: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
; GFX10: S_WAITCNT 112
; GFX10: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
; GFX10: S_ENDPGM 0
GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
S_WAITCNT_VSCNT undef $sgpr_null, 0
S_WAITCNT_VSCNT undef $sgpr_null, 1
S_WAITCNT_VSCNT undef $sgpr_null, 2
S_BARRIER
$vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
S_ENDPGM 0
...
---
name: test_waitcnt_preexisting_vscnt_combined_both_types
body: |
bb.0:
liveins: $vgpr0_vgpr1, $vgpr2
; GFX10-LABEL: name: test_waitcnt_preexisting_vscnt_combined_both_types
; GFX10: S_WAITCNT 0
; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
; GFX10: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
; GFX10: S_WAITCNT 0
; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
; GFX10: S_BARRIER
; GFX10: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
; GFX10: S_WAITCNT 112
; GFX10: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
; GFX10: S_ENDPGM 0
GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
S_WAITCNT 0
S_WAITCNT_VSCNT undef $sgpr_null, 1
S_WAITCNT 0
S_WAITCNT_VSCNT undef $sgpr_null, 2
S_WAITCNT 0
S_BARRIER
$vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
S_ENDPGM 0
...

View File

@ -1,37 +1,195 @@
# RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GCN %s
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GFX9 %s
# GCN-LABEL: name: test{{$}}
# GCN: S_WAITCNT -16257
# GCN: DS_READ2_B32
# GCN: DS_READ2_B32
# GCN: S_WAITCNT 383{{$}}
# GCN-NEXT: $vgpr1 = V_OR_B32_e32 1, killed $vgpr1, implicit $exec
# GCN-NEXT: $vgpr1 = V_MAX_U32_e32 killed $vgpr0, killed $vgpr1, implicit $exec
# GCN-NEXT: S_WAITCNT 127{{$}}
# GCN-NEXT: $vgpr1 = V_MAX_U32_e32 killed $vgpr2, killed $vgpr1, implicit $exec
--- |
define amdgpu_cs void @test() {
ret void
}
...
---
name: test
name: test_waitcnt_preexisting_lgkmcnt_unmodified
body: |
bb.0:
liveins: $sgpr0, $sgpr1, $vgpr0
liveins: $vgpr0
renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX8_IMM renamable $sgpr0_sgpr1, 480, 0
renamable $vgpr13 = V_LSHLREV_B32_e32 2, killed $vgpr0, implicit $exec
S_WAITCNT -16257
renamable $vgpr0_vgpr1 = DS_READ2_B32 renamable $vgpr13, 0, 1, 0, implicit $m0, implicit $exec
renamable $vgpr2_vgpr3 = DS_READ2_B32 renamable $vgpr13, 2, 3, 0, implicit $m0, implicit $exec
renamable $vgpr1 = V_OR_B32_e32 1, killed $vgpr1, implicit $exec
renamable $vgpr1 = V_MAX_U32_e32 killed $vgpr0, killed $vgpr1, implicit $exec
renamable $vgpr1 = V_MAX_U32_e32 killed $vgpr2, killed $vgpr1, implicit $exec
renamable $vgpr1 = V_MAX_U32_e32 killed $vgpr3, killed $vgpr1, implicit $exec
$vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $exec
$vgpr2 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $exec
$vgpr3 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $exec
IMAGE_STORE_V4_V2 killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, implicit $exec :: (store 16)
; GFX9-LABEL: name: test_waitcnt_preexisting_lgkmcnt_unmodified
; GFX9: S_WAITCNT 0
; GFX9: $vgpr0_vgpr1 = DS_READ2_B32 $vgpr0, 0, 1, 0, implicit $m0, implicit $exec
; GFX9: S_WAITCNT 49279
; GFX9: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
; GFX9: S_WAITCNT 112
; GFX9: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
; GFX9: S_ENDPGM 0
$vgpr0_vgpr1 = DS_READ2_B32 $vgpr0, 0, 1, 0, implicit $m0, implicit $exec
S_WAITCNT 49279
$vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
S_ENDPGM 0
...
---
name: test_waitcnt_preexisting_vmcnt_unmodified
body: |
bb.0:
liveins: $vgpr0_vgpr1
; GFX9-LABEL: name: test_waitcnt_preexisting_vmcnt_unmodified
; GFX9: S_WAITCNT 0
; GFX9: $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec
; GFX9: S_WAITCNT 3952
; GFX9: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
; GFX9: S_WAITCNT 112
; GFX9: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
; GFX9: S_ENDPGM 0
$vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec
S_WAITCNT 3952
$vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
S_ENDPGM 0
...
# Respect preexisting waitcnt and add required wait.
---
name: test_waitcnt_preexisting_vmcnt_needs_lgkmcnt
body: |
bb.0:
liveins: $vgpr0
; GFX9-LABEL: name: test_waitcnt_preexisting_vmcnt_needs_lgkmcnt
; GFX9: S_WAITCNT 0
; GFX9: $vgpr0_vgpr1 = DS_READ2_B32 $vgpr0, 0, 1, 0, implicit $m0, implicit $exec
; GFX9: S_WAITCNT 112
; GFX9: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
; GFX9: S_WAITCNT 112
; GFX9: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
; GFX9: S_ENDPGM 0
$vgpr0_vgpr1 = DS_READ2_B32 $vgpr0, 0, 1, 0, implicit $m0, implicit $exec
S_WAITCNT 3952
$vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
S_ENDPGM 0
...
---
name: test_waitcnt_preexisting_lgkmcnt_needs_vmcnt
body: |
bb.0:
liveins: $vgpr0_vgpr1
; GFX9-LABEL: name: test_waitcnt_preexisting_lgkmcnt_needs_vmcnt
; GFX9: S_WAITCNT 0
; GFX9: $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec
; GFX9: S_WAITCNT 112
; GFX9: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
; GFX9: S_WAITCNT 112
; GFX9: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
; GFX9: S_ENDPGM 0
$vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec
S_WAITCNT 49279
$vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
S_ENDPGM 0
...
# Apply wait for all counters from preexisting waitcnt regardless of the wait
# required by the next instruction.
---
name: test_waitcnt_preexisting_apply_all_counters
body: |
bb.0:
liveins: $vgpr0_vgpr1, $vgpr2
; GFX9-LABEL: name: test_waitcnt_preexisting_apply_all_counters
; GFX9: S_WAITCNT 0
; GFX9: $vgpr4_vgpr5 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec
; GFX9: $vgpr6_vgpr7 = DS_READ2_B32 $vgpr2, 0, 1, 0, implicit $m0, implicit $exec
; GFX9: S_WAITCNT 0
; GFX9: $vgpr6 = V_OR_B32_e32 1, killed $vgpr6, implicit $exec
; GFX9: $vgpr0 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec, implicit $flat_scr
; GFX9: S_WAITCNT 112
; GFX9: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
$vgpr4_vgpr5 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec
$vgpr6_vgpr7 = DS_READ2_B32 $vgpr2, 0, 1, 0, implicit $m0, implicit $exec
S_WAITCNT 0
$vgpr6 = V_OR_B32_e32 1, killed $vgpr6, implicit $exec
$vgpr0 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec, implicit $flat_scr
FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
...
---
name: test_waitcnt_preexisting_combine_waitcnt
body: |
bb.0:
liveins: $vgpr0_vgpr1
; GFX9-LABEL: name: test_waitcnt_preexisting_combine_waitcnt
; GFX9: S_WAITCNT 0
; GFX9: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
; GFX9: S_WAITCNT 0
; GFX9: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
$vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
S_WAITCNT 0
S_WAITCNT 0
S_WAITCNT 0
S_WAITCNT 0
S_WAITCNT 0
S_WAITCNT 0
FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
...
---
name: test_waitcnt_preexisting_combine_waitcnt_diff_counters
body: |
bb.0:
liveins: $vgpr0_vgpr1
; GFX9-LABEL: name: test_waitcnt_preexisting_combine_waitcnt_diff_counters
; GFX9: S_WAITCNT 0
; GFX9: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
; GFX9: S_WAITCNT 112
; GFX9: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
$vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
S_WAITCNT 49279
S_WAITCNT 3952
FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
...
# Apply preexisting waitcnt when no wait is immediately needed.
# FIXME: Move waitcnt as late as possible.
---
name: test_waitcnt_preexisting_early_wait
body: |
bb.0:
liveins: $vgpr0_vgpr1
; GFX9-LABEL: name: test_waitcnt_preexisting_early_wait
; GFX9: S_WAITCNT 0
; GFX9: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
; GFX9: S_WAITCNT 0
; GFX9: S_NOP 0
; GFX9: S_NOP 0
; GFX9: S_NOP 0
; GFX9: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
; GFX9: S_ENDPGM 0
$vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
S_WAITCNT 0
S_NOP 0
S_NOP 0
S_NOP 0
FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
S_ENDPGM 0
...
---
name: test_waitcnt_preexisting_ignore_kill
body: |
bb.0:
liveins: $vgpr0_vgpr1
; GFX9-LABEL: name: test_waitcnt_preexisting_ignore_kill
; GFX9: S_WAITCNT 0
; GFX9: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
; GFX9: S_WAITCNT 3952
; GFX9: KILL $vgpr0
$vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
S_WAITCNT 3952
KILL $vgpr0
...