forked from OSchip/llvm-project
[AMDGPU] Revise handling of preexisting waitcnt
Preexisting waitcnt may not update the scoreboard if the instruction being examined needed to wait on fewer counters than what was encoded in the old waitcnt instruction. Fixing this results in the elimination of some redundant waitcnt. These changes also enable combining consecutive waitcnt into a single S_WAITCNT or S_WAITCNT_VSCNT instruction. Reviewed By: rampitec Differential Revision: https://reviews.llvm.org/D100281
This commit is contained in:
parent
9ba5238c28
commit
f5199d7ae0
|
@ -245,8 +245,8 @@ public:
|
|||
const SIRegisterInfo *TRI, unsigned OpNo) const;
|
||||
|
||||
bool counterOutOfOrder(InstCounterType T) const;
|
||||
bool simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
|
||||
bool simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
|
||||
void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
|
||||
void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
|
||||
void determineWait(InstCounterType T, unsigned ScoreToWait,
|
||||
AMDGPU::Waitcnt &Wait) const;
|
||||
void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
|
||||
|
@ -418,7 +418,7 @@ public:
|
|||
}
|
||||
|
||||
if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
|
||||
DebugCounter::shouldExecute(ForceLgkmCounter)) {
|
||||
DebugCounter::shouldExecute(ForceLgkmCounter)) {
|
||||
ForceEmitWaitcnt[LGKM_CNT] = true;
|
||||
} else {
|
||||
ForceEmitWaitcnt[LGKM_CNT] = false;
|
||||
|
@ -442,6 +442,9 @@ public:
|
|||
WaitcntBrackets *ScoreBrackets);
|
||||
bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
|
||||
WaitcntBrackets &ScoreBrackets);
|
||||
bool applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
|
||||
MachineInstr &OldWaitcntInstr,
|
||||
AMDGPU::Waitcnt &Wait, const MachineInstr *MI);
|
||||
};
|
||||
|
||||
} // end anonymous namespace
|
||||
|
@ -708,22 +711,23 @@ void WaitcntBrackets::print(raw_ostream &OS) {
|
|||
|
||||
/// Simplify the waitcnt, in the sense of removing redundant counts, and return
|
||||
/// whether a waitcnt instruction is needed at all.
|
||||
bool WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
|
||||
return simplifyWaitcnt(VM_CNT, Wait.VmCnt) |
|
||||
simplifyWaitcnt(EXP_CNT, Wait.ExpCnt) |
|
||||
simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt) |
|
||||
simplifyWaitcnt(VS_CNT, Wait.VsCnt);
|
||||
void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
|
||||
simplifyWaitcnt(VM_CNT, Wait.VmCnt);
|
||||
simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
|
||||
simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
|
||||
simplifyWaitcnt(VS_CNT, Wait.VsCnt);
|
||||
}
|
||||
|
||||
bool WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
|
||||
void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
|
||||
unsigned &Count) const {
|
||||
const unsigned LB = getScoreLB(T);
|
||||
const unsigned UB = getScoreUB(T);
|
||||
if (Count < UB && UB - Count > LB)
|
||||
return true;
|
||||
|
||||
Count = ~0u;
|
||||
return false;
|
||||
// The number of outstanding events for this type, T, can be calculated
|
||||
// as (UB - LB). If the current Count is greater than or equal to the number
|
||||
// of outstanding events, then the wait for this counter is redundant.
|
||||
if (Count >= UB - LB)
|
||||
Count = ~0u;
|
||||
}
|
||||
|
||||
void WaitcntBrackets::determineWait(InstCounterType T, unsigned ScoreToWait,
|
||||
|
@ -798,6 +802,107 @@ FunctionPass *llvm::createSIInsertWaitcntsPass() {
|
|||
return new SIInsertWaitcnts();
|
||||
}
|
||||
|
||||
/// Combine consecutive waitcnt instructions that precede \p MI and follow
|
||||
/// \p OldWaitcntInstr and apply any extra wait from waitcnt that were added
|
||||
/// by previous passes. Currently this pass conservatively assumes that these
|
||||
/// preexisting waitcnt are required for correctness.
|
||||
bool SIInsertWaitcnts::applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
|
||||
MachineInstr &OldWaitcntInstr,
|
||||
AMDGPU::Waitcnt &Wait,
|
||||
const MachineInstr *MI) {
|
||||
bool Modified = false;
|
||||
MachineInstr *WaitcntInstr = nullptr;
|
||||
MachineInstr *WaitcntVsCntInstr = nullptr;
|
||||
for (auto II = OldWaitcntInstr.getIterator(), NextI = std::next(II);
|
||||
&*II != MI; II = NextI, ++NextI) {
|
||||
if (II->isMetaInstruction())
|
||||
continue;
|
||||
|
||||
if (II->getOpcode() == AMDGPU::S_WAITCNT) {
|
||||
// Conservatively update required wait if this waitcnt was added in an
|
||||
// earlier pass. In this case it will not exist in the tracked waitcnt
|
||||
// set.
|
||||
if (!TrackedWaitcntSet.count(&*II)) {
|
||||
unsigned IEnc = II->getOperand(0).getImm();
|
||||
AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
|
||||
Wait = Wait.combined(OldWait);
|
||||
}
|
||||
|
||||
// Merge consecutive waitcnt of the same type by erasing multiples.
|
||||
if (!WaitcntInstr) {
|
||||
WaitcntInstr = &*II;
|
||||
} else {
|
||||
II->eraseFromParent();
|
||||
Modified = true;
|
||||
}
|
||||
|
||||
} else {
|
||||
assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
|
||||
assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
|
||||
if (!TrackedWaitcntSet.count(&*II)) {
|
||||
unsigned OldVSCnt =
|
||||
TII->getNamedOperand(*II, AMDGPU::OpName::simm16)->getImm();
|
||||
Wait.VsCnt = std::min(Wait.VsCnt, OldVSCnt);
|
||||
}
|
||||
|
||||
if (!WaitcntVsCntInstr) {
|
||||
WaitcntVsCntInstr = &*II;
|
||||
} else {
|
||||
II->eraseFromParent();
|
||||
Modified = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Updated encoding of merged waitcnt with the required wait.
|
||||
if (WaitcntInstr) {
|
||||
if (Wait.hasWaitExceptVsCnt()) {
|
||||
unsigned NewEnc = AMDGPU::encodeWaitcnt(IV, Wait);
|
||||
unsigned OldEnc = WaitcntInstr->getOperand(0).getImm();
|
||||
if (OldEnc != NewEnc) {
|
||||
WaitcntInstr->getOperand(0).setImm(NewEnc);
|
||||
Modified = true;
|
||||
}
|
||||
ScoreBrackets.applyWaitcnt(Wait);
|
||||
Wait.VmCnt = ~0u;
|
||||
Wait.LgkmCnt = ~0u;
|
||||
Wait.ExpCnt = ~0u;
|
||||
|
||||
LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
|
||||
<< "Old Instr: " << MI << "New Instr: " << *WaitcntInstr
|
||||
<< '\n');
|
||||
} else {
|
||||
WaitcntInstr->eraseFromParent();
|
||||
Modified = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (WaitcntVsCntInstr) {
|
||||
if (Wait.hasWaitVsCnt()) {
|
||||
assert(ST->hasVscnt());
|
||||
unsigned OldVSCnt =
|
||||
TII->getNamedOperand(*WaitcntVsCntInstr, AMDGPU::OpName::simm16)
|
||||
->getImm();
|
||||
if (Wait.VsCnt != OldVSCnt) {
|
||||
TII->getNamedOperand(*WaitcntVsCntInstr, AMDGPU::OpName::simm16)
|
||||
->setImm(Wait.VsCnt);
|
||||
Modified = true;
|
||||
}
|
||||
ScoreBrackets.applyWaitcnt(Wait);
|
||||
Wait.VsCnt = ~0u;
|
||||
|
||||
LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
|
||||
<< "Old Instr: " << MI
|
||||
<< "New Instr: " << *WaitcntVsCntInstr << '\n');
|
||||
} else {
|
||||
WaitcntVsCntInstr->eraseFromParent();
|
||||
Modified = true;
|
||||
}
|
||||
}
|
||||
|
||||
return Modified;
|
||||
}
|
||||
|
||||
static bool readsVCCZ(const MachineInstr &MI) {
|
||||
unsigned Opc = MI.getOpcode();
|
||||
return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
|
||||
|
@ -833,12 +938,12 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
|
|||
MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
|
||||
MachineInstr *OldWaitcntInstr) {
|
||||
setForceEmitWaitcnt();
|
||||
bool IsForceEmitWaitcnt = isForceEmitWaitcnt();
|
||||
|
||||
if (MI.isMetaInstruction())
|
||||
return false;
|
||||
|
||||
AMDGPU::Waitcnt Wait;
|
||||
bool Modified = false;
|
||||
|
||||
// See if this instruction has a forced S_WAITCNT VM.
|
||||
// TODO: Handle other cases of NeedsWaitcntVmBefore()
|
||||
|
@ -1053,32 +1158,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
|
|||
}
|
||||
}
|
||||
|
||||
// Early-out if no wait is indicated.
|
||||
if (!ScoreBrackets.simplifyWaitcnt(Wait) && !IsForceEmitWaitcnt) {
|
||||
bool Modified = false;
|
||||
if (OldWaitcntInstr) {
|
||||
for (auto II = OldWaitcntInstr->getIterator(), NextI = std::next(II);
|
||||
&*II != &MI; II = NextI, ++NextI) {
|
||||
if (II->isDebugInstr())
|
||||
continue;
|
||||
|
||||
if (TrackedWaitcntSet.count(&*II)) {
|
||||
TrackedWaitcntSet.erase(&*II);
|
||||
II->eraseFromParent();
|
||||
Modified = true;
|
||||
} else if (II->getOpcode() == AMDGPU::S_WAITCNT) {
|
||||
int64_t Imm = II->getOperand(0).getImm();
|
||||
ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
|
||||
} else {
|
||||
assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
|
||||
assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
|
||||
auto W = TII->getNamedOperand(*II, AMDGPU::OpName::simm16)->getImm();
|
||||
ScoreBrackets.applyWaitcnt(AMDGPU::Waitcnt(~0u, ~0u, ~0u, W));
|
||||
}
|
||||
}
|
||||
}
|
||||
return Modified;
|
||||
}
|
||||
// Verify that the wait is actually needed.
|
||||
ScoreBrackets.simplifyWaitcnt(Wait);
|
||||
|
||||
if (ForceEmitZeroWaitcnts)
|
||||
Wait = AMDGPU::Waitcnt::allZero(ST->hasVscnt());
|
||||
|
@ -1092,57 +1173,19 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
|
|||
if (ForceEmitWaitcnt[VS_CNT])
|
||||
Wait.VsCnt = 0;
|
||||
|
||||
ScoreBrackets.applyWaitcnt(Wait);
|
||||
|
||||
AMDGPU::Waitcnt OldWait;
|
||||
bool Modified = false;
|
||||
|
||||
if (OldWaitcntInstr) {
|
||||
for (auto II = OldWaitcntInstr->getIterator(), NextI = std::next(II);
|
||||
&*II != &MI; II = NextI, NextI++) {
|
||||
if (II->isDebugInstr())
|
||||
continue;
|
||||
|
||||
if (II->getOpcode() == AMDGPU::S_WAITCNT) {
|
||||
unsigned IEnc = II->getOperand(0).getImm();
|
||||
AMDGPU::Waitcnt IWait = AMDGPU::decodeWaitcnt(IV, IEnc);
|
||||
OldWait = OldWait.combined(IWait);
|
||||
if (!TrackedWaitcntSet.count(&*II))
|
||||
Wait = Wait.combined(IWait);
|
||||
unsigned NewEnc = AMDGPU::encodeWaitcnt(IV, Wait);
|
||||
if (IEnc != NewEnc) {
|
||||
II->getOperand(0).setImm(NewEnc);
|
||||
Modified = true;
|
||||
}
|
||||
Wait.VmCnt = ~0u;
|
||||
Wait.LgkmCnt = ~0u;
|
||||
Wait.ExpCnt = ~0u;
|
||||
} else {
|
||||
assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
|
||||
assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
|
||||
|
||||
unsigned ICnt = TII->getNamedOperand(*II, AMDGPU::OpName::simm16)
|
||||
->getImm();
|
||||
OldWait.VsCnt = std::min(OldWait.VsCnt, ICnt);
|
||||
if (!TrackedWaitcntSet.count(&*II))
|
||||
Wait.VsCnt = std::min(Wait.VsCnt, ICnt);
|
||||
if (Wait.VsCnt != ICnt) {
|
||||
TII->getNamedOperand(*II, AMDGPU::OpName::simm16)->setImm(Wait.VsCnt);
|
||||
Modified = true;
|
||||
}
|
||||
Wait.VsCnt = ~0u;
|
||||
}
|
||||
|
||||
LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
|
||||
<< "Old Instr: " << MI
|
||||
<< "New Instr: " << *II << '\n');
|
||||
|
||||
if (!Wait.hasWait())
|
||||
return Modified;
|
||||
}
|
||||
// Try to merge the required wait with preexisting waitcnt instructions.
|
||||
// Also erase redundant waitcnt.
|
||||
Modified =
|
||||
applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, &MI);
|
||||
} else {
|
||||
// Update waitcnt brackets after determining the required wait.
|
||||
ScoreBrackets.applyWaitcnt(Wait);
|
||||
}
|
||||
|
||||
if (Wait.VmCnt != ~0u || Wait.LgkmCnt != ~0u || Wait.ExpCnt != ~0u) {
|
||||
// Build new waitcnt instructions unless no wait is needed or the old waitcnt
|
||||
// instruction was modified to handle the required wait.
|
||||
if (Wait.hasWaitExceptVsCnt()) {
|
||||
unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
|
||||
auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
|
||||
MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
|
||||
|
@ -1155,7 +1198,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
|
|||
<< "New Instr: " << *SWaitInst << '\n');
|
||||
}
|
||||
|
||||
if (Wait.VsCnt != ~0u) {
|
||||
if (Wait.hasWaitVsCnt()) {
|
||||
assert(ST->hasVscnt());
|
||||
|
||||
auto SWaitInst =
|
||||
|
@ -1430,7 +1473,8 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
|
|||
Iter != E;) {
|
||||
MachineInstr &Inst = *Iter;
|
||||
|
||||
// Track pre-existing waitcnts from earlier iterations.
|
||||
// Track pre-existing waitcnts that were added in earlier iterations or by
|
||||
// the memory legalizer.
|
||||
if (Inst.getOpcode() == AMDGPU::S_WAITCNT ||
|
||||
(Inst.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
|
||||
Inst.getOperand(0).isReg() &&
|
||||
|
|
|
@ -477,6 +477,14 @@ struct Waitcnt {
|
|||
return VmCnt != ~0u || ExpCnt != ~0u || LgkmCnt != ~0u || VsCnt != ~0u;
|
||||
}
|
||||
|
||||
bool hasWaitExceptVsCnt() const {
|
||||
return VmCnt != ~0u || ExpCnt != ~0u || LgkmCnt != ~0u;
|
||||
}
|
||||
|
||||
bool hasWaitVsCnt() const {
|
||||
return VsCnt != ~0u;
|
||||
}
|
||||
|
||||
bool dominates(const Waitcnt &Other) const {
|
||||
return VmCnt <= Other.VmCnt && ExpCnt <= Other.ExpCnt &&
|
||||
LgkmCnt <= Other.LgkmCnt && VsCnt <= Other.VsCnt;
|
||||
|
|
|
@ -184,7 +184,6 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
|
|||
; GFX9-NEXT: s_lshl_b32 s1, s0, 2
|
||||
; GFX9-NEXT: s_and_b32 s0, s0, 15
|
||||
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 15
|
||||
; GFX9-NEXT: s_add_u32 s1, 0x104, s1
|
||||
; GFX9-NEXT: scratch_store_dword off, v0, s1
|
||||
|
@ -357,7 +356,6 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
|
|||
; GFX9-NEXT: s_lshl_b32 s1, s0, 2
|
||||
; GFX9-NEXT: s_and_b32 s0, s0, 15
|
||||
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 15
|
||||
; GFX9-NEXT: s_add_u32 s1, 0x4004, s1
|
||||
; GFX9-NEXT: scratch_store_dword off, v0, s1
|
||||
|
|
|
@ -72,7 +72,6 @@ define i32 @atomic_nand_i32_flat(i32* %ptr) nounwind {
|
|||
; GCN-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: buffer_wbinvl1_vol
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
|
||||
; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
|
|
|
@ -788,7 +788,6 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
|
|||
; GFX9-NEXT: s_lshl_b32 s1, s0, 2
|
||||
; GFX9-NEXT: s_and_b32 s0, s0, 15
|
||||
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 15
|
||||
; GFX9-NEXT: s_add_u32 s1, 0x104, s1
|
||||
; GFX9-NEXT: scratch_store_dword off, v0, s1
|
||||
|
@ -1419,7 +1418,6 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
|
|||
; GFX9-NEXT: s_lshl_b32 s1, s0, 2
|
||||
; GFX9-NEXT: s_and_b32 s0, s0, 15
|
||||
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 15
|
||||
; GFX9-NEXT: s_add_u32 s1, 0x4004, s1
|
||||
; GFX9-NEXT: scratch_store_dword off, v0, s1
|
||||
|
|
|
@ -630,7 +630,6 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(double* %ptr) #1 {
|
|||
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
||||
; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
|
||||
|
@ -706,7 +705,6 @@ define double @flat_atomic_fadd_f64_rtn_pat(double* %ptr) #1 {
|
|||
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
|
||||
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
|
@ -731,7 +729,6 @@ define double @flat_atomic_fadd_f64_rtn_pat_agent(double* %ptr) #1 {
|
|||
; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NEXT: s_setpc_b64 s[30:31]
|
||||
main_body:
|
||||
%ret = atomicrmw fadd double* %ptr, double 4.0 syncscope("agent") seq_cst
|
||||
|
|
|
@ -192,7 +192,6 @@ define amdgpu_kernel void @gws_barrier_fence_before(i32 %val, i32 addrspace(1)*
|
|||
; NOLOOP: s_mov_b32 m0, 0{{$}}
|
||||
; NOLOOP: ds_gws_barrier v{{[0-9]+}} offset:7 gds
|
||||
; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; NOLOOP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; NOLOOP-NEXT: load_dword
|
||||
define amdgpu_kernel void @gws_barrier_fence_after(i32 %val, i32 addrspace(1)* %ptr) #0 {
|
||||
|
@ -220,7 +219,6 @@ define amdgpu_kernel void @gws_init_barrier(i32 %val) #0 {
|
|||
; NOLOOP: s_mov_b32 m0, 0
|
||||
; NOLOOP: ds_gws_init v0 offset:7 gds
|
||||
; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; NOLOOP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; NOLOOP-NEXT: ds_gws_barrier v0 offset:7 gds
|
||||
; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
|
|
|
@ -188,7 +188,6 @@ define amdgpu_kernel void @flat_agent_acquire_load(
|
|||
; GFX7-NEXT: buffer_wbinvl1_vol
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: flat_store_dword v[2:3], v0
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -204,7 +203,6 @@ define amdgpu_kernel void @flat_agent_acquire_load(
|
|||
; GFX10-WGP-NEXT: buffer_gl1_inv
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -220,7 +218,6 @@ define amdgpu_kernel void @flat_agent_acquire_load(
|
|||
; GFX10-CU-NEXT: buffer_gl1_inv
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -248,7 +245,6 @@ define amdgpu_kernel void @flat_agent_acquire_load(
|
|||
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -285,7 +281,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
|
|||
; GFX7-NEXT: buffer_wbinvl1_vol
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: flat_store_dword v[2:3], v0
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -303,7 +298,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
|
|||
; GFX10-WGP-NEXT: buffer_gl1_inv
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -321,7 +315,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
|
|||
; GFX10-CU-NEXT: buffer_gl1_inv
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -351,7 +344,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
|
|||
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -1121,7 +1113,6 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
|
|||
; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX7-NEXT: buffer_wbinvl1_vol
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -1138,7 +1129,6 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
|
|||
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: buffer_gl0_inv
|
||||
; GFX10-WGP-NEXT: buffer_gl1_inv
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -1155,7 +1145,6 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
|
|||
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: buffer_gl0_inv
|
||||
; GFX10-CU-NEXT: buffer_gl1_inv
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -1182,7 +1171,6 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
|
|||
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -1218,7 +1206,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
|
|||
; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX7-NEXT: buffer_wbinvl1_vol
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -1237,7 +1224,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
|
|||
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: buffer_gl0_inv
|
||||
; GFX10-WGP-NEXT: buffer_gl1_inv
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -1256,7 +1242,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
|
|||
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: buffer_gl0_inv
|
||||
; GFX10-CU-NEXT: buffer_gl1_inv
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -1285,7 +1270,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
|
|||
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -1322,7 +1306,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
|
|||
; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX7-NEXT: buffer_wbinvl1_vol
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -1341,7 +1324,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
|
|||
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: buffer_gl0_inv
|
||||
; GFX10-WGP-NEXT: buffer_gl1_inv
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -1360,7 +1342,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
|
|||
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: buffer_gl0_inv
|
||||
; GFX10-CU-NEXT: buffer_gl1_inv
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -1389,7 +1370,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
|
|||
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -2456,7 +2436,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
|
|||
; GFX7-NEXT: buffer_wbinvl1_vol
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -2478,7 +2457,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
|
|||
; GFX10-WGP-NEXT: buffer_gl1_inv
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -2500,7 +2478,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
|
|||
; GFX10-CU-NEXT: buffer_gl1_inv
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -2532,7 +2509,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
|
|||
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -2575,7 +2551,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
|
|||
; GFX7-NEXT: buffer_wbinvl1_vol
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -2599,7 +2574,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
|
|||
; GFX10-WGP-NEXT: buffer_gl1_inv
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -2623,7 +2597,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
|
|||
; GFX10-CU-NEXT: buffer_gl1_inv
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -2657,7 +2630,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
|
|||
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -2701,7 +2673,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
|
|||
; GFX7-NEXT: buffer_wbinvl1_vol
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -2725,7 +2696,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
|
|||
; GFX10-WGP-NEXT: buffer_gl1_inv
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -2749,7 +2719,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
|
|||
; GFX10-CU-NEXT: buffer_gl1_inv
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -2783,7 +2752,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
|
|||
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -2826,7 +2794,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
|
|||
; GFX7-NEXT: buffer_wbinvl1_vol
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -2848,7 +2815,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
|
|||
; GFX10-WGP-NEXT: buffer_gl1_inv
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -2870,7 +2836,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
|
|||
; GFX10-CU-NEXT: buffer_gl1_inv
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -2902,7 +2867,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
|
|||
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -2945,7 +2909,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
|
|||
; GFX7-NEXT: buffer_wbinvl1_vol
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -2969,7 +2932,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
|
|||
; GFX10-WGP-NEXT: buffer_gl1_inv
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -2993,7 +2955,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
|
|||
; GFX10-CU-NEXT: buffer_gl1_inv
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -3027,7 +2988,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
|
|||
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -3071,7 +3031,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
|
|||
; GFX7-NEXT: buffer_wbinvl1_vol
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -3095,7 +3054,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
|
|||
; GFX10-WGP-NEXT: buffer_gl1_inv
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -3119,7 +3077,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
|
|||
; GFX10-CU-NEXT: buffer_gl1_inv
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -3153,7 +3110,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
|
|||
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -3197,7 +3153,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
|
|||
; GFX7-NEXT: buffer_wbinvl1_vol
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -3221,7 +3176,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
|
|||
; GFX10-WGP-NEXT: buffer_gl1_inv
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -3245,7 +3199,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
|
|||
; GFX10-CU-NEXT: buffer_gl1_inv
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -3279,7 +3232,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
|
|||
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -3323,7 +3275,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
|
|||
; GFX7-NEXT: buffer_wbinvl1_vol
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -3347,7 +3298,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
|
|||
; GFX10-WGP-NEXT: buffer_gl1_inv
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -3371,7 +3321,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
|
|||
; GFX10-CU-NEXT: buffer_gl1_inv
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -3405,7 +3354,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
|
|||
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
||||
;
|
||||
|
|
|
@ -188,7 +188,6 @@ define amdgpu_kernel void @flat_system_acquire_load(
|
|||
; GFX7-NEXT: buffer_wbinvl1_vol
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: flat_store_dword v[2:3], v0
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -204,7 +203,6 @@ define amdgpu_kernel void @flat_system_acquire_load(
|
|||
; GFX10-WGP-NEXT: buffer_gl1_inv
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -220,7 +218,6 @@ define amdgpu_kernel void @flat_system_acquire_load(
|
|||
; GFX10-CU-NEXT: buffer_gl1_inv
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -248,7 +245,6 @@ define amdgpu_kernel void @flat_system_acquire_load(
|
|||
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -285,7 +281,6 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
|
|||
; GFX7-NEXT: buffer_wbinvl1_vol
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: flat_store_dword v[2:3], v0
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -303,7 +298,6 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
|
|||
; GFX10-WGP-NEXT: buffer_gl1_inv
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -321,7 +315,6 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
|
|||
; GFX10-CU-NEXT: buffer_gl1_inv
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -351,7 +344,6 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
|
|||
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -1121,7 +1113,6 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
|
|||
; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX7-NEXT: buffer_wbinvl1_vol
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -1138,7 +1129,6 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
|
|||
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: buffer_gl0_inv
|
||||
; GFX10-WGP-NEXT: buffer_gl1_inv
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -1155,7 +1145,6 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
|
|||
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: buffer_gl0_inv
|
||||
; GFX10-CU-NEXT: buffer_gl1_inv
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -1182,7 +1171,6 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
|
|||
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -1218,7 +1206,6 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
|
|||
; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX7-NEXT: buffer_wbinvl1_vol
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -1237,7 +1224,6 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
|
|||
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: buffer_gl0_inv
|
||||
; GFX10-WGP-NEXT: buffer_gl1_inv
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -1256,7 +1242,6 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
|
|||
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: buffer_gl0_inv
|
||||
; GFX10-CU-NEXT: buffer_gl1_inv
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -1285,7 +1270,6 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
|
|||
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -1322,7 +1306,6 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
|
|||
; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX7-NEXT: buffer_wbinvl1_vol
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -1341,7 +1324,6 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
|
|||
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: buffer_gl0_inv
|
||||
; GFX10-WGP-NEXT: buffer_gl1_inv
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -1360,7 +1342,6 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
|
|||
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: buffer_gl0_inv
|
||||
; GFX10-CU-NEXT: buffer_gl1_inv
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -1389,7 +1370,6 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
|
|||
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -2456,7 +2436,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
|
|||
; GFX7-NEXT: buffer_wbinvl1_vol
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -2478,7 +2457,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
|
|||
; GFX10-WGP-NEXT: buffer_gl1_inv
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -2500,7 +2478,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
|
|||
; GFX10-CU-NEXT: buffer_gl1_inv
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -2532,7 +2509,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
|
|||
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -2575,7 +2551,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
|
|||
; GFX7-NEXT: buffer_wbinvl1_vol
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -2599,7 +2574,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
|
|||
; GFX10-WGP-NEXT: buffer_gl1_inv
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -2623,7 +2597,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
|
|||
; GFX10-CU-NEXT: buffer_gl1_inv
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -2657,7 +2630,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
|
|||
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -2701,7 +2673,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
|
|||
; GFX7-NEXT: buffer_wbinvl1_vol
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -2725,7 +2696,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
|
|||
; GFX10-WGP-NEXT: buffer_gl1_inv
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -2749,7 +2719,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
|
|||
; GFX10-CU-NEXT: buffer_gl1_inv
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -2783,7 +2752,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
|
|||
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -2826,7 +2794,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
|
|||
; GFX7-NEXT: buffer_wbinvl1_vol
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -2848,7 +2815,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
|
|||
; GFX10-WGP-NEXT: buffer_gl1_inv
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -2870,7 +2836,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
|
|||
; GFX10-CU-NEXT: buffer_gl1_inv
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -2902,7 +2867,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
|
|||
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -2945,7 +2909,6 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
|
|||
; GFX7-NEXT: buffer_wbinvl1_vol
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -2969,7 +2932,6 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
|
|||
; GFX10-WGP-NEXT: buffer_gl1_inv
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -2993,7 +2955,6 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
|
|||
; GFX10-CU-NEXT: buffer_gl1_inv
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -3027,7 +2988,6 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
|
|||
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -3071,7 +3031,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
|
|||
; GFX7-NEXT: buffer_wbinvl1_vol
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -3095,7 +3054,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
|
|||
; GFX10-WGP-NEXT: buffer_gl1_inv
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -3119,7 +3077,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
|
|||
; GFX10-CU-NEXT: buffer_gl1_inv
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -3153,7 +3110,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
|
|||
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -3197,7 +3153,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
|
|||
; GFX7-NEXT: buffer_wbinvl1_vol
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -3221,7 +3176,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
|
|||
; GFX10-WGP-NEXT: buffer_gl1_inv
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -3245,7 +3199,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
|
|||
; GFX10-CU-NEXT: buffer_gl1_inv
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -3279,7 +3232,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
|
|||
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -3323,7 +3275,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
|
|||
; GFX7-NEXT: buffer_wbinvl1_vol
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -3347,7 +3298,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
|
|||
; GFX10-WGP-NEXT: buffer_gl1_inv
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -3371,7 +3321,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
|
|||
; GFX10-CU-NEXT: buffer_gl1_inv
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -3405,7 +3354,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
|
|||
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
||||
;
|
||||
|
|
|
@ -298,7 +298,6 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
|
|||
; GFX10-WGP-NEXT: buffer_gl0_inv
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
|
|
|
@ -202,7 +202,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
|
|||
; GFX10-WGP-NEXT: buffer_gl0_inv
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -297,7 +296,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
|
|||
; GFX10-WGP-NEXT: buffer_gl0_inv
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -1103,7 +1101,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
|
|||
; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
||||
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: buffer_gl0_inv
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -1194,7 +1191,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
|
|||
; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
||||
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: buffer_gl0_inv
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -1289,7 +1285,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
|
|||
; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
||||
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: buffer_gl0_inv
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -2363,7 +2358,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
|
|||
; GFX10-WGP-NEXT: buffer_gl0_inv
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -2479,7 +2473,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
|
|||
; GFX10-WGP-NEXT: buffer_gl0_inv
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -2599,7 +2592,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
|
|||
; GFX10-WGP-NEXT: buffer_gl0_inv
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -2716,7 +2708,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
|
|||
; GFX10-WGP-NEXT: buffer_gl0_inv
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -2832,7 +2823,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
|
|||
; GFX10-WGP-NEXT: buffer_gl0_inv
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -2952,7 +2942,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
|
|||
; GFX10-WGP-NEXT: buffer_gl0_inv
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -3072,7 +3061,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
|
|||
; GFX10-WGP-NEXT: buffer_gl0_inv
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -3192,7 +3180,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
|
|||
; GFX10-WGP-NEXT: buffer_gl0_inv
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
|
|
|
@ -25,6 +25,7 @@
|
|||
# VM-NEXT: S_NOP 0
|
||||
|
||||
# ZERO: S_WAITCNT 0
|
||||
# ZERO-NEXT: S_NOP 0
|
||||
# ZERO-NEXT: S_WAITCNT 0
|
||||
# ZERO-NEXT: S_NOP 0
|
||||
# ZERO-NEXT: S_WAITCNT 0
|
||||
|
@ -32,6 +33,8 @@
|
|||
|
||||
name: waitcnt-debug
|
||||
liveins:
|
||||
machineFunctionInfo:
|
||||
isEntryFunction: true
|
||||
body: |
|
||||
bb.0:
|
||||
S_NOP 0
|
||||
|
|
|
@ -0,0 +1,131 @@
|
|||
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
|
||||
# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX10 %s
|
||||
|
||||
---
|
||||
name: test_waitcnt_preexisting_vscnt_unmodified
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0_vgpr1, $vgpr2
|
||||
|
||||
; GFX10-LABEL: name: test_waitcnt_preexisting_vscnt_unmodified
|
||||
; GFX10: S_WAITCNT 0
|
||||
; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
|
||||
; GFX10: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
|
||||
; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
|
||||
; GFX10: S_BARRIER
|
||||
; GFX10: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
|
||||
; GFX10: S_WAITCNT 112
|
||||
; GFX10: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
|
||||
; GFX10: S_ENDPGM 0
|
||||
GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
|
||||
S_WAITCNT_VSCNT undef $sgpr_null, 0
|
||||
S_BARRIER
|
||||
$vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
|
||||
FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
|
||||
S_ENDPGM 0
|
||||
...
|
||||
|
||||
---
|
||||
name: test_waitcnt_preexisting_vscnt_needs_vscnt
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0_vgpr1, $vgpr2
|
||||
|
||||
; GFX10-LABEL: name: test_waitcnt_preexisting_vscnt_needs_vscnt
|
||||
; GFX10: S_WAITCNT 0
|
||||
; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
|
||||
; GFX10: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
|
||||
; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
|
||||
; GFX10: S_BARRIER
|
||||
; GFX10: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
|
||||
; GFX10: S_WAITCNT 112
|
||||
; GFX10: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
|
||||
; GFX10: S_ENDPGM 0
|
||||
GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
|
||||
S_WAITCNT_VSCNT undef $sgpr_null, 1
|
||||
S_BARRIER
|
||||
$vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
|
||||
FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
|
||||
S_ENDPGM 0
|
||||
...
|
||||
|
||||
---
|
||||
name: test_waitcnt_preexisting_vscnt_with_other_waitcnt
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0_vgpr1, $vgpr2
|
||||
|
||||
; GFX10-LABEL: name: test_waitcnt_preexisting_vscnt_with_other_waitcnt
|
||||
; GFX10: S_WAITCNT 0
|
||||
; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
|
||||
; GFX10: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
|
||||
; GFX10: S_WAITCNT 112
|
||||
; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
|
||||
; GFX10: S_BARRIER
|
||||
; GFX10: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
|
||||
; GFX10: S_WAITCNT 112
|
||||
; GFX10: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
|
||||
; GFX10: S_ENDPGM 0
|
||||
GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
|
||||
S_WAITCNT 112
|
||||
S_WAITCNT_VSCNT undef $sgpr_null, 0
|
||||
S_BARRIER
|
||||
$vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
|
||||
FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
|
||||
S_ENDPGM 0
|
||||
...
|
||||
|
||||
---
|
||||
name: test_waitcnt_preexisting_vscnt_combined
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0_vgpr1, $vgpr2
|
||||
|
||||
; GFX10-LABEL: name: test_waitcnt_preexisting_vscnt_combined
|
||||
; GFX10: S_WAITCNT 0
|
||||
; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
|
||||
; GFX10: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
|
||||
; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
|
||||
; GFX10: S_BARRIER
|
||||
; GFX10: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
|
||||
; GFX10: S_WAITCNT 112
|
||||
; GFX10: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
|
||||
; GFX10: S_ENDPGM 0
|
||||
GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
|
||||
S_WAITCNT_VSCNT undef $sgpr_null, 0
|
||||
S_WAITCNT_VSCNT undef $sgpr_null, 1
|
||||
S_WAITCNT_VSCNT undef $sgpr_null, 2
|
||||
S_BARRIER
|
||||
$vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
|
||||
FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
|
||||
S_ENDPGM 0
|
||||
...
|
||||
|
||||
---
|
||||
name: test_waitcnt_preexisting_vscnt_combined_both_types
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0_vgpr1, $vgpr2
|
||||
|
||||
; GFX10-LABEL: name: test_waitcnt_preexisting_vscnt_combined_both_types
|
||||
; GFX10: S_WAITCNT 0
|
||||
; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
|
||||
; GFX10: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
|
||||
; GFX10: S_WAITCNT 0
|
||||
; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
|
||||
; GFX10: S_BARRIER
|
||||
; GFX10: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
|
||||
; GFX10: S_WAITCNT 112
|
||||
; GFX10: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
|
||||
; GFX10: S_ENDPGM 0
|
||||
GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
|
||||
S_WAITCNT 0
|
||||
S_WAITCNT_VSCNT undef $sgpr_null, 1
|
||||
S_WAITCNT 0
|
||||
S_WAITCNT_VSCNT undef $sgpr_null, 2
|
||||
S_WAITCNT 0
|
||||
S_BARRIER
|
||||
$vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
|
||||
FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
|
||||
S_ENDPGM 0
|
||||
...
|
|
@ -1,37 +1,195 @@
|
|||
# RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GCN %s
|
||||
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
|
||||
# RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GFX9 %s
|
||||
|
||||
# GCN-LABEL: name: test{{$}}
|
||||
# GCN: S_WAITCNT -16257
|
||||
# GCN: DS_READ2_B32
|
||||
# GCN: DS_READ2_B32
|
||||
# GCN: S_WAITCNT 383{{$}}
|
||||
# GCN-NEXT: $vgpr1 = V_OR_B32_e32 1, killed $vgpr1, implicit $exec
|
||||
# GCN-NEXT: $vgpr1 = V_MAX_U32_e32 killed $vgpr0, killed $vgpr1, implicit $exec
|
||||
# GCN-NEXT: S_WAITCNT 127{{$}}
|
||||
# GCN-NEXT: $vgpr1 = V_MAX_U32_e32 killed $vgpr2, killed $vgpr1, implicit $exec
|
||||
--- |
|
||||
define amdgpu_cs void @test() {
|
||||
ret void
|
||||
}
|
||||
...
|
||||
---
|
||||
name: test
|
||||
name: test_waitcnt_preexisting_lgkmcnt_unmodified
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $sgpr0, $sgpr1, $vgpr0
|
||||
liveins: $vgpr0
|
||||
|
||||
renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX8_IMM renamable $sgpr0_sgpr1, 480, 0
|
||||
renamable $vgpr13 = V_LSHLREV_B32_e32 2, killed $vgpr0, implicit $exec
|
||||
S_WAITCNT -16257
|
||||
renamable $vgpr0_vgpr1 = DS_READ2_B32 renamable $vgpr13, 0, 1, 0, implicit $m0, implicit $exec
|
||||
renamable $vgpr2_vgpr3 = DS_READ2_B32 renamable $vgpr13, 2, 3, 0, implicit $m0, implicit $exec
|
||||
renamable $vgpr1 = V_OR_B32_e32 1, killed $vgpr1, implicit $exec
|
||||
renamable $vgpr1 = V_MAX_U32_e32 killed $vgpr0, killed $vgpr1, implicit $exec
|
||||
renamable $vgpr1 = V_MAX_U32_e32 killed $vgpr2, killed $vgpr1, implicit $exec
|
||||
renamable $vgpr1 = V_MAX_U32_e32 killed $vgpr3, killed $vgpr1, implicit $exec
|
||||
$vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $exec
|
||||
$vgpr2 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $exec
|
||||
$vgpr3 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $exec
|
||||
IMAGE_STORE_V4_V2 killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, implicit $exec :: (store 16)
|
||||
; GFX9-LABEL: name: test_waitcnt_preexisting_lgkmcnt_unmodified
|
||||
; GFX9: S_WAITCNT 0
|
||||
; GFX9: $vgpr0_vgpr1 = DS_READ2_B32 $vgpr0, 0, 1, 0, implicit $m0, implicit $exec
|
||||
; GFX9: S_WAITCNT 49279
|
||||
; GFX9: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
|
||||
; GFX9: S_WAITCNT 112
|
||||
; GFX9: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
|
||||
; GFX9: S_ENDPGM 0
|
||||
$vgpr0_vgpr1 = DS_READ2_B32 $vgpr0, 0, 1, 0, implicit $m0, implicit $exec
|
||||
S_WAITCNT 49279
|
||||
$vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
|
||||
FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
|
||||
S_ENDPGM 0
|
||||
...
|
||||
|
||||
---
|
||||
name: test_waitcnt_preexisting_vmcnt_unmodified
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0_vgpr1
|
||||
|
||||
; GFX9-LABEL: name: test_waitcnt_preexisting_vmcnt_unmodified
|
||||
; GFX9: S_WAITCNT 0
|
||||
; GFX9: $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec
|
||||
; GFX9: S_WAITCNT 3952
|
||||
; GFX9: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
|
||||
; GFX9: S_WAITCNT 112
|
||||
; GFX9: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
|
||||
; GFX9: S_ENDPGM 0
|
||||
$vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec
|
||||
S_WAITCNT 3952
|
||||
$vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
|
||||
FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
|
||||
S_ENDPGM 0
|
||||
...
|
||||
|
||||
# Respect preexisting waitcnt and add required wait.
|
||||
|
||||
---
|
||||
name: test_waitcnt_preexisting_vmcnt_needs_lgkmcnt
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0
|
||||
|
||||
; GFX9-LABEL: name: test_waitcnt_preexisting_vmcnt_needs_lgkmcnt
|
||||
; GFX9: S_WAITCNT 0
|
||||
; GFX9: $vgpr0_vgpr1 = DS_READ2_B32 $vgpr0, 0, 1, 0, implicit $m0, implicit $exec
|
||||
; GFX9: S_WAITCNT 112
|
||||
; GFX9: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
|
||||
; GFX9: S_WAITCNT 112
|
||||
; GFX9: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
|
||||
; GFX9: S_ENDPGM 0
|
||||
$vgpr0_vgpr1 = DS_READ2_B32 $vgpr0, 0, 1, 0, implicit $m0, implicit $exec
|
||||
S_WAITCNT 3952
|
||||
$vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
|
||||
FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
|
||||
S_ENDPGM 0
|
||||
...
|
||||
|
||||
---
|
||||
name: test_waitcnt_preexisting_lgkmcnt_needs_vmcnt
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0_vgpr1
|
||||
|
||||
; GFX9-LABEL: name: test_waitcnt_preexisting_lgkmcnt_needs_vmcnt
|
||||
; GFX9: S_WAITCNT 0
|
||||
; GFX9: $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec
|
||||
; GFX9: S_WAITCNT 112
|
||||
; GFX9: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
|
||||
; GFX9: S_WAITCNT 112
|
||||
; GFX9: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
|
||||
; GFX9: S_ENDPGM 0
|
||||
$vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec
|
||||
S_WAITCNT 49279
|
||||
$vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
|
||||
FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
|
||||
S_ENDPGM 0
|
||||
...
|
||||
|
||||
# Apply wait for all counters from preexisting waitcnt regardless of the wait
|
||||
# required by the next instruction.
|
||||
|
||||
---
|
||||
name: test_waitcnt_preexisting_apply_all_counters
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0_vgpr1, $vgpr2
|
||||
|
||||
; GFX9-LABEL: name: test_waitcnt_preexisting_apply_all_counters
|
||||
; GFX9: S_WAITCNT 0
|
||||
; GFX9: $vgpr4_vgpr5 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec
|
||||
; GFX9: $vgpr6_vgpr7 = DS_READ2_B32 $vgpr2, 0, 1, 0, implicit $m0, implicit $exec
|
||||
; GFX9: S_WAITCNT 0
|
||||
; GFX9: $vgpr6 = V_OR_B32_e32 1, killed $vgpr6, implicit $exec
|
||||
; GFX9: $vgpr0 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec, implicit $flat_scr
|
||||
; GFX9: S_WAITCNT 112
|
||||
; GFX9: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
|
||||
$vgpr4_vgpr5 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec
|
||||
$vgpr6_vgpr7 = DS_READ2_B32 $vgpr2, 0, 1, 0, implicit $m0, implicit $exec
|
||||
S_WAITCNT 0
|
||||
$vgpr6 = V_OR_B32_e32 1, killed $vgpr6, implicit $exec
|
||||
$vgpr0 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec, implicit $flat_scr
|
||||
FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
|
||||
...
|
||||
|
||||
---
|
||||
name: test_waitcnt_preexisting_combine_waitcnt
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0_vgpr1
|
||||
|
||||
; GFX9-LABEL: name: test_waitcnt_preexisting_combine_waitcnt
|
||||
; GFX9: S_WAITCNT 0
|
||||
; GFX9: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
|
||||
; GFX9: S_WAITCNT 0
|
||||
; GFX9: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
|
||||
$vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
|
||||
S_WAITCNT 0
|
||||
S_WAITCNT 0
|
||||
S_WAITCNT 0
|
||||
S_WAITCNT 0
|
||||
S_WAITCNT 0
|
||||
S_WAITCNT 0
|
||||
FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
|
||||
...
|
||||
|
||||
---
|
||||
name: test_waitcnt_preexisting_combine_waitcnt_diff_counters
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0_vgpr1
|
||||
|
||||
; GFX9-LABEL: name: test_waitcnt_preexisting_combine_waitcnt_diff_counters
|
||||
; GFX9: S_WAITCNT 0
|
||||
; GFX9: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
|
||||
; GFX9: S_WAITCNT 112
|
||||
; GFX9: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
|
||||
$vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
|
||||
S_WAITCNT 49279
|
||||
S_WAITCNT 3952
|
||||
FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
|
||||
...
|
||||
|
||||
# Apply preexisting waitcnt when no wait is immediately needed.
|
||||
# FIXME: Move waitcnt as late as possible.
|
||||
|
||||
---
|
||||
name: test_waitcnt_preexisting_early_wait
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0_vgpr1
|
||||
|
||||
; GFX9-LABEL: name: test_waitcnt_preexisting_early_wait
|
||||
; GFX9: S_WAITCNT 0
|
||||
; GFX9: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
|
||||
; GFX9: S_WAITCNT 0
|
||||
; GFX9: S_NOP 0
|
||||
; GFX9: S_NOP 0
|
||||
; GFX9: S_NOP 0
|
||||
; GFX9: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
|
||||
; GFX9: S_ENDPGM 0
|
||||
$vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
|
||||
S_WAITCNT 0
|
||||
S_NOP 0
|
||||
S_NOP 0
|
||||
S_NOP 0
|
||||
FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
|
||||
S_ENDPGM 0
|
||||
...
|
||||
|
||||
---
|
||||
name: test_waitcnt_preexisting_ignore_kill
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0_vgpr1
|
||||
|
||||
; GFX9-LABEL: name: test_waitcnt_preexisting_ignore_kill
|
||||
; GFX9: S_WAITCNT 0
|
||||
; GFX9: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
|
||||
; GFX9: S_WAITCNT 3952
|
||||
; GFX9: KILL $vgpr0
|
||||
$vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
|
||||
S_WAITCNT 3952
|
||||
KILL $vgpr0
|
||||
...
|
||||
|
|
Loading…
Reference in New Issue