forked from OSchip/llvm-project
AMDGPU: Force s_waitcnt after GWS instructions
The s_waitcnt is apparently required to be the instruction immediately following the GWS operation, so force the two into a bundle with the waitcnt directly after. llvm-svn: 366607
This commit is contained in:
parent
4e8c8aa959
commit
85f3890126
|
@ -317,13 +317,16 @@ class DS_GWS <string opName, dag ins, string asmOps>
|
|||
|
||||
// GWS pseudo with no data operand: only an offset immediate and the gds flag.
class DS_GWS_0D <string opName>
  : DS_GWS<opName,
    (ins offset:$offset, gds:$gds), "$offset gds"> {
  // Mark as having side effects so later passes cannot delete or reorder the
  // GWS op; per this commit, an s_waitcnt must be able to remain the
  // immediately following instruction (see bundleInstWithWaitcnt).
  let hasSideEffects = 1;
}

// GWS pseudo carrying a single VGPR data operand ($data0) in addition to the
// offset and gds flag.
class DS_GWS_1D <string opName>
  : DS_GWS<opName,
    (ins VGPR_32:$data0, offset:$offset, gds:$gds), "$data0$offset gds"> {
  // Flag consumed by the instruction encoding/selection to indicate the
  // data0 operand is present.
  let has_gws_data0 = 1;
  // Same rationale as DS_GWS_0D: keep the op fixed in place so the trailing
  // s_waitcnt bundle stays adjacent.
  let hasSideEffects = 1;
}
|
||||
|
||||
class DS_VOID <string opName> : DS_Pseudo<opName,
|
||||
|
|
|
@ -3069,6 +3069,20 @@ splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
|
|||
return std::make_pair(LoopBB, RemainderBB);
|
||||
}
|
||||
|
||||
/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
///
/// GWS instructions require the s_waitcnt to be the very next instruction
/// executed; wrapping the pair in a BUNDLE keeps schedulers and post-RA
/// passes from inserting anything between them.
void SITargetLowering::bundleInstWithWaitcnt(MachineInstr &MI) const {
  MachineBasicBlock *MBB = MI.getParent();
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  // [I, E) initially covers just MI; inserting the waitcnt before E grows
  // the range to exactly {MI, S_WAITCNT}.
  auto I = MI.getIterator();
  auto E = std::next(I);

  // s_waitcnt 0: wait for all outstanding counters to drain before anything
  // after the GWS op executes.
  BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
    .addImm(0);

  // Fuse the two instructions into a single BUNDLE so they stay adjacent
  // for the rest of the pipeline.
  MIBundleBuilder Bundler(*MBB, I, E);
  finalizeBundle(*MBB, Bundler.begin());
}
|
||||
|
||||
MachineBasicBlock *
|
||||
SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
|
||||
MachineBasicBlock *BB) const {
|
||||
|
@ -3108,8 +3122,7 @@ SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
|
|||
MRI.setSimpleHint(Data0, Src->getReg());
|
||||
}
|
||||
|
||||
BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_WAITCNT))
|
||||
.addImm(0);
|
||||
bundleInstWithWaitcnt(MI);
|
||||
|
||||
unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
|
||||
|
||||
|
@ -3828,8 +3841,12 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
|
|||
case AMDGPU::DS_GWS_SEMA_P:
|
||||
case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
|
||||
case AMDGPU::DS_GWS_BARRIER:
|
||||
if (getSubtarget()->hasGWSAutoReplay())
|
||||
// A s_waitcnt 0 is required to be the instruction immediately following.
|
||||
if (getSubtarget()->hasGWSAutoReplay()) {
|
||||
bundleInstWithWaitcnt(MI);
|
||||
return BB;
|
||||
}
|
||||
|
||||
return emitGWSMemViolTestLoop(MI, BB);
|
||||
default:
|
||||
return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
|
||||
|
|
|
@ -315,6 +315,7 @@ public:
|
|||
MachineBasicBlock *splitKillBlock(MachineInstr &MI,
|
||||
MachineBasicBlock *BB) const;
|
||||
|
||||
void bundleInstWithWaitcnt(MachineInstr &MI) const;
|
||||
MachineBasicBlock *emitGWSMemViolTestLoop(MachineInstr &MI,
|
||||
MachineBasicBlock *BB) const;
|
||||
|
||||
|
|
|
@ -1531,7 +1531,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
|
|||
break;
|
||||
}
|
||||
case TargetOpcode::BUNDLE: {
|
||||
if (!MI.mayLoad())
|
||||
if (!MI.mayLoad() || MI.hasUnmodeledSideEffects())
|
||||
return false;
|
||||
|
||||
// If it is a load it must be a memory clause
|
||||
|
|
|
@ -4,6 +4,11 @@
|
|||
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP %s
|
||||
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP,GFX10 %s
|
||||
|
||||
; Make sure the op is emitted bundled with a waitcnt with and without the retry loop, and the bundle is not removed by ExpandPostRAPseudos.
|
||||
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -stop-after=postrapseudos -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=MIR %s
|
||||
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -stop-after=postrapseudos -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=MIR %s
|
||||
|
||||
|
||||
; Minimum offset
|
||||
; GCN-LABEL: {{^}}gws_barrier_offset0:
|
||||
; NOLOOP-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
|
||||
|
@ -18,11 +23,19 @@
|
|||
; LOOP-NEXT: s_getreg_b32 [[GETREG:s[0-9]+]], hwreg(HW_REG_TRAPSTS, 8, 1)
|
||||
; LOOP-NEXT: s_cmp_lg_u32 [[GETREG]], 0
|
||||
; LOOP-NEXT: s_cbranch_scc1 [[LOOP]]
|
||||
|
||||
; MIR-LABEL: name: gws_barrier_offset0{{$}}
|
||||
; MIR: BUNDLE implicit{{( killed)?}} $vgpr0, implicit $m0, implicit $exec {
|
||||
; MIR-NEXT: DS_GWS_BARRIER $vgpr0, 1, -1, implicit $m0, implicit $exec :: (load 4 from custom GWSResource)
|
||||
; MIR-NEXT: S_WAITCNT 0
|
||||
; MIR-NEXT: }
|
||||
define amdgpu_kernel void @gws_barrier_offset0(i32 %val) #0 {
|
||||
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
; MIR-LABEL: name: gws_barrier_offset63{{$}}
|
||||
|
||||
; Maximum offset
|
||||
; GCN-LABEL: {{^}}gws_barrier_offset63:
|
||||
; NOLOOP-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
|
||||
|
@ -103,7 +116,7 @@ define amdgpu_kernel void @gws_barrier_save_m0_barrier_constant_offset(i32 %val)
|
|||
; Make sure this increments lgkmcnt
|
||||
; GCN-LABEL: {{^}}gws_barrier_lgkmcnt:
|
||||
; NOLOOP: ds_gws_barrier v0 offset:1 gds{{$}}
|
||||
; NOLOOP-NEXT: s_waitcnt expcnt(0) lgkmcnt(0)
|
||||
; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; NOLOOP-NEXT: s_setpc_b64
|
||||
define void @gws_barrier_lgkmcnt(i32 %val) {
|
||||
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 0)
|
||||
|
@ -122,7 +135,7 @@ define amdgpu_kernel void @gws_barrier_wait_before(i32 %val, i32 addrspace(1)* %
|
|||
|
||||
; GCN-LABEL: {{^}}gws_barrier_wait_after:
|
||||
; NOLOOP: ds_gws_barrier v0 offset:8 gds
|
||||
; NOLOOP-NEXT: s_waitcnt expcnt(0){{$}}
|
||||
; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; NOLOOP-NEXT: load_dword
|
||||
define amdgpu_kernel void @gws_barrier_wait_after(i32 %val, i32 addrspace(1)* %ptr) #0 {
|
||||
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
|
||||
|
@ -135,6 +148,7 @@ define amdgpu_kernel void @gws_barrier_wait_after(i32 %val, i32 addrspace(1)* %p
|
|||
; NOLOOP: store_dword
|
||||
; NOLOOP: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; NOLOOP: ds_gws_barrier v0 offset:8 gds
|
||||
; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
define amdgpu_kernel void @gws_barrier_fence_before(i32 %val, i32 addrspace(1)* %ptr) #0 {
|
||||
store i32 0, i32 addrspace(1)* %ptr
|
||||
fence release
|
||||
|
@ -142,9 +156,11 @@ define amdgpu_kernel void @gws_barrier_fence_before(i32 %val, i32 addrspace(1)*
|
|||
ret void
|
||||
}
|
||||
|
||||
; FIXME: Extra waitcnt
|
||||
; GCN-LABEL: {{^}}gws_barrier_fence_after:
|
||||
; NOLOOP: ds_gws_barrier v0 offset:8 gds
|
||||
; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; NOLOOP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; NOLOOP-NEXT: load_dword
|
||||
define amdgpu_kernel void @gws_barrier_fence_after(i32 %val, i32 addrspace(1)* %ptr) #0 {
|
||||
|
@ -158,7 +174,9 @@ define amdgpu_kernel void @gws_barrier_fence_after(i32 %val, i32 addrspace(1)* %
|
|||
; GCN-LABEL: {{^}}gws_init_barrier:
|
||||
; NOLOOP: s_mov_b32 m0, -1
|
||||
; NOLOOP: ds_gws_init v0 offset:8 gds
|
||||
; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; NOLOOP-NEXT: ds_gws_barrier v0 offset:8 gds
|
||||
; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
define amdgpu_kernel void @gws_init_barrier(i32 %val) #0 {
|
||||
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7)
|
||||
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
|
||||
|
@ -169,9 +187,11 @@ define amdgpu_kernel void @gws_init_barrier(i32 %val) #0 {
|
|||
; GCN-LABEL: {{^}}gws_init_fence_barrier:
|
||||
; NOLOOP: s_mov_b32 m0, -1
|
||||
; NOLOOP: ds_gws_init v0 offset:8 gds
|
||||
; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; NOLOOP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; NOLOOP-NEXT: ds_gws_barrier v0 offset:8 gds
|
||||
; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
define amdgpu_kernel void @gws_init_fence_barrier(i32 %val) #0 {
|
||||
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7)
|
||||
fence release
|
||||
|
|
|
@ -111,7 +111,7 @@ define amdgpu_kernel void @gws_init_save_m0_init_constant_offset(i32 %val) #0 {
|
|||
|
||||
; GCN-LABEL: {{^}}gws_init_lgkmcnt:
|
||||
; NOLOOP: ds_gws_init v0 offset:1 gds{{$}}
|
||||
; NOLOOP-NEXT: s_waitcnt expcnt(0) lgkmcnt(0)
|
||||
; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; NOLOOP-NEXT: s_setpc_b64
|
||||
define void @gws_init_lgkmcnt(i32 %val) {
|
||||
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 0)
|
||||
|
@ -120,8 +120,10 @@ define void @gws_init_lgkmcnt(i32 %val) {
|
|||
|
||||
; Does not imply memory fence on its own
|
||||
; GCN-LABEL: {{^}}gws_init_wait_before:
|
||||
; NOLOOP: s_waitcnt
|
||||
; NOLOOP: s_waitcnt lgkmcnt(0)
|
||||
; NOLOOP-NOT: s_waitcnt
|
||||
; NOLOOP: ds_gws_init
|
||||
; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
define amdgpu_kernel void @gws_init_wait_before(i32 %val, i32 addrspace(1)* %ptr) #0 {
|
||||
store i32 0, i32 addrspace(1)* %ptr
|
||||
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7)
|
||||
|
|
Loading…
Reference in New Issue