Reapply "AMDGPU: Add ds_gws_init / ds_gws_barrier intrinsics"

This reapplies r363678, using the correct chain for the CopyToReg for
v0. glueCopyToM0 counterintuitively changes the operands of the
original node.

llvm-svn: 363870
This commit is contained in:
Matt Arsenault 2019-06-19 19:55:27 +00:00
parent 709a769cdd
commit 4d55d024be
12 changed files with 685 additions and 21 deletions

View File

@ -1348,6 +1348,28 @@ def int_amdgcn_alignbyte : Intrinsic<[llvm_i32_ty],
[IntrNoMem, IntrSpeculatable]
>;
// llvm.amdgcn.ds.gws.init(i32 bar_val, i32 resource_id)
//
// bar_val is the total number of waves that will wait on this
// barrier, minus 1.
//
// resource_id selects the GWS resource to initialize. It does not have to be
// a compile-time constant.
//
// Declared convergent, and as only writing memory that is inaccessible to
// the IR.
def int_amdgcn_ds_gws_init :
GCCBuiltin<"__builtin_amdgcn_ds_gws_init">,
Intrinsic<[],
[llvm_i32_ty, llvm_i32_ty],
[IntrConvergent, IntrWriteMem, IntrInaccessibleMemOnly], "",
[SDNPMemOperand]
>;
// llvm.amdgcn.ds.gws.barrier(i32 vsrc0, i32 resource_id)
//
// vsrc0 (named bar_val in llvm.amdgcn.ds.gws.init) is the total number of
// waves that will wait on this barrier, minus 1.
//
// resource_id selects the GWS resource used as the barrier. It does not have
// to be a compile-time constant.
//
// Declared convergent and as only accessing memory that is inaccessible to
// the IR. Unlike ds.gws.init it is not marked IntrWriteMem.
def int_amdgcn_ds_gws_barrier :
GCCBuiltin<"__builtin_amdgcn_ds_gws_barrier">,
Intrinsic<[],
[llvm_i32_ty, llvm_i32_ty],
[IntrConvergent, IntrInaccessibleMemOnly], "",
[SDNPMemOperand]
>;
// Copies the source value to the destination value, with the guarantee that
// the source value is computed as if the entire program were executed in WQM.

View File

@ -218,7 +218,9 @@ private:
void SelectFMAD_FMA(SDNode *N);
void SelectATOMIC_CMP_SWAP(SDNode *N);
void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
// Select llvm.amdgcn.ds.gws.init / llvm.amdgcn.ds.gws.barrier into
// DS_GWS_INIT / DS_GWS_BARRIER; IntrID says which of the two N is.
void SelectDS_GWS(SDNode *N, unsigned IntrID);
// Custom selection for ISD::INTRINSIC_W_CHAIN nodes.
void SelectINTRINSIC_W_CHAIN(SDNode *N);
// Custom selection for ISD::INTRINSIC_VOID nodes; dispatches on the
// intrinsic ID.
void SelectINTRINSIC_VOID(SDNode *N);
protected:
// Include the pieces autogenerated from the target description.
@ -832,6 +834,10 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
SelectINTRINSIC_W_CHAIN(N);
return;
}
case ISD::INTRINSIC_VOID: {
SelectINTRINSIC_VOID(N);
return;
}
}
SelectCode(N);
@ -2034,6 +2040,72 @@ void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
}
// Select llvm.amdgcn.ds.gws.init / llvm.amdgcn.ds.gws.barrier (chosen by
// IntrID) into DS_GWS_INIT / DS_GWS_BARRIER.
//
// Operand 2 of N is the value that gets copied into v0, operand 3 is the
// resource id. The resource id is split between m0 and the instruction's
// immediate offset field.
//
// NOTE: the statement order in this function matters. glueCopyToM0 changes the
// operands of N, and the chain/glue for the copy to v0 are read back from N
// afterwards.
void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
SDLoc SL(N);
SDValue VSrc0 = N->getOperand(2);
SDValue BaseOffset = N->getOperand(3);
int ImmOffset = 0;
// Grab the memory operand up front; it is attached to the selected machine
// node at the end.
MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
MachineMemOperand *MMO = M->getMemOperand();
// Don't worry if the offset ends up in a VGPR. Only one lane will have
// effect, so SIFixSGPRCopies will validly insert readfirstlane.
// The resource id offset is computed as (<isa opaque base> + M0[21:16] +
// offset field) % 64. Some versions of the programming guide omit the m0
// part, or claim it's from offset 0.
if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
// If we have a constant offset, try to use the default value for m0 as a
// base to possibly avoid setting it up.
//
// With m0 = -1 the M0[21:16] field is 63, so the offset field is set to the
// requested id + 1: (63 + id + 1) % 64 == id % 64.
glueCopyToM0(N, CurDAG->getTargetConstant(-1, SL, MVT::i32));
ImmOffset = ConstOffset->getZExtValue() + 1;
} else {
// Variable resource id: peel off a constant addend (if any) into the
// immediate offset field and put the remaining base into m0.
if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
ImmOffset = BaseOffset.getConstantOperandVal(1);
BaseOffset = BaseOffset.getOperand(0);
}
// Prefer to do the shift in an SGPR since it should be possible to use m0
// as the result directly. If it's already an SGPR, it will be eliminated
// later.
SDNode *SGPROffset
= CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
BaseOffset);
// Shift to offset in m0
SDNode *M0Base
= CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
SDValue(SGPROffset, 0),
CurDAG->getTargetConstant(16, SL, MVT::i32));
glueCopyToM0(N, SDValue(M0Base, 0));
}
// The manual doesn't mention this, but it seems only v0 works.
SDValue V0 = CurDAG->getRegister(AMDGPU::VGPR0, MVT::i32);
// Both branches above called glueCopyToM0, which changes the operands of N.
// Operand 0 and the last operand are therefore read from N only now, and are
// used as the chain and glue of the copy to v0 (presumably they come from the
// copy to m0 at this point, which orders the v0 copy after it).
SDValue CopyToV0 = CurDAG->getCopyToReg(
N->getOperand(0), SL, V0, VSrc0,
N->getOperand(N->getNumOperands() - 1));
SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
// TODO: Can this just be removed from the instruction?
SDValue GDS = CurDAG->getTargetConstant(1, SL, MVT::i1);
unsigned Opc = IntrID == Intrinsic::amdgcn_ds_gws_init ?
AMDGPU::DS_GWS_INIT : AMDGPU::DS_GWS_BARRIER;
// The selected instruction is chained and glued to the copy to v0.
SDValue Ops[] = {
V0,
OffsetField,
GDS,
CopyToV0, // Chain
CopyToV0.getValue(1) // Glue
};
SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
}
void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
switch (IntrID) {
@ -2044,6 +2116,18 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
SelectDSAppendConsume(N, IntrID);
return;
}
}
SelectCode(N);
}
void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
switch (IntrID) {
case Intrinsic::amdgcn_ds_gws_init:
case Intrinsic::amdgcn_ds_gws_barrier:
SelectDS_GWS(N, IntrID);
return;
default:
break;
}

View File

@ -467,11 +467,15 @@ defm DS_WRXCHG_RTN_B64 : DS_1A1D_RET_mc<"ds_wrxchg_rtn_b64", VReg_64>;
defm DS_WRXCHG2_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2_rtn_b64", VReg_128, VReg_64>;
defm DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2st64_rtn_b64", VReg_128, VReg_64>;
def DS_GWS_INIT : DS_GWS_1D<"ds_gws_init">;
// All GWS instructions in this group are marked convergent.
let isConvergent = 1 in {
// Only ds_gws_init overrides mayLoad to 0; the other GWS instructions keep
// whatever DS_GWS_0D / DS_GWS_1D set. This matches the write-only
// (IntrWriteMem) declaration of llvm.amdgcn.ds.gws.init.
def DS_GWS_INIT : DS_GWS_1D<"ds_gws_init"> {
let mayLoad = 0;
}
def DS_GWS_SEMA_V : DS_GWS_0D<"ds_gws_sema_v">;
def DS_GWS_SEMA_BR : DS_GWS_1D<"ds_gws_sema_br">;
def DS_GWS_SEMA_P : DS_GWS_0D<"ds_gws_sema_p">;
def DS_GWS_BARRIER : DS_GWS_1D<"ds_gws_barrier">;
}
def DS_ADD_SRC2_U32 : DS_1A<"ds_add_src2_u32">;
def DS_SUB_SRC2_U32 : DS_1A<"ds_sub_src2_u32">;

View File

@ -961,6 +961,24 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return true;
}
case Intrinsic::amdgcn_ds_gws_init:
case Intrinsic::amdgcn_ds_gws_barrier: {
Info.opc = ISD::INTRINSIC_VOID;
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
Info.ptrVal =
MFI->getGWSPSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo());
// This is an abstract access, but we need to specify a type and size.
Info.memVT = MVT::i32;
Info.size = 4;
Info.align = 4;
Info.flags = MachineMemOperand::MOStore;
if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
Info.flags = MachineMemOperand::MOLoad;
return true;
}
default:
return false;
}

View File

@ -536,15 +536,19 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
// Put score on the source vgprs. If this is a store, just use those
// specific register(s).
if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
int AddrOpIdx =
AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr);
// All GDS operations must protect their address register (same as
// export.)
if (Inst.getOpcode() != AMDGPU::DS_APPEND &&
Inst.getOpcode() != AMDGPU::DS_CONSUME) {
setExpScore(
&Inst, TII, TRI, MRI,
AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr),
CurrScore);
if (AddrOpIdx != -1) {
setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore);
} else {
assert(Inst.getOpcode() == AMDGPU::DS_APPEND ||
Inst.getOpcode() == AMDGPU::DS_CONSUME ||
Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER);
}
if (Inst.mayStore()) {
if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
AMDGPU::OpName::data0) != -1) {
@ -1407,18 +1411,6 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
ScoreBrackets.dump();
});
// Check to see if this is a GWS instruction. If so, and if this is CI or
// VI, then the generated code sequence will include an S_WAITCNT 0.
// TODO: Are these the only GWS instructions?
if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V ||
Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P ||
Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
// TODO: && context->target_info->GwsRequiresMemViolTest() ) {
ScoreBrackets.applyWaitcnt(AMDGPU::Waitcnt::allZeroExceptVsCnt());
}
// TODO: Remove this work-around after fixing the scheduler and enable the
// assert above.
if (VCCZBugWorkAround) {

View File

@ -2547,7 +2547,8 @@ bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
// given the typical code patterns.
if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
Opcode == AMDGPU::EXP || Opcode == AMDGPU::EXP_DONE ||
Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP)
Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP ||
Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER)
return true;
if (MI.isCall() || MI.isInlineAsm())

View File

@ -43,7 +43,8 @@ class AMDGPUPseudoSourceValue : public PseudoSourceValue {
public:
enum AMDGPUPSVKind : unsigned {
PSVBuffer = PseudoSourceValue::TargetCustom,
PSVImage
PSVImage,
GWSResource
};
protected:
@ -87,6 +88,30 @@ public:
}
};
class AMDGPUGWSResourcePseudoSourceValue final : public AMDGPUPseudoSourceValue {
public:
explicit AMDGPUGWSResourcePseudoSourceValue(const TargetInstrInfo &TII)
: AMDGPUPseudoSourceValue(GWSResource, TII) {}
static bool classof(const PseudoSourceValue *V) {
return V->kind() == GWSResource;
}
// These are inaccessible memory from IR.
bool isAliased(const MachineFrameInfo *) const override {
return false;
}
// These are inaccessible memory from IR.
bool mayAlias(const MachineFrameInfo *) const override {
return false;
}
void printCustom(raw_ostream &OS) const override {
OS << "GWSResource";
}
};
namespace yaml {
struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
@ -188,6 +213,7 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
std::unique_ptr<const AMDGPUBufferPseudoSourceValue>> BufferPSVs;
DenseMap<const Value *,
std::unique_ptr<const AMDGPUImagePseudoSourceValue>> ImagePSVs;
std::unique_ptr<const AMDGPUGWSResourcePseudoSourceValue> GWSResourcePSV;
private:
unsigned LDSWaveSpillSize = 0;
@ -674,6 +700,15 @@ public:
return PSV.first->second.get();
}
/// Returns the cached GWS resource pseudo source value, creating it from
/// \p TII on the first call. Later calls return the same object.
const AMDGPUGWSResourcePseudoSourceValue *getGWSPSV(const SIInstrInfo &TII) {
  if (GWSResourcePSV)
    return GWSResourcePSV.get();

  GWSResourcePSV = llvm::make_unique<AMDGPUGWSResourcePseudoSourceValue>(TII);
  return GWSResourcePSV.get();
}
// Returns the current value of the Occupancy member.
unsigned getOccupancy() const {
return Occupancy;
}

View File

@ -0,0 +1,103 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX9 %s
# RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=VI %s
# RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=CI %s
# RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=SI %s
# m0 is written by S_MOV_B32 directly before the DS_GWS_INIT that reads it.
# The GFX9 and VI check lines expect an S_NOP 0 to be inserted between the
# two; the CI and SI check lines expect the sequence to be left unchanged.
---
name: m0_gws_init0
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0
; GFX9-LABEL: name: m0_gws_init0
; GFX9: liveins: $vgpr0
; GFX9: $m0 = S_MOV_B32 -1
; GFX9: S_NOP 0
; GFX9: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
; VI-LABEL: name: m0_gws_init0
; VI: liveins: $vgpr0
; VI: $m0 = S_MOV_B32 -1
; VI: S_NOP 0
; VI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
; CI-LABEL: name: m0_gws_init0
; CI: liveins: $vgpr0
; CI: $m0 = S_MOV_B32 -1
; CI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
; SI-LABEL: name: m0_gws_init0
; SI: liveins: $vgpr0
; SI: $m0 = S_MOV_B32 -1
; SI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
$m0 = S_MOV_B32 -1
DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
...
# Like m0_gws_init0, but $vgpr0 is defined by a V_MOV_B32 ahead of the m0
# write instead of being live-in. The m0 write is still the instruction
# directly before DS_GWS_INIT, so the same S_NOP 0 is expected for GFX9 and VI
# and none for CI and SI.
---
name: m0_gws_init1
tracksRegLiveness: true
body: |
bb.0:
; GFX9-LABEL: name: m0_gws_init1
; GFX9: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX9: $m0 = S_MOV_B32 -1
; GFX9: S_NOP 0
; GFX9: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
; VI-LABEL: name: m0_gws_init1
; VI: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; VI: $m0 = S_MOV_B32 -1
; VI: S_NOP 0
; VI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
; CI-LABEL: name: m0_gws_init1
; CI: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; CI: $m0 = S_MOV_B32 -1
; CI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
; SI-LABEL: name: m0_gws_init1
; SI: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; SI: $m0 = S_MOV_B32 -1
; SI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
$m0 = S_MOV_B32 -1
DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
...
# Test a typical situation where m0 needs to be set from a VGPR
# through readfirstlane
---
name: m0_gws_readlane
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1
; GFX9-LABEL: name: m0_gws_readlane
; GFX9: liveins: $vgpr0, $vgpr1
; GFX9: $sgpr0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
; GFX9: $m0 = S_MOV_B32 $sgpr0
; GFX9: S_NOP 0
; GFX9: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
; VI-LABEL: name: m0_gws_readlane
; VI: liveins: $vgpr0, $vgpr1
; VI: $sgpr0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
; VI: $m0 = S_MOV_B32 $sgpr0
; VI: S_NOP 0
; VI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
; CI-LABEL: name: m0_gws_readlane
; CI: liveins: $vgpr0, $vgpr1
; CI: $sgpr0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
; CI: $m0 = S_MOV_B32 $sgpr0
; CI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
; SI-LABEL: name: m0_gws_readlane
; SI: liveins: $vgpr0, $vgpr1
; SI: $sgpr0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
; SI: $m0 = S_MOV_B32 $sgpr0
; SI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
$sgpr0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
$m0 = S_MOV_B32 $sgpr0
DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
...

View File

@ -0,0 +1,59 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass si-insert-skips -amdgpu-skip-threshold=1 -verify-machineinstrs %s -o - | FileCheck %s
# Make sure mandatory skips are inserted to ensure GWS ops aren't run with exec = 0
---
name: skip_gws_init
body: |
; CHECK-LABEL: name: skip_gws_init
; CHECK: bb.0:
; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK: SI_MASK_BRANCH %bb.2, implicit $exec
; CHECK: S_CBRANCH_EXECZ %bb.2, implicit $exec
; CHECK: bb.1:
; CHECK: successors: %bb.2(0x80000000)
; CHECK: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; CHECK: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
; CHECK: bb.2:
; CHECK: S_ENDPGM 0
bb.0:
successors: %bb.1, %bb.2
SI_MASK_BRANCH %bb.2, implicit $exec
bb.1:
successors: %bb.2
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
bb.2:
S_ENDPGM 0
...
---
name: skip_gws_barrier
body: |
; CHECK-LABEL: name: skip_gws_barrier
; CHECK: bb.0:
; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK: SI_MASK_BRANCH %bb.2, implicit $exec
; CHECK: S_CBRANCH_EXECZ %bb.2, implicit $exec
; CHECK: bb.1:
; CHECK: successors: %bb.2(0x80000000)
; CHECK: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; CHECK: DS_GWS_BARRIER $vgpr0, 0, 1, implicit $m0, implicit $exec
; CHECK: bb.2:
; CHECK: S_ENDPGM 0
bb.0:
successors: %bb.1, %bb.2
SI_MASK_BRANCH %bb.2, implicit $exec
bb.1:
successors: %bb.2
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
DS_GWS_BARRIER $vgpr0, 0, 1, implicit $m0, implicit $exec
bb.2:
S_ENDPGM 0
...

View File

@ -0,0 +1,179 @@
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -mattr=+flat-for-global -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIPLUS %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIPLUS %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIPLUS %s
; Minimum offset
; GCN-LABEL: {{^}}gws_barrier_offset0:
; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
; GCN-DAG: s_mov_b32 m0, -1{{$}}
; GCN: v_mov_b32_e32 v0, [[BAR_NUM]]
; GCN: ds_gws_barrier v0 offset:1 gds{{$}}
define amdgpu_kernel void @gws_barrier_offset0(i32 %val) #0 {
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 0)
ret void
}
; Maximum offset
; GCN-LABEL: {{^}}gws_barrier_offset63:
; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
; GCN-DAG: s_mov_b32 m0, -1{{$}}
; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
; GCN: ds_gws_barrier v0 offset:64 gds{{$}}
define amdgpu_kernel void @gws_barrier_offset63(i32 %val) #0 {
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 63)
ret void
}
; FIXME: Should be able to shift directly into m0
; GCN-LABEL: {{^}}gws_barrier_sgpr_offset:
; GCN-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}}
; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
; GCN-DAG: v_mov_b32_e32 v0, s[[BAR_NUM]]
; GCN: ds_gws_barrier v0 gds{{$}}
define amdgpu_kernel void @gws_barrier_sgpr_offset(i32 %val, i32 %offset) #0 {
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %offset)
ret void
}
; Variable offset in SGPR with constant add
; GCN-LABEL: {{^}}gws_barrier_sgpr_offset_add1:
; GCN-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}}
; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
; GCN-DAG: v_mov_b32_e32 v0, s[[BAR_NUM]]
; GCN: ds_gws_barrier v0 offset:1 gds{{$}}
define amdgpu_kernel void @gws_barrier_sgpr_offset_add1(i32 %val, i32 %offset.base) #0 {
%offset = add i32 %offset.base, 1
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %offset)
ret void
}
; Variable offset in VGPR. The offset is expected to be moved to an SGPR with
; v_readfirstlane_b32 and shifted left by 16 before it is written to m0.
; GCN-LABEL: {{^}}gws_barrier_vgpr_offset:
; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
; GCN-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
; GCN: ds_gws_barrier v0 gds{{$}}
define amdgpu_kernel void @gws_barrier_vgpr_offset(i32 %val) #0 {
%vgpr.offset = call i32 @llvm.amdgcn.workitem.id.x()
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %vgpr.offset)
ret void
}
; Variable offset in VGPR with constant add
; GCN-LABEL: {{^}}gws_barrier_vgpr_offset_add:
; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
; GCN-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
; GCN: ds_gws_barrier v0 offset:3 gds{{$}}
define amdgpu_kernel void @gws_barrier_vgpr_offset_add(i32 %val) #0 {
%vgpr.offset.base = call i32 @llvm.amdgcn.workitem.id.x()
%vgpr.offset = add i32 %vgpr.offset.base, 3
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %vgpr.offset)
ret void
}
@lds = internal unnamed_addr addrspace(3) global i32 undef
; Check if m0 initialization is shared
; GCN-LABEL: {{^}}gws_barrier_save_m0_barrier_constant_offset:
; GCN: s_mov_b32 m0, -1
; GCN-NOT: s_mov_b32 m0
define amdgpu_kernel void @gws_barrier_save_m0_barrier_constant_offset(i32 %val) #0 {
store i32 1, i32 addrspace(3)* @lds
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 10)
store i32 2, i32 addrspace(3)* @lds
ret void
}
; Make sure this increments lgkmcnt
; GCN-LABEL: {{^}}gws_barrier_lgkmcnt:
; GCN: ds_gws_barrier v0 offset:1 gds{{$}}
; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_setpc_b64
define void @gws_barrier_lgkmcnt(i32 %val) {
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 0)
ret void
}
; Does not imply memory fence on its own
; GCN-LABEL: {{^}}gws_barrier_wait_before:
; GCN: store_dword
; CIPLUS-NOT: s_waitcnt
; GCN: ds_gws_barrier v0 offset:8 gds
define amdgpu_kernel void @gws_barrier_wait_before(i32 %val, i32 addrspace(1)* %ptr) #0 {
store i32 0, i32 addrspace(1)* %ptr
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
ret void
}
; Without a fence, the only wait expected between the barrier and a following
; load is on expcnt.
; GCN-LABEL: {{^}}gws_barrier_wait_after:
; GCN: ds_gws_barrier v0 offset:8 gds
; GCN-NEXT: s_waitcnt expcnt(0){{$}}
; GCN-NEXT: load_dword
define amdgpu_kernel void @gws_barrier_wait_after(i32 %val, i32 addrspace(1)* %ptr) #0 {
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
%load = load volatile i32, i32 addrspace(1)* %ptr
ret void
}
; An explicit release fence before the barrier forces a wait on the preceding store
; GCN-LABEL: {{^}}gws_barrier_fence_before:
; GCN: store_dword
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN: ds_gws_barrier v0 offset:8 gds
define amdgpu_kernel void @gws_barrier_fence_before(i32 %val, i32 addrspace(1)* %ptr) #0 {
store i32 0, i32 addrspace(1)* %ptr
fence release
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
ret void
}
; With a release fence after the barrier, vmcnt, expcnt and lgkmcnt are all
; expected to be waited on before the following load.
; GCN-LABEL: {{^}}gws_barrier_fence_after:
; GCN: ds_gws_barrier v0 offset:8 gds
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: load_dword
define amdgpu_kernel void @gws_barrier_fence_after(i32 %val, i32 addrspace(1)* %ptr) #0 {
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
fence release
%load = load volatile i32, i32 addrspace(1)* %ptr
ret void
}
; FIXME: Should a wait be inserted here, or is an explicit fence needed?
; GCN-LABEL: {{^}}gws_init_barrier:
; GCN: s_mov_b32 m0, -1
; GCN: ds_gws_init v0 offset:8 gds
; GCN-NEXT: ds_gws_barrier v0 offset:8 gds
define amdgpu_kernel void @gws_init_barrier(i32 %val) #0 {
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7)
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
ret void
}
; FIXME: Why vmcnt, not expcnt?
; GCN-LABEL: {{^}}gws_init_fence_barrier:
; GCN: s_mov_b32 m0, -1
; GCN: ds_gws_init v0 offset:8 gds
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: ds_gws_barrier v0 offset:8 gds
define amdgpu_kernel void @gws_init_fence_barrier(i32 %val) #0 {
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7)
fence release
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
ret void
}
declare void @llvm.amdgcn.ds.gws.barrier(i32, i32) #1
declare void @llvm.amdgcn.ds.gws.init(i32, i32) #2
declare i32 @llvm.amdgcn.workitem.id.x() #3
attributes #0 = { nounwind }
attributes #1 = { convergent inaccessiblememonly nounwind }
attributes #2 = { convergent inaccessiblememonly nounwind writeonly }
attributes #3 = { nounwind readnone speculatable }

View File

@ -0,0 +1,119 @@
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -mattr=+flat-for-global -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIPLUS %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIPLUS %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIPLUS %s
; The per-target prefixes mirror the ds.gws.barrier test. Without them the
; CIPLUS-NOT check in gws_init_wait_before is never evaluated.
; Minimum offset
; GCN-LABEL: {{^}}gws_init_offset0:
; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
; GCN-DAG: s_mov_b32 m0, -1{{$}}
; GCN: v_mov_b32_e32 v0, [[BAR_NUM]]
; GCN: ds_gws_init v0 offset:1 gds{{$}}
define amdgpu_kernel void @gws_init_offset0(i32 %val) #0 {
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 0)
ret void
}
; Maximum offset
; GCN-LABEL: {{^}}gws_init_offset63:
; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
; GCN-DAG: s_mov_b32 m0, -1{{$}}
; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
; GCN: ds_gws_init v0 offset:64 gds{{$}}
define amdgpu_kernel void @gws_init_offset63(i32 %val) #0 {
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 63)
ret void
}
; FIXME: Should be able to shift directly into m0
; GCN-LABEL: {{^}}gws_init_sgpr_offset:
; GCN-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}}
; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
; GCN-DAG: v_mov_b32_e32 v0, s[[BAR_NUM]]
; GCN: ds_gws_init v0 gds{{$}}
define amdgpu_kernel void @gws_init_sgpr_offset(i32 %val, i32 %offset) #0 {
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %offset)
ret void
}
; Variable offset in SGPR with constant add
; GCN-LABEL: {{^}}gws_init_sgpr_offset_add1:
; GCN-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}}
; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
; GCN-DAG: v_mov_b32_e32 v0, s[[BAR_NUM]]
; GCN: ds_gws_init v0 offset:1 gds{{$}}
define amdgpu_kernel void @gws_init_sgpr_offset_add1(i32 %val, i32 %offset.base) #0 {
%offset = add i32 %offset.base, 1
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %offset)
ret void
}
; Variable offset in VGPR. The offset is expected to be moved to an SGPR with
; v_readfirstlane_b32 and shifted left by 16 before it is written to m0.
; GCN-LABEL: {{^}}gws_init_vgpr_offset:
; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
; GCN-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
; GCN: ds_gws_init v0 gds{{$}}
define amdgpu_kernel void @gws_init_vgpr_offset(i32 %val) #0 {
%vgpr.offset = call i32 @llvm.amdgcn.workitem.id.x()
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %vgpr.offset)
ret void
}
; Variable offset in VGPR with constant add
; GCN-LABEL: {{^}}gws_init_vgpr_offset_add:
; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
; GCN-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
; GCN: ds_gws_init v0 offset:3 gds{{$}}
define amdgpu_kernel void @gws_init_vgpr_offset_add(i32 %val) #0 {
%vgpr.offset.base = call i32 @llvm.amdgcn.workitem.id.x()
%vgpr.offset = add i32 %vgpr.offset.base, 3
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %vgpr.offset)
ret void
}
@lds = internal unnamed_addr addrspace(3) global i32 undef
; Check if m0 initialization is shared.
; GCN-LABEL: {{^}}gws_init_save_m0_init_constant_offset:
; GCN: s_mov_b32 m0, -1
; GCN-NOT: s_mov_b32 m0
define amdgpu_kernel void @gws_init_save_m0_init_constant_offset(i32 %val) #0 {
store i32 1, i32 addrspace(3)* @lds
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 10)
store i32 2, i32 addrspace(3)* @lds
ret void
}
; Make sure this increments lgkmcnt: a wait on lgkmcnt is expected between
; the ds_gws_init and the return.
; GCN-LABEL: {{^}}gws_init_lgkmcnt:
; GCN: ds_gws_init v0 offset:1 gds{{$}}
; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_setpc_b64
define void @gws_init_lgkmcnt(i32 %val) {
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 0)
ret void
}
; Does not imply memory fence on its own
; GCN-LABEL: {{^}}gws_init_wait_before:
; GCN: store_dword
; CIPLUS-NOT: s_waitcnt
; GCN: ds_gws_init v0 offset:8 gds
define amdgpu_kernel void @gws_init_wait_before(i32 %val, i32 addrspace(1)* %ptr) #0 {
store i32 0, i32 addrspace(1)* %ptr
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7)
ret void
}
declare void @llvm.amdgcn.ds.gws.init(i32, i32) #1
declare i32 @llvm.amdgcn.workitem.id.x() #2
attributes #0 = { nounwind }
attributes #1 = { convergent inaccessiblememonly nounwind writeonly }
attributes #2 = { nounwind readnone speculatable }

View File

@ -6,6 +6,8 @@
declare void @nonconvergent_func() #0
declare void @convergent_func() #1
declare void @llvm.amdgcn.s.barrier() #1
declare void @llvm.amdgcn.ds.gws.init(i32, i32) #2
declare void @llvm.amdgcn.ds.gws.barrier(i32, i32) #2
; barrier shouldn't be duplicated.
@ -100,6 +102,52 @@ call:
ret void
}
; ds_gws_init is convergent, so the call block must not be tail-duplicated
; into its two predecessors. Exactly one ds_gws_init is expected.
; GCN-LABEL: {{^}}taildup_gws_init:
; GCN: ds_gws_init
; GCN-NOT: ds_gws_init
define amdgpu_kernel void @taildup_gws_init(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i1 %cond, i32 %val, i32 %offset) #0 {
entry:
br i1 %cond, label %bb1, label %bb2
bb1:
store i32 0, i32 addrspace(1)* %a
br label %call
bb2:
store i32 1, i32 addrspace(1)* %a
br label %call
call:
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %offset)
br label %ret
ret:
ret void
}
; ds_gws_barrier is convergent, so the call block must not be tail-duplicated
; into its two predecessors. Exactly one ds_gws_barrier is expected.
; GCN-LABEL: {{^}}taildup_gws_barrier:
; GCN: ds_gws_barrier
; GCN-NOT: ds_gws_barrier
define amdgpu_kernel void @taildup_gws_barrier(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i1 %cond, i32 %val, i32 %offset) #0 {
entry:
br i1 %cond, label %bb1, label %bb2
bb1:
store i32 0, i32 addrspace(1)* %a
br label %call
bb2:
store i32 1, i32 addrspace(1)* %a
br label %call
call:
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %offset)
br label %ret
ret:
ret void
}
attributes #0 = { nounwind }
attributes #1 = { nounwind convergent }
attributes #2 = { convergent inaccessiblememonly nounwind }