forked from OSchip/llvm-project
Reapply "AMDGPU: Add ds_gws_init / ds_gws_barrier intrinsics"
This reapplies r363678, using the correct chain for the CopyToReg for v0. glueCopyToM0 counterintuitively changes the operands of the original node. llvm-svn: 363870
This commit is contained in:
parent
709a769cdd
commit
4d55d024be
|
@ -1348,6 +1348,28 @@ def int_amdgcn_alignbyte : Intrinsic<[llvm_i32_ty],
|
|||
[IntrNoMem, IntrSpeculatable]
|
||||
>;
|
||||
|
||||
// llvm.amdgcn.ds.gws.init(i32 bar_val, i32 resource_id)
|
||||
//
|
||||
// bar_val is the total number of waves that will wait on this
|
||||
// barrier, minus 1.
//
// resource_id selects which GWS resource is initialized.
// NOTE(review): the accompanying tests only exercise ids 0..63 -- confirm
// the valid range against the ISA documentation.
//
// The intrinsic returns nothing and is declared convergent, write-only and
// inaccessible-memory-only; SDNPMemOperand lets the selected node carry a
// memory operand.
|
||||
def int_amdgcn_ds_gws_init :
|
||||
GCCBuiltin<"__builtin_amdgcn_ds_gws_init">,
|
||||
Intrinsic<[],
|
||||
[llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrConvergent, IntrWriteMem, IntrInaccessibleMemOnly], "",
|
||||
[SDNPMemOperand]
|
||||
>;
|
||||
|
||||
// llvm.amdgcn.ds.gws.barrier(i32 vsrc0, i32 resource_id)
//
|
||||
// vsrc0 carries the barrier value (called bar_val on ds.gws.init): the
// total number of waves that will wait on this
|
||||
// barrier, minus 1.
// NOTE(review): the original comment said "bar_val" although the operand is
// named vsrc0 in the signature above -- confirm both names refer to the same
// value.
//
// Unlike ds.gws.init this intrinsic is not marked IntrWriteMem; it is
// convergent and inaccessible-memory-only.
|
||||
def int_amdgcn_ds_gws_barrier :
|
||||
GCCBuiltin<"__builtin_amdgcn_ds_gws_barrier">,
|
||||
Intrinsic<[],
|
||||
[llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrConvergent, IntrInaccessibleMemOnly], "",
|
||||
[SDNPMemOperand]
|
||||
>;
|
||||
|
||||
// Copies the source value to the destination value, with the guarantee that
|
||||
// the source value is computed as if the entire program were executed in WQM.
|
||||
|
|
|
@ -218,7 +218,9 @@ private:
|
|||
void SelectFMAD_FMA(SDNode *N);
|
||||
void SelectATOMIC_CMP_SWAP(SDNode *N);
|
||||
void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
|
||||
void SelectDS_GWS(SDNode *N, unsigned IntrID);
|
||||
void SelectINTRINSIC_W_CHAIN(SDNode *N);
|
||||
void SelectINTRINSIC_VOID(SDNode *N);
|
||||
|
||||
protected:
|
||||
// Include the pieces autogenerated from the target description.
|
||||
|
@ -832,6 +834,10 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
|
|||
SelectINTRINSIC_W_CHAIN(N);
|
||||
return;
|
||||
}
|
||||
case ISD::INTRINSIC_VOID: {
|
||||
SelectINTRINSIC_VOID(N);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
SelectCode(N);
|
||||
|
@ -2034,6 +2040,72 @@ void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
|
|||
CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
|
||||
}
|
||||
|
||||
/// Hand-select llvm.amdgcn.ds.gws.init / llvm.amdgcn.ds.gws.barrier.
///
/// \param N       The intrinsic node being selected. Operand 2 is the value
///                operand and operand 3 is the resource id.
/// \param IntrID  Intrinsic::amdgcn_ds_gws_init selects DS_GWS_INIT; any
///                other id selects DS_GWS_BARRIER.
///
/// The resource id is split between m0 and the instruction's immediate
/// offset field, and the value operand is copied into v0 before the
/// instruction. \p N is morphed in place into the selected machine node and
/// keeps its memory operand.
void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
  SDLoc DL(N);

  // Capture everything we need from N before glueCopyToM0 touches it.
  SDValue DataVal = N->getOperand(2);
  SDValue ResourceBase = N->getOperand(3);
  MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand();
  int OffsetImm = 0;

  // An offset that ends up in a VGPR is not a problem: only one lane has any
  // effect, so SIFixSGPRCopies may legitimately insert a readfirstlane.

  // The hardware computes the resource id offset as
  //   (<isa opaque base> + M0[21:16] + offset field) % 64.
  // Some versions of the programming guide leave out the m0 term, or claim
  // it comes from offset 0.
  if (auto *ConstId = dyn_cast<ConstantSDNode>(ResourceBase)) {
    // Constant id: reuse the default m0 value (-1) as the base so that m0
    // possibly does not need to be set up separately. With m0 = -1 the
    // M0[21:16] term is 63, so the offset field is bumped by one to cancel it
    // out modulo 64.
    SDValue DefaultM0 = CurDAG->getTargetConstant(-1, DL, MVT::i32);
    glueCopyToM0(N, DefaultM0);
    OffsetImm = ConstId->getZExtValue() + 1;
  } else {
    // Variable id: peel off a constant addend (if any) into the offset field
    // and route the remaining base through m0.
    if (CurDAG->isBaseWithConstantOffset(ResourceBase)) {
      OffsetImm = ResourceBase.getConstantOperandVal(1);
      ResourceBase = ResourceBase.getOperand(0);
    }

    // Do the shift on an SGPR so that m0 can ideally be used as the result
    // directly. If the base already lives in an SGPR the readfirstlane gets
    // eliminated later.
    SDNode *UniformBase = CurDAG->getMachineNode(
        AMDGPU::V_READFIRSTLANE_B32, DL, MVT::i32, ResourceBase);

    // Move the id into the M0[21:16] position.
    SDValue ShiftAmt = CurDAG->getTargetConstant(16, DL, MVT::i32);
    SDNode *ShiftedBase = CurDAG->getMachineNode(
        AMDGPU::S_LSHL_B32, DL, MVT::i32, SDValue(UniformBase, 0), ShiftAmt);

    glueCopyToM0(N, SDValue(ShiftedBase, 0));
  }

  // The manual doesn't mention this, but it seems only v0 works.
  SDValue V0 = CurDAG->getRegister(AMDGPU::VGPR0, MVT::i32);

  // glueCopyToM0 rewrites N's operands, so the chain and the trailing glue
  // operand must be re-read from N here rather than cached earlier.
  SDValue ChainAfterM0 = N->getOperand(0);
  SDValue GlueAfterM0 = N->getOperand(N->getNumOperands() - 1);
  SDValue CopyToV0 =
      CurDAG->getCopyToReg(ChainAfterM0, DL, V0, DataVal, GlueAfterM0);

  SDValue OffsetField = CurDAG->getTargetConstant(OffsetImm, DL, MVT::i32);

  // TODO: Can this just be removed from the instruction?
  SDValue GDS = CurDAG->getTargetConstant(1, DL, MVT::i1);

  unsigned Opc;
  if (IntrID == Intrinsic::amdgcn_ds_gws_init)
    Opc = AMDGPU::DS_GWS_INIT;
  else
    Opc = AMDGPU::DS_GWS_BARRIER;

  SDValue Ops[] = {
    V0,
    OffsetField,
    GDS,
    CopyToV0,            // Chain
    CopyToV0.getValue(1) // Glue
  };

  SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MemOp});
}
|
||||
|
||||
void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
|
||||
unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
|
||||
switch (IntrID) {
|
||||
|
@ -2044,6 +2116,18 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
|
|||
SelectDSAppendConsume(N, IntrID);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
SelectCode(N);
|
||||
}
|
||||
|
||||
void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
|
||||
unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
|
||||
switch (IntrID) {
|
||||
case Intrinsic::amdgcn_ds_gws_init:
|
||||
case Intrinsic::amdgcn_ds_gws_barrier:
|
||||
SelectDS_GWS(N, IntrID);
|
||||
return;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -467,11 +467,15 @@ defm DS_WRXCHG_RTN_B64 : DS_1A1D_RET_mc<"ds_wrxchg_rtn_b64", VReg_64>;
|
|||
defm DS_WRXCHG2_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2_rtn_b64", VReg_128, VReg_64>;
|
||||
defm DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2st64_rtn_b64", VReg_128, VReg_64>;
|
||||
|
||||
def DS_GWS_INIT : DS_GWS_1D<"ds_gws_init">;
|
||||
let isConvergent = 1 in {
|
||||
def DS_GWS_INIT : DS_GWS_1D<"ds_gws_init"> {
|
||||
let mayLoad = 0;
|
||||
}
|
||||
def DS_GWS_SEMA_V : DS_GWS_0D<"ds_gws_sema_v">;
|
||||
def DS_GWS_SEMA_BR : DS_GWS_1D<"ds_gws_sema_br">;
|
||||
def DS_GWS_SEMA_P : DS_GWS_0D<"ds_gws_sema_p">;
|
||||
def DS_GWS_BARRIER : DS_GWS_1D<"ds_gws_barrier">;
|
||||
}
|
||||
|
||||
def DS_ADD_SRC2_U32 : DS_1A<"ds_add_src2_u32">;
|
||||
def DS_SUB_SRC2_U32 : DS_1A<"ds_sub_src2_u32">;
|
||||
|
|
|
@ -961,6 +961,24 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
|
|||
|
||||
return true;
|
||||
}
|
||||
case Intrinsic::amdgcn_ds_gws_init:
|
||||
case Intrinsic::amdgcn_ds_gws_barrier: {
|
||||
Info.opc = ISD::INTRINSIC_VOID;
|
||||
|
||||
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
|
||||
Info.ptrVal =
|
||||
MFI->getGWSPSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo());
|
||||
|
||||
// This is an abstract access, but we need to specify a type and size.
|
||||
Info.memVT = MVT::i32;
|
||||
Info.size = 4;
|
||||
Info.align = 4;
|
||||
|
||||
Info.flags = MachineMemOperand::MOStore;
|
||||
if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
|
||||
Info.flags = MachineMemOperand::MOLoad;
|
||||
return true;
|
||||
}
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
|
|
@ -536,15 +536,19 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
|
|||
// Put score on the source vgprs. If this is a store, just use those
|
||||
// specific register(s).
|
||||
if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
|
||||
int AddrOpIdx =
|
||||
AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr);
|
||||
// All GDS operations must protect their address register (same as
|
||||
// export.)
|
||||
if (Inst.getOpcode() != AMDGPU::DS_APPEND &&
|
||||
Inst.getOpcode() != AMDGPU::DS_CONSUME) {
|
||||
setExpScore(
|
||||
&Inst, TII, TRI, MRI,
|
||||
AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr),
|
||||
CurrScore);
|
||||
if (AddrOpIdx != -1) {
|
||||
setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore);
|
||||
} else {
|
||||
assert(Inst.getOpcode() == AMDGPU::DS_APPEND ||
|
||||
Inst.getOpcode() == AMDGPU::DS_CONSUME ||
|
||||
Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
|
||||
Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER);
|
||||
}
|
||||
|
||||
if (Inst.mayStore()) {
|
||||
if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
|
||||
AMDGPU::OpName::data0) != -1) {
|
||||
|
@ -1407,18 +1411,6 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
|
|||
ScoreBrackets.dump();
|
||||
});
|
||||
|
||||
// Check to see if this is a GWS instruction. If so, and if this is CI or
|
||||
// VI, then the generated code sequence will include an S_WAITCNT 0.
|
||||
// TODO: Are these the only GWS instructions?
|
||||
if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
|
||||
Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V ||
|
||||
Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
|
||||
Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P ||
|
||||
Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
|
||||
// TODO: && context->target_info->GwsRequiresMemViolTest() ) {
|
||||
ScoreBrackets.applyWaitcnt(AMDGPU::Waitcnt::allZeroExceptVsCnt());
|
||||
}
|
||||
|
||||
// TODO: Remove this work-around after fixing the scheduler and enable the
|
||||
// assert above.
|
||||
if (VCCZBugWorkAround) {
|
||||
|
|
|
@ -2547,7 +2547,8 @@ bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
|
|||
// given the typical code patterns.
|
||||
if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
|
||||
Opcode == AMDGPU::EXP || Opcode == AMDGPU::EXP_DONE ||
|
||||
Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP)
|
||||
Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP ||
|
||||
Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER)
|
||||
return true;
|
||||
|
||||
if (MI.isCall() || MI.isInlineAsm())
|
||||
|
|
|
@ -43,7 +43,8 @@ class AMDGPUPseudoSourceValue : public PseudoSourceValue {
|
|||
public:
|
||||
enum AMDGPUPSVKind : unsigned {
|
||||
PSVBuffer = PseudoSourceValue::TargetCustom,
|
||||
PSVImage
|
||||
PSVImage,
|
||||
GWSResource
|
||||
};
|
||||
|
||||
protected:
|
||||
|
@ -87,6 +88,30 @@ public:
|
|||
}
|
||||
};
|
||||
|
||||
class AMDGPUGWSResourcePseudoSourceValue final : public AMDGPUPseudoSourceValue {
|
||||
public:
|
||||
explicit AMDGPUGWSResourcePseudoSourceValue(const TargetInstrInfo &TII)
|
||||
: AMDGPUPseudoSourceValue(GWSResource, TII) {}
|
||||
|
||||
static bool classof(const PseudoSourceValue *V) {
|
||||
return V->kind() == GWSResource;
|
||||
}
|
||||
|
||||
// These are inaccessible memory from IR.
|
||||
bool isAliased(const MachineFrameInfo *) const override {
|
||||
return false;
|
||||
}
|
||||
|
||||
// These are inaccessible memory from IR.
|
||||
bool mayAlias(const MachineFrameInfo *) const override {
|
||||
return false;
|
||||
}
|
||||
|
||||
void printCustom(raw_ostream &OS) const override {
|
||||
OS << "GWSResource";
|
||||
}
|
||||
};
|
||||
|
||||
namespace yaml {
|
||||
|
||||
struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
|
||||
|
@ -188,6 +213,7 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
|
|||
std::unique_ptr<const AMDGPUBufferPseudoSourceValue>> BufferPSVs;
|
||||
DenseMap<const Value *,
|
||||
std::unique_ptr<const AMDGPUImagePseudoSourceValue>> ImagePSVs;
|
||||
std::unique_ptr<const AMDGPUGWSResourcePseudoSourceValue> GWSResourcePSV;
|
||||
|
||||
private:
|
||||
unsigned LDSWaveSpillSize = 0;
|
||||
|
@ -674,6 +700,15 @@ public:
|
|||
return PSV.first->second.get();
|
||||
}
|
||||
|
||||
/// Returns the GWS resource pseudo source value held by this object,
/// creating it from \p TII on the first call. Later calls return the same
/// object and ignore \p TII.
const AMDGPUGWSResourcePseudoSourceValue *getGWSPSV(const SIInstrInfo &TII) {
  if (GWSResourcePSV)
    return GWSResourcePSV.get();

  GWSResourcePSV = llvm::make_unique<AMDGPUGWSResourcePseudoSourceValue>(TII);
  return GWSResourcePSV.get();
}
|
||||
|
||||
unsigned getOccupancy() const {
|
||||
return Occupancy;
|
||||
}
|
||||
|
|
|
@ -0,0 +1,103 @@
|
|||
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
|
||||
# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX9 %s
|
||||
# RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=VI %s
|
||||
# RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=CI %s
|
||||
# RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=SI %s
|
||||
|
||||
---
|
||||
name: m0_gws_init0
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
|
||||
bb.0:
|
||||
liveins: $vgpr0
|
||||
; GFX9-LABEL: name: m0_gws_init0
|
||||
; GFX9: liveins: $vgpr0
|
||||
; GFX9: $m0 = S_MOV_B32 -1
|
||||
; GFX9: S_NOP 0
|
||||
; GFX9: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
|
||||
; VI-LABEL: name: m0_gws_init0
|
||||
; VI: liveins: $vgpr0
|
||||
; VI: $m0 = S_MOV_B32 -1
|
||||
; VI: S_NOP 0
|
||||
; VI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
|
||||
; CI-LABEL: name: m0_gws_init0
|
||||
; CI: liveins: $vgpr0
|
||||
; CI: $m0 = S_MOV_B32 -1
|
||||
; CI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
|
||||
; SI-LABEL: name: m0_gws_init0
|
||||
; SI: liveins: $vgpr0
|
||||
; SI: $m0 = S_MOV_B32 -1
|
||||
; SI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
|
||||
$m0 = S_MOV_B32 -1
|
||||
DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
|
||||
|
||||
...
|
||||
|
||||
---
|
||||
name: m0_gws_init1
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
|
||||
bb.0:
|
||||
; GFX9-LABEL: name: m0_gws_init1
|
||||
; GFX9: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
|
||||
; GFX9: $m0 = S_MOV_B32 -1
|
||||
; GFX9: S_NOP 0
|
||||
; GFX9: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
|
||||
; VI-LABEL: name: m0_gws_init1
|
||||
; VI: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
|
||||
; VI: $m0 = S_MOV_B32 -1
|
||||
; VI: S_NOP 0
|
||||
; VI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
|
||||
; CI-LABEL: name: m0_gws_init1
|
||||
; CI: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
|
||||
; CI: $m0 = S_MOV_B32 -1
|
||||
; CI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
|
||||
; SI-LABEL: name: m0_gws_init1
|
||||
; SI: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
|
||||
; SI: $m0 = S_MOV_B32 -1
|
||||
; SI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
|
||||
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
|
||||
$m0 = S_MOV_B32 -1
|
||||
DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
|
||||
|
||||
...
|
||||
|
||||
# Test a typical situation where m0 needs to be set from a VGPR
|
||||
# through readfirstlane
|
||||
---
|
||||
name: m0_gws_readlane
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1
|
||||
|
||||
; GFX9-LABEL: name: m0_gws_readlane
|
||||
; GFX9: liveins: $vgpr0, $vgpr1
|
||||
; GFX9: $sgpr0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
|
||||
; GFX9: $m0 = S_MOV_B32 $sgpr0
|
||||
; GFX9: S_NOP 0
|
||||
; GFX9: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
|
||||
; VI-LABEL: name: m0_gws_readlane
|
||||
; VI: liveins: $vgpr0, $vgpr1
|
||||
; VI: $sgpr0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
|
||||
; VI: $m0 = S_MOV_B32 $sgpr0
|
||||
; VI: S_NOP 0
|
||||
; VI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
|
||||
; CI-LABEL: name: m0_gws_readlane
|
||||
; CI: liveins: $vgpr0, $vgpr1
|
||||
; CI: $sgpr0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
|
||||
; CI: $m0 = S_MOV_B32 $sgpr0
|
||||
; CI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
|
||||
; SI-LABEL: name: m0_gws_readlane
|
||||
; SI: liveins: $vgpr0, $vgpr1
|
||||
; SI: $sgpr0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
|
||||
; SI: $m0 = S_MOV_B32 $sgpr0
|
||||
; SI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
|
||||
$sgpr0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
|
||||
$m0 = S_MOV_B32 $sgpr0
|
||||
DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
|
||||
|
||||
...
|
|
@ -0,0 +1,59 @@
|
|||
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
|
||||
# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass si-insert-skips -amdgpu-skip-threshold=1 -verify-machineinstrs %s -o - | FileCheck %s
|
||||
# Make sure mandatory skips are inserted to ensure GWS ops aren't run with exec = 0
|
||||
|
||||
---
|
||||
|
||||
name: skip_gws_init
|
||||
body: |
|
||||
; CHECK-LABEL: name: skip_gws_init
|
||||
; CHECK: bb.0:
|
||||
; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
|
||||
; CHECK: SI_MASK_BRANCH %bb.2, implicit $exec
|
||||
; CHECK: S_CBRANCH_EXECZ %bb.2, implicit $exec
|
||||
; CHECK: bb.1:
|
||||
; CHECK: successors: %bb.2(0x80000000)
|
||||
; CHECK: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
|
||||
; CHECK: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
|
||||
; CHECK: bb.2:
|
||||
; CHECK: S_ENDPGM 0
|
||||
bb.0:
|
||||
successors: %bb.1, %bb.2
|
||||
SI_MASK_BRANCH %bb.2, implicit $exec
|
||||
|
||||
bb.1:
|
||||
successors: %bb.2
|
||||
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
|
||||
DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
|
||||
|
||||
bb.2:
|
||||
S_ENDPGM 0
|
||||
...
|
||||
|
||||
---
|
||||
|
||||
name: skip_gws_barrier
|
||||
body: |
|
||||
; CHECK-LABEL: name: skip_gws_barrier
|
||||
; CHECK: bb.0:
|
||||
; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
|
||||
; CHECK: SI_MASK_BRANCH %bb.2, implicit $exec
|
||||
; CHECK: S_CBRANCH_EXECZ %bb.2, implicit $exec
|
||||
; CHECK: bb.1:
|
||||
; CHECK: successors: %bb.2(0x80000000)
|
||||
; CHECK: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
|
||||
; CHECK: DS_GWS_BARRIER $vgpr0, 0, 1, implicit $m0, implicit $exec
|
||||
; CHECK: bb.2:
|
||||
; CHECK: S_ENDPGM 0
|
||||
bb.0:
|
||||
successors: %bb.1, %bb.2
|
||||
SI_MASK_BRANCH %bb.2, implicit $exec
|
||||
|
||||
bb.1:
|
||||
successors: %bb.2
|
||||
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
|
||||
DS_GWS_BARRIER $vgpr0, 0, 1, implicit $m0, implicit $exec
|
||||
|
||||
bb.2:
|
||||
S_ENDPGM 0
|
||||
...
|
|
@ -0,0 +1,179 @@
|
|||
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
|
||||
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -mattr=+flat-for-global -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIPLUS %s
|
||||
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIPLUS %s
|
||||
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIPLUS %s
|
||||
|
||||
; Minimum offset
|
||||
; GCN-LABEL: {{^}}gws_barrier_offset0:
|
||||
; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
|
||||
; GCN-DAG: s_mov_b32 m0, -1{{$}}
|
||||
; GCN: v_mov_b32_e32 v0, [[BAR_NUM]]
|
||||
; GCN: ds_gws_barrier v0 offset:1 gds{{$}}
|
||||
define amdgpu_kernel void @gws_barrier_offset0(i32 %val) #0 {
|
||||
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Maximum offset
|
||||
; GCN-LABEL: {{^}}gws_barrier_offset63:
|
||||
; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
|
||||
; GCN-DAG: s_mov_b32 m0, -1{{$}}
|
||||
; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
|
||||
; GCN: ds_gws_barrier v0 offset:64 gds{{$}}
|
||||
define amdgpu_kernel void @gws_barrier_offset63(i32 %val) #0 {
|
||||
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 63)
|
||||
ret void
|
||||
}
|
||||
|
||||
; FIXME: Should be able to shift directly into m0
|
||||
; GCN-LABEL: {{^}}gws_barrier_sgpr_offset:
|
||||
; GCN-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}}
|
||||
; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
|
||||
; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
|
||||
; GCN-DAG: v_mov_b32_e32 v0, s[[BAR_NUM]]
|
||||
; GCN: ds_gws_barrier v0 gds{{$}}
|
||||
define amdgpu_kernel void @gws_barrier_sgpr_offset(i32 %val, i32 %offset) #0 {
|
||||
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %offset)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Variable offset in SGPR with constant add
|
||||
; GCN-LABEL: {{^}}gws_barrier_sgpr_offset_add1:
|
||||
; GCN-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}}
|
||||
; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
|
||||
; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
|
||||
; GCN-DAG: v_mov_b32_e32 v0, s[[BAR_NUM]]
|
||||
; GCN: ds_gws_barrier v0 offset:1 gds{{$}}
|
||||
define amdgpu_kernel void @gws_barrier_sgpr_offset_add1(i32 %val, i32 %offset.base) #0 {
|
||||
%offset = add i32 %offset.base, 1
|
||||
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %offset)
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}gws_barrier_vgpr_offset:
|
||||
; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
|
||||
; GCN-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
|
||||
; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
|
||||
; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
|
||||
; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
|
||||
; GCN: ds_gws_barrier v0 gds{{$}}
|
||||
define amdgpu_kernel void @gws_barrier_vgpr_offset(i32 %val) #0 {
|
||||
%vgpr.offset = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %vgpr.offset)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Variable offset in VGPR with constant add
|
||||
; GCN-LABEL: {{^}}gws_barrier_vgpr_offset_add:
|
||||
; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
|
||||
; GCN-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
|
||||
; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
|
||||
; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
|
||||
; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
|
||||
; GCN: ds_gws_barrier v0 offset:3 gds{{$}}
|
||||
define amdgpu_kernel void @gws_barrier_vgpr_offset_add(i32 %val) #0 {
|
||||
%vgpr.offset.base = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%vgpr.offset = add i32 %vgpr.offset.base, 3
|
||||
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %vgpr.offset)
|
||||
ret void
|
||||
}
|
||||
|
||||
@lds = internal unnamed_addr addrspace(3) global i32 undef
|
||||
|
||||
; Check if m0 initialization is shared
|
||||
; GCN-LABEL: {{^}}gws_barrier_save_m0_barrier_constant_offset:
|
||||
; GCN: s_mov_b32 m0, -1
|
||||
; GCN-NOT: s_mov_b32 m0
|
||||
define amdgpu_kernel void @gws_barrier_save_m0_barrier_constant_offset(i32 %val) #0 {
|
||||
store i32 1, i32 addrspace(3)* @lds
|
||||
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 10)
|
||||
store i32 2, i32 addrspace(3)* @lds
|
||||
ret void
|
||||
}
|
||||
|
||||
; Make sure this increments lgkmcnt
|
||||
; GCN-LABEL: {{^}}gws_barrier_lgkmcnt:
|
||||
; GCN: ds_gws_barrier v0 offset:1 gds{{$}}
|
||||
; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: s_setpc_b64
|
||||
define void @gws_barrier_lgkmcnt(i32 %val) {
|
||||
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Does not imply memory fence on its own
|
||||
; GCN-LABEL: {{^}}gws_barrier_wait_before:
|
||||
; GCN: store_dword
|
||||
; CIPLUS-NOT: s_waitcnt
|
||||
; GCN: ds_gws_barrier v0 offset:8 gds
|
||||
define amdgpu_kernel void @gws_barrier_wait_before(i32 %val, i32 addrspace(1)* %ptr) #0 {
|
||||
store i32 0, i32 addrspace(1)* %ptr
|
||||
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}gws_barrier_wait_after:
|
||||
; GCN: ds_gws_barrier v0 offset:8 gds
|
||||
; GCN-NEXT: s_waitcnt expcnt(0){{$}}
|
||||
; GCN-NEXT: load_dword
|
||||
define amdgpu_kernel void @gws_barrier_wait_after(i32 %val, i32 addrspace(1)* %ptr) #0 {
|
||||
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
|
||||
%load = load volatile i32, i32 addrspace(1)* %ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
; Does not imply memory fence on its own
|
||||
; GCN-LABEL: {{^}}gws_barrier_fence_before:
|
||||
; GCN: store_dword
|
||||
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GCN: ds_gws_barrier v0 offset:8 gds
|
||||
define amdgpu_kernel void @gws_barrier_fence_before(i32 %val, i32 addrspace(1)* %ptr) #0 {
|
||||
store i32 0, i32 addrspace(1)* %ptr
|
||||
fence release
|
||||
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}gws_barrier_fence_after:
|
||||
; GCN: ds_gws_barrier v0 offset:8 gds
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: load_dword
|
||||
define amdgpu_kernel void @gws_barrier_fence_after(i32 %val, i32 addrspace(1)* %ptr) #0 {
|
||||
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
|
||||
fence release
|
||||
%load = load volatile i32, i32 addrspace(1)* %ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
; FIXME: Should a wait be inserted here, or is an explicit fence needed?
|
||||
; GCN-LABEL: {{^}}gws_init_barrier:
|
||||
; GCN: s_mov_b32 m0, -1
|
||||
; GCN: ds_gws_init v0 offset:8 gds
|
||||
; GCN-NEXT: ds_gws_barrier v0 offset:8 gds
|
||||
define amdgpu_kernel void @gws_init_barrier(i32 %val) #0 {
|
||||
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7)
|
||||
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
|
||||
ret void
|
||||
}
|
||||
|
||||
; FIXME: Why vmcnt, not expcnt?
|
||||
; GCN-LABEL: {{^}}gws_init_fence_barrier:
|
||||
; GCN: s_mov_b32 m0, -1
|
||||
; GCN: ds_gws_init v0 offset:8 gds
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: ds_gws_barrier v0 offset:8 gds
|
||||
define amdgpu_kernel void @gws_init_fence_barrier(i32 %val) #0 {
|
||||
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7)
|
||||
fence release
|
||||
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
|
||||
ret void
|
||||
}
|
||||
|
||||
declare void @llvm.amdgcn.ds.gws.barrier(i32, i32) #1
|
||||
declare void @llvm.amdgcn.ds.gws.init(i32, i32) #2
|
||||
declare i32 @llvm.amdgcn.workitem.id.x() #3
|
||||
|
||||
attributes #0 = { nounwind }
|
||||
attributes #1 = { convergent inaccessiblememonly nounwind }
|
||||
attributes #2 = { convergent inaccessiblememonly nounwind writeonly }
|
||||
attributes #3 = { nounwind readnone speculatable }
|
|
@ -0,0 +1,119 @@
|
|||
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
|
||||
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
|
||||
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
|
||||
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
|
||||
|
||||
; Minimum offset
|
||||
; GCN-LABEL: {{^}}gws_init_offset0:
|
||||
; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
|
||||
; GCN-DAG: s_mov_b32 m0, -1{{$}}
|
||||
; GCN: v_mov_b32_e32 v0, [[BAR_NUM]]
|
||||
; GCN: ds_gws_init v0 offset:1 gds{{$}}
|
||||
define amdgpu_kernel void @gws_init_offset0(i32 %val) #0 {
|
||||
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Maximum offset
|
||||
; GCN-LABEL: {{^}}gws_init_offset63:
|
||||
; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
|
||||
; GCN-DAG: s_mov_b32 m0, -1{{$}}
|
||||
; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
|
||||
; GCN: ds_gws_init v0 offset:64 gds{{$}}
|
||||
define amdgpu_kernel void @gws_init_offset63(i32 %val) #0 {
|
||||
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 63)
|
||||
ret void
|
||||
}
|
||||
|
||||
; FIXME: Should be able to shift directly into m0
|
||||
; GCN-LABEL: {{^}}gws_init_sgpr_offset:
|
||||
; GCN-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}}
|
||||
; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
|
||||
; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
|
||||
; GCN-DAG: v_mov_b32_e32 v0, s[[BAR_NUM]]
|
||||
; GCN: ds_gws_init v0 gds{{$}}
|
||||
define amdgpu_kernel void @gws_init_sgpr_offset(i32 %val, i32 %offset) #0 {
|
||||
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %offset)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Variable offset in SGPR with constant add
|
||||
; GCN-LABEL: {{^}}gws_init_sgpr_offset_add1:
|
||||
; GCN-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}}
|
||||
; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
|
||||
; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
|
||||
; GCN-DAG: v_mov_b32_e32 v0, s[[BAR_NUM]]
|
||||
; GCN: ds_gws_init v0 offset:1 gds{{$}}
|
||||
define amdgpu_kernel void @gws_init_sgpr_offset_add1(i32 %val, i32 %offset.base) #0 {
|
||||
%offset = add i32 %offset.base, 1
|
||||
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %offset)
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}gws_init_vgpr_offset:
|
||||
; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
|
||||
; GCN-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
|
||||
; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
|
||||
; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
|
||||
; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
|
||||
; GCN: ds_gws_init v0 gds{{$}}
|
||||
define amdgpu_kernel void @gws_init_vgpr_offset(i32 %val) #0 {
|
||||
%vgpr.offset = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %vgpr.offset)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Variable offset in VGPR with constant add
|
||||
; GCN-LABEL: {{^}}gws_init_vgpr_offset_add:
|
||||
; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
|
||||
; GCN-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
|
||||
; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
|
||||
; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
|
||||
; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
|
||||
; GCN: ds_gws_init v0 offset:3 gds{{$}}
|
||||
define amdgpu_kernel void @gws_init_vgpr_offset_add(i32 %val) #0 {
|
||||
%vgpr.offset.base = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%vgpr.offset = add i32 %vgpr.offset.base, 3
|
||||
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %vgpr.offset)
|
||||
ret void
|
||||
}
|
||||
|
||||
@lds = internal unnamed_addr addrspace(3) global i32 undef
|
||||
|
||||
; Check if m0 initialization is shared.
|
||||
; GCN-LABEL: {{^}}gws_init_save_m0_init_constant_offset:
|
||||
; GCN: s_mov_b32 m0, -1
|
||||
; GCN-NOT: s_mov_b32 m0
|
||||
define amdgpu_kernel void @gws_init_save_m0_init_constant_offset(i32 %val) #0 {
|
||||
store i32 1, i32 addrspace(3)* @lds
|
||||
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 10)
|
||||
store i32 2, i32 addrspace(3)* @lds
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}gws_init_lgkmcnt:
|
||||
; GCN: ds_gws_init v0 offset:1 gds{{$}}
|
||||
; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: s_setpc_b64
|
||||
define void @gws_init_lgkmcnt(i32 %val) {
|
||||
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Does not imply memory fence on its own
|
||||
; GCN-LABEL: {{^}}gws_init_wait_before:
|
||||
; GCN: store_dword
|
||||
; CIPLUS-NOT: s_waitcnt
|
||||
; GCN: ds_gws_init v0 offset:8 gds
|
||||
define amdgpu_kernel void @gws_init_wait_before(i32 %val, i32 addrspace(1)* %ptr) #0 {
|
||||
store i32 0, i32 addrspace(1)* %ptr
|
||||
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7)
|
||||
ret void
|
||||
}
|
||||
|
||||
declare void @llvm.amdgcn.ds.gws.init(i32, i32) #1
|
||||
declare i32 @llvm.amdgcn.workitem.id.x() #2
|
||||
|
||||
attributes #0 = { nounwind }
|
||||
attributes #1 = { convergent inaccessiblememonly nounwind writeonly }
|
||||
attributes #2 = { nounwind readnone speculatable }
|
|
@ -6,6 +6,8 @@
|
|||
declare void @nonconvergent_func() #0
|
||||
declare void @convergent_func() #1
|
||||
declare void @llvm.amdgcn.s.barrier() #1
|
||||
declare void @llvm.amdgcn.ds.gws.init(i32, i32) #2
|
||||
declare void @llvm.amdgcn.ds.gws.barrier(i32, i32) #2
|
||||
|
||||
; barrier shouldn't be duplicated.
|
||||
|
||||
|
@ -100,6 +102,52 @@ call:
|
|||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}taildup_gws_init:
|
||||
; GCN: ds_gws_init
|
||||
; GCN-NOT: ds_gws_init
|
||||
define amdgpu_kernel void @taildup_gws_init(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i1 %cond, i32 %val, i32 %offset) #0 {
|
||||
entry:
|
||||
br i1 %cond, label %bb1, label %bb2
|
||||
|
||||
bb1:
|
||||
store i32 0, i32 addrspace(1)* %a
|
||||
br label %call
|
||||
|
||||
bb2:
|
||||
store i32 1, i32 addrspace(1)* %a
|
||||
br label %call
|
||||
|
||||
call:
|
||||
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %offset)
|
||||
br label %ret
|
||||
|
||||
ret:
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}taildup_gws_barrier:
|
||||
; GCN: ds_gws_barrier
|
||||
; GCN-NOT: ds_gws_barrier
|
||||
define amdgpu_kernel void @taildup_gws_barrier(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i1 %cond, i32 %val, i32 %offset) #0 {
|
||||
entry:
|
||||
br i1 %cond, label %bb1, label %bb2
|
||||
|
||||
bb1:
|
||||
store i32 0, i32 addrspace(1)* %a
|
||||
br label %call
|
||||
|
||||
bb2:
|
||||
store i32 1, i32 addrspace(1)* %a
|
||||
br label %call
|
||||
|
||||
call:
|
||||
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %offset)
|
||||
br label %ret
|
||||
|
||||
ret:
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { nounwind }
|
||||
attributes #1 = { nounwind convergent }
|
||||
attributes #2 = { convergent inaccessiblememonly nounwind }
|
||||
|
|
Loading…
Reference in New Issue