forked from OSchip/llvm-project
Revert rL363678 : AMDGPU: Add ds_gws_init / ds_gws_barrier intrinsics
There may or may not be additional work to handle this correctly on SI/CI. ........ Breaks EXPENSIVE_CHECKS buildbots - http://lab.llvm.org:8011/builders/llvm-clang-x86_64-expensive-checks-win/builds/78/ llvm-svn: 363797
This commit is contained in:
parent
e3cd19d330
commit
128ce93c60
|
@ -1348,28 +1348,6 @@ def int_amdgcn_alignbyte : Intrinsic<[llvm_i32_ty],
|
||||||
[IntrNoMem, IntrSpeculatable]
|
[IntrNoMem, IntrSpeculatable]
|
||||||
>;
|
>;
|
||||||
|
|
||||||
// llvm.amdgcn.ds.gws.init(i32 bar_val, i32 resource_id)
|
|
||||||
//
|
|
||||||
// bar_val is the total number of waves that will wait on this
|
|
||||||
// barrier, minus 1.
|
|
||||||
def int_amdgcn_ds_gws_init :
|
|
||||||
GCCBuiltin<"__builtin_amdgcn_ds_gws_init">,
|
|
||||||
Intrinsic<[],
|
|
||||||
[llvm_i32_ty, llvm_i32_ty],
|
|
||||||
[IntrConvergent, IntrWriteMem, IntrInaccessibleMemOnly], "",
|
|
||||||
[SDNPMemOperand]
|
|
||||||
>;
|
|
||||||
|
|
||||||
// llvm.amdgcn.ds.gws.barrier(i32 vsrc0, i32 resource_id)
|
|
||||||
// bar_val is the total number of waves that will wait on this
|
|
||||||
// barrier, minus 1.
|
|
||||||
def int_amdgcn_ds_gws_barrier :
|
|
||||||
GCCBuiltin<"__builtin_amdgcn_ds_gws_barrier">,
|
|
||||||
Intrinsic<[],
|
|
||||||
[llvm_i32_ty, llvm_i32_ty],
|
|
||||||
[IntrConvergent, IntrInaccessibleMemOnly], "",
|
|
||||||
[SDNPMemOperand]
|
|
||||||
>;
|
|
||||||
|
|
||||||
// Copies the source value to the destination value, with the guarantee that
|
// Copies the source value to the destination value, with the guarantee that
|
||||||
// the source value is computed as if the entire program were executed in WQM.
|
// the source value is computed as if the entire program were executed in WQM.
|
||||||
|
|
|
@ -218,9 +218,7 @@ private:
|
||||||
void SelectFMAD_FMA(SDNode *N);
|
void SelectFMAD_FMA(SDNode *N);
|
||||||
void SelectATOMIC_CMP_SWAP(SDNode *N);
|
void SelectATOMIC_CMP_SWAP(SDNode *N);
|
||||||
void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
|
void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
|
||||||
void SelectDS_GWS(SDNode *N, unsigned IntrID);
|
|
||||||
void SelectINTRINSIC_W_CHAIN(SDNode *N);
|
void SelectINTRINSIC_W_CHAIN(SDNode *N);
|
||||||
void SelectINTRINSIC_VOID(SDNode *N);
|
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
// Include the pieces autogenerated from the target description.
|
// Include the pieces autogenerated from the target description.
|
||||||
|
@ -834,10 +832,6 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
|
||||||
SelectINTRINSIC_W_CHAIN(N);
|
SelectINTRINSIC_W_CHAIN(N);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
case ISD::INTRINSIC_VOID: {
|
|
||||||
SelectINTRINSIC_VOID(N);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
SelectCode(N);
|
SelectCode(N);
|
||||||
|
@ -2040,73 +2034,6 @@ void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
|
||||||
CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
|
CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
|
||||||
}
|
}
|
||||||
|
|
||||||
void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
|
|
||||||
SDLoc SL(N);
|
|
||||||
SDValue VSrc0 = N->getOperand(2);
|
|
||||||
SDValue BaseOffset = N->getOperand(3);
|
|
||||||
int ImmOffset = 0;
|
|
||||||
SDNode *CopyToM0;
|
|
||||||
MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
|
|
||||||
MachineMemOperand *MMO = M->getMemOperand();
|
|
||||||
|
|
||||||
// Don't worry if the offset ends up in a VGPR. Only one lane will have
|
|
||||||
// effect, so SIFixSGPRCopies will validly insert readfirstlane.
|
|
||||||
|
|
||||||
// The resource id offset is computed as (<isa opaque base> + M0[21:16] +
|
|
||||||
// offset field) % 64. Some versions of the programming guide omit the m0
|
|
||||||
// part, or claim it's from offset 0.
|
|
||||||
if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
|
|
||||||
// If we have a constant offset, try to use the default value for m0 as a
|
|
||||||
// base to possibly avoid setting it up.
|
|
||||||
CopyToM0 = glueCopyToM0(N, CurDAG->getTargetConstant(-1, SL, MVT::i32));
|
|
||||||
ImmOffset = ConstOffset->getZExtValue() + 1;
|
|
||||||
} else {
|
|
||||||
if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
|
|
||||||
ImmOffset = BaseOffset.getConstantOperandVal(1);
|
|
||||||
BaseOffset = BaseOffset.getOperand(0);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Prefer to do the shift in an SGPR since it should be possible to use m0
|
|
||||||
// as the result directly. If it's already an SGPR, it will be eliminated
|
|
||||||
// later.
|
|
||||||
SDNode *SGPROffset
|
|
||||||
= CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
|
|
||||||
BaseOffset);
|
|
||||||
// Shift to offset in m0
|
|
||||||
SDNode *M0Base
|
|
||||||
= CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
|
|
||||||
SDValue(SGPROffset, 0),
|
|
||||||
CurDAG->getTargetConstant(16, SL, MVT::i32));
|
|
||||||
CopyToM0 = glueCopyToM0(N, SDValue(M0Base, 0));
|
|
||||||
}
|
|
||||||
|
|
||||||
// The manual doesn't mention this, but it seems only v0 works.
|
|
||||||
SDValue V0 = CurDAG->getRegister(AMDGPU::VGPR0, MVT::i32);
|
|
||||||
|
|
||||||
SDValue CopyToV0 = CurDAG->getCopyToReg(
|
|
||||||
SDValue(CopyToM0, 0), SL, V0, VSrc0,
|
|
||||||
N->getOperand(N->getNumOperands() - 1));
|
|
||||||
|
|
||||||
SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
|
|
||||||
|
|
||||||
// TODO: Can this just be removed from the instruction?
|
|
||||||
SDValue GDS = CurDAG->getTargetConstant(1, SL, MVT::i1);
|
|
||||||
|
|
||||||
unsigned Opc = IntrID == Intrinsic::amdgcn_ds_gws_init ?
|
|
||||||
AMDGPU::DS_GWS_INIT : AMDGPU::DS_GWS_BARRIER;
|
|
||||||
|
|
||||||
SDValue Ops[] = {
|
|
||||||
V0,
|
|
||||||
OffsetField,
|
|
||||||
GDS,
|
|
||||||
CopyToV0, // Chain
|
|
||||||
CopyToV0.getValue(1) // Glue
|
|
||||||
};
|
|
||||||
|
|
||||||
SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
|
|
||||||
CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
|
|
||||||
}
|
|
||||||
|
|
||||||
void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
|
void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
|
||||||
unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
|
unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
|
||||||
switch (IntrID) {
|
switch (IntrID) {
|
||||||
|
@ -2117,18 +2044,6 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
|
||||||
SelectDSAppendConsume(N, IntrID);
|
SelectDSAppendConsume(N, IntrID);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
SelectCode(N);
|
|
||||||
}
|
|
||||||
|
|
||||||
void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
|
|
||||||
unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
|
|
||||||
switch (IntrID) {
|
|
||||||
case Intrinsic::amdgcn_ds_gws_init:
|
|
||||||
case Intrinsic::amdgcn_ds_gws_barrier:
|
|
||||||
SelectDS_GWS(N, IntrID);
|
|
||||||
return;
|
|
||||||
default:
|
default:
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
|
@ -467,15 +467,11 @@ defm DS_WRXCHG_RTN_B64 : DS_1A1D_RET_mc<"ds_wrxchg_rtn_b64", VReg_64>;
|
||||||
defm DS_WRXCHG2_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2_rtn_b64", VReg_128, VReg_64>;
|
defm DS_WRXCHG2_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2_rtn_b64", VReg_128, VReg_64>;
|
||||||
defm DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2st64_rtn_b64", VReg_128, VReg_64>;
|
defm DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2st64_rtn_b64", VReg_128, VReg_64>;
|
||||||
|
|
||||||
let isConvergent = 1 in {
|
def DS_GWS_INIT : DS_GWS_1D<"ds_gws_init">;
|
||||||
def DS_GWS_INIT : DS_GWS_1D<"ds_gws_init"> {
|
|
||||||
let mayLoad = 0;
|
|
||||||
}
|
|
||||||
def DS_GWS_SEMA_V : DS_GWS_0D<"ds_gws_sema_v">;
|
def DS_GWS_SEMA_V : DS_GWS_0D<"ds_gws_sema_v">;
|
||||||
def DS_GWS_SEMA_BR : DS_GWS_1D<"ds_gws_sema_br">;
|
def DS_GWS_SEMA_BR : DS_GWS_1D<"ds_gws_sema_br">;
|
||||||
def DS_GWS_SEMA_P : DS_GWS_0D<"ds_gws_sema_p">;
|
def DS_GWS_SEMA_P : DS_GWS_0D<"ds_gws_sema_p">;
|
||||||
def DS_GWS_BARRIER : DS_GWS_1D<"ds_gws_barrier">;
|
def DS_GWS_BARRIER : DS_GWS_1D<"ds_gws_barrier">;
|
||||||
}
|
|
||||||
|
|
||||||
def DS_ADD_SRC2_U32 : DS_1A<"ds_add_src2_u32">;
|
def DS_ADD_SRC2_U32 : DS_1A<"ds_add_src2_u32">;
|
||||||
def DS_SUB_SRC2_U32 : DS_1A<"ds_sub_src2_u32">;
|
def DS_SUB_SRC2_U32 : DS_1A<"ds_sub_src2_u32">;
|
||||||
|
|
|
@ -961,24 +961,6 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
case Intrinsic::amdgcn_ds_gws_init:
|
|
||||||
case Intrinsic::amdgcn_ds_gws_barrier: {
|
|
||||||
Info.opc = ISD::INTRINSIC_VOID;
|
|
||||||
|
|
||||||
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
|
|
||||||
Info.ptrVal =
|
|
||||||
MFI->getGWSPSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo());
|
|
||||||
|
|
||||||
// This is an abstract access, but we need to specify a type and size.
|
|
||||||
Info.memVT = MVT::i32;
|
|
||||||
Info.size = 4;
|
|
||||||
Info.align = 4;
|
|
||||||
|
|
||||||
Info.flags = MachineMemOperand::MOStore;
|
|
||||||
if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
|
|
||||||
Info.flags = MachineMemOperand::MOLoad;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
default:
|
default:
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
|
@ -536,19 +536,15 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
|
||||||
// Put score on the source vgprs. If this is a store, just use those
|
// Put score on the source vgprs. If this is a store, just use those
|
||||||
// specific register(s).
|
// specific register(s).
|
||||||
if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
|
if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
|
||||||
int AddrOpIdx =
|
|
||||||
AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr);
|
|
||||||
// All GDS operations must protect their address register (same as
|
// All GDS operations must protect their address register (same as
|
||||||
// export.)
|
// export.)
|
||||||
if (AddrOpIdx != -1) {
|
if (Inst.getOpcode() != AMDGPU::DS_APPEND &&
|
||||||
setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore);
|
Inst.getOpcode() != AMDGPU::DS_CONSUME) {
|
||||||
} else {
|
setExpScore(
|
||||||
assert(Inst.getOpcode() == AMDGPU::DS_APPEND ||
|
&Inst, TII, TRI, MRI,
|
||||||
Inst.getOpcode() == AMDGPU::DS_CONSUME ||
|
AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr),
|
||||||
Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
|
CurrScore);
|
||||||
Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (Inst.mayStore()) {
|
if (Inst.mayStore()) {
|
||||||
if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
|
if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
|
||||||
AMDGPU::OpName::data0) != -1) {
|
AMDGPU::OpName::data0) != -1) {
|
||||||
|
@ -1411,6 +1407,18 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
|
||||||
ScoreBrackets.dump();
|
ScoreBrackets.dump();
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Check to see if this is a GWS instruction. If so, and if this is CI or
|
||||||
|
// VI, then the generated code sequence will include an S_WAITCNT 0.
|
||||||
|
// TODO: Are these the only GWS instructions?
|
||||||
|
if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
|
||||||
|
Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V ||
|
||||||
|
Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
|
||||||
|
Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P ||
|
||||||
|
Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
|
||||||
|
// TODO: && context->target_info->GwsRequiresMemViolTest() ) {
|
||||||
|
ScoreBrackets.applyWaitcnt(AMDGPU::Waitcnt::allZeroExceptVsCnt());
|
||||||
|
}
|
||||||
|
|
||||||
// TODO: Remove this work-around after fixing the scheduler and enable the
|
// TODO: Remove this work-around after fixing the scheduler and enable the
|
||||||
// assert above.
|
// assert above.
|
||||||
if (VCCZBugWorkAround) {
|
if (VCCZBugWorkAround) {
|
||||||
|
|
|
@ -2547,8 +2547,7 @@ bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
|
||||||
// given the typical code patterns.
|
// given the typical code patterns.
|
||||||
if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
|
if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
|
||||||
Opcode == AMDGPU::EXP || Opcode == AMDGPU::EXP_DONE ||
|
Opcode == AMDGPU::EXP || Opcode == AMDGPU::EXP_DONE ||
|
||||||
Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP ||
|
Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP)
|
||||||
Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER)
|
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
if (MI.isCall() || MI.isInlineAsm())
|
if (MI.isCall() || MI.isInlineAsm())
|
||||||
|
|
|
@ -43,8 +43,7 @@ class AMDGPUPseudoSourceValue : public PseudoSourceValue {
|
||||||
public:
|
public:
|
||||||
enum AMDGPUPSVKind : unsigned {
|
enum AMDGPUPSVKind : unsigned {
|
||||||
PSVBuffer = PseudoSourceValue::TargetCustom,
|
PSVBuffer = PseudoSourceValue::TargetCustom,
|
||||||
PSVImage,
|
PSVImage
|
||||||
GWSResource
|
|
||||||
};
|
};
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
|
@ -88,30 +87,6 @@ public:
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
class AMDGPUGWSResourcePseudoSourceValue final : public AMDGPUPseudoSourceValue {
|
|
||||||
public:
|
|
||||||
explicit AMDGPUGWSResourcePseudoSourceValue(const TargetInstrInfo &TII)
|
|
||||||
: AMDGPUPseudoSourceValue(GWSResource, TII) {}
|
|
||||||
|
|
||||||
static bool classof(const PseudoSourceValue *V) {
|
|
||||||
return V->kind() == GWSResource;
|
|
||||||
}
|
|
||||||
|
|
||||||
// These are inaccessible memory from IR.
|
|
||||||
bool isAliased(const MachineFrameInfo *) const override {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// These are inaccessible memory from IR.
|
|
||||||
bool mayAlias(const MachineFrameInfo *) const override {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
void printCustom(raw_ostream &OS) const override {
|
|
||||||
OS << "GWSResource";
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
namespace yaml {
|
namespace yaml {
|
||||||
|
|
||||||
struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
|
struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
|
||||||
|
@ -213,7 +188,6 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
|
||||||
std::unique_ptr<const AMDGPUBufferPseudoSourceValue>> BufferPSVs;
|
std::unique_ptr<const AMDGPUBufferPseudoSourceValue>> BufferPSVs;
|
||||||
DenseMap<const Value *,
|
DenseMap<const Value *,
|
||||||
std::unique_ptr<const AMDGPUImagePseudoSourceValue>> ImagePSVs;
|
std::unique_ptr<const AMDGPUImagePseudoSourceValue>> ImagePSVs;
|
||||||
std::unique_ptr<const AMDGPUGWSResourcePseudoSourceValue> GWSResourcePSV;
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
unsigned LDSWaveSpillSize = 0;
|
unsigned LDSWaveSpillSize = 0;
|
||||||
|
@ -700,15 +674,6 @@ public:
|
||||||
return PSV.first->second.get();
|
return PSV.first->second.get();
|
||||||
}
|
}
|
||||||
|
|
||||||
const AMDGPUGWSResourcePseudoSourceValue *getGWSPSV(const SIInstrInfo &TII) {
|
|
||||||
if (!GWSResourcePSV) {
|
|
||||||
GWSResourcePSV =
|
|
||||||
llvm::make_unique<AMDGPUGWSResourcePseudoSourceValue>(TII);
|
|
||||||
}
|
|
||||||
|
|
||||||
return GWSResourcePSV.get();
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned getOccupancy() const {
|
unsigned getOccupancy() const {
|
||||||
return Occupancy;
|
return Occupancy;
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,103 +0,0 @@
|
||||||
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
|
|
||||||
# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX9 %s
|
|
||||||
# RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=VI %s
|
|
||||||
# RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=CI %s
|
|
||||||
# RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=SI %s
|
|
||||||
|
|
||||||
---
|
|
||||||
name: m0_gws_init0
|
|
||||||
tracksRegLiveness: true
|
|
||||||
body: |
|
|
||||||
|
|
||||||
bb.0:
|
|
||||||
liveins: $vgpr0
|
|
||||||
; GFX9-LABEL: name: m0_gws_init0
|
|
||||||
; GFX9: liveins: $vgpr0
|
|
||||||
; GFX9: $m0 = S_MOV_B32 -1
|
|
||||||
; GFX9: S_NOP 0
|
|
||||||
; GFX9: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
|
|
||||||
; VI-LABEL: name: m0_gws_init0
|
|
||||||
; VI: liveins: $vgpr0
|
|
||||||
; VI: $m0 = S_MOV_B32 -1
|
|
||||||
; VI: S_NOP 0
|
|
||||||
; VI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
|
|
||||||
; CI-LABEL: name: m0_gws_init0
|
|
||||||
; CI: liveins: $vgpr0
|
|
||||||
; CI: $m0 = S_MOV_B32 -1
|
|
||||||
; CI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
|
|
||||||
; SI-LABEL: name: m0_gws_init0
|
|
||||||
; SI: liveins: $vgpr0
|
|
||||||
; SI: $m0 = S_MOV_B32 -1
|
|
||||||
; SI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
|
|
||||||
$m0 = S_MOV_B32 -1
|
|
||||||
DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
|
|
||||||
|
|
||||||
...
|
|
||||||
|
|
||||||
---
|
|
||||||
name: m0_gws_init1
|
|
||||||
tracksRegLiveness: true
|
|
||||||
body: |
|
|
||||||
|
|
||||||
bb.0:
|
|
||||||
; GFX9-LABEL: name: m0_gws_init1
|
|
||||||
; GFX9: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
|
|
||||||
; GFX9: $m0 = S_MOV_B32 -1
|
|
||||||
; GFX9: S_NOP 0
|
|
||||||
; GFX9: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
|
|
||||||
; VI-LABEL: name: m0_gws_init1
|
|
||||||
; VI: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
|
|
||||||
; VI: $m0 = S_MOV_B32 -1
|
|
||||||
; VI: S_NOP 0
|
|
||||||
; VI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
|
|
||||||
; CI-LABEL: name: m0_gws_init1
|
|
||||||
; CI: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
|
|
||||||
; CI: $m0 = S_MOV_B32 -1
|
|
||||||
; CI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
|
|
||||||
; SI-LABEL: name: m0_gws_init1
|
|
||||||
; SI: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
|
|
||||||
; SI: $m0 = S_MOV_B32 -1
|
|
||||||
; SI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
|
|
||||||
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
|
|
||||||
$m0 = S_MOV_B32 -1
|
|
||||||
DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
|
|
||||||
|
|
||||||
...
|
|
||||||
|
|
||||||
# Test a typical situation where m0 needs to be set from a VGPR
|
|
||||||
# through readfirstlane
|
|
||||||
---
|
|
||||||
name: m0_gws_readlane
|
|
||||||
tracksRegLiveness: true
|
|
||||||
body: |
|
|
||||||
|
|
||||||
bb.0:
|
|
||||||
liveins: $vgpr0, $vgpr1
|
|
||||||
|
|
||||||
; GFX9-LABEL: name: m0_gws_readlane
|
|
||||||
; GFX9: liveins: $vgpr0, $vgpr1
|
|
||||||
; GFX9: $sgpr0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
|
|
||||||
; GFX9: $m0 = S_MOV_B32 $sgpr0
|
|
||||||
; GFX9: S_NOP 0
|
|
||||||
; GFX9: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
|
|
||||||
; VI-LABEL: name: m0_gws_readlane
|
|
||||||
; VI: liveins: $vgpr0, $vgpr1
|
|
||||||
; VI: $sgpr0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
|
|
||||||
; VI: $m0 = S_MOV_B32 $sgpr0
|
|
||||||
; VI: S_NOP 0
|
|
||||||
; VI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
|
|
||||||
; CI-LABEL: name: m0_gws_readlane
|
|
||||||
; CI: liveins: $vgpr0, $vgpr1
|
|
||||||
; CI: $sgpr0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
|
|
||||||
; CI: $m0 = S_MOV_B32 $sgpr0
|
|
||||||
; CI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
|
|
||||||
; SI-LABEL: name: m0_gws_readlane
|
|
||||||
; SI: liveins: $vgpr0, $vgpr1
|
|
||||||
; SI: $sgpr0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
|
|
||||||
; SI: $m0 = S_MOV_B32 $sgpr0
|
|
||||||
; SI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
|
|
||||||
$sgpr0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
|
|
||||||
$m0 = S_MOV_B32 $sgpr0
|
|
||||||
DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
|
|
||||||
|
|
||||||
...
|
|
|
@ -1,59 +0,0 @@
|
||||||
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
|
|
||||||
# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass si-insert-skips -amdgpu-skip-threshold=1 -verify-machineinstrs %s -o - | FileCheck %s
|
|
||||||
# Make sure mandatory skips are inserted to ensure GWS ops aren't run with exec = 0
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
name: skip_gws_init
|
|
||||||
body: |
|
|
||||||
; CHECK-LABEL: name: skip_gws_init
|
|
||||||
; CHECK: bb.0:
|
|
||||||
; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
|
|
||||||
; CHECK: SI_MASK_BRANCH %bb.2, implicit $exec
|
|
||||||
; CHECK: S_CBRANCH_EXECZ %bb.2, implicit $exec
|
|
||||||
; CHECK: bb.1:
|
|
||||||
; CHECK: successors: %bb.2(0x80000000)
|
|
||||||
; CHECK: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
|
|
||||||
; CHECK: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
|
|
||||||
; CHECK: bb.2:
|
|
||||||
; CHECK: S_ENDPGM 0
|
|
||||||
bb.0:
|
|
||||||
successors: %bb.1, %bb.2
|
|
||||||
SI_MASK_BRANCH %bb.2, implicit $exec
|
|
||||||
|
|
||||||
bb.1:
|
|
||||||
successors: %bb.2
|
|
||||||
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
|
|
||||||
DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
|
|
||||||
|
|
||||||
bb.2:
|
|
||||||
S_ENDPGM 0
|
|
||||||
...
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
name: skip_gws_barrier
|
|
||||||
body: |
|
|
||||||
; CHECK-LABEL: name: skip_gws_barrier
|
|
||||||
; CHECK: bb.0:
|
|
||||||
; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
|
|
||||||
; CHECK: SI_MASK_BRANCH %bb.2, implicit $exec
|
|
||||||
; CHECK: S_CBRANCH_EXECZ %bb.2, implicit $exec
|
|
||||||
; CHECK: bb.1:
|
|
||||||
; CHECK: successors: %bb.2(0x80000000)
|
|
||||||
; CHECK: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
|
|
||||||
; CHECK: DS_GWS_BARRIER $vgpr0, 0, 1, implicit $m0, implicit $exec
|
|
||||||
; CHECK: bb.2:
|
|
||||||
; CHECK: S_ENDPGM 0
|
|
||||||
bb.0:
|
|
||||||
successors: %bb.1, %bb.2
|
|
||||||
SI_MASK_BRANCH %bb.2, implicit $exec
|
|
||||||
|
|
||||||
bb.1:
|
|
||||||
successors: %bb.2
|
|
||||||
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
|
|
||||||
DS_GWS_BARRIER $vgpr0, 0, 1, implicit $m0, implicit $exec
|
|
||||||
|
|
||||||
bb.2:
|
|
||||||
S_ENDPGM 0
|
|
||||||
...
|
|
|
@ -1,179 +0,0 @@
|
||||||
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
|
|
||||||
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -mattr=+flat-for-global -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIPLUS %s
|
|
||||||
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIPLUS %s
|
|
||||||
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIPLUS %s
|
|
||||||
|
|
||||||
; Minimum offset
|
|
||||||
; GCN-LABEL: {{^}}gws_barrier_offset0:
|
|
||||||
; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
|
|
||||||
; GCN-DAG: s_mov_b32 m0, -1{{$}}
|
|
||||||
; GCN: v_mov_b32_e32 v0, [[BAR_NUM]]
|
|
||||||
; GCN: ds_gws_barrier v0 offset:1 gds{{$}}
|
|
||||||
define amdgpu_kernel void @gws_barrier_offset0(i32 %val) #0 {
|
|
||||||
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 0)
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
; Maximum offset
|
|
||||||
; GCN-LABEL: {{^}}gws_barrier_offset63:
|
|
||||||
; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
|
|
||||||
; GCN-DAG: s_mov_b32 m0, -1{{$}}
|
|
||||||
; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
|
|
||||||
; GCN: ds_gws_barrier v0 offset:64 gds{{$}}
|
|
||||||
define amdgpu_kernel void @gws_barrier_offset63(i32 %val) #0 {
|
|
||||||
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 63)
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
; FIXME: Should be able to shift directly into m0
|
|
||||||
; GCN-LABEL: {{^}}gws_barrier_sgpr_offset:
|
|
||||||
; GCN-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}}
|
|
||||||
; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
|
|
||||||
; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
|
|
||||||
; GCN-DAG: v_mov_b32_e32 v0, s[[BAR_NUM]]
|
|
||||||
; GCN: ds_gws_barrier v0 gds{{$}}
|
|
||||||
define amdgpu_kernel void @gws_barrier_sgpr_offset(i32 %val, i32 %offset) #0 {
|
|
||||||
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %offset)
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
; Variable offset in SGPR with constant add
|
|
||||||
; GCN-LABEL: {{^}}gws_barrier_sgpr_offset_add1:
|
|
||||||
; GCN-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}}
|
|
||||||
; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
|
|
||||||
; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
|
|
||||||
; GCN-DAG: v_mov_b32_e32 v0, s[[BAR_NUM]]
|
|
||||||
; GCN: ds_gws_barrier v0 offset:1 gds{{$}}
|
|
||||||
define amdgpu_kernel void @gws_barrier_sgpr_offset_add1(i32 %val, i32 %offset.base) #0 {
|
|
||||||
%offset = add i32 %offset.base, 1
|
|
||||||
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %offset)
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}gws_barrier_vgpr_offset:
|
|
||||||
; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
|
|
||||||
; GCN-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
|
|
||||||
; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
|
|
||||||
; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
|
|
||||||
; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
|
|
||||||
; GCN: ds_gws_barrier v0 gds{{$}}
|
|
||||||
define amdgpu_kernel void @gws_barrier_vgpr_offset(i32 %val) #0 {
|
|
||||||
%vgpr.offset = call i32 @llvm.amdgcn.workitem.id.x()
|
|
||||||
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %vgpr.offset)
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
; Variable offset in VGPR with constant add
|
|
||||||
; GCN-LABEL: {{^}}gws_barrier_vgpr_offset_add:
|
|
||||||
; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
|
|
||||||
; GCN-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
|
|
||||||
; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
|
|
||||||
; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
|
|
||||||
; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
|
|
||||||
; GCN: ds_gws_barrier v0 offset:3 gds{{$}}
|
|
||||||
define amdgpu_kernel void @gws_barrier_vgpr_offset_add(i32 %val) #0 {
|
|
||||||
%vgpr.offset.base = call i32 @llvm.amdgcn.workitem.id.x()
|
|
||||||
%vgpr.offset = add i32 %vgpr.offset.base, 3
|
|
||||||
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %vgpr.offset)
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
@lds = internal unnamed_addr addrspace(3) global i32 undef
|
|
||||||
|
|
||||||
; Check if m0 initialization is shared
|
|
||||||
; GCN-LABEL: {{^}}gws_barrier_save_m0_barrier_constant_offset:
|
|
||||||
; GCN: s_mov_b32 m0, -1
|
|
||||||
; GCN-NOT: s_mov_b32 m0
|
|
||||||
define amdgpu_kernel void @gws_barrier_save_m0_barrier_constant_offset(i32 %val) #0 {
|
|
||||||
store i32 1, i32 addrspace(3)* @lds
|
|
||||||
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 10)
|
|
||||||
store i32 2, i32 addrspace(3)* @lds
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
; Make sure this increments lgkmcnt
|
|
||||||
; GCN-LABEL: {{^}}gws_barrier_lgkmcnt:
|
|
||||||
; GCN: ds_gws_barrier v0 offset:1 gds{{$}}
|
|
||||||
; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0)
|
|
||||||
; GCN-NEXT: s_setpc_b64
|
|
||||||
define void @gws_barrier_lgkmcnt(i32 %val) {
|
|
||||||
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 0)
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
; Does not imply memory fence on its own
|
|
||||||
; GCN-LABEL: {{^}}gws_barrier_wait_before:
|
|
||||||
; GCN: store_dword
|
|
||||||
; CIPLUS-NOT: s_waitcnt
|
|
||||||
; GCN: ds_gws_barrier v0 offset:8 gds
|
|
||||||
define amdgpu_kernel void @gws_barrier_wait_before(i32 %val, i32 addrspace(1)* %ptr) #0 {
|
|
||||||
store i32 0, i32 addrspace(1)* %ptr
|
|
||||||
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}gws_barrier_wait_after:
|
|
||||||
; GCN: ds_gws_barrier v0 offset:8 gds
|
|
||||||
; GCN-NEXT: s_waitcnt expcnt(0){{$}}
|
|
||||||
; GCN-NEXT: load_dword
|
|
||||||
define amdgpu_kernel void @gws_barrier_wait_after(i32 %val, i32 addrspace(1)* %ptr) #0 {
|
|
||||||
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
|
|
||||||
%load = load volatile i32, i32 addrspace(1)* %ptr
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
; Does not imply memory fence on its own
|
|
||||||
; GCN-LABEL: {{^}}gws_barrier_fence_before:
|
|
||||||
; GCN: store_dword
|
|
||||||
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
||||||
; GCN: ds_gws_barrier v0 offset:8 gds
|
|
||||||
define amdgpu_kernel void @gws_barrier_fence_before(i32 %val, i32 addrspace(1)* %ptr) #0 {
|
|
||||||
store i32 0, i32 addrspace(1)* %ptr
|
|
||||||
fence release
|
|
||||||
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}gws_barrier_fence_after:
|
|
||||||
; GCN: ds_gws_barrier v0 offset:8 gds
|
|
||||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
||||||
; GCN-NEXT: load_dword
|
|
||||||
define amdgpu_kernel void @gws_barrier_fence_after(i32 %val, i32 addrspace(1)* %ptr) #0 {
|
|
||||||
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
|
|
||||||
fence release
|
|
||||||
%load = load volatile i32, i32 addrspace(1)* %ptr
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
; FIXME: Should a wait be inserted here, or is an explicit fence needed?
|
|
||||||
; GCN-LABEL: {{^}}gws_init_barrier:
|
|
||||||
; GCN: s_mov_b32 m0, -1
|
|
||||||
; GCN: ds_gws_init v0 offset:8 gds
|
|
||||||
; GCN-NEXT: ds_gws_barrier v0 offset:8 gds
|
|
||||||
define amdgpu_kernel void @gws_init_barrier(i32 %val) #0 {
|
|
||||||
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7)
|
|
||||||
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
; FIXME: Why vmcnt, not expcnt?
|
|
||||||
; GCN-LABEL: {{^}}gws_init_fence_barrier:
|
|
||||||
; GCN: s_mov_b32 m0, -1
|
|
||||||
; GCN: ds_gws_init v0 offset:8 gds
|
|
||||||
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
||||||
; GCN-NEXT: ds_gws_barrier v0 offset:8 gds
|
|
||||||
define amdgpu_kernel void @gws_init_fence_barrier(i32 %val) #0 {
|
|
||||||
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7)
|
|
||||||
fence release
|
|
||||||
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
declare void @llvm.amdgcn.ds.gws.barrier(i32, i32) #1
|
|
||||||
declare void @llvm.amdgcn.ds.gws.init(i32, i32) #2
|
|
||||||
declare i32 @llvm.amdgcn.workitem.id.x() #3
|
|
||||||
|
|
||||||
attributes #0 = { nounwind }
|
|
||||||
attributes #1 = { convergent inaccessiblememonly nounwind }
|
|
||||||
attributes #2 = { convergent inaccessiblememonly nounwind writeonly }
|
|
||||||
attributes #3 = { nounwind readnone speculatable }
|
|
|
@ -1,119 +0,0 @@
|
||||||
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
|
|
||||||
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
|
|
||||||
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
|
|
||||||
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
|
|
||||||
|
|
||||||
; Minimum offset
|
|
||||||
; GCN-LABEL: {{^}}gws_init_offset0:
|
|
||||||
; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
|
|
||||||
; GCN-DAG: s_mov_b32 m0, -1{{$}}
|
|
||||||
; GCN: v_mov_b32_e32 v0, [[BAR_NUM]]
|
|
||||||
; GCN: ds_gws_init v0 offset:1 gds{{$}}
|
|
||||||
define amdgpu_kernel void @gws_init_offset0(i32 %val) #0 {
|
|
||||||
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 0)
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
; Maximum offset
|
|
||||||
; GCN-LABEL: {{^}}gws_init_offset63:
|
|
||||||
; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
|
|
||||||
; GCN-DAG: s_mov_b32 m0, -1{{$}}
|
|
||||||
; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
|
|
||||||
; GCN: ds_gws_init v0 offset:64 gds{{$}}
|
|
||||||
define amdgpu_kernel void @gws_init_offset63(i32 %val) #0 {
|
|
||||||
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 63)
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
; FIXME: Should be able to shift directly into m0
|
|
||||||
; GCN-LABEL: {{^}}gws_init_sgpr_offset:
|
|
||||||
; GCN-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}}
|
|
||||||
; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
|
|
||||||
; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
|
|
||||||
; GCN-DAG: v_mov_b32_e32 v0, s[[BAR_NUM]]
|
|
||||||
; GCN: ds_gws_init v0 gds{{$}}
|
|
||||||
define amdgpu_kernel void @gws_init_sgpr_offset(i32 %val, i32 %offset) #0 {
|
|
||||||
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %offset)
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
; Variable offset in SGPR with constant add
|
|
||||||
; GCN-LABEL: {{^}}gws_init_sgpr_offset_add1:
|
|
||||||
; GCN-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}}
|
|
||||||
; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
|
|
||||||
; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
|
|
||||||
; GCN-DAG: v_mov_b32_e32 v0, s[[BAR_NUM]]
|
|
||||||
; GCN: ds_gws_init v0 offset:1 gds{{$}}
|
|
||||||
define amdgpu_kernel void @gws_init_sgpr_offset_add1(i32 %val, i32 %offset.base) #0 {
|
|
||||||
%offset = add i32 %offset.base, 1
|
|
||||||
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %offset)
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}gws_init_vgpr_offset:
|
|
||||||
; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
|
|
||||||
; GCN-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
|
|
||||||
; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
|
|
||||||
; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
|
|
||||||
; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
|
|
||||||
; GCN: ds_gws_init v0 gds{{$}}
|
|
||||||
define amdgpu_kernel void @gws_init_vgpr_offset(i32 %val) #0 {
|
|
||||||
%vgpr.offset = call i32 @llvm.amdgcn.workitem.id.x()
|
|
||||||
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %vgpr.offset)
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
; Variable offset in VGPR with constant add
|
|
||||||
; GCN-LABEL: {{^}}gws_init_vgpr_offset_add:
|
|
||||||
; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
|
|
||||||
; GCN-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
|
|
||||||
; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
|
|
||||||
; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
|
|
||||||
; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
|
|
||||||
; GCN: ds_gws_init v0 offset:3 gds{{$}}
|
|
||||||
define amdgpu_kernel void @gws_init_vgpr_offset_add(i32 %val) #0 {
|
|
||||||
%vgpr.offset.base = call i32 @llvm.amdgcn.workitem.id.x()
|
|
||||||
%vgpr.offset = add i32 %vgpr.offset.base, 3
|
|
||||||
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %vgpr.offset)
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
@lds = internal unnamed_addr addrspace(3) global i32 undef
|
|
||||||
|
|
||||||
; Check if m0 initialization is shared.
|
|
||||||
; GCN-LABEL: {{^}}gws_init_save_m0_init_constant_offset:
|
|
||||||
; GCN: s_mov_b32 m0, -1
|
|
||||||
; GCN-NOT: s_mov_b32 m0
|
|
||||||
define amdgpu_kernel void @gws_init_save_m0_init_constant_offset(i32 %val) #0 {
|
|
||||||
store i32 1, i32 addrspace(3)* @lds
|
|
||||||
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 10)
|
|
||||||
store i32 2, i32 addrspace(3)* @lds
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}gws_init_lgkmcnt:
|
|
||||||
; GCN: ds_gws_init v0 offset:1 gds{{$}}
|
|
||||||
; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0)
|
|
||||||
; GCN-NEXT: s_setpc_b64
|
|
||||||
define void @gws_init_lgkmcnt(i32 %val) {
|
|
||||||
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 0)
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
; Does not imply memory fence on its own
|
|
||||||
; GCN-LABEL: {{^}}gws_init_wait_before:
|
|
||||||
; GCN: store_dword
|
|
||||||
; CIPLUS-NOT: s_waitcnt
|
|
||||||
; GCN: ds_gws_init v0 offset:8 gds
|
|
||||||
define amdgpu_kernel void @gws_init_wait_before(i32 %val, i32 addrspace(1)* %ptr) #0 {
|
|
||||||
store i32 0, i32 addrspace(1)* %ptr
|
|
||||||
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7)
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
declare void @llvm.amdgcn.ds.gws.init(i32, i32) #1
|
|
||||||
declare i32 @llvm.amdgcn.workitem.id.x() #2
|
|
||||||
|
|
||||||
attributes #0 = { nounwind }
|
|
||||||
attributes #1 = { convergent inaccessiblememonly nounwind writeonly }
|
|
||||||
attributes #2 = { nounwind readnone speculatable }
|
|
|
@ -6,8 +6,6 @@
|
||||||
declare void @nonconvergent_func() #0
|
declare void @nonconvergent_func() #0
|
||||||
declare void @convergent_func() #1
|
declare void @convergent_func() #1
|
||||||
declare void @llvm.amdgcn.s.barrier() #1
|
declare void @llvm.amdgcn.s.barrier() #1
|
||||||
declare void @llvm.amdgcn.ds.gws.init(i32, i32) #2
|
|
||||||
declare void @llvm.amdgcn.ds.gws.barrier(i32, i32) #2
|
|
||||||
|
|
||||||
; barrier shouldn't be duplicated.
|
; barrier shouldn't be duplicated.
|
||||||
|
|
||||||
|
@ -102,52 +100,6 @@ call:
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}taildup_gws_init:
|
|
||||||
; GCN: ds_gws_init
|
|
||||||
; GCN-NOT: ds_gws_init
|
|
||||||
define amdgpu_kernel void @taildup_gws_init(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i1 %cond, i32 %val, i32 %offset) #0 {
|
|
||||||
entry:
|
|
||||||
br i1 %cond, label %bb1, label %bb2
|
|
||||||
|
|
||||||
bb1:
|
|
||||||
store i32 0, i32 addrspace(1)* %a
|
|
||||||
br label %call
|
|
||||||
|
|
||||||
bb2:
|
|
||||||
store i32 1, i32 addrspace(1)* %a
|
|
||||||
br label %call
|
|
||||||
|
|
||||||
call:
|
|
||||||
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %offset)
|
|
||||||
br label %ret
|
|
||||||
|
|
||||||
ret:
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}taildup_gws_barrier:
|
|
||||||
; GCN: ds_gws_barrier
|
|
||||||
; GCN-NOT: ds_gws_barrier
|
|
||||||
define amdgpu_kernel void @taildup_gws_barrier(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i1 %cond, i32 %val, i32 %offset) #0 {
|
|
||||||
entry:
|
|
||||||
br i1 %cond, label %bb1, label %bb2
|
|
||||||
|
|
||||||
bb1:
|
|
||||||
store i32 0, i32 addrspace(1)* %a
|
|
||||||
br label %call
|
|
||||||
|
|
||||||
bb2:
|
|
||||||
store i32 1, i32 addrspace(1)* %a
|
|
||||||
br label %call
|
|
||||||
|
|
||||||
call:
|
|
||||||
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %offset)
|
|
||||||
br label %ret
|
|
||||||
|
|
||||||
ret:
|
|
||||||
ret void
|
|
||||||
}
|
|
||||||
|
|
||||||
attributes #0 = { nounwind }
|
attributes #0 = { nounwind }
|
||||||
attributes #1 = { nounwind convergent }
|
attributes #1 = { nounwind convergent }
|
||||||
attributes #2 = { convergent inaccessiblememonly nounwind }
|
|
||||||
|
|
Loading…
Reference in New Issue