diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index c8310099ee7e..a26aa02c75f4 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1348,28 +1348,6 @@ def int_amdgcn_alignbyte : Intrinsic<[llvm_i32_ty], [IntrNoMem, IntrSpeculatable] >; -// llvm.amdgcn.ds.gws.init(i32 bar_val, i32 resource_id) -// -// bar_val is the total number of waves that will wait on this -// barrier, minus 1. -def int_amdgcn_ds_gws_init : - GCCBuiltin<"__builtin_amdgcn_ds_gws_init">, - Intrinsic<[], - [llvm_i32_ty, llvm_i32_ty], - [IntrConvergent, IntrWriteMem, IntrInaccessibleMemOnly], "", - [SDNPMemOperand] ->; - -// llvm.amdgcn.ds.gws.barrier(i32 vsrc0, i32 resource_id) -// bar_val is the total number of waves that will wait on this -// barrier, minus 1. -def int_amdgcn_ds_gws_barrier : - GCCBuiltin<"__builtin_amdgcn_ds_gws_barrier">, - Intrinsic<[], - [llvm_i32_ty, llvm_i32_ty], - [IntrConvergent, IntrInaccessibleMemOnly], "", - [SDNPMemOperand] ->; // Copies the source value to the destination value, with the guarantee that // the source value is computed as if the entire program were executed in WQM. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index a4d096ac3272..0c880a31cd12 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -218,9 +218,7 @@ private: void SelectFMAD_FMA(SDNode *N); void SelectATOMIC_CMP_SWAP(SDNode *N); void SelectDSAppendConsume(SDNode *N, unsigned IntrID); - void SelectDS_GWS(SDNode *N, unsigned IntrID); void SelectINTRINSIC_W_CHAIN(SDNode *N); - void SelectINTRINSIC_VOID(SDNode *N); protected: // Include the pieces autogenerated from the target description. @@ -834,10 +832,6 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { SelectINTRINSIC_W_CHAIN(N); return; } - case ISD::INTRINSIC_VOID: { - SelectINTRINSIC_VOID(N); - return; - } } SelectCode(N); @@ -2040,73 +2034,6 @@ void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) { CurDAG->setNodeMemRefs(cast(Selected), {MMO}); } -void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) { - SDLoc SL(N); - SDValue VSrc0 = N->getOperand(2); - SDValue BaseOffset = N->getOperand(3); - int ImmOffset = 0; - SDNode *CopyToM0; - MemIntrinsicSDNode *M = cast(N); - MachineMemOperand *MMO = M->getMemOperand(); - - // Don't worry if the offset ends up in a VGPR. Only one lane will have - // effect, so SIFixSGPRCopies will validly insert readfirstlane. - - // The resource id offset is computed as ( + M0[21:16] + - // offset field) % 64. Some versions of the programming guide omit the m0 - // part, or claim it's from offset 0. - if (ConstantSDNode *ConstOffset = dyn_cast(BaseOffset)) { - // If we have a constant offset, try to use the default value for m0 as a - // base to possibly avoid setting it up. - CopyToM0 = glueCopyToM0(N, CurDAG->getTargetConstant(-1, SL, MVT::i32)); - ImmOffset = ConstOffset->getZExtValue() + 1; - } else { - if (CurDAG->isBaseWithConstantOffset(BaseOffset)) { - ImmOffset = BaseOffset.getConstantOperandVal(1); - BaseOffset = BaseOffset.getOperand(0); - } - - // Prefer to do the shift in an SGPR since it should be possible to use m0 - // as the result directly. If it's already an SGPR, it will be eliminated - // later. - SDNode *SGPROffset - = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32, - BaseOffset); - // Shift to offset in m0 - SDNode *M0Base - = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32, - SDValue(SGPROffset, 0), - CurDAG->getTargetConstant(16, SL, MVT::i32)); - CopyToM0 = glueCopyToM0(N, SDValue(M0Base, 0)); - } - - // The manual doesn't mention this, but it seems only v0 works. - SDValue V0 = CurDAG->getRegister(AMDGPU::VGPR0, MVT::i32); - - SDValue CopyToV0 = CurDAG->getCopyToReg( - SDValue(CopyToM0, 0), SL, V0, VSrc0, - N->getOperand(N->getNumOperands() - 1)); - - SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32); - - // TODO: Can this just be removed from the instruction? - SDValue GDS = CurDAG->getTargetConstant(1, SL, MVT::i1); - - unsigned Opc = IntrID == Intrinsic::amdgcn_ds_gws_init ? - AMDGPU::DS_GWS_INIT : AMDGPU::DS_GWS_BARRIER; - - SDValue Ops[] = { - V0, - OffsetField, - GDS, - CopyToV0, // Chain - CopyToV0.getValue(1) // Glue - }; - - SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); - CurDAG->setNodeMemRefs(cast(Selected), {MMO}); -} - void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) { unsigned IntrID = cast(N->getOperand(1))->getZExtValue(); switch (IntrID) { @@ -2117,18 +2044,6 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) { SelectDSAppendConsume(N, IntrID); return; } - } - - SelectCode(N); -} - -void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) { - unsigned IntrID = cast(N->getOperand(1))->getZExtValue(); - switch (IntrID) { - case Intrinsic::amdgcn_ds_gws_init: - case Intrinsic::amdgcn_ds_gws_barrier: - SelectDS_GWS(N, IntrID); - return; default: break; } diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index 248f6599f7ee..db2dcff5c569 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -467,15 +467,11 @@ defm DS_WRXCHG_RTN_B64 : DS_1A1D_RET_mc<"ds_wrxchg_rtn_b64", VReg_64>; defm DS_WRXCHG2_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2_rtn_b64", VReg_128, VReg_64>; defm DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2st64_rtn_b64", VReg_128, VReg_64>; -let isConvergent = 1 in { -def DS_GWS_INIT : DS_GWS_1D<"ds_gws_init"> { - let mayLoad = 0; -} +def DS_GWS_INIT : DS_GWS_1D<"ds_gws_init">; def DS_GWS_SEMA_V : DS_GWS_0D<"ds_gws_sema_v">; def DS_GWS_SEMA_BR : DS_GWS_1D<"ds_gws_sema_br">; def DS_GWS_SEMA_P : DS_GWS_0D<"ds_gws_sema_p">; def DS_GWS_BARRIER : DS_GWS_1D<"ds_gws_barrier">; -} def DS_ADD_SRC2_U32 : DS_1A<"ds_add_src2_u32">; def DS_SUB_SRC2_U32 : DS_1A<"ds_sub_src2_u32">; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 54cc459e1489..80acf5783add 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -961,24 +961,6 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, return true; } - case Intrinsic::amdgcn_ds_gws_init: - case Intrinsic::amdgcn_ds_gws_barrier: { - Info.opc = ISD::INTRINSIC_VOID; - - SIMachineFunctionInfo *MFI = MF.getInfo(); - Info.ptrVal = - MFI->getGWSPSV(*MF.getSubtarget().getInstrInfo()); - - // This is an abstract access, but we need to specify a type and size. - Info.memVT = MVT::i32; - Info.size = 4; - Info.align = 4; - - Info.flags = MachineMemOperand::MOStore; - if (IntrID == Intrinsic::amdgcn_ds_gws_barrier) - Info.flags = MachineMemOperand::MOLoad; - return true; - } default: return false; } diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 29c891c72af1..326942307178 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -536,19 +536,15 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, // Put score on the source vgprs. If this is a store, just use those // specific register(s). if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) { - int AddrOpIdx = - AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr); // All GDS operations must protect their address register (same as // export.) - if (AddrOpIdx != -1) { - setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore); - } else { - assert(Inst.getOpcode() == AMDGPU::DS_APPEND || - Inst.getOpcode() == AMDGPU::DS_CONSUME || - Inst.getOpcode() == AMDGPU::DS_GWS_INIT || - Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER); + if (Inst.getOpcode() != AMDGPU::DS_APPEND && + Inst.getOpcode() != AMDGPU::DS_CONSUME) { + setExpScore( + &Inst, TII, TRI, MRI, + AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr), + CurrScore); } - if (Inst.mayStore()) { if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0) != -1) { @@ -1411,6 +1407,18 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, ScoreBrackets.dump(); }); + // Check to see if this is a GWS instruction. If so, and if this is CI or + // VI, then the generated code sequence will include an S_WAITCNT 0. + // TODO: Are these the only GWS instructions? + if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT || + Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V || + Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR || + Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P || + Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) { + // TODO: && context->target_info->GwsRequiresMemViolTest() ) { + ScoreBrackets.applyWaitcnt(AMDGPU::Waitcnt::allZeroExceptVsCnt()); + } + // TODO: Remove this work-around after fixing the scheduler and enable the // assert above. if (VCCZBugWorkAround) { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 56935b35734a..bb0076c6db38 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2547,8 +2547,7 @@ bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const // given the typical code patterns. if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT || Opcode == AMDGPU::EXP || Opcode == AMDGPU::EXP_DONE || - Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP || - Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER) + Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP) return true; if (MI.isCall() || MI.isInlineAsm()) diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index b8d1ac55adb0..f7d31439a250 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -43,8 +43,7 @@ class AMDGPUPseudoSourceValue : public PseudoSourceValue { public: enum AMDGPUPSVKind : unsigned { PSVBuffer = PseudoSourceValue::TargetCustom, - PSVImage, - GWSResource + PSVImage }; protected: @@ -88,30 +87,6 @@ public: } }; -class AMDGPUGWSResourcePseudoSourceValue final : public AMDGPUPseudoSourceValue { -public: - explicit AMDGPUGWSResourcePseudoSourceValue(const TargetInstrInfo &TII) - : AMDGPUPseudoSourceValue(GWSResource, TII) {} - - static bool classof(const PseudoSourceValue *V) { - return V->kind() == GWSResource; - } - - // These are inaccessible memory from IR. - bool isAliased(const MachineFrameInfo *) const override { - return false; - } - - // These are inaccessible memory from IR. - bool mayAlias(const MachineFrameInfo *) const override { - return false; - } - - void printCustom(raw_ostream &OS) const override { - OS << "GWSResource"; - } -}; - namespace yaml { struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo { @@ -213,7 +188,6 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction { std::unique_ptr> BufferPSVs; DenseMap> ImagePSVs; - std::unique_ptr GWSResourcePSV; private: unsigned LDSWaveSpillSize = 0; @@ -700,15 +674,6 @@ public: return PSV.first->second.get(); } - const AMDGPUGWSResourcePseudoSourceValue *getGWSPSV(const SIInstrInfo &TII) { - if (!GWSResourcePSV) { - GWSResourcePSV = - llvm::make_unique(TII); - } - - return GWSResourcePSV.get(); - } - unsigned getOccupancy() const { return Occupancy; } diff --git a/llvm/test/CodeGen/AMDGPU/gws-hazards.mir b/llvm/test/CodeGen/AMDGPU/gws-hazards.mir deleted file mode 100644 index e1ce2ad9bb64..000000000000 --- a/llvm/test/CodeGen/AMDGPU/gws-hazards.mir +++ /dev/null @@ -1,103 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX9 %s -# RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=VI %s -# RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=CI %s -# RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=SI %s - ---- -name: m0_gws_init0 -tracksRegLiveness: true -body: | - - bb.0: - liveins: $vgpr0 - ; GFX9-LABEL: name: m0_gws_init0 - ; GFX9: liveins: $vgpr0 - ; GFX9: $m0 = S_MOV_B32 -1 - ; GFX9: S_NOP 0 - ; GFX9: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec - ; VI-LABEL: name: m0_gws_init0 - ; VI: liveins: $vgpr0 - ; VI: $m0 = S_MOV_B32 -1 - ; VI: S_NOP 0 - ; VI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec - ; CI-LABEL: name: m0_gws_init0 - ; CI: liveins: $vgpr0 - ; CI: $m0 = S_MOV_B32 -1 - ; CI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec - ; SI-LABEL: name: m0_gws_init0 - ; SI: liveins: $vgpr0 - ; SI: $m0 = S_MOV_B32 -1 - ; SI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec - $m0 = S_MOV_B32 -1 - DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec - -... - ---- -name: m0_gws_init1 -tracksRegLiveness: true -body: | - - bb.0: - ; GFX9-LABEL: name: m0_gws_init1 - ; GFX9: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; GFX9: $m0 = S_MOV_B32 -1 - ; GFX9: S_NOP 0 - ; GFX9: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec - ; VI-LABEL: name: m0_gws_init1 - ; VI: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; VI: $m0 = S_MOV_B32 -1 - ; VI: S_NOP 0 - ; VI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec - ; CI-LABEL: name: m0_gws_init1 - ; CI: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; CI: $m0 = S_MOV_B32 -1 - ; CI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec - ; SI-LABEL: name: m0_gws_init1 - ; SI: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; SI: $m0 = S_MOV_B32 -1 - ; SI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec - $vgpr0 = V_MOV_B32_e32 0, implicit $exec - $m0 = S_MOV_B32 -1 - DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec - -... - -# Test a typical situation where m0 needs to be set from a VGPR -# through readfirstlane ---- -name: m0_gws_readlane -tracksRegLiveness: true -body: | - - bb.0: - liveins: $vgpr0, $vgpr1 - - ; GFX9-LABEL: name: m0_gws_readlane - ; GFX9: liveins: $vgpr0, $vgpr1 - ; GFX9: $sgpr0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec - ; GFX9: $m0 = S_MOV_B32 $sgpr0 - ; GFX9: S_NOP 0 - ; GFX9: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec - ; VI-LABEL: name: m0_gws_readlane - ; VI: liveins: $vgpr0, $vgpr1 - ; VI: $sgpr0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec - ; VI: $m0 = S_MOV_B32 $sgpr0 - ; VI: S_NOP 0 - ; VI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec - ; CI-LABEL: name: m0_gws_readlane - ; CI: liveins: $vgpr0, $vgpr1 - ; CI: $sgpr0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec - ; CI: $m0 = S_MOV_B32 $sgpr0 - ; CI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec - ; SI-LABEL: name: m0_gws_readlane - ; SI: liveins: $vgpr0, $vgpr1 - ; SI: $sgpr0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec - ; SI: $m0 = S_MOV_B32 $sgpr0 - ; SI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec - $sgpr0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec - $m0 = S_MOV_B32 $sgpr0 - DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec - -... diff --git a/llvm/test/CodeGen/AMDGPU/insert-skips-gws.mir b/llvm/test/CodeGen/AMDGPU/insert-skips-gws.mir deleted file mode 100644 index c84372086fd3..000000000000 --- a/llvm/test/CodeGen/AMDGPU/insert-skips-gws.mir +++ /dev/null @@ -1,59 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass si-insert-skips -amdgpu-skip-threshold=1 -verify-machineinstrs %s -o - | FileCheck %s -# Make sure mandatory skips are inserted to ensure GWS ops aren't run with exec = 0 - ---- - -name: skip_gws_init -body: | - ; CHECK-LABEL: name: skip_gws_init - ; CHECK: bb.0: - ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000) - ; CHECK: SI_MASK_BRANCH %bb.2, implicit $exec - ; CHECK: S_CBRANCH_EXECZ %bb.2, implicit $exec - ; CHECK: bb.1: - ; CHECK: successors: %bb.2(0x80000000) - ; CHECK: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; CHECK: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec - ; CHECK: bb.2: - ; CHECK: S_ENDPGM 0 - bb.0: - successors: %bb.1, %bb.2 - SI_MASK_BRANCH %bb.2, implicit $exec - - bb.1: - successors: %bb.2 - $vgpr0 = V_MOV_B32_e32 0, implicit $exec - DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec - - bb.2: - S_ENDPGM 0 -... - ---- - -name: skip_gws_barrier -body: | - ; CHECK-LABEL: name: skip_gws_barrier - ; CHECK: bb.0: - ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000) - ; CHECK: SI_MASK_BRANCH %bb.2, implicit $exec - ; CHECK: S_CBRANCH_EXECZ %bb.2, implicit $exec - ; CHECK: bb.1: - ; CHECK: successors: %bb.2(0x80000000) - ; CHECK: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; CHECK: DS_GWS_BARRIER $vgpr0, 0, 1, implicit $m0, implicit $exec - ; CHECK: bb.2: - ; CHECK: S_ENDPGM 0 - bb.0: - successors: %bb.1, %bb.2 - SI_MASK_BRANCH %bb.2, implicit $exec - - bb.1: - successors: %bb.2 - $vgpr0 = V_MOV_B32_e32 0, implicit $exec - DS_GWS_BARRIER $vgpr0, 0, 1, implicit $m0, implicit $exec - - bb.2: - S_ENDPGM 0 -... diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll deleted file mode 100644 index 6c347a8d037b..000000000000 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll +++ /dev/null @@ -1,179 +0,0 @@ -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -mattr=+flat-for-global -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIPLUS %s -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIPLUS %s -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIPLUS %s - -; Minimum offset -; GCN-LABEL: {{^}}gws_barrier_offset0: -; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]] -; GCN-DAG: s_mov_b32 m0, -1{{$}} -; GCN: v_mov_b32_e32 v0, [[BAR_NUM]] -; GCN: ds_gws_barrier v0 offset:1 gds{{$}} -define amdgpu_kernel void @gws_barrier_offset0(i32 %val) #0 { - call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 0) - ret void -} - -; Maximum offset -; GCN-LABEL: {{^}}gws_barrier_offset63: -; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]] -; GCN-DAG: s_mov_b32 m0, -1{{$}} -; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]] -; GCN: ds_gws_barrier v0 offset:64 gds{{$}} -define amdgpu_kernel void @gws_barrier_offset63(i32 %val) #0 { - call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 63) - ret void -} - -; FIXME: Should be able to shift directly into m0 -; GCN-LABEL: {{^}}gws_barrier_sgpr_offset: -; GCN-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}} -; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16 -; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}} -; GCN-DAG: v_mov_b32_e32 v0, s[[BAR_NUM]] -; GCN: ds_gws_barrier v0 gds{{$}} -define amdgpu_kernel void @gws_barrier_sgpr_offset(i32 %val, i32 %offset) #0 { - call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %offset) - ret void -} - -; Variable offset in SGPR with constant add -; GCN-LABEL: {{^}}gws_barrier_sgpr_offset_add1: -; GCN-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}} -; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16 -; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}} -; GCN-DAG: v_mov_b32_e32 v0, s[[BAR_NUM]] -; GCN: ds_gws_barrier v0 offset:1 gds{{$}} -define amdgpu_kernel void @gws_barrier_sgpr_offset_add1(i32 %val, i32 %offset.base) #0 { - %offset = add i32 %offset.base, 1 - call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %offset) - ret void -} - -; GCN-LABEL: {{^}}gws_barrier_vgpr_offset: -; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]] -; GCN-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0 -; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16 -; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}} -; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]] -; GCN: ds_gws_barrier v0 gds{{$}} -define amdgpu_kernel void @gws_barrier_vgpr_offset(i32 %val) #0 { - %vgpr.offset = call i32 @llvm.amdgcn.workitem.id.x() - call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %vgpr.offset) - ret void -} - -; Variable offset in VGPR with constant add -; GCN-LABEL: {{^}}gws_barrier_vgpr_offset_add: -; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]] -; GCN-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0 -; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16 -; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}} -; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]] -; GCN: ds_gws_barrier v0 offset:3 gds{{$}} -define amdgpu_kernel void @gws_barrier_vgpr_offset_add(i32 %val) #0 { - %vgpr.offset.base = call i32 @llvm.amdgcn.workitem.id.x() - %vgpr.offset = add i32 %vgpr.offset.base, 3 - call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %vgpr.offset) - ret void -} - -@lds = internal unnamed_addr addrspace(3) global i32 undef - -; Check if m0 initialization is shared -; GCN-LABEL: {{^}}gws_barrier_save_m0_barrier_constant_offset: -; GCN: s_mov_b32 m0, -1 -; GCN-NOT: s_mov_b32 m0 -define amdgpu_kernel void @gws_barrier_save_m0_barrier_constant_offset(i32 %val) #0 { - store i32 1, i32 addrspace(3)* @lds - call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 10) - store i32 2, i32 addrspace(3)* @lds - ret void -} - -; Make sure this increments lgkmcnt -; GCN-LABEL: {{^}}gws_barrier_lgkmcnt: -; GCN: ds_gws_barrier v0 offset:1 gds{{$}} -; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 -define void @gws_barrier_lgkmcnt(i32 %val) { - call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 0) - ret void -} - -; Does not imply memory fence on its own -; GCN-LABEL: {{^}}gws_barrier_wait_before: -; GCN: store_dword -; CIPLUS-NOT: s_waitcnt -; GCN: ds_gws_barrier v0 offset:8 gds -define amdgpu_kernel void @gws_barrier_wait_before(i32 %val, i32 addrspace(1)* %ptr) #0 { - store i32 0, i32 addrspace(1)* %ptr - call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7) - ret void -} - -; GCN-LABEL: {{^}}gws_barrier_wait_after: -; GCN: ds_gws_barrier v0 offset:8 gds -; GCN-NEXT: s_waitcnt expcnt(0){{$}} -; GCN-NEXT: load_dword -define amdgpu_kernel void @gws_barrier_wait_after(i32 %val, i32 addrspace(1)* %ptr) #0 { - call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7) - %load = load volatile i32, i32 addrspace(1)* %ptr - ret void -} - -; Does not imply memory fence on its own -; GCN-LABEL: {{^}}gws_barrier_fence_before: -; GCN: store_dword -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN: ds_gws_barrier v0 offset:8 gds -define amdgpu_kernel void @gws_barrier_fence_before(i32 %val, i32 addrspace(1)* %ptr) #0 { - store i32 0, i32 addrspace(1)* %ptr - fence release - call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7) - ret void -} - -; GCN-LABEL: {{^}}gws_barrier_fence_after: -; GCN: ds_gws_barrier v0 offset:8 gds -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: load_dword -define amdgpu_kernel void @gws_barrier_fence_after(i32 %val, i32 addrspace(1)* %ptr) #0 { - call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7) - fence release - %load = load volatile i32, i32 addrspace(1)* %ptr - ret void -} - -; FIXME: Should a wait be inserted here, or is an explicit fence needed? -; GCN-LABEL: {{^}}gws_init_barrier: -; GCN: s_mov_b32 m0, -1 -; GCN: ds_gws_init v0 offset:8 gds -; GCN-NEXT: ds_gws_barrier v0 offset:8 gds -define amdgpu_kernel void @gws_init_barrier(i32 %val) #0 { - call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7) - call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7) - ret void -} - -; FIXME: Why vmcnt, not expcnt? -; GCN-LABEL: {{^}}gws_init_fence_barrier: -; GCN: s_mov_b32 m0, -1 -; GCN: ds_gws_init v0 offset:8 gds -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: ds_gws_barrier v0 offset:8 gds -define amdgpu_kernel void @gws_init_fence_barrier(i32 %val) #0 { - call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7) - fence release - call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7) - ret void -} - -declare void @llvm.amdgcn.ds.gws.barrier(i32, i32) #1 -declare void @llvm.amdgcn.ds.gws.init(i32, i32) #2 -declare i32 @llvm.amdgcn.workitem.id.x() #3 - -attributes #0 = { nounwind } -attributes #1 = { convergent inaccessiblememonly nounwind } -attributes #2 = { convergent inaccessiblememonly nounwind writeonly } -attributes #3 = { nounwind readnone speculatable } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll deleted file mode 100644 index 23f259033b4b..000000000000 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll +++ /dev/null @@ -1,119 +0,0 @@ -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s - -; Minimum offset -; GCN-LABEL: {{^}}gws_init_offset0: -; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]] -; GCN-DAG: s_mov_b32 m0, -1{{$}} -; GCN: v_mov_b32_e32 v0, [[BAR_NUM]] -; GCN: ds_gws_init v0 offset:1 gds{{$}} -define amdgpu_kernel void @gws_init_offset0(i32 %val) #0 { - call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 0) - ret void -} - -; Maximum offset -; GCN-LABEL: {{^}}gws_init_offset63: -; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]] -; GCN-DAG: s_mov_b32 m0, -1{{$}} -; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]] -; GCN: ds_gws_init v0 offset:64 gds{{$}} -define amdgpu_kernel void @gws_init_offset63(i32 %val) #0 { - call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 63) - ret void -} - -; FIXME: Should be able to shift directly into m0 -; GCN-LABEL: {{^}}gws_init_sgpr_offset: -; GCN-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}} -; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16 -; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}} -; GCN-DAG: v_mov_b32_e32 v0, s[[BAR_NUM]] -; GCN: ds_gws_init v0 gds{{$}} -define amdgpu_kernel void @gws_init_sgpr_offset(i32 %val, i32 %offset) #0 { - call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %offset) - ret void -} - -; Variable offset in SGPR with constant add -; GCN-LABEL: {{^}}gws_init_sgpr_offset_add1: -; GCN-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}} -; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16 -; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}} -; GCN-DAG: v_mov_b32_e32 v0, s[[BAR_NUM]] -; GCN: ds_gws_init v0 offset:1 gds{{$}} -define amdgpu_kernel void @gws_init_sgpr_offset_add1(i32 %val, i32 %offset.base) #0 { - %offset = add i32 %offset.base, 1 - call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %offset) - ret void -} - -; GCN-LABEL: {{^}}gws_init_vgpr_offset: -; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]] -; GCN-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0 -; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16 -; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}} -; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]] -; GCN: ds_gws_init v0 gds{{$}} -define amdgpu_kernel void @gws_init_vgpr_offset(i32 %val) #0 { - %vgpr.offset = call i32 @llvm.amdgcn.workitem.id.x() - call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %vgpr.offset) - ret void -} - -; Variable offset in VGPR with constant add -; GCN-LABEL: {{^}}gws_init_vgpr_offset_add: -; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]] -; GCN-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0 -; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16 -; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}} -; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]] -; GCN: ds_gws_init v0 offset:3 gds{{$}} -define amdgpu_kernel void @gws_init_vgpr_offset_add(i32 %val) #0 { - %vgpr.offset.base = call i32 @llvm.amdgcn.workitem.id.x() - %vgpr.offset = add i32 %vgpr.offset.base, 3 - call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %vgpr.offset) - ret void -} - -@lds = internal unnamed_addr addrspace(3) global i32 undef - -; Check if m0 initialization is shared. -; GCN-LABEL: {{^}}gws_init_save_m0_init_constant_offset: -; GCN: s_mov_b32 m0, -1 -; GCN-NOT: s_mov_b32 m0 -define amdgpu_kernel void @gws_init_save_m0_init_constant_offset(i32 %val) #0 { - store i32 1, i32 addrspace(3)* @lds - call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 10) - store i32 2, i32 addrspace(3)* @lds - ret void -} - -; GCN-LABEL: {{^}}gws_init_lgkmcnt: -; GCN: ds_gws_init v0 offset:1 gds{{$}} -; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 -define void @gws_init_lgkmcnt(i32 %val) { - call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 0) - ret void -} - -; Does not imply memory fence on its own -; GCN-LABEL: {{^}}gws_init_wait_before: -; GCN: store_dword -; CIPLUS-NOT: s_waitcnt -; GCN: ds_gws_init v0 offset:8 gds -define amdgpu_kernel void @gws_init_wait_before(i32 %val, i32 addrspace(1)* %ptr) #0 { - store i32 0, i32 addrspace(1)* %ptr - call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7) - ret void -} - -declare void @llvm.amdgcn.ds.gws.init(i32, i32) #1 -declare i32 @llvm.amdgcn.workitem.id.x() #2 - -attributes #0 = { nounwind } -attributes #1 = { convergent inaccessiblememonly nounwind writeonly } -attributes #2 = { nounwind readnone speculatable } diff --git a/llvm/test/CodeGen/AMDGPU/tail-duplication-convergent.ll b/llvm/test/CodeGen/AMDGPU/tail-duplication-convergent.ll index 34b26af000bb..70790311480c 100644 --- a/llvm/test/CodeGen/AMDGPU/tail-duplication-convergent.ll +++ b/llvm/test/CodeGen/AMDGPU/tail-duplication-convergent.ll @@ -6,8 +6,6 @@ declare void @nonconvergent_func() #0 declare void @convergent_func() #1 declare void @llvm.amdgcn.s.barrier() #1 -declare void @llvm.amdgcn.ds.gws.init(i32, i32) #2 -declare void @llvm.amdgcn.ds.gws.barrier(i32, i32) #2 ; barrier shouldn't be duplicated. @@ -102,52 +100,6 @@ call: ret void } -; GCN-LABEL: {{^}}taildup_gws_init: -; GCN: ds_gws_init -; GCN-NOT: ds_gws_init -define amdgpu_kernel void @taildup_gws_init(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i1 %cond, i32 %val, i32 %offset) #0 { -entry: - br i1 %cond, label %bb1, label %bb2 - -bb1: - store i32 0, i32 addrspace(1)* %a - br label %call - -bb2: - store i32 1, i32 addrspace(1)* %a - br label %call - -call: - call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %offset) - br label %ret - -ret: - ret void -} - -; GCN-LABEL: {{^}}taildup_gws_barrier: -; GCN: ds_gws_barrier -; GCN-NOT: ds_gws_barrier -define amdgpu_kernel void @taildup_gws_barrier(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i1 %cond, i32 %val, i32 %offset) #0 { -entry: - br i1 %cond, label %bb1, label %bb2 - -bb1: - store i32 0, i32 addrspace(1)* %a - br label %call - -bb2: - store i32 1, i32 addrspace(1)* %a - br label %call - -call: - call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %offset) - br label %ret - -ret: - ret void -} attributes #0 = { nounwind } attributes #1 = { nounwind convergent } -attributes #2 = { convergent inaccessiblememonly nounwind }