From 8bcc9bb595931d281410d3b3f781d4515aeb66b0 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Thu, 13 Jun 2019 19:18:29 +0000 Subject: [PATCH] [AMDGPU] gfx1010 base changes for wave32 Differential Revision: https://reviews.llvm.org/D63293 llvm-svn: 363299 --- llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 4 +- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 10 ++++ .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 58 +++++++++++++++---- .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 7 ++- .../AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp | 2 +- llvm/lib/Target/AMDGPU/SIInstrInfo.td | 5 ++ llvm/lib/Target/AMDGPU/SIInstructions.td | 18 ++++++ llvm/lib/Target/AMDGPU/SOPInstructions.td | 32 ++++++++++ .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 22 ++++--- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 18 +++++- llvm/lib/Target/AMDGPU/VOP2Instructions.td | 35 +++++++++++ llvm/lib/Target/AMDGPU/VOPCInstructions.td | 23 ++++++++ 12 files changed, 209 insertions(+), 25 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index 13b6758afab5..be0ba2fe7ae8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -69,9 +69,11 @@ class PredicateControl { Predicate SubtargetPredicate = TruePredicate; list AssemblerPredicates = []; Predicate AssemblerPredicate = TruePredicate; + Predicate WaveSizePredicate = TruePredicate; list OtherPredicates = []; list Predicates = !listconcat([SubtargetPredicate, - AssemblerPredicate], + AssemblerPredicate, + WaveSizePredicate], AssemblerPredicates, OtherPredicates); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 3671c919a62e..6b12fb2a3496 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -94,6 +94,16 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS + // Disable mutually exclusive bits. + if (FS.find_lower("+wavefrontsize") != StringRef::npos) { + if (FS.find_lower("wavefrontsize16") == StringRef::npos) + FullFS += "-wavefrontsize16,"; + if (FS.find_lower("wavefrontsize32") == StringRef::npos) + FullFS += "-wavefrontsize32,"; + if (FS.find_lower("wavefrontsize64") == StringRef::npos) + FullFS += "-wavefrontsize64,"; + } + FullFS += FS; ParseSubtargetFeatures(GPU, FullFS); diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 78ac9dda0357..54dfb1196ce5 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -375,6 +375,8 @@ public: return isRegOrInlineNoMods(AMDGPU::SReg_64RegClassID, MVT::i64); } + bool isBoolReg() const; + bool isSCSrcF16() const { return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::f16); } @@ -616,6 +618,10 @@ public: void addRegOperands(MCInst &Inst, unsigned N) const; + void addBoolRegOperands(MCInst &Inst, unsigned N) const { + addRegOperands(Inst, N); + } + void addRegOrImmOperands(MCInst &Inst, unsigned N) const { if (isRegKind()) addRegOperands(Inst, N); @@ -881,6 +887,8 @@ private: /// \param VCCUsed [in] Whether VCC special SGPR is reserved. /// \param FlatScrUsed [in] Whether FLAT_SCRATCH special SGPR is reserved. /// \param XNACKUsed [in] Whether XNACK_MASK special SGPR is reserved. + /// \param EnableWavefrontSize32 [in] Value of ENABLE_WAVEFRONT_SIZE32 kernel + /// descriptor field, if valid. /// \param NextFreeVGPR [in] Max VGPR number referenced, plus one. /// \param VGPRRange [in] Token range, used for VGPR diagnostics. /// \param NextFreeSGPR [in] Max SGPR number referenced, plus one. @@ -889,9 +897,10 @@ private: /// \param SGPRBlocks [out] Result SGPR block count. bool calculateGPRBlocks(const FeatureBitset &Features, bool VCCUsed, bool FlatScrUsed, bool XNACKUsed, - unsigned NextFreeVGPR, SMRange VGPRRange, - unsigned NextFreeSGPR, SMRange SGPRRange, - unsigned &VGPRBlocks, unsigned &SGPRBlocks); + Optional EnableWavefrontSize32, unsigned NextFreeVGPR, + SMRange VGPRRange, unsigned NextFreeSGPR, + SMRange SGPRRange, unsigned &VGPRBlocks, + unsigned &SGPRBlocks); bool ParseDirectiveAMDGCNTarget(); bool ParseDirectiveAMDHSAKernel(); bool ParseDirectiveMajorMinor(uint32_t &Major, uint32_t &Minor); @@ -1159,6 +1168,7 @@ private: bool validateMIMGDim(const MCInst &Inst); bool validateLdsDirect(const MCInst &Inst); bool validateOpSel(const MCInst &Inst); + bool validateVccOperand(unsigned Reg) const; bool validateVOP3Literal(const MCInst &Inst) const; bool usesConstantBus(const MCInst &Inst, unsigned OpIdx); bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const; @@ -1190,6 +1200,7 @@ public: OperandMatchResultTy parseInterpSlot(OperandVector &Operands); OperandMatchResultTy parseInterpAttr(OperandVector &Operands); OperandMatchResultTy parseSOppBrTarget(OperandVector &Operands); + OperandMatchResultTy parseBoolReg(OperandVector &Operands); bool parseSwizzleOperands(const unsigned OpNum, int64_t* Op, const unsigned MinVal, @@ -1479,6 +1490,11 @@ bool AMDGPUOperand::isSDWAInt32Operand() const { return isSDWAOperand(MVT::i32); } +bool AMDGPUOperand::isBoolReg() const { + return AsmParser->getFeatureBits()[AMDGPU::FeatureWavefrontSize64] ? + isSCSrcB64() : isSCSrcB32(); +} + uint64_t AMDGPUOperand::applyInputFPModifiers(uint64_t Val, unsigned Size) const { assert(isImmTy(ImmTyNone) && Imm.Mods.hasFPModifiers()); @@ -3030,6 +3046,13 @@ bool AMDGPUAsmParser::validateOpSel(const MCInst &Inst) { return true; } +// Check if VCC register matches wavefront size +bool AMDGPUAsmParser::validateVccOperand(unsigned Reg) const { + auto FB = getFeatureBits(); + return (FB[AMDGPU::FeatureWavefrontSize64] && Reg == AMDGPU::VCC) || + (FB[AMDGPU::FeatureWavefrontSize32] && Reg == AMDGPU::VCC_LO); +} + // VOP3 literal is only allowed in GFX10+ and only one can be used bool AMDGPUAsmParser::validateVOP3Literal(const MCInst &Inst) const { unsigned Opcode = Inst.getOpcode(); @@ -3267,9 +3290,9 @@ bool AMDGPUAsmParser::OutOfRangeError(SMRange Range) { bool AMDGPUAsmParser::calculateGPRBlocks( const FeatureBitset &Features, bool VCCUsed, bool FlatScrUsed, - bool XNACKUsed, unsigned NextFreeVGPR, SMRange VGPRRange, - unsigned NextFreeSGPR, SMRange SGPRRange, unsigned &VGPRBlocks, - unsigned &SGPRBlocks) { + bool XNACKUsed, Optional EnableWavefrontSize32, unsigned NextFreeVGPR, + SMRange VGPRRange, unsigned NextFreeSGPR, SMRange SGPRRange, + unsigned &VGPRBlocks, unsigned &SGPRBlocks) { // TODO(scott.linder): These calculations are duplicated from // AMDGPUAsmPrinter::getSIProgramInfo and could be unified. IsaVersion Version = getIsaVersion(getSTI().getCPU()); @@ -3298,7 +3321,8 @@ bool AMDGPUAsmParser::calculateGPRBlocks( NumSGPRs = IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG; } - VGPRBlocks = IsaInfo::getNumVGPRBlocks(&getSTI(), NumVGPRs); + VGPRBlocks = + IsaInfo::getNumVGPRBlocks(&getSTI(), NumVGPRs, EnableWavefrontSize32); SGPRBlocks = IsaInfo::getNumSGPRBlocks(&getSTI(), NumSGPRs); return false; @@ -3329,6 +3353,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { bool ReserveVCC = true; bool ReserveFlatScr = true; bool ReserveXNACK = hasXNACK(); + Optional EnableWavefrontSize32; while (true) { while (getLexer().is(AsmToken::EndOfStatement)) @@ -3547,8 +3572,9 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { unsigned VGPRBlocks; unsigned SGPRBlocks; if (calculateGPRBlocks(getFeatureBits(), ReserveVCC, ReserveFlatScr, - ReserveXNACK, NextFreeVGPR, VGPRRange, NextFreeSGPR, - SGPRRange, VGPRBlocks, SGPRBlocks)) + ReserveXNACK, EnableWavefrontSize32, NextFreeVGPR, + VGPRRange, NextFreeSGPR, SGPRRange, VGPRBlocks, + SGPRBlocks)) return true; if (!isUInt( @@ -5383,6 +5409,15 @@ AMDGPUAsmParser::parseSOppBrTarget(OperandVector &Operands) { } } +//===----------------------------------------------------------------------===// +// Boolean holding registers +//===----------------------------------------------------------------------===// + +OperandMatchResultTy +AMDGPUAsmParser::parseBoolReg(OperandVector &Operands) { + return parseReg(Operands); +} + //===----------------------------------------------------------------------===// // mubuf //===----------------------------------------------------------------------===// @@ -6294,7 +6329,7 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands, bool I } AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); // Add the register arguments - if (Op.isReg() && Op.getReg() == AMDGPU::VCC) { + if (Op.isReg() && validateVccOperand(Op.getReg())) { // VOP2b (v_add_u32, v_sub_u32 ...) dpp use "vcc" token. // Skip it. continue; @@ -6437,7 +6472,8 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, for (unsigned E = Operands.size(); I != E; ++I) { AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); - if (skipVcc && !skippedVcc && Op.isReg() && Op.getReg() == AMDGPU::VCC) { + if (skipVcc && !skippedVcc && Op.isReg() && + (Op.getReg() == AMDGPU::VCC || Op.getReg() == AMDGPU::VCC_LO)) { // VOP2b (v_add_u32, v_sub_u32 ...) sdwa use "vcc" token as dst. // Skip it if it's 2nd (e.g. v_add_i32_sdwa v1, vcc, v2, v3) // or 4th (v_addc_u32_sdwa v1, vcc, v2, v3, vcc) operand. diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index 18325fe59f22..a12e634c8464 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -442,6 +442,7 @@ void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo, printOperand(MI, OpNo, STI, O); + // Print default vcc/vcc_lo operand. switch (MI->getOpcode()) { default: break; @@ -589,7 +590,8 @@ void AMDGPUInstPrinter::printDefaultVccOperand(unsigned OpNo, raw_ostream &O) { if (OpNo > 0) O << ", "; - printRegOperand(AMDGPU::VCC, O, MRI); + printRegOperand(STI.getFeatureBits()[AMDGPU::FeatureWavefrontSize64] ? + AMDGPU::VCC : AMDGPU::VCC_LO, O, MRI); if (OpNo == 0) O << ", "; } @@ -597,6 +599,7 @@ void AMDGPUInstPrinter::printDefaultVccOperand(unsigned OpNo, void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { + // Print default vcc/vcc_lo operand of VOPC. const MCInstrDesc &Desc = MII.get(MI->getOpcode()); if (OpNo == 0 && (Desc.TSFlags & SIInstrFlags::VOPC) && (Desc.hasImplicitDefOfPhysReg(AMDGPU::VCC) || @@ -680,6 +683,7 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, O << "/*INV_OP*/"; } + // Print default vcc/vcc_lo operand of v_cndmask_b32_e32. switch (MI->getOpcode()) { default: break; @@ -749,6 +753,7 @@ void AMDGPUInstPrinter::printOperandAndIntInputMods(const MCInst *MI, if (InputModifiers & SISrcMods::SEXT) O << ')'; + // Print default vcc/vcc_lo operand of VOP2b. switch (MI->getOpcode()) { default: break; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp index 4776a176be6b..40da1875ee8c 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp @@ -389,7 +389,7 @@ SIMCCodeEmitter::getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo, const MCOperand &MO = MI.getOperand(OpNo); unsigned Reg = MO.getReg(); - if (Reg != AMDGPU::VCC) { + if (Reg != AMDGPU::VCC && Reg != AMDGPU::VCC_LO) { RegEnc |= MRI.getEncodingValue(Reg); RegEnc &= SDWA9EncValues::VOPC_DST_SGPR_MASK; RegEnc |= SDWA9EncValues::VOPC_DST_VCC_MASK; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 80489e9f6870..1e7d7c2ee10e 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -6,6 +6,11 @@ // //===----------------------------------------------------------------------===// +def isWave32 : Predicate<"Subtarget->getWavefrontSize() == 32">, + AssemblerPredicate <"FeatureWavefrontSize32">; +def isWave64 : Predicate<"Subtarget->getWavefrontSize() == 64">, + AssemblerPredicate <"FeatureWavefrontSize64">; + def DisableInst : Predicate <"false">, AssemblerPredicate<"FeatureDisable">; class GCNPredicateControl : PredicateControl { diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 743026932391..4a3e8b3e36bd 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -188,9 +188,18 @@ class WrapTerminatorInst : SPseudoInstSI< let CodeSize = base_inst.CodeSize; } +let WaveSizePredicate = isWave64 in { def S_MOV_B64_term : WrapTerminatorInst; def S_XOR_B64_term : WrapTerminatorInst; def S_ANDN2_B64_term : WrapTerminatorInst; +} + +let WaveSizePredicate = isWave32 in { +def S_MOV_B32_term : WrapTerminatorInst; +def S_XOR_B32_term : WrapTerminatorInst; +def S_OR_B32_term : WrapTerminatorInst; +def S_ANDN2_B32_term : WrapTerminatorInst; +} def WAVE_BARRIER : SPseudoInstSI<(outs), (ins), [(int_amdgcn_wave_barrier)]> { @@ -343,6 +352,15 @@ def SI_INIT_EXEC : SPseudoInstSI < let Defs = [EXEC]; let usesCustomInserter = 1; let isAsCheapAsAMove = 1; + let WaveSizePredicate = isWave64; +} + +def SI_INIT_EXEC_LO : SPseudoInstSI < + (outs), (ins i32imm:$src), []> { + let Defs = [EXEC_LO]; + let usesCustomInserter = 1; + let isAsCheapAsAMove = 1; + let WaveSizePredicate = isWave32; } def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI < diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 342293851c35..f00f9888063c 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -275,6 +275,21 @@ let SubtargetPredicate = isGFX9Plus in { } // End SubtargetPredicate = isGFX9Plus let SubtargetPredicate = isGFX10Plus in { + let hasSideEffects = 1, Defs = [EXEC, SCC], Uses = [EXEC] in { + def S_AND_SAVEEXEC_B32 : SOP1_32<"s_and_saveexec_b32">; + def S_OR_SAVEEXEC_B32 : SOP1_32<"s_or_saveexec_b32">; + def S_XOR_SAVEEXEC_B32 : SOP1_32<"s_xor_saveexec_b32">; + def S_ANDN2_SAVEEXEC_B32 : SOP1_32<"s_andn2_saveexec_b32">; + def S_ORN2_SAVEEXEC_B32 : SOP1_32<"s_orn2_saveexec_b32">; + def S_NAND_SAVEEXEC_B32 : SOP1_32<"s_nand_saveexec_b32">; + def S_NOR_SAVEEXEC_B32 : SOP1_32<"s_nor_saveexec_b32">; + def S_XNOR_SAVEEXEC_B32 : SOP1_32<"s_xnor_saveexec_b32">; + def S_ANDN1_SAVEEXEC_B32 : SOP1_32<"s_andn1_saveexec_b32">; + def S_ORN1_SAVEEXEC_B32 : SOP1_32<"s_orn1_saveexec_b32">; + def S_ANDN1_WREXEC_B32 : SOP1_32<"s_andn1_wrexec_b32">; + def S_ANDN2_WREXEC_B32 : SOP1_32<"s_andn2_wrexec_b32">; + } // End hasSideEffects = 1, Defs = [EXEC, SCC], Uses = [EXEC] + let Uses = [M0] in { def S_MOVRELSD_2_B32 : SOP1_32<"s_movrelsd_2_b32">; } // End Uses = [M0] @@ -782,6 +797,9 @@ let SubtargetPredicate = isGFX10Plus in { let has_sdst = 0; } + def S_SUBVECTOR_LOOP_BEGIN : SOPK_32_BR<"s_subvector_loop_begin">; + def S_SUBVECTOR_LOOP_END : SOPK_32_BR<"s_subvector_loop_end">; + def S_WAITCNT_VSCNT : SOPK_WAITCNT<"s_waitcnt_vscnt">; def S_WAITCNT_VMCNT : SOPK_WAITCNT<"s_waitcnt_vmcnt">; def S_WAITCNT_EXPCNT : SOPK_WAITCNT<"s_waitcnt_expcnt">; @@ -1215,6 +1233,18 @@ defm S_ORN1_SAVEEXEC_B64 : SOP1_Real_gfx10<0x038>; defm S_ANDN1_WREXEC_B64 : SOP1_Real_gfx10<0x039>; defm S_ANDN2_WREXEC_B64 : SOP1_Real_gfx10<0x03a>; defm S_BITREPLICATE_B64_B32 : SOP1_Real_gfx10<0x03b>; +defm S_AND_SAVEEXEC_B32 : SOP1_Real_gfx10<0x03c>; +defm S_OR_SAVEEXEC_B32 : SOP1_Real_gfx10<0x03d>; +defm S_XOR_SAVEEXEC_B32 : SOP1_Real_gfx10<0x03e>; +defm S_ANDN2_SAVEEXEC_B32 : SOP1_Real_gfx10<0x03f>; +defm S_ORN2_SAVEEXEC_B32 : SOP1_Real_gfx10<0x040>; +defm S_NAND_SAVEEXEC_B32 : SOP1_Real_gfx10<0x041>; +defm S_NOR_SAVEEXEC_B32 : SOP1_Real_gfx10<0x042>; +defm S_XNOR_SAVEEXEC_B32 : SOP1_Real_gfx10<0x043>; +defm S_ANDN1_SAVEEXEC_B32 : SOP1_Real_gfx10<0x044>; +defm S_ORN1_SAVEEXEC_B32 : SOP1_Real_gfx10<0x045>; +defm S_ANDN1_WREXEC_B32 : SOP1_Real_gfx10<0x046>; +defm S_ANDN2_WREXEC_B32 : SOP1_Real_gfx10<0x047>; defm S_MOVRELSD_2_B32 : SOP1_Real_gfx10<0x049>; //===----------------------------------------------------------------------===// @@ -1382,6 +1412,8 @@ defm S_WAITCNT_VSCNT : SOPK_Real32_gfx10<0x017>; defm S_WAITCNT_VMCNT : SOPK_Real32_gfx10<0x018>; defm S_WAITCNT_EXPCNT : SOPK_Real32_gfx10<0x019>; defm S_WAITCNT_LGKMCNT : SOPK_Real32_gfx10<0x01a>; +defm S_SUBVECTOR_LOOP_BEGIN : SOPK_Real32_gfx10<0x01b>; +defm S_SUBVECTOR_LOOP_END : SOPK_Real32_gfx10<0x01c>; //===----------------------------------------------------------------------===// // SOPK - GFX6, GFX7. diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index fa2c79857311..2db372f201ff 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -380,12 +380,17 @@ unsigned getNumSGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs) { return NumSGPRs / getSGPREncodingGranule(STI) - 1; } -unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI) { - return 4; +unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI, + Optional EnableWavefrontSize32) { + bool IsWave32 = EnableWavefrontSize32 ? + *EnableWavefrontSize32 : + STI->getFeatureBits().test(FeatureWavefrontSize32); + return IsWave32 ? 8 : 4; } -unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI) { - return getVGPRAllocGranule(STI); +unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI, + Optional EnableWavefrontSize32) { + return getVGPRAllocGranule(STI, EnableWavefrontSize32); } unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) { @@ -416,10 +421,12 @@ unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) { return std::min(MaxNumVGPRs, AddressableNumVGPRs); } -unsigned getNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumVGPRs) { - NumVGPRs = alignTo(std::max(1u, NumVGPRs), getVGPREncodingGranule(STI)); +unsigned getNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumVGPRs, + Optional EnableWavefrontSize32) { + NumVGPRs = alignTo(std::max(1u, NumVGPRs), + getVGPREncodingGranule(STI, EnableWavefrontSize32)); // VGPRBlocks is actual number of VGPR blocks minus 1. - return NumVGPRs / getVGPREncodingGranule(STI) - 1; + return NumVGPRs / getVGPREncodingGranule(STI, EnableWavefrontSize32) - 1; } } // end namespace IsaInfo @@ -437,7 +444,6 @@ void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, Header.amd_machine_version_minor = Version.Minor; Header.amd_machine_version_stepping = Version.Stepping; Header.kernel_code_entry_byte_offset = sizeof(Header); - // wavefront_size is specified as a power of 2: 2^6 = 64 threads. Header.wavefront_size = 6; // If the code object does not support indirect functions, then the value must diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index def279939bb6..b56dad808f43 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -150,10 +150,18 @@ unsigned getNumExtraSGPRs(const MCSubtargetInfo *STI, bool VCCUsed, unsigned getNumSGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs); /// \returns VGPR allocation granularity for given subtarget \p STI. -unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI); +/// +/// For subtargets which support it, \p EnableWavefrontSize32 should match +/// the ENABLE_WAVEFRONT_SIZE32 kernel descriptor field. +unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI, + Optional EnableWavefrontSize32 = None); /// \returns VGPR encoding granularity for given subtarget \p STI. -unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI); +/// +/// For subtargets which support it, \p EnableWavefrontSize32 should match +/// the ENABLE_WAVEFRONT_SIZE32 kernel descriptor field. +unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI, + Optional EnableWavefrontSize32 = None); /// \returns Total number of VGPRs for given subtarget \p STI. unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI); @@ -171,7 +179,11 @@ unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU); /// \returns Number of VGPR blocks needed for given subtarget \p STI when /// \p NumVGPRs are used. -unsigned getNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs); +/// +/// For subtargets which support it, \p EnableWavefrontSize32 should match the +/// ENABLE_WAVEFRONT_SIZE32 kernel descriptor field. +unsigned getNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs, + Optional EnableWavefrontSize32 = None); } // end namespace IsaInfo diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 7b28e58eeaf7..53fd5a1c2221 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -199,7 +199,12 @@ class VOP2bInstAlias { + let WaveSizePredicate = isWave32 in { + def : VOP2bInstAlias; + } + let WaveSizePredicate = isWave64 in { def : VOP2bInstAlias; + } } multiclass VOP2eInst : } multiclass VOP2eInstAliases { + let WaveSizePredicate = isWave32 in { + def : VOP2eInstAlias; + } + let WaveSizePredicate = isWave64 in { def : VOP2eInstAlias; + } } class VOP_MADAK : VOPProfile <[vt, vt, vt, vt]> { @@ -953,6 +963,30 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { let DecoderNamespace = "DPP8"; } + let WaveSizePredicate = isWave32 in { + def _sdwa_w32_gfx10 : + Base_VOP_SDWA10_Real(opName#"_sdwa")>, + VOP2_SDWA9Ae(opName#"_sdwa").Pfl> { + VOP2_SDWA_Pseudo Ps = !cast(opName#"_sdwa"); + let AsmString = asmName # !subst("vcc", "vcc_lo", Ps.AsmOperands); + let isAsmParserOnly = 1; + let DecoderNamespace = "SDWA10"; + } + def _dpp_w32_gfx10 : + VOP2_DPP16(opName#"_e32"), asmName> { + string AsmDPP = !cast(opName#"_e32").Pfl.AsmDPP16; + let AsmString = asmName # !subst("vcc", "vcc_lo", AsmDPP); + let isAsmParserOnly = 1; + } + def _dpp8_w32_gfx10 : + VOP2_DPP8(opName#"_e32"), asmName> { + string AsmDPP8 = !cast(opName#"_e32").Pfl.AsmDPP8; + let AsmString = asmName # !subst("vcc", "vcc_lo", AsmDPP8); + let isAsmParserOnly = 1; + } + } // End WaveSizePredicate = isWave32 + + let WaveSizePredicate = isWave64 in { def _sdwa_w64_gfx10 : Base_VOP_SDWA10_Real(opName#"_sdwa")>, VOP2_SDWA9Ae(opName#"_sdwa").Pfl> { @@ -973,6 +1007,7 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { let AsmString = asmName # AsmDPP8; let isAsmParserOnly = 1; } + } // End WaveSizePredicate = isWave64 } //===----------------------------- VOP3Only -----------------------------===// diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td index 7d60c587c92b..fb4370af0245 100644 --- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -165,9 +165,16 @@ class VOPCInstAlias { def : VOPCInstAlias (OpName#"_e64"), !cast(OpName#"_e32_"#Arch)>; + let WaveSizePredicate = isWave32 in { + def : VOPCInstAlias (OpName#"_e64"), + !cast(OpName#"_e32_"#Arch), + "vcc_lo, "#!cast(OpName#"_e64").Pfl.Asm32>; + } + let WaveSizePredicate = isWave64 in { def : VOPCInstAlias (OpName#"_e64"), !cast(OpName#"_e32_"#Arch), "vcc, "#!cast(OpName#"_e64").Pfl.Asm32>; + } } multiclass VOPCXInstAliases { @@ -740,10 +747,17 @@ defm V_CMPX_CLASS_F16 : VOPCX_CLASS_F16 <"v_cmpx_class_f16">; // We need to use COPY_TO_REGCLASS to w/a the problem when ReplaceAllUsesWith() // complaints it cannot replace i1 <-> i64/i32 if node was not morphed in place. multiclass ICMP_Pattern { + let WaveSizePredicate = isWave64 in def : GCNPat < (i64 (AMDGPUsetcc vt:$src0, vt:$src1, cond)), (i64 (COPY_TO_REGCLASS (inst $src0, $src1), SReg_64)) >; + + let WaveSizePredicate = isWave32 in + def : GCNPat < + (i32 (AMDGPUsetcc vt:$src0, vt:$src1, cond)), + (i32 (COPY_TO_REGCLASS (inst $src0, $src1), SReg_32)) + >; } defm : ICMP_Pattern ; @@ -780,12 +794,21 @@ defm : ICMP_Pattern ; defm : ICMP_Pattern ; multiclass FCMP_Pattern { + let WaveSizePredicate = isWave64 in def : GCNPat < (i64 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)), (vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)), (i64 (COPY_TO_REGCLASS (inst $src0_modifiers, $src0, $src1_modifiers, $src1, DSTCLAMP.NONE), SReg_64)) >; + + let WaveSizePredicate = isWave32 in + def : GCNPat < + (i32 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)), + (vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)), + (i32 (COPY_TO_REGCLASS (inst $src0_modifiers, $src0, $src1_modifiers, $src1, + DSTCLAMP.NONE), SReg_32)) + >; } defm : FCMP_Pattern ;