diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 4d55a663d483..bf493c9fd38f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -253,7 +253,7 @@ SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const { static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) { switch (NumVectorElts) { case 1: - return AMDGPU::SReg_32RegClassID; + return AMDGPU::SReg_32_XM0RegClassID; case 2: return AMDGPU::SReg_64RegClassID; case 4: diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 6deee51a064b..04edc91f0a7f 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -59,7 +59,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass); addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); - addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass); + addRegisterClass(MVT::i32, &AMDGPU::SReg_32_XM0RegClass); addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass); addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass); @@ -79,8 +79,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); if (Subtarget->has16BitInsts()) { - addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass); - addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass); + addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass); + addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass); } computeRegisterProperties(STI.getRegisterInfo()); @@ -941,25 +941,25 @@ SDValue SITargetLowering::LowerFormalArguments( // Start adding system SGPRs. if (Info->hasWorkGroupIDX()) { unsigned Reg = Info->addWorkGroupIDX(); - MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass); CCInfo.AllocateReg(Reg); } if (Info->hasWorkGroupIDY()) { unsigned Reg = Info->addWorkGroupIDY(); - MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass); CCInfo.AllocateReg(Reg); } if (Info->hasWorkGroupIDZ()) { unsigned Reg = Info->addWorkGroupIDZ(); - MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass); CCInfo.AllocateReg(Reg); } if (Info->hasWorkGroupInfo()) { unsigned Reg = Info->addWorkGroupInfo(); - MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass); CCInfo.AllocateReg(Reg); } @@ -2414,15 +2414,15 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SI::KernelInputOffsets::LOCAL_SIZE_Z); case Intrinsic::amdgcn_workgroup_id_x: case Intrinsic::r600_read_tgid_x: - return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, + return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass, TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT); case Intrinsic::amdgcn_workgroup_id_y: case Intrinsic::r600_read_tgid_y: - return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, + return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass, TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT); case Intrinsic::amdgcn_workgroup_id_z: case Intrinsic::r600_read_tgid_z: - return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, + return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass, TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT); case Intrinsic::amdgcn_workitem_id_x: case Intrinsic::r600_read_tidig_x: @@ -4182,7 +4182,7 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, default: return std::make_pair(0U, nullptr); case 32: - return std::make_pair(0U, &AMDGPU::SReg_32RegClass); + return std::make_pair(0U, &AMDGPU::SReg_32_XM0RegClass); case 64: return std::make_pair(0U, &AMDGPU::SGPR_64RegClass); case 128: diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp index a9e693917bfa..da4db63ab333 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp @@ -532,6 +532,7 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { TRI = &TII->getRegisterInfo(); MRI = &MF.getRegInfo(); IV = getIsaVersion(ST->getFeatureBits()); + const SIMachineFunctionInfo *MFI = MF.getInfo(); HardwareLimits.Named.VM = getVmcntBitMask(IV); HardwareLimits.Named.EXP = getExpcntBitMask(IV); @@ -543,20 +544,27 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { LastOpcodeType = OTHER; LastInstWritesM0 = false; IsFlatOutstanding = false; - ReturnsVoid = MF.getInfo()->returnsVoid(); + ReturnsVoid = MFI->returnsVoid(); memset(&UsedRegs, 0, sizeof(UsedRegs)); memset(&DefinedRegs, 0, sizeof(DefinedRegs)); SmallVector RemoveMI; + SmallVector EndPgmBlocks; + + bool HaveScalarStores = false; for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) { MachineBasicBlock &MBB = *BI; + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ++I) { + if (!HaveScalarStores && TII->isScalarStore(*I)) + HaveScalarStores = true; + if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS) { // There is a hardware bug on CI/SI where SMRD instruction may corrupt // vccz bit, so when we detect that an instruction may read from a @@ -625,12 +633,45 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { pushInstruction(MBB, I, Increment); handleSendMsg(MBB, I); + + if (I->getOpcode() == AMDGPU::S_ENDPGM || + I->getOpcode() == AMDGPU::SI_RETURN) + EndPgmBlocks.push_back(&MBB); } // Wait for everything at the end of the MBB Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued); } + if (HaveScalarStores) { + // If scalar writes are used, the cache must be flushed or else the next + // wave to reuse the same scratch memory can be clobbered. + // + // Insert s_dcache_wb at wave termination points if there were any scalar + // stores, and only if the cache hasn't already been flushed. This could be + // improved by looking across blocks for flushes in postdominating blocks + // from the stores but an explicitly requested flush is probably very rare. + for (MachineBasicBlock *MBB : EndPgmBlocks) { + bool SeenDCacheWB = false; + + for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); + I != E; ++I) { + + if (I->getOpcode() == AMDGPU::S_DCACHE_WB) + SeenDCacheWB = true; + else if (TII->isScalarStore(*I)) + SeenDCacheWB = false; + + // FIXME: It would be better to insert this before a waitcnt if any. + if ((I->getOpcode() == AMDGPU::S_ENDPGM || + I->getOpcode() == AMDGPU::SI_RETURN) && !SeenDCacheWB) { + Changes = true; + BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB)); + } + } + } + } + for (MachineInstr *I : RemoveMI) I->eraseFromParent(); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 63ce25835816..85eca558f25f 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -364,7 +364,8 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } - if (RC == &AMDGPU::SReg_32RegClass) { + if (RC == &AMDGPU::SReg_32_XM0RegClass || + RC == &AMDGPU::SReg_32RegClass) { if (SrcReg == AMDGPU::SCC) { BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg) .addImm(-1) @@ -544,7 +545,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass); } - BuildMI(MBB, MI, DL, OpDesc) + MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc) .addReg(SrcReg, getKillRegState(isKill)) // data .addFrameIndex(FrameIndex) // addr .addMemOperand(MMO) @@ -554,6 +555,11 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, // needing them, and need to ensure that the reserved registers are // correctly handled. + if (ST.hasScalarStores()) { + // m0 is used for offset to scalar stores if used to spill. + Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine); + } + return; } @@ -643,12 +649,17 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass); } - BuildMI(MBB, MI, DL, OpDesc, DestReg) + MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg) .addFrameIndex(FrameIndex) // addr .addMemOperand(MMO) .addReg(MFI->getScratchRSrcReg(), RegState::Implicit) .addReg(MFI->getScratchWaveOffsetReg(), RegState::Implicit); + if (ST.hasScalarStores()) { + // m0 is used for offset to scalar stores if used to spill. + Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine); + } + return; } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 54fcbb507c82..1d933da47c21 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -24,6 +24,12 @@ using namespace llvm; +static cl::opt EnableSpillSGPRToSMEM( + "amdgpu-spill-sgpr-to-smem", + cl::desc("Use scalar stores to spill SGPRs if supported by subtarget"), + cl::init(false)); + + static bool hasPressureSet(const int *PSets, unsigned PSetID) { for (unsigned i = 0; PSets[i] != -1; ++i) { if (PSets[i] == (int)PSetID) @@ -237,7 +243,7 @@ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, MachineRegisterInfo &MRI = MF->getRegInfo(); unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) .addImm(Offset); @@ -401,28 +407,36 @@ static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII, void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, unsigned LoadStoreOp, - const MachineOperand *SrcDst, + int Index, + unsigned ValueReg, + bool IsKill, unsigned ScratchRsrcReg, - unsigned ScratchOffset, - int64_t Offset, + unsigned ScratchOffsetReg, + int64_t InstOffset, + MachineMemOperand *MMO, RegScavenger *RS) const { - unsigned Value = SrcDst->getReg(); - bool IsKill = SrcDst->isKill(); MachineBasicBlock *MBB = MI->getParent(); MachineFunction *MF = MI->getParent()->getParent(); const SISubtarget &ST = MF->getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); + const MachineFrameInfo &MFI = MF->getFrameInfo(); - DebugLoc DL = MI->getDebugLoc(); - bool IsStore = MI->mayStore(); + const MCInstrDesc &Desc = TII->get(LoadStoreOp); + const DebugLoc &DL = MI->getDebugLoc(); + bool IsStore = Desc.mayStore(); bool RanOutOfSGPRs = false; bool Scavenged = false; - unsigned SOffset = ScratchOffset; - unsigned OriginalImmOffset = Offset; + unsigned SOffset = ScratchOffsetReg; - unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); + const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg); + unsigned NumSubRegs = AMDGPU::getRegBitWidth(RC->getID()) / 32; unsigned Size = NumSubRegs * 4; + int64_t Offset = InstOffset + MFI.getObjectOffset(Index); + const int64_t OriginalImmOffset = Offset; + + unsigned Align = MFI.getObjectAlignment(Index); + const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo(); if (!isUInt<12>(Offset + Size)) { SOffset = AMDGPU::NoRegister; @@ -441,19 +455,23 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, // subtract the offset after the spill to return ScratchOffset to it's // original value. RanOutOfSGPRs = true; - SOffset = ScratchOffset; + SOffset = ScratchOffsetReg; } else { Scavenged = true; } + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset) - .addReg(ScratchOffset) - .addImm(Offset); + .addReg(ScratchOffsetReg) + .addImm(Offset); + Offset = 0; } - for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += 4) { + const unsigned EltSize = 4; + + for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += EltSize) { unsigned SubReg = NumSubRegs == 1 ? - Value : getSubReg(Value, getSubRegFromChannel(i)); + ValueReg : getSubReg(ValueReg, getSubRegFromChannel(i)); unsigned SOffsetRegState = 0; unsigned SrcDstRegState = getDefRegState(!IsStore); @@ -463,40 +481,65 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, SrcDstRegState |= getKillRegState(IsKill); } - BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) - .addReg(SubReg, getDefRegState(!IsStore)) + MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(EltSize * i); + MachineMemOperand *NewMMO + = MF->getMachineMemOperand(PInfo, MMO->getFlags(), + EltSize, MinAlign(Align, EltSize * i)); + + auto MIB = BuildMI(*MBB, MI, DL, Desc) + .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill)) .addReg(ScratchRsrcReg) .addReg(SOffset, SOffsetRegState) .addImm(Offset) .addImm(0) // glc .addImm(0) // slc .addImm(0) // tfe - .addReg(Value, RegState::Implicit | SrcDstRegState) - .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + .addMemOperand(NewMMO); + + if (NumSubRegs > 1) + MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState); } + if (RanOutOfSGPRs) { // Subtract the offset we added to the ScratchOffset register. - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffset) - .addReg(ScratchOffset) - .addImm(OriginalImmOffset); + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffsetReg) + .addReg(ScratchOffsetReg) + .addImm(OriginalImmOffset); } } void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index, RegScavenger *RS) const { - MachineFunction *MF = MI->getParent()->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); MachineBasicBlock *MBB = MI->getParent(); - SIMachineFunctionInfo *MFI = MF->getInfo(); - MachineFrameInfo &FrameInfo = MF->getFrameInfo(); + MachineFunction *MF = MBB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); const SISubtarget &ST = MF->getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); - const DebugLoc &DL = MI->getDebugLoc(); unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); unsigned SuperReg = MI->getOperand(0).getReg(); bool IsKill = MI->getOperand(0).isKill(); + const DebugLoc &DL = MI->getDebugLoc(); + + SIMachineFunctionInfo *MFI = MF->getInfo(); + MachineFrameInfo &FrameInfo = MF->getFrameInfo(); + + bool SpillToSMEM = ST.hasScalarStores() && EnableSpillSGPRToSMEM; + + assert(SuperReg != AMDGPU::M0 && "m0 should never spill"); + + const unsigned EltSize = 4; + unsigned OffsetReg = AMDGPU::M0; + unsigned M0CopyReg = AMDGPU::NoRegister; + + if (SpillToSMEM) { + if (RS->isRegUsed(AMDGPU::M0)) { + M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg) + .addReg(AMDGPU::M0); + } + } // SubReg carries the "Kill" flag when SubReg == SuperReg. unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill); @@ -504,21 +547,43 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, unsigned SubReg = NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, getSubRegFromChannel(i)); + if (SpillToSMEM) { + int64_t FrOffset = FrameInfo.getObjectOffset(Index); + unsigned Align = FrameInfo.getObjectAlignment(Index); + MachinePointerInfo PtrInfo + = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i); + MachineMemOperand *MMO + = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, + EltSize, MinAlign(Align, EltSize * i)); + + // Add i * 4 wave offset. + // + // SMEM instructions only support a single offset, so increment the wave + // offset. + + int64_t Offset = ST.getWavefrontSize() * (FrOffset + 4 * i); + if (Offset != 0) { + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg) + .addReg(MFI->getScratchWaveOffsetReg()) + .addImm(Offset); + } else { + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) + .addReg(MFI->getScratchWaveOffsetReg()); + } + + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BUFFER_STORE_DWORD_SGPR)) + .addReg(SubReg, getKillRegState(IsKill)) // sdata + .addReg(MFI->getScratchRSrcReg()) // sbase + .addReg(OffsetReg, RegState::Kill) // soff + .addImm(0) // glc + .addMemOperand(MMO); + + continue; + } + struct SIMachineFunctionInfo::SpilledReg Spill = MFI->getSpilledReg(MF, Index, i); if (Spill.hasReg()) { - if (SuperReg == AMDGPU::M0) { - assert(NumSubRegs == 1); - unsigned CopyM0 - = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), CopyM0) - .addReg(SuperReg, getKillRegState(IsKill)); - - // The real spill now kills the temp copy. - SubReg = SuperReg = CopyM0; - IsKill = true; - } - BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32), Spill.VGPR) @@ -530,10 +595,9 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, // it are fixed. } else { // Spill SGPR to a frame index. - // FIXME we should use S_STORE_DWORD here for VI. - // TODO: Should VI try to spill to VGPR and then spill to SMEM? unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + // TODO: Should VI try to spill to VGPR and then spill to SMEM? MachineInstrBuilder Mov = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) @@ -550,13 +614,12 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, Mov.addReg(SuperReg, RegState::Implicit | SuperKillState); } - unsigned Size = FrameInfo.getObjectSize(Index); unsigned Align = FrameInfo.getObjectAlignment(Index); MachinePointerInfo PtrInfo - = MachinePointerInfo::getFixedStack(*MF, Index); + = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i); MachineMemOperand *MMO = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, - Size, Align); + EltSize, MinAlign(Align, EltSize * i)); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE)) .addReg(TmpReg, RegState::Kill) // src .addFrameIndex(Index) // vaddr @@ -567,6 +630,11 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, } } + if (M0CopyReg != AMDGPU::NoRegister) { + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::M0) + .addReg(M0CopyReg, RegState::Kill); + } + MI->eraseFromParent(); MFI->addToSpilledSGPRs(NumSubRegs); } @@ -585,42 +653,86 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); unsigned SuperReg = MI->getOperand(0).getReg(); + bool SpillToSMEM = ST.hasScalarStores() && EnableSpillSGPRToSMEM; - // m0 is not allowed as with readlane/writelane, so a temporary SGPR and - // extra copy is needed. - bool IsM0 = (SuperReg == AMDGPU::M0); - if (IsM0) { - assert(NumSubRegs == 1); - SuperReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + assert(SuperReg != AMDGPU::M0 && "m0 should never spill"); + + unsigned OffsetReg = AMDGPU::M0; + unsigned M0CopyReg = AMDGPU::NoRegister; + + if (SpillToSMEM) { + if (RS->isRegUsed(AMDGPU::M0)) { + M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg) + .addReg(AMDGPU::M0); + } } + // SubReg carries the "Kill" flag when SubReg == SuperReg. + int64_t FrOffset = FrameInfo.getObjectOffset(Index); + + const unsigned EltSize = 4; + for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { unsigned SubReg = NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, getSubRegFromChannel(i)); + if (SpillToSMEM) { + unsigned Align = FrameInfo.getObjectAlignment(Index); + MachinePointerInfo PtrInfo + = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i); + MachineMemOperand *MMO + = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad, + EltSize, MinAlign(Align, EltSize * i)); + + // Add i * 4 offset + int64_t Offset = ST.getWavefrontSize() * (FrOffset + 4 * i); + if (Offset != 0) { + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg) + .addReg(MFI->getScratchWaveOffsetReg()) + .addImm(Offset); + } else { + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) + .addReg(MFI->getScratchWaveOffsetReg()); + } + + auto MIB = + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BUFFER_LOAD_DWORD_SGPR), SubReg) + .addReg(MFI->getScratchRSrcReg()) // sbase + .addReg(OffsetReg, RegState::Kill) // soff + .addImm(0) // glc + .addMemOperand(MMO); + + if (NumSubRegs > 1) + MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine); + + continue; + } + SIMachineFunctionInfo::SpilledReg Spill = MFI->getSpilledReg(MF, Index, i); if (Spill.hasReg()) { - BuildMI(*MBB, MI, DL, - TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), - SubReg) + auto MIB = + BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), + SubReg) .addReg(Spill.VGPR) - .addImm(Spill.Lane) - .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine); + .addImm(Spill.Lane); + + if (NumSubRegs > 1) + MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine); } else { // Restore SGPR from a stack slot. // FIXME: We should use S_LOAD_DWORD here for VI. - unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); unsigned Align = FrameInfo.getObjectAlignment(Index); - unsigned Size = FrameInfo.getObjectSize(Index); MachinePointerInfo PtrInfo - = MachinePointerInfo::getFixedStack(*MF, Index); + = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i); - MachineMemOperand *MMO = MF->getMachineMemOperand( - PtrInfo, MachineMemOperand::MOLoad, Size, Align); + MachineMemOperand *MMO = MF->getMachineMemOperand(PtrInfo, + MachineMemOperand::MOLoad, EltSize, + MinAlign(Align, EltSize * i)); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg) .addFrameIndex(Index) // vaddr @@ -628,16 +740,19 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, .addReg(MFI->getScratchWaveOffsetReg()) // soffset .addImm(i * 4) // offset .addMemOperand(MMO); - BuildMI(*MBB, MI, DL, - TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg) - .addReg(TmpReg, RegState::Kill) - .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine); + + auto MIB = + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg) + .addReg(TmpReg, RegState::Kill); + + if (NumSubRegs > 1) + MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine); } } - if (IsM0 && SuperReg != AMDGPU::M0) { - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) - .addReg(SuperReg); + if (M0CopyReg != AMDGPU::NoRegister) { + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::M0) + .addReg(M0CopyReg, RegState::Kill); } MI->eraseFromParent(); @@ -685,28 +800,38 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, case AMDGPU::SI_SPILL_V128_SAVE: case AMDGPU::SI_SPILL_V96_SAVE: case AMDGPU::SI_SPILL_V64_SAVE: - case AMDGPU::SI_SPILL_V32_SAVE: + case AMDGPU::SI_SPILL_V32_SAVE: { + const MachineOperand *VData = TII->getNamedOperand(*MI, + AMDGPU::OpName::vdata); buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET, - TII->getNamedOperand(*MI, AMDGPU::OpName::vdata), + Index, + VData->getReg(), VData->isKill(), TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(), TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg(), - FrameInfo.getObjectOffset(Index) + - TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), RS); + TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), + *MI->memoperands_begin(), + RS); MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode())); MI->eraseFromParent(); break; + } case AMDGPU::SI_SPILL_V32_RESTORE: case AMDGPU::SI_SPILL_V64_RESTORE: case AMDGPU::SI_SPILL_V96_RESTORE: case AMDGPU::SI_SPILL_V128_RESTORE: case AMDGPU::SI_SPILL_V256_RESTORE: case AMDGPU::SI_SPILL_V512_RESTORE: { + const MachineOperand *VData = TII->getNamedOperand(*MI, + AMDGPU::OpName::vdata); + buildSpillLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET, - TII->getNamedOperand(*MI, AMDGPU::OpName::vdata), + Index, + VData->getReg(), VData->isKill(), TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(), TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg(), - FrameInfo.getObjectOffset(Index) + - TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), RS); + TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), + *MI->memoperands_begin(), + RS); MI->eraseFromParent(); break; } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index ed960c97f7bb..bd83ef1b403d 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -253,9 +253,14 @@ public: private: void buildSpillLoadStore(MachineBasicBlock::iterator MI, - unsigned LoadStoreOp, const MachineOperand *SrcDst, - unsigned ScratchRsrcReg, unsigned ScratchOffset, - int64_t Offset, + unsigned LoadStoreOp, + int Index, + unsigned ValueReg, + bool ValueIsKill, + unsigned ScratchRsrcReg, + unsigned ScratchOffsetReg, + int64_t InstrOffset, + MachineMemOperand *MMO, RegScavenger *RS) const; }; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index d1907d16aba0..5bdd8bec54ea 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -120,6 +120,11 @@ def SCC_CLASS : RegisterClass<"AMDGPU", [i1], 1, (add SCC)> { let isAllocatable = 0; } +def M0_CLASS : RegisterClass<"AMDGPU", [i32], 32, (add M0)> { + let CopyCost = 1; + let isAllocatable = 0; +} + // TODO: Do we need to set DwarfRegAlias on register tuples? // SGPR 32-bit registers @@ -259,8 +264,9 @@ def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32, // Register class for all scalar registers (SGPRs + Special Registers) def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32, - (add SReg_32_XM0, M0, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI)> { + (add SReg_32_XM0, M0_CLASS)> { let AllocationPriority = 1; + let isAllocatable = 0; } def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add SGPR_64Regs)> { diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 21518389b8b9..76b0b4573fc5 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -437,7 +437,7 @@ bool SIWholeQuadMode::requiresCorrectState(const MachineInstr &MI) const { MachineBasicBlock::iterator SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before) { - unsigned SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); + unsigned SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); MachineInstr *Save = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg) diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll index aba0b63a254f..617204fdf332 100644 --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll @@ -1,16 +1,20 @@ -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -check-prefix=TOSGPR -check-prefix=ALL %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs < %s | FileCheck -check-prefix=TOSMEM -check-prefix=ALL %s -; CHECK-LABEL: {{^}}max_14_sgprs: +; If spilling to smem, additional registers are used for the resource +; descriptor. + +; ALL-LABEL: {{^}}max_14_sgprs: ; FIXME: Should be ablo to skip this copying of the private segment ; buffer because all the SGPR spills are to VGPRs. -; CHECK: s_mov_b64 s[6:7], s[2:3] -; CHECK: s_mov_b64 s[4:5], s[0:1] - -; CHECK: SGPRBlocks: 1 -; CHECK: NumSGPRsForWavesPerEU: 14 +; ALL: s_mov_b64 s[6:7], s[2:3] +; ALL: s_mov_b64 s[4:5], s[0:1] +; ALL: SGPRBlocks: 1 +; ALL: NumSGPRsForWavesPerEU: 14 define void @max_14_sgprs(i32 addrspace(1)* %out1, + i32 addrspace(1)* %out2, i32 addrspace(1)* %out3, i32 addrspace(1)* %out4, @@ -31,7 +35,7 @@ define void @max_14_sgprs(i32 addrspace(1)* %out1, ; --------------------- ; total: 14 -; + reserved vcc, flat_scratch = 18 +; + reserved vcc, xnack, flat_scratch = 20 ; Because we can't handle re-using the last few input registers as the ; special vcc etc. registers (as well as decide to not use the unused @@ -40,14 +44,14 @@ define void @max_14_sgprs(i32 addrspace(1)* %out1, ; ALL-LABEL: {{^}}max_12_sgprs_14_input_sgprs: ; TOSGPR: SGPRBlocks: 2 -; TOSGPR: NumSGPRsForWavesPerEU: 18 +; TOSGPR: NumSGPRsForWavesPerEU: 20 ; TOSMEM: s_mov_b64 s[6:7], s[2:3] -; TOSMEM: s_mov_b32 s9, s13 ; TOSMEM: s_mov_b64 s[4:5], s[0:1] +; TOSMEM: s_mov_b32 s3, s13 ; TOSMEM: SGPRBlocks: 2 -; TOSMEM: NumSGPRsForWavesPerEU: 18 +; TOSMEM: NumSGPRsForWavesPerEU: 20 define void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1, i32 addrspace(1)* %out2, i32 addrspace(1)* %out3, @@ -79,12 +83,12 @@ define void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1, ; ; swapping the order the registers are copied from what normally ; ; happens. -; TOSMEM: s_mov_b64 s[6:7], s[2:3] -; TOSMEM: s_mov_b64 s[4:5], s[0:1] -; TOSMEM: s_mov_b32 s3, s11 +; TOSMEM: s_mov_b32 s5, s11 +; TOSMEM: s_add_u32 m0, s5, +; TOSMEM: s_buffer_store_dword vcc_lo, s[0:3], m0 -; ALL: SGPRBlocks: 1 -; ALL: NumSGPRsForWavesPerEU: 16 +; ALL: SGPRBlocks: 2 +; ALL: NumSGPRsForWavesPerEU: 18 define void @max_12_sgprs_12_input_sgprs(i32 addrspace(1)* %out1, i32 addrspace(1)* %out2, i32 addrspace(1)* %out3, diff --git a/llvm/test/CodeGen/AMDGPU/basic-branch.ll b/llvm/test/CodeGen/AMDGPU/basic-branch.ll index 7bc4d735feb2..83313ed5327c 100644 --- a/llvm/test/CodeGen/AMDGPU/basic-branch.ll +++ b/llvm/test/CodeGen/AMDGPU/basic-branch.ll @@ -1,5 +1,5 @@ ; RUN: llc -O0 -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCNNOOPT -check-prefix=GCN %s -; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCNNOOPT -check-prefix=GCN %s +; RUN: llc -O0 -march=amdgcn -mcpu=tonga -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCNNOOPT -check-prefix=GCN %s ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCNOPT -check-prefix=GCN %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCNOPT -check-prefix=GCN %s diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll index 226def93f17f..13383cbc1741 100644 --- a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll @@ -26,9 +26,9 @@ ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]] -; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 ; 8-byte Folded Spill +; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 ; 4-byte Folded Spill ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]] -; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:4 ; 8-byte Folded Spill +; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:4 ; 4-byte Folded Spill ; Spill load ; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill @@ -55,11 +55,11 @@ -; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 ; 8-byte Folded Reload +; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 ; 4-byte Folded Reload ; VMEM: s_waitcnt vmcnt(0) ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]] -; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:4 ; 8-byte Folded Reload +; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload ; VMEM: s_waitcnt vmcnt(0) ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]] @@ -108,9 +108,9 @@ endif: ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]] -; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:16 ; 8-byte Folded Spill +; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:16 ; 4-byte Folded Spill ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]] -; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:20 ; 8-byte Folded Spill +; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:20 ; 4-byte Folded Spill ; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}} @@ -133,11 +133,11 @@ endif: ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]] ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]] -; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:16 ; 8-byte Folded Reload +; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:16 ; 4-byte Folded Reload ; VMEM: s_waitcnt vmcnt(0) ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]] -; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:20 ; 8-byte Folded Reload +; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:20 ; 4-byte Folded Reload ; VMEM: s_waitcnt vmcnt(0) ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]] @@ -187,9 +187,9 @@ end: ; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]] ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]] -; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:[[SAVEEXEC_LO_OFFSET:[0-9]+]] ; 8-byte Folded Spill +; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:[[SAVEEXEC_LO_OFFSET:[0-9]+]] ; 4-byte Folded Spill ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]] -; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:[[SAVEEXEC_HI_OFFSET:[0-9]+]] ; 8-byte Folded Spill +; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:[[SAVEEXEC_HI_OFFSET:[0-9]+]] ; 4-byte Folded Spill ; GCN: s_mov_b64 exec, [[CMP0]] ; GCN: s_waitcnt vmcnt(0) expcnt(0) @@ -208,7 +208,7 @@ end: ; VMEM: s_waitcnt vmcnt(0) ; VMEM: v_readfirstlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[FLOW_V_RELOAD_SAVEEXEC_LO]] -; VMEM: buffer_load_dword v[[FLOW_V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:[[SAVEEXEC_HI_OFFSET]] ; 8-byte Folded Reload +; VMEM: buffer_load_dword v[[FLOW_V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:[[SAVEEXEC_HI_OFFSET]] ; 4-byte Folded Reload ; VMEM: s_waitcnt vmcnt(0) ; VMEM: v_readfirstlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[FLOW_V_RELOAD_SAVEEXEC_HI]] @@ -224,9 +224,9 @@ end: ; VMEM: v_mov_b32_e32 v[[FLOW_V_SAVEEXEC_LO:[0-9]+]], s[[FLOW_S_RELOAD_SAVEEXEC_LO]] -; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC_LO]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_LO_OFFSET:[0-9]+]] ; 8-byte Folded Spill +; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC_LO]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_LO_OFFSET:[0-9]+]] ; 4-byte Folded Spill ; VMEM: v_mov_b32_e32 v[[FLOW_V_SAVEEXEC_HI:[0-9]+]], s[[FLOW_S_RELOAD_SAVEEXEC_HI]] -; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC_HI]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_HI_OFFSET:[0-9]+]] ; 8-byte Folded Spill +; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC_HI]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_HI_OFFSET:[0-9]+]] ; 4-byte Folded Spill ; GCN: buffer_store_dword [[FLOW_VAL]], off, s[0:3], s7 offset:[[RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill ; GCN: s_xor_b64 exec, exec, s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO]]:[[FLOW_S_RELOAD_SAVEEXEC_HI]]{{\]}} @@ -255,11 +255,11 @@ end: ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[FLOW_SAVEEXEC_HI_LANE]] -; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_LO_OFFSET]] ; 8-byte Folded Reload +; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_LO_OFFSET]] ; 4-byte Folded Reload ; VMEM: s_waitcnt vmcnt(0) ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]] -; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_HI_OFFSET]] ; 8-byte Folded Reload +; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_HI_OFFSET]] ; 4-byte Folded Reload ; VMEM: s_waitcnt vmcnt(0) ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]] diff --git a/llvm/test/CodeGen/AMDGPU/detect-dead-lanes.mir b/llvm/test/CodeGen/AMDGPU/detect-dead-lanes.mir index 057c663036c3..9d70f67ef491 100644 --- a/llvm/test/CodeGen/AMDGPU/detect-dead-lanes.mir +++ b/llvm/test/CodeGen/AMDGPU/detect-dead-lanes.mir @@ -27,9 +27,9 @@ # CHECK: S_NOP 0, implicit undef %5.sub0 name: test0 registers: - - { id: 0, class: sreg_32 } - - { id: 1, class: sreg_32 } - - { id: 2, class: sreg_32 } + - { id: 0, class: sreg_32_xm0 } + - { id: 1, class: sreg_32_xm0 } + - { id: 2, class: sreg_32_xm0 } - { id: 3, class: sreg_128 } - { id: 4, class: sreg_64 } - { id: 5, class: sreg_64 } @@ -87,13 +87,13 @@ registers: - { id: 0, class: sreg_128 } - { id: 1, class: sreg_128 } - { id: 2, class: sreg_64 } - - { id: 3, class: sreg_32 } + - { id: 3, class: sreg_32_xm0 } - { id: 4, class: sreg_128 } - { id: 5, class: sreg_64 } - - { id: 6, class: sreg_32 } - - { id: 7, class: sreg_32 } + - { id: 6, class: sreg_32_xm0 } + - { id: 7, class: sreg_32_xm0 } - { id: 8, class: sreg_64 } - - { id: 9, class: sreg_32 } + - { id: 9, class: sreg_32_xm0 } - { id: 10, class: sreg_128 } body: | bb.0: @@ -162,12 +162,12 @@ body: | name: test2 registers: - - { id: 0, class: sreg_32 } - - { id: 1, class: sreg_32 } + - { id: 0, class: sreg_32_xm0 } + - { id: 1, class: sreg_32_xm0 } - { id: 2, class: sreg_64 } - { id: 3, class: sreg_128 } - - { id: 4, class: sreg_32 } - - { id: 5, class: sreg_32 } + - { id: 4, class: sreg_32_xm0 } + - { id: 5, class: sreg_32_xm0 } - { id: 6, class: sreg_64 } - { id: 7, class: sreg_128 } - { id: 8, class: sreg_64 } @@ -260,7 +260,7 @@ body: | name: test5 tracksRegLiveness: true registers: - - { id: 0, class: sreg_32 } + - { id: 0, class: sreg_32_xm0 } - { id: 1, class: sreg_64 } body: | bb.0: @@ -286,9 +286,9 @@ body: | name: loop0 tracksRegLiveness: true registers: - - { id: 0, class: sreg_32 } - - { id: 1, class: sreg_32 } - - { id: 2, class: sreg_32 } + - { id: 0, class: sreg_32_xm0 } + - { id: 1, class: sreg_32_xm0 } + - { id: 2, class: sreg_32_xm0 } - { id: 3, class: sreg_128 } - { id: 4, class: sreg_128 } - { id: 5, class: sreg_128 } @@ -339,10 +339,10 @@ body: | name: loop1 tracksRegLiveness: true registers: - - { id: 0, class: sreg_32 } - - { id: 1, class: sreg_32 } - - { id: 2, class: sreg_32 } - - { id: 3, class: sreg_32 } + - { id: 0, class: sreg_32_xm0 } + - { id: 1, class: sreg_32_xm0 } + - { id: 2, class: sreg_32_xm0 } + - { id: 3, class: sreg_32_xm0 } - { id: 4, class: sreg_128 } - { id: 5, class: sreg_128 } - { id: 6, class: sreg_128 } @@ -390,7 +390,7 @@ body: | name: loop2 tracksRegLiveness: true registers: - - { id: 0, class: sreg_32 } + - { id: 0, class: sreg_32_xm0 } - { id: 1, class: sreg_128 } - { id: 2, class: sreg_128 } - { id: 3, class: sreg_128 } diff --git a/llvm/test/CodeGen/AMDGPU/inline-constraints.ll b/llvm/test/CodeGen/AMDGPU/inline-constraints.ll index 3c0bb75a6074..1bcbd14009ce 100644 --- a/llvm/test/CodeGen/AMDGPU/inline-constraints.ll +++ b/llvm/test/CodeGen/AMDGPU/inline-constraints.ll @@ -22,10 +22,11 @@ entry: ret void } +; FIXME: Should be able to avoid copy ; GCN-LABEL: {{^}}inline_sreg_constraint_m0: ; GCN: s_mov_b32 m0, -1 -; GCN-NOT: s_mov_b32 s{{[0-9]+}}, m0 -; GCN: ; use m0 +; GCN: s_mov_b32 [[COPY_M0:s[0-9]+]], m0 +; GCN: ; use [[COPY_M0]] define void @inline_sreg_constraint_m0() { %m0 = tail call i32 asm sideeffect "s_mov_b32 m0, -1", "={M0}"() tail call void asm sideeffect "; use $0", "s"(i32 %m0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll index 09732ff0f60e..2569108e7b18 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll @@ -22,7 +22,8 @@ define void @test_readfirstlane_imm(i32 addrspace(1)* %out) #1 { ; TODO: m0 should be folded. ; CHECK-LABEL: {{^}}test_readfirstlane_m0: ; CHECK: s_mov_b32 m0, -1 -; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], m0 +; CHECK: s_mov_b32 [[COPY_M0:s[0-9]+]], m0 +; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], [[COPY_M0]] ; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, [[VVAL]] define void @test_readfirstlane_m0(i32 addrspace(1)* %out) #1 { %m0 = call i32 asm "s_mov_b32 m0, -1", "={M0}"() diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll index 923cd725f821..a9d52b006df0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll @@ -22,7 +22,8 @@ define void @test_readlane_imm_sreg(i32 addrspace(1)* %out, i32 %src1) #1 { ; TODO: m0 should be folded. ; CHECK-LABEL: {{^}}test_readlane_m0_sreg: ; CHECK: s_mov_b32 m0, -1 -; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], m0 +; CHECK: s_mov_b32 [[COPY_M0:s[0-9]+]], m0 +; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], [[COPY_M0]] ; CHECK: v_readlane_b32 s{{[0-9]+}}, [[VVAL]], s{{[0-9]+}} define void @test_readlane_m0_sreg(i32 addrspace(1)* %out, i32 %src1) #1 { %m0 = call i32 asm "s_mov_b32 m0, -1", "={M0}"() diff --git a/llvm/test/CodeGen/AMDGPU/read_register.ll b/llvm/test/CodeGen/AMDGPU/read_register.ll index 58a9e34b77f2..601a0adb8122 100644 --- a/llvm/test/CodeGen/AMDGPU/read_register.ll +++ b/llvm/test/CodeGen/AMDGPU/read_register.ll @@ -3,9 +3,11 @@ declare i32 @llvm.read_register.i32(metadata) #0 declare i64 @llvm.read_register.i64(metadata) #0 +; FIXME: Should be able to eliminate copy ; CHECK-LABEL: {{^}}test_read_m0: ; CHECK: s_mov_b32 m0, -1 -; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], m0 +; CHECK: s_mov_b32 [[COPY_M0:s[0-9]+]], m0 +; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], [[COPY_M0]] ; CHECK: buffer_store_dword [[COPY]] define void @test_read_m0(i32 addrspace(1)* %out) #0 { store volatile i32 0, i32 addrspace(3)* undef diff --git a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll index c3d9ee7f13fd..ed397f5009c5 100644 --- a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll +++ b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll @@ -1,14 +1,44 @@ -; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=SGPR %s +; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=SMEM %s ; Make sure this doesn't crash. -; CHECK: {{^}}test: +; ALL-LABEL: {{^}}test: +; ALL: s_mov_b32 s92, SCRATCH_RSRC_DWORD0 +; ALL: s_mov_b32 s91, s3 + ; Make sure we are handling hazards correctly. -; CHECK: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_readfirstlane_b32 s[[HI:[0-9]+]], [[VHI]] -; CHECK-NEXT: s_nop 4 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:[[HI]]{{\]}}, 0 -; CHECK: s_endpgm +; SGPR: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:12 +; SGPR-NEXT: s_waitcnt vmcnt(0) +; SGPR-NEXT: v_readfirstlane_b32 s[[HI:[0-9]+]], [[VHI]] +; SGPR-NEXT: s_nop 4 +; SGPR-NEXT: buffer_store_dword v0, off, s[0:[[HI]]{{\]}}, 0 + + +; Make sure scratch wave offset register is correctly incremented and +; then restored. +; SMEM: s_mov_b32 m0, s91{{$}} +; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Spill +; SMEM: s_add_u32 m0, s91, 0x100{{$}} +; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Spill +; SMEM: s_add_u32 m0, s91, 0x200{{$}} +; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Spill +; SMEM: s_add_u32 m0, s91, 0x300{{$}} +; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Spill + + +; SMEM: s_mov_b32 m0, s91{{$}} +; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Reload +; SMEM: s_add_u32 m0, s91, 0x100{{$}} +; SMEM: s_waitcnt lgkmcnt(0) +; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Reload +; SMEM: s_add_u32 m0, s91, 0x200{{$}} +; SMEM: s_waitcnt lgkmcnt(0) +; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Reload +; SMEM: s_add_u32 m0, s91, 0x300{{$}} +; SMEM: s_waitcnt lgkmcnt(0) +; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Reload + +; ALL: s_endpgm define void @test(i32 addrspace(1)* %out, i32 %in) { call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" () call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" () diff --git a/llvm/test/CodeGen/AMDGPU/spill-m0.ll b/llvm/test/CodeGen/AMDGPU/spill-m0.ll index 74e33d11bedb..ae3b2de30a66 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-m0.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-m0.ll @@ -1,33 +1,47 @@ ; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=1 -march=amdgcn -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOVGPR -check-prefix=GCN %s -; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=1 -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOVGPR -check-prefix=GCN %s +; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=1 -amdgpu-spill-sgpr-to-smem=0 -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOVGPR -check-prefix=GCN %s ; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -march=amdgcn -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOVMEM -check-prefix=GCN %s -; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -march=amdgcn -mattr=+vgpr-spilling -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=TOVMEM -check-prefix=GCN %s +; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -amdgpu-spill-sgpr-to-smem=0 -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOVMEM -check-prefix=GCN %s +; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -amdgpu-spill-sgpr-to-smem=1 -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOSMEM -check-prefix=GCN %s ; XXX - Why does it like to use vcc? ; GCN-LABEL: {{^}}spill_m0: -; TOSMEM: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; TOSMEM: s_mov_b32 s84, SCRATCH_RSRC_DWORD0 -; GCN: s_cmp_lg_u32 +; GCN-DAG: s_cmp_lg_u32 -; TOVGPR: s_mov_b32 vcc_hi, m0 -; TOVGPR: v_writelane_b32 [[SPILL_VREG:v[0-9]+]], vcc_hi, 0 +; TOVGPR-DAG: s_mov_b32 [[M0_COPY:s[0-9]+]], m0 +; TOVGPR: v_writelane_b32 [[SPILL_VREG:v[0-9]+]], [[M0_COPY]], 0 -; TOVMEM: v_mov_b32_e32 [[SPILL_VREG:v[0-9]+]], m0 +; TOVMEM-DAG: s_mov_b32 [[M0_COPY:s[0-9]+]], m0 +; TOVMEM-DAG: v_mov_b32_e32 [[SPILL_VREG:v[0-9]+]], [[M0_COPY]] ; TOVMEM: buffer_store_dword [[SPILL_VREG]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} ; 4-byte Folded Spill ; TOVMEM: s_waitcnt vmcnt(0) + +; TOSMEM-DAG: s_mov_b32 [[M0_COPY:s[0-9]+]], m0 +; TOSMEM: s_mov_b32 m0, s3{{$}} +; TOSMEM-NOT: [[M0_COPY]] +; TOSMEM: s_buffer_store_dword [[M0_COPY]], s[84:87], m0 ; 4-byte Folded Spill +; TOSMEM: s_waitcnt lgkmcnt(0) + ; GCN: s_cbranch_scc1 [[ENDIF:BB[0-9]+_[0-9]+]] ; GCN: [[ENDIF]]: -; TOVGPR: v_readlane_b32 vcc_hi, [[SPILL_VREG]], 0 -; TOVGPR: s_mov_b32 m0, vcc_hi +; TOVGPR: v_readlane_b32 [[M0_RESTORE:s[0-9]+]], [[SPILL_VREG]], 0 +; TOVGPR: s_mov_b32 m0, [[M0_RESTORE]] ; TOVMEM: buffer_load_dword [[RELOAD_VREG:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} ; 4-byte Folded Reload ; TOVMEM: s_waitcnt vmcnt(0) -; TOVMEM: v_readfirstlane_b32 vcc_hi, [[RELOAD_VREG]] -; TOVMEM: s_mov_b32 m0, vcc_hi +; TOVMEM: v_readfirstlane_b32 [[M0_RESTORE:s[0-9]+]], [[RELOAD_VREG]] +; TOVMEM: s_mov_b32 m0, [[M0_RESTORE]] -; GCN: s_add_i32 m0, m0, 1 +; TOSMEM: s_mov_b32 m0, s3{{$}} +; TOSMEM: s_buffer_load_dword [[M0_RESTORE:s[0-9]+]], s[84:87], m0 ; 4-byte Folded Reload +; TOSMEM-NOT: [[M0_RESTORE]] +; TOSMEM: s_mov_b32 m0, [[M0_RESTORE]] + +; GCN: s_add_i32 s{{[0-9]+}}, m0, 1 define void @spill_m0(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %m0 = call i32 asm sideeffect "s_mov_b32 m0, 0", "={M0}"() #0 @@ -47,7 +61,33 @@ endif: @lds = internal addrspace(3) global [64 x float] undef ; GCN-LABEL: {{^}}spill_m0_lds: +; GCN: s_mov_b32 m0, s6 +; GCN: v_interp_mov_f32 + +; TOSMEM: s_mov_b32 vcc_hi, m0 +; TOSMEM: s_mov_b32 m0, s7 +; TOSMEM-NEXT: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill +; TOSMEM: s_mov_b32 m0, vcc_hi + +; TOSMEM: s_mov_b32 vcc_hi, m0 +; TOSMEM: s_add_u32 m0, s7, 0x100 +; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill +; TOSMEM: s_add_u32 m0, s7, 0x200 +; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill +; TOSMEM: s_mov_b32 m0, vcc_hi + +; TOSMEM: s_mov_b64 exec, +; TOSMEM: s_cbranch_execz +; TOSMEM: s_branch + +; TOSMEM: BB{{[0-9]+_[0-9]+}}: +; TOSMEM-NEXT: s_add_u32 m0, s7, 0x100 +; TOSMEM-NEXT: s_buffer_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Reload + + ; GCN-NOT: v_readlane_b32 m0 +; GCN-NOT: s_buffer_store_dword m0 +; GCN-NOT: s_buffer_load_dword m0 define amdgpu_ps void @spill_m0_lds(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) #0 { main_body: %4 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3) @@ -71,6 +111,52 @@ endif: ret void } +; GCN-LABEL: {{^}}restore_m0_lds: +; TOSMEM: s_cmp_eq_u32 +; TOSMEM: s_mov_b32 vcc_hi, m0 +; TOSMEM: s_mov_b32 m0, s3 +; TOSMEM: s_buffer_store_dword s4, s[84:87], m0 ; 4-byte Folded Spill +; TOSMEM: s_mov_b32 m0, vcc_hi +; TOSMEM: s_cbranch_scc1 + +; TOSMEM: s_mov_b32 m0, -1 + +; TOSMEM: s_mov_b32 vcc_hi, m0 +; TOSMEM: s_mov_b32 m0, s3 +; TOSMEM: s_buffer_load_dword s4, s[84:87], m0 ; 4-byte Folded Reload +; TOSMEM: s_add_u32 m0, s3, 0x100 +; TOSMEM: s_waitcnt lgkmcnt(0) +; TOSMEM: s_buffer_load_dword s5, s[84:87], m0 ; 4-byte Folded Reload +; TOSMEM: s_mov_b32 m0, vcc_hi +; TOSMEM: s_waitcnt lgkmcnt(0) + +; TOSMEM: ds_write_b64 + +; TOSMEM: s_mov_b32 vcc_hi, m0 +; TOSMEM: s_add_u32 m0, s3, 0x200 +; TOSMEM: s_buffer_load_dword s0, s[84:87], m0 ; 4-byte Folded Reload +; TOSMEM: s_mov_b32 m0, vcc_hi +; TOSMEM: s_waitcnt lgkmcnt(0) +; TOSMEM: s_mov_b32 m0, s0 +; TOSMEM: ; use m0 + +; TOSMEM: s_dcache_wb +; TOSMEM: s_endpgm +define void @restore_m0_lds(i32 %arg) { + %m0 = call i32 asm sideeffect "s_mov_b32 m0, 0", "={M0}"() #0 + %sval = load volatile i64, i64 addrspace(2)* undef + %cmp = icmp eq i32 %arg, 0 + br i1 %cmp, label %ret, label %bb + +bb: + store volatile i64 %sval, i64 addrspace(3)* undef + call void asm sideeffect "; use $0", "{M0}"(i32 %m0) #0 + br label %ret + +ret: + ret void +} + declare float @llvm.SI.fs.constant(i32, i32, i32) readnone declare i32 @llvm.SI.packf16(float, float) readnone diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll index 52fa0bec61ae..35a014bf7249 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll @@ -20,8 +20,8 @@ ; VI-DAG: s_mov_b32 s15, 0xe80000 ; s11 is offset system SGPR -; GCN: buffer_store_dword {{v[0-9]+}}, off, s[12:15], s11 offset:{{[0-9]+}} ; 16-byte Folded Spill -; GCN: buffer_load_dword v{{[0-9]+}}, off, s[12:15], s11 offset:{{[0-9]+}} ; 16-byte Folded Reload +; GCN: buffer_store_dword {{v[0-9]+}}, off, s[12:15], s11 offset:{{[0-9]+}} ; 4-byte Folded Spill +; GCN: buffer_load_dword v{{[0-9]+}}, off, s[12:15], s11 offset:{{[0-9]+}} ; 4-byte Folded Reload ; GCN: NumVgprs: 256 ; GCN: ScratchSize: 1024 diff --git a/llvm/test/CodeGen/MIR/AMDGPU/scalar-store-cache-flush.mir b/llvm/test/CodeGen/MIR/AMDGPU/scalar-store-cache-flush.mir new file mode 100644 index 000000000000..af71086e542f --- /dev/null +++ b/llvm/test/CodeGen/MIR/AMDGPU/scalar-store-cache-flush.mir @@ -0,0 +1,173 @@ +# RUN: llc -march=amdgcn -run-pass si-insert-waits %s -o - | FileCheck %s + +--- | + define void @basic_insert_dcache_wb() { + ret void + } + + define void @explicit_flush_after() { + ret void + } + + define void @explicit_flush_before() { + ret void + } + + define void @no_scalar_store() { + ret void + } + + define void @multi_block_store() { + bb0: + br i1 undef, label %bb1, label %bb2 + + bb1: + ret void + + bb2: + ret void + } + + define void @one_block_store() { + bb0: + br i1 undef, label %bb1, label %bb2 + + bb1: + ret void + + bb2: + ret void + } + + define amdgpu_ps float @si_return() { + ret float undef + } + +... +--- +# CHECK-LABEL: name: basic_insert_dcache_wb +# CHECK: bb.0: +# CHECK-NEXT: S_STORE_DWORD +# CHECK-NEXT: S_DCACHE_WB +# CHECK-NEXT: S_ENDPGM + +name: basic_insert_dcache_wb +tracksRegLiveness: false + +body: | + bb.0: + S_STORE_DWORD_SGPR undef %sgpr2, undef %sgpr0_sgpr1, undef %m0, 0 + S_ENDPGM +... +--- +# Already has an explicitly requested flush after the last store. +# CHECK-LABEL: name: explicit_flush_after +# CHECK: bb.0: +# CHECK-NEXT: S_STORE_DWORD +# CHECK-NEXT: S_DCACHE_WB +# CHECK-NEXT: S_ENDPGM + +name: explicit_flush_after +tracksRegLiveness: false + +body: | + bb.0: + S_STORE_DWORD_SGPR undef %sgpr2, undef %sgpr0_sgpr1, undef %m0, 0 + S_DCACHE_WB + S_ENDPGM +... +--- +# Already has an explicitly requested flush before the last store. +# CHECK-LABEL: name: explicit_flush_before +# CHECK: bb.0: +# CHECK-NEXT: S_DCACHE_WB +# CHECK-NEXT: S_STORE_DWORD +# CHECK-NEXT: S_DCACHE_WB +# CHECK-NEXT: S_ENDPGM + +name: explicit_flush_before +tracksRegLiveness: false + +body: | + bb.0: + S_DCACHE_WB + S_STORE_DWORD_SGPR undef %sgpr2, undef %sgpr0_sgpr1, undef %m0, 0 + S_ENDPGM +... +--- +# CHECK-LABEL: no_scalar_store +# CHECK: bb.0 +# CHECK-NEXT: S_ENDPGM +name: no_scalar_store +tracksRegLiveness: false + +body: | + bb.0: + S_ENDPGM +... + +# CHECK-LABEL: name: multi_block_store +# CHECK: bb.0: +# CHECK-NEXT: S_STORE_DWORD +# CHECK-NEXT: S_DCACHE_WB +# CHECK-NEXT: S_ENDPGM + +# CHECK: bb.1: +# CHECK-NEXT: S_STORE_DWORD +# CHECK-NEXT: S_DCACHE_WB +# CHECK-NEXT: S_ENDPGM + +name: multi_block_store +tracksRegLiveness: false + +body: | + bb.0: + S_STORE_DWORD_SGPR undef %sgpr2, undef %sgpr0_sgpr1, undef %m0, 0 + S_ENDPGM + + bb.1: + S_STORE_DWORD_SGPR undef %sgpr4, undef %sgpr6_sgpr7, undef %m0, 0 + S_ENDPGM +... +... + +# This one should be able to omit the flush in the storeless block but +# this isn't handled now. + +# CHECK-LABEL: name: one_block_store +# CHECK: bb.0: +# CHECK-NEXT: S_DCACHE_WB +# CHECK-NEXT: S_ENDPGM + +# CHECK: bb.1: +# CHECK-NEXT: S_STORE_DWORD +# CHECK-NEXT: S_DCACHE_WB +# CHECK-NEXT: S_ENDPGM + +name: one_block_store +tracksRegLiveness: false + +body: | + bb.0: + S_ENDPGM + + bb.1: + S_STORE_DWORD_SGPR undef %sgpr4, undef %sgpr6_sgpr7, undef %m0, 0 + S_ENDPGM +... +--- +# CHECK-LABEL: name: si_return +# CHECK: bb.0: +# CHECK-NEXT: S_STORE_DWORD +# CHECK-NEXT: S_WAITCNT +# CHECK-NEXT: S_DCACHE_WB +# CHECK-NEXT: SI_RETURN + +name: si_return +tracksRegLiveness: false + +body: | + bb.0: + S_STORE_DWORD_SGPR undef %sgpr2, undef %sgpr0_sgpr1, undef %m0, 0 + SI_RETURN undef %vgpr0 +... diff --git a/llvm/test/CodeGen/MIR/AMDGPU/si-fix-sgpr-copies.mir b/llvm/test/CodeGen/MIR/AMDGPU/si-fix-sgpr-copies.mir index 016a6e6fd063..0c08deb13a8e 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/si-fix-sgpr-copies.mir +++ b/llvm/test/CodeGen/MIR/AMDGPU/si-fix-sgpr-copies.mir @@ -6,14 +6,14 @@ name: phi_visit_order tracksRegLiveness: true registers: - - { id: 0, class: sreg_32 } + - { id: 0, class: sreg_32_xm0 } - { id: 1, class: sreg_64 } - - { id: 2, class: sreg_32 } + - { id: 2, class: sreg_32_xm0 } - { id: 7, class: vgpr_32 } - - { id: 8, class: sreg_32 } + - { id: 8, class: sreg_32_xm0 } - { id: 9, class: vgpr_32 } - { id: 10, class: sreg_64 } - - { id: 11, class: sreg_32 } + - { id: 11, class: sreg_32_xm0 } body: | ; GCN-LABEL: name: phi_visit_order