diff --git a/llvm/include/llvm/CodeGen/TargetFrameLowering.h b/llvm/include/llvm/CodeGen/TargetFrameLowering.h index a0beee36c748..c7d4c4d7e5d4 100644 --- a/llvm/include/llvm/CodeGen/TargetFrameLowering.h +++ b/llvm/include/llvm/CodeGen/TargetFrameLowering.h @@ -309,13 +309,6 @@ public: RegScavenger *RS = nullptr) const { } - /// processFunctionBeforeFrameIndicesReplaced - This method is called - /// immediately before MO_FrameIndex operands are eliminated, but after the - /// frame is finalized. This method is optional. - virtual void - processFunctionBeforeFrameIndicesReplaced(MachineFunction &MF, - RegScavenger *RS = nullptr) const {} - virtual unsigned getWinEHParentFrameOffset(const MachineFunction &MF) const { report_fatal_error("WinEH not implemented for this target"); } diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp index d583643ac68f..3909b5717281 100644 --- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp +++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp @@ -259,10 +259,6 @@ bool PEI::runOnMachineFunction(MachineFunction &MF) { for (auto &I : EntryDbgValues) I.first->insert(I.first->begin(), I.second.begin(), I.second.end()); - // Allow the target machine to make final modifications to the function - // before the frame layout is finalized. - TFI->processFunctionBeforeFrameIndicesReplaced(MF, RS); - // Replace all MO_FrameIndex operands with physical register references // and actual offsets. // diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 97162ae22187..3b8f8a19fe49 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -349,38 +349,22 @@ bool AArch64ExpandPseudo::expandSetTagLoop( MachineBasicBlock::iterator &NextMBBI) { MachineInstr &MI = *MBBI; DebugLoc DL = MI.getDebugLoc(); - Register SizeReg = MI.getOperand(0).getReg(); - Register AddressReg = MI.getOperand(1).getReg(); + Register SizeReg = MI.getOperand(2).getReg(); + Register AddressReg = MI.getOperand(3).getReg(); MachineFunction *MF = MBB.getParent(); bool ZeroData = MI.getOpcode() == AArch64::STZGloop; - const unsigned OpCode1 = - ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex; - const unsigned OpCode2 = + const unsigned OpCode = ZeroData ? AArch64::STZ2GPostIndex : AArch64::ST2GPostIndex; - unsigned Size = MI.getOperand(2).getImm(); - assert(Size > 0 && Size % 16 == 0); - if (Size % (16 * 2) != 0) { - BuildMI(MBB, MBBI, DL, TII->get(OpCode1), AddressReg) - .addReg(AddressReg) - .addReg(AddressReg) - .addImm(1); - Size -= 16; - } - MachineBasicBlock::iterator I = - BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm), SizeReg) - .addImm(Size); - expandMOVImm(MBB, I, 64); - auto LoopBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); MF->insert(++MBB.getIterator(), LoopBB); MF->insert(++LoopBB->getIterator(), DoneBB); - BuildMI(LoopBB, DL, TII->get(OpCode2)) + BuildMI(LoopBB, DL, TII->get(OpCode)) .addDef(AddressReg) .addReg(AddressReg) .addReg(AddressReg) diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 39d32863f15b..c732106014e6 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -170,11 +170,6 @@ static cl::opt cl::desc("reverse the CSR restore sequence"), cl::init(false), cl::Hidden); -static cl::opt StackTaggingMergeSetTag( - "stack-tagging-merge-settag", - cl::desc("merge settag instruction in function epilog"), cl::init(true), - cl::Hidden); - STATISTIC(NumRedZoneFunctions, "Number of functions using red zone"); /// This is the biggest offset to the stack pointer we can encode in aarch64 @@ -485,39 +480,6 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump( return true; } -bool AArch64FrameLowering::shouldCombineCSRLocalStackBumpInEpilogue( - MachineBasicBlock &MBB, unsigned StackBumpBytes) const { - if (!shouldCombineCSRLocalStackBump(*MBB.getParent(), StackBumpBytes)) - return false; - - if (MBB.empty()) - return true; - - // Disable combined SP bump if the last instruction is an MTE tag store. It - // is almost always better to merge SP adjustment into those instructions. - MachineBasicBlock::iterator LastI = MBB.getFirstTerminator(); - MachineBasicBlock::iterator Begin = MBB.begin(); - while (LastI != Begin) { - --LastI; - if (LastI->isTransient()) - continue; - if (!LastI->getFlag(MachineInstr::FrameDestroy)) - break; - } - switch (LastI->getOpcode()) { - case AArch64::STGloop: - case AArch64::STZGloop: - case AArch64::STGOffset: - case AArch64::STZGOffset: - case AArch64::ST2GOffset: - case AArch64::STZ2GOffset: - return false; - default: - return true; - } - llvm_unreachable("unreachable"); -} - // Given a load or a store instruction, generate an appropriate unwinding SEH // code on Windows. static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI, @@ -1497,7 +1459,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // function. if (MF.hasEHFunclets()) AFI->setLocalStackSize(NumBytes - PrologueSaveSize); - bool CombineSPBump = shouldCombineCSRLocalStackBumpInEpilogue(MBB, NumBytes); + bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes); // Assume we can't combine the last pop with the sp restore. if (!CombineSPBump && PrologueSaveSize != 0) { @@ -2675,399 +2637,9 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized( .addImm(0); } -namespace { -struct TagStoreInstr { - MachineInstr *MI; - int64_t Offset, Size; - explicit TagStoreInstr(MachineInstr *MI, int64_t Offset, int64_t Size) - : MI(MI), Offset(Offset), Size(Size) {} -}; - -class TagStoreEdit { - MachineFunction *MF; - MachineBasicBlock *MBB; - MachineRegisterInfo *MRI; - // Tag store instructions that are being replaced. - SmallVector TagStores; - // Combined memref arguments of the above instructions. - SmallVector CombinedMemRefs; - - // Replace allocation tags in [FrameReg + FrameRegOffset, FrameReg + - // FrameRegOffset + Size) with the address tag of SP. - Register FrameReg; - StackOffset FrameRegOffset; - int64_t Size; - // If not None, move FrameReg to (FrameReg + FrameRegUpdate) at the end. - Optional FrameRegUpdate; - // MIFlags for any FrameReg updating instructions. - unsigned FrameRegUpdateFlags; - - // Use zeroing instruction variants. - bool ZeroData; - DebugLoc DL; - - void emitUnrolled(MachineBasicBlock::iterator InsertI); - void emitLoop(MachineBasicBlock::iterator InsertI); - -public: - TagStoreEdit(MachineBasicBlock *MBB, bool ZeroData) - : MBB(MBB), ZeroData(ZeroData) { - MF = MBB->getParent(); - MRI = &MF->getRegInfo(); - } - // Add an instruction to be replaced. Instructions must be added in the - // ascending order of Offset, and have to be adjacent. - void addInstruction(TagStoreInstr I) { - assert((TagStores.empty() || - TagStores.back().Offset + TagStores.back().Size == I.Offset) && - "Non-adjacent tag store instructions."); - TagStores.push_back(I); - } - void clear() { TagStores.clear(); } - // Emit equivalent code at the given location, and erase the current set of - // instructions. May skip if the replacement is not profitable. May invalidate - // the input iterator and replace it with a valid one. - void emitCode(MachineBasicBlock::iterator &InsertI, - const AArch64FrameLowering *TFI, bool IsLast); -}; - -void TagStoreEdit::emitUnrolled(MachineBasicBlock::iterator InsertI) { - const AArch64InstrInfo *TII = - MF->getSubtarget().getInstrInfo(); - - const int64_t kMinOffset = -256 * 16; - const int64_t kMaxOffset = 255 * 16; - - Register BaseReg = FrameReg; - int64_t BaseRegOffsetBytes = FrameRegOffset.getBytes(); - if (BaseRegOffsetBytes < kMinOffset || - BaseRegOffsetBytes + (Size - Size % 32) > kMaxOffset) { - Register ScratchReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass); - emitFrameOffset(*MBB, InsertI, DL, ScratchReg, BaseReg, - {BaseRegOffsetBytes, MVT::i8}, TII); - BaseReg = ScratchReg; - BaseRegOffsetBytes = 0; - } - - MachineInstr *LastI = nullptr; - while (Size) { - int64_t InstrSize = (Size > 16) ? 32 : 16; - unsigned Opcode = - InstrSize == 16 - ? (ZeroData ? AArch64::STZGOffset : AArch64::STGOffset) - : (ZeroData ? AArch64::STZ2GOffset : AArch64::ST2GOffset); - MachineInstr *I = BuildMI(*MBB, InsertI, DL, TII->get(Opcode)) - .addReg(AArch64::SP) - .addReg(BaseReg) - .addImm(BaseRegOffsetBytes / 16) - .setMemRefs(CombinedMemRefs); - // A store to [BaseReg, #0] should go last for an opportunity to fold the - // final SP adjustment in the epilogue. - if (BaseRegOffsetBytes == 0) - LastI = I; - BaseRegOffsetBytes += InstrSize; - Size -= InstrSize; - } - - if (LastI) - MBB->splice(InsertI, MBB, LastI); -} - -void TagStoreEdit::emitLoop(MachineBasicBlock::iterator InsertI) { - const AArch64InstrInfo *TII = - MF->getSubtarget().getInstrInfo(); - - Register BaseReg = FrameRegUpdate - ? FrameReg - : MRI->createVirtualRegister(&AArch64::GPR64RegClass); - Register SizeReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass); - - emitFrameOffset(*MBB, InsertI, DL, BaseReg, FrameReg, FrameRegOffset, TII); - - int64_t LoopSize = Size; - // If the loop size is not a multiple of 32, split off one 16-byte store at - // the end to fold BaseReg update into. - if (FrameRegUpdate && *FrameRegUpdate) - LoopSize -= LoopSize % 32; - MachineInstr *LoopI = - BuildMI(*MBB, InsertI, DL, - TII->get(ZeroData ? AArch64::STZGloop : AArch64::STGloop)) - .addDef(SizeReg) - .addDef(BaseReg) - .addImm(LoopSize) - .addReg(BaseReg) - .setMemRefs(CombinedMemRefs); - if (FrameRegUpdate) - LoopI->setFlags(FrameRegUpdateFlags); - - int64_t ExtraBaseRegUpdate = - FrameRegUpdate ? (*FrameRegUpdate - FrameRegOffset.getBytes() - Size) : 0; - if (LoopSize < Size) { - assert(FrameRegUpdate); - assert(Size - LoopSize == 16); - // Tag 16 more bytes at BaseReg and update BaseReg. - BuildMI(*MBB, InsertI, DL, - TII->get(ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex)) - .addDef(BaseReg) - .addReg(BaseReg) - .addReg(BaseReg) - .addImm(1 + ExtraBaseRegUpdate / 16) - .setMemRefs(CombinedMemRefs) - .setMIFlags(FrameRegUpdateFlags); - } else if (ExtraBaseRegUpdate) { - // Update BaseReg. - BuildMI( - *MBB, InsertI, DL, - TII->get(ExtraBaseRegUpdate > 0 ? AArch64::ADDXri : AArch64::SUBXri)) - .addDef(BaseReg) - .addReg(BaseReg) - .addImm(std::abs(ExtraBaseRegUpdate)) - .addImm(0) - .setMIFlags(FrameRegUpdateFlags); - } -} - -// Check if *II is a register update that can be merged into STGloop that ends -// at (Reg + Size). RemainingOffset is the required adjustment to Reg after the -// end of the loop. -bool canMergeRegUpdate(MachineBasicBlock::iterator II, unsigned Reg, - int64_t Size, int64_t *TotalOffset) { - MachineInstr &MI = *II; - if ((MI.getOpcode() == AArch64::ADDXri || - MI.getOpcode() == AArch64::SUBXri) && - MI.getOperand(0).getReg() == Reg && MI.getOperand(1).getReg() == Reg) { - unsigned Shift = AArch64_AM::getShiftValue(MI.getOperand(3).getImm()); - int64_t Offset = MI.getOperand(2).getImm() << Shift; - if (MI.getOpcode() == AArch64::SUBXri) - Offset = -Offset; - int64_t AbsPostOffset = std::abs(Offset - Size); - const int64_t kMaxOffset = - 0xFFF; // Max encoding for unshifted ADDXri / SUBXri - if (AbsPostOffset <= kMaxOffset && AbsPostOffset % 16 == 0) { - *TotalOffset = Offset; - return true; - } - } - return false; -} - -void mergeMemRefs(const SmallVectorImpl &TSE, - SmallVectorImpl &MemRefs) { - MemRefs.clear(); - for (auto &TS : TSE) { - MachineInstr *MI = TS.MI; - // An instruction without memory operands may access anything. Be - // conservative and return an empty list. - if (MI->memoperands_empty()) { - MemRefs.clear(); - return; - } - MemRefs.append(MI->memoperands_begin(), MI->memoperands_end()); - } -} - -void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI, - const AArch64FrameLowering *TFI, bool IsLast) { - if (TagStores.empty()) - return; - TagStoreInstr &FirstTagStore = TagStores[0]; - TagStoreInstr &LastTagStore = TagStores[TagStores.size() - 1]; - Size = LastTagStore.Offset - FirstTagStore.Offset + LastTagStore.Size; - DL = TagStores[0].MI->getDebugLoc(); - - unsigned Reg; - FrameRegOffset = TFI->resolveFrameOffsetReference( - *MF, FirstTagStore.Offset, false /*isFixed*/, false /*isSVE*/, Reg, - /*PreferFP=*/false, /*ForSimm=*/true); - FrameReg = Reg; - FrameRegUpdate = None; - - mergeMemRefs(TagStores, CombinedMemRefs); - - LLVM_DEBUG(dbgs() << "Replacing adjacent STG instructions:\n"; - for (const auto &Instr - : TagStores) { dbgs() << " " << *Instr.MI; }); - - // Size threshold where a loop becomes shorter than a linear sequence of - // tagging instructions. - const int kSetTagLoopThreshold = 176; - if (Size < kSetTagLoopThreshold) { - if (TagStores.size() < 2) - return; - emitUnrolled(InsertI); - } else { - MachineInstr *UpdateInstr = nullptr; - int64_t TotalOffset; - if (IsLast) { - // See if we can merge base register update into the STGloop. - // This is done in AArch64LoadStoreOptimizer for "normal" stores, - // but STGloop is way too unusual for that, and also it only - // realistically happens in function epilogue. Also, STGloop is expanded - // before that pass. - if (InsertI != MBB->end() && - canMergeRegUpdate(InsertI, FrameReg, FrameRegOffset.getBytes() + Size, - &TotalOffset)) { - UpdateInstr = &*InsertI++; - LLVM_DEBUG(dbgs() << "Folding SP update into loop:\n " - << *UpdateInstr); - } - } - - if (!UpdateInstr && TagStores.size() < 2) - return; - - if (UpdateInstr) { - FrameRegUpdate = TotalOffset; - FrameRegUpdateFlags = UpdateInstr->getFlags(); - } - emitLoop(InsertI); - if (UpdateInstr) - UpdateInstr->eraseFromParent(); - } - - for (auto &TS : TagStores) - TS.MI->eraseFromParent(); -} - -bool isMergeableStackTaggingInstruction(MachineInstr &MI, int64_t &Offset, - int64_t &Size, bool &ZeroData) { - MachineFunction &MF = *MI.getParent()->getParent(); - const MachineFrameInfo &MFI = MF.getFrameInfo(); - - unsigned Opcode = MI.getOpcode(); - ZeroData = (Opcode == AArch64::STZGloop || Opcode == AArch64::STZGOffset || - Opcode == AArch64::STZ2GOffset); - - if (Opcode == AArch64::STGloop || Opcode == AArch64::STZGloop) { - if (!MI.getOperand(0).isDead() || !MI.getOperand(1).isDead()) - return false; - if (!MI.getOperand(2).isImm() || !MI.getOperand(3).isFI()) - return false; - Offset = MFI.getObjectOffset(MI.getOperand(3).getIndex()); - Size = MI.getOperand(2).getImm(); - return true; - } - - if (Opcode == AArch64::STGOffset || Opcode == AArch64::STZGOffset) - Size = 16; - else if (Opcode == AArch64::ST2GOffset || Opcode == AArch64::STZ2GOffset) - Size = 32; - else - return false; - - if (MI.getOperand(0).getReg() != AArch64::SP || !MI.getOperand(1).isFI()) - return false; - - Offset = MFI.getObjectOffset(MI.getOperand(1).getIndex()) + - 16 * MI.getOperand(2).getImm(); - return true; -} - -// Detect a run of memory tagging instructions for adjacent stack frame slots, -// and replace them with a shorter instruction sequence: -// * replace STG + STG with ST2G -// * replace STGloop + STGloop with STGloop -// This code needs to run when stack slot offsets are already known, but before -// FrameIndex operands in STG instructions are eliminated. -MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II, - const AArch64FrameLowering *TFI, - RegScavenger *RS) { - bool FirstZeroData; - int64_t Size, Offset; - MachineInstr &MI = *II; - MachineBasicBlock *MBB = MI.getParent(); - MachineBasicBlock::iterator NextI = ++II; - if (&MI == &MBB->instr_back()) - return II; - if (!isMergeableStackTaggingInstruction(MI, Offset, Size, FirstZeroData)) - return II; - - SmallVector Instrs; - Instrs.emplace_back(&MI, Offset, Size); - - constexpr int kScanLimit = 10; - int Count = 0; - for (MachineBasicBlock::iterator E = MBB->end(); - NextI != E && Count < kScanLimit; ++NextI) { - MachineInstr &MI = *NextI; - bool ZeroData; - int64_t Size, Offset; - // Collect instructions that update memory tags with a FrameIndex operand - // and (when applicable) constant size, and whose output registers are dead - // (the latter is almost always the case in practice). Since these - // instructions effectively have no inputs or outputs, we are free to skip - // any non-aliasing instructions in between without tracking used registers. - if (isMergeableStackTaggingInstruction(MI, Offset, Size, ZeroData)) { - if (ZeroData != FirstZeroData) - break; - Instrs.emplace_back(&MI, Offset, Size); - continue; - } - - // Only count non-transient, non-tagging instructions toward the scan - // limit. - if (!MI.isTransient()) - ++Count; - - // Just in case, stop before the epilogue code starts. - if (MI.getFlag(MachineInstr::FrameSetup) || - MI.getFlag(MachineInstr::FrameDestroy)) - break; - - // Reject anything that may alias the collected instructions. - if (MI.mayLoadOrStore() || MI.hasUnmodeledSideEffects()) - break; - } - - // New code will be inserted after the last tagging instruction we've found. - MachineBasicBlock::iterator InsertI = Instrs.back().MI; - InsertI++; - - llvm::stable_sort(Instrs, - [](const TagStoreInstr &Left, const TagStoreInstr &Right) { - return Left.Offset < Right.Offset; - }); - - // Make sure that we don't have any overlapping stores. - int64_t CurOffset = Instrs[0].Offset; - for (auto &Instr : Instrs) { - if (CurOffset > Instr.Offset) - return NextI; - CurOffset = Instr.Offset + Instr.Size; - } - - // Find contiguous runs of tagged memory and emit shorter instruction - // sequencies for them when possible. - TagStoreEdit TSE(MBB, FirstZeroData); - Optional EndOffset; - for (auto &Instr : Instrs) { - if (EndOffset && *EndOffset != Instr.Offset) { - // Found a gap. - TSE.emitCode(InsertI, TFI, /*IsLast = */ false); - TSE.clear(); - } - - TSE.addInstruction(Instr); - EndOffset = Instr.Offset + Instr.Size; - } - - TSE.emitCode(InsertI, TFI, /*IsLast = */ true); - - return InsertI; -} -} // namespace - -void AArch64FrameLowering::processFunctionBeforeFrameIndicesReplaced( - MachineFunction &MF, RegScavenger *RS = nullptr) const { - if (StackTaggingMergeSetTag) - for (auto &BB : MF) - for (MachineBasicBlock::iterator II = BB.begin(); II != BB.end();) - II = tryMergeAdjacentSTG(II, this, RS); -} - -/// For Win64 AArch64 EH, the offset to the Unwind object is from the SP -/// before the update. This is easily retrieved as it is exactly the offset -/// that is set in processFunctionBeforeFrameFinalized. +/// For Win64 AArch64 EH, the offset to the Unwind object is from the SP before +/// the update. This is easily retrieved as it is exactly the offset that is set +/// in processFunctionBeforeFrameFinalized. int AArch64FrameLowering::getFrameIndexReferencePreferSP( const MachineFunction &MF, int FI, unsigned &FrameReg, bool IgnoreSPUpdates) const { diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h index 57a7924fb8f8..b5719feb6b15 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -77,10 +77,6 @@ public: void processFunctionBeforeFrameFinalized(MachineFunction &MF, RegScavenger *RS) const override; - void - processFunctionBeforeFrameIndicesReplaced(MachineFunction &MF, - RegScavenger *RS) const override; - unsigned getWinEHParentFrameOffset(const MachineFunction &MF) const override; unsigned getWinEHFuncletFrameSize(const MachineFunction &MF) const; @@ -111,8 +107,6 @@ private: int64_t assignSVEStackObjectOffsets(MachineFrameInfo &MF, int &MinCSFrameIndex, int &MaxCSFrameIndex) const; - bool shouldCombineCSRLocalStackBumpInEpilogue(MachineBasicBlock &MBB, - unsigned StackBumpBytes) const; }; } // End llvm namespace diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 0ed2a678c4f0..54f3f7c10132 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -3458,8 +3458,6 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, case AArch64::ST1Fourv1d: case AArch64::IRG: case AArch64::IRGstack: - case AArch64::STGloop: - case AArch64::STZGloop: return AArch64FrameOffsetCannotUpdate; } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 04a23f31ffd6..f4d340c9f06a 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -1514,17 +1514,17 @@ def TAGPstack // register / expression for the tagged base pointer of the current function. def : Pat<(int_aarch64_irg_sp i64:$Rm), (IRGstack SP, i64:$Rm)>; -// Large STG to be expanded into a loop. $sz is the size, $Rn is start address. -// $Rn_wback is one past the end of the range. $Rm is the loop counter. +// Large STG to be expanded into a loop. $Rm is the size, $Rn is start address. +// $Rn_wback is one past the end of the range. let isCodeGenOnly=1, mayStore=1 in { def STGloop - : Pseudo<(outs GPR64common:$Rm, GPR64sp:$Rn_wback), (ins i64imm:$sz, GPR64sp:$Rn), - [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,@earlyclobber $Rm" >, + : Pseudo<(outs GPR64common:$Rm_wback, GPR64sp:$Rn_wback), (ins GPR64common:$Rm, GPR64sp:$Rn), + [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,$Rm = $Rm_wback,@earlyclobber $Rm_wback" >, Sched<[WriteAdr, WriteST]>; def STZGloop - : Pseudo<(outs GPR64common:$Rm, GPR64sp:$Rn_wback), (ins i64imm:$sz, GPR64sp:$Rn), - [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,@earlyclobber $Rm" >, + : Pseudo<(outs GPR64common:$Rm_wback, GPR64sp:$Rn_wback), (ins GPR64common:$Rm, GPR64sp:$Rn), + [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,$Rm = $Rm_wback,@earlyclobber $Rm_wback" >, Sched<[WriteAdr, WriteST]>; } diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 4a3778a2fd07..14f839cd4f81 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -390,10 +390,6 @@ bool AArch64RegisterInfo::needsFrameBaseReg(MachineInstr *MI, if (isFrameOffsetLegal(MI, AArch64::SP, Offset)) return false; - // If even offset 0 is illegal, we don't want a virtual base register. - if (!isFrameOffsetLegal(MI, AArch64::SP, 0)) - return false; - // The offset likely isn't legal; we want to allocate a virtual base register. return true; } @@ -449,17 +445,6 @@ void AArch64RegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, (void)Done; } -static Register getScratchRegisterForInstruction(MachineInstr &MI) { - // ST*Gloop can only have #fi in op3, and they have a constraint that - // op1==op3. Since op1 is early-clobber, it may (and also must) be used as the - // scratch register. - if (MI.getOpcode() == AArch64::STGloop || MI.getOpcode() == AArch64::STZGloop) - return MI.getOperand(1).getReg(); - else - return MI.getMF()->getRegInfo().createVirtualRegister( - &AArch64::GPR64RegClass); -} - void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const { @@ -516,7 +501,8 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // in a scratch register. Offset = TFI->resolveFrameIndexReference( MF, FrameIndex, FrameReg, /*PreferFP=*/false, /*ForSimm=*/true); - Register ScratchReg = getScratchRegisterForInstruction(MI); + Register ScratchReg = + MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, TII); BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AArch64::LDG), ScratchReg) @@ -545,7 +531,8 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // If we get here, the immediate doesn't fit into the instruction. We folded // as much as possible above. Handle the rest, providing a register that is // SP+LargeImm. - Register ScratchReg = getScratchRegisterForInstruction(MI); + Register ScratchReg = + MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, TII); MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true); } diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp index e050a0028eca..ba61ed726e84 100644 --- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp @@ -125,13 +125,19 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForSetTag( return EmitUnrolledSetTag(DAG, dl, Chain, Addr, ObjSize, BaseMemOperand, ZeroData); - const EVT ResTys[] = {MVT::i64, MVT::i64, MVT::Other}; - - if (Addr.getOpcode() == ISD::FrameIndex) { - int FI = cast(Addr)->getIndex(); - Addr = DAG.getTargetFrameIndex(FI, MVT::i64); + if (ObjSize % 32 != 0) { + SDNode *St1 = DAG.getMachineNode( + ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex, dl, + {MVT::i64, MVT::Other}, + {Addr, Addr, DAG.getTargetConstant(1, dl, MVT::i64), Chain}); + DAG.setNodeMemRefs(cast(St1), {BaseMemOperand}); + ObjSize -= 16; + Addr = SDValue(St1, 0); + Chain = SDValue(St1, 1); } - SDValue Ops[] = {DAG.getTargetConstant(ObjSize, dl, MVT::i64), Addr, Chain}; + + const EVT ResTys[] = {MVT::i64, MVT::i64, MVT::Other}; + SDValue Ops[] = {DAG.getConstant(ObjSize, dl, MVT::i64), Addr, Chain}; SDNode *St = DAG.getMachineNode( ZeroData ? AArch64::STZGloop : AArch64::STGloop, dl, ResTys, Ops); diff --git a/llvm/test/CodeGen/AArch64/settag-merge.ll b/llvm/test/CodeGen/AArch64/settag-merge.ll deleted file mode 100644 index 1bc93a82070f..000000000000 --- a/llvm/test/CodeGen/AArch64/settag-merge.ll +++ /dev/null @@ -1,214 +0,0 @@ -; RUN: llc < %s -mtriple=aarch64 -mattr=+mte | FileCheck %s - -declare void @use(i8* %p) -declare void @llvm.aarch64.settag(i8* %p, i64 %a) -declare void @llvm.aarch64.settag.zero(i8* %p, i64 %a) - -define void @stg16_16() { -entry: -; CHECK-LABEL: stg16_16: -; CHECK: st2g sp, [sp], #32 -; CHECK: ret - %a = alloca i8, i32 16, align 16 - %b = alloca i8, i32 16, align 16 - call void @llvm.aarch64.settag(i8* %a, i64 16) - call void @llvm.aarch64.settag(i8* %b, i64 16) - ret void -} - -define i32 @stg16_16_16_16_ret() { -entry: -; CHECK-LABEL: stg16_16_16_16_ret: -; CHECK: st2g sp, [sp, #32] -; CHECK: st2g sp, [sp], #64 -; CHECK: mov w0, wzr -; CHECK: ret - %a = alloca i8, i32 16, align 16 - %b = alloca i8, i32 16, align 16 - %c = alloca i8, i32 16, align 16 - %d = alloca i8, i32 16, align 16 - call void @llvm.aarch64.settag(i8* %a, i64 16) - call void @llvm.aarch64.settag(i8* %b, i64 16) - call void @llvm.aarch64.settag(i8* %c, i64 16) - call void @llvm.aarch64.settag(i8* %d, i64 16) - ret i32 0 -} - -define void @stg16_16_16_16() { -entry: -; CHECK-LABEL: stg16_16_16_16: -; CHECK: st2g sp, [sp, #32] -; CHECK: st2g sp, [sp], #64 -; CHECK: ret - %a = alloca i8, i32 16, align 16 - %b = alloca i8, i32 16, align 16 - %c = alloca i8, i32 16, align 16 - %d = alloca i8, i32 16, align 16 - call void @llvm.aarch64.settag(i8* %a, i64 16) - call void @llvm.aarch64.settag(i8* %b, i64 16) - call void @llvm.aarch64.settag(i8* %c, i64 16) - call void @llvm.aarch64.settag(i8* %d, i64 16) - ret void -} - -define void @stg128_128_128_128() { -entry: -; CHECK-LABEL: stg128_128_128_128: -; CHECK: mov x8, #512 -; CHECK: st2g sp, [sp], #32 -; CHECK: sub x8, x8, #32 -; CHECK: cbnz x8, -; CHECK: ret - %a = alloca i8, i32 128, align 16 - %b = alloca i8, i32 128, align 16 - %c = alloca i8, i32 128, align 16 - %d = alloca i8, i32 128, align 16 - call void @llvm.aarch64.settag(i8* %a, i64 128) - call void @llvm.aarch64.settag(i8* %b, i64 128) - call void @llvm.aarch64.settag(i8* %c, i64 128) - call void @llvm.aarch64.settag(i8* %d, i64 128) - ret void -} - -define void @stg16_512_16() { -entry: -; CHECK-LABEL: stg16_512_16: -; CHECK: mov x8, #544 -; CHECK: st2g sp, [sp], #32 -; CHECK: sub x8, x8, #32 -; CHECK: cbnz x8, -; CHECK: ret - %a = alloca i8, i32 16, align 16 - %b = alloca i8, i32 512, align 16 - %c = alloca i8, i32 16, align 16 - call void @llvm.aarch64.settag(i8* %a, i64 16) - call void @llvm.aarch64.settag(i8* %b, i64 512) - call void @llvm.aarch64.settag(i8* %c, i64 16) - ret void -} - -define void @stg512_512_512() { -entry: -; CHECK-LABEL: stg512_512_512: -; CHECK: mov x8, #1536 -; CHECK: st2g sp, [sp], #32 -; CHECK: sub x8, x8, #32 -; CHECK: cbnz x8, -; CHECK: ret - %a = alloca i8, i32 512, align 16 - %b = alloca i8, i32 512, align 16 - %c = alloca i8, i32 512, align 16 - call void @llvm.aarch64.settag(i8* %a, i64 512) - call void @llvm.aarch64.settag(i8* %b, i64 512) - call void @llvm.aarch64.settag(i8* %c, i64 512) - ret void -} - -define void @early(i1 %flag) { -entry: -; CHECK-LABEL: early: -; CHECK: tbz w0, #0, [[LABEL:.LBB.*]] -; CHECK: st2g sp, [sp, # -; CHECK: st2g sp, [sp, # -; CHECK: st2g sp, [sp, # -; CHECK: [[LABEL]]: -; CHECK: stg sp, [sp, # -; CHECK: st2g sp, [sp], # -; CHECK: ret - %a = alloca i8, i32 48, align 16 - %b = alloca i8, i32 48, align 16 - %c = alloca i8, i32 48, align 16 - br i1 %flag, label %if.then, label %if.end - -if.then: - call void @llvm.aarch64.settag(i8* %a, i64 48) - call void @llvm.aarch64.settag(i8* %b, i64 48) - br label %if.end - -if.end: - call void @llvm.aarch64.settag(i8* %c, i64 48) - ret void -} - -define void @early_128_128(i1 %flag) { -entry: -; CHECK-LABEL: early_128_128: -; CHECK: tbz w0, #0, [[LABEL:.LBB.*]] -; CHECK: add x9, sp, # -; CHECK: mov x8, #256 -; CHECK: st2g x9, [x9], #32 -; CHECK: sub x8, x8, #32 -; CHECK: cbnz x8, -; CHECK: [[LABEL]]: -; CHECK: stg sp, [sp, # -; CHECK: st2g sp, [sp], # -; CHECK: ret - %a = alloca i8, i32 128, align 16 - %b = alloca i8, i32 128, align 16 - %c = alloca i8, i32 48, align 16 - br i1 %flag, label %if.then, label %if.end - -if.then: - call void @llvm.aarch64.settag(i8* %a, i64 128) - call void @llvm.aarch64.settag(i8* %b, i64 128) - br label %if.end - -if.end: - call void @llvm.aarch64.settag(i8* %c, i64 48) - ret void -} - -define void @early_512_512(i1 %flag) { -entry: -; CHECK-LABEL: early_512_512: -; CHECK: tbz w0, #0, [[LABEL:.LBB.*]] -; CHECK: add x9, sp, # -; CHECK: mov x8, #1024 -; CHECK: st2g x9, [x9], #32 -; CHECK: sub x8, x8, #32 -; CHECK: cbnz x8, -; CHECK: [[LABEL]]: -; CHECK: stg sp, [sp, # -; CHECK: st2g sp, [sp], # -; CHECK: ret - %a = alloca i8, i32 512, align 16 - %b = alloca i8, i32 512, align 16 - %c = alloca i8, i32 48, align 16 - br i1 %flag, label %if.then, label %if.end - -if.then: - call void @llvm.aarch64.settag(i8* %a, i64 512) - call void @llvm.aarch64.settag(i8* %b, i64 512) - br label %if.end - -if.end: - call void @llvm.aarch64.settag(i8* %c, i64 48) - ret void -} - -; Two loops of size 256; the second loop updates SP. -define void @stg128_128_gap_128_128() { -entry: -; CHECK-LABEL: stg128_128_gap_128_128: -; CHECK: mov x9, sp -; CHECK: mov x8, #256 -; CHECK: st2g x9, [x9], #32 -; CHECK: sub x8, x8, #32 -; CHECK: cbnz x8, -; CHECK: mov x8, #256 -; CHECK: st2g sp, [sp], #32 -; CHECK: sub x8, x8, #32 -; CHECK: cbnz x8, -; CHECK: ret - %a = alloca i8, i32 128, align 16 - %a2 = alloca i8, i32 128, align 16 - %b = alloca i8, i32 32, align 16 - %c = alloca i8, i32 128, align 16 - %c2 = alloca i8, i32 128, align 16 - call void @use(i8* %b) - call void @llvm.aarch64.settag(i8* %a, i64 128) - call void @llvm.aarch64.settag(i8* %a2, i64 128) - call void @llvm.aarch64.settag(i8* %c, i64 128) - call void @llvm.aarch64.settag(i8* %c2, i64 128) - ret void -} diff --git a/llvm/test/CodeGen/AArch64/settag-merge.mir b/llvm/test/CodeGen/AArch64/settag-merge.mir deleted file mode 100644 index dc2a00c7d3d3..000000000000 --- a/llvm/test/CodeGen/AArch64/settag-merge.mir +++ /dev/null @@ -1,83 +0,0 @@ -# RUN: llc -mtriple=aarch64 -mattr=+mte -run-pass=prologepilog %s -o - | FileCheck %s - ---- | - declare void @llvm.aarch64.settag(i8* nocapture writeonly, i64) argmemonly nounwind writeonly "target-features"="+mte" - define i32 @stg16_16_16_16_ret() "target-features"="+mte" { - entry: - %a = alloca i8, i32 16, align 16 - %b = alloca i8, i32 16, align 16 - %c = alloca i8, i32 16, align 16 - %d = alloca i8, i32 16, align 16 - call void @llvm.aarch64.settag(i8* %a, i64 16) - call void @llvm.aarch64.settag(i8* %b, i64 16) - call void @llvm.aarch64.settag(i8* %c, i64 16) - call void @llvm.aarch64.settag(i8* %d, i64 16) - ret i32 0 - } - - define void @stg16_store_128() "target-features"="+mte" { - entry: - %a = alloca i8, i32 16, align 16 - %b = alloca i8, i32 128, align 16 - call void @llvm.aarch64.settag(i8* %a, i64 16) - store i8 42, i8* %a - call void @llvm.aarch64.settag(i8* %b, i64 128) - ret void - } - -... ---- -# A sequence of STG with a register copy in the middle. -# Can be merged into ST2G + ST2G. -# CHECK-LABEL: name:{{.*}}stg16_16_16_16_ret -# CHECK-DAG: ST2GOffset $sp, $sp, 2 -# CHECK-DAG: ST2GOffset $sp, $sp, 0 -# CHECK-DAG: $w0 = COPY $wzr -# CHECK-DAG: RET_ReallyLR implicit killed $w0 - -name: stg16_16_16_16_ret -tracksRegLiveness: true -stack: - - { id: 0, name: a, size: 16, alignment: 16 } - - { id: 1, name: b, size: 16, alignment: 16 } - - { id: 2, name: c, size: 16, alignment: 16 } - - { id: 3, name: d, size: 16, alignment: 16 } -body: | - bb.0.entry: - STGOffset $sp, %stack.0.a, 0 :: (store 16 into %ir.a) - STGOffset $sp, %stack.1.b, 0 :: (store 16 into %ir.b) - STGOffset $sp, %stack.2.c, 0 :: (store 16 into %ir.c) - $w0 = COPY $wzr - STGOffset $sp, %stack.3.d, 0 :: (store 16 into %ir.d) - RET_ReallyLR implicit killed $w0 - -... - ---- -# A store in the middle prevents merging. -# CHECK-LABEL: name:{{.*}}stg16_store_128 -# CHECK: ST2GOffset $sp, $sp, 2 -# CHECK: ST2GOffset $sp, $sp, 4 -# CHECK: ST2GOffset $sp, $sp, 6 -# CHECK: STGOffset $sp, $sp, 8 -# CHECK: STRBBui -# CHECK: ST2GOffset $sp, $sp, 0 -# CHECK: RET_ReallyLR - -name: stg16_store_128 -tracksRegLiveness: true -stack: - - { id: 0, name: a, size: 16, alignment: 16 } - - { id: 1, name: b, size: 128, alignment: 16 } -body: | - bb.0.entry: - STGOffset $sp, %stack.0.a, 0 :: (store 16 into %ir.a) - renamable $w8 = MOVi32imm 42 - ST2GOffset $sp, %stack.1.b, 6 :: (store 32 into %ir.b + 96, align 16) - ST2GOffset $sp, %stack.1.b, 4 :: (store 32 into %ir.b + 64, align 16) - ST2GOffset $sp, %stack.1.b, 2 :: (store 32 into %ir.b + 32, align 16) - STRBBui killed renamable $w8, %stack.0.a, 0 :: (store 1 into %ir.a, align 16) - ST2GOffset $sp, %stack.1.b, 0 :: (store 32 into %ir.b, align 16) - RET_ReallyLR - -... diff --git a/llvm/test/CodeGen/AArch64/settag.ll b/llvm/test/CodeGen/AArch64/settag.ll index 3deeb0155fe8..9ca188fbce32 100644 --- a/llvm/test/CodeGen/AArch64/settag.ll +++ b/llvm/test/CodeGen/AArch64/settag.ll @@ -64,8 +64,8 @@ entry: define void @stg17(i8* %p) { entry: ; CHECK-LABEL: stg17: -; CHECK: stg x0, [x0], #16 ; CHECK: mov {{(w|x)}}[[R:[0-9]+]], #256 +; CHECK: stg x0, [x0], #16 ; CHECK: st2g x0, [x0], #32 ; CHECK: sub x[[R]], x[[R]], #32 ; CHECK: cbnz x[[R]], @@ -87,8 +87,8 @@ entry: define void @stzg17(i8* %p) { entry: ; CHECK-LABEL: stzg17: -; CHECK: stzg x0, [x0], #16 ; CHECK: mov {{w|x}}[[R:[0-9]+]], #256 +; CHECK: stzg x0, [x0], #16 ; CHECK: stz2g x0, [x0], #32 ; CHECK: sub x[[R]], x[[R]], #32 ; CHECK: cbnz x[[R]], @@ -110,10 +110,10 @@ entry: define void @stg_alloca5() { entry: ; CHECK-LABEL: stg_alloca5: -; CHECK: st2g sp, [sp, #32] -; CHECK-NEXT: stg sp, [sp, #64] -; CHECK-NEXT: st2g sp, [sp], #80 -; CHECK-NEXT: ret +; CHECK: stg sp, [sp, #64] +; CHECK: st2g sp, [sp, #32] +; CHECK: st2g sp, [sp] +; CHECK: ret %a = alloca i8, i32 80, align 16 call void @llvm.aarch64.settag(i8* %a, i64 80) ret void @@ -122,11 +122,12 @@ entry: define void @stg_alloca17() { entry: ; CHECK-LABEL: stg_alloca17: +; CHECK: mov [[P:x[0-9]+]], sp +; CHECK: stg [[P]], {{\[}}[[P]]{{\]}}, #16 ; CHECK: mov {{w|x}}[[R:[0-9]+]], #256 -; CHECK: st2g sp, [sp], #32 +; CHECK: st2g [[P]], {{\[}}[[P]]{{\]}}, #32 ; CHECK: sub x[[R]], x[[R]], #32 ; CHECK: cbnz x[[R]], -; CHECK: stg sp, [sp], #16 ; CHECK: ret %a = alloca i8, i32 272, align 16 call void @llvm.aarch64.settag(i8* %a, i64 272) diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-unchecked-ld-st.ll b/llvm/test/CodeGen/AArch64/stack-tagging-unchecked-ld-st.ll index ed6ccc8b4941..200837dabfe0 100644 --- a/llvm/test/CodeGen/AArch64/stack-tagging-unchecked-ld-st.ll +++ b/llvm/test/CodeGen/AArch64/stack-tagging-unchecked-ld-st.ll @@ -210,10 +210,11 @@ entry: ; DEFAULT: ldrb [[A:w.*]], [x{{.*}}] ; DEFAULT: ldrb [[B:w.*]], [x{{.*}}] -; ALWAYS-DAG: ldg [[PA:x.*]], [x{{.*}}] -; ALWAYS-DAG: ldrb [[B:w.*]], [sp] -; ALWAYS-DAG: ldrb [[A:w.*]], {{\[}}[[PA]]{{\]}} +; ALWAYS: ldg [[PA:x.*]], [x{{.*}}] +; ALWAYS: ldrb [[B:w.*]], [sp] +; ALWAYS: ldrb [[A:w.*]], {{\[}}[[PA]]{{\]}} +; COMMON: add w0, [[B]], [[A]] ; COMMON: ret ; One of these allocas is closer to FP than to SP, and within 256 bytes