Revert "Merge memtag instructions with adjacent stack slots."

*** Bad machine code: Tied use must be a register ***
- function:    stg_alloca17
- basic block: %bb.0 entry (0x20076710580)
- instruction: early-clobber %0:gpr64common, early-clobber %1:gpr64sp = STGloop 272, %stack.0.a :: (store 272 into %ir.a, align 16)
- operand 3:   %stack.0.a

http://lab.llvm.org:8011/builders/llvm-clang-x86_64-expensive-checks-win/builds/21481/steps/test-check-all/logs/stdio

This reverts commit b675a7628c.
Evgenii Stepanov 2020-01-08 14:33:28 -08:00
parent 28b9cdd260
commit 58deb20dd2
13 changed files with 43 additions and 808 deletions

View File

@@ -309,13 +309,6 @@ public:
RegScavenger *RS = nullptr) const {
}
/// processFunctionBeforeFrameIndicesReplaced - This method is called
/// immediately before MO_FrameIndex operands are eliminated, but after the
/// frame is finalized. This method is optional.
virtual void
processFunctionBeforeFrameIndicesReplaced(MachineFunction &MF,
RegScavenger *RS = nullptr) const {}
virtual unsigned getWinEHParentFrameOffset(const MachineFunction &MF) const {
report_fatal_error("WinEH not implemented for this target");
}
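For reference, the hook deleted in the hunk above is the extension point the AArch64 backend used (its override is removed later in this commit). A minimal sketch of such an override, using a hypothetical MyTargetFrameLowering and the target-specific peephole tryMergeAdjacentSTG shown further down; this is illustrative only, not code from this commit:

// Sketch only: runs after frame layout is finalized but before frame-index
// operands are rewritten, so stack-slot offsets are known while FI operands
// are still present.
void MyTargetFrameLowering::processFunctionBeforeFrameIndicesReplaced(
    MachineFunction &MF, RegScavenger *RS) const {
  for (MachineBasicBlock &MBB : MF)
    for (MachineBasicBlock::iterator II = MBB.begin(); II != MBB.end();)
      II = tryMergeAdjacentSTG(II, this, RS); // rewrite while offsets are known
}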

View File

@@ -259,10 +259,6 @@ bool PEI::runOnMachineFunction(MachineFunction &MF) {
for (auto &I : EntryDbgValues)
I.first->insert(I.first->begin(), I.second.begin(), I.second.end());
// Allow the target machine to make final modifications to the function
// before the frame layout is finalized.
TFI->processFunctionBeforeFrameIndicesReplaced(MF, RS);
// Replace all MO_FrameIndex operands with physical register references
// and actual offsets.
//

View File

@@ -349,38 +349,22 @@ bool AArch64ExpandPseudo::expandSetTagLoop(
MachineBasicBlock::iterator &NextMBBI) {
MachineInstr &MI = *MBBI;
DebugLoc DL = MI.getDebugLoc();
Register SizeReg = MI.getOperand(0).getReg();
Register AddressReg = MI.getOperand(1).getReg();
Register SizeReg = MI.getOperand(2).getReg();
Register AddressReg = MI.getOperand(3).getReg();
MachineFunction *MF = MBB.getParent();
bool ZeroData = MI.getOpcode() == AArch64::STZGloop;
const unsigned OpCode1 =
ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex;
const unsigned OpCode2 =
const unsigned OpCode =
ZeroData ? AArch64::STZ2GPostIndex : AArch64::ST2GPostIndex;
unsigned Size = MI.getOperand(2).getImm();
assert(Size > 0 && Size % 16 == 0);
if (Size % (16 * 2) != 0) {
BuildMI(MBB, MBBI, DL, TII->get(OpCode1), AddressReg)
.addReg(AddressReg)
.addReg(AddressReg)
.addImm(1);
Size -= 16;
}
MachineBasicBlock::iterator I =
BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm), SizeReg)
.addImm(Size);
expandMOVImm(MBB, I, 64);
auto LoopBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
MF->insert(++MBB.getIterator(), LoopBB);
MF->insert(++LoopBB->getIterator(), DoneBB);
BuildMI(LoopBB, DL, TII->get(OpCode2))
BuildMI(LoopBB, DL, TII->get(OpCode))
.addDef(AddressReg)
.addReg(AddressReg)
.addReg(AddressReg)

View File

@@ -170,11 +170,6 @@ static cl::opt<bool>
cl::desc("reverse the CSR restore sequence"),
cl::init(false), cl::Hidden);
static cl::opt<bool> StackTaggingMergeSetTag(
"stack-tagging-merge-settag",
cl::desc("merge settag instruction in function epilog"), cl::init(true),
cl::Hidden);
STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
/// This is the biggest offset to the stack pointer we can encode in aarch64
@@ -485,39 +480,6 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
return true;
}
bool AArch64FrameLowering::shouldCombineCSRLocalStackBumpInEpilogue(
MachineBasicBlock &MBB, unsigned StackBumpBytes) const {
if (!shouldCombineCSRLocalStackBump(*MBB.getParent(), StackBumpBytes))
return false;
if (MBB.empty())
return true;
// Disable combined SP bump if the last instruction is an MTE tag store. It
// is almost always better to merge SP adjustment into those instructions.
MachineBasicBlock::iterator LastI = MBB.getFirstTerminator();
MachineBasicBlock::iterator Begin = MBB.begin();
while (LastI != Begin) {
--LastI;
if (LastI->isTransient())
continue;
if (!LastI->getFlag(MachineInstr::FrameDestroy))
break;
}
switch (LastI->getOpcode()) {
case AArch64::STGloop:
case AArch64::STZGloop:
case AArch64::STGOffset:
case AArch64::STZGOffset:
case AArch64::ST2GOffset:
case AArch64::STZ2GOffset:
return false;
default:
return true;
}
llvm_unreachable("unreachable");
}
// Given a load or a store instruction, generate an appropriate unwinding SEH
// code on Windows.
static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI,
@@ -1497,7 +1459,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
// function.
if (MF.hasEHFunclets())
AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
bool CombineSPBump = shouldCombineCSRLocalStackBumpInEpilogue(MBB, NumBytes);
bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
// Assume we can't combine the last pop with the sp restore.
if (!CombineSPBump && PrologueSaveSize != 0) {
@@ -2675,399 +2637,9 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
.addImm(0);
}
namespace {
struct TagStoreInstr {
MachineInstr *MI;
int64_t Offset, Size;
explicit TagStoreInstr(MachineInstr *MI, int64_t Offset, int64_t Size)
: MI(MI), Offset(Offset), Size(Size) {}
};
class TagStoreEdit {
MachineFunction *MF;
MachineBasicBlock *MBB;
MachineRegisterInfo *MRI;
// Tag store instructions that are being replaced.
SmallVector<TagStoreInstr, 8> TagStores;
// Combined memref arguments of the above instructions.
SmallVector<MachineMemOperand *, 8> CombinedMemRefs;
// Replace allocation tags in [FrameReg + FrameRegOffset, FrameReg +
// FrameRegOffset + Size) with the address tag of SP.
Register FrameReg;
StackOffset FrameRegOffset;
int64_t Size;
// If not None, move FrameReg to (FrameReg + FrameRegUpdate) at the end.
Optional<int64_t> FrameRegUpdate;
// MIFlags for any FrameReg updating instructions.
unsigned FrameRegUpdateFlags;
// Use zeroing instruction variants.
bool ZeroData;
DebugLoc DL;
void emitUnrolled(MachineBasicBlock::iterator InsertI);
void emitLoop(MachineBasicBlock::iterator InsertI);
public:
TagStoreEdit(MachineBasicBlock *MBB, bool ZeroData)
: MBB(MBB), ZeroData(ZeroData) {
MF = MBB->getParent();
MRI = &MF->getRegInfo();
}
// Add an instruction to be replaced. Instructions must be added in the
// ascending order of Offset, and have to be adjacent.
void addInstruction(TagStoreInstr I) {
assert((TagStores.empty() ||
TagStores.back().Offset + TagStores.back().Size == I.Offset) &&
"Non-adjacent tag store instructions.");
TagStores.push_back(I);
}
void clear() { TagStores.clear(); }
// Emit equivalent code at the given location, and erase the current set of
// instructions. May skip if the replacement is not profitable. May invalidate
// the input iterator and replace it with a valid one.
void emitCode(MachineBasicBlock::iterator &InsertI,
const AArch64FrameLowering *TFI, bool IsLast);
};
void TagStoreEdit::emitUnrolled(MachineBasicBlock::iterator InsertI) {
const AArch64InstrInfo *TII =
MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
const int64_t kMinOffset = -256 * 16;
const int64_t kMaxOffset = 255 * 16;
Register BaseReg = FrameReg;
int64_t BaseRegOffsetBytes = FrameRegOffset.getBytes();
if (BaseRegOffsetBytes < kMinOffset ||
BaseRegOffsetBytes + (Size - Size % 32) > kMaxOffset) {
Register ScratchReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
emitFrameOffset(*MBB, InsertI, DL, ScratchReg, BaseReg,
{BaseRegOffsetBytes, MVT::i8}, TII);
BaseReg = ScratchReg;
BaseRegOffsetBytes = 0;
}
MachineInstr *LastI = nullptr;
while (Size) {
int64_t InstrSize = (Size > 16) ? 32 : 16;
unsigned Opcode =
InstrSize == 16
? (ZeroData ? AArch64::STZGOffset : AArch64::STGOffset)
: (ZeroData ? AArch64::STZ2GOffset : AArch64::ST2GOffset);
MachineInstr *I = BuildMI(*MBB, InsertI, DL, TII->get(Opcode))
.addReg(AArch64::SP)
.addReg(BaseReg)
.addImm(BaseRegOffsetBytes / 16)
.setMemRefs(CombinedMemRefs);
// A store to [BaseReg, #0] should go last for an opportunity to fold the
// final SP adjustment in the epilogue.
if (BaseRegOffsetBytes == 0)
LastI = I;
BaseRegOffsetBytes += InstrSize;
Size -= InstrSize;
}
if (LastI)
MBB->splice(InsertI, MBB, LastI);
}
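// Worked example (illustrative, not part of this commit): with Size = 48 and
// BaseRegOffsetBytes = 0, the loop above first emits ST2G at immediate #0
// (remembered in LastI), then STG at immediate #2 (immediates are scaled by
// 16), and the final splice moves the offset-0 store to the end so a later SP
// adjustment in the epilogue can be folded into it.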
void TagStoreEdit::emitLoop(MachineBasicBlock::iterator InsertI) {
const AArch64InstrInfo *TII =
MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
Register BaseReg = FrameRegUpdate
? FrameReg
: MRI->createVirtualRegister(&AArch64::GPR64RegClass);
Register SizeReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
emitFrameOffset(*MBB, InsertI, DL, BaseReg, FrameReg, FrameRegOffset, TII);
int64_t LoopSize = Size;
// If the loop size is not a multiple of 32, split off one 16-byte store at
// the end to fold BaseReg update into.
if (FrameRegUpdate && *FrameRegUpdate)
LoopSize -= LoopSize % 32;
MachineInstr *LoopI =
BuildMI(*MBB, InsertI, DL,
TII->get(ZeroData ? AArch64::STZGloop : AArch64::STGloop))
.addDef(SizeReg)
.addDef(BaseReg)
.addImm(LoopSize)
.addReg(BaseReg)
.setMemRefs(CombinedMemRefs);
if (FrameRegUpdate)
LoopI->setFlags(FrameRegUpdateFlags);
int64_t ExtraBaseRegUpdate =
FrameRegUpdate ? (*FrameRegUpdate - FrameRegOffset.getBytes() - Size) : 0;
if (LoopSize < Size) {
assert(FrameRegUpdate);
assert(Size - LoopSize == 16);
// Tag 16 more bytes at BaseReg and update BaseReg.
BuildMI(*MBB, InsertI, DL,
TII->get(ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex))
.addDef(BaseReg)
.addReg(BaseReg)
.addReg(BaseReg)
.addImm(1 + ExtraBaseRegUpdate / 16)
.setMemRefs(CombinedMemRefs)
.setMIFlags(FrameRegUpdateFlags);
} else if (ExtraBaseRegUpdate) {
// Update BaseReg.
BuildMI(
*MBB, InsertI, DL,
TII->get(ExtraBaseRegUpdate > 0 ? AArch64::ADDXri : AArch64::SUBXri))
.addDef(BaseReg)
.addReg(BaseReg)
.addImm(std::abs(ExtraBaseRegUpdate))
.addImm(0)
.setMIFlags(FrameRegUpdateFlags);
}
}
// Check if *II is a register update that can be merged into STGloop that ends
// at (Reg + Size). RemainingOffset is the required adjustment to Reg after the
// end of the loop.
bool canMergeRegUpdate(MachineBasicBlock::iterator II, unsigned Reg,
int64_t Size, int64_t *TotalOffset) {
MachineInstr &MI = *II;
if ((MI.getOpcode() == AArch64::ADDXri ||
MI.getOpcode() == AArch64::SUBXri) &&
MI.getOperand(0).getReg() == Reg && MI.getOperand(1).getReg() == Reg) {
unsigned Shift = AArch64_AM::getShiftValue(MI.getOperand(3).getImm());
int64_t Offset = MI.getOperand(2).getImm() << Shift;
if (MI.getOpcode() == AArch64::SUBXri)
Offset = -Offset;
int64_t AbsPostOffset = std::abs(Offset - Size);
const int64_t kMaxOffset =
0xFFF; // Max encoding for unshifted ADDXri / SUBXri
if (AbsPostOffset <= kMaxOffset && AbsPostOffset % 16 == 0) {
*TotalOffset = Offset;
return true;
}
}
return false;
}
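// Worked example (illustrative): if *II is 'ADDXri $sp, $sp, 48, 0' and the
// tagged region ends at Reg + 32 (Size == 32), then Offset = 48 and
// AbsPostOffset = |48 - 32| = 16, which is 16-aligned and fits the unshifted
// 12-bit immediate, so the update is mergeable and *TotalOffset = 48.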
void mergeMemRefs(const SmallVectorImpl<TagStoreInstr> &TSE,
SmallVectorImpl<MachineMemOperand *> &MemRefs) {
MemRefs.clear();
for (auto &TS : TSE) {
MachineInstr *MI = TS.MI;
// An instruction without memory operands may access anything. Be
// conservative and return an empty list.
if (MI->memoperands_empty()) {
MemRefs.clear();
return;
}
MemRefs.append(MI->memoperands_begin(), MI->memoperands_end());
}
}
void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI,
const AArch64FrameLowering *TFI, bool IsLast) {
if (TagStores.empty())
return;
TagStoreInstr &FirstTagStore = TagStores[0];
TagStoreInstr &LastTagStore = TagStores[TagStores.size() - 1];
Size = LastTagStore.Offset - FirstTagStore.Offset + LastTagStore.Size;
DL = TagStores[0].MI->getDebugLoc();
unsigned Reg;
FrameRegOffset = TFI->resolveFrameOffsetReference(
*MF, FirstTagStore.Offset, false /*isFixed*/, false /*isSVE*/, Reg,
/*PreferFP=*/false, /*ForSimm=*/true);
FrameReg = Reg;
FrameRegUpdate = None;
mergeMemRefs(TagStores, CombinedMemRefs);
LLVM_DEBUG(dbgs() << "Replacing adjacent STG instructions:\n";
for (const auto &Instr
: TagStores) { dbgs() << " " << *Instr.MI; });
// Size threshold where a loop becomes shorter than a linear sequence of
// tagging instructions.
const int kSetTagLoopThreshold = 176;
if (Size < kSetTagLoopThreshold) {
if (TagStores.size() < 2)
return;
emitUnrolled(InsertI);
} else {
MachineInstr *UpdateInstr = nullptr;
int64_t TotalOffset;
if (IsLast) {
// See if we can merge base register update into the STGloop.
// This is done in AArch64LoadStoreOptimizer for "normal" stores,
// but STGloop is way too unusual for that, and also it only
// realistically happens in function epilogue. Also, STGloop is expanded
// before that pass.
if (InsertI != MBB->end() &&
canMergeRegUpdate(InsertI, FrameReg, FrameRegOffset.getBytes() + Size,
&TotalOffset)) {
UpdateInstr = &*InsertI++;
LLVM_DEBUG(dbgs() << "Folding SP update into loop:\n "
<< *UpdateInstr);
}
}
if (!UpdateInstr && TagStores.size() < 2)
return;
if (UpdateInstr) {
FrameRegUpdate = TotalOffset;
FrameRegUpdateFlags = UpdateInstr->getFlags();
}
emitLoop(InsertI);
if (UpdateInstr)
UpdateInstr->eraseFromParent();
}
for (auto &TS : TagStores)
TS.MI->eraseFromParent();
}
bool isMergeableStackTaggingInstruction(MachineInstr &MI, int64_t &Offset,
int64_t &Size, bool &ZeroData) {
MachineFunction &MF = *MI.getParent()->getParent();
const MachineFrameInfo &MFI = MF.getFrameInfo();
unsigned Opcode = MI.getOpcode();
ZeroData = (Opcode == AArch64::STZGloop || Opcode == AArch64::STZGOffset ||
Opcode == AArch64::STZ2GOffset);
if (Opcode == AArch64::STGloop || Opcode == AArch64::STZGloop) {
if (!MI.getOperand(0).isDead() || !MI.getOperand(1).isDead())
return false;
if (!MI.getOperand(2).isImm() || !MI.getOperand(3).isFI())
return false;
Offset = MFI.getObjectOffset(MI.getOperand(3).getIndex());
Size = MI.getOperand(2).getImm();
return true;
}
if (Opcode == AArch64::STGOffset || Opcode == AArch64::STZGOffset)
Size = 16;
else if (Opcode == AArch64::ST2GOffset || Opcode == AArch64::STZ2GOffset)
Size = 32;
else
return false;
if (MI.getOperand(0).getReg() != AArch64::SP || !MI.getOperand(1).isFI())
return false;
Offset = MFI.getObjectOffset(MI.getOperand(1).getIndex()) +
16 * MI.getOperand(2).getImm();
return true;
}
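// Example (illustrative): for 'ST2GOffset $sp, %stack.1.b, 2', as in the MIR
// test further down, this returns Size = 32 and
// Offset = MFI.getObjectOffset(b) + 16 * 2.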
// Detect a run of memory tagging instructions for adjacent stack frame slots,
// and replace them with a shorter instruction sequence:
// * replace STG + STG with ST2G
// * replace STGloop + STGloop with STGloop
// This code needs to run when stack slot offsets are already known, but before
// FrameIndex operands in STG instructions are eliminated.
MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II,
const AArch64FrameLowering *TFI,
RegScavenger *RS) {
bool FirstZeroData;
int64_t Size, Offset;
MachineInstr &MI = *II;
MachineBasicBlock *MBB = MI.getParent();
MachineBasicBlock::iterator NextI = ++II;
if (&MI == &MBB->instr_back())
return II;
if (!isMergeableStackTaggingInstruction(MI, Offset, Size, FirstZeroData))
return II;
SmallVector<TagStoreInstr, 4> Instrs;
Instrs.emplace_back(&MI, Offset, Size);
constexpr int kScanLimit = 10;
int Count = 0;
for (MachineBasicBlock::iterator E = MBB->end();
NextI != E && Count < kScanLimit; ++NextI) {
MachineInstr &MI = *NextI;
bool ZeroData;
int64_t Size, Offset;
// Collect instructions that update memory tags with a FrameIndex operand
// and (when applicable) constant size, and whose output registers are dead
// (the latter is almost always the case in practice). Since these
// instructions effectively have no inputs or outputs, we are free to skip
// any non-aliasing instructions in between without tracking used registers.
if (isMergeableStackTaggingInstruction(MI, Offset, Size, ZeroData)) {
if (ZeroData != FirstZeroData)
break;
Instrs.emplace_back(&MI, Offset, Size);
continue;
}
// Only count non-transient, non-tagging instructions toward the scan
// limit.
if (!MI.isTransient())
++Count;
// Just in case, stop before the epilogue code starts.
if (MI.getFlag(MachineInstr::FrameSetup) ||
MI.getFlag(MachineInstr::FrameDestroy))
break;
// Reject anything that may alias the collected instructions.
if (MI.mayLoadOrStore() || MI.hasUnmodeledSideEffects())
break;
}
// New code will be inserted after the last tagging instruction we've found.
MachineBasicBlock::iterator InsertI = Instrs.back().MI;
InsertI++;
llvm::stable_sort(Instrs,
[](const TagStoreInstr &Left, const TagStoreInstr &Right) {
return Left.Offset < Right.Offset;
});
// Make sure that we don't have any overlapping stores.
int64_t CurOffset = Instrs[0].Offset;
for (auto &Instr : Instrs) {
if (CurOffset > Instr.Offset)
return NextI;
CurOffset = Instr.Offset + Instr.Size;
}
// Find contiguous runs of tagged memory and emit shorter instruction
// sequences for them when possible.
TagStoreEdit TSE(MBB, FirstZeroData);
Optional<int64_t> EndOffset;
for (auto &Instr : Instrs) {
if (EndOffset && *EndOffset != Instr.Offset) {
// Found a gap.
TSE.emitCode(InsertI, TFI, /*IsLast = */ false);
TSE.clear();
}
TSE.addInstruction(Instr);
EndOffset = Instr.Offset + Instr.Size;
}
TSE.emitCode(InsertI, TFI, /*IsLast = */ true);
return InsertI;
}
} // namespace
void AArch64FrameLowering::processFunctionBeforeFrameIndicesReplaced(
MachineFunction &MF, RegScavenger *RS = nullptr) const {
if (StackTaggingMergeSetTag)
for (auto &BB : MF)
for (MachineBasicBlock::iterator II = BB.begin(); II != BB.end();)
II = tryMergeAdjacentSTG(II, this, RS);
}
/// For Win64 AArch64 EH, the offset to the Unwind object is from the SP
/// before the update. This is easily retrieved as it is exactly the offset
/// that is set in processFunctionBeforeFrameFinalized.
/// For Win64 AArch64 EH, the offset to the Unwind object is from the SP before
/// the update. This is easily retrieved as it is exactly the offset that is set
/// in processFunctionBeforeFrameFinalized.
int AArch64FrameLowering::getFrameIndexReferencePreferSP(
const MachineFunction &MF, int FI, unsigned &FrameReg,
bool IgnoreSPUpdates) const {

View File

@@ -77,10 +77,6 @@ public:
void processFunctionBeforeFrameFinalized(MachineFunction &MF,
RegScavenger *RS) const override;
void
processFunctionBeforeFrameIndicesReplaced(MachineFunction &MF,
RegScavenger *RS) const override;
unsigned getWinEHParentFrameOffset(const MachineFunction &MF) const override;
unsigned getWinEHFuncletFrameSize(const MachineFunction &MF) const;
@@ -111,8 +107,6 @@ private:
int64_t assignSVEStackObjectOffsets(MachineFrameInfo &MF,
int &MinCSFrameIndex,
int &MaxCSFrameIndex) const;
bool shouldCombineCSRLocalStackBumpInEpilogue(MachineBasicBlock &MBB,
unsigned StackBumpBytes) const;
};
} // End llvm namespace

View File

@@ -3458,8 +3458,6 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
case AArch64::ST1Fourv1d:
case AArch64::IRG:
case AArch64::IRGstack:
case AArch64::STGloop:
case AArch64::STZGloop:
return AArch64FrameOffsetCannotUpdate;
}

View File

@@ -1514,17 +1514,17 @@ def TAGPstack
// register / expression for the tagged base pointer of the current function.
def : Pat<(int_aarch64_irg_sp i64:$Rm), (IRGstack SP, i64:$Rm)>;
// Large STG to be expanded into a loop. $sz is the size, $Rn is start address.
// $Rn_wback is one past the end of the range. $Rm is the loop counter.
// Large STG to be expanded into a loop. $Rm is the size, $Rn is start address.
// $Rn_wback is one past the end of the range.
let isCodeGenOnly=1, mayStore=1 in {
def STGloop
: Pseudo<(outs GPR64common:$Rm, GPR64sp:$Rn_wback), (ins i64imm:$sz, GPR64sp:$Rn),
[], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,@earlyclobber $Rm" >,
: Pseudo<(outs GPR64common:$Rm_wback, GPR64sp:$Rn_wback), (ins GPR64common:$Rm, GPR64sp:$Rn),
[], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,$Rm = $Rm_wback,@earlyclobber $Rm_wback" >,
Sched<[WriteAdr, WriteST]>;
def STZGloop
: Pseudo<(outs GPR64common:$Rm, GPR64sp:$Rn_wback), (ins i64imm:$sz, GPR64sp:$Rn),
[], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,@earlyclobber $Rm" >,
: Pseudo<(outs GPR64common:$Rm_wback, GPR64sp:$Rn_wback), (ins GPR64common:$Rm, GPR64sp:$Rn),
[], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,$Rm = $Rm_wback,@earlyclobber $Rm_wback" >,
Sched<[WriteAdr, WriteST]>;
}

View File

@@ -390,10 +390,6 @@ bool AArch64RegisterInfo::needsFrameBaseReg(MachineInstr *MI,
if (isFrameOffsetLegal(MI, AArch64::SP, Offset))
return false;
// If even offset 0 is illegal, we don't want a virtual base register.
if (!isFrameOffsetLegal(MI, AArch64::SP, 0))
return false;
// The offset likely isn't legal; we want to allocate a virtual base register.
return true;
}
@@ -449,17 +445,6 @@ void AArch64RegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
(void)Done;
}
static Register getScratchRegisterForInstruction(MachineInstr &MI) {
// ST*Gloop can only have #fi in op3, and they have a constraint that
// op1==op3. Since op1 is early-clobber, it may (and also must) be used as the
// scratch register.
if (MI.getOpcode() == AArch64::STGloop || MI.getOpcode() == AArch64::STZGloop)
return MI.getOperand(1).getReg();
else
return MI.getMF()->getRegInfo().createVirtualRegister(
&AArch64::GPR64RegClass);
}
void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int SPAdj, unsigned FIOperandNum,
RegScavenger *RS) const {
@@ -516,7 +501,8 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// in a scratch register.
Offset = TFI->resolveFrameIndexReference(
MF, FrameIndex, FrameReg, /*PreferFP=*/false, /*ForSimm=*/true);
Register ScratchReg = getScratchRegisterForInstruction(MI);
Register ScratchReg =
MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset,
TII);
BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AArch64::LDG), ScratchReg)
@@ -545,7 +531,8 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// If we get here, the immediate doesn't fit into the instruction. We folded
// as much as possible above. Handle the rest, providing a register that is
// SP+LargeImm.
Register ScratchReg = getScratchRegisterForInstruction(MI);
Register ScratchReg =
MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, TII);
MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true);
}

View File

@@ -125,13 +125,19 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForSetTag(
return EmitUnrolledSetTag(DAG, dl, Chain, Addr, ObjSize, BaseMemOperand,
ZeroData);
const EVT ResTys[] = {MVT::i64, MVT::i64, MVT::Other};
if (Addr.getOpcode() == ISD::FrameIndex) {
int FI = cast<FrameIndexSDNode>(Addr)->getIndex();
Addr = DAG.getTargetFrameIndex(FI, MVT::i64);
if (ObjSize % 32 != 0) {
SDNode *St1 = DAG.getMachineNode(
ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex, dl,
{MVT::i64, MVT::Other},
{Addr, Addr, DAG.getTargetConstant(1, dl, MVT::i64), Chain});
DAG.setNodeMemRefs(cast<MachineSDNode>(St1), {BaseMemOperand});
ObjSize -= 16;
Addr = SDValue(St1, 0);
Chain = SDValue(St1, 1);
}
SDValue Ops[] = {DAG.getTargetConstant(ObjSize, dl, MVT::i64), Addr, Chain};
const EVT ResTys[] = {MVT::i64, MVT::i64, MVT::Other};
SDValue Ops[] = {DAG.getConstant(ObjSize, dl, MVT::i64), Addr, Chain};
SDNode *St = DAG.getMachineNode(
ZeroData ? AArch64::STZGloop : AArch64::STGloop, dl, ResTys, Ops);

View File

@ -1,214 +0,0 @@
; RUN: llc < %s -mtriple=aarch64 -mattr=+mte | FileCheck %s
declare void @use(i8* %p)
declare void @llvm.aarch64.settag(i8* %p, i64 %a)
declare void @llvm.aarch64.settag.zero(i8* %p, i64 %a)
define void @stg16_16() {
entry:
; CHECK-LABEL: stg16_16:
; CHECK: st2g sp, [sp], #32
; CHECK: ret
%a = alloca i8, i32 16, align 16
%b = alloca i8, i32 16, align 16
call void @llvm.aarch64.settag(i8* %a, i64 16)
call void @llvm.aarch64.settag(i8* %b, i64 16)
ret void
}
define i32 @stg16_16_16_16_ret() {
entry:
; CHECK-LABEL: stg16_16_16_16_ret:
; CHECK: st2g sp, [sp, #32]
; CHECK: st2g sp, [sp], #64
; CHECK: mov w0, wzr
; CHECK: ret
%a = alloca i8, i32 16, align 16
%b = alloca i8, i32 16, align 16
%c = alloca i8, i32 16, align 16
%d = alloca i8, i32 16, align 16
call void @llvm.aarch64.settag(i8* %a, i64 16)
call void @llvm.aarch64.settag(i8* %b, i64 16)
call void @llvm.aarch64.settag(i8* %c, i64 16)
call void @llvm.aarch64.settag(i8* %d, i64 16)
ret i32 0
}
define void @stg16_16_16_16() {
entry:
; CHECK-LABEL: stg16_16_16_16:
; CHECK: st2g sp, [sp, #32]
; CHECK: st2g sp, [sp], #64
; CHECK: ret
%a = alloca i8, i32 16, align 16
%b = alloca i8, i32 16, align 16
%c = alloca i8, i32 16, align 16
%d = alloca i8, i32 16, align 16
call void @llvm.aarch64.settag(i8* %a, i64 16)
call void @llvm.aarch64.settag(i8* %b, i64 16)
call void @llvm.aarch64.settag(i8* %c, i64 16)
call void @llvm.aarch64.settag(i8* %d, i64 16)
ret void
}
define void @stg128_128_128_128() {
entry:
; CHECK-LABEL: stg128_128_128_128:
; CHECK: mov x8, #512
; CHECK: st2g sp, [sp], #32
; CHECK: sub x8, x8, #32
; CHECK: cbnz x8,
; CHECK: ret
%a = alloca i8, i32 128, align 16
%b = alloca i8, i32 128, align 16
%c = alloca i8, i32 128, align 16
%d = alloca i8, i32 128, align 16
call void @llvm.aarch64.settag(i8* %a, i64 128)
call void @llvm.aarch64.settag(i8* %b, i64 128)
call void @llvm.aarch64.settag(i8* %c, i64 128)
call void @llvm.aarch64.settag(i8* %d, i64 128)
ret void
}
define void @stg16_512_16() {
entry:
; CHECK-LABEL: stg16_512_16:
; CHECK: mov x8, #544
; CHECK: st2g sp, [sp], #32
; CHECK: sub x8, x8, #32
; CHECK: cbnz x8,
; CHECK: ret
%a = alloca i8, i32 16, align 16
%b = alloca i8, i32 512, align 16
%c = alloca i8, i32 16, align 16
call void @llvm.aarch64.settag(i8* %a, i64 16)
call void @llvm.aarch64.settag(i8* %b, i64 512)
call void @llvm.aarch64.settag(i8* %c, i64 16)
ret void
}
define void @stg512_512_512() {
entry:
; CHECK-LABEL: stg512_512_512:
; CHECK: mov x8, #1536
; CHECK: st2g sp, [sp], #32
; CHECK: sub x8, x8, #32
; CHECK: cbnz x8,
; CHECK: ret
%a = alloca i8, i32 512, align 16
%b = alloca i8, i32 512, align 16
%c = alloca i8, i32 512, align 16
call void @llvm.aarch64.settag(i8* %a, i64 512)
call void @llvm.aarch64.settag(i8* %b, i64 512)
call void @llvm.aarch64.settag(i8* %c, i64 512)
ret void
}
define void @early(i1 %flag) {
entry:
; CHECK-LABEL: early:
; CHECK: tbz w0, #0, [[LABEL:.LBB.*]]
; CHECK: st2g sp, [sp, #
; CHECK: st2g sp, [sp, #
; CHECK: st2g sp, [sp, #
; CHECK: [[LABEL]]:
; CHECK: stg sp, [sp, #
; CHECK: st2g sp, [sp], #
; CHECK: ret
%a = alloca i8, i32 48, align 16
%b = alloca i8, i32 48, align 16
%c = alloca i8, i32 48, align 16
br i1 %flag, label %if.then, label %if.end
if.then:
call void @llvm.aarch64.settag(i8* %a, i64 48)
call void @llvm.aarch64.settag(i8* %b, i64 48)
br label %if.end
if.end:
call void @llvm.aarch64.settag(i8* %c, i64 48)
ret void
}
define void @early_128_128(i1 %flag) {
entry:
; CHECK-LABEL: early_128_128:
; CHECK: tbz w0, #0, [[LABEL:.LBB.*]]
; CHECK: add x9, sp, #
; CHECK: mov x8, #256
; CHECK: st2g x9, [x9], #32
; CHECK: sub x8, x8, #32
; CHECK: cbnz x8,
; CHECK: [[LABEL]]:
; CHECK: stg sp, [sp, #
; CHECK: st2g sp, [sp], #
; CHECK: ret
%a = alloca i8, i32 128, align 16
%b = alloca i8, i32 128, align 16
%c = alloca i8, i32 48, align 16
br i1 %flag, label %if.then, label %if.end
if.then:
call void @llvm.aarch64.settag(i8* %a, i64 128)
call void @llvm.aarch64.settag(i8* %b, i64 128)
br label %if.end
if.end:
call void @llvm.aarch64.settag(i8* %c, i64 48)
ret void
}
define void @early_512_512(i1 %flag) {
entry:
; CHECK-LABEL: early_512_512:
; CHECK: tbz w0, #0, [[LABEL:.LBB.*]]
; CHECK: add x9, sp, #
; CHECK: mov x8, #1024
; CHECK: st2g x9, [x9], #32
; CHECK: sub x8, x8, #32
; CHECK: cbnz x8,
; CHECK: [[LABEL]]:
; CHECK: stg sp, [sp, #
; CHECK: st2g sp, [sp], #
; CHECK: ret
%a = alloca i8, i32 512, align 16
%b = alloca i8, i32 512, align 16
%c = alloca i8, i32 48, align 16
br i1 %flag, label %if.then, label %if.end
if.then:
call void @llvm.aarch64.settag(i8* %a, i64 512)
call void @llvm.aarch64.settag(i8* %b, i64 512)
br label %if.end
if.end:
call void @llvm.aarch64.settag(i8* %c, i64 48)
ret void
}
; Two loops of size 256; the second loop updates SP.
define void @stg128_128_gap_128_128() {
entry:
; CHECK-LABEL: stg128_128_gap_128_128:
; CHECK: mov x9, sp
; CHECK: mov x8, #256
; CHECK: st2g x9, [x9], #32
; CHECK: sub x8, x8, #32
; CHECK: cbnz x8,
; CHECK: mov x8, #256
; CHECK: st2g sp, [sp], #32
; CHECK: sub x8, x8, #32
; CHECK: cbnz x8,
; CHECK: ret
%a = alloca i8, i32 128, align 16
%a2 = alloca i8, i32 128, align 16
%b = alloca i8, i32 32, align 16
%c = alloca i8, i32 128, align 16
%c2 = alloca i8, i32 128, align 16
call void @use(i8* %b)
call void @llvm.aarch64.settag(i8* %a, i64 128)
call void @llvm.aarch64.settag(i8* %a2, i64 128)
call void @llvm.aarch64.settag(i8* %c, i64 128)
call void @llvm.aarch64.settag(i8* %c2, i64 128)
ret void
}

View File

@ -1,83 +0,0 @@
# RUN: llc -mtriple=aarch64 -mattr=+mte -run-pass=prologepilog %s -o - | FileCheck %s
--- |
declare void @llvm.aarch64.settag(i8* nocapture writeonly, i64) argmemonly nounwind writeonly "target-features"="+mte"
define i32 @stg16_16_16_16_ret() "target-features"="+mte" {
entry:
%a = alloca i8, i32 16, align 16
%b = alloca i8, i32 16, align 16
%c = alloca i8, i32 16, align 16
%d = alloca i8, i32 16, align 16
call void @llvm.aarch64.settag(i8* %a, i64 16)
call void @llvm.aarch64.settag(i8* %b, i64 16)
call void @llvm.aarch64.settag(i8* %c, i64 16)
call void @llvm.aarch64.settag(i8* %d, i64 16)
ret i32 0
}
define void @stg16_store_128() "target-features"="+mte" {
entry:
%a = alloca i8, i32 16, align 16
%b = alloca i8, i32 128, align 16
call void @llvm.aarch64.settag(i8* %a, i64 16)
store i8 42, i8* %a
call void @llvm.aarch64.settag(i8* %b, i64 128)
ret void
}
...
---
# A sequence of STG with a register copy in the middle.
# Can be merged into ST2G + ST2G.
# CHECK-LABEL: name:{{.*}}stg16_16_16_16_ret
# CHECK-DAG: ST2GOffset $sp, $sp, 2
# CHECK-DAG: ST2GOffset $sp, $sp, 0
# CHECK-DAG: $w0 = COPY $wzr
# CHECK-DAG: RET_ReallyLR implicit killed $w0
name: stg16_16_16_16_ret
tracksRegLiveness: true
stack:
- { id: 0, name: a, size: 16, alignment: 16 }
- { id: 1, name: b, size: 16, alignment: 16 }
- { id: 2, name: c, size: 16, alignment: 16 }
- { id: 3, name: d, size: 16, alignment: 16 }
body: |
bb.0.entry:
STGOffset $sp, %stack.0.a, 0 :: (store 16 into %ir.a)
STGOffset $sp, %stack.1.b, 0 :: (store 16 into %ir.b)
STGOffset $sp, %stack.2.c, 0 :: (store 16 into %ir.c)
$w0 = COPY $wzr
STGOffset $sp, %stack.3.d, 0 :: (store 16 into %ir.d)
RET_ReallyLR implicit killed $w0
...
---
# A store in the middle prevents merging.
# CHECK-LABEL: name:{{.*}}stg16_store_128
# CHECK: ST2GOffset $sp, $sp, 2
# CHECK: ST2GOffset $sp, $sp, 4
# CHECK: ST2GOffset $sp, $sp, 6
# CHECK: STGOffset $sp, $sp, 8
# CHECK: STRBBui
# CHECK: ST2GOffset $sp, $sp, 0
# CHECK: RET_ReallyLR
name: stg16_store_128
tracksRegLiveness: true
stack:
- { id: 0, name: a, size: 16, alignment: 16 }
- { id: 1, name: b, size: 128, alignment: 16 }
body: |
bb.0.entry:
STGOffset $sp, %stack.0.a, 0 :: (store 16 into %ir.a)
renamable $w8 = MOVi32imm 42
ST2GOffset $sp, %stack.1.b, 6 :: (store 32 into %ir.b + 96, align 16)
ST2GOffset $sp, %stack.1.b, 4 :: (store 32 into %ir.b + 64, align 16)
ST2GOffset $sp, %stack.1.b, 2 :: (store 32 into %ir.b + 32, align 16)
STRBBui killed renamable $w8, %stack.0.a, 0 :: (store 1 into %ir.a, align 16)
ST2GOffset $sp, %stack.1.b, 0 :: (store 32 into %ir.b, align 16)
RET_ReallyLR
...

View File

@@ -64,8 +64,8 @@ entry:
define void @stg17(i8* %p) {
entry:
; CHECK-LABEL: stg17:
; CHECK: stg x0, [x0], #16
; CHECK: mov {{(w|x)}}[[R:[0-9]+]], #256
; CHECK: stg x0, [x0], #16
; CHECK: st2g x0, [x0], #32
; CHECK: sub x[[R]], x[[R]], #32
; CHECK: cbnz x[[R]],
@@ -87,8 +87,8 @@ entry:
define void @stzg17(i8* %p) {
entry:
; CHECK-LABEL: stzg17:
; CHECK: stzg x0, [x0], #16
; CHECK: mov {{w|x}}[[R:[0-9]+]], #256
; CHECK: stzg x0, [x0], #16
; CHECK: stz2g x0, [x0], #32
; CHECK: sub x[[R]], x[[R]], #32
; CHECK: cbnz x[[R]],
@@ -110,10 +110,10 @@ entry:
define void @stg_alloca5() {
entry:
; CHECK-LABEL: stg_alloca5:
; CHECK: stg sp, [sp, #64]
; CHECK: st2g sp, [sp, #32]
; CHECK-NEXT: stg sp, [sp, #64]
; CHECK-NEXT: st2g sp, [sp], #80
; CHECK-NEXT: ret
; CHECK: st2g sp, [sp]
; CHECK: ret
%a = alloca i8, i32 80, align 16
call void @llvm.aarch64.settag(i8* %a, i64 80)
ret void
@@ -122,11 +122,12 @@ entry:
define void @stg_alloca17() {
entry:
; CHECK-LABEL: stg_alloca17:
; CHECK: mov [[P:x[0-9]+]], sp
; CHECK: stg [[P]], {{\[}}[[P]]{{\]}}, #16
; CHECK: mov {{w|x}}[[R:[0-9]+]], #256
; CHECK: st2g sp, [sp], #32
; CHECK: st2g [[P]], {{\[}}[[P]]{{\]}}, #32
; CHECK: sub x[[R]], x[[R]], #32
; CHECK: cbnz x[[R]],
; CHECK: stg sp, [sp], #16
; CHECK: ret
%a = alloca i8, i32 272, align 16
call void @llvm.aarch64.settag(i8* %a, i64 272)

View File

@@ -210,10 +210,11 @@ entry:
; DEFAULT: ldrb [[A:w.*]], [x{{.*}}]
; DEFAULT: ldrb [[B:w.*]], [x{{.*}}]
; ALWAYS-DAG: ldg [[PA:x.*]], [x{{.*}}]
; ALWAYS-DAG: ldrb [[B:w.*]], [sp]
; ALWAYS-DAG: ldrb [[A:w.*]], {{\[}}[[PA]]{{\]}}
; ALWAYS: ldg [[PA:x.*]], [x{{.*}}]
; ALWAYS: ldrb [[B:w.*]], [sp]
; ALWAYS: ldrb [[A:w.*]], {{\[}}[[PA]]{{\]}}
; COMMON: add w0, [[B]], [[A]]
; COMMON: ret
; One of these allocas is closer to FP than to SP, and within 256 bytes