[ARM] Improve VPT predicate tracking

The VPTBlock has been modified to track the 'global' state of the VPR, as well as the state for each block. Each object now just holds a list of instructions that make up the block, while static structures hold the predicate information. This enables global access for querying how both a VPT block and individual instructions are predicated. These changes now allow us, again, to handle more complicated cases where multiple instructions build a predicate and/or where the same predicate is used in multiple blocks. It doesn't, however, get us back to before the tracking was 'fixed' as some extra logic will be required to properly handle VPT instructions. Currently a VPT could be effectively predicated because of its inputs, but the existing logic will not detect that and so will refuse to perform the transformation. This can be seen in the remat-vctp.ll test where we still don't perform the transform. Differential Revision: https://reviews.llvm.org/D87681
2020-09-22 09:22:11 +01:00 · 2020-09-22 09:22:11 +01:00 · b4fa884a73
parent 73a6a164b8
commit b4fa884a73
5 changed files with 329 additions and 263 deletions
--- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
@ -166,62 +166,136 @@ namespace {
    }
  };

-  // Represent a VPT block, a list of instructions that begins with a VPT/VPST
-  // and has a maximum of four proceeding instructions. All instructions within
-  // the block are predicated upon the vpr and we allow instructions to define
-  // the vpr within in the block too.
-  class VPTBlock {
-    // The predicate then instruction, which is either a VPT, or a VPST
-    // instruction.
-    std::unique_ptr<PredicatedMI> PredicateThen;
-    PredicatedMI *Divergent = nullptr;
-    SmallVector<PredicatedMI, 4> Insts;
+  // Represent the current state of the VPR and hold all instances which
+  // represent a VPT block, which is a list of instructions that begins with a
+  // VPT/VPST and has a maximum of four proceeding instructions. All
+  // instructions within the block are predicated upon the vpr and we allow
+  // instructions to define the vpr within in the block too.
+  class VPTState {
+    friend struct LowOverheadLoop;
+
+    SmallVector<MachineInstr *, 4> Insts;
+
+    static SmallVector<VPTState, 4> Blocks;
+    static SetVector<MachineInstr *> CurrentPredicates;
+    static std::map<MachineInstr *,
+      std::unique_ptr<PredicatedMI>> PredicatedInsts;
+
+    static void CreateVPTBlock(MachineInstr *MI) {
+      assert(CurrentPredicates.size() && "Can't begin VPT without predicate");
+      Blocks.emplace_back(MI);
+      // The execution of MI is predicated upon the current set of instructions
+      // that are AND'ed together to form the VPR predicate value. In the case
+      // that MI is a VPT, CurrentPredicates will also just be MI.
+      PredicatedInsts.emplace(
+        MI, std::make_unique<PredicatedMI>(MI, CurrentPredicates));
+    }
+
+    static void reset() {
+      Blocks.clear();
+      PredicatedInsts.clear();
+      CurrentPredicates.clear();
+    }
+
+    static void addInst(MachineInstr *MI) {
+      Blocks.back().insert(MI);
+      PredicatedInsts.emplace(
+        MI, std::make_unique<PredicatedMI>(MI, CurrentPredicates));
+    }
+
+    static void addPredicate(MachineInstr *MI) {
+      LLVM_DEBUG(dbgs() << "ARM Loops: Adding VPT Predicate: " << *MI);
+      CurrentPredicates.insert(MI);
+    }
+
+    static void resetPredicate(MachineInstr *MI) {
+      LLVM_DEBUG(dbgs() << "ARM Loops: Resetting VPT Predicate: " << *MI);
+      CurrentPredicates.clear();
+      CurrentPredicates.insert(MI);
+    }

  public:
-    VPTBlock(MachineInstr *MI, SetVector<MachineInstr*> &Preds) {
-      PredicateThen = std::make_unique<PredicatedMI>(MI, Preds);
-    }
-
-    void addInst(MachineInstr *MI, SetVector<MachineInstr*> &Preds) {
-      LLVM_DEBUG(dbgs() << "ARM Loops: Adding predicated MI: " << *MI);
-      if (!Divergent && !set_difference(Preds, PredicateThen->Predicates).empty()) {
-        Divergent = &Insts.back();
-        LLVM_DEBUG(dbgs() << " - has divergent predicate: " << *Divergent->MI);
-      }
-      Insts.emplace_back(MI, Preds);
-      assert(Insts.size() <= 4 && "Too many instructions in VPT block!");
-    }
-
    // Have we found an instruction within the block which defines the vpr? If
    // so, not all the instructions in the block will have the same predicate.
-    bool HasNonUniformPredicate() const {
-      return Divergent != nullptr;
+    static bool hasUniformPredicate(VPTState &Block) {
+      return getDivergent(Block) == nullptr;
    }

-    // Is the given instruction part of the predicate set controlling the entry
-    // to the block.
-    bool IsPredicatedOn(MachineInstr *MI) const {
-      return PredicateThen->Predicates.count(MI);
+    // If it exists, return the first internal instruction which modifies the
+    // VPR.
+    static MachineInstr *getDivergent(VPTState &Block) {
+      SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
+      for (unsigned i = 1; i < Insts.size(); ++i) {
+        MachineInstr *Next = Insts[i];
+        if (isVectorPredicate(Next))
+          return Next; // Found an instruction altering the vpr.
+      }
+      return nullptr;
    }

-    // Returns true if this is a VPT instruction.
-    bool isVPT() const { return !isVPST(); }
-
-    // Returns true if this is a VPST instruction.
-    bool isVPST() const {
-      return PredicateThen->MI->getOpcode() == ARM::MVE_VPST;
+    // Return whether the given instruction is predicated upon a VCTP.
+    static bool isPredicatedOnVCTP(MachineInstr *MI, bool Exclusive = false) {
+      SetVector<MachineInstr *> &Predicates = PredicatedInsts[MI]->Predicates;
+      if (Exclusive && Predicates.size() != 1)
+        return false;
+      for (auto *PredMI : Predicates)
+        if (isVCTP(PredMI))
+          return true;
+      return false;
    }

-    // Is the given instruction the only predicate which controls the entry to
-    // the block.
-    bool IsOnlyPredicatedOn(MachineInstr *MI) const {
-      return IsPredicatedOn(MI) && PredicateThen->Predicates.size() == 1;
+    // Is the VPST, controlling the block entry, predicated upon a VCTP.
+    static bool isEntryPredicatedOnVCTP(VPTState &Block,
+                                        bool Exclusive = false) {
+      SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
+      return isPredicatedOnVCTP(Insts.front(), Exclusive);
+    }
+
+    static bool isValid() {
+      // All predication within the loop should be based on vctp. If the block
+      // isn't predicated on entry, check whether the vctp is within the block
+      // and that all other instructions are then predicated on it.
+      for (auto &Block : Blocks) {
+        if (isEntryPredicatedOnVCTP(Block))
+          continue;
+
+        SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
+        for (auto *MI : Insts) {
+          // Check that any internal VCTPs are 'Then' predicated.
+          if (isVCTP(MI) && getVPTInstrPredicate(*MI) != ARMVCC::Then)
+            return false;
+          // Skip other instructions that build up the predicate.
+          if (MI->getOpcode() == ARM::MVE_VPST || isVectorPredicate(MI))
+            continue;
+          // Check that any other instructions are predicated upon a vctp.
+          // TODO: We could infer when VPTs are implicitly predicated on the
+          // vctp (when the operands are predicated).
+          if (!isPredicatedOnVCTP(MI)) {
+            LLVM_DEBUG(dbgs() << "ARM Loops: Can't convert: " << *MI);
+            return false;
+          }
+        }
+      }
+      return true;
+    }
+
+    VPTState(MachineInstr *MI) { Insts.push_back(MI); }
+
+    void insert(MachineInstr *MI) {
+      Insts.push_back(MI);
+      // VPT/VPST + 4 predicated instructions.
+      assert(Insts.size() <= 5 && "Too many instructions in VPT block!");
+    }
+
+    bool containsVCTP() const {
+      for (auto *MI : Insts)
+        if (isVCTP(MI))
+          return true;
+      return false;
    }

    unsigned size() const { return Insts.size(); }
-    SmallVectorImpl<PredicatedMI> &getInsts() { return Insts; }
-    MachineInstr *getPredicateThen() const { return PredicateThen->MI; }
-    PredicatedMI *getDivergent() const { return Divergent; }
+    SmallVectorImpl<MachineInstr *> &getInsts() { return Insts; }
  };

  struct LowOverheadLoop {
@ -237,12 +311,8 @@ namespace {
    MachineInstr *Start = nullptr;
    MachineInstr *Dec = nullptr;
    MachineInstr *End = nullptr;
-    MachineInstr *VCTP = nullptr;
    MachineOperand TPNumElements;
-    SmallPtrSet<MachineInstr*, 4> SecondaryVCTPs;
-    VPTBlock *CurrentBlock = nullptr;
-    SetVector<MachineInstr*> CurrentPredicate;
-    SmallVector<VPTBlock, 4> VPTBlocks;
+    SmallVector<MachineInstr*, 4> VCTPs;
    SmallPtrSet<MachineInstr*, 4> ToRemove;
    SmallPtrSet<MachineInstr*, 4> BlockMasksToRecompute;
    bool Revert = false;
@ -258,6 +328,7 @@ namespace {
        Preheader = MBB;
      else if (auto *MBB = MLI.findLoopPreheader(&ML, true))
        Preheader = MBB;
+      VPTState::reset();
    }

    // If this is an MVE instruction, check that we know how to use tail
@ -272,10 +343,14 @@ namespace {
    bool IsTailPredicationLegal() const {
      // For now, let's keep things really simple and only support a single
      // block for tail predication.
-      return !Revert && FoundAllComponents() && VCTP &&
+      return !Revert && FoundAllComponents() && !VCTPs.empty() &&
             !CannotTailPredicate && ML.getNumBlocks() == 1;
    }

+    // Given that MI is a VCTP, check that is equivalent to any other VCTPs
+    // found.
+    bool AddVCTP(MachineInstr *MI);
+
    // Check that the predication in the loop will be equivalent once we
    // perform the conversion. Also ensure that we can provide the number
    // of elements to the loop start instruction.
@ -298,7 +373,9 @@ namespace {
      return Start && Dec && End;
    }

-    SmallVectorImpl<VPTBlock> &getVPTBlocks() { return VPTBlocks; }
+    SmallVectorImpl<VPTState> &getVPTBlocks() {
+      return VPTState::Blocks;
+    }

    // Return the operand for the loop start instruction. This will be the loop
    // iteration count, or the number of elements if we're tail predicating.
@ -311,14 +388,18 @@ namespace {
      if (!IsTailPredicationLegal())
        return IsDo ? ARM::t2DLS : ARM::t2WLS;

-      return VCTPOpcodeToLSTP(VCTP->getOpcode(), IsDo);
+      return VCTPOpcodeToLSTP(VCTPs.back()->getOpcode(), IsDo);
    }

    void dump() const {
      if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start;
      if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec;
      if (End) dbgs() << "ARM Loops: Found Loop End: " << *End;
-      if (VCTP) dbgs() << "ARM Loops: Found VCTP: " << *VCTP;
+      if (!VCTPs.empty()) {
+        dbgs() << "ARM Loops: Found VCTP(s):\n";
+        for (auto *MI : VCTPs)
+          dbgs() << " - " << *MI;
+      }
      if (!FoundAllComponents())
        dbgs() << "ARM Loops: Not a low-overhead loop.\n";
      else if (!(Start && Dec && End))
@ -382,6 +463,11 @@ namespace {

 char ARMLowOverheadLoops::ID = 0;

+SmallVector<VPTState, 4> VPTState::Blocks;
+SetVector<MachineInstr *> VPTState::CurrentPredicates;
+std::map<MachineInstr *,
+         std::unique_ptr<PredicatedMI>> VPTState::PredicatedInsts;
+
 INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME,
                false, false)

@ -419,38 +505,10 @@ MachineInstr *LowOverheadLoop::isSafeToDefineLR() {
 }

 bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt) {
-  assert(VCTP && "VCTP instruction expected but is not set");
-  // All predication within the loop should be based on vctp. If the block
-  // isn't predicated on entry, check whether the vctp is within the block
-  // and that all other instructions are then predicated on it.
-  for (auto &Block : VPTBlocks) {
-    if (Block.IsPredicatedOn(VCTP))
-      continue;
-    if (Block.HasNonUniformPredicate() && !isVCTP(Block.getDivergent()->MI)) {
-      LLVM_DEBUG(dbgs() << "ARM Loops: Found unsupported diverging predicate: "
-                        << *Block.getDivergent()->MI);
-      return false;
-    }
-    SmallVectorImpl<PredicatedMI> &Insts = Block.getInsts();
-    for (auto &PredMI : Insts) {
-      // Check the instructions in the block and only allow:
-      //   - VCTPs
-      //   - Instructions predicated on the main VCTP
-      //   - Any VCMP
-      //      - VCMPs just "and" their result with VPR.P0. Whether they are
-      //      located before/after the VCTP is irrelevant - the end result will
-      //      be the same in both cases, so there's no point in requiring them
-      //      to be located after the VCTP!
-      if (PredMI.Predicates.count(VCTP) || isVCTP(PredMI.MI) ||
-          VCMPOpcodeToVPT(PredMI.MI->getOpcode()) != 0)
-        continue;
-      LLVM_DEBUG(dbgs() << "ARM Loops: Can't convert: " << *PredMI.MI
-                 << " - which is predicated on:\n";
-                 for (auto *MI : PredMI.Predicates)
-                   dbgs() << "   - " << *MI);
-      return false;
-    }
-  }
+  assert(!VCTPs.empty() && "VCTP instruction expected but is not set");
+
+  if (!VPTState::isValid())
+    return false;

  if (!ValidateLiveOuts()) {
    LLVM_DEBUG(dbgs() << "ARM Loops: Invalid live outs.\n");
@ -461,6 +519,7 @@ bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt) {
  // of the iteration count, to the loop start instruction. The number of
  // elements is provided to the vctp instruction, so we need to check that
  // we can use this register at InsertPt.
+  MachineInstr *VCTP = VCTPs.back();
  TPNumElements = VCTP->getOperand(1);
  Register NumElements = TPNumElements.getReg();

@ -557,10 +616,10 @@ bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt) {
  if (auto *Def = RDA.getUniqueReachingMIDef(&MBB->back(),
                                             VCTP->getOperand(1).getReg())) {
    SmallPtrSet<MachineInstr*, 2> ElementChain;
-    SmallPtrSet<MachineInstr*, 2> Ignore = { VCTP };
+    SmallPtrSet<MachineInstr*, 2> Ignore;
    unsigned ExpectedVectorWidth = getTailPredVectorWidth(VCTP->getOpcode());

-    Ignore.insert(SecondaryVCTPs.begin(), SecondaryVCTPs.end());
+    Ignore.insert(VCTPs.begin(), VCTPs.end());

    if (RDA.isSafeToRemove(Def, ElementChain, Ignore)) {
      bool FoundSub = false;
@ -853,7 +912,7 @@ void LowOverheadLoop::CheckLegality(ARMBasicBlockUtils *BBUtils) {
    LLVM_DEBUG(dbgs() << "ARM Loops: Start insertion point: " << *InsertPt);

  if (!IsTailPredicationLegal()) {
-    LLVM_DEBUG(if (!VCTP)
+    LLVM_DEBUG(if (VCTPs.empty())
                 dbgs() << "ARM Loops: Didn't find a VCTP instruction.\n";
               dbgs() << "ARM Loops: Tail-predication is not valid.\n");
    return;
@ -866,6 +925,26 @@ void LowOverheadLoop::CheckLegality(ARMBasicBlockUtils *BBUtils) {
             dbgs() << "ARM Loops: Couldn't validate tail predicate.\n");
 }

+bool LowOverheadLoop::AddVCTP(MachineInstr *MI) {
+  LLVM_DEBUG(dbgs() << "ARM Loops: Adding VCTP: " << *MI);
+  if (VCTPs.empty()) {
+    VCTPs.push_back(MI);
+    return true;
+  }
+
+  // If we find another VCTP, check whether it uses the same value as the main VCTP.
+  // If it does, store it in the VCTPs set, else refuse it.
+  MachineInstr *Prev = VCTPs.back();
+  if (!Prev->getOperand(1).isIdenticalTo(MI->getOperand(1)) ||
+      !RDA.hasSameReachingDef(Prev, MI, MI->getOperand(1).getReg())) {
+    LLVM_DEBUG(dbgs() << "ARM Loops: Found VCTP with a different reaching "
+                         "definition from the main VCTP");
+    return false;
+  }
+  VCTPs.push_back(MI);
+  return true;
+}
+
 bool LowOverheadLoop::ValidateMVEInst(MachineInstr* MI) {
  if (CannotTailPredicate)
    return false;
@ -886,75 +965,28 @@ bool LowOverheadLoop::ValidateMVEInst(MachineInstr* MI) {
    return false;
  }

-  if (isVCTP(MI)) {
-    // If we find another VCTP, check whether it uses the same value as the main VCTP.
-    // If it does, store it in the SecondaryVCTPs set, else refuse it.
-    if (VCTP) {
-      if (!VCTP->getOperand(1).isIdenticalTo(MI->getOperand(1)) ||
-          !RDA.hasSameReachingDef(VCTP, MI, MI->getOperand(1).getReg())) {
-        LLVM_DEBUG(dbgs() << "ARM Loops: Found VCTP with a different reaching "
-                             "definition from the main VCTP");
-        return false;
-      }
-      LLVM_DEBUG(dbgs() << "ARM Loops: Found secondary VCTP: " << *MI);
-      SecondaryVCTPs.insert(MI);
-    } else {
-      LLVM_DEBUG(dbgs() << "ARM Loops: Found 'main' VCTP: " << *MI);
-      VCTP = MI;
-    }
-  } else if (isVPTOpcode(MI->getOpcode())) {
-    if (MI->getOpcode() != ARM::MVE_VPST) {
-      assert(MI->findRegisterDefOperandIdx(ARM::VPR) != -1 &&
-             "VPT does not implicitly define VPR?!");
-      CurrentPredicate.clear();
-      CurrentPredicate.insert(MI);
-    }
-
-    VPTBlocks.emplace_back(MI, CurrentPredicate);
-    CurrentBlock = &VPTBlocks.back();
-    return true;
-  }
+  // Record all VCTPs and check that they're equivalent to one another.
+  if (isVCTP(MI) && !AddVCTP(MI))
+    return false;

  // Inspect uses first so that any instructions that alter the VPR don't
  // alter the predicate upon themselves.
  const MCInstrDesc &MCID = MI->getDesc();
  bool IsUse = false;
-  bool IsDef = false;
  for (int i = MI->getNumOperands() - 1; i >= 0; --i) {
    const MachineOperand &MO = MI->getOperand(i);
-    if (!MO.isReg() || MO.getReg() != ARM::VPR)
+    if (!MO.isReg() || !MO.isUse() || MO.getReg() != ARM::VPR)
      continue;

-    if (MO.isDef()) {
-      CurrentPredicate.insert(MI);
-      IsDef = true;
-    } else if (ARM::isVpred(MCID.OpInfo[i].OperandType)) {
-      CurrentBlock->addInst(MI, CurrentPredicate);
+    if (ARM::isVpred(MCID.OpInfo[i].OperandType)) {
+      VPTState::addInst(MI);
      IsUse = true;
-    } else {
+    } else if (MI->getOpcode() != ARM::MVE_VPST) {
      LLVM_DEBUG(dbgs() << "ARM Loops: Found instruction using vpr: " << *MI);
      return false;
    }
  }

-  // If this instruction defines the VPR, update the predicate for the
-  // proceeding instructions.
-  if (IsDef) {
-    // Clear the existing predicate when we're not in VPT Active state.
-    if (!isVectorPredicated(MI))
-      CurrentPredicate.clear();
-    CurrentPredicate.insert(MI);
-    LLVM_DEBUG(dbgs() << "ARM Loops: Adding Predicate: " << *MI);
-  }
-
-  // If we find a vpr def that is not already predicated on the vctp, we've
-  // got disjoint predicates that may not be equivalent when we do the
-  // conversion.
-  if (IsDef && !IsUse && VCTP && !isVCTP(MI)) {
-    LLVM_DEBUG(dbgs() << "ARM Loops: Found disjoint vpr def: " << *MI);
-    return false;
-  }
-
  // If we find an instruction that has been marked as not valid for tail
  // predication, only allow the instruction if it's contained within a valid
  // VPT block.
@ -968,7 +1000,26 @@ bool LowOverheadLoop::ValidateMVEInst(MachineInstr* MI) {

  // If the instruction is already explicitly predicated, then the conversion
  // will be fine, but ensure that all store operations are predicated.
-  return !IsUse && MI->mayStore() ? false : true;
+  if (MI->mayStore())
+    return IsUse;
+
+  // If this instruction defines the VPR, update the predicate for the
+  // proceeding instructions.
+  if (isVectorPredicate(MI)) {
+    // Clear the existing predicate when we're not in VPT Active state,
+    // otherwise we add to it.
+    if (!isVectorPredicated(MI))
+      VPTState::resetPredicate(MI);
+    else
+      VPTState::addPredicate(MI);
+  }
+
+  // Finally once the predicate has been modified, we can start a new VPT
+  // block if necessary.
+  if (isVPTOpcode(MI->getOpcode()))
+    VPTState::CreateVPTBlock(MI);
+
+  return true;
 }

 bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) {
@ -1301,23 +1352,20 @@ void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
      llvm_unreachable("trying to unpredicate a non-predicated instruction");
  };

-  // There are a few scenarios which we have to fix up:
-  // 1. VPT Blocks with non-uniform predicates:
-  //    - a. When the divergent instruction is a vctp
-  //    - b. When the block uses a vpst, and is only predicated on the vctp
-  //    - c. When the block uses a vpt and (optionally) contains one or more
-  //         vctp.
-  // 2. VPT Blocks with uniform predicates:
-  //    - a. The block uses a vpst, and is only predicated on the vctp
  for (auto &Block : LoLoop.getVPTBlocks()) {
-    SmallVectorImpl<PredicatedMI> &Insts = Block.getInsts();
-    if (Block.HasNonUniformPredicate()) {
-      PredicatedMI *Divergent = Block.getDivergent();
-      if (isVCTP(Divergent->MI)) {
-        // The vctp will be removed, so the block mask of the vp(s)t will need
-        // to be recomputed.
-        LoLoop.BlockMasksToRecompute.insert(Block.getPredicateThen());
-      } else if (Block.isVPST() && Block.IsOnlyPredicatedOn(LoLoop.VCTP)) {
+    SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
+
+    if (VPTState::isEntryPredicatedOnVCTP(Block, /*exclusive*/true)) {
+      if (VPTState::hasUniformPredicate(Block)) {
+        // A vpt block starting with VPST, is only predicated upon vctp and has no
+        // internal vpr defs:
+        // - Remove vpst.
+        // - Unpredicate the remaining instructions.
+        LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Insts.front());
+        LoLoop.ToRemove.insert(Insts.front());
+        for (unsigned i = 1; i < Insts.size(); ++i)
+          RemovePredicate(Insts[i]);
+      } else {
        // The VPT block has a non-uniform predicate but it uses a vpst and its
        // entry is guarded only by a vctp, which means we:
        // - Need to remove the original vpst.
@ -1327,28 +1375,28 @@ void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
        //   the divergent vpr def.
        // TODO: We could be producing more VPT blocks than necessary and could
        // fold the newly created one into a proceeding one.
-        for (auto I = ++MachineBasicBlock::iterator(Block.getPredicateThen()),
-             E = ++MachineBasicBlock::iterator(Divergent->MI); I != E; ++I)
+        MachineInstr *Divergent = VPTState::getDivergent(Block);
+        for (auto I = ++MachineBasicBlock::iterator(Insts.front()),
+             E = ++MachineBasicBlock::iterator(Divergent); I != E; ++I)
          RemovePredicate(&*I);

        // Check if the instruction defining vpr is a vcmp so it can be combined
        // with the VPST This should be the divergent instruction
-        MachineInstr *VCMP = VCMPOpcodeToVPT(Divergent->MI->getOpcode()) != 0
-                                 ? Divergent->MI
-                                 : nullptr;
+        MachineInstr *VCMP = VCMPOpcodeToVPT(Divergent->getOpcode()) != 0
+          ? Divergent
+          : nullptr;

        unsigned Size = 0;
-        auto E = MachineBasicBlock::reverse_iterator(Divergent->MI);
-        auto I = MachineBasicBlock::reverse_iterator(Insts.back().MI);
+        auto E = MachineBasicBlock::reverse_iterator(Divergent);
+        auto I = MachineBasicBlock::reverse_iterator(Insts.back());
        MachineInstr *InsertAt = nullptr;
        while (I != E) {
          InsertAt = &*I;
          ++Size;
          ++I;
        }
+
        MachineInstrBuilder MIB;
-        LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: "
-                          << *Block.getPredicateThen());
        if (VCMP) {
          // Combine the VPST and VCMP into a VPT
          MIB =
@ -1372,51 +1420,18 @@ void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
          MIB.addImm(0);
          LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB);
        }
-        LoLoop.ToRemove.insert(Block.getPredicateThen());
+        LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Insts.front());
+        LoLoop.ToRemove.insert(Insts.front());
        LoLoop.BlockMasksToRecompute.insert(MIB.getInstr());
      }
-      // Else, if the block uses a vpt, iterate over the block, removing the
-      // extra VCTPs it may contain.
-      else if (Block.isVPT()) {
-        bool RemovedVCTP = false;
-        for (PredicatedMI &Elt : Block.getInsts()) {
-          MachineInstr *MI = Elt.MI;
-          if (isVCTP(MI)) {
-            LLVM_DEBUG(dbgs() << "ARM Loops: Removing VCTP: " << *MI);
-            LoLoop.ToRemove.insert(MI);
-            RemovedVCTP = true;
-            continue;
-          }
-        }
-        if (RemovedVCTP)
-          LoLoop.BlockMasksToRecompute.insert(Block.getPredicateThen());
-      }
-    } else if (Block.IsOnlyPredicatedOn(LoLoop.VCTP) && Block.isVPST()) {
-      // A vpt block starting with VPST, is only predicated upon vctp and has no
-      // internal vpr defs:
-      // - Remove vpst.
-      // - Unpredicate the remaining instructions.
-      LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getPredicateThen());
-      LoLoop.ToRemove.insert(Block.getPredicateThen());
-      for (auto &PredMI : Insts)
-        RemovePredicate(PredMI.MI);
-    }
-  }
-  LLVM_DEBUG(dbgs() << "ARM Loops: Removing remaining VCTPs...\n");
-  // Remove the "main" VCTP
-  LoLoop.ToRemove.insert(LoLoop.VCTP);
-  LLVM_DEBUG(dbgs() << "    " << *LoLoop.VCTP);
-  // Remove remaining secondary VCTPs
-  for (MachineInstr *VCTP : LoLoop.SecondaryVCTPs) {
-    // All VCTPs that aren't marked for removal yet should be unpredicated ones.
-    // The predicated ones should have already been marked for removal when
-    // visiting the VPT blocks.
-    if (LoLoop.ToRemove.insert(VCTP).second) {
-      assert(getVPTInstrPredicate(*VCTP) == ARMVCC::None &&
-             "Removing Predicated VCTP without updating the block mask!");
-      LLVM_DEBUG(dbgs() << "    " << *VCTP);
+    } else if (Block.containsVCTP()) {
+      // The vctp will be removed, so the block mask of the vp(s)t will need
+      // to be recomputed.
+      LoLoop.BlockMasksToRecompute.insert(Insts.front());
    }
  }
+
+  LoLoop.ToRemove.insert(LoLoop.VCTPs.begin(), LoLoop.VCTPs.end());
 }

 void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) {
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
@ -464,28 +464,19 @@ define dso_local arm_aapcs_vfpcc void @range_test(i32* noalias nocapture %arg, i
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB5_1: @ %bb4
-; CHECK-NEXT:    add.w r12, r3, #3
-; CHECK-NEXT:    mov.w lr, #1
-; CHECK-NEXT:    bic r12, r12, #3
-; CHECK-NEXT:    sub.w r12, r12, #4
-; CHECK-NEXT:    add.w lr, lr, r12, lsr #2
 ; CHECK-NEXT:    mov.w r12, #0
-; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB5_2: @ %bb12
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vctp.32 r3
-; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vldrwt.u32 q0, [r0]
-; CHECK-NEXT:    vpttt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vptt.i32 ne, q0, zr
 ; CHECK-NEXT:    vcmpt.s32 le, q0, r2
-; CHECK-NEXT:    vctpt.32 r3
 ; CHECK-NEXT:    vldrwt.u32 q1, [r1], #16
 ; CHECK-NEXT:    add.w r12, r12, #4
-; CHECK-NEXT:    subs r3, #4
 ; CHECK-NEXT:    vmul.i32 q0, q1, q0
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vstrwt.32 q0, [r0], #16
-; CHECK-NEXT:    le lr, .LBB5_2
+; CHECK-NEXT:    letp lr, .LBB5_2
 ; CHECK-NEXT:  @ %bb.3: @ %bb32
 ; CHECK-NEXT:    pop {r7, pc}
 bb:
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tp-multiple-vpst.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tp-multiple-vpst.ll
@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -tail-predication=enabled %s -o - | FileCheck %s
+
+define dso_local arm_aapcs_vfpcc i32 @minmaxval4(i32* nocapture readonly %x, i32* nocapture %minp) {
+; CHECK-LABEL: minmaxval4:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    adr r3, .LCPI0_0
+; CHECK-NEXT:    mov.w lr, #3
+; CHECK-NEXT:    vldrw.u32 q2, [r3]
+; CHECK-NEXT:    vmov.i32 q0, #0x80000000
+; CHECK-NEXT:    vmvn.i32 q1, #0x80000000
+; CHECK-NEXT:    movs r2, #0
+; CHECK-NEXT:    vmov.i32 q3, #0xa
+; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:  .LBB0_1: @ %vector.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vadd.i32 q4, q2, r2
+; CHECK-NEXT:    vdup.32 q5, r2
+; CHECK-NEXT:    vcmp.u32 hi, q5, q4
+; CHECK-NEXT:    adds r2, #4
+; CHECK-NEXT:    vpnot
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vcmpt.u32 hi, q3, q4
+; CHECK-NEXT:    vstr p0, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrwt.u32 q4, [r0], #16
+; CHECK-NEXT:    vldr p0, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vcmpt.s32 gt, q4, q0
+; CHECK-NEXT:    vpsel q0, q4, q0
+; CHECK-NEXT:    vldr p0, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vcmpt.s32 gt, q1, q4
+; CHECK-NEXT:    vpsel q1, q4, q1
+; CHECK-NEXT:    le lr, .LBB0_1
+; CHECK-NEXT:  @ %bb.2: @ %middle.block
+; CHECK-NEXT:    mvn r0, #-2147483648
+; CHECK-NEXT:    vminv.s32 r0, q1
+; CHECK-NEXT:    str r0, [r1]
+; CHECK-NEXT:    mov.w r0, #-2147483648
+; CHECK-NEXT:    vmaxv.s32 r0, q0
+; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  @ %bb.3:
+; CHECK-NEXT:  .LCPI0_0:
+; CHECK-NEXT:    .long 0 @ 0x0
+; CHECK-NEXT:    .long 1 @ 0x1
+; CHECK-NEXT:    .long 2 @ 0x2
+; CHECK-NEXT:    .long 3 @ 0x3
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
+  %vec.phi = phi <4 x i32> [ <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>, %entry ], [ %5, %vector.body ]
+  %vec.phi29 = phi <4 x i32> [ <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>, %entry ], [ %7, %vector.body ]
+  %0 = getelementptr inbounds i32, i32* %x, i32 %index
+  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 10)
+  %1 = bitcast i32* %0 to <4 x i32>*
+  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
+  %2 = icmp sgt <4 x i32> %wide.masked.load, %vec.phi29
+  %3 = icmp slt <4 x i32> %wide.masked.load, %vec.phi
+  %4 = and <4 x i1> %active.lane.mask, %3
+  %5 = select <4 x i1> %4, <4 x i32> %wide.masked.load, <4 x i32> %vec.phi
+  %6 = and <4 x i1> %active.lane.mask, %2
+  %7 = select <4 x i1> %6, <4 x i32> %wide.masked.load, <4 x i32> %vec.phi29
+  %index.next = add i32 %index, 4
+  %8 = icmp eq i32 %index.next, 12
+  br i1 %8, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %9 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %7)
+  %10 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %5)
+  store i32 %10, i32* %minp, align 4
+  ret i32 %9
+}
+
+declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) #1
+declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #2
+declare i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32>) #3
+declare i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32>) #3
+
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir
@ -118,32 +118,23 @@ body:             |
  ; CHECK: bb.1.bb3:
  ; CHECK:   successors: %bb.2(0x80000000)
  ; CHECK:   liveins: $r0, $r1, $r2, $r3
-  ; CHECK:   renamable $r12 = t2ADDri renamable $r2, 3, 14 /* CC::al */, $noreg, $noreg
-  ; CHECK:   renamable $lr = t2MOVi 1, 14 /* CC::al */, $noreg, $noreg
-  ; CHECK:   renamable $r12 = t2BICri killed renamable $r12, 3, 14 /* CC::al */, $noreg, $noreg
  ; CHECK:   $vpr = VMSR_P0 killed $r3, 14 /* CC::al */, $noreg
-  ; CHECK:   renamable $r12 = t2SUBri killed renamable $r12, 4, 14 /* CC::al */, $noreg, $noreg
  ; CHECK:   VSTR_P0_off killed renamable $vpr, $sp, 0, 14 /* CC::al */, $noreg :: (store 4 into %stack.0)
  ; CHECK:   $r3 = tMOVr $r0, 14 /* CC::al */, $noreg
-  ; CHECK:   renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
-  ; CHECK:   $lr = t2DLS killed renamable $lr
+  ; CHECK:   $lr = MVE_DLSTP_32 killed renamable $r2
  ; CHECK: bb.2.bb9:
  ; CHECK:   successors: %bb.2(0x7c000000), %bb.3(0x04000000)
-  ; CHECK:   liveins: $lr, $r0, $r1, $r2, $r3
+  ; CHECK:   liveins: $lr, $r0, $r1, $r3
  ; CHECK:   renamable $vpr = VLDR_P0_off $sp, 0, 14 /* CC::al */, $noreg :: (load 4 from %stack.0)
-  ; CHECK:   MVE_VPST 4, implicit $vpr
-  ; CHECK:   renamable $vpr = MVE_VCTP32 renamable $r2, 1, killed renamable $vpr
+  ; CHECK:   MVE_VPST 8, implicit $vpr
  ; CHECK:   renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, killed renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4)
-  ; CHECK:   renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg
-  ; CHECK:   renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg
-  ; CHECK:   MVE_VPST 4, implicit $vpr
-  ; CHECK:   renamable $vpr = MVE_VCMPi32r renamable $q0, $zr, 1, 1, killed renamable $vpr
+  ; CHECK:   MVE_VPTv4i32r 8, renamable $q0, $zr, 1, implicit-def $vpr
  ; CHECK:   renamable $r3, renamable $q1 = MVE_VLDRWU32_post killed renamable $r3, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1, align 4)
  ; CHECK:   renamable $q0 = nsw MVE_VMULi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
  ; CHECK:   MVE_VPST 8, implicit $vpr
  ; CHECK:   MVE_VSTRWU32 killed renamable $q0, killed renamable $r0, 0, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4)
  ; CHECK:   $r0 = tMOVr $r3, 14 /* CC::al */, $noreg
-  ; CHECK:   $lr = t2LEUpdate killed renamable $lr, %bb.2
+  ; CHECK:   $lr = MVE_LETP killed renamable $lr, %bb.2
  ; CHECK: bb.3.bb27:
  ; CHECK:   $sp = tADDspi $sp, 1, 14 /* CC::al */, $noreg
  ; CHECK:   tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-blocks.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-blocks.mir
@ -215,26 +215,17 @@ body:             |
  ; CHECK: bb.1.vector.ph:
  ; CHECK:   successors: %bb.2(0x80000000)
  ; CHECK:   liveins: $r0, $r1, $r2
-  ; CHECK:   renamable $r3, dead $cpsr = tADDi3 renamable $r1, 3, 14 /* CC::al */, $noreg
  ; CHECK:   renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
-  ; CHECK:   renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg
-  ; CHECK:   renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg
-  ; CHECK:   renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
-  ; CHECK:   renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
  ; CHECK:   renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg
-  ; CHECK:   $lr = t2DLS killed renamable $lr
+  ; CHECK:   $lr = MVE_DLSTP_32 killed renamable $r1
  ; CHECK: bb.2.vector.body:
  ; CHECK:   successors: %bb.2(0x7c000000), %bb.3(0x04000000)
-  ; CHECK:   liveins: $lr, $q0, $r0, $r1, $r2, $r3
-  ; CHECK:   renamable $vpr = MVE_VCTP32 renamable $r1, 0, $noreg
-  ; CHECK:   MVE_VPST 8, implicit $vpr
-  ; CHECK:   renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 1, killed renamable $vpr
-  ; CHECK:   MVE_VPTv4s32r 2, renamable $q1, renamable $r2, 11, implicit-def $vpr
+  ; CHECK:   liveins: $lr, $q0, $r0, $r2, $r3
+  ; CHECK:   renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 0, killed $noreg
+  ; CHECK:   MVE_VPTv4s32r 4, renamable $q1, renamable $r2, 11, implicit-def $vpr
  ; CHECK:   renamable $vpr = MVE_VCMPs32r killed renamable $q1, renamable $r3, 12, 1, killed renamable $vpr
-  ; CHECK:   renamable $vpr = MVE_VCTP32 renamable $r1, 1, killed renamable $vpr
  ; CHECK:   renamable $r0 = MVE_VSTRWU32_post renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr
-  ; CHECK:   renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg
-  ; CHECK:   $lr = t2LEUpdate killed renamable $lr, %bb.2
+  ; CHECK:   $lr = MVE_LETP killed renamable $lr, %bb.2
  ; CHECK: bb.3.for.cond.cleanup:
  ; CHECK:   frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc
  bb.0.entry:
@ -731,26 +722,17 @@ body:             |
  ; CHECK: bb.1.vector.ph:
  ; CHECK:   successors: %bb.2(0x80000000)
  ; CHECK:   liveins: $r0, $r1, $r2
-  ; CHECK:   renamable $r3, dead $cpsr = tADDi3 renamable $r1, 3, 14 /* CC::al */, $noreg
  ; CHECK:   renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
-  ; CHECK:   renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg
-  ; CHECK:   renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg
-  ; CHECK:   renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
-  ; CHECK:   renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
  ; CHECK:   renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg
-  ; CHECK:   $lr = t2DLS killed renamable $lr
+  ; CHECK:   $lr = MVE_DLSTP_32 killed renamable $r1
  ; CHECK: bb.2.vector.body:
  ; CHECK:   successors: %bb.2(0x7c000000), %bb.3(0x04000000)
-  ; CHECK:   liveins: $lr, $q0, $r0, $r1, $r2, $r3
-  ; CHECK:   renamable $vpr = MVE_VCTP32 renamable $r1, 0, $noreg
-  ; CHECK:   MVE_VPST 8, implicit $vpr
-  ; CHECK:   renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 1, killed renamable $vpr
-  ; CHECK:   MVE_VPTv4s32r 2, renamable $q0, renamable $r2, 11, implicit-def $vpr
+  ; CHECK:   liveins: $lr, $q0, $r0, $r2, $r3
+  ; CHECK:   renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 0, killed $noreg
+  ; CHECK:   MVE_VPTv4s32r 4, renamable $q0, renamable $r2, 11, implicit-def $vpr
  ; CHECK:   renamable $vpr = MVE_VCMPs32r killed renamable $q1, renamable $r3, 12, 1, killed renamable $vpr
-  ; CHECK:   renamable $vpr = MVE_VCTP32 renamable $r1, 1, killed renamable $vpr
  ; CHECK:   renamable $r0 = MVE_VSTRWU32_post renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr
-  ; CHECK:   renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg
-  ; CHECK:   $lr = t2LEUpdate killed renamable $lr, %bb.2
+  ; CHECK:   $lr = MVE_LETP killed renamable $lr, %bb.2
  ; CHECK: bb.3.for.cond.cleanup:
  ; CHECK:   frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc
  bb.0.entry: