llvm-project/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPas...

//===-- MVETPAndVPTOptimisationsPass.cpp ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass does a few optimisations related to Tail predicated loops
/// and MVE VPT blocks before register allocation is performed. For VPT blocks
/// the goal is to maximize the sizes of the blocks that will be created by the
/// MVE VPT Block Insertion pass (which runs after register allocation). For
/// tail predicated loops we transform the loop into something that will
/// hopefully make the backend ARMLowOverheadLoops pass's job easier.
///
//===----------------------------------------------------------------------===//

#include "ARM.h"
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMBaseInfo.h"
#include "MVETailPredUtils.h"
#include "Thumb2InstrInfo.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/Debug.h"
#include <cassert>

using namespace llvm;

#define DEBUG_TYPE "arm-mve-vpt-opts"

// Hidden command-line switch, enabled by default. When it is disabled,
// MergeLoopEnd returns without doing anything and LowerWhileLoopStart reverts
// the while-loop setup instead of lowering it.
static cl::opt<bool>
MergeEndDec("arm-enable-merge-loopenddec", cl::Hidden,
    cl::desc("Enable merging Loop End and Dec instructions."),
    cl::init(true));

// Hidden command-line switch, enabled by default. When it is enabled,
// ConvertTailPredLoop rewrites a register operand of every VPT-predicable
// instruction in the loop to the register defined by the loop counter PHI.
static cl::opt<bool>
SetLRPredicate("arm-set-lr-predicate", cl::Hidden,
    cl::desc("Enable setting lr as a predicate in tail predication regions."),
    cl::init(true));

namespace {
// Machine function pass performing the tail-predication and VPT related
// clean-ups described in the file header. It requires (and preserves) the
// MachineLoopInfo and MachineDominatorTree analyses.
class MVETPAndVPTOptimisations : public MachineFunctionPass {
public:
  static char ID;
  // Instruction info and register info used by the member functions below.
  // They are null until assigned (presumably by runOnMachineFunction, which is
  // not part of this section - confirm), so that an accidental early use is a
  // null dereference rather than a read through an indeterminate pointer.
  const Thumb2InstrInfo *TII = nullptr;
  MachineRegisterInfo *MRI = nullptr;

  MVETPAndVPTOptimisations() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &Fn) override;

  // Both analyses are required by the pass and declared as preserved by it.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<MachineLoopInfo>();
    AU.addPreserved<MachineLoopInfo>();
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return "ARM MVE TailPred and VPT Optimisation Pass";
  }

private:
  // Low overhead / tail predicated loop transformations, applied per loop.
  // Each returns true if it modified the function.
  bool LowerWhileLoopStart(MachineLoop *ML);
  bool MergeLoopEnd(MachineLoop *ML);
  bool ConvertTailPredLoop(MachineLoop *ML, MachineDominatorTree *DT);

  // VPT block related transformations, applied per basic block.
  MachineInstr &ReplaceRegisterUseWithVPNOT(MachineBasicBlock &MBB,
                                            MachineInstr &Instr,
                                            MachineOperand &User,
                                            Register Target);
  bool ReduceOldVCCRValueUses(MachineBasicBlock &MBB);
  bool ReplaceVCMPsByVPNOTs(MachineBasicBlock &MBB);
  bool ReplaceConstByVPNOTs(MachineBasicBlock &MBB, MachineDominatorTree *DT);
  bool ConvertVPSEL(MachineBasicBlock &MBB);
  bool HintDoLoopStartReg(MachineBasicBlock &MBB);

  // Helper of MergeLoopEnd: returns the (possibly replaced) loop start.
  MachineInstr *CheckForLRUseInPredecessors(MachineBasicBlock *PreHeader,
                                            MachineInstr *LoopStart);
};

char MVETPAndVPTOptimisations::ID = 0;

} // end anonymous namespace

// Register the pass under the DEBUG_TYPE name and declare its dependencies on
// the MachineLoopInfo and MachineDominatorTree analyses, matching what
// getAnalysisUsage requires.
INITIALIZE_PASS_BEGIN(MVETPAndVPTOptimisations, DEBUG_TYPE,
                      "ARM MVE TailPred and VPT Optimisations pass", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_END(MVETPAndVPTOptimisations, DEBUG_TYPE,
                    "ARM MVE TailPred and VPT Optimisations pass", false, false)

// Walks backwards through a chain of COPY instructions. While the current
// instruction is a COPY whose source (operand 1) is a virtual register, step to
// the instruction MRI reports as defining that register. Returns the first
// instruction reached that is not such a COPY; this is nullptr when MI is null
// or when a definition lookup returns nullptr.
static MachineInstr *LookThroughCOPY(MachineInstr *MI,
                                     MachineRegisterInfo *MRI) {
  for (;;) {
    if (!MI || MI->getOpcode() != TargetOpcode::COPY)
      return MI;

    Register CopySrc = MI->getOperand(1).getReg();
    if (!CopySrc.isVirtual())
      return MI;

    MI = MRI->getVRegDef(CopySrc);
  }
}

// Given a loop ML, this attempts to find the instructions that make up a low
// overhead loop: the t2LoopEnd (or t2LoopEndDec) terminating the latch, the
// t2LoopDec feeding it, the PHI carrying the counter around the loop, and the
// instruction producing the initial count. Returns true and fills in all four
// out-parameters on success. LoopStart is then one of t2DoLoopStart,
// t2WhileLoopSetup or t2WhileLoopStartLR, and LoopDec == LoopEnd when the loop
// already uses a t2LoopEndDec. On failure the out-parameters may be partially
// written and must not be relied upon.
static bool findLoopComponents(MachineLoop *ML, MachineRegisterInfo *MRI,
                               MachineInstr *&LoopStart, MachineInstr *&LoopPhi,
                               MachineInstr *&LoopDec, MachineInstr *&LoopEnd) {
  MachineBasicBlock *const Header = ML->getHeader();
  MachineBasicBlock *const Latch = ML->getLoopLatch();
  if (!(Header && Latch)) {
    LLVM_DEBUG(dbgs() << "  no Loop Latch or Header\n");
    return false;
  }

  // True for a loop-end pseudo whose branch target is the loop header. The
  // target block is operand 1 of t2LoopEnd and operand 2 of t2LoopEndDec.
  auto BranchesBackToHeader = [Header](const MachineInstr &Term) {
    switch (Term.getOpcode()) {
    case ARM::t2LoopEnd:
      return Term.getOperand(1).getMBB() == Header;
    case ARM::t2LoopEndDec:
      return Term.getOperand(2).getMBB() == Header;
    default:
      return false;
    }
  };

  // Find the loop end from the terminators of the latch.
  LoopEnd = nullptr;
  for (MachineInstr &Term : Latch->terminators()) {
    if (!BranchesBackToHeader(Term))
      continue;
    LoopEnd = &Term;
    break;
  }
  if (!LoopEnd) {
    LLVM_DEBUG(dbgs() << "  no LoopEnd\n");
    return false;
  }
  LLVM_DEBUG(dbgs() << "  found loop end: " << *LoopEnd);

  // Find the dec from the use of the end. There may be copies between
  // instructions. We expect the loop to look like:
  //   $vs = t2DoLoopStart ...
  // loop:
  //   $vp = phi [ $vs ], [ $vd ]
  //   ...
  //   $vd = t2LoopDec $vp
  //   ...
  //   t2LoopEnd $vd, loop
  if (LoopEnd->getOpcode() != ARM::t2LoopEndDec) {
    Register EndCount = LoopEnd->getOperand(0).getReg();
    LoopDec = LookThroughCOPY(MRI->getVRegDef(EndCount), MRI);
    if (!LoopDec || LoopDec->getOpcode() != ARM::t2LoopDec) {
      LLVM_DEBUG(dbgs() << "  didn't find LoopDec where we expected!\n");
      return false;
    }
  } else {
    // A merged t2LoopEndDec acts as both the decrement and the end.
    LoopDec = LoopEnd;
  }
  LLVM_DEBUG(dbgs() << "  found loop dec: " << *LoopDec);

  // The decrement's input must come from a PHI with exactly two incoming
  // values (five operands), one of which arrives from the latch.
  Register DecInput = LoopDec->getOperand(1).getReg();
  LoopPhi = LookThroughCOPY(MRI->getVRegDef(DecInput), MRI);
  const bool IsTwoInputPhi = LoopPhi &&
                             LoopPhi->getOpcode() == TargetOpcode::PHI &&
                             LoopPhi->getNumOperands() == 5;
  if (!IsTwoInputPhi || (LoopPhi->getOperand(2).getMBB() != Latch &&
                         LoopPhi->getOperand(4).getMBB() != Latch)) {
    LLVM_DEBUG(dbgs() << "  didn't find PHI where we expected!\n");
    return false;
  }
  LLVM_DEBUG(dbgs() << "  found loop phi: " << *LoopPhi);

  // The initial count is whichever incoming value is not paired with the
  // latch block.
  const unsigned StartIdx = LoopPhi->getOperand(2).getMBB() == Latch ? 3 : 1;
  Register StartReg = LoopPhi->getOperand(StartIdx).getReg();
  LoopStart = LookThroughCOPY(MRI->getVRegDef(StartReg), MRI);

  auto IsLoopStartOpcode = [](unsigned Opc) {
    return Opc == ARM::t2DoLoopStart || Opc == ARM::t2WhileLoopSetup ||
           Opc == ARM::t2WhileLoopStartLR;
  };
  if (!LoopStart || !IsLoopStartOpcode(LoopStart->getOpcode())) {
    LLVM_DEBUG(dbgs() << "  didn't find Start where we expected!\n");
    return false;
  }
  LLVM_DEBUG(dbgs() << "  found loop start: " << *LoopStart);

  return true;
}

// Reverts a t2WhileLoopSetup (and the t2WhileLoopStart terminator of the same
// block, if there is one) into ordinary instructions:
//   %a = t2WhileLoopSetup %Cnt      ->   %a = t2SUBri %Cnt, 0 (defining CPSR)
//   t2WhileLoopStart %a, %BB        ->   t2Bcc %BB, eq, CPSR
// MI is erased by this function and must not be used afterwards.
static void RevertWhileLoopSetup(MachineInstr *MI, const TargetInstrInfo *TII) {
  MachineBasicBlock *MBB = MI->getParent();
  assert(MI->getOpcode() == ARM::t2WhileLoopSetup &&
         "Only expected a t2WhileLoopSetup in RevertWhileLoopStart!");

  // Subs: recompute the count with a flag-setting subtract of zero so that the
  // branch below can test it.
  MachineInstrBuilder MIB =
      BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2SUBri));
  MIB.add(MI->getOperand(0));
  MIB.add(MI->getOperand(1));
  MIB.addImm(0);
  MIB.addImm(ARMCC::AL);
  MIB.addReg(ARM::NoRegister);
  MIB.addReg(ARM::CPSR, RegState::Define);

  // Attempt to find a t2WhileLoopStart and revert to a t2Bcc.
  for (MachineInstr &I : MBB->terminators()) {
    if (I.getOpcode() == ARM::t2WhileLoopStart) {
      MachineInstrBuilder Br =
          BuildMI(*MBB, &I, I.getDebugLoc(), TII->get(ARM::t2Bcc));
      // The branch target is operand 1 of the t2WhileLoopStart. Operand 1 of
      // the t2WhileLoopSetup (MI) is the count register, not a block.
      Br.add(I.getOperand(1)); // branch target
      Br.addImm(ARMCC::EQ);
      Br.addReg(ARM::CPSR);
      I.eraseFromParent();
      break;
    }
  }

  MI->eraseFromParent();
}

// The Hardware Loop insertion and ISel Lowering produce the pseudos for the
// start of a while loop:
//   %a:gprlr = t2WhileLoopSetup %Cnt
//   t2WhileLoopStart %a, %BB
// We want to convert those to a single instruction which, like t2LoopEndDec and
// t2DoLoopStartTP is both a terminator and produces a value:
//   %a:gprlr = t2WhileLoopStartLR %Cnt, %BB
//
// Otherwise if we can't, we revert the loop. t2WhileLoopSetup and
// t2WhileLoopStart are not valid past regalloc.
//
// Returns true if the function was modified, false if the loop components
// could not be found or the loop does not start with a t2WhileLoopSetup.
bool MVETPAndVPTOptimisations::LowerWhileLoopStart(MachineLoop *ML) {
  LLVM_DEBUG(dbgs() << "LowerWhileLoopStart on loop "
                    << ML->getHeader()->getName() << "\n");

  MachineInstr *LoopEnd, *LoopPhi, *LoopStart, *LoopDec;
  if (!findLoopComponents(ML, MRI, LoopStart, LoopPhi, LoopDec, LoopEnd))
    return false;

  if (LoopStart->getOpcode() != ARM::t2WhileLoopSetup)
    return false;

  // Look for the t2WhileLoopStart that consumes the setup's result.
  Register LR = LoopStart->getOperand(0).getReg();
  auto WLSIt = find_if(MRI->use_nodbg_instructions(LR), [](auto &MI) {
    return MI.getOpcode() == ARM::t2WhileLoopStart;
  });
  if (!MergeEndDec || WLSIt == MRI->use_instr_nodbg_end()) {
    // Revert the whole loop. RevertWhileLoopSetup erases LoopStart, so the
    // decrement and the loop end must be reverted through their own
    // instructions rather than through the (now dead) LoopStart pointer.
    RevertWhileLoopSetup(LoopStart, TII);
    RevertLoopDec(LoopDec, TII);
    RevertLoopEnd(LoopEnd, TII);
    return true;
  }

  // Fold setup + start into one value-producing terminator placed where the
  // t2WhileLoopStart was: it defines LR from the count and branches to the
  // original target.
  MachineInstrBuilder MI =
      BuildMI(*WLSIt->getParent(), *WLSIt, WLSIt->getDebugLoc(),
              TII->get(ARM::t2WhileLoopStartLR), LR)
          .add(LoopStart->getOperand(1))
          .add(WLSIt->getOperand(1));
  (void)MI;
  LLVM_DEBUG(dbgs() << "Lowered WhileLoopStart into: " << *MI.getInstr());

  WLSIt->eraseFromParent();
  LoopStart->eraseFromParent();
  return true;
}

// Tells whether MI may not appear inside (or on the way into) a low overhead
// loop: that is the case for calls and for loop-start instructions, usually
// because they clobber LR.
static bool IsInvalidTPInstruction(MachineInstr &MI) {
  if (MI.isCall())
    return true;
  return isLoopStart(MI);
}

// Walks the CFG backwards from PreHeader, stopping at the block that contains
// LoopStart, looking for instructions that are invalid in a low overhead loop
// region (see IsInvalidTPInstruction). If one is found, the while-loop start is
// split up: a t2DoLoopStart is created in front of the preheader's first
// terminator and the t2WhileLoopStartLR is reverted to a conditional branch.
// Returns the instruction that now starts the loop: the new t2DoLoopStart if
// the loop was rewritten, otherwise the unchanged LoopStart.
MachineInstr *MVETPAndVPTOptimisations::CheckForLRUseInPredecessors(
    MachineBasicBlock *PreHeader, MachineInstr *LoopStart) {
  SmallVector<MachineBasicBlock *> Pending;
  SmallPtrSet<MachineBasicBlock *, 4> Seen;
  Pending.push_back(PreHeader);
  // The block holding the loop start bounds the search; never scan it.
  Seen.insert(LoopStart->getParent());

  while (!Pending.empty()) {
    MachineBasicBlock *Block = Pending.pop_back_val();
    // Skip blocks that were already scanned (or that bound the search).
    if (!Seen.insert(Block).second)
      continue;

    auto Offender = find_if(*Block, IsInvalidTPInstruction);
    if (Offender == Block->end()) {
      // Nothing problematic here, keep walking towards the loop start.
      Pending.append(Block->pred_begin(), Block->pred_end());
      continue;
    }

    LLVM_DEBUG(dbgs() << "Found LR use in predecessors, reverting: "
                      << *Offender);

    // Create a t2DoLoopStart at the end of the preheader.
    MachineInstrBuilder DLS =
        BuildMI(*PreHeader, PreHeader->getFirstTerminator(),
                LoopStart->getDebugLoc(), TII->get(ARM::t2DoLoopStart))
            .add(LoopStart->getOperand(0))
            .add(LoopStart->getOperand(1));

    // Make sure to remove the kill flags, to prevent them from being invalid.
    LoopStart->getOperand(1).setIsKill(false);

    // Revert the t2WhileLoopStartLR to a CMP and Br.
    RevertWhileLoopStartLR(LoopStart, TII, ARM::t2Bcc, true);
    return DLS.getInstr();
  }

  return LoopStart;
}

// This function converts loops with t2LoopDec and t2LoopEnd instructions into
// a single t2LoopEndDec instruction. To do that it needs to make sure that LR
// will be valid to be used for the low overhead loop, which means nothing else
// is using LR (especially calls) and there are no superfluous copies in the
// loop. The t2LoopEndDec is a branching terminator that produces a value (the
// decrement) around the loop edge, which means we need to be careful that they
// will be valid to allocate without any spilling.
//
// Returns true if the function was modified in any way (merged, reverted, or
// the while-loop start was converted to a do-loop start), false otherwise.
bool MVETPAndVPTOptimisations::MergeLoopEnd(MachineLoop *ML) {
  if (!MergeEndDec)
    return false;

  LLVM_DEBUG(dbgs() << "MergeLoopEnd on loop " << ML->getHeader()->getName()
                    << "\n");

  MachineInstr *LoopEnd, *LoopPhi, *LoopStart, *LoopDec;
  if (!findLoopComponents(ML, MRI, LoopStart, LoopPhi, LoopDec, LoopEnd))
    return false;

  // Check if there is an illegal instruction (a call) in the low overhead loop
  // and if so revert it now before we get any further. While loops also need to
  // check the preheaders, but can be reverted to a DLS loop if needed.
  //
  // Changed records whether that preheader check already rewrote the loop
  // start, so that the early exit further down still reports a modification.
  bool Changed = false;
  auto *PreHeader = ML->getLoopPreheader();
  if (LoopStart->getOpcode() == ARM::t2WhileLoopStartLR && PreHeader) {
    LoopStart = CheckForLRUseInPredecessors(PreHeader, LoopStart);
    // The helper hands back a different (t2DoLoopStart) instruction exactly
    // when it rewrote the loop start.
    Changed = LoopStart->getOpcode() != ARM::t2WhileLoopStartLR;
  }

  for (MachineBasicBlock *MBB : ML->blocks()) {
    for (MachineInstr &MI : *MBB) {
      if (IsInvalidTPInstruction(MI)) {
        LLVM_DEBUG(dbgs() << "Found LR use in loop, reverting: " << MI);
        if (LoopStart->getOpcode() == ARM::t2DoLoopStart)
          RevertDoLoopStart(LoopStart, TII);
        else
          RevertWhileLoopStartLR(LoopStart, TII);
        RevertLoopDec(LoopDec, TII);
        RevertLoopEnd(LoopEnd, TII);
        return true;
      }
    }
  }

  // Remove any copies from the loop, to ensure the phi that remains is both
  // simpler and contains no extra uses. Because t2LoopEndDec is a terminator
  // that cannot spill, we need to be careful what remains in the loop.
  Register PhiReg = LoopPhi->getOperand(0).getReg();
  Register DecReg = LoopDec->getOperand(0).getReg();
  Register StartReg = LoopStart->getOperand(0).getReg();
  // Ensure the uses are expected, and collect any copies we want to remove.
  // Starting from BaseReg, every non-debug user must either be one of
  // ExpectedUsers or a COPY into a virtual register, whose result is then
  // checked in the same way.
  SmallVector<MachineInstr *, 4> Copies;
  auto CheckUsers = [&Copies](Register BaseReg,
                              ArrayRef<MachineInstr *> ExpectedUsers,
                              MachineRegisterInfo *MRI) {
    SmallVector<Register, 4> Worklist;
    Worklist.push_back(BaseReg);
    while (!Worklist.empty()) {
      Register Reg = Worklist.pop_back_val();
      for (MachineInstr &MI : MRI->use_nodbg_instructions(Reg)) {
        if (count(ExpectedUsers, &MI))
          continue;
        if (MI.getOpcode() != TargetOpcode::COPY ||
            !MI.getOperand(0).getReg().isVirtual()) {
          LLVM_DEBUG(dbgs() << "Extra users of register found: " << MI);
          return false;
        }
        Worklist.push_back(MI.getOperand(0).getReg());
        Copies.push_back(&MI);
      }
    }
    return true;
  };
  if (!CheckUsers(PhiReg, {LoopDec}, MRI) ||
      !CheckUsers(DecReg, {LoopPhi, LoopEnd}, MRI) ||
      !CheckUsers(StartReg, {LoopPhi}, MRI)) {
    // Don't leave a t2WhileLoopStartLR without the LoopDecEnd.
    if (LoopStart->getOpcode() == ARM::t2WhileLoopStartLR) {
      RevertWhileLoopStartLR(LoopStart, TII);
      RevertLoopDec(LoopDec, TII);
      RevertLoopEnd(LoopEnd, TII);
      return true;
    }
    // Nothing was merged, but the loop start may already have been rewritten.
    return Changed;
  }

  MRI->constrainRegClass(StartReg, &ARM::GPRlrRegClass);
  MRI->constrainRegClass(PhiReg, &ARM::GPRlrRegClass);
  MRI->constrainRegClass(DecReg, &ARM::GPRlrRegClass);

  // Point the PHI directly at the start and decrement registers, bypassing any
  // copies that are about to be deleted. The incoming value paired with the
  // latch block receives DecReg, the other one StartReg.
  if (LoopPhi->getOperand(2).getMBB() == ML->getLoopLatch()) {
    LoopPhi->getOperand(3).setReg(StartReg);
    LoopPhi->getOperand(1).setReg(DecReg);
  } else {
    LoopPhi->getOperand(1).setReg(StartReg);
    LoopPhi->getOperand(3).setReg(DecReg);
  }

  // Replace the loop dec and loop end as a single instruction.
  MachineInstrBuilder MI =
      BuildMI(*LoopEnd->getParent(), *LoopEnd, LoopEnd->getDebugLoc(),
              TII->get(ARM::t2LoopEndDec), DecReg)
          .addReg(PhiReg)
          .add(LoopEnd->getOperand(1));
  (void)MI;
  LLVM_DEBUG(dbgs() << "Merged LoopDec and End into: " << *MI.getInstr());

  LoopDec->eraseFromParent();
  LoopEnd->eraseFromParent();
  for (MachineInstr *Copy : Copies)
    Copy->eraseFromParent();
  return true;
}

// Turns the loop start of a loop containing VCTP instructions into its
// tail-predicated form (t2DoLoopStart -> t2DoLoopStartTP, t2WhileLoopStartLR ->
// t2WhileLoopStartTP). The new instruction carries the initial value of the
// VCTP count register as an extra operand, which makes the job of finding that
// operand in the later ARMLowOverheadLoops pass much simpler. Returns true if
// the loop start was replaced.
bool MVETPAndVPTOptimisations::ConvertTailPredLoop(MachineLoop *ML,
                                              MachineDominatorTree *DT) {
  LLVM_DEBUG(dbgs() << "ConvertTailPredLoop on loop "
                    << ML->getHeader()->getName() << "\n");

  // Locate the LoopEnd/Dec/Phi/Start of the loop.
  MachineInstr *LoopEnd, *LoopPhi, *LoopStart, *LoopDec;
  if (!findLoopComponents(ML, MRI, LoopStart, LoopPhi, LoopDec, LoopEnd))
    return false;
  // Only loops whose decrement and end are one instruction are handled.
  if (LoopDec != LoopEnd)
    return false;
  const unsigned StartOpc = LoopStart->getOpcode();
  const bool IsDoLoop = StartOpc == ARM::t2DoLoopStart;
  if (!IsDoLoop && StartOpc != ARM::t2WhileLoopStartLR)
    return false;

  // Gather the VCTPs of the loop and, separately, every other instruction that
  // has a VPT predicate operand.
  SmallVector<MachineInstr *, 4> VCTPs;
  SmallVector<MachineInstr *, 4> MVEInstrs;
  for (MachineBasicBlock *BB : ML->blocks()) {
    for (MachineInstr &Candidate : *BB) {
      if (isVCTP(&Candidate)) {
        VCTPs.push_back(&Candidate);
        continue;
      }
      if (findFirstVPTPredOperandIdx(Candidate) != -1)
        MVEInstrs.push_back(&Candidate);
    }
  }

  if (VCTPs.empty()) {
    LLVM_DEBUG(dbgs() << "  no VCTPs\n");
    return false;
  }

  // Check all VCTPs are the same.
  // NOTE(review): this compares operand 0 of each VCTP, whereas the count
  // register is read from operand 1 below - confirm operand 0 is intended.
  MachineInstr *FirstVCTP = VCTPs.front();
  for (MachineInstr *VCTP : VCTPs) {
    LLVM_DEBUG(dbgs() << "  with VCTP " << *VCTP);
    const bool SameOpcode = VCTP->getOpcode() == FirstVCTP->getOpcode();
    if (SameOpcode &&
        VCTP->getOperand(0).getReg() == FirstVCTP->getOperand(0).getReg())
      continue;
    LLVM_DEBUG(dbgs() << "  VCTP's are not identical\n");
    return false;
  }

  // Check for the register being used can be setup before the loop. We expect
  // this to be:
  //   $vx = ...
  // loop:
  //   $vp = PHI [ $vx ], [ $vd ]
  //   ..
  //   $vpr = VCTP $vp
  //   ..
  //   $vd = t2SUBri $vp, #n
  //   ..
  Register CountReg = FirstVCTP->getOperand(1).getReg();
  if (!CountReg.isVirtual()) {
    LLVM_DEBUG(dbgs() << "  cannot determine VCTP PHI\n");
    return false;
  }
  MachineInstr *Phi = LookThroughCOPY(MRI->getVRegDef(CountReg), MRI);
  const bool IsTwoInputPhi = Phi && Phi->getOpcode() == TargetOpcode::PHI &&
                             Phi->getNumOperands() == 5;
  MachineBasicBlock *Latch = ML->getLoopLatch();
  if (!IsTwoInputPhi || (Phi->getOperand(2).getMBB() != Latch &&
                         Phi->getOperand(4).getMBB() != Latch)) {
    LLVM_DEBUG(dbgs() << "  cannot determine VCTP Count\n");
    return false;
  }
  // From here on CountReg is the value entering the PHI from outside the
  // latch, i.e. the element count before the first iteration.
  const unsigned InitIdx = Phi->getOperand(2).getMBB() == Latch ? 3 : 1;
  CountReg = Phi->getOperand(InitIdx).getReg();

  // Replace the t2DoLoopStart with the t2DoLoopStartTP, move it to the end of
  // the preheader and add the new CountReg to it. We attempt to place it late
  // in the preheader, but may need to move that earlier based on uses.
  MachineBasicBlock *StartMBB = LoopStart->getParent();
  MachineBasicBlock::iterator InsertPt = StartMBB->getFirstTerminator();
  const Register StartDef = LoopStart->getOperand(0).getReg();
  for (MachineInstr &Use : MRI->use_instructions(StartDef)) {
    const bool AfterInsertPt =
        InsertPt == StartMBB->end() || DT->dominates(&*InsertPt, &Use);
    if (AfterInsertPt && DT->dominates(ML->getHeader(), Use.getParent()))
      continue;
    LLVM_DEBUG(dbgs() << "  InsertPt could not be a terminator!\n");
    return false;
  }

  const unsigned NewOpc =
      IsDoLoop ? ARM::t2DoLoopStartTP : ARM::t2WhileLoopStartTP;
  MachineInstrBuilder NewStart =
      BuildMI(*StartMBB, InsertPt, LoopStart->getDebugLoc(), TII->get(NewOpc));
  NewStart.add(LoopStart->getOperand(0));
  NewStart.add(LoopStart->getOperand(1));
  NewStart.addReg(CountReg);
  // The while form additionally carries operand 2 of the old loop start.
  if (!IsDoLoop)
    NewStart.add(LoopStart->getOperand(2));
  LLVM_DEBUG(dbgs() << "Replacing " << *LoopStart << "  with "
                    << *NewStart.getInstr());
  MRI->constrainRegClass(CountReg, &ARM::rGPRRegClass);
  LoopStart->eraseFromParent();

  if (!SetLRPredicate)
    return true;

  // Each instruction in the loop needs to be using LR (the value defined by
  // the loop Phi) as the predicate register; it lives two operands after the
  // first VPT predicate operand.
  Register LR = LoopPhi->getOperand(0).getReg();
  for (MachineInstr *Predicated : MVEInstrs) {
    int Idx = findFirstVPTPredOperandIdx(*Predicated);
    Predicated->getOperand(Idx + 2).setReg(LR);
  }

  return true;
}

// An opcode is a VCMP exactly when VCMPOpcodeToVPT maps it to a non-zero
// value.
static bool IsVCMP(unsigned Opcode) {
  const auto VPTOpcode = VCMPOpcodeToVPT(Opcode);
  return VPTOpcode != 0;
}

// Tells whether the two compared operands of a VCMP with this Opcode may be
// exchanged. Two kinds of VCMP cannot have their operands swapped: float VCMPs,
// and VCMPr instructions (since the r is always on the right). Every other
// opcode yields true.
static bool CanHaveSwappedOperands(unsigned Opcode) {
  switch (Opcode) {
  // Float compares.
  case ARM::MVE_VCMPf16:
  case ARM::MVE_VCMPf32:
  // VCMPr forms.
  case ARM::MVE_VCMPf16r:
  case ARM::MVE_VCMPf32r:
  case ARM::MVE_VCMPi8r:
  case ARM::MVE_VCMPi16r:
  case ARM::MVE_VCMPi32r:
  case ARM::MVE_VCMPu8r:
  case ARM::MVE_VCMPu16r:
  case ARM::MVE_VCMPu32r:
  case ARM::MVE_VCMPs8r:
  case ARM::MVE_VCMPs16r:
  case ARM::MVE_VCMPs32r:
    return false;
  default:
    break;
  }
  return true;
}

// Reads the condition code of a VCMP, which is stored as the immediate in
// operand 3. Instr must be a VCMP.
static ARMCC::CondCodes GetCondCode(MachineInstr &Instr) {
  assert(IsVCMP(Instr.getOpcode()) && "Inst must be a VCMP");
  const MachineOperand &CondOperand = Instr.getOperand(3);
  return static_cast<ARMCC::CondCodes>(CondOperand.getImm());
}

// Decides whether the VCMP Cond computes the logical inverse of the VCMP Prev,
// so that Cond could be expressed as a VPNOT of Prev's result. Both
// instructions must be VCMPs. This holds when they share an opcode and either
//  - compare the same operands in the same order with opposite conditions, or
//  - (for opcodes that allow swapping) compare the same operands in exchanged
//    order and Prev's condition is the swapped form of Cond's opposite.
static bool IsVPNOTEquivalent(MachineInstr &Cond, MachineInstr &Prev) {
  assert(IsVCMP(Cond.getOpcode()) && IsVCMP(Prev.getOpcode()));

  // Opcodes must match.
  if (Cond.getOpcode() != Prev.getOpcode())
    return false;

  const MachineOperand &CondLHS = Cond.getOperand(1);
  const MachineOperand &CondRHS = Cond.getOperand(2);
  const MachineOperand &PrevLHS = Prev.getOperand(1);
  const MachineOperand &PrevRHS = Prev.getOperand(2);

  const ARMCC::CondCodes PrevCode = GetCondCode(Prev);
  const ARMCC::CondCodes Inverted =
      ARMCC::getOppositeCondition(GetCondCode(Cond));

  // Same operands, opposite condition: a plain VPNOT does the job.
  const bool MatchesInOrder = Inverted == PrevCode &&
                              CondLHS.isIdenticalTo(PrevLHS) &&
                              CondRHS.isIdenticalTo(PrevRHS);
  if (MatchesInOrder)
    return true;

  // Otherwise retry with the operands exchanged, where that is permitted.
  if (!CanHaveSwappedOperands(Cond.getOpcode()))
    return false;
  if (ARMCC::getSwappedCondition(Inverted) != PrevCode)
    return false;
  return CondLHS.isIdenticalTo(PrevRHS) && CondRHS.isIdenticalTo(PrevLHS);
}

// Reports whether Instr produces a VCCR value: its first operand has to be a
// virtual register whose register class is VCCR. Instructions without
// operands, or whose first operand is not a virtual register with a known
// class, yield false.
static bool IsWritingToVCCR(MachineInstr &Instr) {
  if (Instr.getNumOperands() == 0)
    return false;

  const MachineOperand &Def = Instr.getOperand(0);
  if (!Def.isReg() || !Def.getReg().isVirtual())
    return false;

  const TargetRegisterClass *RC =
      Instr.getMF()->getRegInfo().getRegClassOrNull(Def.getReg());
  if (!RC)
    return false;
  return RC->getID() == ARM::VCCRRegClassID;
}

// Rewrites
//    <Instr that uses %A ('User' Operand)>
// as
//    %K = VPNOT %Target
//    <Instr that uses %K ('User' Operand)>
// where %K is a fresh virtual register of Target's register class and the
// unpredicated VPNOT is inserted directly before Instr. The kill flag of User
// is cleared. Returns the newly inserted VPNOT.
// This optimization is done in the hopes of preventing spills/reloads of VPR by
// reducing the number of VCCR values with overlapping lifetimes.
MachineInstr &MVETPAndVPTOptimisations::ReplaceRegisterUseWithVPNOT(
    MachineBasicBlock &MBB, MachineInstr &Instr, MachineOperand &User,
    Register Target) {
  const TargetRegisterClass *TargetRC = MRI->getRegClass(Target);
  Register Inverted = MRI->createVirtualRegister(TargetRC);

  MachineInstrBuilder VPNOT =
      BuildMI(MBB, &Instr, Instr.getDebugLoc(), TII->get(ARM::MVE_VPNOT));
  VPNOT.addDef(Inverted);
  VPNOT.addReg(Target);
  addUnpredicatedMveVpredNOp(VPNOT);

  // Redirect the user to the recomputed value, and clear its kill flag.
  User.setReg(Inverted);
  User.setIsKill(false);

  MachineInstr *NewInstr = VPNOT.getInstr();
  LLVM_DEBUG(dbgs() << "  Inserting VPNOT (for spill prevention): ";
             NewInstr->dump());

  return *NewInstr;
}

// Moves a VPNOT before its first user if an instruction that uses Reg is found
// in-between the VPNOT and its user.
// Returns true if there is at least one user of the VPNOT in the block.
//
// Iter must point at an unpredicated MVE_VPNOT inside MBB (both are asserted).
// The scan runs from Iter to the end of MBB and stops at the first instruction
// that uses the VPNOT's result without also using Reg.
static bool MoveVPNOTBeforeFirstUser(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator Iter,
                                     Register Reg) {
  assert(Iter->getOpcode() == ARM::MVE_VPNOT && "Not a VPNOT!");
  assert(getVPTInstrPredicate(*Iter) == ARMVCC::None &&
         "The VPNOT cannot be predicated");

  MachineInstr &VPNOT = *Iter;
  Register VPNOTResult = VPNOT.getOperand(0).getReg();
  Register VPNOTOperand = VPNOT.getOperand(1).getReg();

  // Whether the VPNOT will need to be moved, and whether we found a user of the
  // VPNOT.
  bool MustMove = false, HasUser = false;
  // The most recently seen operand that kills the VPNOT's input, if any.
  MachineOperand *VPNOTOperandKiller = nullptr;
  for (; Iter != MBB.end(); ++Iter) {
    if (MachineOperand *MO =
            Iter->findRegisterUseOperand(VPNOTOperand, /*isKill*/ true)) {
      // If we find the operand that kills the VPNOTOperand's result, save it.
      VPNOTOperandKiller = MO;
    }

    // Any instruction using Reg forces the VPNOT to be moved past it.
    // NOTE(review): the scan starts at the VPNOT itself, so when Reg is the
    // VPNOT's own operand this test already fires on the first iteration -
    // confirm that this is intended.
    if (Iter->findRegisterUseOperandIdx(Reg) != -1) {
      MustMove = true;
      continue;
    }

    // Not a user of the VPNOT's result: keep scanning.
    if (Iter->findRegisterUseOperandIdx(VPNOTResult) == -1)
      continue;

    // First user of the VPNOT's result found.
    HasUser = true;
    if (!MustMove)
      break;

    // Move the VPNOT right before Iter
    LLVM_DEBUG(dbgs() << "Moving: "; VPNOT.dump(); dbgs() << "  Before: ";
               Iter->dump());
    MBB.splice(Iter, &MBB, VPNOT.getIterator());
    // If we move the instr, and its operand was killed earlier, remove the kill
    // flag.
    if (VPNOTOperandKiller)
      VPNOTOperandKiller->setIsKill(false);

    break;
  }
  return HasUser;
}

// This optimisation attempts to reduce the number of overlapping lifetimes of
// VCCR values by replacing uses of old VCCR values with VPNOTs. For example,
// this replaces
//    %A:vccr = (something)
//    %B:vccr = VPNOT %A
//    %Foo = (some op that uses %B)
//    %Bar = (some op that uses %A)
// With
//    %A:vccr = (something)
//    %B:vccr = VPNOT %A
//    %Foo = (some op that uses %B)
//    %TMP2:vccr = VPNOT %B
//    %Bar = (some op that uses %TMP2)
//
// Returns true if any instruction in MBB was changed. VPNOTs made redundant
// along the way are collected and only erased once the whole block has been
// scanned, so the iterator is never invalidated mid-walk.
bool MVETPAndVPTOptimisations::ReduceOldVCCRValueUses(MachineBasicBlock &MBB) {
  MachineBasicBlock::iterator Iter = MBB.begin(), End = MBB.end();
  SmallVector<MachineInstr *, 4> DeadInstructions;
  bool Modified = false;

  // Each pass of this outer loop handles one (value, VPNOT of value) pair and
  // the run of instructions following it.
  while (Iter != End) {
    Register VCCRValue, OppositeVCCRValue;
    // The first loop looks for 2 unpredicated instructions:
    //    %A:vccr = (instr)     ; A is stored in VCCRValue
    //    %B:vccr = VPNOT %A    ; B is stored in OppositeVCCRValue
    for (; Iter != End; ++Iter) {
      // We're only interested in unpredicated instructions that write to VCCR.
      if (!IsWritingToVCCR(*Iter) ||
          getVPTInstrPredicate(*Iter) != ARMVCC::None)
        continue;
      Register Dst = Iter->getOperand(0).getReg();

      // If we already have a VCCRValue, and this is a VPNOT on VCCRValue, we've
      // found what we were looking for.
      if (VCCRValue && Iter->getOpcode() == ARM::MVE_VPNOT &&
          Iter->findRegisterUseOperandIdx(VCCRValue) != -1) {
        // Move the VPNOT closer to its first user if needed, and ignore if it
        // has no users.
        if (!MoveVPNOTBeforeFirstUser(MBB, Iter, VCCRValue))
          continue;

        OppositeVCCRValue = Dst;
        // Resume the scan with the instruction following the VPNOT.
        ++Iter;
        break;
      }

      // Else, just set VCCRValue.
      VCCRValue = Dst;
    }

    // If the first inner loop didn't find anything, stop here.
    if (Iter == End)
      break;

    assert(VCCRValue && OppositeVCCRValue &&
           "VCCRValue and OppositeVCCRValue shouldn't be empty if the loop "
           "stopped before the end of the block!");
    assert(VCCRValue != OppositeVCCRValue &&
           "VCCRValue should not be equal to OppositeVCCRValue!");

    // LastVPNOTResult always contains the same value as OppositeVCCRValue.
    Register LastVPNOTResult = OppositeVCCRValue;

    // This second loop tries to optimize the remaining instructions.
    for (; Iter != End; ++Iter) {
      // Set when the instruction took part in the VCCRValue/OppositeVCCRValue
      // chain; an uninvolved VCCR write ends this run (see the bottom of the
      // loop).
      bool IsInteresting = false;

      if (MachineOperand *MO = Iter->findRegisterUseOperand(VCCRValue)) {
        IsInteresting = true;

        // - If the instruction is a VPNOT, it can be removed, and we can just
        //   replace its uses with LastVPNOTResult.
        // - Else, insert a new VPNOT on LastVPNOTResult to recompute VCCRValue.
        if (Iter->getOpcode() == ARM::MVE_VPNOT) {
          Register Result = Iter->getOperand(0).getReg();

          MRI->replaceRegWith(Result, LastVPNOTResult);
          // Erasure is deferred until the scan of the block has finished.
          DeadInstructions.push_back(&*Iter);
          Modified = true;

          LLVM_DEBUG(dbgs()
                     << "Replacing all uses of '" << printReg(Result)
                     << "' with '" << printReg(LastVPNOTResult) << "'\n");
        } else {
          MachineInstr &VPNOT =
              ReplaceRegisterUseWithVPNOT(MBB, *Iter, *MO, LastVPNOTResult);
          Modified = true;

          // The freshly inserted VPNOT is now the most recent value, and the
          // roles of the two tracked registers are exchanged.
          LastVPNOTResult = VPNOT.getOperand(0).getReg();
          std::swap(VCCRValue, OppositeVCCRValue);

          LLVM_DEBUG(dbgs() << "Replacing use of '" << printReg(VCCRValue)
                            << "' with '" << printReg(LastVPNOTResult)
                            << "' in instr: " << *Iter);
        }
      } else {
        // If the instr uses OppositeVCCRValue, make it use LastVPNOTResult
        // instead as they contain the same value.
        if (MachineOperand *MO =
                Iter->findRegisterUseOperand(OppositeVCCRValue)) {
          IsInteresting = true;

          // This is pointless if LastVPNOTResult == OppositeVCCRValue.
          if (LastVPNOTResult != OppositeVCCRValue) {
            LLVM_DEBUG(dbgs() << "Replacing usage of '"
                              << printReg(OppositeVCCRValue) << "' with '"
                              << printReg(LastVPNOTResult) << " for instr: ";
                       Iter->dump());
            MO->setReg(LastVPNOTResult);
            Modified = true;
          }

          // The register may be used again further down, so this use must not
          // be marked as its last one.
          MO->setIsKill(false);
        }

        // If this is an unpredicated VPNOT on
        // LastVPNOTResult/OppositeVCCRValue, we can act like we inserted it.
        if (Iter->getOpcode() == ARM::MVE_VPNOT &&
            getVPTInstrPredicate(*Iter) == ARMVCC::None) {
          Register VPNOTOperand = Iter->getOperand(1).getReg();
          if (VPNOTOperand == LastVPNOTResult ||
              VPNOTOperand == OppositeVCCRValue) {
            IsInteresting = true;

            std::swap(VCCRValue, OppositeVCCRValue);
            LastVPNOTResult = Iter->getOperand(0).getReg();
          }
        }
      }

      // If this instruction was not interesting, and it writes to VCCR, stop.
      if (!IsInteresting && IsWritingToVCCR(*Iter))
        break;
    }
  }

  // Now that iteration is over, remove the VPNOTs that were replaced.
  for (MachineInstr *DeadInstruction : DeadInstructions)
    DeadInstruction->eraseFromParent();

  return Modified;
}

// Rewrites an unpredicated VCMP as a VPNOT of the previous VCMP's result
// whenever IsVPNOTEquivalent reports that the two compares can be expressed
// that way. Returns true if at least one VCMP was replaced.
bool MVETPAndVPTOptimisations::ReplaceVCMPsByVPNOTs(MachineBasicBlock &MBB) {
  SmallVector<MachineInstr *, 4> ReplacedVCMPs;

  // Most recent unpredicated VCMP that was left in place. It is forgotten when
  // an unpredicated non-VCMP instruction writes to VCCR, and after it has been
  // used as the source of a VPNOT.
  MachineInstr *LastVCMP = nullptr;
  // Use operand seen to kill LastVCMP's result, if any. Its kill flag has to
  // be dropped if we later add another use of that result.
  MachineOperand *LastVCMPKill = nullptr;

  for (MachineInstr &MI : MBB.instrs()) {
    // Track kills of the candidate's result, including those made by
    // predicated instructions, which are otherwise skipped below.
    if (LastVCMP) {
      Register LastResult = LastVCMP->getOperand(0).getReg();
      if (MachineOperand *KillOp =
              MI.findRegisterUseOperand(LastResult, /*isKill*/ true))
        LastVCMPKill = KillOp;
    }

    // Predicated instructions are never candidates.
    const bool IsUnpredicated = getVPTInstrPredicate(MI) == ARMVCC::None;
    if (!IsUnpredicated)
      continue;

    if (!IsVCMP(MI.getOpcode())) {
      // Anything else that writes to VCCR invalidates the candidate.
      if (IsWritingToVCCR(MI))
        LastVCMP = nullptr;
      continue;
    }

    // MI is a VCMP: either it can be derived from the candidate, or it becomes
    // the new candidate itself.
    if (!(LastVCMP && IsVPNOTEquivalent(MI, *LastVCMP))) {
      LastVCMP = &MI;
      continue;
    }

    // Result of the earlier VCMP, which feeds the VPNOT.
    Register SourcePred = LastVCMP->getOperand(0).getReg();

    // Insert the VPNOT right before the VCMP, defining the VCMP's result.
    MachineInstrBuilder VPNOT =
        BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(ARM::MVE_VPNOT));
    VPNOT.add(MI.getOperand(0));
    VPNOT.addReg(SourcePred);
    addUnpredicatedMveVpredNOp(VPNOT);
    LLVM_DEBUG(dbgs() << "Inserting VPNOT (to replace VCMP): ";
               VPNOT.getInstr()->dump(); dbgs() << "  Removed VCMP: ";
               MI.dump());

    // The earlier result is now read by the VPNOT, so a recorded kill of it
    // is no longer valid.
    if (LastVCMPKill)
      LastVCMPKill->setIsKill(false);

    // Queue the VCMP for deletion and start over with no candidate.
    ReplacedVCMPs.push_back(&MI);
    LastVCMP = nullptr;
    LastVCMPKill = nullptr;
  }

  for (MachineInstr *VCMP : ReplacedVCMPs)
    VCMP->eraseFromParent();

  return !ReplacedVCMPs.empty();
}

// Scans the block for predicated MVE instructions whose predicate is a
// constant mask, i.e. a COPY into a VCCR virtual register from a t2MOVi or
// t2MOVi16. The last such constant and the predicate register holding it are
// remembered as we go:
//  - if a later instruction uses a different register holding the same
//    constant, it is rewritten to reuse the remembered register;
//  - if it uses the 16-bit complement of the remembered constant, a VPNOT of
//    the remembered register is inserted and used instead.
// COPYs (and their constant moves) that become unused are deleted.
//
// \p DT is currently unused. Returns true if the block was modified in any
// way, including when an operand was rewritten or a VPNOT was inserted but no
// instruction became dead.
bool MVETPAndVPTOptimisations::ReplaceConstByVPNOTs(MachineBasicBlock &MBB,
                                               MachineDominatorTree *DT) {
  unsigned LastVPTImm = 0;
  Register LastVPTReg = 0;
  SmallSet<MachineInstr *, 4> DeadInstructions;
  bool Modified = false;

  for (MachineInstr &Instr : MBB.instrs()) {
    // Look for predicated MVE instructions.
    int PIdx = llvm::findFirstVPTPredOperandIdx(Instr);
    if (PIdx == -1)
      continue;
    Register VPR = Instr.getOperand(PIdx + 1).getReg();
    if (!VPR.isVirtual())
      continue;

    // From that we are looking for an instruction like %11:vccr = COPY %9:rgpr.
    MachineInstr *Copy = MRI->getVRegDef(VPR);
    if (!Copy || Copy->getOpcode() != TargetOpcode::COPY ||
        !Copy->getOperand(1).getReg().isVirtual() ||
        MRI->getRegClass(Copy->getOperand(1).getReg()) == &ARM::VCCRRegClass) {
      // The predicate is not a known constant: forget what we were tracking.
      LastVPTReg = 0;
      continue;
    }
    Register GPR = Copy->getOperand(1).getReg();

    // Find the immediate used by the copy; -1U means "not a constant move".
    auto getImm = [&](Register GPR) -> unsigned {
      MachineInstr *Def = MRI->getVRegDef(GPR);
      if (Def && (Def->getOpcode() == ARM::t2MOVi ||
                  Def->getOpcode() == ARM::t2MOVi16))
        return Def->getOperand(1).getImm();
      return -1U;
    };
    unsigned Imm = getImm(GPR);
    if (Imm == -1U) {
      LastVPTReg = 0;
      continue;
    }

    unsigned NotImm = ~Imm & 0xffff;
    if (LastVPTReg != 0 && LastVPTReg != VPR && LastVPTImm == Imm) {
      // Same constant as the tracked predicate: reuse it. This extends the
      // live range of LastVPTReg, so kill flags on its earlier uses are no
      // longer valid.
      MRI->clearKillFlags(LastVPTReg);
      Instr.getOperand(PIdx + 1).setReg(LastVPTReg);
      Modified = true;
      if (MRI->use_empty(VPR)) {
        DeadInstructions.insert(Copy);
        if (MRI->hasOneUse(GPR))
          DeadInstructions.insert(MRI->getVRegDef(GPR));
      }
      LLVM_DEBUG(dbgs() << "Reusing predicate: in  " << Instr);
      // Keep tracking the register that is actually in use. The old VPR may
      // have just lost its definition and must not be referenced again.
      VPR = LastVPTReg;
    } else if (LastVPTReg != 0 && LastVPTImm == NotImm) {
      // We have found the not of a previous constant. Create a VPNot of the
      // earlier predicate reg and use it instead of the copy. The VPNOT is a
      // new, later use of LastVPTReg, so earlier kill flags must be dropped.
      MRI->clearKillFlags(LastVPTReg);
      Register NewVPR = MRI->createVirtualRegister(&ARM::VCCRRegClass);
      auto VPNot = BuildMI(MBB, &Instr, Instr.getDebugLoc(),
                           TII->get(ARM::MVE_VPNOT), NewVPR)
                       .addReg(LastVPTReg);
      addUnpredicatedMveVpredNOp(VPNot);

      // Use the new register and check if the def is now dead.
      Instr.getOperand(PIdx + 1).setReg(NewVPR);
      Modified = true;
      if (MRI->use_empty(VPR)) {
        DeadInstructions.insert(Copy);
        if (MRI->hasOneUse(GPR))
          DeadInstructions.insert(MRI->getVRegDef(GPR));
      }
      LLVM_DEBUG(dbgs() << "Adding VPNot: " << *VPNot << "  to replace use at "
                        << Instr);
      VPR = NewVPR;
    }

    LastVPTImm = Imm;
    LastVPTReg = VPR;
  }

  for (MachineInstr *DI : DeadInstructions)
    DI->eraseFromParent();

  return Modified;
}

// In blocks containing a VCTP, turn every VPSEL that follows the first VCTP
// into a predicated VORR (i.e. a VMOVT). ISel lowers vselect to VPSEL, but
// VPSEL has slightly different semantics under tail predication; until that
// is modelled, this blunt conversion keeps such loops tail-predicable.
// Returns true if any VPSEL was converted.
bool MVETPAndVPTOptimisations::ConvertVPSEL(MachineBasicBlock &MBB) {
  // First pass: gather the VPSELs that appear after a VCTP in this block.
  SmallVector<MachineInstr *, 4> VPSELs;
  bool SeenVCTP = false;
  for (MachineInstr &MI : MBB.instrs()) {
    if (isVCTP(&MI))
      SeenVCTP = true;
    else if (SeenVCTP && MI.getOpcode() == ARM::MVE_VPSEL)
      VPSELs.push_back(&MI);
  }

  // Second pass: emit the replacement in front of each VPSEL.
  for (MachineInstr *VPSEL : VPSELs) {
    MachineInstrBuilder VMOVT =
        BuildMI(MBB, VPSEL, VPSEL->getDebugLoc(), TII->get(ARM::MVE_VORR));
    VMOVT.add(VPSEL->getOperand(0)); // Result.
    VMOVT.add(VPSEL->getOperand(1)); // VPSEL operand 1 is both VORR sources,
    VMOVT.add(VPSEL->getOperand(1)); // which makes the VORR a plain move.
    VMOVT.addImm(ARMVCC::Then);
    VMOVT.add(VPSEL->getOperand(4));
    VMOVT.add(VPSEL->getOperand(5));
    VMOVT.add(VPSEL->getOperand(2)); // VPSEL operand 2 goes last.
    // Silence unused variable warning in release builds.
    (void)VMOVT;
    LLVM_DEBUG(dbgs() << "Replacing VPSEL: "; VPSEL->dump();
               dbgs() << "     with VMOVT: "; VMOVT.getInstr()->dump());
  }

  // Finally drop the VPSELs that have been replaced.
  for (MachineInstr *VPSEL : VPSELs)
    VPSEL->eraseFromParent();

  return !VPSELs.empty();
}

// Give the register in operand 1 of every t2DoLoopStart in the block a
// register allocation hint towards LR, as the instruction may then be
// removable as a noop. Returns true if any hint was added.
bool MVETPAndVPTOptimisations::HintDoLoopStartReg(MachineBasicBlock &MBB) {
  MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
  bool AddedHint = false;

  for (MachineInstr &MI : MBB.instrs()) {
    if (MI.getOpcode() == ARM::t2DoLoopStart) {
      RegInfo.setRegAllocationHint(MI.getOperand(1).getReg(), ARMRI::RegLR, 0);
      AddedHint = true;
    }
  }

  return AddedHint;
}

// Pass entry point. Does nothing unless the subtarget is Thumb2 with LOB.
// Otherwise runs the loop-level transforms on every loop (in preorder) and
// then the per-block optimisations on every basic block. Returns true if
// anything was changed.
bool MVETPAndVPTOptimisations::runOnMachineFunction(MachineFunction &Fn) {
  const auto &Subtarget = Fn.getSubtarget<ARMSubtarget>();
  if (!(Subtarget.isThumb2() && Subtarget.hasLOB()))
    return false;

  // Cache the per-function state used by the helpers.
  TII = static_cast<const Thumb2InstrInfo *>(Subtarget.getInstrInfo());
  MRI = &Fn.getRegInfo();
  MachineLoopInfo &Loops = getAnalysis<MachineLoopInfo>();
  MachineDominatorTree &DomTree = getAnalysis<MachineDominatorTree>();

  LLVM_DEBUG(dbgs() << "********** ARM MVE VPT Optimisations **********\n"
                    << "********** Function: " << Fn.getName() << '\n');

  bool Changed = false;

  // Loop-level transforms. Each one always runs, whatever the others did.
  for (MachineLoop *Loop : Loops.getBase().getLoopsInPreorder()) {
    if (LowerWhileLoopStart(Loop))
      Changed = true;
    if (MergeLoopEnd(Loop))
      Changed = true;
    if (ConvertTailPredLoop(Loop, &DomTree))
      Changed = true;
  }

  // Block-level optimisations, applied in this fixed order.
  for (MachineBasicBlock &Block : Fn) {
    if (HintDoLoopStartReg(Block))
      Changed = true;
    if (ReplaceConstByVPNOTs(Block, &DomTree))
      Changed = true;
    if (ReplaceVCMPsByVPNOTs(Block))
      Changed = true;
    if (ReduceOldVCCRValueUses(Block))
      Changed = true;
    if (ConvertVPSEL(Block))
      Changed = true;
  }

  LLVM_DEBUG(dbgs() << "**************************************\n");
  return Changed;
}

/// Factory for the MVE tail-predication and VPT optimisations pass. Returns a
/// newly allocated pass instance.
FunctionPass *llvm::createMVETPAndVPTOptimisationsPass() {
  FunctionPass *Pass = new MVETPAndVPTOptimisations();
  return Pass;
}