[X86][AMX] Hoist ldtilecfg
The previous code inserted the first ldtilecfg at the point dominating all AMX registers' defs, which could place the ldtilecfg inside a loop. This patch instead tries to calculate the nearest points that are reachable from all shape definitions of the AMX registers, and inserts ldtilecfg there.

Reviewed By: LuoYuanke

Differential Revision: https://reviews.llvm.org/D99010
This commit is contained in: parent 6c0a1ed3a9 · commit 4cbaaf4a24
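To make the change easier to follow, here is a minimal standalone sketch of the reachability idea (plain C++, not the pass itself; the CFG, block numbering, and shape-def placement are invented for illustration). For every block it counts how many shape-defining blocks reach it without following loop back edges; only a block reached by all of them may host ldtilecfg, which is what keeps the configuration out of loops and ahead of every AMX instruction.

// Toy model of the insertion-point search in this patch (not LLVM code).
// Edges marked as back edges are skipped, mirroring how the pass refuses
// to propagate shape reachability around loops.
#include <cstdio>
#include <set>
#include <utility>
#include <vector>

int main() {
  // Hypothetical CFG: 0 -> 1 -> 2 -> 1 (back edge), 2 -> 3.
  std::vector<std::vector<int>> Succs = {{1}, {2}, {1, 3}, {}};
  std::set<std::pair<int, int>> BackEdges = {{2, 1}};
  std::vector<int> ShapeBBs = {0, 1}; // blocks containing shape defs

  // Reached[B] = how many shape-def blocks reach block B.
  std::vector<unsigned> Reached(Succs.size(), 0);
  for (int S : ShapeBBs) {
    std::set<int> Visited;
    std::vector<int> WorkList = {S};
    while (!WorkList.empty()) {
      int B = WorkList.back();
      WorkList.pop_back();
      ++Reached[B];
      for (int Succ : Succs[B])
        if (Visited.insert(Succ).second && !BackEdges.count({B, Succ}))
          WorkList.push_back(Succ);
    }
  }

  // A block may host ldtilecfg only if every shape block reaches it;
  // here that is blocks 1, 2 and 3 (block 0 precedes a shape def).
  for (size_t B = 0; B < Succs.size(); ++B)
    if (Reached[B] == ShapeBBs.size())
      std::printf("block %zu can host ldtilecfg\n", B);
}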
@@ -6,31 +6,20 @@
//
//===----------------------------------------------------------------------===//
//
/// \file Pass to pre-config the shape of AMX register
/// AMX register need to be configured before use. The shape of AMX register
/// is encoded in the 1st and 2nd machine operand of AMX pseudo instructions.
/// The pldtilecfg is to config tile registers. It should dominator all AMX
/// instructions. The pldtilecfg produce a virtual cfg register and the cfg
/// register is used by all AMX instructions.
/// This pass is to find the common dominator of all AMX instructions and
/// insert the pldtilecfg instruction. Besides the cfg register that pldtilecfg
/// produces is inserted as the last operand of each AMX instruction. We use
/// this scheme to model the def-use relationship between AMX config instruction
/// and other AMX instructions. Below is an example.
/// \file Pass to pre-config the shapes of AMX registers
/// AMX register needs to be configured before use. The shapes of AMX register
/// are encoded in the 1st and 2nd machine operand of AMX pseudo instructions.
///
///                  ----B1----
///                 /          \
///                /            \
///              B2              B3
///   %1:tile = PTILELOADDV   %2:tile = PTILELOADDV
/// The instruction ldtilecfg is used to config the shapes. It must be reachable
/// for all variable shapes. ldtilecfg will be inserted more than once if we
/// cannot find a dominating point for all AMX instructions.
///
/// is transformed to
/// The configure register is caller saved according to ABI. We need to insert
/// ldtilecfg again after the call instruction if callee clobbers any AMX
/// registers.
///
///                        B1
///            %25:tilecfg = PLDTILECFG
///                 /          \
///                /            \
///   %1:tile = PTILELOADDV %25   %2:tile = PTILELOADDV %25
/// This pass calculates all points that ldtilecfg need to be inserted to and
/// insert them. It reports error if the reachability conditions aren't met.
//
//===----------------------------------------------------------------------===//

@@ -38,32 +27,107 @@
#include "X86InstrBuilder.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TileShapeInfo.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "tile-pre-config"
#define ASSERT_VALID_COMPARE                                                   \
  assert((!MBB || !RHS.MBB || MBB == RHS.MBB) &&                               \
         "Cannot compare between different BBs");
#define REPORT_CONFIG_FAIL                                                     \
  report_fatal_error(                                                          \
      MF.getName() +                                                           \
      ": Failed to config tile register, please define the shape earlier");

namespace {

class X86PreTileConfig : public MachineFunctionPass {
  // context
  MachineFunction *MF = nullptr;
  const X86Subtarget *ST = nullptr;
  const TargetRegisterInfo *TRI;
  const TargetInstrInfo *TII;
  MachineDominatorTree *DomTree = nullptr;
  MachineRegisterInfo *MRI = nullptr;
struct MIRef {
  MachineInstr *MI = nullptr;
  MachineBasicBlock *MBB = nullptr;
  // A virtual position for instruction that will be inserted after MI.
  size_t Pos = 0;
  MIRef() = default;
  MIRef(MachineBasicBlock *MBB) : MBB(MBB) {
    for (auto I = MBB->begin(), E = MBB->end(); I != E && I->isPHI();
         ++I, ++Pos)
      MI = &*I;
  }
  MIRef(MachineInstr *MI, MachineBasicBlock *MBB)
      : MI(MI), MBB(MBB),
        Pos(std::distance(MBB->instr_begin(), ++MI->getIterator())) {}
  MIRef(MachineInstr *MI, MachineBasicBlock *MBB, size_t Pos)
      : MI(MI), MBB(MBB), Pos(Pos) {}
  operator bool() const { return MBB != nullptr; }
  bool operator==(const MIRef &RHS) const {
    return MI == RHS.MI && MBB == RHS.MBB;
  }
  bool operator<(const MIRef &RHS) const {
    ASSERT_VALID_COMPARE;
    return Pos < RHS.Pos;
  }
  bool operator>(const MIRef &RHS) const {
    ASSERT_VALID_COMPARE;
    return Pos > RHS.Pos;
  }
};

  MachineInstr *getTileConfigPoint();
struct BBInfo {
  MIRef FirstAMX;
  MIRef LastCall;
  MIRef LastShape;
  bool NeedTileCfgLiveIn = false;
  unsigned ShapeReachedCount = 0;
};

class X86PreTileConfig : public MachineFunctionPass {
  MachineRegisterInfo *MRI;
  const MachineLoopInfo *MLI;
  SmallSet<MachineInstr *, 8> DefVisited;
  SmallSet<MachineBasicBlock *, 8> ShapeBBs;
  DenseMap<MachineBasicBlock *, BBInfo> BBVisitedInfo;

  /// Check if the callee will clobber AMX registers.
  bool isDestructiveCall(MachineInstr &MI, BitVector UsableRegs) {
    auto Iter = llvm::find_if(
        MI.operands(), [](MachineOperand &MO) { return MO.isRegMask(); });
    if (Iter == MI.operands_end())
      return false;
    UsableRegs.clearBitsInMask(Iter->getRegMask());
    return !UsableRegs.none();
  }

  /// Check if MI is AMX pseudo instruction.
  bool isAMXInstruction(MachineInstr &MI) {
    if (MI.isPHI() || MI.isDebugInstr() || MI.getNumOperands() < 3)
      return false;
    MachineOperand &MO = MI.getOperand(0);
    // We can simply check if it is AMX instruction by its def.
    // But we should exclude old API which uses physical registers.
    if (MO.isReg() && MO.getReg().isVirtual() &&
        MRI->getRegClass(MO.getReg())->getID() == X86::TILERegClassID) {
      collectShapeInfo(MI);
      return true;
    }
    // PTILESTOREDV is the only exception that doesn't def a AMX register.
    return MI.getOpcode() == X86::PTILESTOREDV;
  }

  /// Check if it is an edge from loop bottom to loop head.
  bool isLoopBackEdge(MachineBasicBlock *Header, MachineBasicBlock *Bottom) {
    return MLI->isLoopHeader(Header) &&
           MLI->getLoopFor(Header)->getBottomBlock() == Bottom;
  }

  /// Collect the shape def information for later use.
  void collectShapeInfo(MachineInstr &MI);

public:
  X86PreTileConfig() : MachineFunctionPass(ID) {}
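A note on the isDestructiveCall check above: in a call's register-mask operand a set bit means the register is preserved across the call, so clearing the masked bits out of the AMX set leaves exactly the clobbered registers. Below is a standalone sketch of that logic (plain C++ over a hypothetical 8-register machine; this is not LLVM's BitVector nor the real regmask encoding):

#include <bitset>
#include <cstdio>

// A call "clobbers AMX" if at least one AMX register is NOT preserved by
// the callee, i.e. survives the clear below.
bool isDestructiveCall(std::bitset<8> AMXRegs, std::bitset<8> Preserved) {
  AMXRegs &= ~Preserved; // clearBitsInMask: drop registers the callee keeps
  return AMXRegs.any();  // anything left was clobbered by the call
}

int main() {
  std::bitset<8> AMXRegs("00001111"); // hypothetical TMM0-TMM3
  std::bitset<8> PreservesAll("11111111");
  std::bitset<8> PreservesNone("00000000");
  std::printf("%d %d\n", isDestructiveCall(AMXRegs, PreservesAll),  // 0
              isDestructiveCall(AMXRegs, PreservesNone));           // 1
}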
@@ -74,10 +138,21 @@ public:
  }

  /// X86PreTileConfig analysis usage.
  void getAnalysisUsage(AnalysisUsage &AU) const override;
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
    AU.addRequired<MachineLoopInfo>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  /// Perform register allocation.
  bool runOnMachineFunction(MachineFunction &mf) override;
  /// Clear MF related structures.
  void releaseMemory() override {
    ShapeBBs.clear();
    DefVisited.clear();
    BBVisitedInfo.clear();
  }

  /// Perform ldtilecfg instructions inserting.
  bool runOnMachineFunction(MachineFunction &MF) override;

  static char ID;
};

@@ -88,284 +163,199 @@ char X86PreTileConfig::ID = 0;

INITIALIZE_PASS_BEGIN(X86PreTileConfig, "tilepreconfig",
                      "Tile Register Pre-configure", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_END(X86PreTileConfig, "tilepreconfig",
                    "Tile Register Pre-configure", false, false)

void X86PreTileConfig::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.setPreservesAll();
  AU.addRequired<MachineDominatorTree>();
  MachineFunctionPass::getAnalysisUsage(AU);
}
void X86PreTileConfig::collectShapeInfo(MachineInstr &MI) {
  auto RecordShape = [&](MachineInstr *MI, MachineBasicBlock *MBB) {
    MIRef MIR(MI, MBB);
    if (BBVisitedInfo[MBB].LastShape < MIR)
      BBVisitedInfo[MBB].LastShape = MIR;
    ShapeBBs.insert(MBB);
  };

static void buildConfigMI(MachineBasicBlock::iterator MI, int FrameIdx,
                          const TargetInstrInfo *TII, MachineRegisterInfo *MRI,
                          const X86Subtarget *ST) {
  auto *MBB = MI->getParent();

  // Zero stack slot.
  if (ST->hasAVX512()) {
    Register Zmm = MRI->createVirtualRegister(&X86::VR512RegClass);
    BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VPXORDZrr), Zmm)
        .addReg(Zmm, RegState::Undef)
        .addReg(Zmm, RegState::Undef);
    addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VMOVUPSZmr)),
                      FrameIdx)
        .addReg(Zmm);
  } else if (ST->hasAVX2()) {
    Register Ymm = MRI->createVirtualRegister(&X86::VR256RegClass);
    BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VPXORYrr), Ymm)
        .addReg(Ymm, RegState::Undef)
        .addReg(Ymm, RegState::Undef);
    addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VMOVUPSYmr)),
                      FrameIdx)
        .addReg(Ymm);
    addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VMOVUPSYmr)),
                      FrameIdx, 32)
        .addReg(Ymm);
  } else {
    assert(ST->hasSSE2() && "AMX should assume SSE2 enabled");
    Register Xmm = MRI->createVirtualRegister(&X86::VR128RegClass);
    BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::PXORrr), Xmm)
        .addReg(Xmm, RegState::Undef)
        .addReg(Xmm, RegState::Undef);
    addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::MOVUPSmr)),
                      FrameIdx)
        .addReg(Xmm);
    addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::MOVUPSmr)),
                      FrameIdx, 16)
        .addReg(Xmm);
    addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::MOVUPSmr)),
                      FrameIdx, 32)
        .addReg(Xmm);
    addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::MOVUPSmr)),
                      FrameIdx, 48)
        .addReg(Xmm);
  }

  // build psuedo ldtilecfg
  addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::LDTILECFG)),
                    FrameIdx);
}

static ShapeT getShape(const MachineInstr &MI, MachineRegisterInfo *MRI) {
  unsigned Opcode = MI.getOpcode();
  switch (Opcode) {
  default:
    llvm_unreachable("Unexpected machine instruction on tile");
  case X86::PTILELOADDV:
  case X86::PTDPBSSDV:
  case X86::PTDPBSUDV:
  case X86::PTDPBUSDV:
  case X86::PTDPBUUDV:
  case X86::PTILEZEROV:
  case X86::PTDPBF16PSV:
    MachineOperand &MO1 = const_cast<MachineOperand &>(MI.getOperand(1));
    MachineOperand &MO2 = const_cast<MachineOperand &>(MI.getOperand(2));
    ShapeT Shape(&MO1, &MO2, MRI);
    return Shape;
  SmallVector<Register, 8> WorkList(
      {MI.getOperand(1).getReg(), MI.getOperand(2).getReg()});
  while (!WorkList.empty()) {
    Register R = WorkList.pop_back_val();
    MachineInstr *DefMI = MRI->getVRegDef(R);
    MachineBasicBlock *DefMBB = DefMI->getParent();
    if (!DefMI || DefMI->isMoveImmediate() || !DefVisited.insert(DefMI).second)
      continue;
    if (DefMI->isPHI()) {
      for (unsigned I = 1; I < DefMI->getNumOperands(); I += 2)
        if (isLoopBackEdge(DefMBB, DefMI->getOperand(I + 1).getMBB()))
          RecordShape(DefMI, DefMBB); // In this case, PHI is also a shape def.
        else
          WorkList.push_back(DefMI->getOperand(I).getReg());
    } else {
      RecordShape(DefMI, DefMBB);
    }
  }
}

MachineInstr *X86PreTileConfig::getTileConfigPoint() {
  DenseMap<Register, ShapeT> PhysShapeInfo;
  MachineBasicBlock *MBB = nullptr;
  DenseSet<const MachineInstr *> MIs;
  for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
    Register VirtReg = Register::index2VirtReg(i);
    if (MRI->reg_nodbg_empty(VirtReg))
      continue;
    const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
    if (RC.getID() != X86::TILERegClassID)
      continue;
bool X86PreTileConfig::runOnMachineFunction(MachineFunction &MF) {
  const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
  const TargetInstrInfo *TII = ST.getInstrInfo();
  const TargetRegisterInfo *TRI = ST.getRegisterInfo();
  const TargetRegisterClass *RC = TRI->getRegClass(X86::TILERegClassID);

    // Find the common dominator for all MI that define tile register.
    for (const MachineOperand &MO : MRI->def_operands(VirtReg)) {
      if (MO.isUndef())
        continue;
      const auto *MI = MO.getParent();
      // PHI or IMPLICIT_DEF instructiion.
      // There must be a input tile before PHI instruction.
      if (MI->isTransient())
        continue;
      if (!MBB)
        MBB = const_cast<MachineBasicBlock *>(MI->getParent());
      MBB = DomTree->findNearestCommonDominator(
          MBB, const_cast<MachineBasicBlock *>(MI->getParent()));
  BitVector AMXRegs(TRI->getNumRegs());
  for (unsigned I = 0; I < RC->getNumRegs(); I++)
    AMXRegs.set(X86::TMM0 + I);

      // Collect the instructions that define shape.
      ShapeT Shape = getShape(*MI, MRI);
      std::array<MachineOperand *, 2> ShapeMOs = {Shape.getRow(),
                                                  Shape.getCol()};
      for (auto *ShapeMO : ShapeMOs) {
        Register ShapeReg = ShapeMO->getReg();
        for (const MachineOperand &MO : MRI->def_operands(ShapeReg)) {
          const auto *ShapeMI = MO.getParent();
          MIs.insert(ShapeMI);
  // Iterate MF to collect information.
  MRI = &MF.getRegInfo();
  MLI = &getAnalysis<MachineLoopInfo>();
  SmallSet<MIRef, 8> CfgNeedInsert;
  SmallVector<MachineBasicBlock *, 8> CfgLiveInBBs;
  for (auto &MBB : MF) {
    size_t Pos = 0;
    for (auto &MI : MBB) {
      ++Pos;
      if (isAMXInstruction(MI)) {
        // If there's call before the AMX, we need to reload tile config.
        if (BBVisitedInfo[&MBB].LastCall)
          CfgNeedInsert.insert(BBVisitedInfo[&MBB].LastCall);
        else // Otherwise, we need tile config to live in this BB.
          BBVisitedInfo[&MBB].NeedTileCfgLiveIn = true;
        // Always record the first AMX in case there's shape def after it.
        if (!BBVisitedInfo[&MBB].FirstAMX)
          BBVisitedInfo[&MBB].FirstAMX = MIRef(&MI, &MBB, Pos);
      } else if (MI.isCall() && isDestructiveCall(MI, AMXRegs)) {
        // Record the call only if the callee clobbers all AMX registers.
        BBVisitedInfo[&MBB].LastCall = MIRef(&MI, &MBB, Pos);
      }
    }
    if (BBVisitedInfo[&MBB].NeedTileCfgLiveIn) {
      if (&MBB == &MF.front())
        CfgNeedInsert.insert(MIRef(&MBB));
      else
        CfgLiveInBBs.push_back(&MBB);
    }
  }

  // Update NeedTileCfgLiveIn for predecessors.
  while (!CfgLiveInBBs.empty()) {
    MachineBasicBlock *MBB = CfgLiveInBBs.pop_back_val();
    for (auto *Pred : MBB->predecessors()) {
      if (BBVisitedInfo[Pred].LastCall) {
        CfgNeedInsert.insert(BBVisitedInfo[Pred].LastCall);
      } else if (!BBVisitedInfo[Pred].NeedTileCfgLiveIn) {
        BBVisitedInfo[Pred].NeedTileCfgLiveIn = true;
        if (Pred == &MF.front())
          CfgNeedInsert.insert(MIRef(Pred));
        else
          CfgLiveInBBs.push_back(Pred);
      }
    }
  }

  // There's no AMX instruction if we didn't find a tile config live in point.
  if (CfgNeedInsert.empty())
    return false;

  // Calculate how many times the ShapeBB can reach to this BB.
  unsigned ShapeBBNum = 0;
  for (auto *MBB : ShapeBBs) {
    SmallSet<MachineBasicBlock *, 8> VistedBB;
    SmallVector<MachineBasicBlock *, 8> WorkList({MBB});
    while (!WorkList.empty()) {
      MachineBasicBlock *MBB = WorkList.pop_back_val();
      ++BBVisitedInfo[MBB].ShapeReachedCount;
      for (auto *Succ : MBB->successors())
        if (VistedBB.insert(Succ).second && !isLoopBackEdge(Succ, MBB))
          WorkList.push_back(Succ);
    }
    ++ShapeBBNum;
  }

  DebugLoc DL;
  SmallSet<MIRef, 8> VisitedOrInserted;
  int SS = MF.getFrameInfo().CreateStackObject(
      ST.getTileConfigSize(), ST.getTileConfigAlignment(), false);

  // Try to insert for the tile config live in points.
  for (auto I : CfgNeedInsert) {
    SmallSet<MIRef, 8> InsertPoints;
    SmallVector<MIRef, 8> WorkList({I});
    while (!WorkList.empty()) {
      MIRef I = WorkList.pop_back_val();
      if (!VisitedOrInserted.count(I)) {
        if (BBVisitedInfo[I.MBB].ShapeReachedCount == ShapeBBNum) {
          // If the BB is all shapes reachable, stop sink and try to insert.
          InsertPoints.insert(I);
        } else {
          // Avoid the BB to be multi visited.
          VisitedOrInserted.insert(I);
          // We cannot sink it across any AMX instruction.
          if (BBVisitedInfo[I.MBB].FirstAMX)
            REPORT_CONFIG_FAIL;
          // Sink the inserting point along the chain with NeedTileCfgLiveIn =
          // true when MBB isn't all shapes reachable.
          for (auto *Succ : I.MBB->successors())
            if (BBVisitedInfo[Succ].NeedTileCfgLiveIn)
              WorkList.push_back(MIRef(Succ));
        }
      }
    }
  }
  if (!MBB)
    return nullptr;
  // This pass is before the pass of eliminating PHI node, so it
  // is in SSA form.
  assert(MRI->isSSA() && "Not SSA form in pre-tile config");
  // Shape def should dominate tile config MBB.
  //    def s      s1    s2
  //     / \        \   /
  //    /   \        \ /
  //  conf        s3=phi(s1,s2)
  //                   |
  //                   c
  //
  for (const auto *MI : MIs) {
    const MachineBasicBlock *ShapeMBB = MI->getParent();
    if (DomTree->dominates(ShapeMBB, MBB))
      continue;
    if (MI->isMoveImmediate())
      continue;
    report_fatal_error(MF->getName() + ": Failed to config tile register, "
                                       "please define the shape earlier");
  }

  // ldtilecfg should be inserted after the MI that define the shape.
  MachineBasicBlock::reverse_instr_iterator I, E;
  for (I = MBB->instr_rbegin(), E = MBB->instr_rend(); I != E; ++I) {
    auto *MI = &*I;
    if (MIs.count(MI) && (!MI->isMoveImmediate()))
      break;
  }
  MachineBasicBlock::iterator MII;
  if (I == E)
    MII = MBB->getFirstNonPHI();
  else {
    MII = MachineBasicBlock::iterator(&*I);
    MII++;
  }
  return &*MII;
}

static bool isAMXInstruction(MachineBasicBlock::iterator MII) {
  switch (MII->getOpcode()) {
  default:
    return false;
  case X86::PTILELOADDV:
  case X86::PTILESTOREDV:
  case X86::PTDPBSSDV:
  case X86::PTDPBSUDV:
  case X86::PTDPBUSDV:
  case X86::PTDPBUUDV:
  case X86::PTILEZEROV:
  case X86::PTDPBF16PSV:
    return true;
  }
}

struct BBInfo {
  bool HasAMX = false;
  bool HasCallBeforeAMX = false;
  bool HasAMXBeforeCallInSuccs = false;
  MachineInstr *LastCall = nullptr;

  BBInfo() = default;
  BBInfo(SmallSet<MachineInstr *, 8> &CfgNeedInsert, MachineBasicBlock *MBB,
         MachineInstr *MI = nullptr) {
    MachineBasicBlock::iterator MII = MI ? MI->getIterator() : MBB->begin();
    for (auto E = MBB->end(); MII != E; ++MII) {
      if (isAMXInstruction(MII)) {
        HasAMX = true;
        if (LastCall)
          CfgNeedInsert.insert(LastCall);
      } else if (MII->isCall()) {
        LastCall = &*MII;
        if (!HasAMX)
          HasCallBeforeAMX = true;
      }
    }
  }
};

static void reloadTileConfig(MachineInstr *MI, int FI,
                             const TargetInstrInfo *TII,
                             const TargetRegisterInfo *TRI) {
  SmallSet<MachineInstr *, 8> CfgNeedInsert;
  SmallVector<MachineBasicBlock *, 8> WorkList;
  DenseMap<MachineBasicBlock *, BBInfo> BBVisitedInfo;

  MachineBasicBlock *MBB = MI->getParent();
  BBVisitedInfo[MBB] = BBInfo(CfgNeedInsert, MBB, MI);

  // The entry BB is special, since it always has a ldtilecfg before AMX
  // instruction. We don't need to check if its predecessor BBs have call.
  // FIXME: This case happens only when the entry BB is in a loop. We need to
  // hoist the first tile config point out of the loop in future.
  BBVisitedInfo[MBB].HasCallBeforeAMX = true;

  WorkList.push_back(MBB);
  while (!WorkList.empty()) {
    MBB = WorkList.pop_back_val();
    for (auto I = MBB->succ_begin(), E = MBB->succ_end(); I != E; ++I) {
      if (!BBVisitedInfo.count(*I)) {
        BBVisitedInfo[*I] = BBInfo(CfgNeedInsert, *I);
        WorkList.push_back(*I);
  // A given point might be forked due to shape conditions are not met.
  for (MIRef I : InsertPoints) {
    // Even MBB is all shapes reachable, we still need to check if there's
    // AMX that intersects with shapes in the same MBB.
    if (BBVisitedInfo[I.MBB].FirstAMX &&
        BBVisitedInfo[I.MBB].FirstAMX < BBVisitedInfo[I.MBB].LastShape)
      REPORT_CONFIG_FAIL;
    // Make sure we insert ldtilecfg after the last shape def in MBB.
    if (I < BBVisitedInfo[I.MBB].LastShape)
      I = BBVisitedInfo[I.MBB].LastShape;
    // There're chances the MBB is sunk more than once. Record it to avoid
    // multi insert.
    if (VisitedOrInserted.insert(I).second) {
      auto II = I.MI ? I.MI->getIterator() : I.MBB->instr_begin();
      addFrameReference(BuildMI(*I.MBB, ++II, DL, TII->get(X86::LDTILECFG)),
                        SS);
    }
  }
}

  WorkList.clear();
  for (auto I : BBVisitedInfo) {
    WorkList.push_back(I.first);
    while (!WorkList.empty()) {
      MBB = WorkList.pop_back_val();
      if (BBVisitedInfo[MBB].HasCallBeforeAMX ||
          (!BBVisitedInfo[MBB].HasAMX &&
           !BBVisitedInfo[MBB].HasAMXBeforeCallInSuccs))
        continue;
      for (auto I = MBB->pred_begin(), E = MBB->pred_end(); I != E; ++I) {
        if (!BBVisitedInfo.count(*I) ||
            BBVisitedInfo[*I].HasAMXBeforeCallInSuccs)
          continue;
        if (BBVisitedInfo[*I].LastCall)
          CfgNeedInsert.insert(BBVisitedInfo[*I].LastCall);
        BBVisitedInfo[*I].HasAMXBeforeCallInSuccs = true;
        WorkList.push_back(*I);
      }
    }
  // Zero stack slot.
  MachineBasicBlock &MBB = MF.front();
  MachineInstr *MI = &*MBB.begin();
  if (ST.hasAVX512()) {
    Register Zmm = MRI->createVirtualRegister(&X86::VR512RegClass);
    BuildMI(MBB, MI, DL, TII->get(X86::VPXORDZrr), Zmm)
        .addReg(Zmm, RegState::Undef)
        .addReg(Zmm, RegState::Undef);
    addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::VMOVUPSZmr)), SS)
        .addReg(Zmm);
  } else if (ST.hasAVX2()) {
    Register Ymm = MRI->createVirtualRegister(&X86::VR256RegClass);
    BuildMI(MBB, MI, DL, TII->get(X86::VPXORYrr), Ymm)
        .addReg(Ymm, RegState::Undef)
        .addReg(Ymm, RegState::Undef);
    addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::VMOVUPSYmr)), SS)
        .addReg(Ymm);
    addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::VMOVUPSYmr)), SS, 32)
        .addReg(Ymm);
  } else {
    assert(ST.hasSSE2() && "AMX should assume SSE2 enabled");
    Register Xmm = MRI->createVirtualRegister(&X86::VR128RegClass);
    BuildMI(MBB, MI, DL, TII->get(X86::PXORrr), Xmm)
        .addReg(Xmm, RegState::Undef)
        .addReg(Xmm, RegState::Undef);
    addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOVUPSmr)), SS)
        .addReg(Xmm);
    addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOVUPSmr)), SS, 16)
        .addReg(Xmm);
    addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOVUPSmr)), SS, 32)
        .addReg(Xmm);
    addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOVUPSmr)), SS, 48)
        .addReg(Xmm);
  }

  for (auto *I : CfgNeedInsert) {
    BitVector UsableRegs(TRI->getNumRegs());
    const TargetRegisterClass *RC = TRI->getRegClass(X86::TILERegClassID);
    for (unsigned J = 0; J < RC->getNumRegs(); J++)
      UsableRegs.set(X86::TMM0 + J);
    for (MachineOperand &CallMO : I->operands()) {
      if (CallMO.isRegMask())
        UsableRegs.clearBitsInMask(CallMO.getRegMask());
    }
    if (!UsableRegs.none())
      addFrameReference(BuildMI(*I->getParent(), ++I->getIterator(), DebugLoc(),
                                TII->get(X86::LDTILECFG)),
                        FI);
  }
}

bool X86PreTileConfig::runOnMachineFunction(MachineFunction &mf) {
  MF = &mf;
  MRI = &mf.getRegInfo();
  ST = &mf.getSubtarget<X86Subtarget>();
  TRI = ST->getRegisterInfo();
  TII = mf.getSubtarget().getInstrInfo();
  DomTree = &getAnalysis<MachineDominatorTree>();

  MachineInstr *MI = getTileConfigPoint();
  if (!MI)
    return false;
  unsigned Size = ST->getTileConfigSize();
  Align Alignment = ST->getTileConfigAlignment();
  int SS = mf.getFrameInfo().CreateStackObject(Size, Alignment, false);
  buildConfigMI(MI, SS, TII, MRI, ST);
  reloadTileConfig(MI, SS, TII, TRI);
  return true;
}
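The movb $1 / movb / movw stores in the updated CHECK lines below fill the 64-byte buffer that ldtilecfg reads, which is why the pass zeroes the stack slot first. For reference, here is a sketch of that buffer as a C++ struct (field names are mine; the offsets follow the tile-configuration layout in Intel's documentation, matching $1 for the palette and the byte/word stores for tmm0's rows and column bytes):

#include <cstdint>

// 64-byte memory operand of LDTILECFG (Intel SDM tile-config layout).
struct TileConfig {
  uint8_t PaletteId;    // offset 0: the 'movb $1' in the tests
  uint8_t StartRow;     // offset 1: resume row for interrupted tile ops
  uint8_t Reserved[14]; // offsets 2-15: must be zero (hence the zeroing)
  uint16_t ColsB[16];   // offsets 16-47: bytes per row of tmm0..tmm15
  uint8_t Rows[16];     // offsets 48-63: rows of tmm0..tmm15
};
static_assert(sizeof(TileConfig) == 64, "ldtilecfg operand is 64 bytes");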
@@ -113,9 +113,10 @@ define dso_local i32 @test_loop(i32 %0) nounwind {
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    subq $3016, %rsp # imm = 0xBC8
; CHECK-NEXT:    movl %edi, %r14d
; CHECK-NEXT:    callq foo
; CHECK-NEXT:    vpxord %zmm0, %zmm0, %zmm0
; CHECK-NEXT:    vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    callq foo
; CHECK-NEXT:    movb $1, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
@@ -133,7 +134,6 @@ define dso_local i32 @test_loop(i32 %0) nounwind {
; CHECK-NEXT:    tileloadd (%r15,%r12), %tmm0
; CHECK-NEXT:    movabsq $64, %rax
; CHECK-NEXT:    tilestored %tmm0, 1024(%rsp,%rax) # 1024-byte Folded Spill
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    callq foo
; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movabsq $64, %rax
@@ -154,7 +154,6 @@ define dso_local i32 @test_loop(i32 %0) nounwind {
; CHECK-NEXT:    incl %r14d
; CHECK-NEXT:    jmp .LBB2_8
; CHECK-NEXT:  .LBB2_4:
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    callq foo
; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movl $32, %eax
@@ -180,13 +179,13 @@ define dso_local i32 @test_loop(i32 %0) nounwind {
; IPRA:       # %bb.0:
; IPRA-NEXT:    subq $72, %rsp
; IPRA-NEXT:    movl %edi, %eax
; IPRA-NEXT:    callq foo
; IPRA-NEXT:    vpxord %zmm0, %zmm0, %zmm0
; IPRA-NEXT:    vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
; IPRA-NEXT:    movb $1, {{[0-9]+}}(%rsp)
; IPRA-NEXT:    movb $8, {{[0-9]+}}(%rsp)
; IPRA-NEXT:    movw $8, {{[0-9]+}}(%rsp)
; IPRA-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; IPRA-NEXT:    callq foo
; IPRA-NEXT:    testl %edi, %edi
; IPRA-NEXT:    jg .LBB2_4
; IPRA-NEXT:  # %bb.1: # %.preheader
@@ -273,26 +272,26 @@ define dso_local void @test_loop2(i32 %0) nounwind {
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    subq $3024, %rsp # imm = 0xBD0
; CHECK-NEXT:    movl %edi, %ebx
; CHECK-NEXT:    vpxord %zmm0, %zmm0, %zmm0
; CHECK-NEXT:    vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movl $buf, %r14d
; CHECK-NEXT:    movl $32, %r15d
; CHECK-NEXT:    movw $8, %bp
; CHECK-NEXT:    movl $buf+2048, %r12d
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB3_1: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    callq foo
; CHECK-NEXT:    testl %ebx, %ebx
; CHECK-NEXT:    jle .LBB3_3
; CHECK-NEXT:  # %bb.2: # in Loop: Header=BB3_1 Depth=1
; CHECK-NEXT:    vpxord %zmm0, %zmm0, %zmm0
; CHECK-NEXT:    vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $1, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT:    testl %ebx, %ebx
; CHECK-NEXT:    jle .LBB3_3
; CHECK-NEXT:  # %bb.2: # in Loop: Header=BB3_1 Depth=1
; CHECK-NEXT:    tileloadd (%r14,%r15), %tmm0
; CHECK-NEXT:    movabsq $64, %rax
; CHECK-NEXT:    tilestored %tmm0, 1024(%rsp,%rax) # 1024-byte Folded Spill
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    callq foo
; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movabsq $64, %rax
@@ -313,6 +312,12 @@ define dso_local void @test_loop2(i32 %0) nounwind {
; IPRA-LABEL: test_loop2:
; IPRA:       # %bb.0:
; IPRA-NEXT:    subq $72, %rsp
; IPRA-NEXT:    vpxord %zmm0, %zmm0, %zmm0
; IPRA-NEXT:    vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
; IPRA-NEXT:    movb $1, {{[0-9]+}}(%rsp)
; IPRA-NEXT:    movb $8, {{[0-9]+}}(%rsp)
; IPRA-NEXT:    movw $8, {{[0-9]+}}(%rsp)
; IPRA-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; IPRA-NEXT:    movl $buf, %eax
; IPRA-NEXT:    movl $32, %ecx
; IPRA-NEXT:    movw $8, %dx
@@ -323,12 +328,6 @@ define dso_local void @test_loop2(i32 %0) nounwind {
; IPRA-NEXT:    testl %edi, %edi
; IPRA-NEXT:    jle .LBB3_3
; IPRA-NEXT:  # %bb.2: # in Loop: Header=BB3_1 Depth=1
; IPRA-NEXT:    vpxord %zmm0, %zmm0, %zmm0
; IPRA-NEXT:    vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
; IPRA-NEXT:    movb $1, {{[0-9]+}}(%rsp)
; IPRA-NEXT:    movb $8, {{[0-9]+}}(%rsp)
; IPRA-NEXT:    movw $8, {{[0-9]+}}(%rsp)
; IPRA-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; IPRA-NEXT:    tileloadd (%rax,%rcx), %tmm0
; IPRA-NEXT:    callq foo
; IPRA-NEXT:    tilestored %tmm0, (%rsi,%rcx)

@@ -10,10 +10,10 @@
define dso_local void @test_api(i32 %0, i16 signext %1, i16 signext %2) {
; AVX512-LABEL: test_api:
; AVX512:       # %bb.0:
; AVX512-NEXT:    testl %edi, %edi
; AVX512-NEXT:    movsbl %sil, %eax
; AVX512-NEXT:    vpxord %zmm0, %zmm0, %zmm0
; AVX512-NEXT:    vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
; AVX512-NEXT:    testl %edi, %edi
; AVX512-NEXT:    movsbl %sil, %eax
; AVX512-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
; AVX512-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
; AVX512-NEXT:    movw %si, -{{[0-9]+}}(%rsp)
@@ -43,11 +43,11 @@ define dso_local void @test_api(i32 %0, i16 signext %1, i16 signext %2) {
;
; AVX2-LABEL: test_api:
; AVX2:       # %bb.0:
; AVX2-NEXT:    testl %edi, %edi
; AVX2-NEXT:    movsbl %sil, %eax
; AVX2-NEXT:    vxorps %ymm0, %ymm0, %ymm0
; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX2-NEXT:    testl %edi, %edi
; AVX2-NEXT:    movsbl %sil, %eax
; AVX2-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
; AVX2-NEXT:    movw %si, -{{[0-9]+}}(%rsp)
@@ -77,13 +77,13 @@ define dso_local void @test_api(i32 %0, i16 signext %1, i16 signext %2) {
;
; SSE2-LABEL: test_api:
; SSE2:       # %bb.0:
; SSE2-NEXT:    testl %edi, %edi
; SSE2-NEXT:    movsbl %sil, %eax
; SSE2-NEXT:    xorps %xmm0, %xmm0
; SSE2-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    testl %edi, %edi
; SSE2-NEXT:    movsbl %sil, %eax
; SSE2-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movw %si, -{{[0-9]+}}(%rsp)

@@ -32,6 +32,7 @@ define dso_local void @test1(i16 signext %0, i16 signext %1) nounwind {
  %4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32)
  %5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32)
  %6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %5, x86_amx %3, x86_amx %4)
  call void @llvm.dbg.value(metadata x86_amx %6, metadata !DILocalVariable(name: "1", scope: !2), metadata !DIExpression()), !dbg !3
  tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %6)
  tail call void @foo()
  ret void
@@ -47,6 +48,8 @@ define dso_local void @test2(i16 signext %0, i16 signext %1) nounwind {
; CHECK-NEXT:    movl %edi, %ebp
; CHECK-NEXT:    vpxord %zmm0, %zmm0, %zmm0
; CHECK-NEXT:    vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    callq foo
; CHECK-NEXT:    movb $1, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw %bx, {{[0-9]+}}(%rsp)
@@ -59,9 +62,6 @@ define dso_local void @test2(i16 signext %0, i16 signext %1) nounwind {
; CHECK-NEXT:    movb %bpl, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    callq foo
; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    testb %al, %al
; CHECK-NEXT:    jne .LBB1_3
@@ -116,8 +116,252 @@ exit:
  ret void
}

define dso_local void @test3(i16 signext %0, i16 signext %1) nounwind {
; CHECK-LABEL: test3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpxord %zmm0, %zmm0, %zmm0
; CHECK-NEXT:    vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    testb %al, %al
; CHECK-NEXT:    jne .LBB2_2
; CHECK-NEXT:  # %bb.1: # %if.true
; CHECK-NEXT:    incl %edi
; CHECK-NEXT:    jmp .LBB2_3
; CHECK-NEXT:  .LBB2_2: # %if.false
; CHECK-NEXT:    decl %edi
; CHECK-NEXT:  .LBB2_3: # %exit
; CHECK-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw %si, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    tilezero %tmm0
; CHECK-NEXT:    movl $buf, %eax
; CHECK-NEXT:    movl $32, %ecx
; CHECK-NEXT:    tilestored %tmm0, (%rax,%rcx)
; CHECK-NEXT:    tilerelease
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  br i1 undef, label %if.true, label %if.false

if.true:
  %3 = add i16 %0, 1
  br label %exit

if.false:
  %4 = sub i16 %0, 1
  br label %exit

exit:
  %5 = phi i16 [ %3, %if.true ], [ %4, %if.false ]
  %6 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %5, i16 %1)
  tail call void @llvm.x86.tilestored64.internal(i16 %5, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %6)
  ret void
}

; TODO: There's PRA Tile Register Configure bug needs to fix later.
define dso_local void @test4(i16 signext %0, i16 signext %1) nounwind {
; CHECK-LABEL: test4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpxord %zmm0, %zmm0, %zmm0
; CHECK-NEXT:    vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    testb %al, %al
; CHECK-NEXT:    jne .LBB3_3
; CHECK-NEXT:  # %bb.1: # %if.true
; CHECK-NEXT:    incl %edi
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    testb %al, %al
; CHECK-NEXT:    jne .LBB3_4
; CHECK-NEXT:  .LBB3_2: # %amx2
; CHECK-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw %si, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movl $32, %eax
; CHECK-NEXT:    movl $buf+1024, %ecx
; CHECK-NEXT:    tileloadd (%rcx,%rax), %tmm0
; CHECK-NEXT:    movl $buf, %ecx
; CHECK-NEXT:    tilestored %tmm0, (%rcx,%rax)
; CHECK-NEXT:    tilerelease
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
; CHECK-NEXT:  .LBB3_3: # %if.false
; CHECK-NEXT:    decl %edi
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    testb %al, %al
; CHECK-NEXT:    jne .LBB3_2
; CHECK-NEXT:  .LBB3_4: # %amx1
; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    tilezero %tmm0
; CHECK-NEXT:    movl $buf, %eax
; CHECK-NEXT:    movl $32, %ecx
; CHECK-NEXT:    tilestored %tmm0, (%rax,%rcx)
; CHECK-NEXT:    tilerelease
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  br i1 undef, label %if.true, label %if.false

if.true:
  %3 = add i16 %0, 1
  br i1 undef, label %amx1, label %amx2

if.false:
  %4 = sub i16 %0, 1
  br i1 undef, label %amx2, label %amx1

amx1:
  %5 = phi i16 [ %3, %if.true ], [ %4, %if.false ]
  %6 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %5, i16 %1)
  tail call void @llvm.x86.tilestored64.internal(i16 %5, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %6)
  br label %exit

amx2:
  %7 = phi i16 [ %3, %if.true ], [ %4, %if.false ]
  %8 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %7, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32)
  tail call void @llvm.x86.tilestored64.internal(i16 %7, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %8)
  br label %exit

exit:
  ret void
}

define dso_local void @test5(i16 signext %0, i16 signext %1) nounwind {
; CHECK-LABEL: test5:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    # kill: def $esi killed $esi def $rsi
; CHECK-NEXT:    vpxord %zmm0, %zmm0, %zmm0
; CHECK-NEXT:    vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    movl $buf, %r8d
; CHECK-NEXT:    movl $32, %edx
; CHECK-NEXT:    leal -1(%rsi), %ecx
; CHECK-NEXT:    jmp .LBB4_1
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB4_3: # %if.false
; CHECK-NEXT:    # in Loop: Header=BB4_1 Depth=1
; CHECK-NEXT:    movl %ecx, %esi
; CHECK-NEXT:    cmpw $7, %si
; CHECK-NEXT:    jne .LBB4_5
; CHECK-NEXT:  .LBB4_1: # %loop.bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw %si, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    testb %al, %al
; CHECK-NEXT:    jne .LBB4_3
; CHECK-NEXT:  # %bb.2: # %if.true
; CHECK-NEXT:    # in Loop: Header=BB4_1 Depth=1
; CHECK-NEXT:    tilezero %tmm0
; CHECK-NEXT:    tilestored %tmm0, (%r8,%rdx)
; CHECK-NEXT:    cmpw $7, %si
; CHECK-NEXT:    je .LBB4_1
; CHECK-NEXT:  .LBB4_5: # %exit
; CHECK-NEXT:    tilerelease
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
entry:
  br label %loop.bb1

loop.bb1:
  %2 = phi i16 [ %1, %entry ], [ %5, %loop.bb2 ]
  br i1 undef, label %if.true, label %if.false

if.true:
  %3 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %0, i16 %2)
  tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %2, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %3)
  br label %loop.bb2

if.false:
  %4 = sub i16 %1, 1
  br label %loop.bb2

loop.bb2:
  %5 = phi i16 [ %2, %if.true ], [ %4, %if.false ]
  %6 = icmp eq i16 %5, 7
  br i1 %6, label %loop.bb1, label %exit

exit:
  ret void
}

define dso_local void @test6(i16 signext %0) nounwind {
; CHECK-LABEL: test6:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    vpxord %zmm0, %zmm0, %zmm0
; CHECK-NEXT:    vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    xorl %r8d, %r8d
; CHECK-NEXT:    movl $buf, %ecx
; CHECK-NEXT:    movl $32, %edx
; CHECK-NEXT:    xorl %esi, %esi
; CHECK-NEXT:    jmp .LBB5_1
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB5_3: # %if.false
; CHECK-NEXT:    # in Loop: Header=BB5_1 Depth=1
; CHECK-NEXT:    decl %esi
; CHECK-NEXT:  .LBB5_4: # %loop.bb2
; CHECK-NEXT:    # in Loop: Header=BB5_1 Depth=1
; CHECK-NEXT:    leal (%rdi,%rsi), %eax
; CHECK-NEXT:    cmpw $7, %si
; CHECK-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    tilezero %tmm0
; CHECK-NEXT:    tilestored %tmm0, (%rcx,%rdx)
; CHECK-NEXT:    jne .LBB5_5
; CHECK-NEXT:  .LBB5_1: # %loop.bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    testb %r8b, %r8b
; CHECK-NEXT:    jne .LBB5_3
; CHECK-NEXT:  # %bb.2: # %if.true
; CHECK-NEXT:    # in Loop: Header=BB5_1 Depth=1
; CHECK-NEXT:    incl %esi
; CHECK-NEXT:    jmp .LBB5_4
; CHECK-NEXT:  .LBB5_5: # %exit
; CHECK-NEXT:    tilerelease
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
entry:
  br label %loop.bb1

loop.bb1:
  %1 = phi i16 [ 0, %entry ], [ %4, %loop.bb2 ]
  br i1 undef, label %if.true, label %if.false

if.true:
  %2 = add i16 %1, 1
  br label %loop.bb2

if.false:
  %3 = sub i16 %1, 1
  br label %loop.bb2

loop.bb2:
  %4 = phi i16 [ %2, %if.true ], [ %3, %if.false ]
  %5 = icmp eq i16 %4, 7
  %6 = add i16 %0, %4
  %7 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %0, i16 %6)
  tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %6, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %7)
  br i1 %5, label %loop.bb1, label %exit

exit:
  ret void
}

declare dso_local void @foo() nounwind
declare void @llvm.dbg.value(metadata, metadata, metadata)
declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)

!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!1}

!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !DIFile(filename: "1", directory: "1"))
!1 = !{i32 2, !"Debug Info Version", i32 3}
!2 = distinct !DISubprogram(unit: !0)
!3 = !DILocation(line: 1, column: 1, scope: !2)

@@ -117,12 +117,12 @@
; CHECK-NEXT:       X86 EFLAGS copy lowering
; CHECK-NEXT:       X86 WinAlloca Expander
; CHECK-NEXT:       MachineDominator Tree Construction
; CHECK-NEXT:       Machine Natural Loop Construction
; CHECK-NEXT:       Tile Register Pre-configure
; CHECK-NEXT:       Detect Dead Lanes
; CHECK-NEXT:       Process Implicit Definitions
; CHECK-NEXT:       Remove unreachable machine basic blocks
; CHECK-NEXT:       Live Variable Analysis
; CHECK-NEXT:       Machine Natural Loop Construction
; CHECK-NEXT:       Eliminate PHI nodes for register allocation
; CHECK-NEXT:       Two-Address instruction pass
; CHECK-NEXT:       Slot index numbering