[LoopInterchange] New cost model for loop interchange

This patch proposed to use a new cost model for loop interchange, which is obtained from loop cache analysis. Given a loopnest, what loop cache analysis returns is a vector of loops [loop0, loop1, loop2, ...] where loop0 should be replaced as the outermost loop, loop1 should be placed one more level inside, and loop2 one more level inside, etc. What loop cache analysis does is not only more comprehensive than the current cost model, it is also a "one-shot" query which means that we only need to query it once during the entire loop interchange pass, which is better than the current cost model where we query it every time we check whether it is profitable to interchange two loops. Thus complexity is reduced, especially after D120386 where we do more interchanges to get the globally optimal loop access pattern. Updates made to test cases are mostly minor changes and some corrections. Test coverage for loop interchange is not reduced. Currently we did not completely remove the legacy cost model, but keep it as fall-back in case the new cost model did not run successfully. This is because currently we have some limitations in delinearization, which sometimes makes loop cache analysis bail out. The longer term goal is to enhance delinearization and eventually remove the legacy cost model compeletely. Reviewed By: bmahjour, #loopoptwg Differential Revision: https://reviews.llvm.org/D124926
2022-06-02 17:53:13 -04:00 · 2022-06-02 17:53:13 -04:00 · 006334470d
parent 4ad17d2e96
commit 006334470d
35 changed files with 301 additions and 169 deletions
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@ -18,6 +18,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/DependenceAnalysis.h"
+#include "llvm/Analysis/LoopCacheAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopNestAnalysis.h"
 #include "llvm/Analysis/LoopPass.h"
@ -358,8 +359,10 @@ public:
      : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}

  /// Check if the loop interchange is profitable.
-  bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId,
-                    CharMatrix &DepMatrix);
+  bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop,
+                    unsigned InnerLoopId, unsigned OuterLoopId,
+                    CharMatrix &DepMatrix,
+                    const DenseMap<const Loop *, unsigned> &CostMap);

 private:
  int getInstrOrderCost();
@ -410,13 +413,15 @@ struct LoopInterchange {
  LoopInfo *LI = nullptr;
  DependenceInfo *DI = nullptr;
  DominatorTree *DT = nullptr;
+  std::unique_ptr<CacheCost> CC = nullptr;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  LoopInterchange(ScalarEvolution *SE, LoopInfo *LI, DependenceInfo *DI,
-                  DominatorTree *DT, OptimizationRemarkEmitter *ORE)
-      : SE(SE), LI(LI), DI(DI), DT(DT), ORE(ORE) {}
+                  DominatorTree *DT, std::unique_ptr<CacheCost> &CC,
+                  OptimizationRemarkEmitter *ORE)
+      : SE(SE), LI(LI), DI(DI), DT(DT), CC(std::move(CC)), ORE(ORE) {}

  bool run(Loop *L) {
    if (L->getParentLoop())
@ -499,6 +504,21 @@ struct LoopInterchange {
    }

    unsigned SelecLoopId = selectLoopForInterchange(LoopList);
+    // Obtain the loop vector returned from loop cache analysis beforehand,
+    // and put each <Loop, index> pair into a map for constant time query
+    // later. Indices in loop vector reprsent the optimal order of the
+    // corresponding loop, e.g., given a loopnest with depth N, index 0
+    // indicates the loop should be placed as the outermost loop and index N
+    // indicates the loop should be placed as the innermost loop.
+    //
+    // For the old pass manager CacheCost would be null.
+    DenseMap<const Loop *, unsigned> CostMap;
+    if (CC != nullptr) {
+      const auto &LoopCosts = CC->getLoopCosts();
+      for (unsigned i = 0; i < LoopCosts.size(); i++) {
+        CostMap[LoopCosts[i].first] = i;
+      }
+    }
    // We try to achieve the globally optimal memory access for the loopnest,
    // and do interchange based on a bubble-sort fasion. We start from
    // the innermost loop, move it outwards to the best possible position
@ -507,7 +527,7 @@ struct LoopInterchange {
      bool ChangedPerIter = false;
      for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) {
        bool Interchanged = processLoop(LoopList[i], LoopList[i - 1], i, i - 1,
-                                        DependencyMatrix);
+                                        DependencyMatrix, CostMap);
        if (!Interchanged)
          continue;
        // Loops interchanged, update LoopList accordingly.
@ -531,7 +551,8 @@ struct LoopInterchange {

  bool processLoop(Loop *InnerLoop, Loop *OuterLoop, unsigned InnerLoopId,
                   unsigned OuterLoopId,
-                   std::vector<std::vector<char>> &DependencyMatrix) {
+                   std::vector<std::vector<char>> &DependencyMatrix,
+                   const DenseMap<const Loop *, unsigned> &CostMap) {
    LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId
                      << " and OuterLoopId = " << OuterLoopId << "\n");
    LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE);
@ -541,7 +562,8 @@ struct LoopInterchange {
    }
    LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n");
    LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
-    if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) {
+    if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,
+                          DependencyMatrix, CostMap)) {
      LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n");
      return false;
    }
@ -1135,21 +1157,33 @@ static bool isProfitableForVectorization(unsigned InnerLoopId,
  return !DepMatrix.empty();
 }

-bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId,
-                                                unsigned OuterLoopId,
-                                                CharMatrix &DepMatrix) {
-  // TODO: Add better profitability checks.
-  // e.g
-  // 1) Construct dependency matrix and move the one with no loop carried dep
-  //    inside to enable vectorization.
+bool LoopInterchangeProfitability::isProfitable(
+    const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
+    unsigned OuterLoopId, CharMatrix &DepMatrix,
+    const DenseMap<const Loop *, unsigned> &CostMap) {
+  // TODO: Remove the legacy cost model.

-  // This is rough cost estimation algorithm. It counts the good and bad order
-  // of induction variables in the instruction and allows reordering if number
-  // of bad orders is more than good.
-  int Cost = getInstrOrderCost();
-  LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
-  if (Cost < -LoopInterchangeCostThreshold)
-    return true;
+  // This is the new cost model returned from loop cache analysis.
+  // A smaller index means the loop should be placed an outer loop, and vice
+  // versa.
+  if (CostMap.find(InnerLoop) != CostMap.end() &&
+      CostMap.find(OuterLoop) != CostMap.end()) {
+    unsigned InnerIndex = 0, OuterIndex = 0;
+    InnerIndex = CostMap.find(InnerLoop)->second;
+    OuterIndex = CostMap.find(OuterLoop)->second;
+    LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex
+                      << ", OuterIndex = " << OuterIndex << "\n");
+    if (InnerIndex < OuterIndex)
+      return true;
+  } else {
+    // Legacy cost model: this is rough cost estimation algorithm. It counts the
+    // good and bad order of induction variables in the instruction and allows
+    // reordering if number of bad orders is more than good.
+    int Cost = getInstrOrderCost();
+    LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
+    if (Cost < -LoopInterchangeCostThreshold)
+      return true;
+  }

  // It is not profitable as per current cache profitability model. But check if
  // we can move this loop outside to improve parallelism.
@ -1160,10 +1194,8 @@ bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId,
    return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
                                    InnerLoop->getStartLoc(),
                                    InnerLoop->getHeader())
-           << "Interchanging loops is too costly (cost="
-           << ore::NV("Cost", Cost) << ", threshold="
-           << ore::NV("Threshold", LoopInterchangeCostThreshold)
-           << ") and it does not improve parallelism.";
+           << "Interchanging loops is too costly and it does not improve "
+              "parallelism.";
  });
  return false;
 }
@ -1709,8 +1741,8 @@ struct LoopInterchangeLegacyPass : public LoopPass {
    auto *DI = &getAnalysis<DependenceAnalysisWrapperPass>().getDI();
    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
-
-    return LoopInterchange(SE, LI, DI, DT, ORE).run(L);
+    std::unique_ptr<CacheCost> CC = nullptr;
+    return LoopInterchange(SE, LI, DI, DT, CC, ORE).run(L);
  }
 };
 } // namespace
@ -1737,8 +1769,10 @@ PreservedAnalyses LoopInterchangePass::run(LoopNest &LN,
  Function &F = *LN.getParent();

  DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI);
+  std::unique_ptr<CacheCost> CC =
+      CacheCost::getCacheCost(LN.getOutermostLoop(), AR, DI);
  OptimizationRemarkEmitter ORE(&F);
-  if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, &ORE).run(LN))
+  if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, CC, &ORE).run(LN))
    return PreservedAnalyses::all();
  return getLoopPassPreservedAnalyses();
 }
--- a/llvm/test/Transforms/LICM/lnicm.ll
+++ b/llvm/test/Transforms/LICM/lnicm.ll
@ -1,12 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -aa-pipeline=basic-aa -passes='loop(loop-interchange)'       -S %s | FileCheck %s --check-prefixes INTC
-; RUN: opt -aa-pipeline=basic-aa -passes='loop-mssa(lnicm),loop(loop-interchange)' -S %s | FileCheck %s --check-prefixes LNICM,CHECK
-; RUN: opt -aa-pipeline=basic-aa -passes='loop-mssa(licm),loop(loop-interchange)'  -S %s | FileCheck %s --check-prefixes LICM,CHECK
+; RUN: opt -aa-pipeline=basic-aa -passes='loop-mssa(lnicm),loop(loop-interchange)' -S %s | FileCheck %s --check-prefixes LNICM
+; RUN: opt -aa-pipeline=basic-aa -passes='loop-mssa(licm),loop(loop-interchange)'  -S %s | FileCheck %s --check-prefixes LICM

 ; This test represents the following function:
-; void test(int x[10][10], int y[10], int *z) {
-;   for (int k = 0; k < 10; k++) {
+; void test(int n, int m, int x[m][n], int y[n], int *z) {
+;   for (int k = 0; k < n; k++) {
 ;     int tmp = *z;
-;     for (int i = 0; i < 10; i++)
+;     for (int i = 0; i < m; i++)
 ;       x[i][k] += y[k] + tmp;
 ;   }
 ; }
@ -15,83 +16,189 @@
 ; to keep perfect loop nest. This enables optimizations that require
 ; perfect loop nest (e.g. loop-interchange) to perform.

+target triple = "powerpc64le-unknown-linux-gnu"

-define dso_local void @test([10 x i32]* noalias %x, i32* noalias readonly %y, i32* readonly %z) {
-; CHECK-LABEL: @test(
-; CHECK-NEXT: entry:
-; CHECK-NEXT:   [[Z:%.*]] = load i32, i32* %z, align 4
-; CHECK-NEXT:   br label [[FOR_BODY3_PREHEADER:%.*]]
-; LNICM:      for.body.preheader:
-; LICM-NOT:   for.body.preheader:
-; INTC-NOT:   for.body.preheader:
-; LNICM-NEXT:   br label [[FOR_BODY:%.*]]
-; CHECK:      for.body:
-; LNICM-NEXT:   [[K:%.*]] = phi i32 [ [[INC10:%.*]], [[FOR_END:%.*]] ], [ 0, [[FOR_BODY_PREHEADER:%.*]] ]
-; LNICM-NEXT:   br label [[FOR_BODY3_SPLIT1:%.*]]
-; LICM:         [[TMP:%.*]] = load i32, i32* [[ARRAYIDX:%.*]], align 4
-; LNICM:      for.body3.preheader:
-; LICM-NOT:   for.body3.preheader:
-; INTC-NOT:   for.body3.preheader:
-; LNICM-NEXT:   br label [[FOR_BODY3:%.*]]
-; CHECK:      for.body3:
-; LNICM-NEXT:   [[I:%.*]] = phi i32 [ [[TMP3:%.*]], [[FOR_BODY3_SPLIT:%.*]] ], [ 0, [[FOR_BODY3_PREHEADER:%.*]] ]
-; LNICM-NEXT:   br label [[FOR_BODY_PREHEADER:%.*]]
-; LNICM:      for.body3.split1:
-; LNICM-NEXT:   [[IDXPROM:%.*]] = sext i32 [[K:%.*]] to i64
-; LNICM-NEXT:   [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* %y, i64 [[IDXPROM:%.*]]
-; LNICM-NEXT:   [[TMP:%.*]] = load i32, i32* [[ARRAYIDX:%.*]], align 4
-; LNICM-NEXT:   [[ADD:%.*]] = add nsw i32 [[TMP:%.*]], [[Z:%.*]]
-; LNICM-NEXT:   [[IDXPROM4:%.*]] = sext i32 [[I:%.*]] to i64
-; LNICM-NEXT:   [[ARRAYIDX5:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* %x, i64 [[IDXPROM4:%.*]]
-; LNICM-NEXT:   [[IDXPROM6:%.*]] = sext i32 [[K:%.*]] to i64
-; LNICM-NEXT:   [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX5:%.*]], i64 0, i64 [[IDXPROM6:%.*]]
-; LNICM-NEXT:   [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX7:%.*]], align 4
-; LNICM-NEXT:   [[ADD8:%.*]] = add nsw i32 [[TMP2:%.*]], [[ADD:%.*]]
-; LNICM-NEXT:   store i32 [[ADD8:%.*]], i32* [[ARRAYIDX7:%.*]], align 4
-; LNICM-NEXT:   [[INC:%.*]] = add nsw i32 [[I:%.*]], 1
-; LNICM-NEXT:   [[CMP2:%.*]] = icmp slt i32 [[INC:%.*]], 10
-; LNICM-NEXT:   br label [[FOR_END:%.*]]
-; LNICM:      for.body3.split:
-; LICM-NOT:   for.body3.split:
-; INTC-NOT:   for.body3.split:
-; LNICM-NEXT:   [[TMP3:%.*]] = add nsw i32 [[I:%.*]], 1
-; LNICM-NEXT:   [[TMP4:%.*]] = icmp slt i32 [[TMP3:%.*]], 10
-; LNICM-NEXT:   br i1 [[TMP4:%.*]], label [[FOR_BODY3:%.*]], label [[FOR_END11:%.*]], !llvm.loop !0
-; LNICM:      for.end:
-; LNICM-NEXT:   [[INC10:%.*]] = add nsw i32 [[K:%.*]], 1
-; LNICM-NEXT:   [[CMP:%.*]] = icmp slt i32 [[INC10:%.*]], 10
-; LNICM-NEXT:   br i1 [[CMP:%.*]], label [[FOR_BODY:%.*]], label [[FOR_BODY3_SPLIT:%.*]], !llvm.loop !2
-; LNICM:      for.end11:
-; LNICM-NEXT:   ret void
+define dso_local void @test(i64 %n, i64 %m, ptr noalias %x, ptr noalias readonly %y, ptr readonly %z) {
+; The loopnest is not interchanged when we only run loop interchange.
+; INTC-LABEL: @test(
+; INTC-NEXT:  gurad:
+; INTC-NEXT:    [[CMP23:%.*]] = icmp sgt i64 [[M:%.*]], 0
+; INTC-NEXT:    [[CMP32:%.*]] = icmp sgt i64 [[N:%.*]], 0
+; INTC-NEXT:    br i1 [[CMP23]], label [[FOR_COND1_PREHEADER_LR_PH:%.*]], label [[FOR_END11:%.*]]
+; INTC:       for.cond1.preheader.lr.ph:
+; INTC-NEXT:    br i1 [[CMP32]], label [[FOR_I_PREHEADER:%.*]], label [[FOR_END11]]
+; INTC:       for.i.preheader:
+; INTC-NEXT:    br label [[ENTRY:%.*]]
+; INTC:       entry:
+; INTC-NEXT:    br label [[FOR_BODY:%.*]]
+; INTC:       for.body:
+; INTC-NEXT:    [[K_02:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC10:%.*]], [[FOR_END:%.*]] ]
+; INTC-NEXT:    [[TMP0:%.*]] = load i32, ptr [[Z:%.*]], align 4
+; INTC-NEXT:    br label [[FOR_BODY3:%.*]]
+; INTC:       for.body3:
+; INTC-NEXT:    [[I_01:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[INC:%.*]], [[FOR_BODY3]] ]
+; INTC-NEXT:    [[IDXPROM:%.*]] = sext i32 [[K_02]] to i64
+; INTC-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[Y:%.*]], i64 [[IDXPROM]]
+; INTC-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; INTC-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
+; INTC-NEXT:    [[IDXPROM4:%.*]] = sext i32 [[I_01]] to i64
+; INTC-NEXT:    [[INDEX0:%.*]] = mul i64 [[IDXPROM4]], [[N]]
+; INTC-NEXT:    [[INDEX1:%.*]] = add i64 [[INDEX0]], [[IDXPROM]]
+; INTC-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i64 [[INDEX1]]
+; INTC-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX7]], align 4
+; INTC-NEXT:    [[ADD8:%.*]] = add nsw i32 [[TMP2]], [[ADD]]
+; INTC-NEXT:    store i32 [[ADD8]], ptr [[ARRAYIDX7]], align 4
+; INTC-NEXT:    [[INC]] = add nsw i32 [[I_01]], 1
+; INTC-NEXT:    [[INC_EXT:%.*]] = sext i32 [[INC]] to i64
+; INTC-NEXT:    [[CMP2:%.*]] = icmp slt i64 [[INC_EXT]], [[M]]
+; INTC-NEXT:    br i1 [[CMP2]], label [[FOR_BODY3]], label [[FOR_END]], !llvm.loop [[LOOP0:![0-9]+]]
+; INTC:       for.end:
+; INTC-NEXT:    [[INC10]] = add nsw i32 [[K_02]], 1
+; INTC-NEXT:    [[INC10_EXT:%.*]] = sext i32 [[INC10]] to i64
+; INTC-NEXT:    [[CMP:%.*]] = icmp slt i64 [[INC10_EXT]], [[N]]
+; INTC-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END11_LOOPEXIT:%.*]], !llvm.loop [[LOOP2:![0-9]+]]
+; INTC:       for.end11.loopexit:
+; INTC-NEXT:    br label [[FOR_END11]]
+; INTC:       for.end11:
+; INTC-NEXT:    ret void
+;
+; The loopnest is interchanged when we run lnicm and loop interchange.
+; LNICM-LABEL: @test(
+; LNICM-NEXT:  gurad:
+; LNICM-NEXT:    [[CMP23:%.*]] = icmp sgt i64 [[M:%.*]], 0
+; LNICM-NEXT:    [[CMP32:%.*]] = icmp sgt i64 [[N:%.*]], 0
+; LNICM-NEXT:    br i1 [[CMP23]], label [[FOR_COND1_PREHEADER_LR_PH:%.*]], label [[FOR_END11:%.*]]
+; LNICM:       for.cond1.preheader.lr.ph:
+; LNICM-NEXT:    br i1 [[CMP32]], label [[FOR_I_PREHEADER:%.*]], label [[FOR_END11]]
+; LNICM:       for.i.preheader:
+; LNICM-NEXT:    br label [[FOR_BODY3_PREHEADER:%.*]]
+; LNICM:       entry:
+; LNICM-NEXT:    br label [[FOR_BODY:%.*]]
+; LNICM:       for.body:
+; LNICM-NEXT:    [[K_02:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC10:%.*]], [[FOR_END:%.*]] ]
+; LNICM-NEXT:    br label [[FOR_BODY3_SPLIT1:%.*]]
+; LNICM:       for.body3.preheader:
+; LNICM-NEXT:    [[TMP0:%.*]] = load i32, ptr [[Z:%.*]], align 4
+; LNICM-NEXT:    br label [[FOR_BODY3:%.*]]
+; LNICM:       for.body3:
+; LNICM-NEXT:    [[I_01:%.*]] = phi i32 [ [[TMP3:%.*]], [[FOR_BODY3_SPLIT:%.*]] ], [ 0, [[FOR_BODY3_PREHEADER]] ]
+; LNICM-NEXT:    br label [[ENTRY]]
+; LNICM:       for.body3.split1:
+; LNICM-NEXT:    [[IDXPROM:%.*]] = sext i32 [[K_02]] to i64
+; LNICM-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[Y:%.*]], i64 [[IDXPROM]]
+; LNICM-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; LNICM-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
+; LNICM-NEXT:    [[IDXPROM4:%.*]] = sext i32 [[I_01]] to i64
+; LNICM-NEXT:    [[INDEX0:%.*]] = mul i64 [[IDXPROM4]], [[N]]
+; LNICM-NEXT:    [[INDEX1:%.*]] = add i64 [[INDEX0]], [[IDXPROM]]
+; LNICM-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i64 [[INDEX1]]
+; LNICM-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX7]], align 4
+; LNICM-NEXT:    [[ADD8:%.*]] = add nsw i32 [[TMP2]], [[ADD]]
+; LNICM-NEXT:    store i32 [[ADD8]], ptr [[ARRAYIDX7]], align 4
+; LNICM-NEXT:    [[INC:%.*]] = add nsw i32 [[I_01]], 1
+; LNICM-NEXT:    [[INC_EXT:%.*]] = sext i32 [[INC]] to i64
+; LNICM-NEXT:    [[CMP2:%.*]] = icmp slt i64 [[INC_EXT]], [[M]]
+; LNICM-NEXT:    br label [[FOR_END]]
+; LNICM:       for.body3.split:
+; LNICM-NEXT:    [[TMP3]] = add nsw i32 [[I_01]], 1
+; LNICM-NEXT:    [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
+; LNICM-NEXT:    [[TMP5:%.*]] = icmp slt i64 [[TMP4]], [[M]]
+; LNICM-NEXT:    br i1 [[TMP5]], label [[FOR_BODY3]], label [[FOR_END11_LOOPEXIT:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; LNICM:       for.end:
+; LNICM-NEXT:    [[INC10]] = add nsw i32 [[K_02]], 1
+; LNICM-NEXT:    [[INC10_EXT:%.*]] = sext i32 [[INC10]] to i64
+; LNICM-NEXT:    [[CMP:%.*]] = icmp slt i64 [[INC10_EXT]], [[N]]
+; LNICM-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_BODY3_SPLIT]], !llvm.loop [[LOOP2:![0-9]+]]
+; LNICM:       for.end11.loopexit:
+; LNICM-NEXT:    br label [[FOR_END11]]
+; LNICM:       for.end11:
+; LNICM-NEXT:    ret void
+;
+; The loopnest is not interchanged when we run licm and loop interchange.
+; LICM-LABEL: @test(
+; LICM-NEXT:  gurad:
+; LICM-NEXT:    [[CMP23:%.*]] = icmp sgt i64 [[M:%.*]], 0
+; LICM-NEXT:    [[CMP32:%.*]] = icmp sgt i64 [[N:%.*]], 0
+; LICM-NEXT:    br i1 [[CMP23]], label [[FOR_COND1_PREHEADER_LR_PH:%.*]], label [[FOR_END11:%.*]]
+; LICM:       for.cond1.preheader.lr.ph:
+; LICM-NEXT:    br i1 [[CMP32]], label [[FOR_I_PREHEADER:%.*]], label [[FOR_END11]]
+; LICM:       for.i.preheader:
+; LICM-NEXT:    br label [[ENTRY:%.*]]
+; LICM:       entry:
+; LICM-NEXT:    [[TMP0:%.*]] = load i32, ptr [[Z:%.*]], align 4
+; LICM-NEXT:    br label [[FOR_BODY:%.*]]
+; LICM:       for.body:
+; LICM-NEXT:    [[K_02:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC10:%.*]], [[FOR_END:%.*]] ]
+; LICM-NEXT:    [[IDXPROM:%.*]] = sext i32 [[K_02]] to i64
+; LICM-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[Y:%.*]], i64 [[IDXPROM]]
+; LICM-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; LICM-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
+; LICM-NEXT:    br label [[FOR_BODY3:%.*]]
+; LICM:       for.body3:
+; LICM-NEXT:    [[I_01:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[INC:%.*]], [[FOR_BODY3]] ]
+; LICM-NEXT:    [[IDXPROM4:%.*]] = sext i32 [[I_01]] to i64
+; LICM-NEXT:    [[INDEX0:%.*]] = mul i64 [[IDXPROM4]], [[N]]
+; LICM-NEXT:    [[INDEX1:%.*]] = add i64 [[INDEX0]], [[IDXPROM]]
+; LICM-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i64 [[INDEX1]]
+; LICM-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX7]], align 4
+; LICM-NEXT:    [[ADD8:%.*]] = add nsw i32 [[TMP2]], [[ADD]]
+; LICM-NEXT:    store i32 [[ADD8]], ptr [[ARRAYIDX7]], align 4
+; LICM-NEXT:    [[INC]] = add nsw i32 [[I_01]], 1
+; LICM-NEXT:    [[INC_EXT:%.*]] = sext i32 [[INC]] to i64
+; LICM-NEXT:    [[CMP2:%.*]] = icmp slt i64 [[INC_EXT]], [[M]]
+; LICM-NEXT:    br i1 [[CMP2]], label [[FOR_BODY3]], label [[FOR_END]], !llvm.loop [[LOOP0:![0-9]+]]
+; LICM:       for.end:
+; LICM-NEXT:    [[INC10]] = add nsw i32 [[K_02]], 1
+; LICM-NEXT:    [[INC10_EXT:%.*]] = sext i32 [[INC10]] to i64
+; LICM-NEXT:    [[CMP:%.*]] = icmp slt i64 [[INC10_EXT]], [[N]]
+; LICM-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END11_LOOPEXIT:%.*]], !llvm.loop [[LOOP2:![0-9]+]]
+; LICM:       for.end11.loopexit:
+; LICM-NEXT:    br label [[FOR_END11]]
+; LICM:       for.end11:
+; LICM-NEXT:    ret void
+;

-entry:
+gurad:
+  %cmp23 = icmp sgt i64 %m, 0
+  %cmp32 = icmp sgt i64 %n, 0
+  br i1 %cmp23, label %for.cond1.preheader.lr.ph, label %for.end11
+
+for.cond1.preheader.lr.ph:                        ; preds = %gurad
+  br i1 %cmp32, label %for.i.preheader, label %for.end11
+
+for.i.preheader:                                  ; preds = %for.cond1.preheader.lr.ph
+  br label %entry
+
+entry:                                  ; preds = %for.i.preheader
  br label %for.body

 for.body:
  %k.02 = phi i32 [ 0, %entry ], [ %inc10, %for.end ]
-  %0 = load i32, i32* %z, align 4
+  %0 = load i32, ptr %z, align 4
  br label %for.body3

 for.body3:
  %i.01 = phi i32 [ 0, %for.body ], [ %inc, %for.body3 ]
  %idxprom = sext i32 %k.02 to i64
-  %arrayidx = getelementptr inbounds i32, i32* %y, i64 %idxprom
-  %1 = load i32, i32* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %y, i64 %idxprom
+  %1 = load i32, ptr %arrayidx, align 4
  %add = add nsw i32 %1, %0
  %idxprom4 = sext i32 %i.01 to i64
-  %arrayidx5 = getelementptr inbounds [10 x i32], [10 x i32]* %x, i64 %idxprom4
-  %idxprom6 = sext i32 %k.02 to i64
-  %arrayidx7 = getelementptr inbounds [10 x i32], [10 x i32]* %arrayidx5, i64 0, i64 %idxprom6
-  %2 = load i32, i32* %arrayidx7, align 4
+  %index0 = mul i64 %idxprom4, %n
+  %index1 = add i64 %index0, %idxprom
+  %arrayidx7 = getelementptr inbounds i32, ptr %x, i64 %index1
+  %2 = load i32, ptr %arrayidx7, align 4
  %add8 = add nsw i32 %2, %add
-  store i32 %add8, i32* %arrayidx7, align 4
+  store i32 %add8, ptr %arrayidx7, align 4
  %inc = add nsw i32 %i.01, 1
-  %cmp2 = icmp slt i32 %inc, 10
+  %inc.ext = sext i32 %inc to i64
+  %cmp2 = icmp slt i64 %inc.ext, %m
  br i1 %cmp2, label %for.body3, label %for.end, !llvm.loop !0

 for.end:
  %inc10 = add nsw i32 %k.02, 1
-  %cmp = icmp slt i32 %inc10, 10
+  %inc10.ext = sext i32 %inc10 to i64
+  %cmp = icmp slt i64 %inc10.ext, %n
  br i1 %cmp, label %for.body, label %for.end11, !llvm.loop !2

 for.end11:
--- a/llvm/test/Transforms/LoopInterchange/call-instructions.ll
+++ b/llvm/test/Transforms/LoopInterchange/call-instructions.ll
@ -4,7 +4,7 @@
 ; RUN: FileCheck --input-file=%t %s

 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
+target triple = "powerpc64le-unknown-linux-gnu"

@A = common global [100 x [100 x i32]] zeroinitializer

--- a/llvm/test/Transforms/LoopInterchange/currentLimitation.ll
+++ b/llvm/test/Transforms/LoopInterchange/currentLimitation.ll
@ -8,7 +8,7 @@
 ; RUN: FileCheck --check-prefix=DELIN --input-file=%t %s

 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
+target triple = "powerpc64le-unknown-linux-gnu"
 
@A = common global [100 x [100 x i32]] zeroinitializer
@B = common global [100 x [100 x [100 x i32]]] zeroinitializer
--- a/llvm/test/Transforms/LoopInterchange/debuginfo.ll
+++ b/llvm/test/Transforms/LoopInterchange/debuginfo.ll
@ -4,7 +4,7 @@


 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
+target triple = "powerpc64le-unknown-linux-gnu"

@A = common global [100 x [100 x i64]] zeroinitializer

--- a/llvm/test/Transforms/LoopInterchange/inner-indvar-depend-on-outer-indvar.ll
+++ b/llvm/test/Transforms/LoopInterchange/inner-indvar-depend-on-outer-indvar.ll
@ -2,6 +2,7 @@
 ; RUN: opt < %s -basic-aa -loop-interchange -verify-dom-info -verify-loop-info \
 ; RUN:     -S -debug 2>&1 | FileCheck %s

+target triple = "powerpc64le-unknown-linux-gnu"
@A = common global [100 x [100 x i64]] zeroinitializer
@N = dso_local local_unnamed_addr global i64 100, align 8

--- a/llvm/test/Transforms/LoopInterchange/inner-only-reductions.ll
+++ b/llvm/test/Transforms/LoopInterchange/inner-only-reductions.ll
@ -5,6 +5,7 @@
 ; Inner loop only reductions are not supported currently. See discussion at
 ; D53027 for more information on the required checks.

+target triple = "powerpc64le-unknown-linux-gnu"
@A = common global [500 x [500 x i32]] zeroinitializer
@X = common global i32 0
@B = common global [500 x [500 x i32]] zeroinitializer
--- a/llvm/test/Transforms/LoopInterchange/innermost-latch-uses-values-in-middle-header.ll
+++ b/llvm/test/Transforms/LoopInterchange/innermost-latch-uses-values-in-middle-header.ll
@ -2,6 +2,7 @@
 ; RUN: opt < %s -basic-aa -loop-interchange -verify-dom-info -verify-loop-info \
 ; RUN:     -S -debug 2>&1 | FileCheck %s

+target triple = "powerpc64le-unknown-linux-gnu"
@a = common global i32 0, align 4
@d = common dso_local local_unnamed_addr global [1 x [6 x i32]] zeroinitializer, align 4

--- a/llvm/test/Transforms/LoopInterchange/interchange-flow-dep-outer.ll
+++ b/llvm/test/Transforms/LoopInterchange/interchange-flow-dep-outer.ll
@ -3,7 +3,7 @@
 ; RUN:     -S -debug 2>&1 | FileCheck %s

 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
+target triple = "powerpc64le-unknown-linux-gnu"

@A = common global [100 x [100 x i32]] zeroinitializer
@B = common global [100 x i32] zeroinitializer
--- a/llvm/test/Transforms/LoopInterchange/interchange-insts-between-indvar.ll
+++ b/llvm/test/Transforms/LoopInterchange/interchange-insts-between-indvar.ll
@ -1,6 +1,7 @@
 ; RUN: opt < %s -basic-aa -loop-interchange -verify-dom-info -verify-loop-info \
 ; RUN:     -S -pass-remarks=loop-interchange 2>&1 | FileCheck %s

+target triple = "powerpc64le-unknown-linux-gnu"
@A10 = local_unnamed_addr global [3 x [3 x i32]] zeroinitializer, align 16

 ;; Test to make sure we can handle zext instructions introduced by
--- a/llvm/test/Transforms/LoopInterchange/interchange-no-deps.ll
+++ b/llvm/test/Transforms/LoopInterchange/interchange-no-deps.ll
@ -4,6 +4,7 @@
 ; RUN:     | FileCheck -check-prefix=STATS %s
 ; RUN: FileCheck -input-file %t %s

+target triple = "powerpc64le-unknown-linux-gnu"

 ; no_deps_interchange just accesses a single nested array and can be interchange.
 ; CHECK:      Name:       Interchanged
@ -34,35 +35,6 @@ exit:                                 ; preds = %for1.inc

 }

-; Only the inner loop induction variable is used for memory accesses.
-; Interchanging is not beneficial.
-; CHECK:      Name:       InterchangeNotProfitable
-; CHECK-NEXT: Function:   no_bad_order
-define i32 @no_bad_order(i32* %Arr) {
-entry:
-  br label %for1.header
-
-for1.header:                                         ; preds = %entry, %for1.inc
-  %indvars.iv19 = phi i64 [ 0, %entry ], [ %indvars.iv.next20, %for1.inc ]
-  br label %for2
-
-for2:                                        ; preds = %for1.header, %for2
-  %indvars.iv = phi i64 [ 0, %for1.header ], [ %indvars.iv.next, %for2 ]
-  %arrayidx6 = getelementptr inbounds i32, i32* %Arr, i64 %indvars.iv
-  store i32 0, i32* %arrayidx6, align 4
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond = icmp ne i64 %indvars.iv.next, 1024
-  br i1 %exitcond, label %for2, label %for1.inc
-
-for1.inc:
-  %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1
-  %exitcond21 = icmp ne i64 %indvars.iv.next20, 1024
-  br i1 %exitcond21, label %for1.header, label %exit
-
-exit:                                 ; preds = %for1.inc
-  ret i32 0
-}
-
 ; No memory access using any induction variables, interchanging not beneficial.
 ; CHECK:      Name:        InterchangeNotProfitable
 ; CHECK-NEXT: Function:    no_mem_instrs
--- a/llvm/test/Transforms/LoopInterchange/interchangeable-innerloop-multiple-indvars.ll
+++ b/llvm/test/Transforms/LoopInterchange/interchangeable-innerloop-multiple-indvars.ll
@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -basic-aa -loop-interchange -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -S | FileCheck %s

+target triple = "powerpc64le-unknown-linux-gnu"
@b = common dso_local local_unnamed_addr global [200 x [200 x i32]] zeroinitializer, align 4
@a = common dso_local local_unnamed_addr global i32 0, align 4

--- a/llvm/test/Transforms/LoopInterchange/interchangeable-outerloop-multiple-indvars.ll
+++ b/llvm/test/Transforms/LoopInterchange/interchangeable-outerloop-multiple-indvars.ll
@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s --basic-aa -loop-interchange -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -S | FileCheck %s
+; RUN: opt < %s -basic-aa -loop-interchange -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -S | FileCheck %s

+target triple = "powerpc64le-unknown-linux-gnu"
@b = constant [200 x [100 x i32]] zeroinitializer, align 4
@a = constant i32 0, align 4

--- a/llvm/test/Transforms/LoopInterchange/interchangeable.ll
+++ b/llvm/test/Transforms/LoopInterchange/interchangeable.ll
@ -3,7 +3,7 @@
 ; RUN: opt < %s -aa-pipeline=basic-aa -passes=loop-interchange -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -S | FileCheck %s

 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
+target triple = "powerpc64le-unknown-linux-gnu"

@A = common global [100 x [100 x i64]] zeroinitializer
@B = common global [100 x i64] zeroinitializer
--- a/llvm/test/Transforms/LoopInterchange/interchanged-loop-nest-3.ll
+++ b/llvm/test/Transforms/LoopInterchange/interchanged-loop-nest-3.ll
@ -3,7 +3,7 @@
 ; RUN:     -S -debug 2>&1 | FileCheck %s

 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
+target triple = "powerpc64le-unknown-linux-gnu"

@D = common global [100 x [100 x [100 x i32]]] zeroinitializer

@ -24,31 +24,31 @@ entry:
  br label %for.cond1.preheader

 for.cond1.preheader:                              ; preds = %for.inc15, %entry
-  %i.028 = phi i32 [ 0, %entry ], [ %inc16, %for.inc15 ]
+  %i.028 = phi i64 [ 0, %entry ], [ %inc16, %for.inc15 ]
  br label %for.cond4.preheader

 for.cond4.preheader:                              ; preds = %for.inc12, %for.cond1.preheader
-  %j.027 = phi i32 [ 0, %for.cond1.preheader ], [ %inc13, %for.inc12 ]
+  %j.027 = phi i64 [ 0, %for.cond1.preheader ], [ %inc13, %for.inc12 ]
  br label %for.body6

 for.body6:                                        ; preds = %for.body6, %for.cond4.preheader
-  %k.026 = phi i32 [ 0, %for.cond4.preheader ], [ %inc, %for.body6 ]
-  %arrayidx8 = getelementptr inbounds [100 x [100 x [100 x i32]]], [100 x [100 x [100 x i32]]]* @D, i32 0, i32 %k.026, i32 %j.027, i32 %i.028
+  %k.026 = phi i64 [ 0, %for.cond4.preheader ], [ %inc, %for.body6 ]
+  %arrayidx8 = getelementptr inbounds [100 x [100 x [100 x i32]]], [100 x [100 x [100 x i32]]]* @D, i64 0, i64 %k.026, i64 %j.027, i64 %i.028
  %0 = load i32, i32* %arrayidx8
  %add = add nsw i32 %0, %t
  store i32 %add, i32* %arrayidx8
-  %inc = add nuw nsw i32 %k.026, 1
-  %exitcond = icmp eq i32 %inc, 100
+  %inc = add nuw nsw i64 %k.026, 1
+  %exitcond = icmp eq i64 %inc, 100
  br i1 %exitcond, label %for.inc12, label %for.body6

 for.inc12:                                        ; preds = %for.body6
-  %inc13 = add nuw nsw i32 %j.027, 1
-  %exitcond29 = icmp eq i32 %inc13, 100
+  %inc13 = add nuw nsw i64 %j.027, 1
+  %exitcond29 = icmp eq i64 %inc13, 100
  br i1 %exitcond29, label %for.inc15, label %for.cond4.preheader

 for.inc15:                                        ; preds = %for.inc12
-  %inc16 = add nuw nsw i32 %i.028, 1
-  %exitcond30 = icmp eq i32 %inc16, 100
+  %inc16 = add nuw nsw i64 %i.028, 1
+  %exitcond30 = icmp eq i64 %inc16, 100
  br i1 %exitcond30, label %for.end17, label %for.cond1.preheader

 for.end17:                                        ; preds = %for.inc15
--- a/llvm/test/Transforms/LoopInterchange/lcssa-preheader.ll
+++ b/llvm/test/Transforms/LoopInterchange/lcssa-preheader.ll
@ -3,6 +3,7 @@
 ; RUN: opt < %s -basic-aa -loop-interchange -da-disable-delinearization-checks -pass-remarks-missed='loop-interchange' -verify-loop-lcssa -S | FileCheck -check-prefix=CHECK-DELIN %s

 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "powerpc64le-unknown-linux-gnu"

 ; void foo(int n, int m) {
 ;   int temp[16][16];
--- a/llvm/test/Transforms/LoopInterchange/lcssa.ll
+++ b/llvm/test/Transforms/LoopInterchange/lcssa.ll
@ -2,7 +2,7 @@
 ; RUN: FileCheck --input-file %t --check-prefix REMARK %s

 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
+target triple = "powerpc64le-unknown-linux-gnu"

@A = common global [100 x [100 x i32]] zeroinitializer
@C = common global [100 x [100 x i32]] zeroinitializer
--- a/llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll
+++ b/llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll
@ -10,6 +10,7 @@
 ; RUN:     -pass-remarks='loop-interchange' -S -da-disable-delinearization-checks
 ; RUN: cat %t |  FileCheck --check-prefix=DELIN %s

+target triple = "powerpc64le-unknown-linux-gnu"
@A = common global [100 x [100 x i32]] zeroinitializer
@B = common global [100 x [100 x i32]] zeroinitializer
@C = common global [100 x i32] zeroinitializer
@ -71,11 +72,7 @@ for.end19:
 ; DELIN-NEXT: Name:            InterchangeNotProfitable
 ; DELIN-NEXT: Function:        test01
 ; DELIN-NEXT: Args:
-; DELIN-NEXT:   - String:          'Interchanging loops is too costly (cost='
-; DELIN-NEXT:   - Cost:            '2'
-; DELIN-NEXT:   - String:          ', threshold='
-; DELIN-NEXT:   - Threshold:       '0'
-; DELIN-NEXT:   - String:          ') and it does not improve parallelism.'
+; DELIN-NEXT:   - String:          Interchanging loops is too costly and it does not improve parallelism.
 ; DELIN-NEXT: ...

 ;;--------------------------------------Test case 02------------------------------------
--- a/llvm/test/Transforms/LoopInterchange/not-interchanged-dependencies-1.ll
+++ b/llvm/test/Transforms/LoopInterchange/not-interchanged-dependencies-1.ll
@ -3,7 +3,7 @@
 ; RUN:     -S -debug 2>&1 | FileCheck %s

 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
+target triple = "powerpc64le-unknown-linux-gnu"

@A = common global [100 x [100 x i32]] zeroinitializer
@B = common global [100 x i32] zeroinitializer
--- a/llvm/test/Transforms/LoopInterchange/not-interchanged-loop-nest-3.ll
+++ b/llvm/test/Transforms/LoopInterchange/not-interchanged-loop-nest-3.ll
@ -3,7 +3,7 @@
 ; RUN:     -S -debug 2>&1 | FileCheck %s

 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
+target triple = "powerpc64le-unknown-linux-gnu"

@D = common global [100 x [100 x [100 x i32]]] zeroinitializer

@ -24,31 +24,31 @@ entry:
  br label %for.cond1.preheader

 for.cond1.preheader:                              ; preds = %for.inc15, %entry
-  %i.028 = phi i32 [ 0, %entry ], [ %inc16, %for.inc15 ]
+  %i.028 = phi i64 [ 0, %entry ], [ %inc16, %for.inc15 ]
  br label %for.cond4.preheader

 for.cond4.preheader:                              ; preds = %for.inc12, %for.cond1.preheader
-  %j.027 = phi i32 [ 0, %for.cond1.preheader ], [ %inc13, %for.inc12 ]
+  %j.027 = phi i64 [ 0, %for.cond1.preheader ], [ %inc13, %for.inc12 ]
  br label %for.body6

 for.body6:                                        ; preds = %for.body6, %for.cond4.preheader
-  %k.026 = phi i32 [ 0, %for.cond4.preheader ], [ %inc, %for.body6 ]
-  %arrayidx8 = getelementptr inbounds [100 x [100 x [100 x i32]]], [100 x [100 x [100 x i32]]]* @D, i32 0, i32 %i.028, i32 %k.026, i32 %j.027
+  %k.026 = phi i64 [ 0, %for.cond4.preheader ], [ %inc, %for.body6 ]
+  %arrayidx8 = getelementptr inbounds [100 x [100 x [100 x i32]]], [100 x [100 x [100 x i32]]]* @D, i32 0, i64 %i.028, i64 %k.026, i64 %j.027
  %0 = load i32, i32* %arrayidx8
  %add = add nsw i32 %0, %t
  store i32 %add, i32* %arrayidx8
-  %inc = add nuw nsw i32 %k.026, 1
-  %exitcond = icmp eq i32 %inc, 100
+  %inc = add nuw nsw i64 %k.026, 1
+  %exitcond = icmp eq i64 %inc, 100
  br i1 %exitcond, label %for.inc12, label %for.body6

 for.inc12:                                        ; preds = %for.body6
-  %inc13 = add nuw nsw i32 %j.027, 1
-  %exitcond29 = icmp eq i32 %inc13, 100
+  %inc13 = add nuw nsw i64 %j.027, 1
+  %exitcond29 = icmp eq i64 %inc13, 100
  br i1 %exitcond29, label %for.inc15, label %for.cond4.preheader

 for.inc15:                                        ; preds = %for.inc12
-  %inc16 = add nuw nsw i32 %i.028, 1
-  %exitcond30 = icmp eq i32 %inc16, 100
+  %inc16 = add nuw nsw i64 %i.028, 1
+  %exitcond30 = icmp eq i64 %inc16, 100
  br i1 %exitcond30, label %for.end17, label %for.cond1.preheader

 for.end17:                                        ; preds = %for.inc15
--- a/llvm/test/Transforms/LoopInterchange/not-interchanged-tightly-nested.ll
+++ b/llvm/test/Transforms/LoopInterchange/not-interchanged-tightly-nested.ll
@ -3,7 +3,7 @@
 ; RUN:     -S -debug 2>&1 | FileCheck %s

 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
+target triple = "powerpc64le-unknown-linux-gnu"

@A = common global [100 x [100 x i32]] zeroinitializer
@B = common global [100 x i32] zeroinitializer
@ -108,13 +108,13 @@ for.end12:
 ;; The outer loop header does not branch to the inner loop preheader, or the
 ;; inner loop header, or the outer loop latch.
 ; CHECK: Not interchanging loops. Cannot prove legality.
-define void @interchange_07(i32 %k, i32 %N, i32 %ny) {
+define void @interchange_07(i32 %k, i32 %N, i64 %ny) {
 entry:
  br label %for1.header

 for1.header:
-  %j23 = phi i32 [ 0, %entry ], [ %j.next24, %for1.inc10 ]
-  %cmp21 = icmp slt i32 0, %ny
+  %j23 = phi i64 [ 0, %entry ], [ %j.next24, %for1.inc10 ]
+  %cmp21 = icmp slt i64 0, %ny
  br label %singleSucc

 singleSucc:
@ -124,18 +124,18 @@ preheader.j:
  br label %for2

 for2:
-  %j = phi i32 [ %j.next, %for2 ], [ 0, %preheader.j ]
-  %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i32 0, i32 %j, i32 %j23
+  %j = phi i64 [ %j.next, %for2 ], [ 0, %preheader.j ]
+  %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %j, i64 %j23
  %lv = load i32, i32* %arrayidx5
  %add = add nsw i32 %lv, %k
  store i32 %add, i32* %arrayidx5
-  %j.next = add nuw nsw i32 %j, 1
-  %exitcond = icmp eq i32 %j, 99
+  %j.next = add nuw nsw i64 %j, 1
+  %exitcond = icmp eq i64 %j, 99
  br i1 %exitcond, label %for1.inc10, label %for2

 for1.inc10:
-  %j.next24 = add nuw nsw i32 %j23, 1
-  %exitcond26 = icmp eq i32 %j23, 99
+  %j.next24 = add nuw nsw i64 %j23, 1
+  %exitcond26 = icmp eq i64 %j23, 99
  br i1 %exitcond26, label %for.end12, label %for1.header

 for.end12:
--- a/llvm/test/Transforms/LoopInterchange/outer-header-jump-to-inner-latch.ll
+++ b/llvm/test/Transforms/LoopInterchange/outer-header-jump-to-inner-latch.ll
@ -1,5 +1,6 @@
 ; RUN: opt -basic-aa -loop-interchange -verify-dom-info -verify-loop-info -verify-loop-lcssa -S %s | FileCheck %s

+target triple = "powerpc64le-unknown-linux-gnu"
@b = global [3 x [5 x [8 x i16]]] [[5 x [8 x i16]] zeroinitializer, [5 x [8 x i16]] [[8 x i16] zeroinitializer, [8 x i16] [i16 0, i16 0, i16 0, i16 6, i16 1, i16 6, i16 0, i16 0], [8 x i16] zeroinitializer, [8 x i16] zeroinitializer, [8 x i16] zeroinitializer], [5 x [8 x i16]] zeroinitializer], align 2
@a = common global i32 0, align 4
@d = common dso_local local_unnamed_addr global [1 x [6 x i32]] zeroinitializer, align 4
--- a/llvm/test/Transforms/LoopInterchange/outer-only-reductions.ll
+++ b/llvm/test/Transforms/LoopInterchange/outer-only-reductions.ll
@ -4,6 +4,7 @@

 ; Outer loop only reductions are not supported currently.

+target triple = "powerpc64le-unknown-linux-gnu"
@A = common global [500 x [500 x i32]] zeroinitializer

 ;; global X
--- a/llvm/test/Transforms/LoopInterchange/perserve-lcssa.ll
+++ b/llvm/test/Transforms/LoopInterchange/perserve-lcssa.ll
@ -3,6 +3,7 @@
 ; Test case for PR41725. The induction variables in the latches escape the
 ; loops and we must move some PHIs around.

+target triple = "powerpc64le-unknown-linux-gnu"
@a = common dso_local global i64 0, align 4
@b = common dso_local global i64 0, align 4
@c = common dso_local global [10 x [10 x i32 ]] zeroinitializer, align 16
@ -157,7 +158,7 @@ exit:                                             ; preds = %outer.latch
 ; Make sure we do not crash for loops without reachable exits.
 define void @no_reachable_exits() {
 ; Check we interchanged.
-; CHECK-LABEL: @no_reachable_exits() {
+; CHECK-LABEL: @no_reachable_exits()
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    br label %inner.ph
 ; CHECK-LABEL: outer.ph:
--- a/llvm/test/Transforms/LoopInterchange/pr43176-move-to-new-latch.ll
+++ b/llvm/test/Transforms/LoopInterchange/pr43176-move-to-new-latch.ll
@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-interchange -verify-loop-lcssa -verify-dom-info -S %s | FileCheck %s

+target triple = "powerpc64le-unknown-linux-gnu"
@b = external dso_local global [5 x i32], align 16

 define void @test1() {
--- a/llvm/test/Transforms/LoopInterchange/pr43326-ideal-access-pattern.ll
+++ b/llvm/test/Transforms/LoopInterchange/pr43326-ideal-access-pattern.ll
@ -14,6 +14,8 @@
 ;   }
 ; }

+target triple = "powerpc64le-unknown-linux-gnu"
+
 ; REMARKS: --- !Passed
 ; REMARKS-NEXT: Pass:            loop-interchange
 ; REMARKS-NEXT: Name:            Interchanged
--- a/llvm/test/Transforms/LoopInterchange/pr43326.ll
+++ b/llvm/test/Transforms/LoopInterchange/pr43326.ll
@ -2,6 +2,7 @@
 ; RUN:     -verify-dom-info -verify-loop-info -verify-loop-lcssa -stats 2>&1
 ; RUN: FileCheck --input-file=%t --check-prefix=REMARKS %s

+target triple = "powerpc64le-unknown-linux-gnu"
@a = global i32 0
@b = global i8 0
@c = global i32 0
--- a/llvm/test/Transforms/LoopInterchange/pr43473-invalid-lcssa-phis-in-inner-exit.ll
+++ b/llvm/test/Transforms/LoopInterchange/pr43473-invalid-lcssa-phis-in-inner-exit.ll
@ -6,6 +6,8 @@
 ; In the 2 test cases below, we have a LCSSA PHI in the inner loop exit, which
 ; is used in the outer loop latch. This is not supported.

+target triple = "powerpc64le-unknown-linux-gnu"
+
 define void @test1() {
 ; CHECK-LABEL: @test1(
 ; CHECK-NEXT:  entry:
--- a/llvm/test/Transforms/LoopInterchange/pr43797-lcssa-for-multiple-outer-loop-blocks.ll
+++ b/llvm/test/Transforms/LoopInterchange/pr43797-lcssa-for-multiple-outer-loop-blocks.ll
@ -3,6 +3,7 @@

 ; Tests for PR43797.

+target triple = "powerpc64le-unknown-linux-gnu"
@wdtdr = external dso_local global [5 x [5 x double]], align 16

 define void @test1() {
--- a/llvm/test/Transforms/LoopInterchange/pr45743-move-from-inner-preheader.ll
+++ b/llvm/test/Transforms/LoopInterchange/pr45743-move-from-inner-preheader.ll
@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-interchange -S %s | FileCheck %s

+target triple = "powerpc64le-unknown-linux-gnu"
@global = external local_unnamed_addr global [400 x [400 x i32]], align 16

 ; We need to move %tmp4 from the inner loop pre header to the outer loop header
--- a/llvm/test/Transforms/LoopInterchange/pr48212.ll
+++ b/llvm/test/Transforms/LoopInterchange/pr48212.ll
@ -2,6 +2,8 @@
 ; RUN:     -verify-dom-info -verify-loop-info -verify-loop-lcssa 2>&1
 ; RUN: FileCheck --input-file=%t --check-prefix=REMARKS %s

+target triple = "powerpc64le-unknown-linux-gnu"
+
 ; REMARKS: --- !Passed
 ; REMARKS-NEXT: Pass:            loop-interchange
 ; REMARKS-NEXT: Name:            Interchanged
--- a/llvm/test/Transforms/LoopInterchange/profitability.ll
+++ b/llvm/test/Transforms/LoopInterchange/profitability.ll
@ -5,7 +5,7 @@
 ;; We test profitability model in these test cases.

 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
+target triple = "powerpc64le-unknown-linux-gnu"

@A = common global [100 x [100 x i32]] zeroinitializer
@B = common global [100 x [100 x i32]] zeroinitializer
--- a/llvm/test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll
+++ b/llvm/test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll
@ -4,7 +4,7 @@


 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
+target triple = "powerpc64le-unknown-linux-gnu"

 ; REMARKS: --- !Passed
 ; REMARKS-NEXT: Pass:            loop-interchange
--- a/llvm/test/Transforms/LoopInterchange/update-condbranch-duplicate-successors.ll
+++ b/llvm/test/Transforms/LoopInterchange/update-condbranch-duplicate-successors.ll
@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-interchange -S %s | FileCheck %s

-
+target triple = "powerpc64le-unknown-linux-gnu"
@global = external dso_local global [1000 x [1000 x i32]], align 16

 ; Test that we support updating conditional branches where both targets are the same
--- a/llvm/test/Transforms/LoopInterchange/vector-gep-operand.ll
+++ b/llvm/test/Transforms/LoopInterchange/vector-gep-operand.ll
@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-interchange -loop-interchange-threshold=-10 -S %s | FileCheck %s

+target triple = "powerpc64le-unknown-linux-gnu"
+
 ; The test contains a GEP with an operand that is not SCEV-able. Make sure
 ; loop-interchange does not crash.
 define void @test([256 x float]* noalias %src, float* %dst) {