From 006334470d8d1b5d8f630890336fcb45795749d1 Mon Sep 17 00:00:00 2001 From: Congzhe Cao Date: Thu, 2 Jun 2022 17:53:13 -0400 Subject: [PATCH] [LoopInterchange] New cost model for loop interchange This patch proposes to use a new cost model for loop interchange, which is obtained from loop cache analysis. Given a loopnest, what loop cache analysis returns is a vector of loops [loop0, loop1, loop2, ...] where loop0 should be placed as the outermost loop, loop1 should be placed one more level inside, and loop2 one more level inside, etc. What loop cache analysis does is not only more comprehensive than the current cost model, it is also a "one-shot" query which means that we only need to query it once during the entire loop interchange pass, which is better than the current cost model where we query it every time we check whether it is profitable to interchange two loops. Thus complexity is reduced, especially after D120386 where we do more interchanges to get the globally optimal loop access pattern. Updates made to test cases are mostly minor changes and some corrections. Test coverage for loop interchange is not reduced. Currently we do not completely remove the legacy cost model, but keep it as a fallback in case the new cost model does not run successfully. This is because currently we have some limitations in delinearization, which sometimes makes loop cache analysis bail out. The longer term goal is to enhance delinearization and eventually remove the legacy cost model completely. 
Reviewed By: bmahjour, #loopoptwg Differential Revision: https://reviews.llvm.org/D124926 --- .../lib/Transforms/Scalar/LoopInterchange.cpp | 90 ++++--- llvm/test/Transforms/LICM/lnicm.ll | 233 +++++++++++++----- .../LoopInterchange/call-instructions.ll | 2 +- .../LoopInterchange/currentLimitation.ll | 2 +- .../Transforms/LoopInterchange/debuginfo.ll | 2 +- .../inner-indvar-depend-on-outer-indvar.ll | 1 + .../LoopInterchange/inner-only-reductions.ll | 1 + ...most-latch-uses-values-in-middle-header.ll | 1 + .../interchange-flow-dep-outer.ll | 2 +- .../interchange-insts-between-indvar.ll | 1 + .../LoopInterchange/interchange-no-deps.ll | 30 +-- ...erchangeable-innerloop-multiple-indvars.ll | 1 + ...erchangeable-outerloop-multiple-indvars.ll | 3 +- .../LoopInterchange/interchangeable.ll | 2 +- .../interchanged-loop-nest-3.ll | 22 +- .../LoopInterchange/lcssa-preheader.ll | 1 + llvm/test/Transforms/LoopInterchange/lcssa.ll | 2 +- .../loop-interchange-optimization-remarks.ll | 7 +- .../not-interchanged-dependencies-1.ll | 2 +- .../not-interchanged-loop-nest-3.ll | 22 +- .../not-interchanged-tightly-nested.ll | 20 +- .../outer-header-jump-to-inner-latch.ll | 1 + .../LoopInterchange/outer-only-reductions.ll | 1 + .../LoopInterchange/perserve-lcssa.ll | 3 +- .../pr43176-move-to-new-latch.ll | 1 + .../pr43326-ideal-access-pattern.ll | 2 + .../Transforms/LoopInterchange/pr43326.ll | 1 + ...r43473-invalid-lcssa-phis-in-inner-exit.ll | 2 + ...97-lcssa-for-multiple-outer-loop-blocks.ll | 1 + .../pr45743-move-from-inner-preheader.ll | 1 + .../Transforms/LoopInterchange/pr48212.ll | 2 + .../LoopInterchange/profitability.ll | 2 +- .../reductions-across-inner-and-outer-loop.ll | 2 +- .../update-condbranch-duplicate-successors.ll | 2 +- .../LoopInterchange/vector-gep-operand.ll | 2 + 35 files changed, 301 insertions(+), 169 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index ced4396d5e79..1d3023d04463 100644 
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/DependenceAnalysis.h" +#include "llvm/Analysis/LoopCacheAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopNestAnalysis.h" #include "llvm/Analysis/LoopPass.h" @@ -358,8 +359,10 @@ public: : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {} /// Check if the loop interchange is profitable. - bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId, - CharMatrix &DepMatrix); + bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop, + unsigned InnerLoopId, unsigned OuterLoopId, + CharMatrix &DepMatrix, + const DenseMap &CostMap); private: int getInstrOrderCost(); @@ -410,13 +413,15 @@ struct LoopInterchange { LoopInfo *LI = nullptr; DependenceInfo *DI = nullptr; DominatorTree *DT = nullptr; + std::unique_ptr CC = nullptr; /// Interface to emit optimization remarks. OptimizationRemarkEmitter *ORE; LoopInterchange(ScalarEvolution *SE, LoopInfo *LI, DependenceInfo *DI, - DominatorTree *DT, OptimizationRemarkEmitter *ORE) - : SE(SE), LI(LI), DI(DI), DT(DT), ORE(ORE) {} + DominatorTree *DT, std::unique_ptr &CC, + OptimizationRemarkEmitter *ORE) + : SE(SE), LI(LI), DI(DI), DT(DT), CC(std::move(CC)), ORE(ORE) {} bool run(Loop *L) { if (L->getParentLoop()) @@ -499,6 +504,21 @@ struct LoopInterchange { } unsigned SelecLoopId = selectLoopForInterchange(LoopList); + // Obtain the loop vector returned from loop cache analysis beforehand, + // and put each pair into a map for constant time query + // later. Indices in loop vector reprsent the optimal order of the + // corresponding loop, e.g., given a loopnest with depth N, index 0 + // indicates the loop should be placed as the outermost loop and index N + // indicates the loop should be placed as the innermost loop. + // + // For the old pass manager CacheCost would be null. 
+ DenseMap CostMap; + if (CC != nullptr) { + const auto &LoopCosts = CC->getLoopCosts(); + for (unsigned i = 0; i < LoopCosts.size(); i++) { + CostMap[LoopCosts[i].first] = i; + } + } // We try to achieve the globally optimal memory access for the loopnest, // and do interchange based on a bubble-sort fasion. We start from // the innermost loop, move it outwards to the best possible position @@ -507,7 +527,7 @@ struct LoopInterchange { bool ChangedPerIter = false; for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) { bool Interchanged = processLoop(LoopList[i], LoopList[i - 1], i, i - 1, - DependencyMatrix); + DependencyMatrix, CostMap); if (!Interchanged) continue; // Loops interchanged, update LoopList accordingly. @@ -531,7 +551,8 @@ struct LoopInterchange { bool processLoop(Loop *InnerLoop, Loop *OuterLoop, unsigned InnerLoopId, unsigned OuterLoopId, - std::vector> &DependencyMatrix) { + std::vector> &DependencyMatrix, + const DenseMap &CostMap) { LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId << " and OuterLoopId = " << OuterLoopId << "\n"); LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE); @@ -541,7 +562,8 @@ struct LoopInterchange { } LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n"); LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE); - if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) { + if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId, + DependencyMatrix, CostMap)) { LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n"); return false; } @@ -1135,21 +1157,33 @@ static bool isProfitableForVectorization(unsigned InnerLoopId, return !DepMatrix.empty(); } -bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId, - unsigned OuterLoopId, - CharMatrix &DepMatrix) { - // TODO: Add better profitability checks. - // e.g - // 1) Construct dependency matrix and move the one with no loop carried dep - // inside to enable vectorization. 
+bool LoopInterchangeProfitability::isProfitable( + const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId, + unsigned OuterLoopId, CharMatrix &DepMatrix, + const DenseMap &CostMap) { + // TODO: Remove the legacy cost model. - // This is rough cost estimation algorithm. It counts the good and bad order - // of induction variables in the instruction and allows reordering if number - // of bad orders is more than good. - int Cost = getInstrOrderCost(); - LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n"); - if (Cost < -LoopInterchangeCostThreshold) - return true; + // This is the new cost model returned from loop cache analysis. + // A smaller index means the loop should be placed an outer loop, and vice + // versa. + if (CostMap.find(InnerLoop) != CostMap.end() && + CostMap.find(OuterLoop) != CostMap.end()) { + unsigned InnerIndex = 0, OuterIndex = 0; + InnerIndex = CostMap.find(InnerLoop)->second; + OuterIndex = CostMap.find(OuterLoop)->second; + LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex + << ", OuterIndex = " << OuterIndex << "\n"); + if (InnerIndex < OuterIndex) + return true; + } else { + // Legacy cost model: this is rough cost estimation algorithm. It counts the + // good and bad order of induction variables in the instruction and allows + // reordering if number of bad orders is more than good. + int Cost = getInstrOrderCost(); + LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n"); + if (Cost < -LoopInterchangeCostThreshold) + return true; + } // It is not profitable as per current cache profitability model. But check if // we can move this loop outside to improve parallelism. 
@@ -1160,10 +1194,8 @@ bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId, return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable", InnerLoop->getStartLoc(), InnerLoop->getHeader()) - << "Interchanging loops is too costly (cost=" - << ore::NV("Cost", Cost) << ", threshold=" - << ore::NV("Threshold", LoopInterchangeCostThreshold) - << ") and it does not improve parallelism."; + << "Interchanging loops is too costly and it does not improve " + "parallelism."; }); return false; } @@ -1709,8 +1741,8 @@ struct LoopInterchangeLegacyPass : public LoopPass { auto *DI = &getAnalysis().getDI(); auto *DT = &getAnalysis().getDomTree(); auto *ORE = &getAnalysis().getORE(); - - return LoopInterchange(SE, LI, DI, DT, ORE).run(L); + std::unique_ptr CC = nullptr; + return LoopInterchange(SE, LI, DI, DT, CC, ORE).run(L); } }; } // namespace @@ -1737,8 +1769,10 @@ PreservedAnalyses LoopInterchangePass::run(LoopNest &LN, Function &F = *LN.getParent(); DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI); + std::unique_ptr CC = + CacheCost::getCacheCost(LN.getOutermostLoop(), AR, DI); OptimizationRemarkEmitter ORE(&F); - if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, &ORE).run(LN)) + if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, CC, &ORE).run(LN)) return PreservedAnalyses::all(); return getLoopPassPreservedAnalyses(); } diff --git a/llvm/test/Transforms/LICM/lnicm.ll b/llvm/test/Transforms/LICM/lnicm.ll index 1ef0c1fd1421..fd68787ffc35 100644 --- a/llvm/test/Transforms/LICM/lnicm.ll +++ b/llvm/test/Transforms/LICM/lnicm.ll @@ -1,12 +1,13 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -aa-pipeline=basic-aa -passes='loop(loop-interchange)' -S %s | FileCheck %s --check-prefixes INTC -; RUN: opt -aa-pipeline=basic-aa -passes='loop-mssa(lnicm),loop(loop-interchange)' -S %s | FileCheck %s --check-prefixes LNICM,CHECK -; RUN: opt -aa-pipeline=basic-aa -passes='loop-mssa(licm),loop(loop-interchange)' -S %s | FileCheck 
%s --check-prefixes LICM,CHECK +; RUN: opt -aa-pipeline=basic-aa -passes='loop-mssa(lnicm),loop(loop-interchange)' -S %s | FileCheck %s --check-prefixes LNICM +; RUN: opt -aa-pipeline=basic-aa -passes='loop-mssa(licm),loop(loop-interchange)' -S %s | FileCheck %s --check-prefixes LICM ; This test represents the following function: -; void test(int x[10][10], int y[10], int *z) { -; for (int k = 0; k < 10; k++) { +; void test(int n, int m, int x[m][n], int y[n], int *z) { +; for (int k = 0; k < n; k++) { ; int tmp = *z; -; for (int i = 0; i < 10; i++) +; for (int i = 0; i < m; i++) ; x[i][k] += y[k] + tmp; ; } ; } @@ -15,83 +16,189 @@ ; to keep perfect loop nest. This enables optimizations that require ; perfect loop nest (e.g. loop-interchange) to perform. +target triple = "powerpc64le-unknown-linux-gnu" -define dso_local void @test([10 x i32]* noalias %x, i32* noalias readonly %y, i32* readonly %z) { -; CHECK-LABEL: @test( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[Z:%.*]] = load i32, i32* %z, align 4 -; CHECK-NEXT: br label [[FOR_BODY3_PREHEADER:%.*]] -; LNICM: for.body.preheader: -; LICM-NOT: for.body.preheader: -; INTC-NOT: for.body.preheader: -; LNICM-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; LNICM-NEXT: [[K:%.*]] = phi i32 [ [[INC10:%.*]], [[FOR_END:%.*]] ], [ 0, [[FOR_BODY_PREHEADER:%.*]] ] -; LNICM-NEXT: br label [[FOR_BODY3_SPLIT1:%.*]] -; LICM: [[TMP:%.*]] = load i32, i32* [[ARRAYIDX:%.*]], align 4 -; LNICM: for.body3.preheader: -; LICM-NOT: for.body3.preheader: -; INTC-NOT: for.body3.preheader: -; LNICM-NEXT: br label [[FOR_BODY3:%.*]] -; CHECK: for.body3: -; LNICM-NEXT: [[I:%.*]] = phi i32 [ [[TMP3:%.*]], [[FOR_BODY3_SPLIT:%.*]] ], [ 0, [[FOR_BODY3_PREHEADER:%.*]] ] -; LNICM-NEXT: br label [[FOR_BODY_PREHEADER:%.*]] -; LNICM: for.body3.split1: -; LNICM-NEXT: [[IDXPROM:%.*]] = sext i32 [[K:%.*]] to i64 -; LNICM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* %y, i64 [[IDXPROM:%.*]] -; LNICM-NEXT: [[TMP:%.*]] = load i32, i32* 
[[ARRAYIDX:%.*]], align 4 -; LNICM-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP:%.*]], [[Z:%.*]] -; LNICM-NEXT: [[IDXPROM4:%.*]] = sext i32 [[I:%.*]] to i64 -; LNICM-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* %x, i64 [[IDXPROM4:%.*]] -; LNICM-NEXT: [[IDXPROM6:%.*]] = sext i32 [[K:%.*]] to i64 -; LNICM-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX5:%.*]], i64 0, i64 [[IDXPROM6:%.*]] -; LNICM-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX7:%.*]], align 4 -; LNICM-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP2:%.*]], [[ADD:%.*]] -; LNICM-NEXT: store i32 [[ADD8:%.*]], i32* [[ARRAYIDX7:%.*]], align 4 -; LNICM-NEXT: [[INC:%.*]] = add nsw i32 [[I:%.*]], 1 -; LNICM-NEXT: [[CMP2:%.*]] = icmp slt i32 [[INC:%.*]], 10 -; LNICM-NEXT: br label [[FOR_END:%.*]] -; LNICM: for.body3.split: -; LICM-NOT: for.body3.split: -; INTC-NOT: for.body3.split: -; LNICM-NEXT: [[TMP3:%.*]] = add nsw i32 [[I:%.*]], 1 -; LNICM-NEXT: [[TMP4:%.*]] = icmp slt i32 [[TMP3:%.*]], 10 -; LNICM-NEXT: br i1 [[TMP4:%.*]], label [[FOR_BODY3:%.*]], label [[FOR_END11:%.*]], !llvm.loop !0 -; LNICM: for.end: -; LNICM-NEXT: [[INC10:%.*]] = add nsw i32 [[K:%.*]], 1 -; LNICM-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC10:%.*]], 10 -; LNICM-NEXT: br i1 [[CMP:%.*]], label [[FOR_BODY:%.*]], label [[FOR_BODY3_SPLIT:%.*]], !llvm.loop !2 -; LNICM: for.end11: -; LNICM-NEXT: ret void +define dso_local void @test(i64 %n, i64 %m, ptr noalias %x, ptr noalias readonly %y, ptr readonly %z) { +; The loopnest is not interchanged when we only run loop interchange. 
+; INTC-LABEL: @test( +; INTC-NEXT: gurad: +; INTC-NEXT: [[CMP23:%.*]] = icmp sgt i64 [[M:%.*]], 0 +; INTC-NEXT: [[CMP32:%.*]] = icmp sgt i64 [[N:%.*]], 0 +; INTC-NEXT: br i1 [[CMP23]], label [[FOR_COND1_PREHEADER_LR_PH:%.*]], label [[FOR_END11:%.*]] +; INTC: for.cond1.preheader.lr.ph: +; INTC-NEXT: br i1 [[CMP32]], label [[FOR_I_PREHEADER:%.*]], label [[FOR_END11]] +; INTC: for.i.preheader: +; INTC-NEXT: br label [[ENTRY:%.*]] +; INTC: entry: +; INTC-NEXT: br label [[FOR_BODY:%.*]] +; INTC: for.body: +; INTC-NEXT: [[K_02:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC10:%.*]], [[FOR_END:%.*]] ] +; INTC-NEXT: [[TMP0:%.*]] = load i32, ptr [[Z:%.*]], align 4 +; INTC-NEXT: br label [[FOR_BODY3:%.*]] +; INTC: for.body3: +; INTC-NEXT: [[I_01:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[INC:%.*]], [[FOR_BODY3]] ] +; INTC-NEXT: [[IDXPROM:%.*]] = sext i32 [[K_02]] to i64 +; INTC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[Y:%.*]], i64 [[IDXPROM]] +; INTC-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; INTC-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]] +; INTC-NEXT: [[IDXPROM4:%.*]] = sext i32 [[I_01]] to i64 +; INTC-NEXT: [[INDEX0:%.*]] = mul i64 [[IDXPROM4]], [[N]] +; INTC-NEXT: [[INDEX1:%.*]] = add i64 [[INDEX0]], [[IDXPROM]] +; INTC-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i64 [[INDEX1]] +; INTC-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX7]], align 4 +; INTC-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP2]], [[ADD]] +; INTC-NEXT: store i32 [[ADD8]], ptr [[ARRAYIDX7]], align 4 +; INTC-NEXT: [[INC]] = add nsw i32 [[I_01]], 1 +; INTC-NEXT: [[INC_EXT:%.*]] = sext i32 [[INC]] to i64 +; INTC-NEXT: [[CMP2:%.*]] = icmp slt i64 [[INC_EXT]], [[M]] +; INTC-NEXT: br i1 [[CMP2]], label [[FOR_BODY3]], label [[FOR_END]], !llvm.loop [[LOOP0:![0-9]+]] +; INTC: for.end: +; INTC-NEXT: [[INC10]] = add nsw i32 [[K_02]], 1 +; INTC-NEXT: [[INC10_EXT:%.*]] = sext i32 [[INC10]] to i64 +; INTC-NEXT: [[CMP:%.*]] = icmp slt i64 [[INC10_EXT]], [[N]] +; 
INTC-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END11_LOOPEXIT:%.*]], !llvm.loop [[LOOP2:![0-9]+]] +; INTC: for.end11.loopexit: +; INTC-NEXT: br label [[FOR_END11]] +; INTC: for.end11: +; INTC-NEXT: ret void +; +; The loopnest is interchanged when we run lnicm and loop interchange. +; LNICM-LABEL: @test( +; LNICM-NEXT: gurad: +; LNICM-NEXT: [[CMP23:%.*]] = icmp sgt i64 [[M:%.*]], 0 +; LNICM-NEXT: [[CMP32:%.*]] = icmp sgt i64 [[N:%.*]], 0 +; LNICM-NEXT: br i1 [[CMP23]], label [[FOR_COND1_PREHEADER_LR_PH:%.*]], label [[FOR_END11:%.*]] +; LNICM: for.cond1.preheader.lr.ph: +; LNICM-NEXT: br i1 [[CMP32]], label [[FOR_I_PREHEADER:%.*]], label [[FOR_END11]] +; LNICM: for.i.preheader: +; LNICM-NEXT: br label [[FOR_BODY3_PREHEADER:%.*]] +; LNICM: entry: +; LNICM-NEXT: br label [[FOR_BODY:%.*]] +; LNICM: for.body: +; LNICM-NEXT: [[K_02:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC10:%.*]], [[FOR_END:%.*]] ] +; LNICM-NEXT: br label [[FOR_BODY3_SPLIT1:%.*]] +; LNICM: for.body3.preheader: +; LNICM-NEXT: [[TMP0:%.*]] = load i32, ptr [[Z:%.*]], align 4 +; LNICM-NEXT: br label [[FOR_BODY3:%.*]] +; LNICM: for.body3: +; LNICM-NEXT: [[I_01:%.*]] = phi i32 [ [[TMP3:%.*]], [[FOR_BODY3_SPLIT:%.*]] ], [ 0, [[FOR_BODY3_PREHEADER]] ] +; LNICM-NEXT: br label [[ENTRY]] +; LNICM: for.body3.split1: +; LNICM-NEXT: [[IDXPROM:%.*]] = sext i32 [[K_02]] to i64 +; LNICM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[Y:%.*]], i64 [[IDXPROM]] +; LNICM-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; LNICM-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]] +; LNICM-NEXT: [[IDXPROM4:%.*]] = sext i32 [[I_01]] to i64 +; LNICM-NEXT: [[INDEX0:%.*]] = mul i64 [[IDXPROM4]], [[N]] +; LNICM-NEXT: [[INDEX1:%.*]] = add i64 [[INDEX0]], [[IDXPROM]] +; LNICM-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i64 [[INDEX1]] +; LNICM-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX7]], align 4 +; LNICM-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP2]], [[ADD]] +; LNICM-NEXT: 
store i32 [[ADD8]], ptr [[ARRAYIDX7]], align 4 +; LNICM-NEXT: [[INC:%.*]] = add nsw i32 [[I_01]], 1 +; LNICM-NEXT: [[INC_EXT:%.*]] = sext i32 [[INC]] to i64 +; LNICM-NEXT: [[CMP2:%.*]] = icmp slt i64 [[INC_EXT]], [[M]] +; LNICM-NEXT: br label [[FOR_END]] +; LNICM: for.body3.split: +; LNICM-NEXT: [[TMP3]] = add nsw i32 [[I_01]], 1 +; LNICM-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64 +; LNICM-NEXT: [[TMP5:%.*]] = icmp slt i64 [[TMP4]], [[M]] +; LNICM-NEXT: br i1 [[TMP5]], label [[FOR_BODY3]], label [[FOR_END11_LOOPEXIT:%.*]], !llvm.loop [[LOOP0:![0-9]+]] +; LNICM: for.end: +; LNICM-NEXT: [[INC10]] = add nsw i32 [[K_02]], 1 +; LNICM-NEXT: [[INC10_EXT:%.*]] = sext i32 [[INC10]] to i64 +; LNICM-NEXT: [[CMP:%.*]] = icmp slt i64 [[INC10_EXT]], [[N]] +; LNICM-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_BODY3_SPLIT]], !llvm.loop [[LOOP2:![0-9]+]] +; LNICM: for.end11.loopexit: +; LNICM-NEXT: br label [[FOR_END11]] +; LNICM: for.end11: +; LNICM-NEXT: ret void +; +; The loopnest is not interchanged when we run licm and loop interchange. 
+; LICM-LABEL: @test( +; LICM-NEXT: gurad: +; LICM-NEXT: [[CMP23:%.*]] = icmp sgt i64 [[M:%.*]], 0 +; LICM-NEXT: [[CMP32:%.*]] = icmp sgt i64 [[N:%.*]], 0 +; LICM-NEXT: br i1 [[CMP23]], label [[FOR_COND1_PREHEADER_LR_PH:%.*]], label [[FOR_END11:%.*]] +; LICM: for.cond1.preheader.lr.ph: +; LICM-NEXT: br i1 [[CMP32]], label [[FOR_I_PREHEADER:%.*]], label [[FOR_END11]] +; LICM: for.i.preheader: +; LICM-NEXT: br label [[ENTRY:%.*]] +; LICM: entry: +; LICM-NEXT: [[TMP0:%.*]] = load i32, ptr [[Z:%.*]], align 4 +; LICM-NEXT: br label [[FOR_BODY:%.*]] +; LICM: for.body: +; LICM-NEXT: [[K_02:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC10:%.*]], [[FOR_END:%.*]] ] +; LICM-NEXT: [[IDXPROM:%.*]] = sext i32 [[K_02]] to i64 +; LICM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[Y:%.*]], i64 [[IDXPROM]] +; LICM-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; LICM-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]] +; LICM-NEXT: br label [[FOR_BODY3:%.*]] +; LICM: for.body3: +; LICM-NEXT: [[I_01:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[INC:%.*]], [[FOR_BODY3]] ] +; LICM-NEXT: [[IDXPROM4:%.*]] = sext i32 [[I_01]] to i64 +; LICM-NEXT: [[INDEX0:%.*]] = mul i64 [[IDXPROM4]], [[N]] +; LICM-NEXT: [[INDEX1:%.*]] = add i64 [[INDEX0]], [[IDXPROM]] +; LICM-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i64 [[INDEX1]] +; LICM-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX7]], align 4 +; LICM-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP2]], [[ADD]] +; LICM-NEXT: store i32 [[ADD8]], ptr [[ARRAYIDX7]], align 4 +; LICM-NEXT: [[INC]] = add nsw i32 [[I_01]], 1 +; LICM-NEXT: [[INC_EXT:%.*]] = sext i32 [[INC]] to i64 +; LICM-NEXT: [[CMP2:%.*]] = icmp slt i64 [[INC_EXT]], [[M]] +; LICM-NEXT: br i1 [[CMP2]], label [[FOR_BODY3]], label [[FOR_END]], !llvm.loop [[LOOP0:![0-9]+]] +; LICM: for.end: +; LICM-NEXT: [[INC10]] = add nsw i32 [[K_02]], 1 +; LICM-NEXT: [[INC10_EXT:%.*]] = sext i32 [[INC10]] to i64 +; LICM-NEXT: [[CMP:%.*]] = icmp slt i64 [[INC10_EXT]], [[N]] +; 
LICM-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END11_LOOPEXIT:%.*]], !llvm.loop [[LOOP2:![0-9]+]] +; LICM: for.end11.loopexit: +; LICM-NEXT: br label [[FOR_END11]] +; LICM: for.end11: +; LICM-NEXT: ret void +; -entry: +gurad: + %cmp23 = icmp sgt i64 %m, 0 + %cmp32 = icmp sgt i64 %n, 0 + br i1 %cmp23, label %for.cond1.preheader.lr.ph, label %for.end11 + +for.cond1.preheader.lr.ph: ; preds = %gurad + br i1 %cmp32, label %for.i.preheader, label %for.end11 + +for.i.preheader: ; preds = %for.cond1.preheader.lr.ph + br label %entry + +entry: ; preds = %for.i.preheader br label %for.body for.body: %k.02 = phi i32 [ 0, %entry ], [ %inc10, %for.end ] - %0 = load i32, i32* %z, align 4 + %0 = load i32, ptr %z, align 4 br label %for.body3 for.body3: %i.01 = phi i32 [ 0, %for.body ], [ %inc, %for.body3 ] %idxprom = sext i32 %k.02 to i64 - %arrayidx = getelementptr inbounds i32, i32* %y, i64 %idxprom - %1 = load i32, i32* %arrayidx, align 4 + %arrayidx = getelementptr inbounds i32, ptr %y, i64 %idxprom + %1 = load i32, ptr %arrayidx, align 4 %add = add nsw i32 %1, %0 %idxprom4 = sext i32 %i.01 to i64 - %arrayidx5 = getelementptr inbounds [10 x i32], [10 x i32]* %x, i64 %idxprom4 - %idxprom6 = sext i32 %k.02 to i64 - %arrayidx7 = getelementptr inbounds [10 x i32], [10 x i32]* %arrayidx5, i64 0, i64 %idxprom6 - %2 = load i32, i32* %arrayidx7, align 4 + %index0 = mul i64 %idxprom4, %n + %index1 = add i64 %index0, %idxprom + %arrayidx7 = getelementptr inbounds i32, ptr %x, i64 %index1 + %2 = load i32, ptr %arrayidx7, align 4 %add8 = add nsw i32 %2, %add - store i32 %add8, i32* %arrayidx7, align 4 + store i32 %add8, ptr %arrayidx7, align 4 %inc = add nsw i32 %i.01, 1 - %cmp2 = icmp slt i32 %inc, 10 + %inc.ext = sext i32 %inc to i64 + %cmp2 = icmp slt i64 %inc.ext, %m br i1 %cmp2, label %for.body3, label %for.end, !llvm.loop !0 for.end: %inc10 = add nsw i32 %k.02, 1 - %cmp = icmp slt i32 %inc10, 10 + %inc10.ext = sext i32 %inc10 to i64 + %cmp = icmp slt i64 %inc10.ext, %n 
br i1 %cmp, label %for.body, label %for.end11, !llvm.loop !2 for.end11: diff --git a/llvm/test/Transforms/LoopInterchange/call-instructions.ll b/llvm/test/Transforms/LoopInterchange/call-instructions.ll index d945fe6d720c..542e0101c7d4 100644 --- a/llvm/test/Transforms/LoopInterchange/call-instructions.ll +++ b/llvm/test/Transforms/LoopInterchange/call-instructions.ll @@ -4,7 +4,7 @@ ; RUN: FileCheck --input-file=%t %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" +target triple = "powerpc64le-unknown-linux-gnu" @A = common global [100 x [100 x i32]] zeroinitializer diff --git a/llvm/test/Transforms/LoopInterchange/currentLimitation.ll b/llvm/test/Transforms/LoopInterchange/currentLimitation.ll index 82c16555f44f..9f7a954c8e73 100644 --- a/llvm/test/Transforms/LoopInterchange/currentLimitation.ll +++ b/llvm/test/Transforms/LoopInterchange/currentLimitation.ll @@ -8,7 +8,7 @@ ; RUN: FileCheck --check-prefix=DELIN --input-file=%t %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" +target triple = "powerpc64le-unknown-linux-gnu" @A = common global [100 x [100 x i32]] zeroinitializer @B = common global [100 x [100 x [100 x i32]]] zeroinitializer diff --git a/llvm/test/Transforms/LoopInterchange/debuginfo.ll b/llvm/test/Transforms/LoopInterchange/debuginfo.ll index e2187b932814..7b410bcde264 100644 --- a/llvm/test/Transforms/LoopInterchange/debuginfo.ll +++ b/llvm/test/Transforms/LoopInterchange/debuginfo.ll @@ -4,7 +4,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" +target triple = "powerpc64le-unknown-linux-gnu" @A = common global [100 x [100 x i64]] zeroinitializer diff --git a/llvm/test/Transforms/LoopInterchange/inner-indvar-depend-on-outer-indvar.ll b/llvm/test/Transforms/LoopInterchange/inner-indvar-depend-on-outer-indvar.ll index 5e3c59e80301..d341dd4ce632 100644 --- 
a/llvm/test/Transforms/LoopInterchange/inner-indvar-depend-on-outer-indvar.ll +++ b/llvm/test/Transforms/LoopInterchange/inner-indvar-depend-on-outer-indvar.ll @@ -2,6 +2,7 @@ ; RUN: opt < %s -basic-aa -loop-interchange -verify-dom-info -verify-loop-info \ ; RUN: -S -debug 2>&1 | FileCheck %s +target triple = "powerpc64le-unknown-linux-gnu" @A = common global [100 x [100 x i64]] zeroinitializer @N = dso_local local_unnamed_addr global i64 100, align 8 diff --git a/llvm/test/Transforms/LoopInterchange/inner-only-reductions.ll b/llvm/test/Transforms/LoopInterchange/inner-only-reductions.ll index d0eac6d65e82..cc12b5272e48 100644 --- a/llvm/test/Transforms/LoopInterchange/inner-only-reductions.ll +++ b/llvm/test/Transforms/LoopInterchange/inner-only-reductions.ll @@ -5,6 +5,7 @@ ; Inner loop only reductions are not supported currently. See discussion at ; D53027 for more information on the required checks. +target triple = "powerpc64le-unknown-linux-gnu" @A = common global [500 x [500 x i32]] zeroinitializer @X = common global i32 0 @B = common global [500 x [500 x i32]] zeroinitializer diff --git a/llvm/test/Transforms/LoopInterchange/innermost-latch-uses-values-in-middle-header.ll b/llvm/test/Transforms/LoopInterchange/innermost-latch-uses-values-in-middle-header.ll index 65f1290101ca..d1f8d9f3ebc7 100644 --- a/llvm/test/Transforms/LoopInterchange/innermost-latch-uses-values-in-middle-header.ll +++ b/llvm/test/Transforms/LoopInterchange/innermost-latch-uses-values-in-middle-header.ll @@ -2,6 +2,7 @@ ; RUN: opt < %s -basic-aa -loop-interchange -verify-dom-info -verify-loop-info \ ; RUN: -S -debug 2>&1 | FileCheck %s +target triple = "powerpc64le-unknown-linux-gnu" @a = common global i32 0, align 4 @d = common dso_local local_unnamed_addr global [1 x [6 x i32]] zeroinitializer, align 4 diff --git a/llvm/test/Transforms/LoopInterchange/interchange-flow-dep-outer.ll b/llvm/test/Transforms/LoopInterchange/interchange-flow-dep-outer.ll index 23f7d912ffa2..b04ed4571615 
100644 --- a/llvm/test/Transforms/LoopInterchange/interchange-flow-dep-outer.ll +++ b/llvm/test/Transforms/LoopInterchange/interchange-flow-dep-outer.ll @@ -3,7 +3,7 @@ ; RUN: -S -debug 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" +target triple = "powerpc64le-unknown-linux-gnu" @A = common global [100 x [100 x i32]] zeroinitializer @B = common global [100 x i32] zeroinitializer diff --git a/llvm/test/Transforms/LoopInterchange/interchange-insts-between-indvar.ll b/llvm/test/Transforms/LoopInterchange/interchange-insts-between-indvar.ll index fa2f112e7119..b7ce5ebd52f4 100644 --- a/llvm/test/Transforms/LoopInterchange/interchange-insts-between-indvar.ll +++ b/llvm/test/Transforms/LoopInterchange/interchange-insts-between-indvar.ll @@ -1,6 +1,7 @@ ; RUN: opt < %s -basic-aa -loop-interchange -verify-dom-info -verify-loop-info \ ; RUN: -S -pass-remarks=loop-interchange 2>&1 | FileCheck %s +target triple = "powerpc64le-unknown-linux-gnu" @A10 = local_unnamed_addr global [3 x [3 x i32]] zeroinitializer, align 16 ;; Test to make sure we can handle zext instructions introduced by diff --git a/llvm/test/Transforms/LoopInterchange/interchange-no-deps.ll b/llvm/test/Transforms/LoopInterchange/interchange-no-deps.ll index f8e2d6b45a19..ca96b7480e4a 100644 --- a/llvm/test/Transforms/LoopInterchange/interchange-no-deps.ll +++ b/llvm/test/Transforms/LoopInterchange/interchange-no-deps.ll @@ -4,6 +4,7 @@ ; RUN: | FileCheck -check-prefix=STATS %s ; RUN: FileCheck -input-file %t %s +target triple = "powerpc64le-unknown-linux-gnu" ; no_deps_interchange just accesses a single nested array and can be interchange. ; CHECK: Name: Interchanged @@ -34,35 +35,6 @@ exit: ; preds = %for1.inc } -; Only the inner loop induction variable is used for memory accesses. -; Interchanging is not beneficial. 
-; CHECK: Name: InterchangeNotProfitable -; CHECK-NEXT: Function: no_bad_order -define i32 @no_bad_order(i32* %Arr) { -entry: - br label %for1.header - -for1.header: ; preds = %entry, %for1.inc - %indvars.iv19 = phi i64 [ 0, %entry ], [ %indvars.iv.next20, %for1.inc ] - br label %for2 - -for2: ; preds = %for1.header, %for2 - %indvars.iv = phi i64 [ 0, %for1.header ], [ %indvars.iv.next, %for2 ] - %arrayidx6 = getelementptr inbounds i32, i32* %Arr, i64 %indvars.iv - store i32 0, i32* %arrayidx6, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp ne i64 %indvars.iv.next, 1024 - br i1 %exitcond, label %for2, label %for1.inc - -for1.inc: - %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1 - %exitcond21 = icmp ne i64 %indvars.iv.next20, 1024 - br i1 %exitcond21, label %for1.header, label %exit - -exit: ; preds = %for1.inc - ret i32 0 -} - ; No memory access using any induction variables, interchanging not beneficial. ; CHECK: Name: InterchangeNotProfitable ; CHECK-NEXT: Function: no_mem_instrs diff --git a/llvm/test/Transforms/LoopInterchange/interchangeable-innerloop-multiple-indvars.ll b/llvm/test/Transforms/LoopInterchange/interchangeable-innerloop-multiple-indvars.ll index 830d0a26c950..633ed7251266 100644 --- a/llvm/test/Transforms/LoopInterchange/interchangeable-innerloop-multiple-indvars.ll +++ b/llvm/test/Transforms/LoopInterchange/interchangeable-innerloop-multiple-indvars.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -basic-aa -loop-interchange -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -S | FileCheck %s +target triple = "powerpc64le-unknown-linux-gnu" @b = common dso_local local_unnamed_addr global [200 x [200 x i32]] zeroinitializer, align 4 @a = common dso_local local_unnamed_addr global i32 0, align 4 diff --git a/llvm/test/Transforms/LoopInterchange/interchangeable-outerloop-multiple-indvars.ll 
b/llvm/test/Transforms/LoopInterchange/interchangeable-outerloop-multiple-indvars.ll index 3b12cf443cb9..605e948d6194 100644 --- a/llvm/test/Transforms/LoopInterchange/interchangeable-outerloop-multiple-indvars.ll +++ b/llvm/test/Transforms/LoopInterchange/interchangeable-outerloop-multiple-indvars.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s --basic-aa -loop-interchange -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -S | FileCheck %s +; RUN: opt < %s -basic-aa -loop-interchange -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -S | FileCheck %s +target triple = "powerpc64le-unknown-linux-gnu" @b = constant [200 x [100 x i32]] zeroinitializer, align 4 @a = constant i32 0, align 4 diff --git a/llvm/test/Transforms/LoopInterchange/interchangeable.ll b/llvm/test/Transforms/LoopInterchange/interchangeable.ll index 00261aa99b03..330e466f0e84 100644 --- a/llvm/test/Transforms/LoopInterchange/interchangeable.ll +++ b/llvm/test/Transforms/LoopInterchange/interchangeable.ll @@ -3,7 +3,7 @@ ; RUN: opt < %s -aa-pipeline=basic-aa -passes=loop-interchange -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -S | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" +target triple = "powerpc64le-unknown-linux-gnu" @A = common global [100 x [100 x i64]] zeroinitializer @B = common global [100 x i64] zeroinitializer diff --git a/llvm/test/Transforms/LoopInterchange/interchanged-loop-nest-3.ll b/llvm/test/Transforms/LoopInterchange/interchanged-loop-nest-3.ll index fa5db55ba5cf..09269c1349c1 100644 --- a/llvm/test/Transforms/LoopInterchange/interchanged-loop-nest-3.ll +++ b/llvm/test/Transforms/LoopInterchange/interchanged-loop-nest-3.ll @@ -3,7 +3,7 @@ ; RUN: -S -debug 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" +target triple 
= "powerpc64le-unknown-linux-gnu" @D = common global [100 x [100 x [100 x i32]]] zeroinitializer @@ -24,31 +24,31 @@ entry: br label %for.cond1.preheader for.cond1.preheader: ; preds = %for.inc15, %entry - %i.028 = phi i32 [ 0, %entry ], [ %inc16, %for.inc15 ] + %i.028 = phi i64 [ 0, %entry ], [ %inc16, %for.inc15 ] br label %for.cond4.preheader for.cond4.preheader: ; preds = %for.inc12, %for.cond1.preheader - %j.027 = phi i32 [ 0, %for.cond1.preheader ], [ %inc13, %for.inc12 ] + %j.027 = phi i64 [ 0, %for.cond1.preheader ], [ %inc13, %for.inc12 ] br label %for.body6 for.body6: ; preds = %for.body6, %for.cond4.preheader - %k.026 = phi i32 [ 0, %for.cond4.preheader ], [ %inc, %for.body6 ] - %arrayidx8 = getelementptr inbounds [100 x [100 x [100 x i32]]], [100 x [100 x [100 x i32]]]* @D, i32 0, i32 %k.026, i32 %j.027, i32 %i.028 + %k.026 = phi i64 [ 0, %for.cond4.preheader ], [ %inc, %for.body6 ] + %arrayidx8 = getelementptr inbounds [100 x [100 x [100 x i32]]], [100 x [100 x [100 x i32]]]* @D, i64 0, i64 %k.026, i64 %j.027, i64 %i.028 %0 = load i32, i32* %arrayidx8 %add = add nsw i32 %0, %t store i32 %add, i32* %arrayidx8 - %inc = add nuw nsw i32 %k.026, 1 - %exitcond = icmp eq i32 %inc, 100 + %inc = add nuw nsw i64 %k.026, 1 + %exitcond = icmp eq i64 %inc, 100 br i1 %exitcond, label %for.inc12, label %for.body6 for.inc12: ; preds = %for.body6 - %inc13 = add nuw nsw i32 %j.027, 1 - %exitcond29 = icmp eq i32 %inc13, 100 + %inc13 = add nuw nsw i64 %j.027, 1 + %exitcond29 = icmp eq i64 %inc13, 100 br i1 %exitcond29, label %for.inc15, label %for.cond4.preheader for.inc15: ; preds = %for.inc12 - %inc16 = add nuw nsw i32 %i.028, 1 - %exitcond30 = icmp eq i32 %inc16, 100 + %inc16 = add nuw nsw i64 %i.028, 1 + %exitcond30 = icmp eq i64 %inc16, 100 br i1 %exitcond30, label %for.end17, label %for.cond1.preheader for.end17: ; preds = %for.inc15 diff --git a/llvm/test/Transforms/LoopInterchange/lcssa-preheader.ll b/llvm/test/Transforms/LoopInterchange/lcssa-preheader.ll index 
ab616eea9740..a9db7e3be1f1 100644 --- a/llvm/test/Transforms/LoopInterchange/lcssa-preheader.ll +++ b/llvm/test/Transforms/LoopInterchange/lcssa-preheader.ll @@ -3,6 +3,7 @@ ; RUN: opt < %s -basic-aa -loop-interchange -da-disable-delinearization-checks -pass-remarks-missed='loop-interchange' -verify-loop-lcssa -S | FileCheck -check-prefix=CHECK-DELIN %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "powerpc64le-unknown-linux-gnu" ; void foo(int n, int m) { ; int temp[16][16]; diff --git a/llvm/test/Transforms/LoopInterchange/lcssa.ll b/llvm/test/Transforms/LoopInterchange/lcssa.ll index 9809689bb049..0910fc03d0e9 100644 --- a/llvm/test/Transforms/LoopInterchange/lcssa.ll +++ b/llvm/test/Transforms/LoopInterchange/lcssa.ll @@ -2,7 +2,7 @@ ; RUN: FileCheck --input-file %t --check-prefix REMARK %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" +target triple = "powerpc64le-unknown-linux-gnu" @A = common global [100 x [100 x i32]] zeroinitializer @C = common global [100 x [100 x i32]] zeroinitializer diff --git a/llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll b/llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll index 388f237db347..2ff499e698e9 100644 --- a/llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll +++ b/llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll @@ -10,6 +10,7 @@ ; RUN: -pass-remarks='loop-interchange' -S -da-disable-delinearization-checks ; RUN: cat %t | FileCheck --check-prefix=DELIN %s +target triple = "powerpc64le-unknown-linux-gnu" @A = common global [100 x [100 x i32]] zeroinitializer @B = common global [100 x [100 x i32]] zeroinitializer @C = common global [100 x i32] zeroinitializer @@ -71,11 +72,7 @@ for.end19: ; DELIN-NEXT: Name: InterchangeNotProfitable ; DELIN-NEXT: Function: test01 ; DELIN-NEXT: Args: -; DELIN-NEXT: - String: 'Interchanging 
loops is too costly (cost=' -; DELIN-NEXT: - Cost: '2' -; DELIN-NEXT: - String: ', threshold=' -; DELIN-NEXT: - Threshold: '0' -; DELIN-NEXT: - String: ') and it does not improve parallelism.' +; DELIN-NEXT: - String: Interchanging loops is too costly and it does not improve parallelism. ; DELIN-NEXT: ... ;;--------------------------------------Test case 02------------------------------------ diff --git a/llvm/test/Transforms/LoopInterchange/not-interchanged-dependencies-1.ll b/llvm/test/Transforms/LoopInterchange/not-interchanged-dependencies-1.ll index 3bf1e214cfef..4e58e9426df3 100644 --- a/llvm/test/Transforms/LoopInterchange/not-interchanged-dependencies-1.ll +++ b/llvm/test/Transforms/LoopInterchange/not-interchanged-dependencies-1.ll @@ -3,7 +3,7 @@ ; RUN: -S -debug 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" +target triple = "powerpc64le-unknown-linux-gnu" @A = common global [100 x [100 x i32]] zeroinitializer @B = common global [100 x i32] zeroinitializer diff --git a/llvm/test/Transforms/LoopInterchange/not-interchanged-loop-nest-3.ll b/llvm/test/Transforms/LoopInterchange/not-interchanged-loop-nest-3.ll index 46aa4b5b4e76..b771ffadef41 100644 --- a/llvm/test/Transforms/LoopInterchange/not-interchanged-loop-nest-3.ll +++ b/llvm/test/Transforms/LoopInterchange/not-interchanged-loop-nest-3.ll @@ -3,7 +3,7 @@ ; RUN: -S -debug 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" +target triple = "powerpc64le-unknown-linux-gnu" @D = common global [100 x [100 x [100 x i32]]] zeroinitializer @@ -24,31 +24,31 @@ entry: br label %for.cond1.preheader for.cond1.preheader: ; preds = %for.inc15, %entry - %i.028 = phi i32 [ 0, %entry ], [ %inc16, %for.inc15 ] + %i.028 = phi i64 [ 0, %entry ], [ %inc16, %for.inc15 ] br label %for.cond4.preheader for.cond4.preheader: ; preds = %for.inc12, %for.cond1.preheader - %j.027 = 
phi i32 [ 0, %for.cond1.preheader ], [ %inc13, %for.inc12 ] + %j.027 = phi i64 [ 0, %for.cond1.preheader ], [ %inc13, %for.inc12 ] br label %for.body6 for.body6: ; preds = %for.body6, %for.cond4.preheader - %k.026 = phi i32 [ 0, %for.cond4.preheader ], [ %inc, %for.body6 ] - %arrayidx8 = getelementptr inbounds [100 x [100 x [100 x i32]]], [100 x [100 x [100 x i32]]]* @D, i32 0, i32 %i.028, i32 %k.026, i32 %j.027 + %k.026 = phi i64 [ 0, %for.cond4.preheader ], [ %inc, %for.body6 ] + %arrayidx8 = getelementptr inbounds [100 x [100 x [100 x i32]]], [100 x [100 x [100 x i32]]]* @D, i32 0, i64 %i.028, i64 %k.026, i64 %j.027 %0 = load i32, i32* %arrayidx8 %add = add nsw i32 %0, %t store i32 %add, i32* %arrayidx8 - %inc = add nuw nsw i32 %k.026, 1 - %exitcond = icmp eq i32 %inc, 100 + %inc = add nuw nsw i64 %k.026, 1 + %exitcond = icmp eq i64 %inc, 100 br i1 %exitcond, label %for.inc12, label %for.body6 for.inc12: ; preds = %for.body6 - %inc13 = add nuw nsw i32 %j.027, 1 - %exitcond29 = icmp eq i32 %inc13, 100 + %inc13 = add nuw nsw i64 %j.027, 1 + %exitcond29 = icmp eq i64 %inc13, 100 br i1 %exitcond29, label %for.inc15, label %for.cond4.preheader for.inc15: ; preds = %for.inc12 - %inc16 = add nuw nsw i32 %i.028, 1 - %exitcond30 = icmp eq i32 %inc16, 100 + %inc16 = add nuw nsw i64 %i.028, 1 + %exitcond30 = icmp eq i64 %inc16, 100 br i1 %exitcond30, label %for.end17, label %for.cond1.preheader for.end17: ; preds = %for.inc15 diff --git a/llvm/test/Transforms/LoopInterchange/not-interchanged-tightly-nested.ll b/llvm/test/Transforms/LoopInterchange/not-interchanged-tightly-nested.ll index 82f661502f57..e4301762ed2b 100644 --- a/llvm/test/Transforms/LoopInterchange/not-interchanged-tightly-nested.ll +++ b/llvm/test/Transforms/LoopInterchange/not-interchanged-tightly-nested.ll @@ -3,7 +3,7 @@ ; RUN: -S -debug 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" +target triple = 
"powerpc64le-unknown-linux-gnu" @A = common global [100 x [100 x i32]] zeroinitializer @B = common global [100 x i32] zeroinitializer @@ -108,13 +108,13 @@ for.end12: ;; The outer loop header does not branch to the inner loop preheader, or the ;; inner loop header, or the outer loop latch. ; CHECK: Not interchanging loops. Cannot prove legality. -define void @interchange_07(i32 %k, i32 %N, i32 %ny) { +define void @interchange_07(i32 %k, i32 %N, i64 %ny) { entry: br label %for1.header for1.header: - %j23 = phi i32 [ 0, %entry ], [ %j.next24, %for1.inc10 ] - %cmp21 = icmp slt i32 0, %ny + %j23 = phi i64 [ 0, %entry ], [ %j.next24, %for1.inc10 ] + %cmp21 = icmp slt i64 0, %ny br label %singleSucc singleSucc: @@ -124,18 +124,18 @@ preheader.j: br label %for2 for2: - %j = phi i32 [ %j.next, %for2 ], [ 0, %preheader.j ] - %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i32 0, i32 %j, i32 %j23 + %j = phi i64 [ %j.next, %for2 ], [ 0, %preheader.j ] + %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %j, i64 %j23 %lv = load i32, i32* %arrayidx5 %add = add nsw i32 %lv, %k store i32 %add, i32* %arrayidx5 - %j.next = add nuw nsw i32 %j, 1 - %exitcond = icmp eq i32 %j, 99 + %j.next = add nuw nsw i64 %j, 1 + %exitcond = icmp eq i64 %j, 99 br i1 %exitcond, label %for1.inc10, label %for2 for1.inc10: - %j.next24 = add nuw nsw i32 %j23, 1 - %exitcond26 = icmp eq i32 %j23, 99 + %j.next24 = add nuw nsw i64 %j23, 1 + %exitcond26 = icmp eq i64 %j23, 99 br i1 %exitcond26, label %for.end12, label %for1.header for.end12: diff --git a/llvm/test/Transforms/LoopInterchange/outer-header-jump-to-inner-latch.ll b/llvm/test/Transforms/LoopInterchange/outer-header-jump-to-inner-latch.ll index 3a7c82b83bfc..2936c5a7a492 100644 --- a/llvm/test/Transforms/LoopInterchange/outer-header-jump-to-inner-latch.ll +++ b/llvm/test/Transforms/LoopInterchange/outer-header-jump-to-inner-latch.ll @@ -1,5 +1,6 @@ ; RUN: opt -basic-aa 
-loop-interchange -verify-dom-info -verify-loop-info -verify-loop-lcssa -S %s | FileCheck %s +target triple = "powerpc64le-unknown-linux-gnu" @b = global [3 x [5 x [8 x i16]]] [[5 x [8 x i16]] zeroinitializer, [5 x [8 x i16]] [[8 x i16] zeroinitializer, [8 x i16] [i16 0, i16 0, i16 0, i16 6, i16 1, i16 6, i16 0, i16 0], [8 x i16] zeroinitializer, [8 x i16] zeroinitializer, [8 x i16] zeroinitializer], [5 x [8 x i16]] zeroinitializer], align 2 @a = common global i32 0, align 4 @d = common dso_local local_unnamed_addr global [1 x [6 x i32]] zeroinitializer, align 4 diff --git a/llvm/test/Transforms/LoopInterchange/outer-only-reductions.ll b/llvm/test/Transforms/LoopInterchange/outer-only-reductions.ll index 7e9db5a4d4e5..b4312e11c3fa 100644 --- a/llvm/test/Transforms/LoopInterchange/outer-only-reductions.ll +++ b/llvm/test/Transforms/LoopInterchange/outer-only-reductions.ll @@ -4,6 +4,7 @@ ; Outer loop only reductions are not supported currently. +target triple = "powerpc64le-unknown-linux-gnu" @A = common global [500 x [500 x i32]] zeroinitializer ;; global X diff --git a/llvm/test/Transforms/LoopInterchange/perserve-lcssa.ll b/llvm/test/Transforms/LoopInterchange/perserve-lcssa.ll index 77546020bf95..48bf01c86653 100644 --- a/llvm/test/Transforms/LoopInterchange/perserve-lcssa.ll +++ b/llvm/test/Transforms/LoopInterchange/perserve-lcssa.ll @@ -3,6 +3,7 @@ ; Test case for PR41725. The induction variables in the latches escape the ; loops and we must move some PHIs around. +target triple = "powerpc64le-unknown-linux-gnu" @a = common dso_local global i64 0, align 4 @b = common dso_local global i64 0, align 4 @c = common dso_local global [10 x [10 x i32 ]] zeroinitializer, align 16 @@ -157,7 +158,7 @@ exit: ; preds = %outer.latch ; Make sure we do not crash for loops without reachable exits. define void @no_reachable_exits() { ; Check we interchanged. 
-; CHECK-LABEL: @no_reachable_exits() { +; CHECK-LABEL: @no_reachable_exits() ; CHECK-NEXT: bb: ; CHECK-NEXT: br label %inner.ph ; CHECK-LABEL: outer.ph: diff --git a/llvm/test/Transforms/LoopInterchange/pr43176-move-to-new-latch.ll b/llvm/test/Transforms/LoopInterchange/pr43176-move-to-new-latch.ll index ef0c5ad6cb05..98e172dd751b 100644 --- a/llvm/test/Transforms/LoopInterchange/pr43176-move-to-new-latch.ll +++ b/llvm/test/Transforms/LoopInterchange/pr43176-move-to-new-latch.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -loop-interchange -verify-loop-lcssa -verify-dom-info -S %s | FileCheck %s +target triple = "powerpc64le-unknown-linux-gnu" @b = external dso_local global [5 x i32], align 16 define void @test1() { diff --git a/llvm/test/Transforms/LoopInterchange/pr43326-ideal-access-pattern.ll b/llvm/test/Transforms/LoopInterchange/pr43326-ideal-access-pattern.ll index 443f61fbe2a0..478212c7a9fe 100644 --- a/llvm/test/Transforms/LoopInterchange/pr43326-ideal-access-pattern.ll +++ b/llvm/test/Transforms/LoopInterchange/pr43326-ideal-access-pattern.ll @@ -14,6 +14,8 @@ ; } ; } +target triple = "powerpc64le-unknown-linux-gnu" + ; REMARKS: --- !Passed ; REMARKS-NEXT: Pass: loop-interchange ; REMARKS-NEXT: Name: Interchanged diff --git a/llvm/test/Transforms/LoopInterchange/pr43326.ll b/llvm/test/Transforms/LoopInterchange/pr43326.ll index 68bcc4e74cec..9270692982f3 100644 --- a/llvm/test/Transforms/LoopInterchange/pr43326.ll +++ b/llvm/test/Transforms/LoopInterchange/pr43326.ll @@ -2,6 +2,7 @@ ; RUN: -verify-dom-info -verify-loop-info -verify-loop-lcssa -stats 2>&1 ; RUN: FileCheck --input-file=%t --check-prefix=REMARKS %s +target triple = "powerpc64le-unknown-linux-gnu" @a = global i32 0 @b = global i8 0 @c = global i32 0 diff --git a/llvm/test/Transforms/LoopInterchange/pr43473-invalid-lcssa-phis-in-inner-exit.ll b/llvm/test/Transforms/LoopInterchange/pr43473-invalid-lcssa-phis-in-inner-exit.ll index 
9ae5f42e1ae7..d6b132e3fc61 100644 --- a/llvm/test/Transforms/LoopInterchange/pr43473-invalid-lcssa-phis-in-inner-exit.ll +++ b/llvm/test/Transforms/LoopInterchange/pr43473-invalid-lcssa-phis-in-inner-exit.ll @@ -6,6 +6,8 @@ ; In the 2 test cases below, we have a LCSSA PHI in the inner loop exit, which ; is used in the outer loop latch. This is not supported. +target triple = "powerpc64le-unknown-linux-gnu" + define void @test1() { ; CHECK-LABEL: @test1( ; CHECK-NEXT: entry: diff --git a/llvm/test/Transforms/LoopInterchange/pr43797-lcssa-for-multiple-outer-loop-blocks.ll b/llvm/test/Transforms/LoopInterchange/pr43797-lcssa-for-multiple-outer-loop-blocks.ll index 445fffecc2f3..90cd9a311798 100644 --- a/llvm/test/Transforms/LoopInterchange/pr43797-lcssa-for-multiple-outer-loop-blocks.ll +++ b/llvm/test/Transforms/LoopInterchange/pr43797-lcssa-for-multiple-outer-loop-blocks.ll @@ -3,6 +3,7 @@ ; Tests for PR43797. +target triple = "powerpc64le-unknown-linux-gnu" @wdtdr = external dso_local global [5 x [5 x double]], align 16 define void @test1() { diff --git a/llvm/test/Transforms/LoopInterchange/pr45743-move-from-inner-preheader.ll b/llvm/test/Transforms/LoopInterchange/pr45743-move-from-inner-preheader.ll index 733647761524..8198387145d2 100644 --- a/llvm/test/Transforms/LoopInterchange/pr45743-move-from-inner-preheader.ll +++ b/llvm/test/Transforms/LoopInterchange/pr45743-move-from-inner-preheader.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -loop-interchange -S %s | FileCheck %s +target triple = "powerpc64le-unknown-linux-gnu" @global = external local_unnamed_addr global [400 x [400 x i32]], align 16 ; We need to move %tmp4 from the inner loop pre header to the outer loop header diff --git a/llvm/test/Transforms/LoopInterchange/pr48212.ll b/llvm/test/Transforms/LoopInterchange/pr48212.ll index b6894bc0ba30..d4b1624998b9 100644 --- a/llvm/test/Transforms/LoopInterchange/pr48212.ll +++ 
b/llvm/test/Transforms/LoopInterchange/pr48212.ll @@ -2,6 +2,8 @@ ; RUN: -verify-dom-info -verify-loop-info -verify-loop-lcssa 2>&1 ; RUN: FileCheck --input-file=%t --check-prefix=REMARKS %s +target triple = "powerpc64le-unknown-linux-gnu" + ; REMARKS: --- !Passed ; REMARKS-NEXT: Pass: loop-interchange ; REMARKS-NEXT: Name: Interchanged diff --git a/llvm/test/Transforms/LoopInterchange/profitability.ll b/llvm/test/Transforms/LoopInterchange/profitability.ll index 79706d8afe6a..5e3d5295dd88 100644 --- a/llvm/test/Transforms/LoopInterchange/profitability.ll +++ b/llvm/test/Transforms/LoopInterchange/profitability.ll @@ -5,7 +5,7 @@ ;; We test profitability model in these test cases. target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" +target triple = "powerpc64le-unknown-linux-gnu" @A = common global [100 x [100 x i32]] zeroinitializer @B = common global [100 x [100 x i32]] zeroinitializer diff --git a/llvm/test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll b/llvm/test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll index 89b734d5f82f..a0f936b22c80 100644 --- a/llvm/test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll +++ b/llvm/test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll @@ -4,7 +4,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" +target triple = "powerpc64le-unknown-linux-gnu" ; REMARKS: --- !Passed ; REMARKS-NEXT: Pass: loop-interchange diff --git a/llvm/test/Transforms/LoopInterchange/update-condbranch-duplicate-successors.ll b/llvm/test/Transforms/LoopInterchange/update-condbranch-duplicate-successors.ll index 3f178443ee6d..d6a48feeabb9 100644 --- a/llvm/test/Transforms/LoopInterchange/update-condbranch-duplicate-successors.ll +++ b/llvm/test/Transforms/LoopInterchange/update-condbranch-duplicate-successors.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been 
autogenerated by utils/update_test_checks.py ; RUN: opt -loop-interchange -S %s | FileCheck %s - +target triple = "powerpc64le-unknown-linux-gnu" @global = external dso_local global [1000 x [1000 x i32]], align 16 ; Test that we support updating conditional branches where both targets are the same diff --git a/llvm/test/Transforms/LoopInterchange/vector-gep-operand.ll b/llvm/test/Transforms/LoopInterchange/vector-gep-operand.ll index 3bc69c3d2b0f..d83a489da8f1 100644 --- a/llvm/test/Transforms/LoopInterchange/vector-gep-operand.ll +++ b/llvm/test/Transforms/LoopInterchange/vector-gep-operand.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -loop-interchange -loop-interchange-threshold=-10 -S %s | FileCheck %s +target triple = "powerpc64le-unknown-linux-gnu" + ; The test contains a GEP with an operand that is not SCEV-able. Make sure ; loop-interchange does not crash. define void @test([256 x float]* noalias %src, float* %dst) {