[LoopInterchange] New cost model for loop interchange

This patch proposes a new cost model for loop interchange, which is
obtained from loop cache analysis.

Given a loopnest, what loop cache analysis returns is a vector of loops
[loop0, loop1, loop2, ...] where loop0 should be placed as the outermost
loop, loop1 should be placed one more level inside, and loop2 one more level
inside, etc. What loop cache analysis does is not only more comprehensive than
the current cost model, it is also a "one-shot" query which means that we only
need to query it once during the entire loop interchange pass, which is better
than the current cost model where we query it every time we check whether it is
profitable to interchange two loops. Thus complexity is reduced, especially after
D120386 where we do more interchanges to get the globally optimal loop access pattern.

Updates made to test cases are mostly minor changes and some corrections.
Test coverage for loop interchange is not reduced.

Currently we do not completely remove the legacy cost model, but keep it as a
fall-back in case the new cost model does not run successfully. This is because
we currently have some limitations in delinearization, which sometimes make
loop cache analysis bail out. The longer term goal is to enhance delinearization
and eventually remove the legacy cost model completely.

Reviewed By: bmahjour, #loopoptwg

Differential Revision: https://reviews.llvm.org/D124926
This commit is contained in:
Congzhe Cao 2022-06-02 17:53:13 -04:00 committed by CongzheUalberta
parent 4ad17d2e96
commit 006334470d
35 changed files with 301 additions and 169 deletions

View File

@ -18,6 +18,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/DependenceAnalysis.h"
#include "llvm/Analysis/LoopCacheAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopNestAnalysis.h"
#include "llvm/Analysis/LoopPass.h"
@ -358,8 +359,10 @@ public:
: OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
/// Check if the loop interchange is profitable.
bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId,
CharMatrix &DepMatrix);
bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop,
unsigned InnerLoopId, unsigned OuterLoopId,
CharMatrix &DepMatrix,
const DenseMap<const Loop *, unsigned> &CostMap);
private:
int getInstrOrderCost();
@ -410,13 +413,15 @@ struct LoopInterchange {
LoopInfo *LI = nullptr;
DependenceInfo *DI = nullptr;
DominatorTree *DT = nullptr;
std::unique_ptr<CacheCost> CC = nullptr;
/// Interface to emit optimization remarks.
OptimizationRemarkEmitter *ORE;
LoopInterchange(ScalarEvolution *SE, LoopInfo *LI, DependenceInfo *DI,
DominatorTree *DT, OptimizationRemarkEmitter *ORE)
: SE(SE), LI(LI), DI(DI), DT(DT), ORE(ORE) {}
DominatorTree *DT, std::unique_ptr<CacheCost> &CC,
OptimizationRemarkEmitter *ORE)
: SE(SE), LI(LI), DI(DI), DT(DT), CC(std::move(CC)), ORE(ORE) {}
bool run(Loop *L) {
if (L->getParentLoop())
@ -499,6 +504,21 @@ struct LoopInterchange {
}
unsigned SelecLoopId = selectLoopForInterchange(LoopList);
// Obtain the loop vector returned from loop cache analysis beforehand,
// and put each <Loop, index> pair into a map for constant time query
// later. Indices in the loop vector represent the optimal order of the
// corresponding loop, e.g., given a loopnest with depth N, index 0
// indicates the loop should be placed as the outermost loop and index N-1
// indicates the loop should be placed as the innermost loop.
//
// For the old pass manager CacheCost would be null.
DenseMap<const Loop *, unsigned> CostMap;
if (CC != nullptr) {
const auto &LoopCosts = CC->getLoopCosts();
for (unsigned i = 0; i < LoopCosts.size(); i++) {
CostMap[LoopCosts[i].first] = i;
}
}
// We try to achieve the globally optimal memory access for the loopnest,
// and do interchange in a bubble-sort fashion. We start from
// the innermost loop, move it outwards to the best possible position
@ -507,7 +527,7 @@ struct LoopInterchange {
bool ChangedPerIter = false;
for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) {
bool Interchanged = processLoop(LoopList[i], LoopList[i - 1], i, i - 1,
DependencyMatrix);
DependencyMatrix, CostMap);
if (!Interchanged)
continue;
// Loops interchanged, update LoopList accordingly.
@ -531,7 +551,8 @@ struct LoopInterchange {
bool processLoop(Loop *InnerLoop, Loop *OuterLoop, unsigned InnerLoopId,
unsigned OuterLoopId,
std::vector<std::vector<char>> &DependencyMatrix) {
std::vector<std::vector<char>> &DependencyMatrix,
const DenseMap<const Loop *, unsigned> &CostMap) {
LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId
<< " and OuterLoopId = " << OuterLoopId << "\n");
LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE);
@ -541,7 +562,8 @@ struct LoopInterchange {
}
LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n");
LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) {
if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,
DependencyMatrix, CostMap)) {
LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n");
return false;
}
@ -1135,21 +1157,33 @@ static bool isProfitableForVectorization(unsigned InnerLoopId,
return !DepMatrix.empty();
}
bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId,
unsigned OuterLoopId,
CharMatrix &DepMatrix) {
// TODO: Add better profitability checks.
// e.g
// 1) Construct dependency matrix and move the one with no loop carried dep
// inside to enable vectorization.
bool LoopInterchangeProfitability::isProfitable(
const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
unsigned OuterLoopId, CharMatrix &DepMatrix,
const DenseMap<const Loop *, unsigned> &CostMap) {
// TODO: Remove the legacy cost model.
// This is rough cost estimation algorithm. It counts the good and bad order
// of induction variables in the instruction and allows reordering if number
// of bad orders is more than good.
int Cost = getInstrOrderCost();
LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
if (Cost < -LoopInterchangeCostThreshold)
return true;
// This is the new cost model returned from loop cache analysis.
// A smaller index means the loop should be placed as an outer loop, and vice
// versa.
if (CostMap.find(InnerLoop) != CostMap.end() &&
CostMap.find(OuterLoop) != CostMap.end()) {
unsigned InnerIndex = 0, OuterIndex = 0;
InnerIndex = CostMap.find(InnerLoop)->second;
OuterIndex = CostMap.find(OuterLoop)->second;
LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex
<< ", OuterIndex = " << OuterIndex << "\n");
if (InnerIndex < OuterIndex)
return true;
} else {
// Legacy cost model: this is a rough cost estimation algorithm. It counts the
// good and bad orders of induction variables in the instructions and allows
// reordering if the number of bad orders is more than the number of good ones.
int Cost = getInstrOrderCost();
LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
if (Cost < -LoopInterchangeCostThreshold)
return true;
}
// It is not profitable as per current cache profitability model. But check if
// we can move this loop outside to improve parallelism.
@ -1160,10 +1194,8 @@ bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId,
return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
InnerLoop->getStartLoc(),
InnerLoop->getHeader())
<< "Interchanging loops is too costly (cost="
<< ore::NV("Cost", Cost) << ", threshold="
<< ore::NV("Threshold", LoopInterchangeCostThreshold)
<< ") and it does not improve parallelism.";
<< "Interchanging loops is too costly and it does not improve "
"parallelism.";
});
return false;
}
@ -1709,8 +1741,8 @@ struct LoopInterchangeLegacyPass : public LoopPass {
auto *DI = &getAnalysis<DependenceAnalysisWrapperPass>().getDI();
auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
return LoopInterchange(SE, LI, DI, DT, ORE).run(L);
std::unique_ptr<CacheCost> CC = nullptr;
return LoopInterchange(SE, LI, DI, DT, CC, ORE).run(L);
}
};
} // namespace
@ -1737,8 +1769,10 @@ PreservedAnalyses LoopInterchangePass::run(LoopNest &LN,
Function &F = *LN.getParent();
DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI);
std::unique_ptr<CacheCost> CC =
CacheCost::getCacheCost(LN.getOutermostLoop(), AR, DI);
OptimizationRemarkEmitter ORE(&F);
if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, &ORE).run(LN))
if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, CC, &ORE).run(LN))
return PreservedAnalyses::all();
return getLoopPassPreservedAnalyses();
}

View File

@ -1,12 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -aa-pipeline=basic-aa -passes='loop(loop-interchange)' -S %s | FileCheck %s --check-prefixes INTC
; RUN: opt -aa-pipeline=basic-aa -passes='loop-mssa(lnicm),loop(loop-interchange)' -S %s | FileCheck %s --check-prefixes LNICM,CHECK
; RUN: opt -aa-pipeline=basic-aa -passes='loop-mssa(licm),loop(loop-interchange)' -S %s | FileCheck %s --check-prefixes LICM,CHECK
; RUN: opt -aa-pipeline=basic-aa -passes='loop-mssa(lnicm),loop(loop-interchange)' -S %s | FileCheck %s --check-prefixes LNICM
; RUN: opt -aa-pipeline=basic-aa -passes='loop-mssa(licm),loop(loop-interchange)' -S %s | FileCheck %s --check-prefixes LICM
; This test represents the following function:
; void test(int x[10][10], int y[10], int *z) {
; for (int k = 0; k < 10; k++) {
; void test(int n, int m, int x[m][n], int y[n], int *z) {
; for (int k = 0; k < n; k++) {
; int tmp = *z;
; for (int i = 0; i < 10; i++)
; for (int i = 0; i < m; i++)
; x[i][k] += y[k] + tmp;
; }
; }
@ -15,83 +16,189 @@
; to keep perfect loop nest. This enables optimizations that require
; perfect loop nest (e.g. loop-interchange) to perform.
target triple = "powerpc64le-unknown-linux-gnu"
define dso_local void @test([10 x i32]* noalias %x, i32* noalias readonly %y, i32* readonly %z) {
; CHECK-LABEL: @test(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[Z:%.*]] = load i32, i32* %z, align 4
; CHECK-NEXT: br label [[FOR_BODY3_PREHEADER:%.*]]
; LNICM: for.body.preheader:
; LICM-NOT: for.body.preheader:
; INTC-NOT: for.body.preheader:
; LNICM-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; LNICM-NEXT: [[K:%.*]] = phi i32 [ [[INC10:%.*]], [[FOR_END:%.*]] ], [ 0, [[FOR_BODY_PREHEADER:%.*]] ]
; LNICM-NEXT: br label [[FOR_BODY3_SPLIT1:%.*]]
; LICM: [[TMP:%.*]] = load i32, i32* [[ARRAYIDX:%.*]], align 4
; LNICM: for.body3.preheader:
; LICM-NOT: for.body3.preheader:
; INTC-NOT: for.body3.preheader:
; LNICM-NEXT: br label [[FOR_BODY3:%.*]]
; CHECK: for.body3:
; LNICM-NEXT: [[I:%.*]] = phi i32 [ [[TMP3:%.*]], [[FOR_BODY3_SPLIT:%.*]] ], [ 0, [[FOR_BODY3_PREHEADER:%.*]] ]
; LNICM-NEXT: br label [[FOR_BODY_PREHEADER:%.*]]
; LNICM: for.body3.split1:
; LNICM-NEXT: [[IDXPROM:%.*]] = sext i32 [[K:%.*]] to i64
; LNICM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* %y, i64 [[IDXPROM:%.*]]
; LNICM-NEXT: [[TMP:%.*]] = load i32, i32* [[ARRAYIDX:%.*]], align 4
; LNICM-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP:%.*]], [[Z:%.*]]
; LNICM-NEXT: [[IDXPROM4:%.*]] = sext i32 [[I:%.*]] to i64
; LNICM-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* %x, i64 [[IDXPROM4:%.*]]
; LNICM-NEXT: [[IDXPROM6:%.*]] = sext i32 [[K:%.*]] to i64
; LNICM-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX5:%.*]], i64 0, i64 [[IDXPROM6:%.*]]
; LNICM-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX7:%.*]], align 4
; LNICM-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP2:%.*]], [[ADD:%.*]]
; LNICM-NEXT: store i32 [[ADD8:%.*]], i32* [[ARRAYIDX7:%.*]], align 4
; LNICM-NEXT: [[INC:%.*]] = add nsw i32 [[I:%.*]], 1
; LNICM-NEXT: [[CMP2:%.*]] = icmp slt i32 [[INC:%.*]], 10
; LNICM-NEXT: br label [[FOR_END:%.*]]
; LNICM: for.body3.split:
; LICM-NOT: for.body3.split:
; INTC-NOT: for.body3.split:
; LNICM-NEXT: [[TMP3:%.*]] = add nsw i32 [[I:%.*]], 1
; LNICM-NEXT: [[TMP4:%.*]] = icmp slt i32 [[TMP3:%.*]], 10
; LNICM-NEXT: br i1 [[TMP4:%.*]], label [[FOR_BODY3:%.*]], label [[FOR_END11:%.*]], !llvm.loop !0
; LNICM: for.end:
; LNICM-NEXT: [[INC10:%.*]] = add nsw i32 [[K:%.*]], 1
; LNICM-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC10:%.*]], 10
; LNICM-NEXT: br i1 [[CMP:%.*]], label [[FOR_BODY:%.*]], label [[FOR_BODY3_SPLIT:%.*]], !llvm.loop !2
; LNICM: for.end11:
; LNICM-NEXT: ret void
define dso_local void @test(i64 %n, i64 %m, ptr noalias %x, ptr noalias readonly %y, ptr readonly %z) {
; The loopnest is not interchanged when we only run loop interchange.
; INTC-LABEL: @test(
; INTC-NEXT: gurad:
; INTC-NEXT: [[CMP23:%.*]] = icmp sgt i64 [[M:%.*]], 0
; INTC-NEXT: [[CMP32:%.*]] = icmp sgt i64 [[N:%.*]], 0
; INTC-NEXT: br i1 [[CMP23]], label [[FOR_COND1_PREHEADER_LR_PH:%.*]], label [[FOR_END11:%.*]]
; INTC: for.cond1.preheader.lr.ph:
; INTC-NEXT: br i1 [[CMP32]], label [[FOR_I_PREHEADER:%.*]], label [[FOR_END11]]
; INTC: for.i.preheader:
; INTC-NEXT: br label [[ENTRY:%.*]]
; INTC: entry:
; INTC-NEXT: br label [[FOR_BODY:%.*]]
; INTC: for.body:
; INTC-NEXT: [[K_02:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC10:%.*]], [[FOR_END:%.*]] ]
; INTC-NEXT: [[TMP0:%.*]] = load i32, ptr [[Z:%.*]], align 4
; INTC-NEXT: br label [[FOR_BODY3:%.*]]
; INTC: for.body3:
; INTC-NEXT: [[I_01:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[INC:%.*]], [[FOR_BODY3]] ]
; INTC-NEXT: [[IDXPROM:%.*]] = sext i32 [[K_02]] to i64
; INTC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[Y:%.*]], i64 [[IDXPROM]]
; INTC-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; INTC-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
; INTC-NEXT: [[IDXPROM4:%.*]] = sext i32 [[I_01]] to i64
; INTC-NEXT: [[INDEX0:%.*]] = mul i64 [[IDXPROM4]], [[N]]
; INTC-NEXT: [[INDEX1:%.*]] = add i64 [[INDEX0]], [[IDXPROM]]
; INTC-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i64 [[INDEX1]]
; INTC-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX7]], align 4
; INTC-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP2]], [[ADD]]
; INTC-NEXT: store i32 [[ADD8]], ptr [[ARRAYIDX7]], align 4
; INTC-NEXT: [[INC]] = add nsw i32 [[I_01]], 1
; INTC-NEXT: [[INC_EXT:%.*]] = sext i32 [[INC]] to i64
; INTC-NEXT: [[CMP2:%.*]] = icmp slt i64 [[INC_EXT]], [[M]]
; INTC-NEXT: br i1 [[CMP2]], label [[FOR_BODY3]], label [[FOR_END]], !llvm.loop [[LOOP0:![0-9]+]]
; INTC: for.end:
; INTC-NEXT: [[INC10]] = add nsw i32 [[K_02]], 1
; INTC-NEXT: [[INC10_EXT:%.*]] = sext i32 [[INC10]] to i64
; INTC-NEXT: [[CMP:%.*]] = icmp slt i64 [[INC10_EXT]], [[N]]
; INTC-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END11_LOOPEXIT:%.*]], !llvm.loop [[LOOP2:![0-9]+]]
; INTC: for.end11.loopexit:
; INTC-NEXT: br label [[FOR_END11]]
; INTC: for.end11:
; INTC-NEXT: ret void
;
; The loopnest is interchanged when we run lnicm and loop interchange.
; LNICM-LABEL: @test(
; LNICM-NEXT: gurad:
; LNICM-NEXT: [[CMP23:%.*]] = icmp sgt i64 [[M:%.*]], 0
; LNICM-NEXT: [[CMP32:%.*]] = icmp sgt i64 [[N:%.*]], 0
; LNICM-NEXT: br i1 [[CMP23]], label [[FOR_COND1_PREHEADER_LR_PH:%.*]], label [[FOR_END11:%.*]]
; LNICM: for.cond1.preheader.lr.ph:
; LNICM-NEXT: br i1 [[CMP32]], label [[FOR_I_PREHEADER:%.*]], label [[FOR_END11]]
; LNICM: for.i.preheader:
; LNICM-NEXT: br label [[FOR_BODY3_PREHEADER:%.*]]
; LNICM: entry:
; LNICM-NEXT: br label [[FOR_BODY:%.*]]
; LNICM: for.body:
; LNICM-NEXT: [[K_02:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC10:%.*]], [[FOR_END:%.*]] ]
; LNICM-NEXT: br label [[FOR_BODY3_SPLIT1:%.*]]
; LNICM: for.body3.preheader:
; LNICM-NEXT: [[TMP0:%.*]] = load i32, ptr [[Z:%.*]], align 4
; LNICM-NEXT: br label [[FOR_BODY3:%.*]]
; LNICM: for.body3:
; LNICM-NEXT: [[I_01:%.*]] = phi i32 [ [[TMP3:%.*]], [[FOR_BODY3_SPLIT:%.*]] ], [ 0, [[FOR_BODY3_PREHEADER]] ]
; LNICM-NEXT: br label [[ENTRY]]
; LNICM: for.body3.split1:
; LNICM-NEXT: [[IDXPROM:%.*]] = sext i32 [[K_02]] to i64
; LNICM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[Y:%.*]], i64 [[IDXPROM]]
; LNICM-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; LNICM-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
; LNICM-NEXT: [[IDXPROM4:%.*]] = sext i32 [[I_01]] to i64
; LNICM-NEXT: [[INDEX0:%.*]] = mul i64 [[IDXPROM4]], [[N]]
; LNICM-NEXT: [[INDEX1:%.*]] = add i64 [[INDEX0]], [[IDXPROM]]
; LNICM-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i64 [[INDEX1]]
; LNICM-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX7]], align 4
; LNICM-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP2]], [[ADD]]
; LNICM-NEXT: store i32 [[ADD8]], ptr [[ARRAYIDX7]], align 4
; LNICM-NEXT: [[INC:%.*]] = add nsw i32 [[I_01]], 1
; LNICM-NEXT: [[INC_EXT:%.*]] = sext i32 [[INC]] to i64
; LNICM-NEXT: [[CMP2:%.*]] = icmp slt i64 [[INC_EXT]], [[M]]
; LNICM-NEXT: br label [[FOR_END]]
; LNICM: for.body3.split:
; LNICM-NEXT: [[TMP3]] = add nsw i32 [[I_01]], 1
; LNICM-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
; LNICM-NEXT: [[TMP5:%.*]] = icmp slt i64 [[TMP4]], [[M]]
; LNICM-NEXT: br i1 [[TMP5]], label [[FOR_BODY3]], label [[FOR_END11_LOOPEXIT:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
; LNICM: for.end:
; LNICM-NEXT: [[INC10]] = add nsw i32 [[K_02]], 1
; LNICM-NEXT: [[INC10_EXT:%.*]] = sext i32 [[INC10]] to i64
; LNICM-NEXT: [[CMP:%.*]] = icmp slt i64 [[INC10_EXT]], [[N]]
; LNICM-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_BODY3_SPLIT]], !llvm.loop [[LOOP2:![0-9]+]]
; LNICM: for.end11.loopexit:
; LNICM-NEXT: br label [[FOR_END11]]
; LNICM: for.end11:
; LNICM-NEXT: ret void
;
; The loopnest is not interchanged when we run licm and loop interchange.
; LICM-LABEL: @test(
; LICM-NEXT: gurad:
; LICM-NEXT: [[CMP23:%.*]] = icmp sgt i64 [[M:%.*]], 0
; LICM-NEXT: [[CMP32:%.*]] = icmp sgt i64 [[N:%.*]], 0
; LICM-NEXT: br i1 [[CMP23]], label [[FOR_COND1_PREHEADER_LR_PH:%.*]], label [[FOR_END11:%.*]]
; LICM: for.cond1.preheader.lr.ph:
; LICM-NEXT: br i1 [[CMP32]], label [[FOR_I_PREHEADER:%.*]], label [[FOR_END11]]
; LICM: for.i.preheader:
; LICM-NEXT: br label [[ENTRY:%.*]]
; LICM: entry:
; LICM-NEXT: [[TMP0:%.*]] = load i32, ptr [[Z:%.*]], align 4
; LICM-NEXT: br label [[FOR_BODY:%.*]]
; LICM: for.body:
; LICM-NEXT: [[K_02:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC10:%.*]], [[FOR_END:%.*]] ]
; LICM-NEXT: [[IDXPROM:%.*]] = sext i32 [[K_02]] to i64
; LICM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[Y:%.*]], i64 [[IDXPROM]]
; LICM-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; LICM-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
; LICM-NEXT: br label [[FOR_BODY3:%.*]]
; LICM: for.body3:
; LICM-NEXT: [[I_01:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[INC:%.*]], [[FOR_BODY3]] ]
; LICM-NEXT: [[IDXPROM4:%.*]] = sext i32 [[I_01]] to i64
; LICM-NEXT: [[INDEX0:%.*]] = mul i64 [[IDXPROM4]], [[N]]
; LICM-NEXT: [[INDEX1:%.*]] = add i64 [[INDEX0]], [[IDXPROM]]
; LICM-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i64 [[INDEX1]]
; LICM-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX7]], align 4
; LICM-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP2]], [[ADD]]
; LICM-NEXT: store i32 [[ADD8]], ptr [[ARRAYIDX7]], align 4
; LICM-NEXT: [[INC]] = add nsw i32 [[I_01]], 1
; LICM-NEXT: [[INC_EXT:%.*]] = sext i32 [[INC]] to i64
; LICM-NEXT: [[CMP2:%.*]] = icmp slt i64 [[INC_EXT]], [[M]]
; LICM-NEXT: br i1 [[CMP2]], label [[FOR_BODY3]], label [[FOR_END]], !llvm.loop [[LOOP0:![0-9]+]]
; LICM: for.end:
; LICM-NEXT: [[INC10]] = add nsw i32 [[K_02]], 1
; LICM-NEXT: [[INC10_EXT:%.*]] = sext i32 [[INC10]] to i64
; LICM-NEXT: [[CMP:%.*]] = icmp slt i64 [[INC10_EXT]], [[N]]
; LICM-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END11_LOOPEXIT:%.*]], !llvm.loop [[LOOP2:![0-9]+]]
; LICM: for.end11.loopexit:
; LICM-NEXT: br label [[FOR_END11]]
; LICM: for.end11:
; LICM-NEXT: ret void
;
entry:
gurad:
%cmp23 = icmp sgt i64 %m, 0
%cmp32 = icmp sgt i64 %n, 0
br i1 %cmp23, label %for.cond1.preheader.lr.ph, label %for.end11
for.cond1.preheader.lr.ph: ; preds = %gurad
br i1 %cmp32, label %for.i.preheader, label %for.end11
for.i.preheader: ; preds = %for.cond1.preheader.lr.ph
br label %entry
entry: ; preds = %for.i.preheader
br label %for.body
for.body:
%k.02 = phi i32 [ 0, %entry ], [ %inc10, %for.end ]
%0 = load i32, i32* %z, align 4
%0 = load i32, ptr %z, align 4
br label %for.body3
for.body3:
%i.01 = phi i32 [ 0, %for.body ], [ %inc, %for.body3 ]
%idxprom = sext i32 %k.02 to i64
%arrayidx = getelementptr inbounds i32, i32* %y, i64 %idxprom
%1 = load i32, i32* %arrayidx, align 4
%arrayidx = getelementptr inbounds i32, ptr %y, i64 %idxprom
%1 = load i32, ptr %arrayidx, align 4
%add = add nsw i32 %1, %0
%idxprom4 = sext i32 %i.01 to i64
%arrayidx5 = getelementptr inbounds [10 x i32], [10 x i32]* %x, i64 %idxprom4
%idxprom6 = sext i32 %k.02 to i64
%arrayidx7 = getelementptr inbounds [10 x i32], [10 x i32]* %arrayidx5, i64 0, i64 %idxprom6
%2 = load i32, i32* %arrayidx7, align 4
%index0 = mul i64 %idxprom4, %n
%index1 = add i64 %index0, %idxprom
%arrayidx7 = getelementptr inbounds i32, ptr %x, i64 %index1
%2 = load i32, ptr %arrayidx7, align 4
%add8 = add nsw i32 %2, %add
store i32 %add8, i32* %arrayidx7, align 4
store i32 %add8, ptr %arrayidx7, align 4
%inc = add nsw i32 %i.01, 1
%cmp2 = icmp slt i32 %inc, 10
%inc.ext = sext i32 %inc to i64
%cmp2 = icmp slt i64 %inc.ext, %m
br i1 %cmp2, label %for.body3, label %for.end, !llvm.loop !0
for.end:
%inc10 = add nsw i32 %k.02, 1
%cmp = icmp slt i32 %inc10, 10
%inc10.ext = sext i32 %inc10 to i64
%cmp = icmp slt i64 %inc10.ext, %n
br i1 %cmp, label %for.body, label %for.end11, !llvm.loop !2
for.end11:

View File

@ -4,7 +4,7 @@
; RUN: FileCheck --input-file=%t %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
target triple = "powerpc64le-unknown-linux-gnu"
@A = common global [100 x [100 x i32]] zeroinitializer

View File

@ -8,7 +8,7 @@
; RUN: FileCheck --check-prefix=DELIN --input-file=%t %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
target triple = "powerpc64le-unknown-linux-gnu"
@A = common global [100 x [100 x i32]] zeroinitializer
@B = common global [100 x [100 x [100 x i32]]] zeroinitializer

View File

@ -4,7 +4,7 @@
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
target triple = "powerpc64le-unknown-linux-gnu"
@A = common global [100 x [100 x i64]] zeroinitializer

View File

@ -2,6 +2,7 @@
; RUN: opt < %s -basic-aa -loop-interchange -verify-dom-info -verify-loop-info \
; RUN: -S -debug 2>&1 | FileCheck %s
target triple = "powerpc64le-unknown-linux-gnu"
@A = common global [100 x [100 x i64]] zeroinitializer
@N = dso_local local_unnamed_addr global i64 100, align 8

View File

@ -5,6 +5,7 @@
; Inner loop only reductions are not supported currently. See discussion at
; D53027 for more information on the required checks.
target triple = "powerpc64le-unknown-linux-gnu"
@A = common global [500 x [500 x i32]] zeroinitializer
@X = common global i32 0
@B = common global [500 x [500 x i32]] zeroinitializer

View File

@ -2,6 +2,7 @@
; RUN: opt < %s -basic-aa -loop-interchange -verify-dom-info -verify-loop-info \
; RUN: -S -debug 2>&1 | FileCheck %s
target triple = "powerpc64le-unknown-linux-gnu"
@a = common global i32 0, align 4
@d = common dso_local local_unnamed_addr global [1 x [6 x i32]] zeroinitializer, align 4

View File

@ -3,7 +3,7 @@
; RUN: -S -debug 2>&1 | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
target triple = "powerpc64le-unknown-linux-gnu"
@A = common global [100 x [100 x i32]] zeroinitializer
@B = common global [100 x i32] zeroinitializer

View File

@ -1,6 +1,7 @@
; RUN: opt < %s -basic-aa -loop-interchange -verify-dom-info -verify-loop-info \
; RUN: -S -pass-remarks=loop-interchange 2>&1 | FileCheck %s
target triple = "powerpc64le-unknown-linux-gnu"
@A10 = local_unnamed_addr global [3 x [3 x i32]] zeroinitializer, align 16
;; Test to make sure we can handle zext instructions introduced by

View File

@ -4,6 +4,7 @@
; RUN: | FileCheck -check-prefix=STATS %s
; RUN: FileCheck -input-file %t %s
target triple = "powerpc64le-unknown-linux-gnu"
; no_deps_interchange just accesses a single nested array and can be interchanged.
; CHECK: Name: Interchanged
@ -34,35 +35,6 @@ exit: ; preds = %for1.inc
}
; Only the inner loop induction variable is used for memory accesses.
; Interchanging is not beneficial.
; CHECK: Name: InterchangeNotProfitable
; CHECK-NEXT: Function: no_bad_order
define i32 @no_bad_order(i32* %Arr) {
entry:
br label %for1.header
for1.header: ; preds = %entry, %for1.inc
%indvars.iv19 = phi i64 [ 0, %entry ], [ %indvars.iv.next20, %for1.inc ]
br label %for2
for2: ; preds = %for1.header, %for2
%indvars.iv = phi i64 [ 0, %for1.header ], [ %indvars.iv.next, %for2 ]
%arrayidx6 = getelementptr inbounds i32, i32* %Arr, i64 %indvars.iv
store i32 0, i32* %arrayidx6, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp ne i64 %indvars.iv.next, 1024
br i1 %exitcond, label %for2, label %for1.inc
for1.inc:
%indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1
%exitcond21 = icmp ne i64 %indvars.iv.next20, 1024
br i1 %exitcond21, label %for1.header, label %exit
exit: ; preds = %for1.inc
ret i32 0
}
; No memory access using any induction variables, interchanging not beneficial.
; CHECK: Name: InterchangeNotProfitable
; CHECK-NEXT: Function: no_mem_instrs

View File

@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -basic-aa -loop-interchange -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -S | FileCheck %s
target triple = "powerpc64le-unknown-linux-gnu"
@b = common dso_local local_unnamed_addr global [200 x [200 x i32]] zeroinitializer, align 4
@a = common dso_local local_unnamed_addr global i32 0, align 4

View File

@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s --basic-aa -loop-interchange -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -S | FileCheck %s
; RUN: opt < %s -basic-aa -loop-interchange -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -S | FileCheck %s
target triple = "powerpc64le-unknown-linux-gnu"
@b = constant [200 x [100 x i32]] zeroinitializer, align 4
@a = constant i32 0, align 4

View File

@ -3,7 +3,7 @@
; RUN: opt < %s -aa-pipeline=basic-aa -passes=loop-interchange -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -S | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
target triple = "powerpc64le-unknown-linux-gnu"
@A = common global [100 x [100 x i64]] zeroinitializer
@B = common global [100 x i64] zeroinitializer

View File

@ -3,7 +3,7 @@
; RUN: -S -debug 2>&1 | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
target triple = "powerpc64le-unknown-linux-gnu"
@D = common global [100 x [100 x [100 x i32]]] zeroinitializer
@ -24,31 +24,31 @@ entry:
br label %for.cond1.preheader
for.cond1.preheader: ; preds = %for.inc15, %entry
%i.028 = phi i32 [ 0, %entry ], [ %inc16, %for.inc15 ]
%i.028 = phi i64 [ 0, %entry ], [ %inc16, %for.inc15 ]
br label %for.cond4.preheader
for.cond4.preheader: ; preds = %for.inc12, %for.cond1.preheader
%j.027 = phi i32 [ 0, %for.cond1.preheader ], [ %inc13, %for.inc12 ]
%j.027 = phi i64 [ 0, %for.cond1.preheader ], [ %inc13, %for.inc12 ]
br label %for.body6
for.body6: ; preds = %for.body6, %for.cond4.preheader
%k.026 = phi i32 [ 0, %for.cond4.preheader ], [ %inc, %for.body6 ]
%arrayidx8 = getelementptr inbounds [100 x [100 x [100 x i32]]], [100 x [100 x [100 x i32]]]* @D, i32 0, i32 %k.026, i32 %j.027, i32 %i.028
%k.026 = phi i64 [ 0, %for.cond4.preheader ], [ %inc, %for.body6 ]
%arrayidx8 = getelementptr inbounds [100 x [100 x [100 x i32]]], [100 x [100 x [100 x i32]]]* @D, i64 0, i64 %k.026, i64 %j.027, i64 %i.028
%0 = load i32, i32* %arrayidx8
%add = add nsw i32 %0, %t
store i32 %add, i32* %arrayidx8
%inc = add nuw nsw i32 %k.026, 1
%exitcond = icmp eq i32 %inc, 100
%inc = add nuw nsw i64 %k.026, 1
%exitcond = icmp eq i64 %inc, 100
br i1 %exitcond, label %for.inc12, label %for.body6
for.inc12: ; preds = %for.body6
%inc13 = add nuw nsw i32 %j.027, 1
%exitcond29 = icmp eq i32 %inc13, 100
%inc13 = add nuw nsw i64 %j.027, 1
%exitcond29 = icmp eq i64 %inc13, 100
br i1 %exitcond29, label %for.inc15, label %for.cond4.preheader
for.inc15: ; preds = %for.inc12
%inc16 = add nuw nsw i32 %i.028, 1
%exitcond30 = icmp eq i32 %inc16, 100
%inc16 = add nuw nsw i64 %i.028, 1
%exitcond30 = icmp eq i64 %inc16, 100
br i1 %exitcond30, label %for.end17, label %for.cond1.preheader
for.end17: ; preds = %for.inc15

View File

@ -3,6 +3,7 @@
; RUN: opt < %s -basic-aa -loop-interchange -da-disable-delinearization-checks -pass-remarks-missed='loop-interchange' -verify-loop-lcssa -S | FileCheck -check-prefix=CHECK-DELIN %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "powerpc64le-unknown-linux-gnu"
; void foo(int n, int m) {
; int temp[16][16];

View File

@ -2,7 +2,7 @@
; RUN: FileCheck --input-file %t --check-prefix REMARK %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
target triple = "powerpc64le-unknown-linux-gnu"
@A = common global [100 x [100 x i32]] zeroinitializer
@C = common global [100 x [100 x i32]] zeroinitializer

View File

@ -10,6 +10,7 @@
; RUN: -pass-remarks='loop-interchange' -S -da-disable-delinearization-checks
; RUN: cat %t | FileCheck --check-prefix=DELIN %s
target triple = "powerpc64le-unknown-linux-gnu"
@A = common global [100 x [100 x i32]] zeroinitializer
@B = common global [100 x [100 x i32]] zeroinitializer
@C = common global [100 x i32] zeroinitializer
@ -71,11 +72,7 @@ for.end19:
; DELIN-NEXT: Name: InterchangeNotProfitable
; DELIN-NEXT: Function: test01
; DELIN-NEXT: Args:
; DELIN-NEXT: - String: 'Interchanging loops is too costly (cost='
; DELIN-NEXT: - Cost: '2'
; DELIN-NEXT: - String: ', threshold='
; DELIN-NEXT: - Threshold: '0'
; DELIN-NEXT: - String: ') and it does not improve parallelism.'
; DELIN-NEXT: - String: Interchanging loops is too costly and it does not improve parallelism.
; DELIN-NEXT: ...
;;--------------------------------------Test case 02------------------------------------

View File

@ -3,7 +3,7 @@
; RUN: -S -debug 2>&1 | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
target triple = "powerpc64le-unknown-linux-gnu"
@A = common global [100 x [100 x i32]] zeroinitializer
@B = common global [100 x i32] zeroinitializer

View File

@ -3,7 +3,7 @@
; RUN: -S -debug 2>&1 | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
target triple = "powerpc64le-unknown-linux-gnu"
@D = common global [100 x [100 x [100 x i32]]] zeroinitializer
@ -24,31 +24,31 @@ entry:
br label %for.cond1.preheader
for.cond1.preheader: ; preds = %for.inc15, %entry
%i.028 = phi i32 [ 0, %entry ], [ %inc16, %for.inc15 ]
%i.028 = phi i64 [ 0, %entry ], [ %inc16, %for.inc15 ]
br label %for.cond4.preheader
for.cond4.preheader: ; preds = %for.inc12, %for.cond1.preheader
%j.027 = phi i32 [ 0, %for.cond1.preheader ], [ %inc13, %for.inc12 ]
%j.027 = phi i64 [ 0, %for.cond1.preheader ], [ %inc13, %for.inc12 ]
br label %for.body6
for.body6: ; preds = %for.body6, %for.cond4.preheader
%k.026 = phi i32 [ 0, %for.cond4.preheader ], [ %inc, %for.body6 ]
%arrayidx8 = getelementptr inbounds [100 x [100 x [100 x i32]]], [100 x [100 x [100 x i32]]]* @D, i32 0, i32 %i.028, i32 %k.026, i32 %j.027
%k.026 = phi i64 [ 0, %for.cond4.preheader ], [ %inc, %for.body6 ]
%arrayidx8 = getelementptr inbounds [100 x [100 x [100 x i32]]], [100 x [100 x [100 x i32]]]* @D, i32 0, i64 %i.028, i64 %k.026, i64 %j.027
%0 = load i32, i32* %arrayidx8
%add = add nsw i32 %0, %t
store i32 %add, i32* %arrayidx8
%inc = add nuw nsw i32 %k.026, 1
%exitcond = icmp eq i32 %inc, 100
%inc = add nuw nsw i64 %k.026, 1
%exitcond = icmp eq i64 %inc, 100
br i1 %exitcond, label %for.inc12, label %for.body6
for.inc12: ; preds = %for.body6
%inc13 = add nuw nsw i32 %j.027, 1
%exitcond29 = icmp eq i32 %inc13, 100
%inc13 = add nuw nsw i64 %j.027, 1
%exitcond29 = icmp eq i64 %inc13, 100
br i1 %exitcond29, label %for.inc15, label %for.cond4.preheader
for.inc15: ; preds = %for.inc12
%inc16 = add nuw nsw i32 %i.028, 1
%exitcond30 = icmp eq i32 %inc16, 100
%inc16 = add nuw nsw i64 %i.028, 1
%exitcond30 = icmp eq i64 %inc16, 100
br i1 %exitcond30, label %for.end17, label %for.cond1.preheader
for.end17: ; preds = %for.inc15

View File

@ -3,7 +3,7 @@
; RUN: -S -debug 2>&1 | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
target triple = "powerpc64le-unknown-linux-gnu"
@A = common global [100 x [100 x i32]] zeroinitializer
@B = common global [100 x i32] zeroinitializer
@ -108,13 +108,13 @@ for.end12:
;; The outer loop header does not branch to the inner loop preheader, or the
;; inner loop header, or the outer loop latch.
; CHECK: Not interchanging loops. Cannot prove legality.
define void @interchange_07(i32 %k, i32 %N, i32 %ny) {
define void @interchange_07(i32 %k, i32 %N, i64 %ny) {
entry:
br label %for1.header
for1.header:
%j23 = phi i32 [ 0, %entry ], [ %j.next24, %for1.inc10 ]
%cmp21 = icmp slt i32 0, %ny
%j23 = phi i64 [ 0, %entry ], [ %j.next24, %for1.inc10 ]
%cmp21 = icmp slt i64 0, %ny
br label %singleSucc
singleSucc:
@ -124,18 +124,18 @@ preheader.j:
br label %for2
for2:
%j = phi i32 [ %j.next, %for2 ], [ 0, %preheader.j ]
%arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i32 0, i32 %j, i32 %j23
%j = phi i64 [ %j.next, %for2 ], [ 0, %preheader.j ]
%arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %j, i64 %j23
%lv = load i32, i32* %arrayidx5
%add = add nsw i32 %lv, %k
store i32 %add, i32* %arrayidx5
%j.next = add nuw nsw i32 %j, 1
%exitcond = icmp eq i32 %j, 99
%j.next = add nuw nsw i64 %j, 1
%exitcond = icmp eq i64 %j, 99
br i1 %exitcond, label %for1.inc10, label %for2
for1.inc10:
%j.next24 = add nuw nsw i32 %j23, 1
%exitcond26 = icmp eq i32 %j23, 99
%j.next24 = add nuw nsw i64 %j23, 1
%exitcond26 = icmp eq i64 %j23, 99
br i1 %exitcond26, label %for.end12, label %for1.header
for.end12:

View File

@ -1,5 +1,6 @@
; RUN: opt -basic-aa -loop-interchange -verify-dom-info -verify-loop-info -verify-loop-lcssa -S %s | FileCheck %s
target triple = "powerpc64le-unknown-linux-gnu"
@b = global [3 x [5 x [8 x i16]]] [[5 x [8 x i16]] zeroinitializer, [5 x [8 x i16]] [[8 x i16] zeroinitializer, [8 x i16] [i16 0, i16 0, i16 0, i16 6, i16 1, i16 6, i16 0, i16 0], [8 x i16] zeroinitializer, [8 x i16] zeroinitializer, [8 x i16] zeroinitializer], [5 x [8 x i16]] zeroinitializer], align 2
@a = common global i32 0, align 4
@d = common dso_local local_unnamed_addr global [1 x [6 x i32]] zeroinitializer, align 4

View File

@ -4,6 +4,7 @@
; Outer loop only reductions are not supported currently.
target triple = "powerpc64le-unknown-linux-gnu"
@A = common global [500 x [500 x i32]] zeroinitializer
;; global X

View File

@ -3,6 +3,7 @@
; Test case for PR41725. The induction variables in the latches escape the
; loops and we must move some PHIs around.
target triple = "powerpc64le-unknown-linux-gnu"
@a = common dso_local global i64 0, align 4
@b = common dso_local global i64 0, align 4
@c = common dso_local global [10 x [10 x i32 ]] zeroinitializer, align 16
@ -157,7 +158,7 @@ exit: ; preds = %outer.latch
; Make sure we do not crash for loops without reachable exits.
define void @no_reachable_exits() {
; Check we interchanged.
; CHECK-LABEL: @no_reachable_exits() {
; CHECK-LABEL: @no_reachable_exits()
; CHECK-NEXT: bb:
; CHECK-NEXT: br label %inner.ph
; CHECK-LABEL: outer.ph:

View File

@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-interchange -verify-loop-lcssa -verify-dom-info -S %s | FileCheck %s
target triple = "powerpc64le-unknown-linux-gnu"
@b = external dso_local global [5 x i32], align 16
define void @test1() {

View File

@ -14,6 +14,8 @@
; }
; }
target triple = "powerpc64le-unknown-linux-gnu"
; REMARKS: --- !Passed
; REMARKS-NEXT: Pass: loop-interchange
; REMARKS-NEXT: Name: Interchanged

View File

@ -2,6 +2,7 @@
; RUN: -verify-dom-info -verify-loop-info -verify-loop-lcssa -stats 2>&1
; RUN: FileCheck --input-file=%t --check-prefix=REMARKS %s
target triple = "powerpc64le-unknown-linux-gnu"
@a = global i32 0
@b = global i8 0
@c = global i32 0

View File

@ -6,6 +6,8 @@
; In the 2 test cases below, we have a LCSSA PHI in the inner loop exit, which
; is used in the outer loop latch. This is not supported.
target triple = "powerpc64le-unknown-linux-gnu"
define void @test1() {
; CHECK-LABEL: @test1(
; CHECK-NEXT: entry:

View File

@ -3,6 +3,7 @@
; Tests for PR43797.
target triple = "powerpc64le-unknown-linux-gnu"
@wdtdr = external dso_local global [5 x [5 x double]], align 16
define void @test1() {

View File

@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-interchange -S %s | FileCheck %s
target triple = "powerpc64le-unknown-linux-gnu"
@global = external local_unnamed_addr global [400 x [400 x i32]], align 16
; We need to move %tmp4 from the inner loop pre header to the outer loop header

View File

@ -2,6 +2,8 @@
; RUN: -verify-dom-info -verify-loop-info -verify-loop-lcssa 2>&1
; RUN: FileCheck --input-file=%t --check-prefix=REMARKS %s
target triple = "powerpc64le-unknown-linux-gnu"
; REMARKS: --- !Passed
; REMARKS-NEXT: Pass: loop-interchange
; REMARKS-NEXT: Name: Interchanged

View File

@ -5,7 +5,7 @@
;; We test profitability model in these test cases.
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
target triple = "powerpc64le-unknown-linux-gnu"
@A = common global [100 x [100 x i32]] zeroinitializer
@B = common global [100 x [100 x i32]] zeroinitializer

View File

@ -4,7 +4,7 @@
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
target triple = "powerpc64le-unknown-linux-gnu"
; REMARKS: --- !Passed
; REMARKS-NEXT: Pass: loop-interchange

View File

@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-interchange -S %s | FileCheck %s
target triple = "powerpc64le-unknown-linux-gnu"
@global = external dso_local global [1000 x [1000 x i32]], align 16
; Test that we support updating conditional branches where both targets are the same

View File

@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-interchange -loop-interchange-threshold=-10 -S %s | FileCheck %s
target triple = "powerpc64le-unknown-linux-gnu"
; The test contains a GEP with an operand that is not SCEV-able. Make sure
; loop-interchange does not crash.
define void @test([256 x float]* noalias %src, float* %dst) {