[LSR] Generate cross iteration indexes

Modify GenerateConstantOffsetsImpl to create offsets that can be used
by indexed addressing modes. If formulae can be generated in which the
constant offset equals the step of the recurrence, we can generate a
pre-indexed access. This allows the pointer to be updated via the
single pre-indexed access so that (hopefully) no add/subs are required
to update it for the next iteration. For small cores, this can
significantly improve the performance of DSP-like loops.
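
As a rough illustration (not part of the patch), this is the shape of
pointer update the change is after, for a load that strides by 8 bytes
per iteration; the Thumb2 instructions in the comments are assumptions
about what the backend would emit:

  // Illustrative C++ loop: one 4-byte load per iteration, pointer
  // advancing by 8 bytes (two i32 elements).
  void copy_every_other(const int *p, int *out, int n) {
    for (int i = 0; i < n; ++i) {
      // Without backedge indexing, the pointer needs its own update:
      //   ldr  r3, [r0]        @ load
      //   adds r0, r0, #8      @ separate add to advance the pointer
      // When the constant offset equals the step, the access itself can
      // carry the update as a pre-indexed load (base biased by -8 before
      // entering the loop):
      //   ldr  r3, [r0, #8]!   @ load and advance the base register
      out[i] = *p;
      p += 2;
    }
  }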

Differential Revision: https://reviews.llvm.org/D55373

llvm-svn: 353403
Sam Parker 2019-02-07 13:32:54 +00:00
parent bb3b372aa1
commit 67756c09f2
9 changed files with 1598 additions and 40 deletions


@ -486,6 +486,10 @@ public:
/// addressing mode expressions.
bool shouldFavorPostInc() const;
/// Return true if LSR should make efforts to generate indexed addressing
/// modes that operate across loop iterations.
bool shouldFavorBackedgeIndex(const Loop *L) const;
/// Return true if the target supports masked load/store
/// AVX2 and AVX-512 targets allow masks for consecutive load and store
bool isLegalMaskedStore(Type *DataType) const;
@ -1065,6 +1069,7 @@ public:
TargetTransformInfo::LSRCost &C2) = 0;
virtual bool canMacroFuseCmp() = 0;
virtual bool shouldFavorPostInc() const = 0;
virtual bool shouldFavorBackedgeIndex(const Loop *L) const = 0;
virtual bool isLegalMaskedStore(Type *DataType) = 0;
virtual bool isLegalMaskedLoad(Type *DataType) = 0;
virtual bool isLegalMaskedScatter(Type *DataType) = 0;
@ -1301,6 +1306,9 @@ public:
bool shouldFavorPostInc() const override {
return Impl.shouldFavorPostInc();
}
bool shouldFavorBackedgeIndex(const Loop *L) const override {
return Impl.shouldFavorBackedgeIndex(L);
}
bool isLegalMaskedStore(Type *DataType) override {
return Impl.isLegalMaskedStore(DataType);
}


@ -253,6 +253,8 @@ public:
bool shouldFavorPostInc() const { return false; }
bool shouldFavorBackedgeIndex(const Loop *L) const { return false; }
bool isLegalMaskedStore(Type *DataType) { return false; }
bool isLegalMaskedLoad(Type *DataType) { return false; }


@ -162,6 +162,10 @@ bool TargetTransformInfo::shouldFavorPostInc() const {
return TTIImpl->shouldFavorPostInc();
}
bool TargetTransformInfo::shouldFavorBackedgeIndex(const Loop *L) const {
return TTIImpl->shouldFavorBackedgeIndex(L);
}
bool TargetTransformInfo::isLegalMaskedStore(Type *DataType) const {
return TTIImpl->isLegalMaskedStore(DataType);
}


@ -93,6 +93,12 @@ public:
bool enableInterleavedAccessVectorization() { return true; }
bool shouldFavorBackedgeIndex(const Loop *L) const {
if (L->getHeader()->getParent()->optForSize())
return false;
return ST->isMClass() && ST->isThumb2() && L->getNumBlocks() == 1;
}
/// Floating-point computation using ARMv8 AArch32 Advanced
/// SIMD instructions remains unchanged from ARMv7. Only AArch64 SIMD
/// is IEEE-754 compliant, but it's not covered in this target.


@ -154,6 +154,10 @@ static cl::opt<bool> FilterSameScaledReg(
cl::desc("Narrow LSR search space by filtering non-optimal formulae"
" with the same ScaledReg and Scale"));
static cl::opt<bool> EnableBackedgeIndexing(
"lsr-backedge-indexing", cl::Hidden, cl::init(true),
cl::desc("Enable the generation of cross iteration indexed memops"));
static cl::opt<unsigned> ComplexityLimit(
"lsr-complexity-limit", cl::Hidden,
cl::init(std::numeric_limits<uint16_t>::max()),
@ -1052,12 +1056,12 @@ public:
void dump() const;
private:
void RateRegister(const SCEV *Reg,
void RateRegister(const Formula &F, const SCEV *Reg,
SmallPtrSetImpl<const SCEV *> &Regs,
const Loop *L,
ScalarEvolution &SE, DominatorTree &DT,
const TargetTransformInfo &TTI);
void RatePrimaryRegister(const SCEV *Reg,
void RatePrimaryRegister(const Formula &F, const SCEV *Reg,
SmallPtrSetImpl<const SCEV *> &Regs,
const Loop *L,
ScalarEvolution &SE, DominatorTree &DT,
@ -1208,7 +1212,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
Instruction *Fixup = nullptr);
/// Tally up interesting quantities from the given register.
void Cost::RateRegister(const SCEV *Reg,
void Cost::RateRegister(const Formula &F, const SCEV *Reg,
SmallPtrSetImpl<const SCEV *> &Regs,
const Loop *L,
ScalarEvolution &SE, DominatorTree &DT,
@ -1235,16 +1239,24 @@ void Cost::RateRegister(const SCEV *Reg,
}
unsigned LoopCost = 1;
if (TTI.shouldFavorPostInc()) {
const SCEV *LoopStep = AR->getStepRecurrence(SE);
if (isa<SCEVConstant>(LoopStep)) {
// Check if a post-indexed load/store can be used.
if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) ||
TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) {
if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) ||
TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) {
// If the step size matches the base offset, we could use pre-indexed
// addressing.
if (TTI.shouldFavorBackedgeIndex(L)) {
if (auto *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(SE)))
if (Step->getAPInt() == F.BaseOffset)
LoopCost = 0;
}
if (TTI.shouldFavorPostInc()) {
const SCEV *LoopStep = AR->getStepRecurrence(SE);
if (isa<SCEVConstant>(LoopStep)) {
const SCEV *LoopStart = AR->getStart();
if (!isa<SCEVConstant>(LoopStart) &&
SE.isLoopInvariant(LoopStart, L))
LoopCost = 0;
SE.isLoopInvariant(LoopStart, L))
LoopCost = 0;
}
}
}
@ -1254,7 +1266,7 @@ void Cost::RateRegister(const SCEV *Reg,
// TODO: The non-affine case isn't precisely modeled here.
if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
if (!Regs.count(AR->getOperand(1))) {
RateRegister(AR->getOperand(1), Regs, L, SE, DT, TTI);
RateRegister(F, AR->getOperand(1), Regs, L, SE, DT, TTI);
if (isLoser())
return;
}
@ -1278,7 +1290,7 @@ void Cost::RateRegister(const SCEV *Reg,
/// Record this register in the set. If we haven't seen it before, rate
/// it. Optional LoserRegs provides a way to declare any formula that refers to
/// one of those regs an instant loser.
void Cost::RatePrimaryRegister(const SCEV *Reg,
void Cost::RatePrimaryRegister(const Formula &F, const SCEV *Reg,
SmallPtrSetImpl<const SCEV *> &Regs,
const Loop *L,
ScalarEvolution &SE, DominatorTree &DT,
@ -1289,7 +1301,7 @@ void Cost::RatePrimaryRegister(const SCEV *Reg,
return;
}
if (Regs.insert(Reg).second) {
RateRegister(Reg, Regs, L, SE, DT, TTI);
RateRegister(F, Reg, Regs, L, SE, DT, TTI);
if (LoserRegs && isLoser())
LoserRegs->insert(Reg);
}
@ -1313,7 +1325,7 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
Lose();
return;
}
RatePrimaryRegister(ScaledReg, Regs, L, SE, DT, LoserRegs, TTI);
RatePrimaryRegister(F, ScaledReg, Regs, L, SE, DT, LoserRegs, TTI);
if (isLoser())
return;
}
@ -1322,7 +1334,7 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
Lose();
return;
}
RatePrimaryRegister(BaseReg, Regs, L, SE, DT, LoserRegs, TTI);
RatePrimaryRegister(F, BaseReg, Regs, L, SE, DT, LoserRegs, TTI);
if (isLoser())
return;
}
@ -1889,6 +1901,7 @@ class LSRInstance {
LoopInfo &LI;
const TargetTransformInfo &TTI;
Loop *const L;
bool FavorBackedgeIndex = false;
bool Changed = false;
/// This is the insert position that the current loop's induction variable
@ -2803,7 +2816,7 @@ bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
/// TODO: Consider IVInc free if it's already used in other chains.
static bool
isProfitableChain(IVChain &Chain, SmallPtrSetImpl<Instruction*> &Users,
ScalarEvolution &SE, const TargetTransformInfo &TTI) {
ScalarEvolution &SE) {
if (StressIVChain)
return true;
@ -3063,7 +3076,7 @@ void LSRInstance::CollectChains() {
for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
UsersIdx < NChains; ++UsersIdx) {
if (!isProfitableChain(IVChainVec[UsersIdx],
ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
ChainUsersVec[UsersIdx].FarUsers, SE))
continue;
// Preserve the chain at UsesIdx.
if (ChainIdx != UsersIdx)
@ -3077,7 +3090,7 @@ void LSRInstance::CollectChains() {
void LSRInstance::FinalizeChain(IVChain &Chain) {
assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
LLVM_DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
for (const IVInc &Inc : Chain) {
LLVM_DEBUG(dbgs() << " Inc: " << *Inc.UserInst << "\n");
auto UseI = find(Inc.UserInst->operands(), Inc.IVOperand);
@ -3737,10 +3750,11 @@ void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
void LSRInstance::GenerateConstantOffsetsImpl(
LSRUse &LU, unsigned LUIdx, const Formula &Base,
const SmallVectorImpl<int64_t> &Worklist, size_t Idx, bool IsScaledReg) {
const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
for (int64_t Offset : Worklist) {
auto GenerateOffset = [&](const SCEV *G, int64_t Offset) {
Formula F = Base;
F.BaseOffset = (uint64_t)Base.BaseOffset - Offset;
if (isLegalUse(TTI, LU.MinOffset - Offset, LU.MaxOffset - Offset, LU.Kind,
LU.AccessTy, F)) {
// Add the offset to the base register.
@ -3760,7 +3774,35 @@ void LSRInstance::GenerateConstantOffsetsImpl(
(void)InsertFormula(LU, LUIdx, F);
}
};
const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
// With constant offsets and constant steps, we can generate pre-inc
// accesses by having the offset equal the step. So, for access #0 with a
// step of 8, we generate a G - 8 base which would require the first access
// to be ((G - 8) + 8),+,8. The pre-indexed access then updates the pointer
// for itself and hopefully becomes the base for other accesses. This
// means that a single pre-indexed access can be generated to become the new
// base pointer for each iteration of the loop, resulting in no extra add/sub
// instructions for pointer updating.
if (FavorBackedgeIndex && LU.Kind == LSRUse::Address) {
if (auto *GAR = dyn_cast<SCEVAddRecExpr>(G)) {
if (auto *StepRec =
dyn_cast<SCEVConstant>(GAR->getStepRecurrence(SE))) {
const APInt &StepInt = StepRec->getAPInt();
int64_t Step = StepInt.isNegative() ?
StepInt.getSExtValue() : StepInt.getZExtValue();
for (int64_t Offset : Worklist) {
Offset -= Step;
GenerateOffset(G, Offset);
}
}
}
}
for (int64_t Offset : Worklist)
GenerateOffset(G, Offset);
int64_t Imm = ExtractImmediate(G, SE);
if (G->isZero() || Imm == 0)
@ -4417,7 +4459,7 @@ void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
/// When there are many registers for expressions like A, A+1, A+2, etc.,
/// allocate a single register for them.
void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
if (EstimateSearchSpaceComplexity() < ComplexityLimit)
if (EstimateSearchSpaceComplexity() < ComplexityLimit)
return;
LLVM_DEBUG(
@ -5378,7 +5420,9 @@ void LSRInstance::ImplementSolution(
LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
DominatorTree &DT, LoopInfo &LI,
const TargetTransformInfo &TTI)
: IU(IU), SE(SE), DT(DT), LI(LI), TTI(TTI), L(L) {
: IU(IU), SE(SE), DT(DT), LI(LI), TTI(TTI), L(L),
FavorBackedgeIndex(EnableBackedgeIndexing &&
TTI.shouldFavorBackedgeIndex(L)) {
// If LoopSimplify form is not available, stay out of trouble.
if (!L->isLoopSimplifyForm())
return;
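
To make the comment in GenerateConstantOffsetsImpl above concrete: for an
address recurrence with a constant step, each worklist offset is also tried
shifted back by one step, so the first access becomes ((G - Step) + Step) and
can be emitted pre-indexed, updating the base for the following iterations.
A simplified standalone sketch of that candidate generation (the helper and
its names are illustrative, not the LSR data structures):

  #include <cstdint>
  #include <vector>

  // Model of the extra candidates: step-adjusted offsets are tried in
  // addition to the offsets LSR would have generated anyway.
  std::vector<int64_t> candidateOffsets(const std::vector<int64_t> &Worklist,
                                        int64_t Step,
                                        bool FavorBackedgeIndex) {
    std::vector<int64_t> Out;
    if (FavorBackedgeIndex)
      for (int64_t Offset : Worklist)
        Out.push_back(Offset - Step); // bases biased back by one step
    for (int64_t Offset : Worklist)
      Out.push_back(Offset);          // the existing candidates
    return Out;
  }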


@ -0,0 +1,310 @@
; RUN: llc -mtriple=thumbv7em -mattr=+fp-armv8 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT
; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT
; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -lsr-backedge-indexing=false %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED
; RUN: llc -mtriple=thumbv8 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED
; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -lsr-complexity-limit=2147483647 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-COMPLEX
; CHECK-LABEL: test_qadd_2
; CHECK: @ %loop
; CHECK-DEFAULT: ldr{{.*}}, #4]
; CHECK-DEFAULT: ldr{{.*}}, #4]
; CHECK-DEFAULT: str{{.*}}, #4]
; CHECK-DEFAULT: ldr{{.*}}, #8]!
; CHECK-DEFAULT: ldr{{.*}}, #8]!
; CHECK-DEFAULT: str{{.*}}, #8]!
; CHECK-COMPLEX: ldr{{.*}}, #8]!
; CHECK-COMPLEX: ldr{{.*}}, #8]!
; CHECK-COMPLEX: str{{.*}}, #8]!
; CHECK-COMPLEX: ldr{{.*}}, #4]
; CHECK-COMPLEX: ldr{{.*}}, #4]
; CHECK-COMPLEX: str{{.*}}, #4]
; DISABLED-NOT: ldr{{.*}}]!
; DISABLED-NOT: str{{.*}}]!
define void @test_qadd_2(i32* %a.array, i32* %b.array, i32* %out.array, i32 %N) {
entry:
br label %loop
loop:
%i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
%idx.1 = phi i32 [ 0, %entry ], [ %idx.next, %loop ]
%gep.a.1 = getelementptr inbounds i32, i32* %a.array, i32 %idx.1
%a.1 = load i32, i32* %gep.a.1
%gep.b.1 = getelementptr inbounds i32, i32* %b.array, i32 %idx.1
%b.1 = load i32, i32* %gep.b.1
%qadd.1 = call i32 @llvm.arm.qadd(i32 %a.1, i32 %b.1)
%addr.1 = getelementptr inbounds i32, i32* %out.array, i32 %idx.1
store i32 %qadd.1, i32* %addr.1
%idx.2 = or i32 %idx.1, 1
%gep.a.2 = getelementptr inbounds i32, i32* %a.array, i32 %idx.2
%a.2 = load i32, i32* %gep.a.2
%gep.b.2 = getelementptr inbounds i32, i32* %b.array, i32 %idx.2
%b.2 = load i32, i32* %gep.b.2
%qadd.2 = call i32 @llvm.arm.qadd(i32 %a.2, i32 %b.2)
%addr.2 = getelementptr inbounds i32, i32* %out.array, i32 %idx.2
store i32 %qadd.2, i32* %addr.2
%i.next = add nsw nuw i32 %i, -2
%idx.next = add nsw nuw i32 %idx.1, 2
%cmp = icmp ult i32 %i.next, %N
br i1 %cmp, label %loop, label %exit
exit:
ret void
}
; CHECK-LABEL: test_qadd_2_backwards
; TODO: Indexes should be generated.
; CHECK: @ %loop
; CHECK-DEFAULT: ldr{{.*}},
; CHECK-DEFAULT: ldr{{.*}},
; CHECK-DEFAULT: str{{.*}},
; CHECK-DEFAULT: ldr{{.*}}, #-4]
; CHECK-DEFAULT: ldr{{.*}}, #-4]
; CHECK-DEFAULT: sub{{.*}}, #8
; CHECK-DEFAULT: str{{.*}}, #-4]
; CHECK-DEFAULT: sub{{.*}}, #8
; CHECK-COMPLEX: ldr{{.*}} lsl #2]
; CHECK-COMPLEX: ldr{{.*}} lsl #2]
; CHECK-COMPLEX: str{{.*}} lsl #2]
; CHECK-COMPLEX: ldr{{.*}} lsl #2]
; CHECK-COMPLEX: ldr{{.*}} lsl #2]
; CHECK-COMPLEX: str{{.*}} lsl #2]
; DISABLED-NOT: ldr{{.*}}]!
; DISABLED-NOT: str{{.*}}]!
define void @test_qadd_2_backwards(i32* %a.array, i32* %b.array, i32* %out.array, i32 %N) {
entry:
br label %loop
loop:
%i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
%idx.1 = phi i32 [ %N, %entry ], [ %idx.next, %loop ]
%gep.a.1 = getelementptr inbounds i32, i32* %a.array, i32 %idx.1
%a.1 = load i32, i32* %gep.a.1
%gep.b.1 = getelementptr inbounds i32, i32* %b.array, i32 %idx.1
%b.1 = load i32, i32* %gep.b.1
%qadd.1 = call i32 @llvm.arm.qadd(i32 %a.1, i32 %b.1)
%addr.1 = getelementptr inbounds i32, i32* %out.array, i32 %idx.1
store i32 %qadd.1, i32* %addr.1
%idx.2 = sub nsw nuw i32 %idx.1, 1
%gep.a.2 = getelementptr inbounds i32, i32* %a.array, i32 %idx.2
%a.2 = load i32, i32* %gep.a.2
%gep.b.2 = getelementptr inbounds i32, i32* %b.array, i32 %idx.2
%b.2 = load i32, i32* %gep.b.2
%qadd.2 = call i32 @llvm.arm.qadd(i32 %a.2, i32 %b.2)
%addr.2 = getelementptr inbounds i32, i32* %out.array, i32 %idx.2
store i32 %qadd.2, i32* %addr.2
%i.next = add nsw nuw i32 %i, -2
%idx.next = sub nsw nuw i32 %idx.1, 2
%cmp = icmp ult i32 %i.next, %N
br i1 %cmp, label %loop, label %exit
exit:
ret void
}
; CHECK-LABEL: test_qadd_3
; CHECK: @ %loop
; CHECK-DEFAULT: ldr{{.*}}, #8]
; CHECK-DEFAULT: ldr{{.*}}, #8]
; CHECK-DEFAULT: str{{.*}}, #8]
; CHECK-DEFAULT: ldr{{.*}}, #12]!
; CHECK-DEFAULT: ldr{{.*}}, #12]!
; CHECK-DEFAULT: str{{.*}}, #12]!
; CHECK-COMPLEX: ldr{{.*}}, #12]!
; CHECK-COMPLEX: ldr{{.*}}, #12]!
; CHECK-COMPLEX: str{{.*}}, #12]!
; CHECK-COMPLEX: ldr{{.*}}, #4]
; CHECK-COMPLEX: ldr{{.*}}, #4]
; CHECK-COMPLEX: str{{.*}}, #4]
; CHECK-COMPLEX: ldr{{.*}}, #8]
; CHECK-COMPLEX: ldr{{.*}}, #8]
; CHECK-COMPLEX: str{{.*}}, #8]
; DISABLED-NOT: ldr{{.*}}]!
; DISABLED-NOT: str{{.*}}]!
define void @test_qadd_3(i32* %a.array, i32* %b.array, i32* %out.array, i32 %N) {
entry:
br label %loop
loop:
%i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
%idx.1 = phi i32 [ 0, %entry ], [ %idx.next, %loop ]
%gep.a.1 = getelementptr inbounds i32, i32* %a.array, i32 %idx.1
%a.1 = load i32, i32* %gep.a.1
%gep.b.1 = getelementptr inbounds i32, i32* %b.array, i32 %idx.1
%b.1 = load i32, i32* %gep.b.1
%qadd.1 = call i32 @llvm.arm.qadd(i32 %a.1, i32 %b.1)
%addr.1 = getelementptr inbounds i32, i32* %out.array, i32 %idx.1
store i32 %qadd.1, i32* %addr.1
%idx.2 = add nuw nsw i32 %idx.1, 1
%gep.a.2 = getelementptr inbounds i32, i32* %a.array, i32 %idx.2
%a.2 = load i32, i32* %gep.a.2
%gep.b.2 = getelementptr inbounds i32, i32* %b.array, i32 %idx.2
%b.2 = load i32, i32* %gep.b.2
%qadd.2 = call i32 @llvm.arm.qadd(i32 %a.2, i32 %b.2)
%addr.2 = getelementptr inbounds i32, i32* %out.array, i32 %idx.2
store i32 %qadd.2, i32* %addr.2
%idx.3 = add nuw nsw i32 %idx.1, 2
%gep.a.3 = getelementptr inbounds i32, i32* %a.array, i32 %idx.3
%a.3 = load i32, i32* %gep.a.3
%gep.b.3 = getelementptr inbounds i32, i32* %b.array, i32 %idx.3
%b.3 = load i32, i32* %gep.b.3
%qadd.3 = call i32 @llvm.arm.qadd(i32 %a.3, i32 %b.3)
%addr.3 = getelementptr inbounds i32, i32* %out.array, i32 %idx.3
store i32 %qadd.3, i32* %addr.3
%i.next = add nsw nuw i32 %i, -3
%idx.next = add nsw nuw i32 %idx.1, 3
%cmp = icmp ult i32 %i.next, %N
br i1 %cmp, label %loop, label %exit
exit:
ret void
}
; CHECK-LABEL: test_qadd_4
; CHECK: @ %loop
; TODO: pre-inc store
; CHECK-DEFAULT: ldr{{.*}}, #4]
; CHECK-DEFAULT: ldr{{.*}}, #4]
; CHECK-DEFAULT: str{{.*}}, #4]
; CHECK-DEFAULT: ldr{{.*}}, #8]
; CHECK-DEFAULT: ldr{{.*}}, #8]
; CHECK-DEFAULT: str{{.*}}, #8]
; CHECK-DEFAULT: ldr{{.*}}, #12]
; CHECK-DEFAULT: ldr{{.*}}, #12]
; CHECK-DEFAULT: str{{.*}}, #12]
; CHECK-COMPLEX: ldr{{.*}}, #16]!
; CHECK-COMPLEX: ldr{{.*}}, #16]!
; CHECK-COMPLEX: str{{.*}}, #16]!
; CHECK-COMPLEX: ldr{{.*}}, #4]
; CHECK-COMPLEX: ldr{{.*}}, #4]
; CHECK-COMPLEX: str{{.*}}, #4]
; CHECK-COMPLEX: ldr{{.*}}, #8]
; CHECK-COMPLEX: ldr{{.*}}, #8]
; CHECK-COMPLEX: str{{.*}}, #8]
; CHECK-COMPLEX: ldr{{.*}}, #12]
; CHECK-COMPLEX: ldr{{.*}}, #12]
; CHECK-COMPLEX: str{{.*}}, #12]
; DISABLED-NOT: ldr{{.*}}]!
; DISABLED-NOT: str{{.*}}]!
define void @test_qadd_4(i32* %a.array, i32* %b.array, i32* %out.array, i32 %N) {
entry:
br label %loop
loop:
%i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
%idx.1 = phi i32 [ 0, %entry ], [ %idx.next, %loop ]
%gep.a.1 = getelementptr inbounds i32, i32* %a.array, i32 %idx.1
%a.1 = load i32, i32* %gep.a.1
%gep.b.1 = getelementptr inbounds i32, i32* %b.array, i32 %idx.1
%b.1 = load i32, i32* %gep.b.1
%qadd.1 = call i32 @llvm.arm.qadd(i32 %a.1, i32 %b.1)
%addr.1 = getelementptr inbounds i32, i32* %out.array, i32 %idx.1
store i32 %qadd.1, i32* %addr.1
%idx.2 = or i32 %idx.1, 1
%gep.a.2 = getelementptr inbounds i32, i32* %a.array, i32 %idx.2
%a.2 = load i32, i32* %gep.a.2
%gep.b.2 = getelementptr inbounds i32, i32* %b.array, i32 %idx.2
%b.2 = load i32, i32* %gep.b.2
%qadd.2 = call i32 @llvm.arm.qadd(i32 %a.2, i32 %b.2)
%addr.2 = getelementptr inbounds i32, i32* %out.array, i32 %idx.2
store i32 %qadd.2, i32* %addr.2
%idx.3 = or i32 %idx.1, 2
%gep.a.3 = getelementptr inbounds i32, i32* %a.array, i32 %idx.3
%a.3 = load i32, i32* %gep.a.3
%gep.b.3 = getelementptr inbounds i32, i32* %b.array, i32 %idx.3
%b.3 = load i32, i32* %gep.b.3
%qadd.3 = call i32 @llvm.arm.qadd(i32 %a.3, i32 %b.3)
%addr.3 = getelementptr inbounds i32, i32* %out.array, i32 %idx.3
store i32 %qadd.3, i32* %addr.3
%idx.4 = or i32 %idx.1, 3
%gep.a.4 = getelementptr inbounds i32, i32* %a.array, i32 %idx.4
%a.4 = load i32, i32* %gep.a.4
%gep.b.4 = getelementptr inbounds i32, i32* %b.array, i32 %idx.4
%b.4 = load i32, i32* %gep.b.4
%qadd.4 = call i32 @llvm.arm.qadd(i32 %a.4, i32 %b.4)
%addr.4 = getelementptr inbounds i32, i32* %out.array, i32 %idx.4
store i32 %qadd.4, i32* %addr.4
%i.next = add nsw nuw i32 %i, -4
%idx.next = add nsw nuw i32 %idx.1, 4
%cmp = icmp ult i32 %i.next, %N
br i1 %cmp, label %loop, label %exit
exit:
ret void
}
; CHECK-LABEL: test_qadd16_2
; CHECK: @ %loop
; TODO: pre-inc store.
; CHECK-DEFAULT: ldr{{.*}}, #4]
; CHECK-DEFAULT: ldr{{.*}}, #4]
; CHECK-DEFAULT: str{{.*}}, #8]
; CHECK-DEFAULT: ldr{{.*}}, #8]!
; CHECK-DEFAULT: ldr{{.*}}, #8]!
; CHECK-DEFAULT: str{{.*}}, #16]!
; CHECK-COMPLEX: ldr{{.*}}, #8]!
; CHECK-COMPLEX: ldr{{.*}}, #8]!
; CHECK-COMPLEX: str{{.*}}, #16]!
; CHECK-COMPLEX: ldr{{.*}}, #4]
; CHECK-COMPLEX: ldr{{.*}}, #4]
; CHECK-COMPLEX: str{{.*}}, #8]
; DISABLED-NOT: ldr{{.*}}]!
; DISABLED-NOT: str{{.*}}]!
define void @test_qadd16_2(i16* %a.array, i16* %b.array, i32* %out.array, i32 %N) {
entry:
br label %loop
loop:
%i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
%idx.1 = phi i32 [ 0, %entry ], [ %idx.next, %loop ]
%gep.a.1 = getelementptr inbounds i16, i16* %a.array, i32 %idx.1
%cast.a.1 = bitcast i16* %gep.a.1 to i32*
%a.1 = load i32, i32* %cast.a.1
%gep.b.1 = getelementptr inbounds i16, i16* %b.array, i32 %idx.1
%cast.b.1 = bitcast i16* %gep.b.1 to i32*
%b.1 = load i32, i32* %cast.b.1
%qadd.1 = call i32 @llvm.arm.qadd16(i32 %a.1, i32 %b.1)
%addr.1 = getelementptr inbounds i32, i32* %out.array, i32 %idx.1
store i32 %qadd.1, i32* %addr.1
%idx.2 = add nsw nuw i32 %idx.1, 2
%gep.a.2 = getelementptr inbounds i16, i16* %a.array, i32 %idx.2
%cast.a.2 = bitcast i16* %gep.a.2 to i32*
%a.2 = load i32, i32* %cast.a.2
%gep.b.2 = getelementptr inbounds i16, i16* %b.array, i32 %idx.2
%cast.b.2 = bitcast i16* %gep.b.2 to i32*
%b.2 = load i32, i32* %cast.b.2
%qadd.2 = call i32 @llvm.arm.qadd16(i32 %a.2, i32 %b.2)
%addr.2 = getelementptr inbounds i32, i32* %out.array, i32 %idx.2
store i32 %qadd.2, i32* %addr.2
%i.next = add nsw nuw i32 %i, -2
%idx.next = add nsw nuw i32 %idx.1, 4
%cmp = icmp ult i32 %i.next, %N
br i1 %cmp, label %loop, label %exit
exit:
ret void
}
declare i32 @llvm.arm.qadd(i32, i32)
declare i32 @llvm.arm.qadd16(i32, i32)


@ -1,10 +1,10 @@
; RUN: llc -mtriple=thumbv7m-none-eabi %s -mcpu=cortex-m3 -o - | FileCheck %s
; RUN: llc -mtriple=thumbv7m-none-eabi %s -mcpu=cortex-m4 -o - | FileCheck %s
; RUN: llc -mtriple=thumbv7m-none-eabi %s -mcpu=cortex-m33 -o - | FileCheck %s
; RUN: llc -mtriple=thumbv8m-none-eabi %s -mcpu=cortex-m33 -o - | FileCheck %s
define void @test_loop_alignment(i32* %in, i32* %out) optsize {
; CHECK-LABEL: test_loop_alignment:
; CHECK: movs {{r[0-9]+}}, #0
; CHECK: mov{{.*}}, #0
; CHECK: .p2align 2
entry:

File diff suppressed because it is too large.


@ -1,21 +1,15 @@
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
; RUN: opt -mtriple=thumbv7em %s -S -loop-reduce -lsr-complexity-limit=65536 -o - | FileCheck %s --check-prefix=CHECK-DEFAULT
; RUN: opt -mtriple=thumbv7em %s -S -loop-reduce -lsr-complexity-limit=2147483647 -o - | FileCheck %s --check-prefix=CHECK-COMPLEX
; RUN: opt -mtriple=thumbv7em %s -S -loop-reduce -lsr-complexity-limit=65536 -o - | FileCheck %s
; RUN: opt -mtriple=thumbv7em %s -S -loop-reduce -lsr-complexity-limit=2147483647 -o - | FileCheck %s
; CHECK-DEFAULT-LABEL: for.body12.us.us:
; CHECK-DEFAULT: phi i32
; CHECK-DEFAULT: [[LSR_IV:%[^ ]+]] = phi i32 [ [[LSR_IV_NEXT:%[^ ]+]], %for.body12.us.us ], [ 0, %for.cond9.preheader.us.us ]
; CHECK-DEFAULT: phi i32
; CHECK-DEFAULT: [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], 8
; CHECK-COMPLEX-LABEL: for.body12.us.us:
; CHECK-COMPLEX: phi i32
; CHECK-COMPLEX: [[LSR_IV6:%[^ ]+]] = phi i16* [ [[SCEVGEP7:%[^ ]+]], %for.body12.us.us ], [ [[SCEVGEP5:%[^ ]+]], %for.cond9.preheader.us.us ]
; CHECK-COMPLEX: [[LSR_IV:%[^ ]+]] = phi i16* [ [[SCEVGEP1:%[^ ]+]], %for.body12.us.us ], [ [[SCEVGEP:%[^ ]+]], %for.cond9.preheader.us.us ]
; CHECK-COMPLEX: phi i32
; CHECK-COMPLEX: [[SCEVGEP1]] = getelementptr i16, i16* [[LSR_IV]], i32 4
; CHECK-COMPLEX: [[SCEVGEP7]] = getelementptr i16, i16* [[LSR_IV6]], i32 4
; CHECK-LABEL: for.body12.us.us:
; CHECK: [[LSR_IV6:%[^ ]+]] = phi i16* [ [[SCEVGEP7:%[^ ]+]], %for.body12.us.us ], [ [[SCEVGEP5:%[^ ]+]], %for.cond9.preheader.us.us ]
; CHECK: phi i32
; CHECK: [[LSR_IV:%[^ ]+]] = phi i16* [ [[SCEVGEP1:%[^ ]+]], %for.body12.us.us ], [ [[SCEVGEP:%[^ ]+]], %for.cond9.preheader.us.us ]
; CHECK: phi i32
; CHECK: [[SCEVGEP1]] = getelementptr i16, i16* [[LSR_IV]], i32 4
; CHECK: [[SCEVGEP7]] = getelementptr i16, i16* [[LSR_IV6]], i32 4
define void @convolve(i16** nocapture readonly %input_image, i16** nocapture readonly %filter, i32 %filter_dim, i32 %out_width, i32 %out_height, i32** nocapture readonly %convolved) {
entry: