[ARM][MVE] Remove old tail predicates

Remove any predicate that we replace with a vctp intrinsic, and try
to remove their operands too. Also look into the exit block to see if
there are any duplicates of the predicates that we've replaced and
clone the vctp to be used there instead.

Differential Revision: https://reviews.llvm.org/D67709

llvm-svn: 372567
This commit is contained in:
Sam Parker 2019-09-23 09:48:25 +00:00
parent 14f6465c15
commit 9feb429a33
6 changed files with 672 additions and 12 deletions

View File

@ -3986,6 +3986,7 @@ def MVE_VDWDUPu8 : MVE_VxWDUP<"vdwdup", "u8", 0b00, 0b1>;
def MVE_VDWDUPu16 : MVE_VxWDUP<"vdwdup", "u16", 0b01, 0b1>;
def MVE_VDWDUPu32 : MVE_VxWDUP<"vdwdup", "u32", 0b10, 0b1>;
let hasSideEffects = 1 in
class MVE_VCTP<string suffix, bits<2> size, list<dag> pattern=[]>
: MVE_p<(outs VCCR:$P0), (ins rGPR:$Rn), NoItinerary, "vctp", suffix,
"$Rn", vpred_n, "", pattern> {

View File

@ -84,7 +84,7 @@ private:
/// Is the icmp that generates an i1 vector, based upon a loop counter
/// and a limit that is defined outside the loop.
bool isTailPredicate(Value *Predicate, Value *NumElements);
bool isTailPredicate(Instruction *Predicate, Value *NumElements);
};
} // end namespace
@ -178,7 +178,7 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
return Changed;
}
bool MVETailPredication::isTailPredicate(Value *V, Value *NumElements) {
bool MVETailPredication::isTailPredicate(Instruction *I, Value *NumElements) {
// Look for the following:
// %trip.count.minus.1 = add i32 %N, -1
@ -206,7 +206,7 @@ bool MVETailPredication::isTailPredicate(Value *V, Value *NumElements) {
Instruction *Induction = nullptr;
// The vector icmp
if (!match(V, m_ICmp(Pred, m_Instruction(Induction),
if (!match(I, m_ICmp(Pred, m_Instruction(Induction),
m_Instruction(Shuffle))) ||
Pred != ICmpInst::ICMP_ULE || !L->isLoopInvariant(Shuffle))
return false;
@ -390,6 +390,55 @@ Value* MVETailPredication::ComputeElements(Value *TripCount,
return Expander.expandCodeFor(Elems, Elems->getType(), InsertPt);
}
// Look through the exit block to see whether there's a duplicate predicate
// instruction. This can happen when we need to perform a select on values
// from the last and previous iteration. Instead of doing a straight
// replacement of that predicate with the vctp, clone the vctp and place it
// in the block. This means that the VPR doesn't have to be live into the
// exit block which should make it easier to convert this loop into a proper
// tail predicated loop.
//
// NewPredicates maps each replaced predicate to the instruction that replaced
// it. MaybeDead is a worklist of instructions that may now be unused; it is
// drained by this function. After the worklist has been processed, dead PHIs
// are deleted from every block of L.
static void Cleanup(DenseMap<Instruction*, Instruction*> &NewPredicates,
                    SetVector<Instruction*> &MaybeDead, Loop *L) {
  if (BasicBlock *Exit = L->getUniqueExitBlock()) {
    for (auto &Pair : NewPredicates) {
      Instruction *OldPred = Pair.first;
      Instruction *NewPred = Pair.second;

      for (auto &I : *Exit) {
        // A duplicate must have the same operands as well as the same
        // operation. isSameOperationAs() only compares opcode, types and
        // special state, so it would also match an unrelated compare of the
        // same type and wrongly replace it with the vctp.
        if (!I.isIdenticalTo(OldPred))
          continue;

        Instruction *PredClone = NewPred->clone();
        PredClone->insertBefore(&I);
        I.replaceAllUsesWith(PredClone);
        MaybeDead.insert(&I);
        // Only the first duplicate per predicate is handled.
        break;
      }
    }
  }

  // Drop references and add operands to check for dead.
  SmallPtrSet<Instruction*, 4> Dead;
  while (!MaybeDead.empty()) {
    Instruction *I = MaybeDead.pop_back_val();

    // Keep anything that still has a user. Also keep anything whose removal
    // would be observable: having no uses does not make an instruction with
    // side effects dead.
    if (!I->use_empty() || I->mayHaveSideEffects())
      continue;

    // The operands lose a user once I goes away, so they may be dead too.
    for (auto &U : I->operands()) {
      if (auto *OpI = dyn_cast<Instruction>(U))
        MaybeDead.insert(OpI);
    }

    // Erasure is deferred: references are dropped now so that operand use
    // counts are accurate for the rest of the worklist.
    I->dropAllReferences();
    Dead.insert(I);
  }

  for (auto *I : Dead)
    I->eraseFromParent();

  for (auto I : L->blocks())
    DeleteDeadPHIs(I);
}
bool MVETailPredication::TryConvert(Value *TripCount) {
if (!IsPredicatedVectorLoop())
return false;
@ -400,13 +449,14 @@ bool MVETailPredication::TryConvert(Value *TripCount) {
// operand is generated from an induction variable.
Module *M = L->getHeader()->getModule();
Type *Ty = IntegerType::get(M->getContext(), 32);
SmallPtrSet<Value*, 4> Predicates;
SetVector<Instruction*> Predicates;
DenseMap<Instruction*, Instruction*> NewPredicates;
for (auto *I : MaskedInsts) {
Intrinsic::ID ID = I->getIntrinsicID();
unsigned PredOp = ID == Intrinsic::masked_load ? 2 : 3;
Value *Predicate = I->getArgOperand(PredOp);
if (Predicates.count(Predicate))
auto *Predicate = dyn_cast<Instruction>(I->getArgOperand(PredOp));
if (!Predicate || Predicates.count(Predicate))
continue;
VectorType *VecTy = getVectorType(I);
@ -445,6 +495,7 @@ bool MVETailPredication::TryConvert(Value *TripCount) {
Value *Remaining = Builder.CreateSub(Processed, Factor);
Value *TailPredicate = Builder.CreateCall(VCTP, Remaining);
Predicate->replaceAllUsesWith(TailPredicate);
NewPredicates[Predicate] = cast<Instruction>(TailPredicate);
// Add the incoming value to the new phi.
Processed->addIncoming(Remaining, L->getLoopLatch());
@ -453,9 +504,8 @@ bool MVETailPredication::TryConvert(Value *TripCount) {
<< "TP: Inserted VCTP: " << *TailPredicate << "\n");
}
for (auto I : L->blocks())
DeleteDeadPHIs(I);
// Now clean up.
Cleanup(NewPredicates, Predicates, L);
return true;
}

View File

@ -0,0 +1,292 @@
; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -enable-arm-maskedldst=true -disable-mve-tail-predication=false --verify-machineinstrs %s -o - | FileCheck %s
; CHECK-LABEL: vpsel_mul_reduce_add
; CHECK: dls lr, lr
; CHECK: [[LOOP:.LBB[0-9_]+]]:
; CHECK: sub{{.*}} [[ELEMS:r[0-9]+]], #4
; CHECK: vctp.32 [[ELEMS]]
; CHECK: vstr p0, [sp
; CHECK: vpstt
; CHECK-NEXT: vldrwt.u32
; CHECK-NEXT: vldrwt.u32
; CHECK: vcmp.i32
; CHECK: vpsel
; CHECK: vldr p0, [sp
; CHECK: vpst
; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r0]
; CHECK: le lr, [[LOOP]]
; CHECK: vctp.32 [[ELEMS]]
; CHECK-NEXT: vpsel
; CHECK-NEXT: vaddv.u32
; Reduction loop with three masked loads (%a, %b, %c) that all use the tail
; predicate %tmp1 (lanes of %induction ule splat(%N - 1)). A select on
; (%index urem 16) == 0 picks the %b or %c load; the result is multiplied by
; the %a load and accumulated into %vec.phi. middle.block uses %tmp1 again to
; select between %add and %vec.phi before the add reduction, so the predicate
; has a user outside the loop body.
define dso_local i32 @vpsel_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c, i32 %N) {
entry:
%cmp8 = icmp eq i32 %N, 0
br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%tmp = getelementptr inbounds i32, i32* %a, i32 %index
%tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
%tmp2 = bitcast i32* %tmp to <4 x i32>*
%wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
%tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
%tmp4 = bitcast i32* %tmp3 to <4 x i32>*
%wide.masked.load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
%tmp5 = getelementptr inbounds i32, i32* %c, i32 %index
%tmp6 = bitcast i32* %tmp5 to <4 x i32>*
%wide.masked.load.c = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp6, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
%rem = urem i32 %index, 16
%rem.broadcast.splatinsert = insertelement <4 x i32> undef, i32 %rem, i32 0
%rem.broadcast.splat = shufflevector <4 x i32> %rem.broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%cmp = icmp eq <4 x i32> %rem.broadcast.splat, <i32 0, i32 0, i32 0, i32 0>
%wide.masked.load = select <4 x i1> %cmp, <4 x i32> %wide.masked.load.b, <4 x i32> %wide.masked.load.c
%mul = mul nsw <4 x i32> %wide.masked.load, %wide.masked.load.a
%add = add nsw <4 x i32> %mul, %vec.phi
%index.next = add i32 %index, 4
%tmp7 = icmp eq i32 %index.next, %n.vec
br i1 %tmp7, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%tmp8 = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi
%tmp9 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp8)
br label %for.cond.cleanup
for.cond.cleanup: ; preds = %middle.block, %entry
%res.0.lcssa = phi i32 [ 0, %entry ], [ %tmp9, %middle.block ]
ret i32 %res.0.lcssa
}
; CHECK-LABEL: vpsel_mul_reduce_add_2
; CHECK: dls lr, lr
; CHECK: [[LOOP:.LBB[0-9_]+]]:
; CHECK: sub{{.*}} [[ELEMS:r[0-9]+]], #4
; CHECK: vctp.32 [[ELEMS]]
; CHECK: vstr p0, [sp
; CHECK: vpstt
; CHECK-NEXT: vldrwt.u32
; CHECK-NEXT: vldrwt.u32
; CHECK: vsub
; CHECK: vpst
; CHECK-NEXT: vldrwt.u32
; CHECK: vcmp.i32
; CHECK: vpsel
; CHECK: vldr p0, [sp
; CHECK: vpst
; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r0]
; CHECK: le lr, [[LOOP]]
; CHECK: vctp.32 [[ELEMS]]
; CHECK-NEXT: vpsel
; CHECK-NEXT: vaddv.u32
; Variant of the select-based reduction with four masked loads (%a, %b, %c,
; %d), all predicated on %tmp1. %sub = (%c load) - (%d load); a select on
; (%index urem 16) == 0 picks %sub or the %b load, which is multiplied by the
; %a load and accumulated into %vec.phi. middle.block uses %tmp1 again to
; select between %add and %vec.phi before the add reduction.
define dso_local i32 @vpsel_mul_reduce_add_2(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b,
i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) {
entry:
%cmp8 = icmp eq i32 %N, 0
br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%tmp = getelementptr inbounds i32, i32* %a, i32 %index
%tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
%tmp2 = bitcast i32* %tmp to <4 x i32>*
%wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
%tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
%tmp4 = bitcast i32* %tmp3 to <4 x i32>*
%wide.masked.load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
%tmp5 = getelementptr inbounds i32, i32* %c, i32 %index
%tmp6 = bitcast i32* %tmp5 to <4 x i32>*
%wide.masked.load.c = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp6, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
%tmp7 = getelementptr inbounds i32, i32* %d, i32 %index
%tmp8 = bitcast i32* %tmp7 to <4 x i32>*
%wide.masked.load.d = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp8, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
%sub = sub <4 x i32> %wide.masked.load.c, %wide.masked.load.d
%rem = urem i32 %index, 16
%rem.broadcast.splatinsert = insertelement <4 x i32> undef, i32 %rem, i32 0
%rem.broadcast.splat = shufflevector <4 x i32> %rem.broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%cmp = icmp eq <4 x i32> %rem.broadcast.splat, <i32 0, i32 0, i32 0, i32 0>
%sel = select <4 x i1> %cmp, <4 x i32> %sub, <4 x i32> %wide.masked.load.b
%mul = mul <4 x i32> %sel, %wide.masked.load.a
%add = add <4 x i32> %mul, %vec.phi
%index.next = add i32 %index, 4
%cmp.exit = icmp eq i32 %index.next, %n.vec
br i1 %cmp.exit, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%acc = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi
%reduce = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %acc)
br label %for.cond.cleanup
for.cond.cleanup: ; preds = %middle.block, %entry
%res.0.lcssa = phi i32 [ 0, %entry ], [ %reduce, %middle.block ]
ret i32 %res.0.lcssa
}
; CHECK-LABEL: and_mul_reduce_add
; CHECK: dls lr, lr
; CHECK: [[LOOP:.LBB[0-9_]+]]:
; CHECK: sub{{.*}} [[ELEMS:r[0-9]+]],{{.*}}#4
; CHECK: vctp.32 [[ELEMS]]
; CHECK: vpstt
; CHECK-NEXT: vldrwt.u32
; CHECK-NEXT: vldrwt.u32
; CHECK: vpsttt
; CHECK-NEXT: vcmpt.i32 eq, {{.*}}, zr
; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r3]
; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r2]
; CHECK: le lr, [[LOOP]]
; CHECK: vctp.32 [[ELEMS]]
; CHECK: vpsel
; The tail predicate %tmp1 is combined with a data-dependent compare. The %a
; and %b loads are masked by %tmp1; %cmp tests (%a load - %b load) == 0; and
; %mask = and %cmp, %tmp1 predicates the %c and %d loads. Their product is
; accumulated into %vec.phi. middle.block uses %tmp1 (not %mask) to select
; between %add and %vec.phi before the add reduction.
define dso_local i32 @and_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b,
i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) {
entry:
%cmp8 = icmp eq i32 %N, 0
br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%tmp = getelementptr inbounds i32, i32* %a, i32 %index
%tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
%tmp2 = bitcast i32* %tmp to <4 x i32>*
%wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
%tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
%tmp4 = bitcast i32* %tmp3 to <4 x i32>*
%wide.masked.load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
%sub = sub <4 x i32> %wide.masked.load.a, %wide.masked.load.b
%cmp = icmp eq <4 x i32> %sub, <i32 0, i32 0, i32 0, i32 0>
%mask = and <4 x i1> %cmp, %tmp1
%tmp5 = getelementptr inbounds i32, i32* %c, i32 %index
%tmp6 = bitcast i32* %tmp5 to <4 x i32>*
%wide.masked.load.c = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp6, i32 4, <4 x i1> %mask, <4 x i32> undef)
%tmp7 = getelementptr inbounds i32, i32* %d, i32 %index
%tmp8 = bitcast i32* %tmp7 to <4 x i32>*
%wide.masked.load.d = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp8, i32 4, <4 x i1> %mask, <4 x i32> undef)
%mul = mul <4 x i32> %wide.masked.load.c, %wide.masked.load.d
%add = add <4 x i32> %mul, %vec.phi
%index.next = add i32 %index, 4
%cmp.exit = icmp eq i32 %index.next, %n.vec
br i1 %cmp.exit, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%acc = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi
%reduce = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %acc)
br label %for.cond.cleanup
for.cond.cleanup: ; preds = %middle.block, %entry
%res.0.lcssa = phi i32 [ 0, %entry ], [ %reduce, %middle.block ]
ret i32 %res.0.lcssa
}
; TODO: Why does p0 get reloaded from the stack into p0, just to be vmrs'd?
; CHECK-LABEL: or_mul_reduce_add
; CHECK: dls lr, lr
; CHECK: [[LOOP:.LBB[0-9_]+]]:
; CHECK: sub{{.*}} [[ELEMS:r[0-9]+]],{{.*}}#4
; CHECK: vctp.32 [[ELEMS]]
; CHECK: vstr p0, [sp
; CHECK: vpstt
; CHECK-NEXT: vldrwt.u32
; CHECK-NEXT: vldrwt.u32
; CHECK: vcmp.i32 eq, {{.*}}, zr
; CHECK: vmrs [[VCMP:r[0-9]+]], p0
; CHECK: vldr p0, [sp
; CHECK: vmrs [[VCTP:r[0-9]+]], p0
; CHECK: orr{{.*}} [[VCMP]], [[VCTP]]
; CHECK-NEXT: vmsr p0
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r3]
; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r2]
; CHECK: le lr, [[LOOP]]
; CHECK: vctp.32 [[ELEMS]]
; CHECK: vpsel
; Same shape as @and_mul_reduce_add, except the data-dependent compare is
; combined with the tail predicate using `or`: %mask = or %cmp, %tmp1. The %a
; and %b loads are masked by %tmp1, the %c and %d loads by %mask, and
; middle.block uses %tmp1 to select between %add and %vec.phi before the add
; reduction.
define dso_local i32 @or_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b,
i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) {
entry:
%cmp8 = icmp eq i32 %N, 0
br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%tmp = getelementptr inbounds i32, i32* %a, i32 %index
%tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
%tmp2 = bitcast i32* %tmp to <4 x i32>*
%wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
%tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
%tmp4 = bitcast i32* %tmp3 to <4 x i32>*
%wide.masked.load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
%sub = sub <4 x i32> %wide.masked.load.a, %wide.masked.load.b
%cmp = icmp eq <4 x i32> %sub, <i32 0, i32 0, i32 0, i32 0>
%mask = or <4 x i1> %cmp, %tmp1
%tmp5 = getelementptr inbounds i32, i32* %c, i32 %index
%tmp6 = bitcast i32* %tmp5 to <4 x i32>*
%wide.masked.load.c = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp6, i32 4, <4 x i1> %mask, <4 x i32> undef)
%tmp7 = getelementptr inbounds i32, i32* %d, i32 %index
%tmp8 = bitcast i32* %tmp7 to <4 x i32>*
%wide.masked.load.d = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp8, i32 4, <4 x i1> %mask, <4 x i32> undef)
%mul = mul <4 x i32> %wide.masked.load.c, %wide.masked.load.d
%add = add <4 x i32> %mul, %vec.phi
%index.next = add i32 %index, 4
%cmp.exit = icmp eq i32 %index.next, %n.vec
br i1 %cmp.exit, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%acc = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi
%reduce = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %acc)
br label %for.cond.cleanup
for.cond.cleanup: ; preds = %middle.block, %entry
%res.0.lcssa = phi i32 [ 0, %entry ], [ %reduce, %middle.block ]
ret i32 %res.0.lcssa
}
; Function Attrs: argmemonly nounwind readonly willreturn
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
; Function Attrs: nounwind readnone willreturn
declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)

View File

@ -0,0 +1,242 @@
; RUN: llc -mtriple=armv8.1m.main -mattr=+mve -enable-arm-maskedldst=true -disable-mve-tail-predication=false --verify-machineinstrs %s -o - | FileCheck %s
; CHECK-LABEL: mul_reduce_add
; CHECK: dls lr,
; CHECK: [[LOOP:.LBB[0-9_]+]]:
; CHECK: sub{{.*}} [[ELEMS:r[0-9]+]], #4
; CHECK: vctp.32 [[ELEMS]]
; CHECK: vpstt
; CHECK-NEXT: vldrwt.u32
; CHECK-NEXT: vldrwt.u32
; CHECK: le lr, [[LOOP]]
; CHECK: vctp.32 [[ELEMS]]
; CHECK: vpsel
; CHECK: vaddv.u32 r0
; Basic multiply-accumulate reduction: two masked loads (%a, %b) predicated on
; %1 (lanes of %induction ule splat(%N - 1)) are multiplied and accumulated
; into %vec.phi. middle.block uses %1 again to select between %6 and %vec.phi
; before the add reduction.
define dso_local i32 @mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
entry:
%cmp8 = icmp eq i32 %N, 0
br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %6, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds i32, i32* %a, i32 %index
%1 = icmp ule <4 x i32> %induction, %broadcast.splat12
%2 = bitcast i32* %0 to <4 x i32>*
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
%3 = getelementptr inbounds i32, i32* %b, i32 %index
%4 = bitcast i32* %3 to <4 x i32>*
%wide.masked.load13 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %4, i32 4, <4 x i1> %1, <4 x i32> undef)
%5 = mul nsw <4 x i32> %wide.masked.load13, %wide.masked.load
%6 = add nsw <4 x i32> %5, %vec.phi
%index.next = add i32 %index, 4
%7 = icmp eq i32 %index.next, %n.vec
br i1 %7, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%8 = select <4 x i1> %1, <4 x i32> %6, <4 x i32> %vec.phi
%9 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %8)
br label %for.cond.cleanup
for.cond.cleanup: ; preds = %middle.block, %entry
%res.0.lcssa = phi i32 [ 0, %entry ], [ %9, %middle.block ]
ret i32 %res.0.lcssa
}
; Function Attrs: norecurse nounwind readonly
; Reduction over a single masked load of %a, predicated on %1 and accumulated
; into %vec.phi with an add. middle.block uses %1 again to select between %3
; and %vec.phi before the add reduction.
; NOTE(review): no FileCheck directives precede this function, so its output
; is not verified. Despite the name, the body performs no multiply and never
; references %b; its instruction sequence matches @add_reduce_add_const.
; Confirm whether a multiply by a splat of %b was intended.
define dso_local i32 @mul_reduce_add_const(i32* noalias nocapture readonly %a, i32 %b, i32 %N) {
entry:
%cmp6 = icmp eq i32 %N, 0
br i1 %cmp6, label %for.cond.cleanup, label %vector.ph
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert9 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat10 = shufflevector <4 x i32> %broadcast.splatinsert9, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds i32, i32* %a, i32 %index
%1 = icmp ule <4 x i32> %induction, %broadcast.splat10
%2 = bitcast i32* %0 to <4 x i32>*
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
%3 = add nsw <4 x i32> %wide.masked.load, %vec.phi
%index.next = add i32 %index, 4
%4 = icmp eq i32 %index.next, %n.vec
br i1 %4, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%5 = select <4 x i1> %1, <4 x i32> %3, <4 x i32> %vec.phi
%6 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %5)
br label %for.cond.cleanup
for.cond.cleanup: ; preds = %middle.block, %entry
%res.0.lcssa = phi i32 [ 0, %entry ], [ %6, %middle.block ]
ret i32 %res.0.lcssa
}
; CHECK-LABEL: add_reduce_add_const
; CHECK: dls lr, lr
; CHECK: [[LOOP:.LBB[0-9_]+]]:
; CHECK: subs [[ELEMS:r[0-9]+]], #4
; CHECK: vctp.32 [[ELEMS]]
; CHECK: vpst
; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r0]
; CHECK: vadd.i32
; CHECK: le lr, [[LOOP]]
; CHECK: vctp.32 [[ELEMS]]
; CHECK: vpsel
; Reduction over a single masked load of %a, predicated on %1 (lanes of
; %induction ule splat(%N - 1)) and accumulated into %vec.phi with an add.
; middle.block uses %1 again to select between %3 and %vec.phi before the add
; reduction. The %b argument is not referenced in the body.
define dso_local i32 @add_reduce_add_const(i32* noalias nocapture readonly %a, i32 %b, i32 %N) {
entry:
%cmp6 = icmp eq i32 %N, 0
br i1 %cmp6, label %for.cond.cleanup, label %vector.ph
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert9 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat10 = shufflevector <4 x i32> %broadcast.splatinsert9, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds i32, i32* %a, i32 %index
%1 = icmp ule <4 x i32> %induction, %broadcast.splat10
%2 = bitcast i32* %0 to <4 x i32>*
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
%3 = add nsw <4 x i32> %wide.masked.load, %vec.phi
%index.next = add i32 %index, 4
%4 = icmp eq i32 %index.next, %n.vec
br i1 %4, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%5 = select <4 x i1> %1, <4 x i32> %3, <4 x i32> %vec.phi
%6 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %5)
br label %for.cond.cleanup
for.cond.cleanup: ; preds = %middle.block, %entry
%res.0.lcssa = phi i32 [ 0, %entry ], [ %6, %middle.block ]
ret i32 %res.0.lcssa
}
; CHECK-LABEL: vector_mul_const
; CHECK: dls lr, lr
; CHECK: [[LOOP:.LBB[0-9_]+]]:
; CHECK: subs [[ELEMS:r[0-9]+]], #4
; CHECK: vctp.32 [[ELEMS]]
; CHECK: vpst
; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r1]
; CHECK: vmul.i32
; CHECK: vpst
; CHECK-NEXT: vstrwt.32 q{{.*}}, [r0]
; CHECK: le lr, [[LOOP]]
; Non-reduction loop: a masked load of %b is multiplied by a splat of %c and
; written to %a with a masked store. Both the load and the store use the tail
; predicate %1. The loop branches straight to for.cond.cleanup, so the
; predicate has no user outside vector.body.
define dso_local void @vector_mul_const(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %c, i32 %N) {
entry:
%cmp6 = icmp eq i32 %N, 0
br i1 %cmp6, label %for.cond.cleanup, label %vector.ph
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
%broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %c, i32 0
%broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds i32, i32* %b, i32 %index
%1 = icmp ule <4 x i32> %induction, %broadcast.splat9
%2 = bitcast i32* %0 to <4 x i32>*
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
%3 = mul nsw <4 x i32> %wide.masked.load, %broadcast.splat11
%4 = getelementptr inbounds i32, i32* %a, i32 %index
%5 = bitcast i32* %4 to <4 x i32>*
call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %5, i32 4, <4 x i1> %1)
%index.next = add i32 %index, 4
%6 = icmp eq i32 %index.next, %n.vec
br i1 %6, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
; CHECK-LABEL: vector_add_const
; CHECK: dls lr, lr
; CHECK: [[LOOP:.LBB[0-9_]+]]:
; CHECK: subs [[ELEMS:r[0-9]+]], #4
; CHECK: vctp.32 [[ELEMS]]
; CHECK: vpst
; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r1]
; CHECK: vadd.i32
; CHECK: vpst
; CHECK-NEXT: vstrwt.32 q{{.*}}, [r0]
; CHECK: le lr, [[LOOP]]
; Non-reduction loop: a masked load of %b has a splat of %c added to it and is
; written to %a with a masked store. Both the load and the store use the tail
; predicate %1. The loop branches straight to for.cond.cleanup, so the
; predicate has no user outside vector.body.
define dso_local void @vector_add_const(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %c, i32 %N) {
entry:
%cmp6 = icmp eq i32 %N, 0
br i1 %cmp6, label %for.cond.cleanup, label %vector.ph
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
%broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %c, i32 0
%broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds i32, i32* %b, i32 %index
%1 = icmp ule <4 x i32> %induction, %broadcast.splat9
%2 = bitcast i32* %0 to <4 x i32>*
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
%3 = add nsw <4 x i32> %wide.masked.load, %broadcast.splat11
%4 = getelementptr inbounds i32, i32* %a, i32 %index
%5 = bitcast i32* %4 to <4 x i32>*
call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %5, i32 4, <4 x i1> %1)
%index.next = add i32 %index, 4
%6 = icmp eq i32 %index.next, %n.vec
br i1 %6, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #4
declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)

View File

@ -0,0 +1,75 @@
; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -disable-mve-tail-predication=false -mattr=+mve %s -S -o - | FileCheck %s
; CHECK-LABEL: vec_mul_reduce_add
; CHECK: vector.body:
; CHECK-NOT: phi i32 [ 0, %vector.ph ]
; CHECK: [[ELTS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[SUB:%[^ ]+]], %vector.body ]
; CHECK: [[SUB]] = sub i32 [[ELTS]], 4
; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[SUB]])
; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]]
; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]],
; CHECK: middle.block:
; CHECK: [[VCTP_CLONE:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[SUB]])
; CHECK: [[VPSEL:%[^ ]+]] = select <4 x i1> [[VCTP_CLONE]],
; CHECK: call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[VPSEL]])
; IR-level input for the tail-predication pass. The loop count is set up with
; @llvm.set.loop.iterations and decremented with @llvm.loop.decrement.reg.
; The predicate %7 in vector.body masks both loads, and middle.block contains
; %12, a second icmp with the same operands as %7, feeding the select before
; the add reduction. This gives the exit block a duplicate of the predicate
; rather than a direct use of %7.
define i32 @vec_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
entry:
%cmp8 = icmp eq i32 %N, 0
%0 = add i32 %N, 3
%1 = lshr i32 %0, 2
%2 = shl nuw i32 %1, 2
%3 = add i32 %2, -4
%4 = lshr i32 %3, 2
%5 = add nuw nsw i32 %4, 1
br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
vector.ph: ; preds = %entry
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
call void @llvm.set.loop.iterations.i32(i32 %5)
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%lsr.iv2 = phi i32* [ %scevgep3, %vector.body ], [ %a, %vector.ph ]
%lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %b, %vector.ph ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %9, %vector.body ]
%6 = phi i32 [ %5, %vector.ph ], [ %10, %vector.body ]
%lsr.iv24 = bitcast i32* %lsr.iv2 to <4 x i32>*
%lsr.iv1 = bitcast i32* %lsr.iv to <4 x i32>*
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%7 = icmp ule <4 x i32> %induction, %broadcast.splat12
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv24, i32 4, <4 x i1> %7, <4 x i32> undef)
%wide.masked.load13 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1, i32 4, <4 x i1> %7, <4 x i32> undef)
%8 = mul nsw <4 x i32> %wide.masked.load13, %wide.masked.load
%9 = add nsw <4 x i32> %8, %vec.phi
%index.next = add i32 %index, 4
%scevgep = getelementptr i32, i32* %lsr.iv, i32 4
%scevgep3 = getelementptr i32, i32* %lsr.iv2, i32 4
%10 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %6, i32 1)
%11 = icmp ne i32 %10, 0
br i1 %11, label %vector.body, label %middle.block
middle.block: ; preds = %vector.body
%12 = icmp ule <4 x i32> %induction, %broadcast.splat12
%13 = select <4 x i1> %12, <4 x i32> %9, <4 x i32> %vec.phi
%14 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %13)
br label %for.cond.cleanup
for.cond.cleanup: ; preds = %middle.block, %entry
%res.0.lcssa = phi i32 [ 0, %entry ], [ %14, %middle.block ]
ret i32 %res.0.lcssa
}
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
declare void @llvm.set.loop.iterations.i32(i32)
declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)

View File

@ -4,8 +4,8 @@
define void @vctp8(i32 %arg, <16 x i8> *%in, <16 x i8>* %out) {
; CHECK-LABEL: vctp8:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: vctp.8 r0
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: vpsel q0, q1, q0
; CHECK-NEXT: vstrw.32 q0, [r2]
@ -20,8 +20,8 @@ define void @vctp8(i32 %arg, <16 x i8> *%in, <16 x i8>* %out) {
define void @vctp16(i32 %arg, <8 x i16> *%in, <8 x i16>* %out) {
; CHECK-LABEL: vctp16:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: vctp.16 r0
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: vpsel q0, q1, q0
; CHECK-NEXT: vstrw.32 q0, [r2]
@ -36,8 +36,8 @@ define void @vctp16(i32 %arg, <8 x i16> *%in, <8 x i16>* %out) {
define void @vctp32(i32 %arg, <4 x i32> *%in, <4 x i32>* %out) {
; CHECK-LABEL: vctp32:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: vctp.32 r0
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: vpsel q0, q1, q0
; CHECK-NEXT: vstrw.32 q0, [r2]