[LV] Fix incorrectly marking a pointer indvar as 'scalar'.

collectLoopScalars should only add non-uniform nodes to the list if they
are used by a load/store instruction that is marked as CM_Scalarize.

Before this patch, the LV incorrectly marked pointer induction variables
as 'scalar' when they required to be widened by something else,
such as a compare instruction, and weren't used by a node marked as
'CM_Scalarize'. This case is covered by sve-widen-phi.ll.

This change also allows removing some code where the LV tried to
widen the PHI nodes with a stepvector, even though it was marked as
'scalarAfterVectorization'. Now that this code is more careful about
marking instructions that need widening as 'scalar', this code has
become redundant.

Differential Revision: https://reviews.llvm.org/D114373
This commit is contained in:
Sander de Smalen 2021-11-21 20:07:51 +00:00
parent a9f837bbf0
commit 28a4deab92
6 changed files with 235 additions and 199 deletions

View File

@ -1915,9 +1915,11 @@ private:
/// Collect the instructions that are scalar after vectorization. An
/// instruction is scalar if it is known to be uniform or will be scalarized
/// during vectorization. Non-uniform scalarized instructions will be
/// represented by VF values in the vectorized loop, each corresponding to an
/// iteration of the original scalar loop.
/// during vectorization. collectLoopScalars should only add non-uniform nodes
/// to the list if they are used by a load/store instruction that is marked as
/// CM_Scalarize. Non-uniform scalarized instructions will be represented by
/// VF values in the vectorized loop, each corresponding to an iteration of
/// the original scalar loop.
void collectLoopScalars(ElementCount VF);
/// Keeps cost model vectorization decision and cost for instructions.
@ -4862,38 +4864,14 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
// iteration. If the instruction is uniform, we only need to generate the
// first lane. Otherwise, we generate all VF values.
bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF);
unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue();
bool NeedsVectorIndex = !IsUniform && VF.isScalable();
Value *UnitStepVec = nullptr, *PtrIndSplat = nullptr;
if (NeedsVectorIndex) {
Type *VecIVTy = VectorType::get(PtrInd->getType(), VF);
UnitStepVec = Builder.CreateStepVector(VecIVTy);
PtrIndSplat = Builder.CreateVectorSplat(VF, PtrInd);
}
assert((IsUniform || !State.VF.isScalable()) &&
"Cannot scalarize a scalable VF");
unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
for (unsigned Part = 0; Part < UF; ++Part) {
Value *PartStart =
createStepForVF(Builder, PtrInd->getType(), VF, Part);
if (NeedsVectorIndex) {
// Here we cache the whole vector, which means we can support the
// extraction of any lane. However, in some cases the extractelement
// instruction that is generated for scalar uses of this vector (e.g.
// a load instruction) is not folded away. Therefore we still
// calculate values for the first n lanes to avoid redundant moves
// (when extracting the 0th element) and to produce scalar code (i.e.
// additional add/gep instructions instead of expensive extractelement
// instructions) when extracting higher-order elements.
Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart);
Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec);
Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices);
Value *SclrGep =
emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II);
SclrGep->setName("next.gep");
State.set(PhiR, SclrGep, Part);
}
for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
Value *Idx = Builder.CreateAdd(
PartStart, ConstantInt::get(PtrInd->getType(), Lane));
@ -5229,38 +5207,11 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
!TheLoop->isLoopInvariant(V);
};
auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
if (!isa<PHINode>(Ptr) ||
!Legal->getInductionVars().count(cast<PHINode>(Ptr)))
return false;
auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
return false;
return isScalarUse(MemAccess, Ptr);
};
// A helper that evaluates a memory access's use of a pointer. If the
// pointer is actually the pointer induction of a loop, it is being
// inserted into Worklist. If the use will be a scalar use, and the
// pointer is only used by memory accesses, we place the pointer in
// ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs.
// A helper that evaluates a memory access's use of a pointer. If the use will
// be a scalar use and the pointer is only used by memory accesses, we place
// the pointer in ScalarPtrs. Otherwise, the pointer is placed in
// PossibleNonScalarPtrs.
auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
if (isScalarPtrInduction(MemAccess, Ptr)) {
Worklist.insert(cast<Instruction>(Ptr));
LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
<< "\n");
Instruction *Update = cast<Instruction>(
cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
// If there is more than one user of Update (Ptr), we shouldn't assume it
// will be scalar after vectorisation as other users of the instruction
// may require widening. Otherwise, add it to ScalarPtrs.
if (Update->hasOneUse() && cast<Value>(*Update->user_begin()) == Ptr) {
ScalarPtrs.insert(Update);
return;
}
}
// We only care about bitcast and getelementptr instructions contained in
// the loop.
if (!isLoopVaryingBitCastOrGEP(Ptr))
@ -5352,11 +5303,22 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
continue;
// Returns true if \p Indvar is a pointer induction that is used directly by
// load/store instruction \p I.
auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
Instruction *I) {
return Induction.second.getKind() ==
InductionDescriptor::IK_PtrInduction &&
(isa<LoadInst>(I) || isa<StoreInst>(I)) &&
Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
};
// Determine if all users of the induction variable are scalar after
// vectorization.
auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
auto *I = cast<Instruction>(U);
return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
IsDirectLoadStoreFromPtrIndvar(Ind, I);
});
if (!ScalarInd)
continue;
@ -5366,7 +5328,8 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
auto ScalarIndUpdate =
llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
auto *I = cast<Instruction>(U);
return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
});
if (!ScalarIndUpdate)
continue;

View File

@ -90,7 +90,7 @@ for.end:
;
; Same as predicate_store except we use a pointer PHI to maintain the address
;
; CHECK: Found new scalar instruction: %addr = phi i32* [ %a, %entry ], [ %addr.next, %for.inc ]
; CHECK: Found scalar instruction: %addr = phi i32* [ %a, %entry ], [ %addr.next, %for.inc ]
; CHECK: Found scalar instruction: %addr.next = getelementptr inbounds i32, i32* %addr, i64 1
; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %addr, align 4
; CHECK: Found an estimated cost of 0 for VF 2 For instruction: %addr = phi i32* [ %a, %entry ], [ %addr.next, %for.inc ]

View File

@ -41,35 +41,39 @@ define void @pointer_induction_used_as_vector(i8** noalias %start.1, i8* noalias
; CHECK-NEXT: [[IND_END3:%.*]] = getelementptr i8, i8* [[START_2:%.*]], i64 [[N_VEC]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i8* [ [[START_2]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8*, i8** [[START_1]], i64 [[TMP5]]
; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[INDEX]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 2
; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
; CHECK-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP7]], 0
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP10]], i32 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT]], [[TMP7]]
; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, i8* [[START_2]], <vscale x 2 x i64> [[TMP8]]
; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, i8* [[START_2]], i64 [[TMP9]]
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 1
; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, i8* [[START_2]], i64 [[TMP10]]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, <vscale x 2 x i8*> [[NEXT_GEP4]], i64 1
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8*, i8** [[NEXT_GEP]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8** [[TMP12]] to <vscale x 2 x i8*>*
; CHECK-NEXT: store <vscale x 2 x i8*> [[TMP11]], <vscale x 2 x i8*>* [[TMP13]], align 8
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, i8* [[NEXT_GEP5]], i32 0
; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <vscale x 2 x i8>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, <vscale x 2 x i8>* [[TMP15]], align 1
; CHECK-NEXT: [[TMP16:%.*]] = add <vscale x 2 x i8> [[WIDE_LOAD]], shufflevector (<vscale x 2 x i8> insertelement (<vscale x 2 x i8> poison, i8 1, i32 0), <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP14]] to <vscale x 2 x i8>*
; CHECK-NEXT: store <vscale x 2 x i8> [[TMP16]], <vscale x 2 x i8>* [[TMP17]], align 1
; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]]
; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-NEXT: [[TMP11:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
; CHECK-NEXT: [[TMP12:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT]], [[TMP11]]
; CHECK-NEXT: [[VECTOR_GEP:%.*]] = mul <vscale x 2 x i64> [[TMP12]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, i8* [[POINTER_PHI]], <vscale x 2 x i64> [[VECTOR_GEP]]
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, <vscale x 2 x i8*> [[TMP13]], i64 1
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8*, i8** [[NEXT_GEP]], i32 0
; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8** [[TMP15]] to <vscale x 2 x i8*>*
; CHECK-NEXT: store <vscale x 2 x i8*> [[TMP14]], <vscale x 2 x i8*>* [[TMP16]], align 8
; CHECK-NEXT: [[TMP17:%.*]] = extractelement <vscale x 2 x i8*> [[TMP13]], i32 0
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[TMP17]], i32 0
; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8* [[TMP18]] to <vscale x 2 x i8>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, <vscale x 2 x i8>* [[TMP19]], align 1
; CHECK-NEXT: [[TMP20:%.*]] = add <vscale x 2 x i8> [[WIDE_LOAD]], shufflevector (<vscale x 2 x i8> insertelement (<vscale x 2 x i8> poison, i8 1, i32 0), <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8* [[TMP18]] to <vscale x 2 x i8>*
; CHECK-NEXT: store <vscale x 2 x i8> [[TMP20]], <vscale x 2 x i8>* [[TMP21]], align 1
; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP23]]
; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, i8* [[POINTER_PHI]], i64 [[TMP9]]
; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@ -119,32 +123,66 @@ exit: ; preds = %loop.body
define void @pointer_induction(i8* noalias %start, i64 %N) {
; CHECK-LABEL: @pointer_induction(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1
; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 2
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i8*> poison, i8* [[START:%.*]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, i8* [[START:%.*]], i64 [[N_VEC]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i8*> poison, i8* [[START]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i8*> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i8*> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i8* [ [[START]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[INDEX1]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2
; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 1
; CHECK-NEXT: [[TMP8:%.*]] = mul i64 1, [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP6]], 0
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP9]], i32 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT]], [[TMP6]]
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[START]], <vscale x 2 x i64> [[TMP7]]
; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], 0
; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, i8* [[START]], i64 [[TMP8]]
; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 1
; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, i8* [[START]], i64 [[TMP9]]
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], 0
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[NEXT_GEP3]], i32 0
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP11]] to <vscale x 2 x i8>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, <vscale x 2 x i8>* [[TMP12]], align 1
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, <vscale x 2 x i8*> [[NEXT_GEP]], i64 1
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq <vscale x 2 x i8*> [[TMP13]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], [[TMP16]]
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
; CHECK-NEXT: [[TMP11:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT]], [[TMP10]]
; CHECK-NEXT: [[VECTOR_GEP:%.*]] = mul <vscale x 2 x i64> [[TMP11]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[POINTER_PHI]], <vscale x 2 x i64> [[VECTOR_GEP]]
; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX1]], 0
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <vscale x 2 x i8*> [[TMP12]], i32 0
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, i8* [[TMP14]], i32 0
; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8* [[TMP15]] to <vscale x 2 x i8>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, <vscale x 2 x i8>* [[TMP16]], align 1
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, <vscale x 2 x i8*> [[TMP12]], i64 1
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq <vscale x 2 x i8*> [[TMP17]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], [[TMP20]]
; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, i8* [[POINTER_PHI]], i64 [[TMP8]]
; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY:%.*]] ]
; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[PTR_PHI:%.*]] = phi i8* [ [[PTR_PHI_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[INDEX_NXT]] = add i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP22:%.*]] = load i8, i8* [[PTR_PHI]], align 1
; CHECK-NEXT: [[PTR_PHI_NEXT]] = getelementptr inbounds i8, i8* [[PTR_PHI]], i64 1
; CHECK-NEXT: [[CMP_I_NOT:%.*]] = icmp eq i8* [[PTR_PHI_NEXT]], [[START]]
; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDEX]], [[N]]
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[END]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: end:
; CHECK-NEXT: ret void
;
entry:
br label %for.body

View File

@ -125,25 +125,26 @@ for.cond.cleanup: ; preds = %for.body
define i32 @pointer_iv_mixed(i32* noalias %a, i32** noalias %b, i64 %n) #0 {
; CHECK-LABEL: @pointer_iv_mixed(
; CHECK: vector.body:
; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i32* [ %a, %vector.ph ], [ [[PTR_IND:%.*]], %vector.body ]
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ insertelement (<vscale x 2 x i32> zeroinitializer, i32 0, i32 0), %vector.ph ], [ [[TMP7:%.*]], %vector.body ]
; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[INDEX]], i32 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT]], [[TMP4]]
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i32, i32* %a, <vscale x 2 x i64> [[TMP5]]
; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i32, i32* %a, i64 [[INDEX]]
; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i32*, i32** %b, i64 [[INDEX]]
; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[NEXT_GEP4]] to <vscale x 2 x i32>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i32>, <vscale x 2 x i32>* [[TMP6]], align 8
; CHECK-NEXT: [[TMP7]] = add <vscale x 2 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32** [[NEXT_GEP6]] to <vscale x 2 x i32*>*
; CHECK-NEXT: store <vscale x 2 x i32*> [[NEXT_GEP]], <vscale x 2 x i32*>* [[TMP8]], align 8
; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP10:%.*]] = shl nuw nsw i64 [[TMP9]], 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], {{.*}}
; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label %vector.body, !llvm.loop [[LOOP7:![0-9]+]]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ insertelement (<vscale x 2 x i32> zeroinitializer, i32 0, i32 0), %vector.ph ], [ [[TMP9:%.*]], %vector.body ]
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 1
; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[POINTER_PHI]], <vscale x 2 x i64> [[TMP6]]
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i32*, i32** %b, i64 [[INDEX]]
; CHECK-NEXT: [[BC:%.*]] = bitcast <vscale x 2 x i32*> [[TMP7]] to <vscale x 2 x <vscale x 2 x i32>*>
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <vscale x 2 x <vscale x 2 x i32>*> [[BC]], i32 0
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i32>, <vscale x 2 x i32>* [[TMP8]], align 8
; CHECK-NEXT: [[TMP9]] = add <vscale x 2 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32** [[NEXT_GEP]] to <vscale x 2 x i32*>*
; CHECK-NEXT: store <vscale x 2 x i32*> [[TMP7]], <vscale x 2 x i32*>* [[TMP10]], align 8
; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], {{.*}}
; CHECK-NEXT: [[PTR_IND]] = getelementptr i32, i32* [[POINTER_PHI]], i64 [[TMP5]]
; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label %vector.body, !llvm.loop [[LOOP7:![0-9]+]]
entry:
br label %for.body
@ -166,7 +167,51 @@ for.end:
ret i32 %tmp5
}
define void @phi_used_in_vector_compare_and_scalar_indvar_update_and_store(i16* %ptr) #0 {
; CHECK-LABEL: @phi_used_in_vector_compare_and_scalar_indvar_update_and_store(
; CHECK: vector.body:
; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i16* [ %ptr, %vector.ph ], [ [[PTR_IND:%.*]], %vector.body ]
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 1
; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i16, i16* [[POINTER_PHI]], <vscale x 2 x i64> [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <vscale x 2 x i16*> [[TMP5]], zeroinitializer
; CHECK-NEXT: [[BC:%.*]] = bitcast <vscale x 2 x i16*> [[TMP5]] to <vscale x 2 x <vscale x 2 x i16>*>
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <vscale x 2 x <vscale x 2 x i16>*> [[BC]], i32 0
; CHECK-NEXT: call void @llvm.masked.store.nxv2i16.p0nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16>* [[TMP7]], i32 2, <vscale x 2 x i1> [[TMP6]])
; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[TMP8]], 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], {{.*}}
; CHECK-NEXT: [[PTR_IND]] = getelementptr i16, i16* [[POINTER_PHI]], i64 [[TMP3]]
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label %vector.body, !llvm.loop [[LOOP9:![0-9]+]]
entry:
br label %for.body
for.body: ; preds = %if.end, %entry
%iv = phi i64 [ %inc, %if.end ], [ 0, %entry ]
%iv.ptr = phi i16* [ %incdec.iv.ptr, %if.end ], [ %ptr, %entry ]
%cmp.i = icmp ne i16* %iv.ptr, null
br i1 %cmp.i, label %if.end.sink.split, label %if.end
if.end.sink.split: ; preds = %for.body
store i16 0, i16* %iv.ptr, align 2
br label %if.end
if.end: ; preds = %if.end.sink.split, %for.body
%incdec.iv.ptr = getelementptr inbounds i16, i16* %iv.ptr, i64 1
%inc = add nuw nsw i64 %iv, 1
%exitcond.not = icmp ult i64 %inc, 1024
br i1 %exitcond.not, label %for.body, label %for.end, !llvm.loop !6
for.end: ; preds = %if.end, %for.end
%iv.ptr.1.lcssa = phi i16* [ %incdec.iv.ptr, %if.end ]
ret void
}
attributes #0 = { vscale_range(0, 16) }
!0 = distinct !{!0, !1, !2, !3, !4, !5}
!1 = !{!"llvm.loop.mustprogress"}
!2 = !{!"llvm.loop.vectorize.width", i32 4}

View File

@ -399,23 +399,13 @@ for.end:
; CHECK-NOT: LV: Found uniform instruction: %p = phi i32* [ %tmp3, %for.body ], [ %a, %entry ]
; CHECK: LV: Found uniform instruction: %q = phi i32** [ %tmp4, %for.body ], [ %b, %entry ]
; CHECK: vector.body
; CHECK: %pointer.phi = phi i32* [ %a, %vector.ph ], [ %ptr.ind, %vector.body ]
; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; CHECK: %next.gep = getelementptr i32, i32* %a, i64 %index
; CHECK: %[[I1:.+]] = or i64 %index, 1
; CHECK: %next.gep10 = getelementptr i32, i32* %a, i64 %[[I1]]
; CHECK: %[[I2:.+]] = or i64 %index, 2
; CHECK: %next.gep11 = getelementptr i32, i32* %a, i64 %[[I2]]
; CHECK: %[[I3:.+]] = or i64 %index, 3
; CHECK: %next.gep12 = getelementptr i32, i32* %a, i64 %[[I3]]
; CHECK: %[[V0:.+]] = insertelement <4 x i32*> poison, i32* %next.gep, i32 0
; CHECK: %[[V1:.+]] = insertelement <4 x i32*> %[[V0]], i32* %next.gep10, i32 1
; CHECK: %[[V2:.+]] = insertelement <4 x i32*> %[[V1]], i32* %next.gep11, i32 2
; CHECK: %[[V3:.+]] = insertelement <4 x i32*> %[[V2]], i32* %next.gep12, i32 3
; CHECK-NOT: getelementptr
; CHECK: %next.gep13 = getelementptr i32*, i32** %b, i64 %index
; CHECK-NOT: getelementptr
; CHECK: %[[B0:.+]] = bitcast i32** %next.gep13 to <4 x i32*>*
; CHECK: store <4 x i32*> %[[V3]], <4 x i32*>* %[[B0]], align 8
; CHECK: %[[PTRVEC:.+]] = getelementptr i32, i32* %pointer.phi, <4 x i64> <i64 0, i64 1, i64 2, i64 3>
; CHECK: %next.gep = getelementptr i32*, i32** %b, i64 %index
; CHECK: %[[NEXTGEPBC:.+]] = bitcast i32** %next.gep to <4 x i32*>*
; CHECK: store <4 x i32*> %[[PTRVEC]], <4 x i32*>* %[[NEXTGEPBC]], align 8
; CHECK: %ptr.ind = getelementptr i32, i32* %pointer.phi, i64 4
; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
;
define i32 @pointer_iv_mixed(i32* %a, i32** %b, i64 %n) {

View File

@ -21,13 +21,13 @@ define void @a(i8* readnone %b) {
; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, i8* null, i64 [[TMP1]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i8* [ null, [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[PRED_STORE_CONTINUE7:%.*]] ]
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE7]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* [[POINTER_PHI]], <4 x i64> <i64 0, i64 -1, i64 -2, i64 -3>
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, <4 x i8*> [[TMP2]], i64 -1
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i8*> [[TMP3]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[TMP4]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[TMP5]], i32 -3
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE10:%.*]] ]
; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], -1
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, i8* null, i64 [[TMP3]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP]], i64 -1
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP5]], i32 -3
; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1
; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@ -36,35 +36,43 @@ define void @a(i8* readnone %b) {
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP9]], i32 0
; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; CHECK: pred.store.if:
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i8*> [[TMP3]], i32 0
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP]], i64 -1
; CHECK-NEXT: store i8 95, i8* [[TMP11]], align 1
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]]
; CHECK: pred.store.continue:
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP9]], i32 1
; CHECK-NEXT: br i1 [[TMP12]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3:%.*]]
; CHECK: pred.store.if2:
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i8*> [[TMP3]], i32 1
; CHECK-NEXT: store i8 95, i8* [[TMP13]], align 1
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE3]]
; CHECK: pred.store.continue3:
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP9]], i32 2
; CHECK-NEXT: br i1 [[TMP14]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]]
; CHECK: pred.store.if4:
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i8*> [[TMP3]], i32 2
; CHECK-NEXT: br i1 [[TMP12]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]]
; CHECK: pred.store.if5:
; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], -1
; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, i8* null, i64 [[TMP14]]
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP2]], i64 -1
; CHECK-NEXT: store i8 95, i8* [[TMP15]], align 1
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE5]]
; CHECK: pred.store.continue5:
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP9]], i32 3
; CHECK-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7]]
; CHECK: pred.store.if6:
; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i8*> [[TMP3]], i32 3
; CHECK-NEXT: store i8 95, i8* [[TMP17]], align 1
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE7]]
; CHECK: pred.store.continue7:
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]]
; CHECK: pred.store.continue6:
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP9]], i32 2
; CHECK-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
; CHECK: pred.store.if7:
; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], -1
; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, i8* null, i64 [[TMP18]]
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP3]], i64 -1
; CHECK-NEXT: store i8 95, i8* [[TMP19]], align 1
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]]
; CHECK: pred.store.continue8:
; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP9]], i32 3
; CHECK-NEXT: br i1 [[TMP20]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10]]
; CHECK: pred.store.if9:
; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 3
; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], -1
; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, i8* null, i64 [[TMP22]]
; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP4]], i64 -1
; CHECK-NEXT: store i8 95, i8* [[TMP23]], align 1
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE10]]
; CHECK: pred.store.continue10:
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, i8* [[POINTER_PHI]], i64 -4
; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@ -78,8 +86,8 @@ define void @a(i8* readnone %b) {
; CHECK: for.body:
; CHECK-NEXT: [[C_05:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[IF_END:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[C_05]], i64 -1
; CHECK-NEXT: [[TMP19:%.*]] = load i8, i8* [[INCDEC_PTR]], align 1
; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP19]], 0
; CHECK-NEXT: [[TMP25:%.*]] = load i8, i8* [[INCDEC_PTR]], align 1
; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP25]], 0
; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END]], label [[IF_THEN:%.*]]
; CHECK: if.then:
; CHECK-NEXT: store i8 95, i8* [[INCDEC_PTR]], align 1
@ -134,35 +142,27 @@ define void @pointer_induction_used_as_vector(i8** noalias %start.1, i8* noalias
; CHECK-NEXT: [[IND_END3:%.*]] = getelementptr i8, i8* [[START_2:%.*]], i64 [[N_VEC]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i8* [ [[START_2]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8*, i8** [[START_1]], i64 [[TMP1]]
; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, i8* [[START_2]], i64 [[TMP2]]
; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 1
; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, i8* [[START_2]], i64 [[TMP3]]
; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 2
; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, i8* [[START_2]], i64 [[TMP4]]
; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 3
; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, i8* [[START_2]], i64 [[TMP5]]
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i8*> poison, i8* [[NEXT_GEP4]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i8*> [[TMP6]], i8* [[NEXT_GEP5]], i32 1
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i8*> [[TMP7]], i8* [[NEXT_GEP6]], i32 2
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i8*> [[TMP8]], i8* [[NEXT_GEP7]], i32 3
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, <4 x i8*> [[TMP9]], i64 1
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8*, i8** [[NEXT_GEP]], i32 0
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8** [[TMP11]] to <4 x i8*>*
; CHECK-NEXT: store <4 x i8*> [[TMP10]], <4 x i8*>* [[TMP12]], align 8
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, i8* [[NEXT_GEP4]], i32 0
; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to <4 x i8>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP14]], align 1
; CHECK-NEXT: [[TMP15:%.*]] = add <4 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1>
; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8* [[TMP13]] to <4 x i8>*
; CHECK-NEXT: store <4 x i8> [[TMP15]], <4 x i8>* [[TMP16]], align 1
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* [[POINTER_PHI]], <4 x i64> <i64 0, i64 1, i64 2, i64 3>
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, <4 x i8*> [[TMP2]], i64 1
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8*, i8** [[NEXT_GEP]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8** [[TMP4]] to <4 x i8*>*
; CHECK-NEXT: store <4 x i8*> [[TMP3]], <4 x i8*>* [[TMP5]], align 8
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i8*> [[TMP2]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[TMP6]], i32 0
; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <4 x i8>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP8]], align 1
; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1>
; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP7]] to <4 x i8>*
; CHECK-NEXT: store <4 x i8> [[TMP9]], <4 x i8>* [[TMP10]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, i8* [[POINTER_PHI]], i64 4
; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]