diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index b094a6f33fa4..1a38090bede8 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -442,7 +442,9 @@ void VPlan::execute(VPTransformState *State) { IRBuilder<> Builder(State->CFG.PrevBB->getTerminator()); auto *TCMO = Builder.CreateSub(TC, ConstantInt::get(TC->getType(), 1), "trip.count.minus.1"); - Value2VPValue[TCMO] = BackedgeTakenCount; + Value *VTCMO = Builder.CreateVectorSplat(State->VF, TCMO, "broadcast"); + for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part) + State->set(BackedgeTakenCount, VTCMO, Part); } // 0. Set the reverse mapping from VPValues to Values for code generation. diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 25afb09129ff..1a112d35d959 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1544,10 +1544,9 @@ public: if (Entry) VPBlockBase::deleteCFG(Entry); for (auto &MapEntry : Value2VPValue) - if (MapEntry.second != BackedgeTakenCount) - delete MapEntry.second; + delete MapEntry.second; if (BackedgeTakenCount) - delete BackedgeTakenCount; // Delete once, if in Value2VPValue or not. 
+ delete BackedgeTakenCount; for (VPValue *Def : VPExternalDefs) delete Def; for (VPValue *CBV : VPCBVs) diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll index 27250dd59a53..ca468b785fda 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll @@ -202,10 +202,10 @@ for.end: ; ENABLED_MASKED_STRIDED-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7 ; ENABLED_MASKED_STRIDED-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8 ; ENABLED_MASKED_STRIDED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1 -; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[CONV]], i32 0 -; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer ; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0 ; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> undef, <8 x i32> zeroinitializer +; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[CONV]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer ; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; ENABLED_MASKED_STRIDED: vector.body: ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -300,10 +300,10 @@ for.end: ; ENABLED_MASKED_STRIDED-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7 ; ENABLED_MASKED_STRIDED-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8 ; ENABLED_MASKED_STRIDED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] 
= add i32 [[N]], -1 -; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[CONV]], i32 0 -; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer ; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0 ; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> undef, <8 x i32> zeroinitializer +; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[CONV]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer ; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; ENABLED_MASKED_STRIDED: vector.body: ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll index b51cb56045bf..a090b15bda09 100644 --- a/llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll +++ b/llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll @@ -39,3 +39,36 @@ while.end.loopexit: while.end: ret void } + +; Make sure a loop is successfully vectorized with fold-tail when the backedge +; taken count is constant and used inside the loop. Issue revealed by D76992. 
+; +define void @reuse_const_btc(i8* %A) optsize { +; CHECK-LABEL: @reuse_const_btc +; CHECK: {{%.*}} = icmp ule <4 x i32> {{%.*}}, <i32 13, i32 13, i32 13, i32 13> +; CHECK: {{%.*}} = select <4 x i1> {{%.*}}, <4 x i32> <i32 12, i32 12, i32 12, i32 12>, <4 x i32> <i32 13, i32 13, i32 13, i32 13> +; +entry: + br label %loop + +loop: + %riv = phi i32 [ 13, %entry ], [ %rivMinus1, %merge ] + %sub = sub nuw nsw i32 20, %riv + %arrayidx = getelementptr inbounds i8, i8* %A, i32 %sub + %cond0 = icmp eq i32 %riv, 7 + br i1 %cond0, label %then, label %else +then: + br label %merge +else: + br label %merge +merge: + %blend = phi i32 [ 13, %then ], [ 12, %else ] + %trunc = trunc i32 %blend to i8 + store i8 %trunc, i8* %arrayidx, align 1 + %rivMinus1 = add nuw nsw i32 %riv, -1 + %cond = icmp eq i32 %riv, 0 + br i1 %cond, label %exit, label %loop + +exit: + ret void +}