diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index fd6b274811ff..4c0867971aa0 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1435,16 +1435,16 @@ public:
     }));
   }
 
-  /// Returns true if \p I is an instruction that will be scalarized with
-  /// predication when vectorizing \p I with vectorization factor \p VF. Such
-  /// instructions include conditional stores and instructions that may divide
-  /// by zero.
+  /// Returns true if \p I is an instruction which requires predication and
+  /// for which our chosen predication strategy is scalarization (i.e. we
+  /// don't have an alternate strategy such as masking available).
+  /// \p VF is the vectorization factor that will be used to vectorize \p I.
   bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
 
-  // Returns true if \p I is an instruction that will be predicated either
-  // through scalar predication or masked load/store or masked gather/scatter.
-  // \p VF is the vectorization factor that will be used to vectorize \p I.
-  // Superset of instructions that return true for isScalarWithPredication.
+  /// Returns true if \p I is an instruction that needs to be predicated
+  /// at runtime. The result is independent of the predication mechanism.
+  /// \p VF is the vectorization factor that will be used to vectorize \p I.
+  /// Superset of instructions that return true for isScalarWithPredication.
   bool isPredicatedInst(Instruction *I, ElementCount VF) const;
 
   /// Returns true if \p I is a memory instruction with consecutive memory
@@ -4412,15 +4412,16 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
 
 bool LoopVectorizationCostModel::isScalarWithPredication(
     Instruction *I, ElementCount VF) const {
-  if (!blockNeedsPredicationForAnyReason(I->getParent()))
+  if (!isPredicatedInst(I, VF))
     return false;
+
+  // Do we have a non-scalar lowering for this predicated
+  // instruction? No - it is scalar with predication.
   switch(I->getOpcode()) {
   default:
-    break;
+    return true;
   case Instruction::Load:
   case Instruction::Store: {
-    if (!Legal->isMaskRequired(I))
-      return false;
     auto *Ptr = getLoadStorePointerOperand(I);
     auto *Ty = getLoadStoreType(I);
     Type *VTy = Ty;
@@ -4432,6 +4433,40 @@ bool LoopVectorizationCostModel::isScalarWithPredication(
                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
                                 TTI.isLegalMaskedScatter(VTy, Alignment));
   }
+  }
+}
+
+bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I,
+                                                  ElementCount VF) const {
+  if (!blockNeedsPredicationForAnyReason(I->getParent()))
+    return false;
+
+  // Can we prove this instruction is safe to unconditionally execute?
+  // If not, we must use some form of predication.
+  switch(I->getOpcode()) {
+  default:
+    return false;
+  case Instruction::Load:
+  case Instruction::Store: {
+    if (!Legal->isMaskRequired(I))
+      return false;
+    // When we know the load's address is loop invariant and the instruction
+    // in the original scalar loop was unconditionally executed then we
+    // don't need to mark it as a predicated instruction. Tail folding may
+    // introduce additional predication, but we're guaranteed to always have
+    // at least one active lane. We call Legal->blockNeedsPredication here
+    // because it doesn't query tail-folding. For stores, we need to prove
+    // both speculation safety (which follows from the same argument as loads),
+    // but also must prove the value being stored is correct. The easiest
+    // form of the later is to require that all values stored are the same.
+    if (Legal->isUniformMemOp(*I) &&
+        (isa<LoadInst>(I) ||
+         (isa<StoreInst>(I) &&
+          TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) &&
+        !Legal->blockNeedsPredication(I->getParent()))
+      return false;
+    return true;
+  }
   case Instruction::UDiv:
   case Instruction::SDiv:
   case Instruction::SRem:
@@ -4440,33 +4475,6 @@ bool LoopVectorizationCostModel::isScalarWithPredication(
     // context sensitive reasoning
     return !isSafeToSpeculativelyExecute(I);
   }
-  return false;
-}
-
-bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I,
-                                                  ElementCount VF) const {
-  // When we know the load's address is loop invariant and the instruction
-  // in the original scalar loop was unconditionally executed then we
-  // don't need to mark it as a predicated instruction. Tail folding may
-  // introduce additional predication, but we're guaranteed to always have
-  // at least one active lane. We call Legal->blockNeedsPredication here
-  // because it doesn't query tail-folding. For stores, we need to prove
-  // both speculation safety (which follows from the same argument as loads),
-  // but also must prove the value being stored is correct. The easiest
-  // form of the later is to require that all values stored are the same.
-  if (Legal->isUniformMemOp(*I) &&
-      (isa<LoadInst>(I) ||
-       (isa<StoreInst>(I) &&
-        TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) &&
-      !Legal->blockNeedsPredication(I->getParent()))
-    return false;
-  if (!blockNeedsPredicationForAnyReason(I->getParent()))
-    return false;
-  // Loads and stores that need some form of masked operation are predicated
-  // instructions.
-  if (isa<LoadInst>(I) || isa<StoreInst>(I))
-    return Legal->isMaskRequired(I);
-  return isScalarWithPredication(I, VF);
 }
 
 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
@@ -6070,7 +6078,7 @@ bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
   // from moving "masked load/store" check from legality to cost model.
   // Masked Load/Gather emulation was previously never allowed.
   // Limited number of Masked Store/Scatter emulation was allowed.
-  assert((isPredicatedInst(I, VF) || Legal->isUniformMemOp(*I)) &&
+  assert((isPredicatedInst(I, VF)) &&
          "Expecting a scalar emulated instruction");
   return isa<LoadInst>(I) ||
         (isa<StoreInst>(I) &&
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
index 6a76a67586e8..2b036a562731 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
@@ -584,15 +584,44 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt
 ;
 ; TF-SCALABLE-LABEL: @uniform_load_unaligned(
 ; TF-SCALABLE-NEXT:  entry:
+; TF-SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; TF-SCALABLE-NEXT:    [[TMP1:%.*]] = icmp ult i64 -1025, [[TMP0]]
+; TF-SCALABLE-NEXT:    br i1 [[TMP1]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; TF-SCALABLE:       vector.ph:
+; TF-SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; TF-SCALABLE-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; TF-SCALABLE-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP3]], 1
+; TF-SCALABLE-NEXT:    [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]]
+; TF-SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP2]]
+; TF-SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; TF-SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
+; TF-SCALABLE:       vector.body:
+; TF-SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; TF-SCALABLE-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 0
+; TF-SCALABLE-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP5]], i64 1024)
+; TF-SCALABLE-NEXT:    [[TMP6:%.*]] = load i64, ptr [[B:%.*]], align 1
+; TF-SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP6]], i32 0
+; TF-SCALABLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; TF-SCALABLE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP5]]
+; TF-SCALABLE-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0
+; TF-SCALABLE-NEXT:    call void @llvm.masked.store.nxv1i64.p0(<vscale x 1 x i64> [[BROADCAST_SPLAT]], ptr [[TMP8]], i32 8, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]])
+; TF-SCALABLE-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; TF-SCALABLE-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
+; TF-SCALABLE-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; TF-SCALABLE-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; TF-SCALABLE:       middle.block:
+; TF-SCALABLE-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; TF-SCALABLE:       scalar.ph:
+; TF-SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; TF-SCALABLE-NEXT:    br label [[FOR_BODY:%.*]]
 ; TF-SCALABLE:       for.body:
-; TF-SCALABLE-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; TF-SCALABLE-NEXT:    [[V:%.*]] = load i64, ptr [[B:%.*]], align 1
-; TF-SCALABLE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[IV]]
+; TF-SCALABLE-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; TF-SCALABLE-NEXT:    [[V:%.*]] = load i64, ptr [[B]], align 1
+; TF-SCALABLE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
 ; TF-SCALABLE-NEXT:    store i64 [[V]], ptr [[ARRAYIDX]], align 8
 ; TF-SCALABLE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; TF-SCALABLE-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; TF-SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; TF-SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; TF-SCALABLE:       for.end:
 ; TF-SCALABLE-NEXT:    ret void
 ;
@@ -754,7 +783,7 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
 ; TF-SCALABLE-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
 ; TF-SCALABLE-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
 ; TF-SCALABLE-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; TF-SCALABLE-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; TF-SCALABLE-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; TF-SCALABLE:       middle.block:
 ; TF-SCALABLE-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; TF-SCALABLE:       scalar.ph:
@@ -767,7 +796,7 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
 ; TF-SCALABLE-NEXT:    store i64 [[V]], ptr [[ARRAYIDX]], align 8
 ; TF-SCALABLE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; TF-SCALABLE-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; TF-SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; TF-SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; TF-SCALABLE:       for.end:
 ; TF-SCALABLE-NEXT:    ret void
 ;
@@ -1191,15 +1220,44 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap
 ;
 ; TF-SCALABLE-LABEL: @uniform_store_unaligned(
 ; TF-SCALABLE-NEXT:  entry:
+; TF-SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; TF-SCALABLE-NEXT:    [[TMP1:%.*]] = icmp ult i64 -1025, [[TMP0]]
+; TF-SCALABLE-NEXT:    br i1 [[TMP1]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; TF-SCALABLE:       vector.ph:
+; TF-SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; TF-SCALABLE-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; TF-SCALABLE-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP3]], 1
+; TF-SCALABLE-NEXT:    [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]]
+; TF-SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP2]]
+; TF-SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; TF-SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[V:%.*]], i32 0
+; TF-SCALABLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; TF-SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
+; TF-SCALABLE:       vector.body:
+; TF-SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; TF-SCALABLE-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 0
+; TF-SCALABLE-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP5]], i64 1024)
+; TF-SCALABLE-NEXT:    store i64 [[V]], ptr [[B:%.*]], align 1
+; TF-SCALABLE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP5]]
+; TF-SCALABLE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0
+; TF-SCALABLE-NEXT:    call void @llvm.masked.store.nxv1i64.p0(<vscale x 1 x i64> [[BROADCAST_SPLAT]], ptr [[TMP7]], i32 8, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]])
+; TF-SCALABLE-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; TF-SCALABLE-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
+; TF-SCALABLE-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; TF-SCALABLE-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; TF-SCALABLE:       middle.block:
+; TF-SCALABLE-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; TF-SCALABLE:       scalar.ph:
+; TF-SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; TF-SCALABLE-NEXT:    br label [[FOR_BODY:%.*]]
 ; TF-SCALABLE:       for.body:
-; TF-SCALABLE-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; TF-SCALABLE-NEXT:    store i64 [[V:%.*]], ptr [[B:%.*]], align 1
-; TF-SCALABLE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[IV]]
+; TF-SCALABLE-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; TF-SCALABLE-NEXT:    store i64 [[V]], ptr [[B]], align 1
+; TF-SCALABLE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
 ; TF-SCALABLE-NEXT:    store i64 [[V]], ptr [[ARRAYIDX]], align 8
 ; TF-SCALABLE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; TF-SCALABLE-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; TF-SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; TF-SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; TF-SCALABLE:       for.end:
 ; TF-SCALABLE-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
index 0d0f74c62c2d..41dd6000d06c 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
@@ -473,7 +473,7 @@ define void @need_new_block_after_sinking_pr56146(i32 %x, i32* %src, i32* noalia
 ; CHECK-NEXT: Successor(s): loop.0
 ; CHECK-EMPTY:
 ; CHECK-NEXT: loop.0:
-; CHECK-NEXT:   REPLICATE ir<[[L]]> = load ir<%src>
+; CHECK-NEXT:   CLONE ir<[[L]]> = load ir<%src>
 ; CHECK-NEXT:   EMIT vp<[[SPLICE:%.+]]> = first-order splice ir<%.pn> ir<[[L]]>
 ; CHECK-NEXT: Successor(s): loop.0.split
 ; CHECK-EMPTY:
diff --git a/llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll b/llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll
index 4c8ff9f6323b..945de0696c70 100644
--- a/llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll
@@ -28,7 +28,6 @@ define void @test(i16 %x, i64 %y, i32* %ptr) {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    store i32 0, i32* [[PTR:%.*]], align 4
-; CHECK-NEXT:    store i32 0, i32* [[PTR]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
index 2987a320a105..fcd4782958f6 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
@@ -253,7 +253,7 @@ define void @uniform_gep(i64 %k, i16* noalias %A, i16* noalias %B) {
 ; CHECK-NEXT:     EMIT vp<[[WIDE_CAN_IV:%.+]]> = WIDEN-CANONICAL-INDUCTION vp<[[CAN_IV]]>
 ; CHECK-NEXT:     EMIT vp<[[MASK:%.+]]> = icmp ule vp<[[WIDE_CAN_IV]]> vp<[[BTC]]>
 ; CHECK-NEXT:     CLONE ir<%gep.A.uniform> = getelementptr ir<%A>, ir<0>
-; CHECK-NEXT:     REPLICATE ir<%lv> = load ir<%gep.A.uniform>
+; CHECK-NEXT:     CLONE ir<%lv> = load ir<%gep.A.uniform>
 ; CHECK-NEXT:     WIDEN ir<%cmp> = icmp ir<%iv>, ir<%k>
 ; CHECK-NEXT:   Successor(s): loop.then
 ; CHECK-EMPTY:
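Reviewer note, not part of the patch: the sketch below is a minimal standalone model of the layering this change introduces, under simplified assumptions. isPredicatedInst() now answers the broad question "does this instruction need predication at runtime at all?", while isScalarWithPredication() asks the narrower "given that it is predicated, must we scalarize because no masked lowering is available?". The first is by construction a superset of the second, which is also why the assert in useEmulatedMaskMemRefHack() can drop the Legal->isUniformMemOp() alternative. The MemOp struct and helper names below are hypothetical stand-ins for illustration only, not LLVM APIs.

// Standalone sketch (hypothetical types and helpers, not the LLVM API) of the
// two-level query structure the patch introduces.
#include <cassert>
#include <iostream>

struct MemOp {
  bool BlockNeedsPredication; // block executes conditionally (or tail folding)
  bool MaskRequired;          // legality says a mask is needed
  bool HasMaskedLowering;     // target supports a masked form of this op
};

// Mirrors the new isPredicatedInst(): does this op need any predication?
bool needsPredication(const MemOp &Op) {
  if (!Op.BlockNeedsPredication)
    return false;
  return Op.MaskRequired;
}

// Mirrors the new isScalarWithPredication(): predication is required *and*
// no non-scalar (masked) lowering is available, so we must scalarize.
bool scalarWithPredication(const MemOp &Op) {
  if (!needsPredication(Op))
    return false;
  return !Op.HasMaskedLowering;
}

int main() {
  // Masked lowering available: predicated, but not scalarized.
  MemOp Masked{true, true, true};
  assert(needsPredication(Masked) && !scalarWithPredication(Masked));

  // No masked lowering available: predicated and scalarized.
  MemOp Scalarized{true, true, false};
  assert(scalarWithPredication(Scalarized));

  // Unconditionally executed block: no predication at all.
  MemOp Plain{false, false, true};
  assert(!needsPredication(Plain) && !scalarWithPredication(Plain));

  std::cout << "invariant holds: scalar-with-predication implies predicated\n";
  return 0;
}

Building and running the sketch (for example with c++ -std=c++17 sketch.cpp && ./a.out) simply exercises the asserts that encode the superset invariant described above.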