[LV] Make sure VF doesn't exceed compile time known TC

For the simple copy loop (see test case), the vectorizer selects a VF equal to 32 while the loop is known to have only 17 iterations. Such behavior makes no sense to me, since such a vector loop will never be executed. The only case where we may want to select a VF larger than the TC is masked vectorization, so I haven't touched that case.

Reviewed By: dmgreen

Differential Revision: https://reviews.llvm.org/D114528
This commit is contained in:
Evgeniy Brevnov 2021-11-24 18:30:52 +07:00
parent 9115d75117
commit 2025e0985c
2 changed files with 44 additions and 39 deletions

View File

@ -1706,7 +1706,8 @@ private:
/// disabled or unsupported, then the scalable part will be equal to /// disabled or unsupported, then the scalable part will be equal to
/// ElementCount::getScalable(0). /// ElementCount::getScalable(0).
FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount, FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
ElementCount UserVF); ElementCount UserVF,
bool FoldTailByMasking);
/// \return the maximized element count based on the targets vector /// \return the maximized element count based on the targets vector
/// registers and the loop trip-count, but limited to a maximum safe VF. /// registers and the loop trip-count, but limited to a maximum safe VF.
@ -1719,7 +1720,8 @@ private:
ElementCount getMaximizedVFForTarget(unsigned ConstTripCount, ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
unsigned SmallestType, unsigned SmallestType,
unsigned WidestType, unsigned WidestType,
const ElementCount &MaxSafeVF); const ElementCount &MaxSafeVF,
bool FoldTailByMasking);
/// \return the maximum legal scalable VF, based on the safe max number /// \return the maximum legal scalable VF, based on the safe max number
/// of elements. /// of elements.
@ -5317,9 +5319,8 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
return MaxScalableVF; return MaxScalableVF;
} }
FixedScalableVFPair FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) {
ElementCount UserVF) {
MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
unsigned SmallestType, WidestType; unsigned SmallestType, WidestType;
std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
@ -5406,12 +5407,14 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
FixedScalableVFPair Result(ElementCount::getFixed(1), FixedScalableVFPair Result(ElementCount::getFixed(1),
ElementCount::getScalable(0)); ElementCount::getScalable(0));
if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, if (auto MaxVF =
WidestType, MaxSafeFixedVF)) getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
MaxSafeFixedVF, FoldTailByMasking))
Result.FixedVF = MaxVF; Result.FixedVF = MaxVF;
if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, if (auto MaxVF =
WidestType, MaxSafeScalableVF)) getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
MaxSafeScalableVF, FoldTailByMasking))
if (MaxVF.isScalable()) { if (MaxVF.isScalable()) {
Result.ScalableVF = MaxVF; Result.ScalableVF = MaxVF;
LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
@ -5444,7 +5447,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
switch (ScalarEpilogueStatus) { switch (ScalarEpilogueStatus) {
case CM_ScalarEpilogueAllowed: case CM_ScalarEpilogueAllowed:
return computeFeasibleMaxVF(TC, UserVF); return computeFeasibleMaxVF(TC, UserVF, false);
case CM_ScalarEpilogueNotAllowedUsePredicate: case CM_ScalarEpilogueNotAllowedUsePredicate:
LLVM_FALLTHROUGH; LLVM_FALLTHROUGH;
case CM_ScalarEpilogueNotNeededUsePredicate: case CM_ScalarEpilogueNotNeededUsePredicate:
@ -5482,7 +5485,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
"scalar epilogue instead.\n"); "scalar epilogue instead.\n");
ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
return computeFeasibleMaxVF(TC, UserVF); return computeFeasibleMaxVF(TC, UserVF, false);
} }
return FixedScalableVFPair::getNone(); return FixedScalableVFPair::getNone();
} }
@ -5499,7 +5502,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
} }
FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF); FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
// Avoid tail folding if the trip count is known to be a multiple of any VF // Avoid tail folding if the trip count is known to be a multiple of any VF
// we chose. // we chose.
// FIXME: The condition below pessimises the case for fixed-width vectors, // FIXME: The condition below pessimises the case for fixed-width vectors,
@ -5572,7 +5575,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
const ElementCount &MaxSafeVF) { const ElementCount &MaxSafeVF, bool FoldTailByMasking) {
bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
TypeSize WidestRegister = TTI.getRegisterBitWidth( TypeSize WidestRegister = TTI.getRegisterBitWidth(
ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
@ -5604,14 +5607,17 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
const auto TripCountEC = ElementCount::getFixed(ConstTripCount); const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
if (ConstTripCount && if (ConstTripCount &&
ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) && ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
isPowerOf2_32(ConstTripCount)) { (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) {
// We need to clamp the VF to be the ConstTripCount. There is no point in // If loop trip count (TC) is known at compile time there is no point in
// choosing a higher viable VF as done in the loop below. If // choosing VF greater than TC (as done in the loop below). Select maximum
// MaxVectorElementCount is scalable, we only fall back on a fixed VF when // power of two which doesn't exceed TC.
// the TC is less than or equal to the known number of lanes. // If MaxVectorElementCount is scalable, we only fall back on a fixed VF
LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " // when the TC is less than or equal to the known number of lanes.
auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount);
LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
"exceeding the constant trip count: "
<< ConstTripCount << "\n"); << ConstTripCount << "\n");
return TripCountEC; return ElementCount::getFixed(ClampedConstTripCount);
} }
ElementCount MaxVF = MaxVectorElementCount; ElementCount MaxVF = MaxVectorElementCount;

View File

@ -4,14 +4,13 @@
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128-ni:1-p2:32:8:8:32-ni:2" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128-ni:1-p2:32:8:8:32-ni:2"
target triple = "x86_64-unknown-linux-gnu" target triple = "x86_64-unknown-linux-gnu"
; TODO: Make sure selected VF for the main loop doesn't exceed TC.
; TODO: Make sure selected VF for the epilog loop doesn't exceed remaining TC. ; TODO: Make sure selected VF for the epilog loop doesn't exceed remaining TC.
define void @test1(i8 * noalias %src, i8 * noalias %dst) #0 { define void @test1(i8 * noalias %src, i8 * noalias %dst) #0 {
; CHECK-LABEL: @test1( ; CHECK-LABEL: @test1(
; CHECK-NEXT: iter.check: ; CHECK-NEXT: iter.check:
; CHECK-NEXT: br i1 true, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
; CHECK: vector.main.loop.iter.check: ; CHECK: vector.main.loop.iter.check:
; CHECK-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph: ; CHECK: vector.ph:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body: ; CHECK: vector.body:
@ -19,42 +18,42 @@ define void @test1(i8 * noalias %src, i8 * noalias %dst) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <64 x i8>* ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <64 x i8>, <64 x i8>* [[TMP3]], align 64 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP3]], align 64
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[DST:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[DST:%.*]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <64 x i8>* ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <16 x i8>*
; CHECK-NEXT: store <64 x i8> [[WIDE_LOAD]], <64 x i8>* [[TMP6]], align 64 ; CHECK-NEXT: store <16 x i8> [[WIDE_LOAD]], <16 x i8>* [[TMP6]], align 64
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block: ; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 17, 0 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 17, 16
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check: ; CHECK: vec.epilog.iter.check:
; CHECK-NEXT: br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK-NEXT: br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK: vec.epilog.ph: ; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body: ; CHECK: vec.epilog.vector.body:
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], 0 ; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], 0
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP9]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP9]], i32 0
; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <32 x i8>* ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <8 x i8>*
; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <32 x i8>, <32 x i8>* [[TMP11]], align 64 ; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, <8 x i8>* [[TMP11]], align 64
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 [[TMP8]]
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, i8* [[TMP12]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, i8* [[TMP12]], i32 0
; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to <32 x i8>* ; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to <8 x i8>*
; CHECK-NEXT: store <32 x i8> [[WIDE_LOAD4]], <32 x i8>* [[TMP14]], align 64 ; CHECK-NEXT: store <8 x i8> [[WIDE_LOAD4]], <8 x i8>* [[TMP14]], align 64
; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 32 ; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 8
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 0 ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 16
; CHECK-NEXT: br i1 [[TMP15]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK-NEXT: br i1 [[TMP15]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK: vec.epilog.middle.block: ; CHECK: vec.epilog.middle.block:
; CHECK-NEXT: [[CMP_N3:%.*]] = icmp eq i64 17, 0 ; CHECK-NEXT: [[CMP_N3:%.*]] = icmp eq i64 17, 16
; CHECK-NEXT: br i1 [[CMP_N3]], label [[EXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK-NEXT: br i1 [[CMP_N3]], label [[EXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
; CHECK: vec.epilog.scalar.ph: ; CHECK: vec.epilog.scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
; CHECK-NEXT: br label [[LOOP_MEMCPY_EXPANSION:%.*]] ; CHECK-NEXT: br label [[LOOP_MEMCPY_EXPANSION:%.*]]
; CHECK: loop-memcpy-expansion: ; CHECK: loop-memcpy-expansion:
; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]