Reland "[LV] Calculate max feasible scalable VF."

Relands https://reviews.llvm.org/D98509

This reverts commit 51d648c119.
This commit is contained in:
Sander de Smalen 2021-05-04 09:34:21 +01:00
parent bfb9c749c0
commit 9931ae645e
6 changed files with 419 additions and 193 deletions

View File

@ -174,6 +174,13 @@ void reportVectorizationFailure(const StringRef DebugMsg,
const StringRef OREMsg, const StringRef ORETag,
OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I = nullptr);
/// Reports an informative message: print \p OREMsg for debugging purposes as
/// well as an optimization remark. Uses either \p I as location of the remark,
/// or otherwise \p TheLoop. \p ORETag is used as the name of the emitted
/// remark.
void reportVectorizationInfo(const StringRef OREMsg, const StringRef ORETag,
OptimizationRemarkEmitter *ORE, Loop *TheLoop,
Instruction *I = nullptr);
} // end namespace llvm
#endif // LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZE_H

View File

@ -1065,13 +1065,13 @@ void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr)
B.SetCurrentDebugLocation(DebugLoc());
}
/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
Instruction *I) {
dbgs() << "LV: Not vectorizing: " << DebugMsg;
static void debugVectorizationMessage(const StringRef Prefix,
const StringRef DebugMsg,
Instruction *I) {
dbgs() << "LV: " << Prefix << DebugMsg;
if (I != nullptr)
dbgs() << " " << *I;
else
@ -1100,9 +1100,7 @@ static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
DL = I->getDebugLoc();
}
OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
R << "loop not vectorized: ";
return R;
return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}
/// Return a value for Step multiplied by VF.
@ -1123,12 +1121,24 @@ Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
}
void reportVectorizationFailure(const StringRef DebugMsg,
const StringRef OREMsg, const StringRef ORETag,
OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
const StringRef OREMsg, const StringRef ORETag,
OptimizationRemarkEmitter *ORE, Loop *TheLoop,
Instruction *I) {
LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
ORETag, TheLoop, I) << OREMsg);
ORE->emit(
createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
<< "loop not vectorized: " << OREMsg);
}
void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
OptimizationRemarkEmitter *ORE, Loop *TheLoop,
Instruction *I) {
LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
ORE->emit(
createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
<< Msg);
}
} // end namespace llvm
@ -1623,6 +1633,23 @@ private:
ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
ElementCount UserVF);
/// \return the maximized element count based on the target's vector
/// registers and the loop trip-count, but limited to a maximum safe VF.
/// Whether a scalable or a fixed-width element count is computed is
/// determined by whether \p MaxSafeVF is scalable.
/// This is a helper function of computeFeasibleMaxVF.
/// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
/// issue that occurred on one of the buildbots which cannot be reproduced
/// without having access to the proprietary compiler (see comments on
/// D98509). The issue is currently under investigation and this workaround
/// will be removed as soon as possible.
ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
unsigned SmallestType,
unsigned WidestType,
const ElementCount &MaxSafeVF);
/// \return the maximum legal scalable VF, based on the safe max number
/// of elements \p MaxSafeElements. A scalable VF of zero is returned when
/// scalable vectorization is not possible (unsupported target, unsupported
/// reductions, or a safe element count that is too small).
ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
/// The vectorization cost is a combination of the cost itself and a boolean
/// indicating whether any of the contributing operations will actually
/// operate on
@ -5582,6 +5609,130 @@ bool LoopVectorizationCostModel::runtimeChecksRequired() {
return false;
}
/// Determine the largest scalable VF that is legal for the loop.
///
/// A scalable VF of zero is produced (together with an informational remark)
/// when the target has no scalable vectors and support is not forced, when the
/// loop's reductions cannot be vectorized with scalable vectors, or when
/// \p MaxSafeElements divided by the target's maximum vscale is zero (or the
/// maximum vscale is unknown).
ElementCount
LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
  const bool HasScalableVectors =
      TTI.supportsScalableVectors() || ForceTargetSupportsScalableVectors;
  if (!HasScalableVectors) {
    reportVectorizationInfo(
        "Disabling scalable vectorization, because target does not "
        "support scalable vectors.",
        "ScalableVectorsUnsupported", ORE, TheLoop);
    return ElementCount::getScalable(0);
  }

  // Upper bound used when nothing else restricts the scalable VF.
  const ElementCount UnboundedVF = ElementCount::getScalable(
      std::numeric_limits<ElementCount::ScalarTy>::max());

  // Scalable vectorization is abandoned if the loop contains reductions that
  // cannot be legalized for this upper bound.
  // FIXME: While for scalable vectors this is currently sufficient, this should
  // be replaced by a more detailed mechanism that filters out specific VFs,
  // instead of invalidating vectorization for a whole set of VFs based on the
  // MaxVF.
  if (!canVectorizeReductions(UnboundedVF)) {
    reportVectorizationInfo(
        "Scalable vectorization not supported for the reduction "
        "operations found in this loop.",
        "ScalableVFUnfeasible", ORE, TheLoop);
    return ElementCount::getScalable(0);
  }

  // Without a dependence-distance restriction the upper bound is the answer.
  if (Legal->isSafeForAnyVectorWidth())
    return UnboundedVF;

  // Otherwise the maximum safe dependence distance limits the VF: divide the
  // safe element count by the largest vscale the target can have.
  const Optional<unsigned> MaxVScale = TTI.getMaxVScale();
  const unsigned MinLanes =
      MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0;
  const ElementCount BoundedVF = ElementCount::getScalable(MinLanes);

  if (BoundedVF.isZero())
    reportVectorizationInfo(
        "Max legal vector width too small, scalable vectorization "
        "unfeasible.",
        "ScalableVFUnfeasible", ORE, TheLoop);

  return BoundedVF;
}
/// Compute the maximum feasible VF for the loop.
///
/// If \p UserVF is non-zero and does not exceed the maximum safe VF of its
/// kind (fixed or scalable), it is returned unchanged. An unsafe fixed-width
/// \p UserVF is clamped to the maximum safe fixed VF; an unsafe scalable
/// \p UserVF is ignored. In the remaining cases the result is the maximized
/// fixed-width VF for the target (at least 1); a feasible scalable VF is
/// currently only reported in the debug output.
ElementCount
LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
                                                 ElementCount UserVF) {
  MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
  unsigned SmallestType, WidestType;
  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();

  // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory accesses that is most restrictive (involved in the smallest
  // dependence distance).
  unsigned MaxSafeElements =
      PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);

  const ElementCount MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
  const ElementCount MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);

  LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
                    << ".\n"
                    << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
                    << ".\n");

  // Handle a user-provided VF first; fall through if it has to be ignored.
  if (UserVF.isNonZero()) {
    const bool UserVFIsScalable = UserVF.isScalable();
    const ElementCount SafeLimit =
        UserVFIsScalable ? MaxSafeScalableVF : MaxSafeFixedVF;

    if (ElementCount::isKnownLE(UserVF, SafeLimit))
      return UserVF;

    assert(ElementCount::isKnownGT(UserVF, SafeLimit));

    if (UserVFIsScalable) {
      // A scalable hint is not clamped; it is better to ignore it and let the
      // compiler choose a suitable VF.
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is unsafe. Ignoring scalable UserVF.\n");
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is unsafe. Ignoring the hint to let the compiler pick a "
                  "suitable VF.";
      });
    } else {
      // A fixed-width hint is clamped to the largest safe fixed VF.
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is unsafe, clamping to max safe VF="
                        << MaxSafeFixedVF << ".\n");
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is unsafe, clamping to maximum safe vectorization factor "
               << ore::NV("VectorizationFactor", MaxSafeFixedVF);
      });
      return MaxSafeFixedVF;
    }
  }

  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
                    << " / " << WidestType << " bits.\n");

  // Fixed-width result; defaults to 1 when no fixed VF could be maximized.
  const ElementCount FixedCandidate = getMaximizedVFForTarget(
      ConstTripCount, SmallestType, WidestType, MaxSafeFixedVF);
  ElementCount MaxFixedVF =
      FixedCandidate.isNonZero() ? FixedCandidate : ElementCount::getFixed(1);

  const ElementCount ScalableCandidate = getMaximizedVFForTarget(
      ConstTripCount, SmallestType, WidestType, MaxSafeScalableVF);
  // FIXME: Return scalable VF as well (to be added in future patch).
  if (ScalableCandidate.isNonZero() && ScalableCandidate.isScalable()) {
    LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = "
                      << ScalableCandidate << "\n");
  }

  return MaxFixedVF;
}
Optional<ElementCount>
LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
@ -5722,149 +5873,61 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
return None;
}
ElementCount
LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
ElementCount UserVF) {
bool IgnoreScalableUserVF = UserVF.isScalable() &&
!TTI.supportsScalableVectors() &&
!ForceTargetSupportsScalableVectors;
if (IgnoreScalableUserVF) {
LLVM_DEBUG(
dbgs() << "LV: Ignoring VF=" << UserVF
<< " because target does not support scalable vectors.\n");
ORE->emit([&]() {
return OptimizationRemarkAnalysis(DEBUG_TYPE, "IgnoreScalableUserVF",
TheLoop->getStartLoc(),
TheLoop->getHeader())
<< "Ignoring VF=" << ore::NV("UserVF", UserVF)
<< " because target does not support scalable vectors.";
});
}
ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
const ElementCount &MaxSafeVF) {
bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
TypeSize WidestRegister = TTI.getRegisterBitWidth(
ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
: TargetTransformInfo::RGK_FixedWidthVector);
// Beyond this point two scenarios are handled. If UserVF isn't specified
// then a suitable VF is chosen. If UserVF is specified and there are
// dependencies, check if it's legal. However, if a UserVF is specified and
// there are no dependencies, then there's nothing to do.
if (UserVF.isNonZero() && !IgnoreScalableUserVF) {
if (!canVectorizeReductions(UserVF)) {
reportVectorizationFailure(
"LV: Scalable vectorization not supported for the reduction "
"operations found in this loop. Using fixed-width "
"vectorization instead.",
"Scalable vectorization not supported for the reduction operations "
"found in this loop. Using fixed-width vectorization instead.",
"ScalableVFUnfeasible", ORE, TheLoop);
return computeFeasibleMaxVF(
ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue()));
}
if (Legal->isSafeForAnyVectorWidth())
return UserVF;
}
MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
unsigned SmallestType, WidestType;
std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
unsigned WidestRegister =
TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
.getFixedSize();
// Get the maximum safe dependence distance in bits computed by LAA.
// It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
// the memory accesses that is most restrictive (involved in the smallest
// dependence distance).
unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();
// If the user vectorization factor is legally unsafe, clamp it to a safe
// value. Otherwise, return as is.
if (UserVF.isNonZero() && !IgnoreScalableUserVF) {
unsigned MaxSafeElements =
PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements);
if (UserVF.isScalable()) {
Optional<unsigned> MaxVScale = TTI.getMaxVScale();
// Scale VF by vscale before checking if it's safe.
MaxSafeVF = ElementCount::getScalable(
MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
if (MaxSafeVF.isZero()) {
// The dependence distance is too small to use scalable vectors,
// fallback on fixed.
LLVM_DEBUG(
dbgs()
<< "LV: Max legal vector width too small, scalable vectorization "
"unfeasible. Using fixed-width vectorization instead.\n");
ORE->emit([&]() {
return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible",
TheLoop->getStartLoc(),
TheLoop->getHeader())
<< "Max legal vector width too small, scalable vectorization "
<< "unfeasible. Using fixed-width vectorization instead.";
});
return computeFeasibleMaxVF(
ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue()));
}
}
LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n");
if (ElementCount::isKnownLE(UserVF, MaxSafeVF))
return UserVF;
LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
<< " is unsafe, clamping to max safe VF=" << MaxSafeVF
<< ".\n");
ORE->emit([&]() {
return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
TheLoop->getStartLoc(),
TheLoop->getHeader())
<< "User-specified vectorization factor "
<< ore::NV("UserVectorizationFactor", UserVF)
<< " is unsafe, clamping to maximum safe vectorization factor "
<< ore::NV("VectorizationFactor", MaxSafeVF);
});
return MaxSafeVF;
}
WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits);
// Convenience function to return the minimum of two ElementCounts.
auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
assert((LHS.isScalable() == RHS.isScalable()) &&
"Scalable flags must match");
return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
};
// Ensure MaxVF is a power of 2; the dependence distance bound may not be.
// Note that both WidestRegister and WidestType may not be a powers of 2.
auto MaxVectorSize =
ElementCount::getFixed(PowerOf2Floor(WidestRegister / WidestType));
LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
<< " / " << WidestType << " bits.\n");
auto MaxVectorElementCount = ElementCount::get(
PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType),
ComputeScalableMaxVF);
MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
<< WidestRegister << " bits.\n");
<< (MaxVectorElementCount * WidestType) << " bits.\n");
assert(MaxVectorSize.getFixedValue() <= WidestRegister &&
"Did not expect to pack so many elements"
" into one vector!");
if (MaxVectorSize.getFixedValue() == 0) {
if (!MaxVectorElementCount) {
LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
return ElementCount::getFixed(1);
} else if (ConstTripCount && ConstTripCount < MaxVectorSize.getFixedValue() &&
isPowerOf2_32(ConstTripCount)) {
// We need to clamp the VF to be the ConstTripCount. There is no point in
// choosing a higher viable VF as done in the loop below.
LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
<< ConstTripCount << "\n");
return ElementCount::getFixed(ConstTripCount);
}
ElementCount MaxVF = MaxVectorSize;
const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
if (ConstTripCount &&
ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
isPowerOf2_32(ConstTripCount)) {
// We need to clamp the VF to be the ConstTripCount. There is no point in
// choosing a higher viable VF as done in the loop below. If
// MaxVectorElementCount is scalable, we only fall back on a fixed VF when
// the TC is less than or equal to the known number of lanes.
LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
<< ConstTripCount << "\n");
return TripCountEC;
}
ElementCount MaxVF = MaxVectorElementCount;
if (TTI.shouldMaximizeVectorBandwidth() ||
(MaximizeBandwidth && isScalarEpilogueAllowed())) {
auto MaxVectorElementCountMaxBW = ElementCount::get(
PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
ComputeScalableMaxVF);
MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
// Collect all viable vectorization factors larger than the default MaxVF
// (i.e. MaxVectorSize).
// (i.e. MaxVectorElementCount).
SmallVector<ElementCount, 8> VFs;
auto MaxVectorSizeMaxBW =
ElementCount::getFixed(WidestRegister / SmallestType);
for (ElementCount VS = MaxVectorSize * 2;
ElementCount::isKnownLE(VS, MaxVectorSizeMaxBW); VS *= 2)
for (ElementCount VS = MaxVectorElementCount * 2;
ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
VFs.push_back(VS);
// For each VF calculate its register usage.
@ -5885,7 +5948,7 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
}
}
if (ElementCount MinVF =
TTI.getMinimumVF(SmallestType, /*IsScalable=*/false)) {
TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
if (ElementCount::isKnownLT(MaxVF, MinVF)) {
LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
<< ") with target's minimum: " << MinVF << '\n');

View File

@ -221,7 +221,7 @@ for.end:
ret float %add
}
; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. Using fixed-width vectorization instead.
; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
; CHECK-REMARK: vectorized loop (vectorization width: 8, interleaved count: 2)
define bfloat @fadd_fast_bfloat(bfloat* noalias nocapture readonly %a, i64 %n) {
; CHECK-LABEL: @fadd_fast_bfloat
@ -322,18 +322,18 @@ for.end:
; MUL
; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. Using fixed-width vectorization instead.
; CHECK-REMARK: vectorized loop (vectorization width: 8, interleaved count: 2)
; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 2)
define i32 @mul(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) {
; CHECK-LABEL: @mul
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <8 x i32>
; CHECK: %[[LOAD2:.*]] = load <8 x i32>
; CHECK: %[[MUL1:.*]] = mul <8 x i32> %[[LOAD1]]
; CHECK: %[[MUL2:.*]] = mul <8 x i32> %[[LOAD2]]
; CHECK: %[[LOAD1:.*]] = load <4 x i32>
; CHECK: %[[LOAD2:.*]] = load <4 x i32>
; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD1]]
; CHECK: %[[MUL2:.*]] = mul <4 x i32> %[[LOAD2]]
; CHECK: middle.block:
; CHECK: %[[RDX:.*]] = mul <8 x i32> %[[MUL2]], %[[MUL1]]
; CHECK: call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %[[RDX]])
; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]]
; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]])
entry:
br label %for.body
@ -352,22 +352,22 @@ for.end: ; preds = %for.body, %entry
}
; Note: This test was added to ensure we always check the legality of reductions (and emit a warning if necessary) before checking for memory dependencies
; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. Using fixed-width vectorization instead.
; CHECK-REMARK: vectorized loop (vectorization width: 8, interleaved count: 2)
; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 2)
define i32 @memory_dependence(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i64 %n) {
; CHECK-LABEL: @memory_dependence
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <8 x i32>
; CHECK: %[[LOAD2:.*]] = load <8 x i32>
; CHECK: %[[LOAD3:.*]] = load <8 x i32>
; CHECK: %[[LOAD4:.*]] = load <8 x i32>
; CHECK: %[[ADD1:.*]] = add nsw <8 x i32> %[[LOAD3]], %[[LOAD1]]
; CHECK: %[[ADD2:.*]] = add nsw <8 x i32> %[[LOAD4]], %[[LOAD2]]
; CHECK: %[[MUL1:.*]] = mul <8 x i32> %[[LOAD3]]
; CHECK: %[[MUL2:.*]] = mul <8 x i32> %[[LOAD4]]
; CHECK: %[[LOAD1:.*]] = load <4 x i32>
; CHECK: %[[LOAD2:.*]] = load <4 x i32>
; CHECK: %[[LOAD3:.*]] = load <4 x i32>
; CHECK: %[[LOAD4:.*]] = load <4 x i32>
; CHECK: %[[ADD1:.*]] = add nsw <4 x i32> %[[LOAD3]], %[[LOAD1]]
; CHECK: %[[ADD2:.*]] = add nsw <4 x i32> %[[LOAD4]], %[[LOAD2]]
; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD3]]
; CHECK: %[[MUL2:.*]] = mul <4 x i32> %[[LOAD4]]
; CHECK: middle.block:
; CHECK: %[[RDX:.*]] = mul <8 x i32> %[[MUL2]], %[[MUL1]]
; CHECK: call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %[[RDX]])
; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]]
; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]])
entry:
br label %for.body

View File

@ -0,0 +1,149 @@
; REQUIRES: asserts
; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -force-target-instruction-cost=1 -loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK_SCALABLE_ON
; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -force-target-instruction-cost=1 -loop-vectorize -S -debug-only=loop-vectorize -vectorizer-maximize-bandwidth < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK_SCALABLE_ON_MAXBW
; Test that the MaxVF for the following loop, that has no dependence distances,
; is calculated as vscale x 4 (max legal SVE vector size) or vscale x 16
; (maximized bandwidth for i8 in the loop).
; The loop body computes a[iv] = zext(b[iv]) + c[iv] for iv in [0, 1024); the
; store and the two loads each go through a different pointer argument.
define void @test0(i32* %a, i8* %b, i32* %c) {
; CHECK: LV: Checking a loop in "test0"
; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4
; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 16
entry:
br label %loop
loop:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
%arrayidx = getelementptr inbounds i32, i32* %c, i64 %iv
%0 = load i32, i32* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv
%1 = load i8, i8* %arrayidx2, align 4
%zext = zext i8 %1 to i32
%add = add nsw i32 %zext, %0
%arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %iv
store i32 %add, i32* %arrayidx5, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 1024
br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0
exit:
ret void
}
; Test that the MaxVF for the following loop, with a dependence distance
; of 64 elements, is calculated as vscale x 4, i.e. 64 elements divided by
; the maximum vscale of 16.
; The loop body computes a[iv + 64] = zext(b[iv]) + a[iv], so the store is
; 64 elements ahead of the load from the same array.
define void @test1(i32* %a, i8* %b) {
; CHECK: LV: Checking a loop in "test1"
; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4
; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 4
entry:
br label %loop
loop:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
%arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
%0 = load i32, i32* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv
%1 = load i8, i8* %arrayidx2, align 4
%zext = zext i8 %1 to i32
%add = add nsw i32 %zext, %0
%2 = add nuw nsw i64 %iv, 64
%arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
store i32 %add, i32* %arrayidx5, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 1024
br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0
exit:
ret void
}
; Test that the MaxVF for the following loop, with a dependence distance
; of 32 elements, is calculated as vscale x 2, i.e. 32 elements divided by
; the maximum vscale of 16.
; The loop body computes a[iv + 32] = zext(b[iv]) + a[iv], so the store is
; 32 elements ahead of the load from the same array.
define void @test2(i32* %a, i8* %b) {
; CHECK: LV: Checking a loop in "test2"
; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 2
; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 2
entry:
br label %loop
loop:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
%arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
%0 = load i32, i32* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv
%1 = load i8, i8* %arrayidx2, align 4
%zext = zext i8 %1 to i32
%add = add nsw i32 %zext, %0
%2 = add nuw nsw i64 %iv, 32
%arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
store i32 %add, i32* %arrayidx5, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 1024
br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0
exit:
ret void
}
; Test that the MaxVF for the following loop, with a dependence distance
; of 16 elements, is calculated as vscale x 1, i.e. 16 elements divided by
; the maximum vscale of 16.
; The loop body computes a[iv + 16] = zext(b[iv]) + a[iv], so the store is
; 16 elements ahead of the load from the same array.
define void @test3(i32* %a, i8* %b) {
; CHECK: LV: Checking a loop in "test3"
; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 1
; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 1
entry:
br label %loop
loop:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
%arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
%0 = load i32, i32* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv
%1 = load i8, i8* %arrayidx2, align 4
%zext = zext i8 %1 to i32
%add = add nsw i32 %zext, %0
%2 = add nuw nsw i64 %iv, 16
%arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
store i32 %add, i32* %arrayidx5, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 1024
br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0
exit:
ret void
}
; Test the fallback mechanism when scalable vectors are not feasible due
; to e.g. dependence distance. For the '-scalable-vectorization=exclusive'
; it shouldn't try to vectorize with fixed-width vectors.
; NOTE(review): the RUN lines of this file do not pass
; '-scalable-vectorization=exclusive'; the directives below only verify that
; no feasible scalable VF is reported.
; The loop body computes a[iv + 8] = b[iv] + a[iv], i.e. a dependence distance
; of 8 elements, and uses loop metadata !2, which additionally sets
; llvm.loop.vectorize.scalable.enable.
define void @test4(i32* %a, i32* %b) {
; CHECK: LV: Checking a loop in "test4"
; CHECK_SCALABLE_ON-NOT: LV: Found feasible scalable VF
; CHECK_SCALABLE_ON_MAXBW-NOT: LV: Found feasible scalable VF
entry:
br label %loop
loop:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
%arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
%0 = load i32, i32* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
%1 = load i32, i32* %arrayidx2, align 4
%add = add nsw i32 %1, %0
%2 = add nuw nsw i64 %iv, 8
%arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
store i32 %add, i32* %arrayidx5, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 1024
br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !2
exit:
ret void
}
!0 = distinct !{!0, !1}
!1 = !{!"llvm.loop.vectorize.enable", i1 true}
!2 = distinct !{!2, !3, !4}
!3 = !{!"llvm.loop.vectorize.enable", i1 true}
!4 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}

View File

@ -37,9 +37,10 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
; unless max(vscale)=2 it's unsafe to vectorize. For SVE max(vscale)=16, check
; fixed-width vectorization is used instead.
; CHECK-DBG: LV: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead.
; CHECK-DBG: remark: <unknown>:0:0: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead.
; CHECK-DBG: LV: The max safe VF is: 8.
; CHECK-DBG: LV: Checking a loop in "test1"
; CHECK-DBG: LV: Max legal vector width too small, scalable vectorization unfeasible.
; CHECK-DBG: remark: <unknown>:0:0: Max legal vector width too small, scalable vectorization unfeasible.
; CHECK-DBG: LV: The max safe fixed VF is: 8.
; CHECK-DBG: LV: Selecting VF: 4.
; CHECK-LABEL: @test1
; CHECK: <4 x i32>
@ -80,9 +81,10 @@ exit:
; }
; }
; CHECK-DBG: LV: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead.
; CHECK-DBG: LV: The max safe VF is: 4.
; CHECK-DBG: LV: User VF=8 is unsafe, clamping to max safe VF=4.
; CHECK-DBG: LV: Checking a loop in "test2"
; CHECK-DBG: LV: Max legal vector width too small, scalable vectorization unfeasible.
; CHECK-DBG: LV: The max safe fixed VF is: 4.
; CHECK-DBG: LV: User VF=vscale x 8 is unsafe. Ignoring scalable UserVF.
; CHECK-DBG: LV: Selecting VF: 4.
; CHECK-LABEL: @test2
; CHECK: <4 x i32>
@ -129,7 +131,7 @@ exit:
; Max fixed VF=32, Max scalable VF=2, safe to vectorize.
; CHECK-DBG-LABEL: LV: Checking a loop in "test3"
; CHECK-DBG: LV: The max safe VF is: vscale x 2.
; CHECK-DBG: LV: The max safe scalable VF is: vscale x 2.
; CHECK-DBG: LV: Using user VF vscale x 2.
; CHECK-LABEL: @test3
; CHECK: <vscale x 2 x i32>
@ -161,7 +163,8 @@ exit:
; test4
;
; Scalable vectorization feasible, but the VF is unsafe. Should clamp.
; Scalable vectorization feasible, but the given VF is unsafe. Should ignore
; the hint and leave it to the vectorizer to pick a more suitable VF.
;
; Specifies a vector of <vscale x 4 x i32>, i.e. maximum of 64 x i32 with 4
; words per 128-bits (packed).
@ -173,15 +176,16 @@ exit:
; }
; }
;
; Max fixed VF=32, Max scalable VF=2, unsafe to vectorize. Should clamp to 2.
; Max fixed VF=32, Max scalable VF=2, unsafe to vectorize.
; CHECK-DBG-LABEL: LV: Checking a loop in "test4"
; CHECK-DBG: LV: The max safe VF is: vscale x 2.
; CHECK-DBG: LV: User VF=vscale x 4 is unsafe, clamping to max safe VF=vscale x 2.
; CHECK-DBG: remark: <unknown>:0:0: User-specified vectorization factor vscale x 4 is unsafe, clamping to maximum safe vectorization factor vscale x 2
; CHECK-DBG: LV: Using max VF vscale x 2
; CHECK-DBG: LV: The max safe scalable VF is: vscale x 2.
; CHECK-DBG: LV: User VF=vscale x 4 is unsafe. Ignoring scalable UserVF.
; CHECK-DBG: remark: <unknown>:0:0: User-specified vectorization factor vscale x 4 is unsafe. Ignoring the hint to let the compiler pick a suitable VF.
; CHECK-DBG: Found feasible scalable VF = vscale x 2
; CHECK-DBG: LV: Selecting VF: 4.
; CHECK-LABEL: @test4
; CHECK: <vscale x 2 x i32>
; CHECK: <4 x i32>
define void @test4(i32* %a, i32* %b) {
entry:
br label %loop
@ -225,7 +229,7 @@ exit:
; Max fixed VF=128, Max scalable VF=8, safe to vectorize.
; CHECK-DBG-LABEL: LV: Checking a loop in "test5"
; CHECK-DBG: LV: The max safe VF is: vscale x 8.
; CHECK-DBG: LV: The max safe scalable VF is: vscale x 8.
; CHECK-DBG: LV: Using user VF vscale x 4
; CHECK-LABEL: @test5
; CHECK: <vscale x 4 x i32>
@ -257,7 +261,8 @@ exit:
; test6
;
; Scalable vectorization feasible, but the VF is unsafe. Should clamp.
; Scalable vectorization feasible, but the VF is unsafe. Should ignore
; the hint and leave it to the vectorizer to pick a more suitable VF.
;
; Specifies a vector of <vscale x 16 x i32>, i.e. maximum of 256 x i32.
;
@ -268,15 +273,16 @@ exit:
; }
; }
;
; Max fixed VF=128, Max scalable VF=8, unsafe to vectorize. Should clamp to 8.
; Max fixed VF=128, Max scalable VF=8, unsafe to vectorize.
; CHECK-DBG-LABEL: LV: Checking a loop in "test6"
; CHECK-DBG: LV: The max safe VF is: vscale x 8.
; CHECK-DBG: LV: User VF=vscale x 16 is unsafe, clamping to max safe VF=vscale x 8.
; CHECK-DBG: remark: <unknown>:0:0: User-specified vectorization factor vscale x 16 is unsafe, clamping to maximum safe vectorization factor vscale x 8
; CHECK-DBG: LV: Using max VF vscale x 8
; CHECK-DBG: LV: The max safe scalable VF is: vscale x 8.
; CHECK-DBG: LV: User VF=vscale x 16 is unsafe. Ignoring scalable UserVF.
; CHECK-DBG: remark: <unknown>:0:0: User-specified vectorization factor vscale x 16 is unsafe. Ignoring the hint to let the compiler pick a suitable VF.
; CHECK-DBG: LV: Found feasible scalable VF = vscale x 4
; CHECK-DBG: Selecting VF: 4.
; CHECK-LABEL: @test6
; CHECK: <vscale x 8 x i32>
; CHECK: <4 x i32>
define void @test6(i32* %a, i32* %b) {
entry:
br label %loop
@ -304,8 +310,9 @@ exit:
!17 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
; CHECK-NO-SVE-LABEL: LV: Checking a loop in "test_no_sve"
; CHECK-NO-SVE: LV: Ignoring VF=vscale x 4 because target does not support scalable vectors.
; CHECK-NO-SVE: remark: <unknown>:0:0: Ignoring VF=vscale x 4 because target does not support scalable vectors.
; CHECK-NO-SVE: LV: Disabling scalable vectorization, because target does not support scalable vectors.
; CHECK-NO-SVE: remark: <unknown>:0:0: Disabling scalable vectorization, because target does not support scalable vectors.
; CHECK-NO-SVE: LV: User VF=vscale x 4 is unsafe. Ignoring scalable UserVF.
; CHECK-NO-SVE: LV: Selecting VF: 4.
; CHECK-NO-SVE: <4 x i32>
; CHECK-NO-SVE-NOT: <vscale x 4 x i32>
@ -337,8 +344,8 @@ exit:
; supported but max vscale is undefined.
;
; CHECK-NO-MAX-VSCALE-LABEL: LV: Checking a loop in "test_no_max_vscale"
; CHECK-NO-MAX-VSCALE: LV: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead.
; CEHCK-NO-MAX-VSCALE: The max safe VF is: 4.
; CEHCK-NO-MAX-VSCALE: The max safe fixed VF is: 4.
; CHECK-NO-MAX-VSCALE: LV: User VF=vscale x 4 is unsafe. Ignoring scalable UserVF.
; CHECK-NO-MAX-VSCALE: LV: Selecting VF: 4.
; CHECK-NO-MAX-VSCALE: <4 x i32>
define void @test_no_max_vscale(i32* %a, i32* %b) {

View File

@ -3,8 +3,8 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; CHECK: LV: Ignoring VF=vscale x 4 because target does not support scalable vectors.
; CHECK: remark: <unknown>:0:0: Ignoring VF=vscale x 4 because target does not support scalable vectors.
; CHECK: LV: Disabling scalable vectorization, because target does not support scalable vectors.
; CHECK: remark: <unknown>:0:0: Disabling scalable vectorization, because target does not support scalable vectors.
; CHECK: LV: The Widest register safe to use is: 32 bits.
define void @test1(i32* %a, i32* %b) {
entry: