From 51d648c119d7773ce6fb809353bd6bd14bca8818 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Thu, 29 Apr 2021 15:37:57 +0100 Subject: [PATCH] Revert "[LV] Calculate max feasible scalable VF." Temporarily reverting this patch due to some unexpected issue found by one of the PPC buildbots. This reverts commit 584e9b6e4b4987b882719923e640eed854613d91. --- .../llvm/Transforms/Vectorize/LoopVectorize.h | 7 - .../Transforms/Vectorize/LoopVectorize.cpp | 341 ++++++++---------- .../AArch64/scalable-reductions.ll | 42 +-- .../AArch64/scalable-vf-analysis.ll | 149 -------- .../LoopVectorize/AArch64/scalable-vf-hint.ll | 59 ++- .../LoopVectorize/scalable-vf-hint.ll | 4 +- 6 files changed, 191 insertions(+), 411 deletions(-) delete mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-analysis.ll diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h index ad6a4b561a9b..ecb44a7b1518 100644 --- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h @@ -174,13 +174,6 @@ void reportVectorizationFailure(const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I = nullptr); -/// Reports an informative message: print \p Msg for debugging purposes as well -/// as an optimization remark. Uses either \p I as location of the remark, or -/// otherwise \p TheLoop. -void reportVectorizationInfo(const StringRef OREMsg, const StringRef ORETag, - OptimizationRemarkEmitter *ORE, Loop *TheLoop, - Instruction *I = nullptr); - } // end namespace llvm #endif // LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZE_H diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 0fe92b3685e5..f8f47c475088 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1065,13 +1065,13 @@ void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) B.SetCurrentDebugLocation(DebugLoc()); } -/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I -/// is passed, the message relates to that particular instruction. +/// Write a record \p DebugMsg about vectorization failure to the debug +/// output stream. If \p I is passed, it is an instruction that prevents +/// vectorization. #ifndef NDEBUG -static void debugVectorizationMessage(const StringRef Prefix, - const StringRef DebugMsg, - Instruction *I) { - dbgs() << "LV: " << Prefix << DebugMsg; +static void debugVectorizationFailure(const StringRef DebugMsg, + Instruction *I) { + dbgs() << "LV: Not vectorizing: " << DebugMsg; if (I != nullptr) dbgs() << " " << *I; else @@ -1100,7 +1100,9 @@ static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, DL = I->getDebugLoc(); } - return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion); + OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion); + R << "loop not vectorized: "; + return R; } /// Return a value for Step multiplied by VF. @@ -1121,24 +1123,12 @@ Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) { } void reportVectorizationFailure(const StringRef DebugMsg, - const StringRef OREMsg, const StringRef ORETag, - OptimizationRemarkEmitter *ORE, Loop *TheLoop, - Instruction *I) { - LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I)); + const StringRef OREMsg, const StringRef ORETag, + OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) { + LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I)); LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); - ORE->emit( - createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I) - << "loop not vectorized: " << OREMsg); -} - -void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, - OptimizationRemarkEmitter *ORE, Loop *TheLoop, - Instruction *I) { - LLVM_DEBUG(debugVectorizationMessage("", Msg, I)); - LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); - ORE->emit( - createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I) - << Msg); + ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), + ORETag, TheLoop, I) << OREMsg); } } // end namespace llvm @@ -1633,18 +1623,6 @@ private: ElementCount computeFeasibleMaxVF(unsigned ConstTripCount, ElementCount UserVF); - /// \return the maximized element count based on the targets vector - /// registers and the loop trip-count, but limited to a maximum safe VF. - /// This is a helper function of computeFeasibleMaxVF. - ElementCount getMaximizedVFForTarget(unsigned ConstTripCount, - unsigned SmallestType, - unsigned WidestType, - ElementCount MaxSafeVF); - - /// \return the maximum legal scalable VF, based on the safe max number - /// of elements. - ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); - /// The vectorization cost is a combination of the cost itself and a boolean /// indicating whether any of the contributing operations will actually /// operate on @@ -5598,129 +5576,6 @@ bool LoopVectorizationCostModel::runtimeChecksRequired() { return false; } -ElementCount -LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { - if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { - reportVectorizationInfo( - "Disabling scalable vectorization, because target does not " - "support scalable vectors.", - "ScalableVectorsUnsupported", ORE, TheLoop); - return ElementCount::getScalable(0); - } - - auto MaxScalableVF = ElementCount::getScalable(1u << 16); - - // Disable scalable vectorization if the loop contains unsupported reductions. - // Test that the loop-vectorizer can legalize all operations for this MaxVF. - // FIXME: While for scalable vectors this is currently sufficient, this should - // be replaced by a more detailed mechanism that filters out specific VFs, - // instead of invalidating vectorization for a whole set of VFs based on the - // MaxVF. - if (!canVectorizeReductions(MaxScalableVF)) { - reportVectorizationInfo( - "Scalable vectorization not supported for the reduction " - "operations found in this loop.", - "ScalableVFUnfeasible", ORE, TheLoop); - return ElementCount::getScalable(0); - } - - if (Legal->isSafeForAnyVectorWidth()) - return MaxScalableVF; - - // Limit MaxScalableVF by the maximum safe dependence distance. - Optional MaxVScale = TTI.getMaxVScale(); - MaxScalableVF = ElementCount::getScalable( - MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); - if (!MaxScalableVF) - reportVectorizationInfo( - "Max legal vector width too small, scalable vectorization " - "unfeasible.", - "ScalableVFUnfeasible", ORE, TheLoop); - - return MaxScalableVF; -} - -ElementCount -LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, - ElementCount UserVF) { - MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); - unsigned SmallestType, WidestType; - std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); - - // Get the maximum safe dependence distance in bits computed by LAA. - // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from - // the memory accesses that is most restrictive (involved in the smallest - // dependence distance). - unsigned MaxSafeElements = - PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); - - auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); - auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); - - LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF - << ".\n"); - LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF - << ".\n"); - - // First analyze the UserVF, fall back if the UserVF should be ignored. - if (UserVF) { - auto MaxSafeUserVF = - UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; - - if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) - return UserVF; - - assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); - - // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it - // is better to ignore the hint and let the compiler choose a suitable VF. - if (!UserVF.isScalable()) { - LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF - << " is unsafe, clamping to max safe VF=" - << MaxSafeFixedVF << ".\n"); - ORE->emit([&]() { - return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", - TheLoop->getStartLoc(), - TheLoop->getHeader()) - << "User-specified vectorization factor " - << ore::NV("UserVectorizationFactor", UserVF) - << " is unsafe, clamping to maximum safe vectorization factor " - << ore::NV("VectorizationFactor", MaxSafeFixedVF); - }); - return MaxSafeFixedVF; - } - - LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF - << " is unsafe. Ignoring scalable UserVF.\n"); - ORE->emit([&]() { - return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", - TheLoop->getStartLoc(), - TheLoop->getHeader()) - << "User-specified vectorization factor " - << ore::NV("UserVectorizationFactor", UserVF) - << " is unsafe. Ignoring the hint to let the compiler pick a " - "suitable VF."; - }); - } - - LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType - << " / " << WidestType << " bits.\n"); - - ElementCount MaxFixedVF = ElementCount::getFixed(1); - if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, - WidestType, MaxSafeFixedVF)) - MaxFixedVF = MaxVF; - - if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, - WidestType, MaxSafeScalableVF)) - // FIXME: Return scalable VF as well (to be added in future patch). - if (MaxVF.isScalable()) - LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF - << "\n"); - - return MaxFixedVF; -} - Optional LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { @@ -5861,61 +5716,149 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { return None; } -ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( - unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, - ElementCount MaxSafeVF) { - bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); - TypeSize WidestRegister = TTI.getRegisterBitWidth( - ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector - : TargetTransformInfo::RGK_FixedWidthVector); +ElementCount +LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, + ElementCount UserVF) { + bool IgnoreScalableUserVF = UserVF.isScalable() && + !TTI.supportsScalableVectors() && + !ForceTargetSupportsScalableVectors; + if (IgnoreScalableUserVF) { + LLVM_DEBUG( + dbgs() << "LV: Ignoring VF=" << UserVF + << " because target does not support scalable vectors.\n"); + ORE->emit([&]() { + return OptimizationRemarkAnalysis(DEBUG_TYPE, "IgnoreScalableUserVF", + TheLoop->getStartLoc(), + TheLoop->getHeader()) + << "Ignoring VF=" << ore::NV("UserVF", UserVF) + << " because target does not support scalable vectors."; + }); + } - // Convenience function to return the minimum of two ElementCounts. - auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { - assert((LHS.isScalable() == RHS.isScalable()) && - "Scalable flags must match"); - return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; - }; + // Beyond this point two scenarios are handled. If UserVF isn't specified + // then a suitable VF is chosen. If UserVF is specified and there are + // dependencies, check if it's legal. However, if a UserVF is specified and + // there are no dependencies, then there's nothing to do. + if (UserVF.isNonZero() && !IgnoreScalableUserVF) { + if (!canVectorizeReductions(UserVF)) { + reportVectorizationFailure( + "LV: Scalable vectorization not supported for the reduction " + "operations found in this loop. Using fixed-width " + "vectorization instead.", + "Scalable vectorization not supported for the reduction operations " + "found in this loop. Using fixed-width vectorization instead.", + "ScalableVFUnfeasible", ORE, TheLoop); + return computeFeasibleMaxVF( + ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue())); + } + + if (Legal->isSafeForAnyVectorWidth()) + return UserVF; + } + + MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); + unsigned SmallestType, WidestType; + std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); + unsigned WidestRegister = + TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) + .getFixedSize(); + + // Get the maximum safe dependence distance in bits computed by LAA. + // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from + // the memory accesses that is most restrictive (involved in the smallest + // dependence distance). + unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits(); + + // If the user vectorization factor is legally unsafe, clamp it to a safe + // value. Otherwise, return as is. + if (UserVF.isNonZero() && !IgnoreScalableUserVF) { + unsigned MaxSafeElements = + PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType); + ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements); + + if (UserVF.isScalable()) { + Optional MaxVScale = TTI.getMaxVScale(); + + // Scale VF by vscale before checking if it's safe. + MaxSafeVF = ElementCount::getScalable( + MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); + + if (MaxSafeVF.isZero()) { + // The dependence distance is too small to use scalable vectors, + // fallback on fixed. + LLVM_DEBUG( + dbgs() + << "LV: Max legal vector width too small, scalable vectorization " + "unfeasible. Using fixed-width vectorization instead.\n"); + ORE->emit([&]() { + return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible", + TheLoop->getStartLoc(), + TheLoop->getHeader()) + << "Max legal vector width too small, scalable vectorization " + << "unfeasible. Using fixed-width vectorization instead."; + }); + return computeFeasibleMaxVF( + ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue())); + } + } + + LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n"); + + if (ElementCount::isKnownLE(UserVF, MaxSafeVF)) + return UserVF; + + LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF + << " is unsafe, clamping to max safe VF=" << MaxSafeVF + << ".\n"); + ORE->emit([&]() { + return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", + TheLoop->getStartLoc(), + TheLoop->getHeader()) + << "User-specified vectorization factor " + << ore::NV("UserVectorizationFactor", UserVF) + << " is unsafe, clamping to maximum safe vectorization factor " + << ore::NV("VectorizationFactor", MaxSafeVF); + }); + return MaxSafeVF; + } + + WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits); // Ensure MaxVF is a power of 2; the dependence distance bound may not be. // Note that both WidestRegister and WidestType may not be a powers of 2. - auto MaxVectorElementCount = ElementCount::get( - PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType), - ComputeScalableMaxVF); - MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); - LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " - << (MaxVectorElementCount * WidestType) << " bits.\n"); + auto MaxVectorSize = + ElementCount::getFixed(PowerOf2Floor(WidestRegister / WidestType)); - if (!MaxVectorElementCount) { + LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType + << " / " << WidestType << " bits.\n"); + LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " + << WidestRegister << " bits.\n"); + + assert(MaxVectorSize.getFixedValue() <= WidestRegister && + "Did not expect to pack so many elements" + " into one vector!"); + if (MaxVectorSize.getFixedValue() == 0) { LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); return ElementCount::getFixed(1); - } - - const auto TripCountEC = ElementCount::getFixed(ConstTripCount); - if (ConstTripCount && - ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) && - isPowerOf2_32(ConstTripCount)) { + } else if (ConstTripCount && ConstTripCount < MaxVectorSize.getFixedValue() && + isPowerOf2_32(ConstTripCount)) { // We need to clamp the VF to be the ConstTripCount. There is no point in - // choosing a higher viable VF as done in the loop below. If - // MaxVectorElementCount is scalable, we only fall back on a fixed VF when - // the TC is less than or equal to the known number of lanes. + // choosing a higher viable VF as done in the loop below. LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " << ConstTripCount << "\n"); - return TripCountEC; + return ElementCount::getFixed(ConstTripCount); } - ElementCount MaxVF = MaxVectorElementCount; + ElementCount MaxVF = MaxVectorSize; if (TTI.shouldMaximizeVectorBandwidth() || (MaximizeBandwidth && isScalarEpilogueAllowed())) { - auto MaxVectorElementCountMaxBW = ElementCount::get( - PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), - ComputeScalableMaxVF); - MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); - // Collect all viable vectorization factors larger than the default MaxVF - // (i.e. MaxVectorElementCount). + // (i.e. MaxVectorSize). SmallVector VFs; - for (ElementCount VS = MaxVectorElementCount * 2; - ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) + auto MaxVectorSizeMaxBW = + ElementCount::getFixed(WidestRegister / SmallestType); + for (ElementCount VS = MaxVectorSize * 2; + ElementCount::isKnownLE(VS, MaxVectorSizeMaxBW); VS *= 2) VFs.push_back(VS); // For each VF calculate its register usage. @@ -5936,7 +5879,7 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( } } if (ElementCount MinVF = - TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { + TTI.getMinimumVF(SmallestType, /*IsScalable=*/false)) { if (ElementCount::isKnownLT(MaxVF, MinVF)) { LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF << ") with target's minimum: " << MinVF << '\n'); diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll index 7b410dd73a3c..582bf4dc5747 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll @@ -221,7 +221,7 @@ for.end: ret float %add } -; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. +; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. Using fixed-width vectorization instead. ; CHECK-REMARK: vectorized loop (vectorization width: 8, interleaved count: 2) define bfloat @fadd_fast_bfloat(bfloat* noalias nocapture readonly %a, i64 %n) { ; CHECK-LABEL: @fadd_fast_bfloat @@ -322,18 +322,18 @@ for.end: ; MUL -; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. -; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 2) +; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. Using fixed-width vectorization instead. +; CHECK-REMARK: vectorized loop (vectorization width: 8, interleaved count: 2) define i32 @mul(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) { ; CHECK-LABEL: @mul ; CHECK: vector.body: -; CHECK: %[[LOAD1:.*]] = load <4 x i32> -; CHECK: %[[LOAD2:.*]] = load <4 x i32> -; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD1]] -; CHECK: %[[MUL2:.*]] = mul <4 x i32> %[[LOAD2]] +; CHECK: %[[LOAD1:.*]] = load <8 x i32> +; CHECK: %[[LOAD2:.*]] = load <8 x i32> +; CHECK: %[[MUL1:.*]] = mul <8 x i32> %[[LOAD1]] +; CHECK: %[[MUL2:.*]] = mul <8 x i32> %[[LOAD2]] ; CHECK: middle.block: -; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]] -; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]]) +; CHECK: %[[RDX:.*]] = mul <8 x i32> %[[MUL2]], %[[MUL1]] +; CHECK: call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %[[RDX]]) entry: br label %for.body @@ -352,22 +352,22 @@ for.end: ; preds = %for.body, %entry } ; Note: This test was added to ensure we always check the legality of reductions (end emit a warning if necessary) before checking for memory dependencies -; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. -; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 2) +; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. Using fixed-width vectorization instead. +; CHECK-REMARK: vectorized loop (vectorization width: 8, interleaved count: 2) define i32 @memory_dependence(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i64 %n) { ; CHECK-LABEL: @memory_dependence ; CHECK: vector.body: -; CHECK: %[[LOAD1:.*]] = load <4 x i32> -; CHECK: %[[LOAD2:.*]] = load <4 x i32> -; CHECK: %[[LOAD3:.*]] = load <4 x i32> -; CHECK: %[[LOAD4:.*]] = load <4 x i32> -; CHECK: %[[ADD1:.*]] = add nsw <4 x i32> %[[LOAD3]], %[[LOAD1]] -; CHECK: %[[ADD2:.*]] = add nsw <4 x i32> %[[LOAD4]], %[[LOAD2]] -; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD3]] -; CHECK: %[[MUL2:.*]] = mul <4 x i32> %[[LOAD4]] +; CHECK: %[[LOAD1:.*]] = load <8 x i32> +; CHECK: %[[LOAD2:.*]] = load <8 x i32> +; CHECK: %[[LOAD3:.*]] = load <8 x i32> +; CHECK: %[[LOAD4:.*]] = load <8 x i32> +; CHECK: %[[ADD1:.*]] = add nsw <8 x i32> %[[LOAD3]], %[[LOAD1]] +; CHECK: %[[ADD2:.*]] = add nsw <8 x i32> %[[LOAD4]], %[[LOAD2]] +; CHECK: %[[MUL1:.*]] = mul <8 x i32> %[[LOAD3]] +; CHECK: %[[MUL2:.*]] = mul <8 x i32> %[[LOAD4]] ; CHECK: middle.block: -; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]] -; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]]) +; CHECK: %[[RDX:.*]] = mul <8 x i32> %[[MUL2]], %[[MUL1]] +; CHECK: call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %[[RDX]]) entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-analysis.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-analysis.ll deleted file mode 100644 index 6fe546439a13..000000000000 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-analysis.ll +++ /dev/null @@ -1,149 +0,0 @@ -; REQUIRES: asserts -; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -force-target-instruction-cost=1 -loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK_SCALABLE_ON -; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -force-target-instruction-cost=1 -loop-vectorize -S -debug-only=loop-vectorize -vectorizer-maximize-bandwidth < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK_SCALABLE_ON_MAXBW - -; Test that the MaxVF for the following loop, that has no dependence distances, -; is calculated as vscale x 4 (max legal SVE vector size) or vscale x 16 -; (maximized bandwidth for i8 in the loop). -define void @test0(i32* %a, i8* %b, i32* %c) { -; CHECK: LV: Checking a loop in "test0" -; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4 -; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 16 -entry: - br label %loop - -loop: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] - %arrayidx = getelementptr inbounds i32, i32* %c, i64 %iv - %0 = load i32, i32* %arrayidx, align 4 - %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv - %1 = load i8, i8* %arrayidx2, align 4 - %zext = zext i8 %1 to i32 - %add = add nsw i32 %zext, %0 - %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %iv - store i32 %add, i32* %arrayidx5, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 1024 - br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0 - -exit: - ret void -} - -; Test that the MaxVF for the following loop, with a dependence distance -; of 64 elements, is calculated as (maxvscale = 16) * 4. -define void @test1(i32* %a, i8* %b) { -; CHECK: LV: Checking a loop in "test1" -; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4 -; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 4 -entry: - br label %loop - -loop: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] - %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv - %0 = load i32, i32* %arrayidx, align 4 - %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv - %1 = load i8, i8* %arrayidx2, align 4 - %zext = zext i8 %1 to i32 - %add = add nsw i32 %zext, %0 - %2 = add nuw nsw i64 %iv, 64 - %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2 - store i32 %add, i32* %arrayidx5, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 1024 - br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0 - -exit: - ret void -} - -; Test that the MaxVF for the following loop, with a dependence distance -; of 32 elements, is calculated as (maxvscale = 16) * 2. -define void @test2(i32* %a, i8* %b) { -; CHECK: LV: Checking a loop in "test2" -; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 2 -; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 2 -entry: - br label %loop - -loop: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] - %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv - %0 = load i32, i32* %arrayidx, align 4 - %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv - %1 = load i8, i8* %arrayidx2, align 4 - %zext = zext i8 %1 to i32 - %add = add nsw i32 %zext, %0 - %2 = add nuw nsw i64 %iv, 32 - %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2 - store i32 %add, i32* %arrayidx5, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 1024 - br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0 - -exit: - ret void -} - -; Test that the MaxVF for the following loop, with a dependence distance -; of 16 elements, is calculated as (maxvscale = 16) * 1. -define void @test3(i32* %a, i8* %b) { -; CHECK: LV: Checking a loop in "test3" -; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 1 -; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 1 -entry: - br label %loop - -loop: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] - %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv - %0 = load i32, i32* %arrayidx, align 4 - %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv - %1 = load i8, i8* %arrayidx2, align 4 - %zext = zext i8 %1 to i32 - %add = add nsw i32 %zext, %0 - %2 = add nuw nsw i64 %iv, 16 - %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2 - store i32 %add, i32* %arrayidx5, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 1024 - br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0 - -exit: - ret void -} - -; Test the fallback mechanism when scalable vectors are not feasible due -; to e.g. dependence distance. For the '-scalable-vectorization=exclusive' -; it shouldn't try to vectorize with fixed-width vectors. -define void @test4(i32* %a, i32* %b) { -; CHECK: LV: Checking a loop in "test4" -; CHECK_SCALABLE_ON-NOT: LV: Found feasible scalable VF -; CHECK_SCALABLE_ON_MAXBW-NOT: LV: Found feasible scalable VF -entry: - br label %loop - -loop: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] - %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv - %0 = load i32, i32* %arrayidx, align 4 - %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv - %1 = load i32, i32* %arrayidx2, align 4 - %add = add nsw i32 %1, %0 - %2 = add nuw nsw i64 %iv, 8 - %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2 - store i32 %add, i32* %arrayidx5, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 1024 - br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !2 - -exit: - ret void -} - -!0 = distinct !{!0, !1} -!1 = !{!"llvm.loop.vectorize.enable", i1 true} -!2 = distinct !{!2, !3, !4} -!3 = !{!"llvm.loop.vectorize.enable", i1 true} -!4 = !{!"llvm.loop.vectorize.scalable.enable", i1 true} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll index 33d75e0a8565..a6b50e03768c 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll @@ -37,10 +37,9 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" ; unless max(vscale)=2 it's unsafe to vectorize. For SVE max(vscale)=16, check ; fixed-width vectorization is used instead. -; CHECK-DBG: LV: Checking a loop in "test1" -; CHECK-DBG: LV: Max legal vector width too small, scalable vectorization unfeasible. -; CHECK-DBG: remark: :0:0: Max legal vector width too small, scalable vectorization unfeasible. -; CHECK-DBG: LV: The max safe fixed VF is: 8. +; CHECK-DBG: LV: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead. +; CHECK-DBG: remark: :0:0: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead. +; CHECK-DBG: LV: The max safe VF is: 8. ; CHECK-DBG: LV: Selecting VF: 4. ; CHECK-LABEL: @test1 ; CHECK: <4 x i32> @@ -81,10 +80,9 @@ exit: ; } ; } -; CHECK-DBG: LV: Checking a loop in "test2" -; CHECK-DBG: LV: Max legal vector width too small, scalable vectorization unfeasible. -; CHECK-DBG: LV: The max safe fixed VF is: 4. -; CHECK-DBG: LV: User VF=vscale x 8 is unsafe. Ignoring scalable UserVF. +; CHECK-DBG: LV: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead. +; CHECK-DBG: LV: The max safe VF is: 4. +; CHECK-DBG: LV: User VF=8 is unsafe, clamping to max safe VF=4. ; CHECK-DBG: LV: Selecting VF: 4. ; CHECK-LABEL: @test2 ; CHECK: <4 x i32> @@ -131,7 +129,7 @@ exit: ; Max fixed VF=32, Max scalable VF=2, safe to vectorize. ; CHECK-DBG-LABEL: LV: Checking a loop in "test3" -; CHECK-DBG: LV: The max safe scalable VF is: vscale x 2. +; CHECK-DBG: LV: The max safe VF is: vscale x 2. ; CHECK-DBG: LV: Using user VF vscale x 2. ; CHECK-LABEL: @test3 ; CHECK: @@ -163,8 +161,7 @@ exit: ; test4 ; -; Scalable vectorization feasible, but the given VF is unsafe. Should ignore -; the hint and leave it to the vectorizer to pick a more suitable VF. +; Scalable vectorization feasible, but the VF is unsafe. Should clamp. ; ; Specifies a vector of , i.e. maximum of 64 x i32 with 4 ; words per 128-bits (packed). @@ -176,16 +173,15 @@ exit: ; } ; } ; -; Max fixed VF=32, Max scalable VF=2, unsafe to vectorize. +; Max fixed VF=32, Max scalable VF=2, unsafe to vectorize. Should clamp to 2. ; CHECK-DBG-LABEL: LV: Checking a loop in "test4" -; CHECK-DBG: LV: The max safe scalable VF is: vscale x 2. -; CHECK-DBG: LV: User VF=vscale x 4 is unsafe. Ignoring scalable UserVF. -; CHECK-DBG: remark: :0:0: User-specified vectorization factor vscale x 4 is unsafe. Ignoring the hint to let the compiler pick a suitable VF. -; CHECK-DBG: Found feasible scalable VF = vscale x 2 -; CHECK-DBG: LV: Selecting VF: 4. +; CHECK-DBG: LV: The max safe VF is: vscale x 2. +; CHECK-DBG: LV: User VF=vscale x 4 is unsafe, clamping to max safe VF=vscale x 2. +; CHECK-DBG: remark: :0:0: User-specified vectorization factor vscale x 4 is unsafe, clamping to maximum safe vectorization factor vscale x 2 +; CHECK-DBG: LV: Using max VF vscale x 2 ; CHECK-LABEL: @test4 -; CHECK: <4 x i32> +; CHECK: define void @test4(i32* %a, i32* %b) { entry: br label %loop @@ -229,7 +225,7 @@ exit: ; Max fixed VF=128, Max scalable VF=8, safe to vectorize. ; CHECK-DBG-LABEL: LV: Checking a loop in "test5" -; CHECK-DBG: LV: The max safe scalable VF is: vscale x 8. +; CHECK-DBG: LV: The max safe VF is: vscale x 8. ; CHECK-DBG: LV: Using user VF vscale x 4 ; CHECK-LABEL: @test5 ; CHECK: @@ -261,8 +257,7 @@ exit: ; test6 ; -; Scalable vectorization feasible, but the VF is unsafe. Should ignore -; the hint and leave it to the vectorizer to pick a more suitable VF. +; Scalable vectorization feasible, but the VF is unsafe. Should clamp. ; ; Specifies a vector of , i.e. maximum of 256 x i32. ; @@ -273,16 +268,15 @@ exit: ; } ; } ; -; Max fixed VF=128, Max scalable VF=8, unsafe to vectorize. +; Max fixed VF=128, Max scalable VF=8, unsafe to vectorize. Should clamp to 8. ; CHECK-DBG-LABEL: LV: Checking a loop in "test6" -; CHECK-DBG: LV: The max safe scalable VF is: vscale x 8. -; CHECK-DBG: LV: User VF=vscale x 16 is unsafe. Ignoring scalable UserVF. -; CHECK-DBG: remark: :0:0: User-specified vectorization factor vscale x 16 is unsafe. Ignoring the hint to let the compiler pick a suitable VF. -; CHECK-DBG: LV: Found feasible scalable VF = vscale x 4 -; CHECK-DBG: Selecting VF: 4. +; CHECK-DBG: LV: The max safe VF is: vscale x 8. +; CHECK-DBG: LV: User VF=vscale x 16 is unsafe, clamping to max safe VF=vscale x 8. +; CHECK-DBG: remark: :0:0: User-specified vectorization factor vscale x 16 is unsafe, clamping to maximum safe vectorization factor vscale x 8 +; CHECK-DBG: LV: Using max VF vscale x 8 ; CHECK-LABEL: @test6 -; CHECK: <4 x i32> +; CHECK: define void @test6(i32* %a, i32* %b) { entry: br label %loop @@ -310,9 +304,8 @@ exit: !17 = !{!"llvm.loop.vectorize.scalable.enable", i1 true} ; CHECK-NO-SVE-LABEL: LV: Checking a loop in "test_no_sve" -; CHECK-NO-SVE: LV: Disabling scalable vectorization, because target does not support scalable vectors. -; CHECK-NO-SVE: remark: :0:0: Disabling scalable vectorization, because target does not support scalable vectors. -; CHECK-NO-SVE: LV: User VF=vscale x 4 is unsafe. Ignoring scalable UserVF. +; CHECK-NO-SVE: LV: Ignoring VF=vscale x 4 because target does not support scalable vectors. +; CHECK-NO-SVE: remark: :0:0: Ignoring VF=vscale x 4 because target does not support scalable vectors. ; CHECK-NO-SVE: LV: Selecting VF: 4. ; CHECK-NO-SVE: <4 x i32> ; CHECK-NO-SVE-NOT: @@ -344,8 +337,8 @@ exit: ; supported but max vscale is undefined. ; ; CHECK-NO-MAX-VSCALE-LABEL: LV: Checking a loop in "test_no_max_vscale" -; CEHCK-NO-MAX-VSCALE: The max safe fixed VF is: 4. -; CHECK-NO-MAX-VSCALE: LV: User VF=vscale x 4 is unsafe. Ignoring scalable UserVF. +; CHECK-NO-MAX-VSCALE: LV: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead. +; CEHCK-NO-MAX-VSCALE: The max safe VF is: 4. ; CHECK-NO-MAX-VSCALE: LV: Selecting VF: 4. ; CHECK-NO-MAX-VSCALE: <4 x i32> define void @test_no_max_vscale(i32* %a, i32* %b) { diff --git a/llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll b/llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll index f99d87f8ad79..1361ba59bca2 100644 --- a/llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll +++ b/llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll @@ -3,8 +3,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -; CHECK: LV: Disabling scalable vectorization, because target does not support scalable vectors. -; CHECK: remark: :0:0: Disabling scalable vectorization, because target does not support scalable vectors. +; CHECK: LV: Ignoring VF=vscale x 4 because target does not support scalable vectors. +; CHECK: remark: :0:0: Ignoring VF=vscale x 4 because target does not support scalable vectors. ; CHECK: LV: The Widest register safe to use is: 32 bits. define void @test1(i32* %a, i32* %b) { entry: