[LV] Legalize scalable VF hints

In the following loop: void foo(int *a, int *b, int N) { for (int i=0; i<N; ++i) a[i + 4] = a[i] + b[i]; } The loop dependence constrains the VF to a maximum of (4, fixed), which would mean using <4 x i32> as the vector type in vectorization. Extending this to scalable vectorization, a VF of (4, scalable) implies a vector type of <vscale x 4 x i32>. To determine if this is legal vscale must be taken into account. For this example, unless max(vscale)=1, it's unsafe to vectorize. For SVE, the number of bits in an SVE register is architecturally defined to be a multiple of 128 bits with a maximum of 2048 bits, thus the maximum vscale is 16. In the loop above it is therefore unfeasible to vectorize with SVE. However, in this loop: void foo(int *a, int *b, int N) { #pragma clang loop vectorize_width(X, scalable) for (int i=0; i<N; ++i) a[i + 32] = a[i] + b[i]; } As long as max(vscale) multiplied by the number of lanes 'X' doesn't exceed the dependence distance, it is safe to vectorize. For SVE a VF of (2, scalable) is within this constraint, since a vector of <16 x 2 x 32> will have no dependencies between lanes. For any number of lanes larger than this it would be unsafe to vectorize. This patch extends 'computeFeasibleMaxVF' to legalize scalable VFs specified as loop hints, implementing the following behaviour: * If the backend does not support scalable vectors, ignore the hint. * If scalable vectorization is unfeasible given the loop dependence, like in the first example above for SVE, then use a fixed VF. * Accept scalable VFs if it's safe to do so. * Otherwise, clamp scalable VFs that exceed the maximum safe VF. Reviewed By: sdesmalen, fhahn, david-arm Differential Revision: https://reviews.llvm.org/D91718
2020-11-16 11:02:14 +00:00 · 2020-11-16 11:02:14 +00:00 · 1e7efd397a
parent eeba70a463
commit 1e7efd397a
9 changed files with 522 additions and 45 deletions
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@ -205,6 +205,12 @@ public:
    return Status == VectorizationSafetyStatus::Safe;
  }

+  /// Return true if the number of elements that are safe to operate on
+  /// simultaneously is not bounded.
+  bool isSafeForAnyVectorWidth() const {
+    return MaxSafeVectorWidthInBits == UINT_MAX;
+  }
+
  /// The maximum number of bytes of a vector register we can vectorize
  /// the accesses safely with.
  uint64_t getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; }
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@ -325,6 +325,10 @@ public:

  const LoopAccessInfo *getLAI() const { return LAI; }

+  bool isSafeForAnyVectorWidth() const {
+    return LAI->getDepChecker().isSafeForAnyVectorWidth();
+  }
+
  unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); }

  uint64_t getMaxSafeVectorWidthInBits() const {
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@ -272,6 +272,12 @@ static cl::opt<unsigned> ForceTargetInstructionCost(
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

+static cl::opt<bool> ForceTargetSupportsScalableVectors(
+    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
+    cl::desc(
+        "Pretend that scalable vectors are supported, even if the target does "
+        "not support them. This flag should only be used for testing."));
+
 static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
@ -5592,6 +5598,30 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
 ElementCount
 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
                                                 ElementCount UserVF) {
+  bool IgnoreScalableUserVF = UserVF.isScalable() &&
+                              !TTI.supportsScalableVectors() &&
+                              !ForceTargetSupportsScalableVectors;
+  if (IgnoreScalableUserVF) {
+    LLVM_DEBUG(
+        dbgs() << "LV: Ignoring VF=" << UserVF
+               << " because target does not support scalable vectors.\n");
+    ORE->emit([&]() {
+      return OptimizationRemarkAnalysis(DEBUG_TYPE, "IgnoreScalableUserVF",
+                                        TheLoop->getStartLoc(),
+                                        TheLoop->getHeader())
+             << "Ignoring VF=" << ore::NV("UserVF", UserVF)
+             << " because target does not support scalable vectors.";
+    });
+  }
+
+  // Beyond this point two scenarios are handled. If UserVF isn't specified
+  // then a suitable VF is chosen. If UserVF is specified and there are
+  // dependencies, check if it's legal. However, if a UserVF is specified and
+  // there are no dependencies, then there's nothing to do.
+  if (UserVF.isNonZero() && !IgnoreScalableUserVF &&
+      Legal->isSafeForAnyVectorWidth())
+    return UserVF;
+
  MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
  unsigned SmallestType, WidestType;
  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
@ -5603,15 +5633,42 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
  // dependence distance).
  unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();

-  if (UserVF.isNonZero()) {
-    // For now, don't verify legality of scalable vectors.
-    // This will be addressed properly in https://reviews.llvm.org/D91718.
-    if (UserVF.isScalable())
-      return UserVF;
+  // If the user vectorization factor is legally unsafe, clamp it to a safe
+  // value. Otherwise, return as is.
+  if (UserVF.isNonZero() && !IgnoreScalableUserVF) {
+    unsigned MaxSafeElements =
+        PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
+    ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements);

-    // If legally unsafe, clamp the user vectorization factor to a safe value.
-    unsigned MaxSafeVF = PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
-    if (UserVF.getFixedValue() <= MaxSafeVF)
+    if (UserVF.isScalable()) {
+      Optional<unsigned> MaxVScale = TTI.getMaxVScale();
+
+      // Scale VF by vscale before checking if it's safe.
+      MaxSafeVF = ElementCount::getScalable(
+          MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
+
+      if (MaxSafeVF.isZero()) {
+        // The dependence distance is too small to use scalable vectors,
+        // fallback on fixed.
+        LLVM_DEBUG(
+            dbgs()
+            << "LV: Max legal vector width too small, scalable vectorization "
+               "unfeasible. Using fixed-width vectorization instead.\n");
+        ORE->emit([&]() {
+          return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible",
+                                            TheLoop->getStartLoc(),
+                                            TheLoop->getHeader())
+                 << "Max legal vector width too small, scalable vectorization "
+                 << "unfeasible. Using fixed-width vectorization instead.";
+        });
+        return computeFeasibleMaxVF(
+            ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue()));
+      }
+    }
+
+    LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n");
+
+    if (ElementCount::isKnownLE(UserVF, MaxSafeVF))
      return UserVF;

    LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
@ -5626,7 +5683,7 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
             << " is unsafe, clamping to maximum safe vectorization factor "
             << ore::NV("VectorizationFactor", MaxSafeVF);
    });
-    return ElementCount::getFixed(MaxSafeVF);
+    return MaxSafeVF;
  }

  WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits);
@ -7426,17 +7483,24 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
  ElementCount MaxVF = MaybeMaxVF.getValue();
  assert(MaxVF.isNonZero() && "MaxVF is zero.");

-  if (!UserVF.isZero() && ElementCount::isKnownLE(UserVF, MaxVF)) {
-    LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
-    assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
+  bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF);
+  if (!UserVF.isZero() &&
+      (UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) {
+    // FIXME: MaxVF is temporarily used in place of UserVF for illegal scalable
+    // VFs here. This should be reverted to only use legal UserVFs once the
+    // loop below supports scalable VFs.
+    ElementCount VF = UserVFIsLegal ? UserVF : MaxVF;
+    LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max")
+                      << " VF " << VF << ".\n");
+    assert(isPowerOf2_32(VF.getKnownMinValue()) &&
           "VF needs to be a power of two");
    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
-    CM.selectUserVectorizationFactor(UserVF);
+    CM.selectUserVectorizationFactor(VF);
    CM.collectInLoopReductions();
-    buildVPlansWithVPRecipes(UserVF, UserVF);
+    buildVPlansWithVPRecipes(VF, VF);
    LLVM_DEBUG(printPlans(dbgs()));
-    return {{UserVF, 0}};
+    return {{VF, 0}};
  }

  assert(!MaxVF.isScalable() &&
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
@ -0,0 +1,368 @@
+; REQUIRES: asserts
+; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -loop-vectorize -S < %s 2>&1 | FileCheck %s
+; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -loop-vectorize -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck --check-prefix=CHECK-DBG %s
+; RUN: opt -mtriple=aarch64-none-linux-gnu -loop-vectorize -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck --check-prefix=CHECK-NO-SVE %s
+; RUN: opt -mtriple=aarch64-none-linux-gnu -loop-vectorize -force-target-supports-scalable-vectors=true -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck --check-prefix=CHECK-NO-MAX-VSCALE %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+
+; These tests validate the behaviour of scalable vectorization factor hints,
+; where the following applies:
+;
+; * If the backend does not support scalable vectors, ignore the hint and let
+;   the vectorizer pick a VF.
+; * If there are no dependencies and assuming the VF is a power of 2 the VF
+;   should be accepted. This applies to both fixed and scalable VFs.
+; * If the dependency is too small to use scalable vectors, change the VF to
+;   fixed, where existing behavior applies (clamping).
+; * If scalable vectorization is feasible given the dependency and the VF is
+;   valid, accept it. Otherwise, clamp to the max scalable VF.
+
+; test1
+;
+; Scalable vectorization unfeasible, clamp VF from (4, scalable) -> (4, fixed).
+;
+; The pragma applied to this loop implies a scalable vector <vscale x 4 x i32>
+; be used for vectorization. For fixed vectors the MaxVF=8, otherwise there
+; would be a dependence between vector lanes for vectors greater than 256 bits.
+;
+; void test1(int *a, int *b, int N) {
+;   #pragma clang loop vectorize(enable) vectorize_width(4, scalable)
+;   for (int i=0; i<N; ++i) {
+;     a[i + 8] = a[i] + b[i];
+;   }
+; }
+;
+; For scalable vectorization 'vscale' has to be considered: for this example,
+; unless max(vscale)<=2 it's unsafe to vectorize. For SVE max(vscale)=16, so
+; check that fixed-width vectorization is used instead.
+
+; CHECK-DBG: LV: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead.
+; CHECK-DBG: remark: <unknown>:0:0: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead.
+; CHECK-DBG: LV: The max safe VF is: 8.
+; CHECK-DBG: LV: Selecting VF: 4.
+; CHECK-LABEL: @test1
+; CHECK: <4 x i32>
+define void @test1(i32* %a, i32* %b) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %iv, 8
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0
+
+exit:
+  ret void
+}
+
+!0 = !{!0, !1, !2}
+!1 = !{!"llvm.loop.vectorize.width", i32 4}
+!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+
+; test2
+;
+; Scalable vectorization unfeasible, clamp VF from (8, scalable) -> (4, fixed).
+;
+; void test2(int *a, int *b, int N) {
+;   #pragma clang loop vectorize(enable) vectorize_width(8, scalable)
+;   for (int i=0; i<N; ++i) {
+;     a[i + 4] = a[i] + b[i];
+;   }
+; }
+
+; CHECK-DBG: LV: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead.
+; CHECK-DBG: LV: The max safe VF is: 4.
+; CHECK-DBG: LV: User VF=8 is unsafe, clamping to max safe VF=4.
+; CHECK-DBG: LV: Selecting VF: 4.
+; CHECK-LABEL: @test2
+; CHECK: <4 x i32>
+define void @test2(i32* %a, i32* %b) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %iv, 4
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !3
+
+exit:
+  ret void
+}
+
+!3 = !{!3, !4, !5}
+!4 = !{!"llvm.loop.vectorize.width", i32 8}
+!5 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+
+; test3
+;
+; Scalable vectorization feasible and the VF is valid.
+;
+; Specifies a vector of <vscale x 2 x i32>, i.e. maximum of 32 x i32 with 2
+; words per 128-bits (unpacked).
+;
+; void test3(int *a, int *b, int N) {
+;   #pragma clang loop vectorize(enable) vectorize_width(2, scalable)
+;   for (int i=0; i<N; ++i) {
+;     a[i + 32] = a[i] + b[i];
+;   }
+; }
+;
+; Max fixed VF=32, Max scalable VF=2, safe to vectorize.
+
+; CHECK-DBG-LABEL: LV: Checking a loop in "test3"
+; CHECK-DBG: LV: The max safe VF is: vscale x 2.
+; CHECK-DBG: LV: Using user VF vscale x 2.
+; CHECK-LABEL: @test3
+; CHECK: <vscale x 2 x i32>
+define void @test3(i32* %a, i32* %b) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %iv, 32
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !6
+
+exit:
+  ret void
+}
+
+!6 = !{!6, !7, !8}
+!7 = !{!"llvm.loop.vectorize.width", i32 2}
+!8 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+
+; test4
+;
+; Scalable vectorization feasible, but the VF is unsafe. Should clamp.
+;
+; Specifies a vector of <vscale x 4 x i32>, i.e. maximum of 64 x i32 with 4
+; words per 128-bits (packed).
+;
+; void test4(int *a, int *b, int N) {
+;   #pragma clang loop vectorize(enable) vectorize_width(4, scalable)
+;   for (int i=0; i<N; ++i) {
+;     a[i + 32] = a[i] + b[i];
+;   }
+; }
+;
+; Max fixed VF=32, Max scalable VF=2, unsafe to vectorize. Should clamp to 2.
+
+; CHECK-DBG-LABEL: LV: Checking a loop in "test4"
+; CHECK-DBG: LV: The max safe VF is: vscale x 2.
+; CHECK-DBG: LV: User VF=vscale x 4 is unsafe, clamping to max safe VF=vscale x 2.
+; CHECK-DBG: remark: <unknown>:0:0: User-specified vectorization factor vscale x 4 is unsafe, clamping to maximum safe vectorization factor vscale x 2
+; CHECK-DBG: LV: Using max VF vscale x 2
+; CHECK-LABEL: @test4
+; CHECK: <vscale x 2 x i32>
+define void @test4(i32* %a, i32* %b) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %iv, 32
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !9
+
+exit:
+  ret void
+}
+
+!9 = !{!9, !10, !11}
+!10 = !{!"llvm.loop.vectorize.width", i32 4}
+!11 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+
+; test5
+;
+; Scalable vectorization feasible and the VF is valid.
+;
+; Specifies a vector of <vscale x 4 x i32>, i.e. maximum of 64 x i32 with 4
+; words per 128-bits (packed).
+;
+; void test5(int *a, int *b, int N) {
+;   #pragma clang loop vectorize(enable) vectorize_width(4, scalable)
+;   for (int i=0; i<N; ++i) {
+;     a[i + 128] = a[i] + b[i];
+;   }
+; }
+;
+; Max fixed VF=128, Max scalable VF=8, safe to vectorize.
+
+; CHECK-DBG-LABEL: LV: Checking a loop in "test5"
+; CHECK-DBG: LV: The max safe VF is: vscale x 8.
+; CHECK-DBG: LV: Using user VF vscale x 4
+; CHECK-LABEL: @test5
+; CHECK: <vscale x 4 x i32>
+define void @test5(i32* %a, i32* %b) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %iv, 128
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !12
+
+exit:
+  ret void
+}
+
+!12 = !{!12, !13, !14}
+!13 = !{!"llvm.loop.vectorize.width", i32 4}
+!14 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+
+; test6
+;
+; Scalable vectorization feasible, but the VF is unsafe. Should clamp.
+;
+; Specifies a vector of <vscale x 16 x i32>, i.e. maximum of 256 x i32.
+;
+; void test6(int *a, int *b, int N) {
+;   #pragma clang loop vectorize(enable) vectorize_width(16, scalable)
+;   for (int i=0; i<N; ++i) {
+;     a[i + 128] = a[i] + b[i];
+;   }
+; }
+;
+; Max fixed VF=128, Max scalable VF=8, unsafe to vectorize. Should clamp to 8.
+
+; CHECK-DBG-LABEL: LV: Checking a loop in "test6"
+; CHECK-DBG: LV: The max safe VF is: vscale x 8.
+; CHECK-DBG: LV: User VF=vscale x 16 is unsafe, clamping to max safe VF=vscale x 8.
+; CHECK-DBG: remark: <unknown>:0:0: User-specified vectorization factor vscale x 16 is unsafe, clamping to maximum safe vectorization factor vscale x 8
+; CHECK-DBG: LV: Using max VF vscale x 8
+; CHECK-LABEL: @test6
+; CHECK: <vscale x 8 x i32>
+define void @test6(i32* %a, i32* %b) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %iv, 128
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !15
+
+exit:
+  ret void
+}
+
+!15 = !{!15, !16, !17}
+!16 = !{!"llvm.loop.vectorize.width", i32 16}
+!17 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+
+; CHECK-NO-SVE-LABEL: LV: Checking a loop in "test_no_sve"
+; CHECK-NO-SVE: LV: Ignoring VF=vscale x 4 because target does not support scalable vectors.
+; CHECK-NO-SVE: remark: <unknown>:0:0: Ignoring VF=vscale x 4 because target does not support scalable vectors.
+; CHECK-NO-SVE: LV: Selecting VF: 4.
+; CHECK-NO-SVE: <4 x i32>
+; CHECK-NO-SVE-NOT: <vscale x 4 x i32>
+define void @test_no_sve(i32* %a, i32* %b) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  store i32 %add, i32* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !18
+
+exit:
+  ret void
+}
+
+!18 = !{!18, !19, !20}
+!19 = !{!"llvm.loop.vectorize.width", i32 4}
+!20 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+
+; Test the LV falls back to fixed-width vectorization if scalable vectors are
+; supported but max vscale is undefined.
+;
+; CHECK-NO-MAX-VSCALE-LABEL: LV: Checking a loop in "test_no_max_vscale"
+; CHECK-NO-MAX-VSCALE: LV: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead.
+; CHECK-NO-MAX-VSCALE: The max safe VF is: 4.
+; CHECK-NO-MAX-VSCALE: LV: Selecting VF: 4.
+; CHECK-NO-MAX-VSCALE: <4 x i32>
+define void @test_no_max_vscale(i32* %a, i32* %b) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %iv, 4
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !21
+
+exit:
+  ret void
+}
+
+!21 = !{!21, !22, !23}
+!22 = !{!"llvm.loop.vectorize.width", i32 4}
+!23 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
--- a/llvm/test/Transforms/LoopVectorize/metadata-width.ll
+++ b/llvm/test/Transforms/LoopVectorize/metadata-width.ll
@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-target-supports-scalable-vectors=true -dce -instcombine -S | FileCheck %s

 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"

--- a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-limitations.ll
+++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-limitations.ll
@ -99,27 +99,3 @@ for.end.loopexit:                                 ; preds = %for.body
 for.end:                                          ; preds = %for.end.loopexit, %entry
  ret void
 }
-
-; Currently we cannot handle scalable vectorization factors.
-; CHECK: LV: Checking a loop in "f4"
-; CHECK: LEV: Epilogue vectorization for scalable vectors not yet supported.
-
-define void @f4(i8* %A) {
-entry:
-  br label %for.body
-
-for.body:
-  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %A, i64 %iv
-  store i8 1, i8* %arrayidx, align 1
-  %iv.next = add nuw nsw i64 %iv, 1
-  %exitcond = icmp ne i64 %iv.next, 1024
-  br i1 %exitcond, label %for.body, label %exit, !llvm.loop !0
-
-exit:
-  ret void
-}
-
-!0 = !{!0, !1, !2}
-!1 = !{!"llvm.loop.vectorize.width", i32 4}
-!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
--- a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-scalable.ll
+++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-scalable.ll
@ -0,0 +1,27 @@
+; REQUIRES: asserts
+; RUN: opt < %s  -passes='loop-vectorize' -force-vector-width=2 -force-target-supports-scalable-vectors=true -enable-epilogue-vectorization -epilogue-vectorization-force-VF=2 --debug-only=loop-vectorize -S 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-n32:64-v256:256:256-v512:512:512"
+
+; Currently we cannot handle scalable vectorization factors.
+; CHECK: LV: Checking a loop in "f1"
+; CHECK: LEV: Epilogue vectorization for scalable vectors not yet supported.
+
+define void @f1(i8* %A) {
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i8, i8* %A, i64 %iv
+  store i8 1, i8* %arrayidx, align 1
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp ne i64 %iv.next, 1024
+  br i1 %exitcond, label %for.body, label %exit, !llvm.loop !0
+
+exit:
+  ret void
+}
+
+!0 = !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
--- a/llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll
@ -1,5 +1,5 @@
-; RUN: opt -S -loop-vectorize -instcombine -force-vector-interleave=1 < %s | FileCheck %s --check-prefix=CHECKUF1
-; RUN: opt -S -loop-vectorize -instcombine -force-vector-interleave=2 < %s | FileCheck %s --check-prefix=CHECKUF2
+; RUN: opt -S -loop-vectorize -instcombine -force-vector-interleave=1 -force-vector-width=4 -force-target-supports-scalable-vectors=true < %s | FileCheck %s --check-prefix=CHECKUF1
+; RUN: opt -S -loop-vectorize -instcombine -force-vector-interleave=2 -force-vector-width=4 -force-target-supports-scalable-vectors=true < %s | FileCheck %s --check-prefix=CHECKUF2

 ; CHECKUF1: for.body.preheader:
 ; CHECKUF1-DAG: %wide.trip.count = zext i32 %N to i64
@ -96,6 +96,5 @@ for.body:                                         ; preds = %for.body.preheader,
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1
 }

-!1 = distinct !{!1, !2, !3}
-!2 = !{!"llvm.loop.vectorize.width", i32 4}
-!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
--- a/llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll
@ -0,0 +1,33 @@
+; REQUIRES: asserts
+; RUN: opt -loop-vectorize -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; CHECK: LV: Ignoring VF=vscale x 4 because target does not support scalable vectors.
+; CHECK: remark: <unknown>:0:0: Ignoring VF=vscale x 4 because target does not support scalable vectors.
+; CHECK: LV: The Widest register safe to use is: 32 bits.
+define void @test1(i32* %a, i32* %b) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %iv, 4
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0
+
+exit:
+  ret void
+}
+
+!0 = !{!0, !1, !2}
+!1 = !{!"llvm.loop.vectorize.width", i32 4}
+!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}