Recommit "[VP,Integer,#2] ExpandVectorPredication pass"
This reverts the revert 02c5ba8679
Fix:
Pass was registered as DUMMY_FUNCTION_PASS, causing the newpm-pass
functions to be doubly defined. Triggered in -DLLVM_ENABLE_MODULES=1
builds.
Original commit:
This patch implements expansion of llvm.vp.* intrinsics
(https://llvm.org/docs/LangRef.html#vector-predication-intrinsics).
VP expansion is required for targets that do not implement VP code
generation. Since expansion is controllable with TTI, targets can switch
on the VP intrinsics they do support in their backend, offering a smooth
transition strategy for VP code generation (VE, RISC-V V, ARM SVE,
AVX512, ...).
Reviewed By: rogfer01
Differential Revision: https://reviews.llvm.org/D78203
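
As a rough illustration of what the expansion produces (a minimal sketch; the value names %a, %b, %m, and %evl are hypothetical and not taken from the patch), a predicated integer division such as

  %r = call <8 x i32> @llvm.vp.sdiv.v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i1> %m, i32 %evl)

can be lowered, when the target reports no VP support, into plain IR that first folds %evl into the mask and then blends a safe divisor into the masked-off lanes before an unpredicated sdiv:

  ; Fold %evl into the mask: lanes with index < %evl stay active.
  %evl.ins   = insertelement <8 x i32> poison, i32 %evl, i32 0
  %evl.splat = shufflevector <8 x i32> %evl.ins, <8 x i32> poison, <8 x i32> zeroinitializer
  %evl.mask  = icmp ult <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, %evl.splat
  %new.mask  = and <8 x i1> %evl.mask, %m
  ; Blend a safe divisor (1) into masked-off lanes so the plain sdiv cannot trap.
  %safe.b    = select <8 x i1> %new.mask, <8 x i32> %b, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %r         = sdiv <8 x i32> %a, %safe.b

Speculatable operations (add, mul, shifts, bitwise ops) are simply rewritten to their unpredicated instruction; the expandvp.ll test added below checks exactly these patterns.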
This commit is contained in:
parent f800ac8309
commit 1db4dbba24
@@ -61,6 +61,7 @@ class TargetLibraryInfo;
class Type;
class User;
class Value;
class VPIntrinsic;
struct KnownBits;
template <typename T> class Optional;
@@ -1379,6 +1380,38 @@ public:
  /// Intrinsics") Use of %evl is discouraged when that is not the case.
  bool hasActiveVectorLength() const;

  struct VPLegalization {
    enum VPTransform {
      // keep the predicating parameter
      Legal = 0,
      // where legal, discard the predicate parameter
      Discard = 1,
      // transform into something else that is also predicating
      Convert = 2
    };

    // How to transform the EVL parameter.
    // Legal:   keep the EVL parameter as it is.
    // Discard: Ignore the EVL parameter where it is safe to do so.
    // Convert: Fold the EVL into the mask parameter.
    VPTransform EVLParamStrategy;

    // How to transform the operator.
    // Legal:   The target supports this operator.
    // Convert: Convert this to a non-VP operation.
    // The 'Discard' strategy is invalid.
    VPTransform OpStrategy;

    bool shouldDoNothing() const {
      return (EVLParamStrategy == Legal) && (OpStrategy == Legal);
    }
    VPLegalization(VPTransform EVLParamStrategy, VPTransform OpStrategy)
        : EVLParamStrategy(EVLParamStrategy), OpStrategy(OpStrategy) {}
  };

  /// \returns How the target needs this vector-predicated operation to be
  /// transformed.
  VPLegalization getVPLegalizationStrategy(const VPIntrinsic &PI) const;
  /// @}

  /// @}
@@ -1688,6 +1721,8 @@ public:
  virtual bool supportsScalableVectors() const = 0;
  virtual bool hasActiveVectorLength() const = 0;
  virtual InstructionCost getInstructionLatency(const Instruction *I) = 0;
  virtual VPLegalization
  getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0;
};

template <typename T>
@@ -2259,6 +2294,11 @@ public:
  InstructionCost getInstructionLatency(const Instruction *I) override {
    return Impl.getInstructionLatency(I);
  }

  VPLegalization
  getVPLegalizationStrategy(const VPIntrinsic &PI) const override {
    return Impl.getVPLegalizationStrategy(PI);
  }
};

template <typename T>
@@ -750,6 +750,13 @@ public:

  bool hasActiveVectorLength() const { return false; }

  TargetTransformInfo::VPLegalization
  getVPLegalizationStrategy(const VPIntrinsic &PI) const {
    return TargetTransformInfo::VPLegalization(
        /* EVLParamStrategy */ TargetTransformInfo::VPLegalization::Discard,
        /* OperatorStrategy */ TargetTransformInfo::VPLegalization::Convert);
  }

protected:
  // Obtain the minimum required size to hold the value (without the sign)
  // In case of a vector it returns the min required size for one element.
@@ -0,0 +1,23 @@
//===-- ExpandVectorPredication.h - Expand vector predication ---*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_CODEGEN_EXPANDVECTORPREDICATION_H
#define LLVM_CODEGEN_EXPANDVECTORPREDICATION_H

#include "llvm/IR/PassManager.h"

namespace llvm {

class ExpandVectorPredicationPass
    : public PassInfoMixin<ExpandVectorPredicationPass> {
public:
  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
} // end namespace llvm

#endif // LLVM_CODEGEN_EXPANDVECTORPREDICATION_H
@@ -44,6 +44,7 @@ FUNCTION_PASS("partially-inline-libcalls", PartiallyInlineLibCallsPass, ())
FUNCTION_PASS("ee-instrument", EntryExitInstrumenterPass, (false))
FUNCTION_PASS("post-inline-ee-instrument", EntryExitInstrumenterPass, (true))
FUNCTION_PASS("expand-reductions", ExpandReductionsPass, ())
FUNCTION_PASS("expandvp", ExpandVectorPredicationPass, ())
FUNCTION_PASS("lowerinvoke", LowerInvokePass, ())
FUNCTION_PASS("scalarize-masked-mem-intrin", ScalarizeMaskedMemIntrinPass, ())
FUNCTION_PASS("verify", VerifierPass, ())
@@ -453,6 +453,11 @@ namespace llvm {
  // the corresponding function in a vector library (e.g., SVML, libmvec).
  FunctionPass *createReplaceWithVeclibLegacyPass();

  /// This pass expands the vector predication intrinsics into unpredicated
  /// instructions with selects or just the explicit vector length into the
  /// predicate mask.
  FunctionPass *createExpandVectorPredicationPass();

  // This pass expands memcmp() to load/stores.
  FunctionPass *createExpandMemCmpPass();
@@ -395,9 +395,11 @@ public:

  /// \return the mask parameter or nullptr.
  Value *getMaskParam() const;
  void setMaskParam(Value *);

  /// \return the vector length parameter or nullptr.
  Value *getVectorLengthParam() const;
  void setVectorLengthParam(Value *);

  /// \return whether the vector length param can be ignored.
  bool canIgnoreVectorLengthParam() const;
@@ -154,6 +154,7 @@ void initializeEntryExitInstrumenterPass(PassRegistry&);
void initializeExpandMemCmpPassPass(PassRegistry&);
void initializeExpandPostRAPass(PassRegistry&);
void initializeExpandReductionsPass(PassRegistry&);
void initializeExpandVectorPredicationPass(PassRegistry &);
void initializeMakeGuardsExplicitLegacyPassPass(PassRegistry&);
void initializeExternalAAWrapperPassPass(PassRegistry&);
void initializeFEntryInserterPass(PassRegistry&);
@@ -197,6 +197,7 @@ namespace {
      (void) llvm::createMergeFunctionsPass();
      (void) llvm::createMergeICmpsLegacyPass();
      (void) llvm::createExpandMemCmpPass();
      (void) llvm::createExpandVectorPredicationPass();
      std::string buf;
      llvm::raw_string_ostream os(buf);
      (void) llvm::createPrintModulePass(os);
@@ -1026,6 +1026,11 @@ bool TargetTransformInfo::preferPredicatedReductionSelect(
  return TTIImpl->preferPredicatedReductionSelect(Opcode, Ty, Flags);
}

TargetTransformInfo::VPLegalization
TargetTransformInfo::getVPLegalizationStrategy(const VPIntrinsic &VPI) const {
  return TTIImpl->getVPLegalizationStrategy(VPI);
}

bool TargetTransformInfo::shouldExpandReduction(const IntrinsicInst *II) const {
  return TTIImpl->shouldExpandReduction(II);
}
@@ -29,6 +29,7 @@ add_llvm_component_library(LLVMCodeGen
  ExpandMemCmp.cpp
  ExpandPostRAPseudos.cpp
  ExpandReductions.cpp
  ExpandVectorPredication.cpp
  FaultMaps.cpp
  FEntryInserter.cpp
  FinalizeISel.cpp
@@ -0,0 +1,469 @@
//===----- CodeGen/ExpandVectorPredication.cpp - Expand VP intrinsics -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass implements IR expansion for vector predication intrinsics, allowing
// targets to enable vector predication until just before codegen.
//
//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/ExpandVectorPredication.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"

using namespace llvm;

using VPLegalization = TargetTransformInfo::VPLegalization;
using VPTransform = TargetTransformInfo::VPLegalization::VPTransform;

// Keep this in sync with TargetTransformInfo::VPLegalization.
#define VPINTERNAL_VPLEGAL_CASES \
  VPINTERNAL_CASE(Legal) \
  VPINTERNAL_CASE(Discard) \
  VPINTERNAL_CASE(Convert)

#define VPINTERNAL_CASE(X) "|" #X

// Override options.
static cl::opt<std::string> EVLTransformOverride(
    "expandvp-override-evl-transform", cl::init(""), cl::Hidden,
    cl::desc("Options: <empty>" VPINTERNAL_VPLEGAL_CASES
             ". If non-empty, ignore "
             "TargetTransformInfo and "
             "always use this transformation for the %evl parameter (Used in "
             "testing)."));

static cl::opt<std::string> MaskTransformOverride(
    "expandvp-override-mask-transform", cl::init(""), cl::Hidden,
    cl::desc("Options: <empty>" VPINTERNAL_VPLEGAL_CASES
             ". If non-empty, Ignore "
             "TargetTransformInfo and "
             "always use this transformation for the %mask parameter (Used in "
             "testing)."));

#undef VPINTERNAL_CASE
#define VPINTERNAL_CASE(X) .Case(#X, VPLegalization::X)

static VPTransform parseOverrideOption(const std::string &TextOpt) {
  return StringSwitch<VPTransform>(TextOpt) VPINTERNAL_VPLEGAL_CASES;
}

#undef VPINTERNAL_VPLEGAL_CASES

// Whether any override options are set.
static bool anyExpandVPOverridesSet() {
  return !EVLTransformOverride.empty() || !MaskTransformOverride.empty();
}

#define DEBUG_TYPE "expandvp"

STATISTIC(NumFoldedVL, "Number of folded vector length params");
STATISTIC(NumLoweredVPOps, "Number of folded vector predication operations");

///// Helpers {

/// \returns Whether the vector mask \p MaskVal has all lane bits set.
static bool isAllTrueMask(Value *MaskVal) {
  auto *ConstVec = dyn_cast<ConstantVector>(MaskVal);
  return ConstVec && ConstVec->isAllOnesValue();
}

/// \returns A non-excepting divisor constant for this type.
static Constant *getSafeDivisor(Type *DivTy) {
  assert(DivTy->isIntOrIntVectorTy() && "Unsupported divisor type");
  return ConstantInt::get(DivTy, 1u, false);
}

/// Transfer operation properties from \p OldVPI to \p NewVal.
static void transferDecorations(Value &NewVal, VPIntrinsic &VPI) {
  auto *NewInst = dyn_cast<Instruction>(&NewVal);
  if (!NewInst || !isa<FPMathOperator>(NewVal))
    return;

  auto *OldFMOp = dyn_cast<FPMathOperator>(&VPI);
  if (!OldFMOp)
    return;

  NewInst->setFastMathFlags(OldFMOp->getFastMathFlags());
}

/// Transfer all properties from \p OldOp to \p NewOp and replace all uses.
/// OldVP gets erased.
static void replaceOperation(Value &NewOp, VPIntrinsic &OldOp) {
  transferDecorations(NewOp, OldOp);
  OldOp.replaceAllUsesWith(&NewOp);
  OldOp.eraseFromParent();
}

//// } Helpers

namespace {

// Expansion pass state at function scope.
struct CachingVPExpander {
  Function &F;
  const TargetTransformInfo &TTI;

  /// \returns A (fixed length) vector with ascending integer indices
  /// (<0, 1, ..., NumElems-1>).
  /// \p Builder
  ///    Used for instruction creation.
  /// \p LaneTy
  ///    Integer element type of the result vector.
  /// \p NumElems
  ///    Number of vector elements.
  Value *createStepVector(IRBuilder<> &Builder, Type *LaneTy,
                          unsigned NumElems);

  /// \returns A bitmask that is true where the lane position is less-than \p
  /// EVLParam
  ///
  /// \p Builder
  ///    Used for instruction creation.
  /// \p VLParam
  ///    The explicit vector length parameter to test against the lane
  ///    positions.
  /// \p ElemCount
  ///    Static (potentially scalable) number of vector elements.
  Value *convertEVLToMask(IRBuilder<> &Builder, Value *EVLParam,
                          ElementCount ElemCount);

  Value *foldEVLIntoMask(VPIntrinsic &VPI);

  /// "Remove" the %evl parameter of \p PI by setting it to the static vector
  /// length of the operation.
  void discardEVLParameter(VPIntrinsic &PI);

  /// \brief Lower this VP binary operator to a unpredicated binary operator.
  Value *expandPredicationInBinaryOperator(IRBuilder<> &Builder,
                                           VPIntrinsic &PI);

  /// \brief Query TTI and expand the vector predication in \p P accordingly.
  Value *expandPredication(VPIntrinsic &PI);

  /// \brief Determine how and whether the VPIntrinsic \p VPI shall be
  /// expanded. This overrides TTI with the cl::opts listed at the top of this
  /// file.
  VPLegalization getVPLegalizationStrategy(const VPIntrinsic &VPI) const;
  bool UsingTTIOverrides;

public:
  CachingVPExpander(Function &F, const TargetTransformInfo &TTI)
      : F(F), TTI(TTI), UsingTTIOverrides(anyExpandVPOverridesSet()) {}

  bool expandVectorPredication();
};

//// CachingVPExpander {

Value *CachingVPExpander::createStepVector(IRBuilder<> &Builder, Type *LaneTy,
                                           unsigned NumElems) {
  // TODO add caching
  SmallVector<Constant *, 16> ConstElems;

  for (unsigned Idx = 0; Idx < NumElems; ++Idx)
    ConstElems.push_back(ConstantInt::get(LaneTy, Idx, false));

  return ConstantVector::get(ConstElems);
}

Value *CachingVPExpander::convertEVLToMask(IRBuilder<> &Builder,
                                           Value *EVLParam,
                                           ElementCount ElemCount) {
  // TODO add caching
  // Scalable vector %evl conversion.
  if (ElemCount.isScalable()) {
    auto *M = Builder.GetInsertBlock()->getModule();
    Type *BoolVecTy = VectorType::get(Builder.getInt1Ty(), ElemCount);
    Function *ActiveMaskFunc = Intrinsic::getDeclaration(
        M, Intrinsic::get_active_lane_mask, {BoolVecTy, EVLParam->getType()});
    // `get_active_lane_mask` performs an implicit less-than comparison.
    Value *ConstZero = Builder.getInt32(0);
    return Builder.CreateCall(ActiveMaskFunc, {ConstZero, EVLParam});
  }

  // Fixed vector %evl conversion.
  Type *LaneTy = EVLParam->getType();
  unsigned NumElems = ElemCount.getFixedValue();
  Value *VLSplat = Builder.CreateVectorSplat(NumElems, EVLParam);
  Value *IdxVec = createStepVector(Builder, LaneTy, NumElems);
  return Builder.CreateICmp(CmpInst::ICMP_ULT, IdxVec, VLSplat);
}

Value *
CachingVPExpander::expandPredicationInBinaryOperator(IRBuilder<> &Builder,
                                                     VPIntrinsic &VPI) {
  assert((isSafeToSpeculativelyExecute(&VPI) ||
          VPI.canIgnoreVectorLengthParam()) &&
         "Implicitly dropping %evl in non-speculatable operator!");

  auto OC = static_cast<Instruction::BinaryOps>(VPI.getFunctionalOpcode());
  assert(Instruction::isBinaryOp(OC));

  Value *Op0 = VPI.getOperand(0);
  Value *Op1 = VPI.getOperand(1);
  Value *Mask = VPI.getMaskParam();

  // Blend in safe operands.
  if (Mask && !isAllTrueMask(Mask)) {
    switch (OC) {
    default:
      // Can safely ignore the predicate.
      break;

    // Division operators need a safe divisor on masked-off lanes (1).
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::URem:
    case Instruction::SRem:
      // 2nd operand must not be zero.
      Value *SafeDivisor = getSafeDivisor(VPI.getType());
      Op1 = Builder.CreateSelect(Mask, Op1, SafeDivisor);
    }
  }

  Value *NewBinOp = Builder.CreateBinOp(OC, Op0, Op1, VPI.getName());

  replaceOperation(*NewBinOp, VPI);
  return NewBinOp;
}

void CachingVPExpander::discardEVLParameter(VPIntrinsic &VPI) {
  LLVM_DEBUG(dbgs() << "Discard EVL parameter in " << VPI << "\n");

  if (VPI.canIgnoreVectorLengthParam())
    return;

  Value *EVLParam = VPI.getVectorLengthParam();
  if (!EVLParam)
    return;

  ElementCount StaticElemCount = VPI.getStaticVectorLength();
  Value *MaxEVL = nullptr;
  Type *Int32Ty = Type::getInt32Ty(VPI.getContext());
  if (StaticElemCount.isScalable()) {
    // TODO add caching
    auto *M = VPI.getModule();
    Function *VScaleFunc =
        Intrinsic::getDeclaration(M, Intrinsic::vscale, Int32Ty);
    IRBuilder<> Builder(VPI.getParent(), VPI.getIterator());
    Value *FactorConst = Builder.getInt32(StaticElemCount.getKnownMinValue());
    Value *VScale = Builder.CreateCall(VScaleFunc, {}, "vscale");
    MaxEVL = Builder.CreateMul(VScale, FactorConst, "scalable_size",
                               /*NUW*/ true, /*NSW*/ false);
  } else {
    MaxEVL = ConstantInt::get(Int32Ty, StaticElemCount.getFixedValue(), false);
  }
  VPI.setVectorLengthParam(MaxEVL);
}

Value *CachingVPExpander::foldEVLIntoMask(VPIntrinsic &VPI) {
  LLVM_DEBUG(dbgs() << "Folding vlen for " << VPI << '\n');

  IRBuilder<> Builder(&VPI);

  // Ineffective %evl parameter and so nothing to do here.
  if (VPI.canIgnoreVectorLengthParam())
    return &VPI;

  // Only VP intrinsics can have an %evl parameter.
  Value *OldMaskParam = VPI.getMaskParam();
  Value *OldEVLParam = VPI.getVectorLengthParam();
  assert(OldMaskParam && "no mask param to fold the vl param into");
  assert(OldEVLParam && "no EVL param to fold away");

  LLVM_DEBUG(dbgs() << "OLD evl: " << *OldEVLParam << '\n');
  LLVM_DEBUG(dbgs() << "OLD mask: " << *OldMaskParam << '\n');

  // Convert the %evl predication into vector mask predication.
  ElementCount ElemCount = VPI.getStaticVectorLength();
  Value *VLMask = convertEVLToMask(Builder, OldEVLParam, ElemCount);
  Value *NewMaskParam = Builder.CreateAnd(VLMask, OldMaskParam);
  VPI.setMaskParam(NewMaskParam);

  // Drop the %evl parameter.
  discardEVLParameter(VPI);
  assert(VPI.canIgnoreVectorLengthParam() &&
         "transformation did not render the evl param ineffective!");

  // Reassess the modified instruction.
  return &VPI;
}

Value *CachingVPExpander::expandPredication(VPIntrinsic &VPI) {
  LLVM_DEBUG(dbgs() << "Lowering to unpredicated op: " << VPI << '\n');

  IRBuilder<> Builder(&VPI);

  // Try lowering to a LLVM instruction first.
  unsigned OC = VPI.getFunctionalOpcode();

  if (Instruction::isBinaryOp(OC))
    return expandPredicationInBinaryOperator(Builder, VPI);

  return &VPI;
}

//// } CachingVPExpander

struct TransformJob {
  VPIntrinsic *PI;
  TargetTransformInfo::VPLegalization Strategy;
  TransformJob(VPIntrinsic *PI, TargetTransformInfo::VPLegalization InitStrat)
      : PI(PI), Strategy(InitStrat) {}

  bool isDone() const { return Strategy.shouldDoNothing(); }
};

void sanitizeStrategy(Instruction &I, VPLegalization &LegalizeStrat) {
  // Speculatable instructions do not strictly need predication.
  if (isSafeToSpeculativelyExecute(&I)) {
    // Converting a speculatable VP intrinsic means dropping %mask and %evl.
    // No need to expand %evl into the %mask only to ignore that code.
    if (LegalizeStrat.OpStrategy == VPLegalization::Convert)
      LegalizeStrat.EVLParamStrategy = VPLegalization::Discard;
    return;
  }

  // We have to preserve the predicating effect of %evl for this
  // non-speculatable VP intrinsic.
  // 1) Never discard %evl.
  // 2) If this VP intrinsic will be expanded to non-VP code, make sure that
  //    %evl gets folded into %mask.
  if ((LegalizeStrat.EVLParamStrategy == VPLegalization::Discard) ||
      (LegalizeStrat.OpStrategy == VPLegalization::Convert)) {
    LegalizeStrat.EVLParamStrategy = VPLegalization::Convert;
  }
}

VPLegalization
CachingVPExpander::getVPLegalizationStrategy(const VPIntrinsic &VPI) const {
  auto VPStrat = TTI.getVPLegalizationStrategy(VPI);
  if (LLVM_LIKELY(!UsingTTIOverrides)) {
    // No overrides - we are in production.
    return VPStrat;
  }

  // Overrides set - we are in testing, the following does not need to be
  // efficient.
  VPStrat.EVLParamStrategy = parseOverrideOption(EVLTransformOverride);
  VPStrat.OpStrategy = parseOverrideOption(MaskTransformOverride);
  return VPStrat;
}

/// \brief Expand llvm.vp.* intrinsics as requested by \p TTI.
bool CachingVPExpander::expandVectorPredication() {
  SmallVector<TransformJob, 16> Worklist;

  // Collect all VPIntrinsics that need expansion and determine their expansion
  // strategy.
  for (auto &I : instructions(F)) {
    auto *VPI = dyn_cast<VPIntrinsic>(&I);
    if (!VPI)
      continue;
    auto VPStrat = getVPLegalizationStrategy(*VPI);
    sanitizeStrategy(I, VPStrat);
    if (!VPStrat.shouldDoNothing())
      Worklist.emplace_back(VPI, VPStrat);
  }
  if (Worklist.empty())
    return false;

  // Transform all VPIntrinsics on the worklist.
  LLVM_DEBUG(dbgs() << "\n:::: Transforming " << Worklist.size()
                    << " instructions ::::\n");
  for (TransformJob Job : Worklist) {
    // Transform the EVL parameter.
    switch (Job.Strategy.EVLParamStrategy) {
    case VPLegalization::Legal:
      break;
    case VPLegalization::Discard:
      discardEVLParameter(*Job.PI);
      break;
    case VPLegalization::Convert:
      if (foldEVLIntoMask(*Job.PI))
        ++NumFoldedVL;
      break;
    }
    Job.Strategy.EVLParamStrategy = VPLegalization::Legal;

    // Replace with a non-predicated operation.
    switch (Job.Strategy.OpStrategy) {
    case VPLegalization::Legal:
      break;
    case VPLegalization::Discard:
      llvm_unreachable("Invalid strategy for operators.");
    case VPLegalization::Convert:
      expandPredication(*Job.PI);
      ++NumLoweredVPOps;
      break;
    }
    Job.Strategy.OpStrategy = VPLegalization::Legal;

    assert(Job.isDone() && "incomplete transformation");
  }

  return true;
}

class ExpandVectorPredication : public FunctionPass {
public:
  static char ID;
  ExpandVectorPredication() : FunctionPass(ID) {
    initializeExpandVectorPredicationPass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override {
    const auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
    CachingVPExpander VPExpander(F, *TTI);
    return VPExpander.expandVectorPredication();
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.setPreservesCFG();
  }
};
} // namespace

char ExpandVectorPredication::ID;
INITIALIZE_PASS_BEGIN(ExpandVectorPredication, "expandvp",
                      "Expand vector predication intrinsics", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_END(ExpandVectorPredication, "expandvp",
                    "Expand vector predication intrinsics", false, false)

FunctionPass *llvm::createExpandVectorPredicationPass() {
  return new ExpandVectorPredication();
}

PreservedAnalyses
ExpandVectorPredicationPass::run(Function &F, FunctionAnalysisManager &AM) {
  const auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  CachingVPExpander VPExpander(F, TTI);
  if (!VPExpander.expandVectorPredication())
    return PreservedAnalyses::all();
  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  return PA;
}
@@ -864,6 +864,11 @@ void TargetPassConfig::addIRPasses() {
  if (getOptLevel() != CodeGenOpt::None && !DisablePartialLibcallInlining)
    addPass(createPartiallyInlineLibCallsPass());

  // Expand vector predication intrinsics into standard IR instructions.
  // This pass has to run before ScalarizeMaskedMemIntrin and ExpandReduction
  // passes since it emits those kinds of intrinsics.
  addPass(createExpandVectorPredicationPass());

  // Add scalarization of target's unsupported masked memory intrinsics pass.
  // the unsupported intrinsic will be replaced with a chain of basic blocks,
  // that stores/loads element one-by-one if the appropriate mask bit is set.
@@ -262,6 +262,11 @@ Value *VPIntrinsic::getMaskParam() const {
  return nullptr;
}

void VPIntrinsic::setMaskParam(Value *NewMask) {
  auto MaskPos = GetMaskParamPos(getIntrinsicID());
  setArgOperand(*MaskPos, NewMask);
}

Value *VPIntrinsic::getVectorLengthParam() const {
  auto vlenPos = GetVectorLengthParamPos(getIntrinsicID());
  if (vlenPos)
@@ -269,6 +274,11 @@ Value *VPIntrinsic::getVectorLengthParam() const {
  return nullptr;
}

void VPIntrinsic::setVectorLengthParam(Value *NewEVL) {
  auto EVLPos = GetVectorLengthParamPos(getIntrinsicID());
  setArgOperand(*EVLPos, NewEVL);
}

Optional<int> VPIntrinsic::GetMaskParamPos(Intrinsic::ID IntrinsicID) {
  switch (IntrinsicID) {
  default:
@@ -21,6 +21,7 @@
; CHECK-NEXT: Shadow Stack GC Lowering
; CHECK-NEXT: Lower constant intrinsics
; CHECK-NEXT: Remove unreachable blocks from the CFG
; CHECK-NEXT: Expand vector predication intrinsics
; CHECK-NEXT: Scalarize Masked Memory Intrinsics
; CHECK-NEXT: Expand reduction intrinsics
; CHECK-NEXT: AArch64 Stack Tagging
@@ -56,6 +56,7 @@
; CHECK-NEXT: Constant Hoisting
; CHECK-NEXT: Replace intrinsics with calls to vector library
; CHECK-NEXT: Partially inline calls to library functions
; CHECK-NEXT: Expand vector predication intrinsics
; CHECK-NEXT: Scalarize Masked Memory Intrinsics
; CHECK-NEXT: Expand reduction intrinsics
; CHECK-NEXT: Stack Safety Analysis
@@ -37,6 +37,7 @@
; CHECK-NEXT: Constant Hoisting
; CHECK-NEXT: Replace intrinsics with calls to vector library
; CHECK-NEXT: Partially inline calls to library functions
; CHECK-NEXT: Expand vector predication intrinsics
; CHECK-NEXT: Scalarize Masked Memory Intrinsics
; CHECK-NEXT: Expand reduction intrinsics
; CHECK-NEXT: Natural Loop Information
@@ -0,0 +1,245 @@
; Partial expansion cases (still VP with parameter expansions).
; RUN: opt --expandvp --expandvp-override-evl-transform=Legal --expandvp-override-mask-transform=Legal -S < %s | FileCheck %s --check-prefix=LEGAL_LEGAL
; RUN: opt --expandvp --expandvp-override-evl-transform=Discard --expandvp-override-mask-transform=Legal -S < %s | FileCheck %s --check-prefix=DISCARD_LEGAL
; RUN: opt --expandvp --expandvp-override-evl-transform=Convert --expandvp-override-mask-transform=Legal -S < %s | FileCheck %s --check-prefix=CONVERT_LEGAL
; Full expansion cases (all expanded to non-VP).
; RUN: opt --expandvp --expandvp-override-evl-transform=Discard --expandvp-override-mask-transform=Convert -S < %s | FileCheck %s --check-prefix=ALL-CONVERT
; RUN: opt --expandvp -S < %s | FileCheck %s --check-prefix=ALL-CONVERT
; RUN: opt --expandvp --expandvp-override-evl-transform=Legal --expandvp-override-mask-transform=Convert -S < %s | FileCheck %s --check-prefix=ALL-CONVERT
; RUN: opt --expandvp --expandvp-override-evl-transform=Convert --expandvp-override-mask-transform=Convert -S < %s | FileCheck %s --check-prefix=ALL-CONVERT


; Fixed-width vectors
; Integer arith
declare <8 x i32> @llvm.vp.add.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
declare <8 x i32> @llvm.vp.sub.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
declare <8 x i32> @llvm.vp.mul.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
declare <8 x i32> @llvm.vp.sdiv.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
declare <8 x i32> @llvm.vp.srem.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
declare <8 x i32> @llvm.vp.udiv.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
declare <8 x i32> @llvm.vp.urem.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
; Bit arith
declare <8 x i32> @llvm.vp.and.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
declare <8 x i32> @llvm.vp.xor.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
declare <8 x i32> @llvm.vp.or.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
declare <8 x i32> @llvm.vp.ashr.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
declare <8 x i32> @llvm.vp.lshr.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
declare <8 x i32> @llvm.vp.shl.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)

; Fixed vector test function.
define void @test_vp_int_v8(<8 x i32> %i0, <8 x i32> %i1, <8 x i32> %i2, <8 x i32> %f3, <8 x i1> %m, i32 %n) {
  %r0 = call <8 x i32> @llvm.vp.add.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n)
  %r1 = call <8 x i32> @llvm.vp.sub.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n)
  %r2 = call <8 x i32> @llvm.vp.mul.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n)
  %r3 = call <8 x i32> @llvm.vp.sdiv.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n)
  %r4 = call <8 x i32> @llvm.vp.srem.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n)
  %r5 = call <8 x i32> @llvm.vp.udiv.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n)
  %r6 = call <8 x i32> @llvm.vp.urem.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n)
  %r7 = call <8 x i32> @llvm.vp.and.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n)
  %r8 = call <8 x i32> @llvm.vp.or.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n)
  %r9 = call <8 x i32> @llvm.vp.xor.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n)
  %rA = call <8 x i32> @llvm.vp.ashr.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n)
  %rB = call <8 x i32> @llvm.vp.lshr.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n)
  %rC = call <8 x i32> @llvm.vp.shl.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n)
  ret void
}

; Scalable-width vectors
; Integer arith
declare <vscale x 4 x i32> @llvm.vp.add.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32)
declare <vscale x 4 x i32> @llvm.vp.sub.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32)
declare <vscale x 4 x i32> @llvm.vp.mul.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32)
declare <vscale x 4 x i32> @llvm.vp.sdiv.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32)
declare <vscale x 4 x i32> @llvm.vp.srem.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32)
declare <vscale x 4 x i32> @llvm.vp.udiv.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32)
declare <vscale x 4 x i32> @llvm.vp.urem.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32)
; Bit arith
declare <vscale x 4 x i32> @llvm.vp.and.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32)
declare <vscale x 4 x i32> @llvm.vp.xor.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32)
declare <vscale x 4 x i32> @llvm.vp.or.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32)
declare <vscale x 4 x i32> @llvm.vp.ashr.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32)
declare <vscale x 4 x i32> @llvm.vp.lshr.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32)
declare <vscale x 4 x i32> @llvm.vp.shl.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32)

; Scalable vector test function.
define void @test_vp_int_vscale(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i32> %i2, <vscale x 4 x i32> %f3, <vscale x 4 x i1> %m, i32 %n) {
  %r0 = call <vscale x 4 x i32> @llvm.vp.add.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n)
  %r1 = call <vscale x 4 x i32> @llvm.vp.sub.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n)
  %r2 = call <vscale x 4 x i32> @llvm.vp.mul.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n)
  %r3 = call <vscale x 4 x i32> @llvm.vp.sdiv.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n)
  %r4 = call <vscale x 4 x i32> @llvm.vp.srem.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n)
  %r5 = call <vscale x 4 x i32> @llvm.vp.udiv.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n)
  %r6 = call <vscale x 4 x i32> @llvm.vp.urem.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n)
  %r7 = call <vscale x 4 x i32> @llvm.vp.and.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n)
  %r8 = call <vscale x 4 x i32> @llvm.vp.or.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n)
  %r9 = call <vscale x 4 x i32> @llvm.vp.xor.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n)
  %rA = call <vscale x 4 x i32> @llvm.vp.ashr.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n)
  %rB = call <vscale x 4 x i32> @llvm.vp.lshr.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n)
  %rC = call <vscale x 4 x i32> @llvm.vp.shl.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n)
  ret void
}

; All VP intrinsics have to be lowered into non-VP ops
; Convert %evl into %mask for non-speculatable VP intrinsics and emit the
; instruction+select idiom with a non-VP SIMD instruction.
;
; ALL-CONVERT-NOT: {{call.* @llvm.vp.add}}
; ALL-CONVERT-NOT: {{call.* @llvm.vp.sub}}
; ALL-CONVERT-NOT: {{call.* @llvm.vp.mul}}
; ALL-CONVERT-NOT: {{call.* @llvm.vp.sdiv}}
; ALL-CONVERT-NOT: {{call.* @llvm.vp.srem}}
; ALL-CONVERT-NOT: {{call.* @llvm.vp.udiv}}
; ALL-CONVERT-NOT: {{call.* @llvm.vp.urem}}
; ALL-CONVERT-NOT: {{call.* @llvm.vp.and}}
; ALL-CONVERT-NOT: {{call.* @llvm.vp.or}}
; ALL-CONVERT-NOT: {{call.* @llvm.vp.xor}}
; ALL-CONVERT-NOT: {{call.* @llvm.vp.ashr}}
; ALL-CONVERT-NOT: {{call.* @llvm.vp.lshr}}
; ALL-CONVERT-NOT: {{call.* @llvm.vp.shl}}
;
; ALL-CONVERT: define void @test_vp_int_v8(<8 x i32> %i0, <8 x i32> %i1, <8 x i32> %i2, <8 x i32> %f3, <8 x i1> %m, i32 %n) {
; ALL-CONVERT-NEXT: %{{.*}} = add <8 x i32> %i0, %i1
; ALL-CONVERT-NEXT: %{{.*}} = sub <8 x i32> %i0, %i1
; ALL-CONVERT-NEXT: %{{.*}} = mul <8 x i32> %i0, %i1
; ALL-CONVERT-NEXT: [[NINS:%.+]] = insertelement <8 x i32> poison, i32 %n, i32 0
; ALL-CONVERT-NEXT: [[NSPLAT:%.+]] = shufflevector <8 x i32> [[NINS]], <8 x i32> poison, <8 x i32> zeroinitializer
; ALL-CONVERT-NEXT: [[EVLM:%.+]] = icmp ult <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[NSPLAT]]
; ALL-CONVERT-NEXT: [[NEWM:%.+]] = and <8 x i1> [[EVLM]], %m
; ALL-CONVERT-NEXT: [[SELONE:%.+]] = select <8 x i1> [[NEWM]], <8 x i32> %i1, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; ALL-CONVERT-NEXT: %{{.+}} = sdiv <8 x i32> %i0, [[SELONE]]
; ALL-CONVERT-NOT: %{{.+}} = srem <8 x i32> %i0, %i1
; ALL-CONVERT: %{{.+}} = srem <8 x i32> %i0, %{{.+}}
; ALL-CONVERT-NOT: %{{.+}} = udiv <8 x i32> %i0, %i1
; ALL-CONVERT: %{{.+}} = udiv <8 x i32> %i0, %{{.+}}
; ALL-CONVERT-NOT: %{{.+}} = urem <8 x i32> %i0, %i1
; ALL-CONVERT: %{{.+}} = urem <8 x i32> %i0, %{{.+}}
; ALL-CONVERT-NEXT: %{{.+}} = and <8 x i32> %i0, %i1
; ALL-CONVERT-NEXT: %{{.+}} = or <8 x i32> %i0, %i1
; ALL-CONVERT-NEXT: %{{.+}} = xor <8 x i32> %i0, %i1
; ALL-CONVERT-NEXT: %{{.+}} = ashr <8 x i32> %i0, %i1
; ALL-CONVERT-NEXT: %{{.+}} = lshr <8 x i32> %i0, %i1
; ALL-CONVERT-NEXT: %{{.+}} = shl <8 x i32> %i0, %i1
; ALL-CONVERT: ret void


; All legal - don't transform anything.

; LEGAL_LEGAL: define void @test_vp_int_v8(<8 x i32> %i0, <8 x i32> %i1, <8 x i32> %i2, <8 x i32> %f3, <8 x i1> %m, i32 %n) {
; LEGAL_LEGAL-NEXT: %r0 = call <8 x i32> @llvm.vp.add.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n)
; LEGAL_LEGAL-NEXT: %r1 = call <8 x i32> @llvm.vp.sub.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n)
; LEGAL_LEGAL-NEXT: %r2 = call <8 x i32> @llvm.vp.mul.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n)
; LEGAL_LEGAL-NEXT: %r3 = call <8 x i32> @llvm.vp.sdiv.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n)
; LEGAL_LEGAL-NEXT: %r4 = call <8 x i32> @llvm.vp.srem.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n)
; LEGAL_LEGAL-NEXT: %r5 = call <8 x i32> @llvm.vp.udiv.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n)
; LEGAL_LEGAL-NEXT: %r6 = call <8 x i32> @llvm.vp.urem.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n)
; LEGAL_LEGAL-NEXT: %r7 = call <8 x i32> @llvm.vp.and.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n)
; LEGAL_LEGAL-NEXT: %r8 = call <8 x i32> @llvm.vp.or.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n)
; LEGAL_LEGAL-NEXT: %r9 = call <8 x i32> @llvm.vp.xor.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n)
; LEGAL_LEGAL-NEXT: %rA = call <8 x i32> @llvm.vp.ashr.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n)
; LEGAL_LEGAL-NEXT: %rB = call <8 x i32> @llvm.vp.lshr.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n)
; LEGAL_LEGAL-NEXT: %rC = call <8 x i32> @llvm.vp.shl.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n)
; LEGAL_LEGAL-NEXT: ret void

; LEGAL_LEGAL:define void @test_vp_int_vscale(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i32> %i2, <vscale x 4 x i32> %f3, <vscale x 4 x i1> %m, i32 %n) {
; LEGAL_LEGAL-NEXT: %r0 = call <vscale x 4 x i32> @llvm.vp.add.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n)
; LEGAL_LEGAL-NEXT: %r1 = call <vscale x 4 x i32> @llvm.vp.sub.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n)
; LEGAL_LEGAL-NEXT: %r2 = call <vscale x 4 x i32> @llvm.vp.mul.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n)
; LEGAL_LEGAL-NEXT: %r3 = call <vscale x 4 x i32> @llvm.vp.sdiv.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n)
; LEGAL_LEGAL-NEXT: %r4 = call <vscale x 4 x i32> @llvm.vp.srem.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n)
; LEGAL_LEGAL-NEXT: %r5 = call <vscale x 4 x i32> @llvm.vp.udiv.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n)
; LEGAL_LEGAL-NEXT: %r6 = call <vscale x 4 x i32> @llvm.vp.urem.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n)
; LEGAL_LEGAL-NEXT: %r7 = call <vscale x 4 x i32> @llvm.vp.and.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n)
; LEGAL_LEGAL-NEXT: %r8 = call <vscale x 4 x i32> @llvm.vp.or.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n)
; LEGAL_LEGAL-NEXT: %r9 = call <vscale x 4 x i32> @llvm.vp.xor.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n)
; LEGAL_LEGAL-NEXT: %rA = call <vscale x 4 x i32> @llvm.vp.ashr.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n)
; LEGAL_LEGAL-NEXT: %rB = call <vscale x 4 x i32> @llvm.vp.lshr.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n)
; LEGAL_LEGAL-NEXT: %rC = call <vscale x 4 x i32> @llvm.vp.shl.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n)
; LEGAL_LEGAL-NEXT: ret void

; Drop %evl where possible else fold %evl into %mask (%evl Discard, %mask Legal)
;
; There is no caching yet in the ExpandVectorPredication pass and the %evl
; expansion code is emitted for every non-speculatable intrinsic again. Hence,
; only check that..
; (1) The %evl folding code and %mask are correct for the first
;     non-speculatable VP intrinsic.
; (2) All other non-speculatable VP intrinsics have a modified mask argument.
; (3) All speculatable VP intrinsics keep their %mask and %evl.
; (4) All VP intrinsics have an ineffective %evl parameter.

; DISCARD_LEGAL: define void @test_vp_int_v8(<8 x i32> %i0, <8 x i32> %i1, <8 x i32> %i2, <8 x i32> %f3, <8 x i1> %m, i32 %n) {
; DISCARD_LEGAL-NEXT: %r0 = call <8 x i32> @llvm.vp.add.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8)
; DISCARD_LEGAL-NEXT: %r1 = call <8 x i32> @llvm.vp.sub.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8)
; DISCARD_LEGAL-NEXT: %r2 = call <8 x i32> @llvm.vp.mul.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8)
; DISCARD_LEGAL-NEXT: [[NSPLATINS:%.+]] = insertelement <8 x i32> poison, i32 %n, i32 0
; DISCARD_LEGAL-NEXT: [[NSPLAT:%.+]] = shufflevector <8 x i32> [[NSPLATINS]], <8 x i32> poison, <8 x i32> zeroinitializer
; DISCARD_LEGAL-NEXT: [[EVLMASK:%.+]] = icmp ult <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[NSPLAT]]
; DISCARD_LEGAL-NEXT: [[NEWMASK:%.+]] = and <8 x i1> [[EVLMASK]], %m
; DISCARD_LEGAL-NEXT: %r3 = call <8 x i32> @llvm.vp.sdiv.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> [[NEWMASK]], i32 8)
; DISCARD_LEGAL-NOT: %r4 = call <8 x i32> @llvm.vp.srem.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8)
; DISCARD_LEGAL-NOT: %r5 = call <8 x i32> @llvm.vp.udiv.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8)
; DISCARD_LEGAL-NOT: %r6 = call <8 x i32> @llvm.vp.urem.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8)
; DISCARD_LEGAL: %r7 = call <8 x i32> @llvm.vp.and.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8)
; DISCARD_LEGAL-NEXT: %r8 = call <8 x i32> @llvm.vp.or.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8)
; DISCARD_LEGAL-NEXT: %r9 = call <8 x i32> @llvm.vp.xor.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8)
; DISCARD_LEGAL-NEXT: %rA = call <8 x i32> @llvm.vp.ashr.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8)
; DISCARD_LEGAL-NEXT: %rB = call <8 x i32> @llvm.vp.lshr.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8)
; DISCARD_LEGAL-NEXT: %rC = call <8 x i32> @llvm.vp.shl.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8)
; DISCARD_LEGAL-NEXT: ret void

; TODO compute vscale only once and use caching.
; In the meantime, we only check for the correct vscale code for the first VP
; intrinsic and skip over it for all others.

; DISCARD_LEGAL: define void @test_vp_int_vscale(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i32> %i2, <vscale x 4 x i32> %f3, <vscale x 4 x i1> %m, i32 %n) {
; DISCARD_LEGAL-NEXT: %vscale = call i32 @llvm.vscale.i32()
; DISCARD_LEGAL-NEXT: %scalable_size = mul nuw i32 %vscale, 4
; DISCARD_LEGAL-NEXT: %r0 = call <vscale x 4 x i32> @llvm.vp.add.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %scalable_size)
; DISCARD_LEGAL: %r1 = call <vscale x 4 x i32> @llvm.vp.sub.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %scalable_size{{.*}})
; DISCARD_LEGAL: %r2 = call <vscale x 4 x i32> @llvm.vp.mul.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %scalable_size{{.*}})
; DISCARD_LEGAL: [[EVLM:%.+]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 %n)
; DISCARD_LEGAL: [[NEWM:%.+]] = and <vscale x 4 x i1> [[EVLM]], %m
; DISCARD_LEGAL: %r3 = call <vscale x 4 x i32> @llvm.vp.sdiv.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> [[NEWM]], i32 %scalable_size{{.*}})
; DISCARD_LEGAL-NOT: %{{.+}} = call <vscale x 4 x i32> @llvm.vp.{{.*}}, i32 %n)
; DISCARD_LEGAL: ret void


; Convert %evl into %mask everywhere (%evl Convert, %mask Legal)
;
; For the same reasons as in the (%evl Discard, %mask Legal) case only check that..
; (1) The %evl folding code and %mask are correct for the first VP intrinsic.
; (2) All other VP intrinsics have a modified mask argument.
; (3) All VP intrinsics have an ineffective %evl parameter.
;
; CONVERT_LEGAL: define void @test_vp_int_v8(<8 x i32> %i0, <8 x i32> %i1, <8 x i32> %i2, <8 x i32> %f3, <8 x i1> %m, i32 %n) {
; CONVERT_LEGAL-NEXT: [[NINS:%.+]] = insertelement <8 x i32> poison, i32 %n, i32 0
; CONVERT_LEGAL-NEXT: [[NSPLAT:%.+]] = shufflevector <8 x i32> [[NINS]], <8 x i32> poison, <8 x i32> zeroinitializer
; CONVERT_LEGAL-NEXT: [[EVLM:%.+]] = icmp ult <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[NSPLAT]]
; CONVERT_LEGAL-NEXT: [[NEWM:%.+]] = and <8 x i1> [[EVLM]], %m
; CONVERT_LEGAL-NEXT: %{{.+}} = call <8 x i32> @llvm.vp.add.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> [[NEWM]], i32 8)
; CONVERT_LEGAL-NOT: %{{.+}} = call <8 x i32> @llvm.vp.sub.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8)
; CONVERT_LEGAL-NOT: %{{.+}} = call <8 x i32> @llvm.vp.mul.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8)
; CONVERT_LEGAL-NOT: %{{.+}} = call <8 x i32> @llvm.vp.sdiv.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8)
; CONVERT_LEGAL-NOT: %{{.+}} = call <8 x i32> @llvm.vp.srem.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8)
; CONVERT_LEGAL-NOT: %{{.+}} = call <8 x i32> @llvm.vp.udiv.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8)
; CONVERT_LEGAL-NOT: %{{.+}} = call <8 x i32> @llvm.vp.urem.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8)
; CONVERT_LEGAL-NOT: %{{.+}} = call <8 x i32> @llvm.vp.and.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8)
; CONVERT_LEGAL-NOT: %{{.+}} = call <8 x i32> @llvm.vp.or.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8)
; CONVERT_LEGAL-NOT: %{{.+}} = call <8 x i32> @llvm.vp.xor.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8)
; CONVERT_LEGAL-NOT: %{{.+}} = call <8 x i32> @llvm.vp.ashr.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8)
; CONVERT_LEGAL-NOT: %{{.+}} = call <8 x i32> @llvm.vp.lshr.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8)
; CONVERT_LEGAL-NOT: %{{.+}} = call <8 x i32> @llvm.vp.shl.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8)
; CONVERT_LEGAL: ret void

; Similar to %evl discard, %mask legal but make sure the first VP intrinsic has a legal expansion
; CONVERT_LEGAL: define void @test_vp_int_vscale(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i32> %i2, <vscale x 4 x i32> %f3, <vscale x 4 x i1> %m, i32 %n) {
; CONVERT_LEGAL-NEXT: [[EVLM:%.+]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 %n)
; CONVERT_LEGAL-NEXT: [[NEWM:%.+]] = and <vscale x 4 x i1> [[EVLM]], %m
; CONVERT_LEGAL-NEXT: %vscale = call i32 @llvm.vscale.i32()
; CONVERT_LEGAL-NEXT: %scalable_size = mul nuw i32 %vscale, 4
; CONVERT_LEGAL-NEXT: %r0 = call <vscale x 4 x i32> @llvm.vp.add.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> [[NEWM]], i32 %scalable_size)
; CONVERT_LEGAL-NOT: %{{.*}} = call <vscale x 4 x i32> @llvm.vp.{{.*}}, i32 %n)
; CONVERT_LEGAL: ret void
@@ -25,6 +25,7 @@
; CHECK-NEXT: Shadow Stack GC Lowering
; CHECK-NEXT: Lower constant intrinsics
; CHECK-NEXT: Remove unreachable blocks from the CFG
; CHECK-NEXT: Expand vector predication intrinsics
; CHECK-NEXT: Scalarize Masked Memory Intrinsics
; CHECK-NEXT: Expand reduction intrinsics
; CHECK-NEXT: Expand indirectbr instructions
@@ -54,6 +54,7 @@
; CHECK-NEXT: Constant Hoisting
; CHECK-NEXT: Replace intrinsics with calls to vector library
; CHECK-NEXT: Partially inline calls to library functions
; CHECK-NEXT: Expand vector predication intrinsics
; CHECK-NEXT: Scalarize Masked Memory Intrinsics
; CHECK-NEXT: Expand reduction intrinsics
; CHECK-NEXT: Interleaved Access Pass
@@ -352,6 +352,7 @@ int main(int argc, char **argv) {
  initializeVectorization(*Registry);
  initializeScalarizeMaskedMemIntrinLegacyPassPass(*Registry);
  initializeExpandReductionsPass(*Registry);
  initializeExpandVectorPredicationPass(*Registry);
  initializeHardwareLoopsPass(*Registry);
  initializeTransformUtils(*Registry);
  initializeReplaceWithVeclibLegacyPass(*Registry);
@@ -513,7 +513,7 @@ static bool shouldPinPassToLegacyPM(StringRef Pass) {
      "safe-stack", "cost-model",
      "codegenprepare", "interleaved-load-combine",
      "unreachableblockelim", "verify-safepoint-ir",
      "atomic-expand",
      "atomic-expand", "expandvp",
      "hardware-loops", "type-promotion",
      "mve-tail-predication", "interleaved-access",
      "global-merge", "pre-isel-intrinsic-lowering",
@@ -591,6 +591,7 @@ int main(int argc, char **argv) {
  initializePostInlineEntryExitInstrumenterPass(Registry);
  initializeUnreachableBlockElimLegacyPassPass(Registry);
  initializeExpandReductionsPass(Registry);
  initializeExpandVectorPredicationPass(Registry);
  initializeWasmEHPreparePass(Registry);
  initializeWriteBitcodePassPass(Registry);
  initializeHardwareLoopsPass(Registry);