[LV] Support vectorization of interleave-groups that require an epilog under optsize using masked wide loads

Under Opt for Size, the vectorizer does not vectorize interleave-groups that
have gaps at the end of the group (such as a loop that reads only the even
elements: a[2*i]), because that implies we'd require a scalar epilogue, which
is not allowed under Opt for Size. This patch extends the support for
masked-interleave-groups (introduced by D53011 for conditional accesses) to
also cover the case of gaps in a group of loads; targets that enable the
masked-interleave-group feature no longer have to invalidate interleave-groups
of loads with gaps; they can now use masked wide-loads and shuffles (if that's
what the cost model selects).
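As an illustration only (not part of the patch), here is a minimal C version of the even-elements loop mentioned above, with a comment sketching how such a group can now be handled when the target enables masked interleaved accesses; the function name and the vectorization factor are made up for the example:

/* Illustrative: a stride-2 read with a gap at the odd positions. */
void even_elements(const unsigned char *restrict p,
                   unsigned char *restrict q, int n) {
  for (int i = 0; i < n; ++i)
    q[i] = p[2 * i]; /* only p[0], p[2], p[4], ... are read */
}
/* Conceptually, for a vectorization factor of 8 the vector body can issue one
   masked wide load of 16 bytes whose mask clears the gap lanes
   (<1,0,1,0,...,1,0>), then shuffle out lanes 0,2,4,...,14, instead of
   peeling a scalar epilogue for the last group (see the x86 tests below). */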

Reviewers: Ayal, hsaito, dcaballe, fhahn

Reviewed By: Ayal

Differential Revision: https://reviews.llvm.org/D53668

llvm-svn: 345705
Dorit Nuzman 2018-10-31 09:57:56 +00:00
parent 889356eb71
commit 34da6dd696
20 changed files with 452 additions and 152 deletions


@ -588,7 +588,8 @@ public:
bool enableInterleavedAccessVectorization() const;
/// Enable matching of interleaved access groups that contain predicated
/// accesses and are vectorized using masked vector loads/stores.
/// accesses or gaps and are therefore vectorized using masked
/// vector loads/stores.
bool enableMaskedInterleavedAccessVectorization() const;
/// Indicate that it is potentially unsafe to automatically vectorize
@ -827,11 +828,13 @@ public:
/// load allows gaps)
/// \p Alignment is the alignment of the memory operation
/// \p AddressSpace is address space of the pointer.
/// \p IsMasked indicates if the memory access is predicated.
/// \p UseMaskForCond indicates if the memory access is predicated.
/// \p UseMaskForGaps indicates if gaps should be masked.
int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
ArrayRef<unsigned> Indices, unsigned Alignment,
unsigned AddressSpace,
bool IsMasked = false) const;
unsigned AddressSpace,
bool UseMaskForCond = false,
bool UseMaskForGaps = false) const;
/// Calculate the cost of performing a vector reduction.
///
@ -1142,7 +1145,8 @@ public:
ArrayRef<unsigned> Indices,
unsigned Alignment,
unsigned AddressSpace,
bool IsMasked = false) = 0;
bool UseMaskForCond = false,
bool UseMaskForGaps = false) = 0;
virtual int getArithmeticReductionCost(unsigned Opcode, Type *Ty,
bool IsPairwiseForm) = 0;
virtual int getMinMaxReductionCost(Type *Ty, Type *CondTy,
@ -1484,9 +1488,11 @@ public:
}
int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
ArrayRef<unsigned> Indices, unsigned Alignment,
unsigned AddressSpace, bool IsMasked) override {
unsigned AddressSpace, bool UseMaskForCond,
bool UseMaskForGaps) override {
return Impl.getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
Alignment, AddressSpace, IsMasked);
Alignment, AddressSpace,
UseMaskForCond, UseMaskForGaps);
}
int getArithmeticReductionCost(unsigned Opcode, Type *Ty,
bool IsPairwiseForm) override {


@ -453,7 +453,8 @@ public:
unsigned Factor,
ArrayRef<unsigned> Indices,
unsigned Alignment, unsigned AddressSpace,
bool IsMasked = false) {
bool UseMaskForCond = false,
bool UseMaskForGaps = false) {
return 1;
}


@ -24,6 +24,7 @@ namespace llvm {
template <typename T> class ArrayRef;
class DemandedBits;
class GetElementPtrInst;
class InterleaveGroup;
class Loop;
class ScalarEvolution;
class TargetTransformInfo;
@ -125,6 +126,20 @@ computeMinimumValueSizes(ArrayRef<BasicBlock*> Blocks,
/// This function always sets a (possibly null) value for each K in Kinds.
Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL);
/// Create a mask that filters the members of an interleave group where there
/// are gaps.
///
/// For example, the mask for \p Group with interleave-factor 3
/// and \p VF 4, that has only its first member present is:
///
/// <1,0,0,1,0,0,1,0,0,1,0,0>
///
/// Note: The result is a mask of 0's and 1's, as opposed to the other
/// create[*]Mask() utilities which create a shuffle mask (mask that
/// consists of indices).
Constant *createBitMaskForGaps(IRBuilder<> &Builder, unsigned VF,
const InterleaveGroup &Group);
/// Create a mask with replicated elements.
///
/// This function creates a shuffle mask for replicating each of the \p VF
@ -406,8 +421,8 @@ public:
bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; }
/// Invalidate groups that require a scalar epilogue (due to gaps). This can
/// happen when we optimize for size and don't allow creating a scalar
/// epilogue.
/// happen when optimizing for size forbids a scalar epilogue, and the gap
/// cannot be filtered by masking the load/store.
void invalidateGroupsRequiringScalarEpilogue();
private:


@ -804,7 +804,8 @@ public:
unsigned Factor,
ArrayRef<unsigned> Indices,
unsigned Alignment, unsigned AddressSpace,
bool IsMasked = false) {
bool UseMaskForCond = false,
bool UseMaskForGaps = false) {
VectorType *VT = dyn_cast<VectorType>(VecTy);
assert(VT && "Expect a vector type for interleaved memory op");
@ -816,7 +817,7 @@ public:
// Firstly, the cost of load/store operation.
unsigned Cost;
if (IsMasked)
if (UseMaskForCond || UseMaskForGaps)
Cost = static_cast<T *>(this)->getMaskedMemoryOpCost(
Opcode, VecTy, Alignment, AddressSpace);
else
@ -917,7 +918,7 @@ public:
->getVectorInstrCost(Instruction::InsertElement, VT, i);
}
if (!IsMasked)
if (!UseMaskForCond)
return Cost;
Type *I8Type = Type::getInt8Ty(VT->getContext());
@ -942,6 +943,15 @@ public:
Cost += static_cast<T *>(this)->getVectorInstrCost(
Instruction::InsertElement, MaskVT, i);
// The Gaps mask is invariant and created outside the loop; therefore the
// cost of creating it is not accounted for here. However, if we have both
// a MaskForGaps and some other mask that guards the execution of the
// memory access, we need to account for the cost of And-ing the two masks
// inside the loop.
if (UseMaskForGaps)
Cost += static_cast<T *>(this)->getArithmeticInstrCost(
BinaryOperator::And, MaskVT);
return Cost;
}
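For illustration only, a self-contained C sketch (made-up names, not LLVM API) of the mask combination whose And is costed above: a VF-wide condition mask is replicated across the interleave factor and And-ed with the (VF*Factor)-wide gaps mask, mirroring the shufflevector-plus-and sequence the vectorizer emits:

/* Illustrative sketch: build the final lane mask for a masked wide load from
   a per-iteration condition mask and an invariant gaps mask. */
void combine_group_masks(const _Bool *cond,  /* VF lanes          */
                         const _Bool *gaps,  /* VF * Factor lanes */
                         _Bool *out,         /* VF * Factor lanes */
                         unsigned VF, unsigned Factor) {
  for (unsigned lane = 0; lane < VF; ++lane)
    for (unsigned member = 0; member < Factor; ++member)
      /* replicate cond[lane] Factor times, then And with the gaps bit */
      out[lane * Factor + member] =
          cond[lane] && gaps[lane * Factor + member];
}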


@ -519,9 +519,12 @@ int TargetTransformInfo::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
int TargetTransformInfo::getInterleavedMemoryOpCost(
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
unsigned Alignment, unsigned AddressSpace, bool IsMasked) const {
int Cost = TTIImpl->getInterleavedMemoryOpCost(
Opcode, VecTy, Factor, Indices, Alignment, AddressSpace, IsMasked);
unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond,
bool UseMaskForGaps) const {
int Cost = TTIImpl->getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
Alignment, AddressSpace,
UseMaskForCond,
UseMaskForGaps);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}


@ -504,6 +504,25 @@ Instruction *llvm::propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
return Inst;
}
Constant *llvm::createBitMaskForGaps(IRBuilder<> &Builder, unsigned VF,
const InterleaveGroup &Group) {
// All 1's means mask is not needed.
if (Group.getNumMembers() == Group.getFactor())
return nullptr;
// TODO: support reversed access.
assert(!Group.isReverse() && "Reversed group not supported.");
SmallVector<Constant *, 16> Mask;
for (unsigned i = 0; i < VF; i++)
for (unsigned j = 0; j < Group.getFactor(); ++j) {
unsigned HasMember = Group.getMember(j) ? 1 : 0;
Mask.push_back(Builder.getInt1(HasMember));
}
return ConstantVector::get(Mask);
}
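As a usage illustration (not part of the change), a plain-C model of what the helper above computes for the stride-2 load groups in the x86 tests below (Factor = 2, VF = 8, only member 0 present); the printed pattern matches the <i1 true, i1 false, ...> constant fed to the masked loads there:

#include <stdio.h>

/* Illustrative model: one bit per lane of the wide vector, set when the
   corresponding group member exists (member_present is indexed per member). */
static void bit_mask_for_gaps(unsigned VF, unsigned Factor,
                              const int *member_present, _Bool *mask) {
  for (unsigned i = 0; i < VF; ++i)
    for (unsigned j = 0; j < Factor; ++j)
      mask[i * Factor + j] = member_present[j] != 0;
}

int main(void) {
  int present[2] = {1, 0}; /* member 0 present, member 1 is the gap */
  _Bool mask[16];
  bit_mask_for_gaps(8, 2, present, mask);
  for (unsigned k = 0; k < 16; ++k)
    printf("%d%c", mask[k] ? 1 : 0, k + 1 < 16 ? ',' : '\n');
  /* prints: 1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0 */
  return 0;
}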
Constant *llvm::createReplicatedMask(IRBuilder<> &Builder,
unsigned ReplicationFactor, unsigned VF) {
SmallVector<Constant *, 16> MaskVec;
@ -935,9 +954,10 @@ void InterleavedAccessInfo::invalidateGroupsRequiringScalarEpilogue() {
}
for (auto *Ptr : DelSet) {
LLVM_DEBUG(
dbgs()
<< "LV: Invalidate candidate interleaved group due to gaps that "
"require a scalar epilogue.\n");
"require a scalar epilogue (not allowed under optsize) and cannot "
"be masked (not enabled). \n");
releaseGroup(Ptr);
}


@ -660,11 +660,13 @@ int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
ArrayRef<unsigned> Indices,
unsigned Alignment,
unsigned AddressSpace,
bool IsMasked) {
bool UseMaskForCond,
bool UseMaskForGaps) {
assert(Factor >= 2 && "Invalid interleave factor");
assert(isa<VectorType>(VecTy) && "Expect a vector type");
if (!IsMasked && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
if (!UseMaskForCond && !UseMaskForGaps &&
Factor <= TLI->getMaxSupportedInterleaveFactor()) {
unsigned NumElts = VecTy->getVectorNumElements();
auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
@ -677,7 +679,8 @@ int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
}
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
Alignment, AddressSpace, IsMasked);
Alignment, AddressSpace,
UseMaskForCond, UseMaskForGaps);
}
int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {


@ -146,7 +146,9 @@ public:
int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
ArrayRef<unsigned> Indices, unsigned Alignment,
unsigned AddressSpace, bool IsMasked = false);
unsigned AddressSpace,
bool UseMaskForCond = false,
bool UseMaskForGaps = false);
bool
shouldConsiderAddressTypePromotion(const Instruction &I,


@ -564,7 +564,8 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
ArrayRef<unsigned> Indices,
unsigned Alignment,
unsigned AddressSpace,
bool IsMasked) {
bool UseMaskForCond,
bool UseMaskForGaps) {
assert(Factor >= 2 && "Invalid interleave factor");
assert(isa<VectorType>(VecTy) && "Expect a vector type");
@ -572,7 +573,7 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
!IsMasked) {
!UseMaskForCond && !UseMaskForGaps) {
unsigned NumElts = VecTy->getVectorNumElements();
auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
@ -585,7 +586,8 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
}
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
Alignment, AddressSpace, IsMasked);
Alignment, AddressSpace,
UseMaskForCond, UseMaskForGaps);
}
void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,


@ -169,7 +169,9 @@ public:
int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
ArrayRef<unsigned> Indices, unsigned Alignment,
unsigned AddressSpace, bool IsMasked);
unsigned AddressSpace,
bool UseMaskForCond = false,
bool UseMaskForGaps = false);
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP);


@ -206,10 +206,12 @@ unsigned HexagonTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
unsigned HexagonTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode,
Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
unsigned Alignment, unsigned AddressSpace, bool IsMasked) {
if (Indices.size() != Factor || IsMasked)
unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond,
bool UseMaskForGaps) {
if (Indices.size() != Factor || UseMaskForCond || UseMaskForGaps)
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
Alignment, AddressSpace, IsMasked);
Alignment, AddressSpace,
UseMaskForCond, UseMaskForGaps);
return getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, nullptr);
}


@ -123,7 +123,8 @@ public:
bool VariableMask, unsigned Alignment);
unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
unsigned Factor, ArrayRef<unsigned> Indices, unsigned Alignment,
unsigned AddressSpace, bool IsMasked);
unsigned AddressSpace, bool UseMaskForCond = false,
bool UseMaskForGaps = false);
unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
const Instruction *I);
unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,


@ -474,10 +474,12 @@ int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
ArrayRef<unsigned> Indices,
unsigned Alignment,
unsigned AddressSpace,
bool IsMasked) {
if (IsMasked)
bool UseMaskForCond,
bool UseMaskForGaps) {
if (UseMaskForCond || UseMaskForGaps)
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
Alignment, AddressSpace, IsMasked);
Alignment, AddressSpace,
UseMaskForCond, UseMaskForGaps);
assert(isa<VectorType>(VecTy) &&
"Expect a vector type for interleaved memory op");


@ -91,7 +91,8 @@ public:
ArrayRef<unsigned> Indices,
unsigned Alignment,
unsigned AddressSpace,
bool IsMasked = false);
bool UseMaskForCond = false,
bool UseMaskForGaps = false);
/// @}
};


@ -969,10 +969,12 @@ int SystemZTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
ArrayRef<unsigned> Indices,
unsigned Alignment,
unsigned AddressSpace,
bool IsMasked) {
if (IsMasked)
bool UseMaskForCond,
bool UseMaskForGaps) {
if (UseMaskForCond || UseMaskForGaps)
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
Alignment, AddressSpace, IsMasked);
Alignment, AddressSpace,
UseMaskForCond, UseMaskForGaps);
assert(isa<VectorType>(VecTy) &&
"Expect a vector type for interleaved memory op");


@ -93,7 +93,9 @@ public:
unsigned Factor,
ArrayRef<unsigned> Indices,
unsigned Alignment,
unsigned AddressSpace, bool IsMasked = false);
unsigned AddressSpace,
bool UseMaskForCond = false,
bool UseMaskForGaps = false);
/// @}
};


@ -2784,11 +2784,13 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
ArrayRef<unsigned> Indices,
unsigned Alignment,
unsigned AddressSpace,
bool IsMasked) {
bool UseMaskForCond,
bool UseMaskForGaps) {
if (IsMasked)
if (UseMaskForCond || UseMaskForGaps)
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
Alignment, AddressSpace, IsMasked);
Alignment, AddressSpace,
UseMaskForCond, UseMaskForGaps);
// We currently support only fully-interleaved groups, with no gaps.
// TODO: Support also strided loads (interleaved-groups with gaps).
@ -2898,11 +2900,13 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
ArrayRef<unsigned> Indices,
unsigned Alignment,
unsigned AddressSpace,
bool IsMasked) {
bool UseMaskForCond,
bool UseMaskForGaps) {
if (IsMasked)
if (UseMaskForCond || UseMaskForGaps)
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
Alignment, AddressSpace, IsMasked);
Alignment, AddressSpace,
UseMaskForCond, UseMaskForGaps);
// VecTy for interleave memop is <VF*Factor x Elt>.
// So, for VF=4, Interleave Factor = 3, Element type = i32 we have
@ -3021,7 +3025,8 @@ int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
ArrayRef<unsigned> Indices,
unsigned Alignment,
unsigned AddressSpace,
bool IsMasked) {
bool UseMaskForCond,
bool UseMaskForGaps) {
auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) {
Type *EltTy = VecTy->getVectorElementType();
if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
@ -3033,11 +3038,14 @@ int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
};
if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
Alignment, AddressSpace, IsMasked);
Alignment, AddressSpace,
UseMaskForCond, UseMaskForGaps);
if (ST->hasAVX2())
return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices,
Alignment, AddressSpace, IsMasked);
Alignment, AddressSpace,
UseMaskForCond, UseMaskForGaps);
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
Alignment, AddressSpace, IsMasked);
Alignment, AddressSpace,
UseMaskForCond, UseMaskForGaps);
}


@ -102,15 +102,18 @@ public:
int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
unsigned Factor, ArrayRef<unsigned> Indices,
unsigned Alignment, unsigned AddressSpace,
bool IsMasked = false);
bool UseMaskForCond = false,
bool UseMaskForGaps = false);
int getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
unsigned Factor, ArrayRef<unsigned> Indices,
unsigned Alignment, unsigned AddressSpace,
bool IsMasked = false);
bool UseMaskForCond = false,
bool UseMaskForGaps = false);
int getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
unsigned Factor, ArrayRef<unsigned> Indices,
unsigned Alignment, unsigned AddressSpace,
bool IsMasked = false);
bool UseMaskForCond = false,
bool UseMaskForGaps = false);
int getIntImmCost(int64_t);


@ -172,6 +172,8 @@ static cl::opt<bool> EnableInterleavedMemAccesses(
"enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
"enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
@ -1134,11 +1136,15 @@ public:
}
/// Returns true if an interleaved group requires a scalar iteration
/// to handle accesses with gaps.
/// to handle accesses with gaps, and there is nothing preventing us from
/// creating a scalar epilogue.
bool requiresScalarEpilogue() const {
return InterleaveInfo.requiresScalarEpilogue();
return IsScalarEpilogueAllowed && InterleaveInfo.requiresScalarEpilogue();
}
/// Returns true if a scalar epilogue is not allowed due to optsize.
bool isScalarEpilogueAllowed() const { return IsScalarEpilogueAllowed; }
/// Returns true if all loop blocks should be masked to fold tail loop.
bool foldTailByMasking() const { return FoldTailByMasking; }
@ -1229,6 +1235,15 @@ private:
/// vectorization as a predicated block.
SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
/// Records whether it is allowed to have the original scalar loop execute at
/// least once. This may be needed as a fallback loop in case runtime
/// aliasing/dependence checks fail, or to handle the tail/remainder
/// iterations when the trip count is unknown or doesn't divide by the VF,
/// or as a peel-loop to handle gaps in interleave-groups.
/// Under optsize and when the trip count is very small we don't allow any
/// iterations to execute in the scalar loop.
bool IsScalarEpilogueAllowed = true;
/// All blocks of loop are to be masked to fold tail of scalar iterations.
bool FoldTailByMasking = false;
@ -1938,6 +1953,17 @@ Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
"reverse");
}
// Return whether we allow using masked interleave-groups (for dealing with
// strided loads/stores that reside in predicated blocks, or for dealing
// with gaps).
static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
// If an override option has been passed in for interleaved accesses, use it.
if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
return EnableMaskedInterleavedMemAccesses;
return TTI.enableMaskedInterleavedAccessVectorization();
}
// Try to vectorize the interleave group that \p Instr belongs to.
//
// E.g. Translate following interleaved load group (factor = 3):
@ -1990,12 +2016,12 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
unsigned Index = Group->getIndex(Instr);
VectorParts Mask;
bool IsMaskRequired = BlockInMask;
if (IsMaskRequired) {
bool IsMaskForCondRequired = BlockInMask;
if (IsMaskForCondRequired) {
Mask = *BlockInMask;
// TODO: extend the masked interleaved-group support to reversed access.
assert(!Group->isReverse() && "Reversed masked interleave-group "
"not supported.");
"not supported.");
}
// If the group is reverse, adjust the index to refer to the last vector lane
@ -2036,20 +2062,35 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
setDebugLocFromInst(Builder, Instr);
Value *UndefVec = UndefValue::get(VecTy);
Value *MaskForGaps = nullptr;
if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
assert(MaskForGaps && "Mask for Gaps is required but it is null");
}
// Vectorize the interleaved load group.
if (isa<LoadInst>(Instr)) {
// For each unroll part, create a wide load for the group.
SmallVector<Value *, 2> NewLoads;
for (unsigned Part = 0; Part < UF; Part++) {
Instruction *NewLoad;
if (IsMaskRequired) {
auto *Undefs = UndefValue::get(Mask[Part]->getType());
auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
Value *ShuffledMask = Builder.CreateShuffleVector(
Mask[Part], Undefs, RepMask, "interleaved.mask");
NewLoad = Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(),
ShuffledMask, UndefVec,
"wide.masked.vec");
if (IsMaskForCondRequired || MaskForGaps) {
assert(useMaskedInterleavedAccesses(*TTI) &&
"masked interleaved groups are not allowed.");
Value *GroupMask = MaskForGaps;
if (IsMaskForCondRequired) {
auto *Undefs = UndefValue::get(Mask[Part]->getType());
auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
Value *ShuffledMask = Builder.CreateShuffleVector(
Mask[Part], Undefs, RepMask, "interleaved.mask");
GroupMask = MaskForGaps
? Builder.CreateBinOp(Instruction::And, ShuffledMask,
MaskForGaps)
: ShuffledMask;
}
NewLoad =
Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(),
GroupMask, UndefVec, "wide.masked.vec");
}
else
NewLoad = Builder.CreateAlignedLoad(NewPtrs[Part],
@ -2121,7 +2162,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
"interleaved.vec");
Instruction *NewStoreInstr;
if (IsMaskRequired) {
if (IsMaskForCondRequired) {
auto *Undefs = UndefValue::get(Mask[Part]->getType());
auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
Value *ShuffledMask = Builder.CreateShuffleVector(
@ -4333,29 +4374,32 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigne
return false;
}
static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
if (!(EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0))
return TTI.enableMaskedInterleavedAccessVectorization();
// If an override option has been passed in for interleaved accesses, use it.
return EnableMaskedInterleavedMemAccesses;
}
bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
unsigned VF) {
assert(isAccessInterleaved(I) && "Expecting interleaved access.");
assert(getWideningDecision(I, VF) == CM_Unknown &&
"Decision should not be set yet.");
auto *Group = getInterleavedAccessGroup(I);
assert(Group && "Must have a group.");
if (!Legal->blockNeedsPredication(I->getParent()) ||
!Legal->isMaskRequired(I))
// Check if masking is required.
// A Group may need masking for one of two reasons: it resides in a block that
// needs predication, or it was decided to use masking to deal with gaps.
bool PredicatedAccessRequiresMasking =
Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
bool AccessWithGapsRequiresMasking =
Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed;
if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
return true;
if (!useMaskedInterleavedAccesses(TTI))
return false;
// If masked interleaving is required, we expect that the user/target had
// enabled it, because otherwise it either wouldn't have been created or
// it should have been invalidated by the CostModel.
assert(useMaskedInterleavedAccesses(TTI) &&
"Masked interleave-groups for predicated accesses are not enabled.");
auto *Ty = getMemInstValueType(I);
return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty)
: TTI.isLegalMaskedStore(Ty);
}
@ -4606,9 +4650,13 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
// Record that scalar epilogue is not allowed.
LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
IsScalarEpilogueAllowed = !OptForSize;
// We don't create an epilogue when optimizing for size.
// Invalidate interleave groups that require an epilogue.
InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
// Invalidate interleave groups that require an epilogue if we can't mask
// the interleave-group.
if (!useMaskedInterleavedAccesses(TTI))
InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC);
@ -5495,13 +5543,15 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
}
// Calculate the cost of the whole interleaved group.
bool UseMaskForGaps =
Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed;
unsigned Cost = TTI.getInterleavedMemoryOpCost(
I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
Group->getAlignment(), AS, Legal->isMaskRequired(I));
Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
if (Group->isReverse()) {
// TODO: Add support for reversed masked interleaved access.
assert(!Legal->isMaskRequired(I) &&
"Reverse masked interleaved access not supported.");
Cost += Group->getNumMembers() *
TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);


@ -84,26 +84,39 @@ for.end:
; Exactly the same scenario except we are now optimizing for size, therefore
; we check that no scalar epilogue is created. Since we can't create an epilog
; the interleave-group is invalidated because it has gaps, so we end up
; scalarizing.
; (Before the fix that this test checks, we used to create an epilogue despite
; optsize, and vectorized the access as an interleaved-group. This is now fixed,
; and we make sure that a scalar epilogue does not exist).
; we need the ability to mask out the gaps.
; When enable-masked-interleaved-access is enabled, the interleave-groups will
; be vectorized with masked wide-loads with the mask properly shuffled and
; And-ed with the gaps mask.
;ENABLED_MASKED_STRIDED-LABEL: @masked_strided1_optsize(
;ENABLED_MASKED_STRIDED: vector.body:
;ENABLED_MASKED_STRIDED-NEXT: %index = phi i32
;ENABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
;ENABLED_MASKED_STRIDED-NOT: %interleaved.mask =
;ENABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.v16i8.p0v16i8
;ENABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
;ENABLED_MASKED_STRIDED-NEXT: %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
;ENABLED_MASKED_STRIDED-NEXT: %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0
;ENABLED_MASKED_STRIDED-NEXT: br i1 %[[M]], label %pred.load.if, label %pred.load.continue
;ENABLED_MASKED_STRIDED-NOT: %interleaved.mask =
;ENABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.v16i8.p0v16i8
;ENABLED_MASKED_STRIDED-NOT: for.body:
;ENABLED_MASKED_STRIDED: for.end:
;ENABLED_MASKED_STRIDED-NEXT: entry:
;ENABLED_MASKED_STRIDED-NEXT: [[CONV:%.*]] = zext i8 [[GUARD:%.*]] to i32
;ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[CONV]], i32 0
;ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
;ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]]
;ENABLED_MASKED_STRIDED: vector.body:
;ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
;ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
;ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
;ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1
;ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]]
;ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
;ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP0]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
;ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>
;ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[TMP4]], <16 x i8> undef)
;ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
;ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
;ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <8 x i8>*
;ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP6]], i32 1, <8 x i1> [[TMP0]])
;ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
;ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
;ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
;ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP7]]
;ENABLED_MASKED_STRIDED-NOT: for.body:
;ENABLED_MASKED_STRIDED: for.end:
;ENABLED_MASKED_STRIDED-NEXT: ret void
define dso_local void @masked_strided1_optsize(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr optsize {
entry:
@ -138,12 +151,15 @@ for.end:
; remainder loop into the main loop using masking) together with interleaved-
; groups.
; When masked-interleave-group is disabled the interleave-groups will be
; invalidated during Legality checks;
; When masked-interleave-group is enabled the interleave-groups will be
; invalidated during cost-model checks, because we don't have a way to support
; interleave-groups with gaps that require an epilogue using masking.
; So in both cases we check for no epilogue and scalarized conditional accesses.
; invalidated during Legality checks; so there we check for no epilogue
; and for scalarized conditional accesses.
; When masked-interleave-group is enabled we check that there is no epilogue,
; and that the interleave-groups are vectorized using proper masking (with
; shuffling of the mask feeding the wide masked load/store).
; The mask itself is an And of two masks: one that masks away the remainder
; iterations, and one that masks away the 'else' of the 'if' statement.
; The shuffled mask is also And-ed with the gaps mask.
;
; void masked_strided1_optsize_unknown_tc(const unsigned char* restrict p,
; unsigned char* restrict q,
; unsigned char guard,
@ -178,21 +194,39 @@ for.end:
; ENABLED_MASKED_STRIDED-LABEL: @masked_strided1_optsize_unknown_tc(
; ENABLED_MASKED_STRIDED-NEXT: entry:
; ENABLED_MASKED_STRIDED-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
; ENABLED_MASKED_STRIDED-NEXT: br i1 [[CMP9]], label [[VECTOR_PH:%.*]], label [[FOR_END:%.*]]
; ENABLED_MASKED_STRIDED: vector.ph:
; ENABLED_MASKED_STRIDED-NEXT: [[CONV:%.*]] = zext i8 [[GUARD:%.*]] to i32
; ENABLED_MASKED_STRIDED-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7
; ENABLED_MASKED_STRIDED-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
; ENABLED_MASKED_STRIDED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[CONV]], i32 0
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> undef, <8 x i32> zeroinitializer
; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]]
; ENABLED_MASKED_STRIDED: vector.body:
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi
; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], {{.*}}
; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}}
; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = and <8 x i1> [[TMP0]], [[TMP2]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0
; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; ENABLED_MASKED_STRIDED: pred.load.if:
; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0
; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP5]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP6]], align 1
; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = insertelement <8 x i8> undef, i8 [[TMP7]], i32 0
; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE]]
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1
; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = and <8 x i1> [[TMP0]], [[TMP3]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>
; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP5]], i32 1, <16 x i1> [[TMP6]], <16 x i8> undef)
; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <8 x i8>*
; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP8]], i32 1, <8 x i1> [[TMP4]])
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP9]], label [[FOR_END]], label [[VECTOR_BODY]]
; ENABLED_MASKED_STRIDED-NOT: for.body:
; ENABLED_MASKED_STRIDED: for.end:
; ENABLED_MASKED_STRIDED-NEXT: ret void
@ -231,17 +265,115 @@ for.end:
ret void
}
; Same, with stride 3. This is to check the gaps-mask and the shuffled mask
; with a different stride.
; So accesses are with gaps under Optsize scenario again, with unknown trip-
; count, in order to check the behavior of folding-the-tail (folding the
; remainder loop into the main loop using masking) together with interleaved-
; groups.
; When masked-interleave-group is enabled we check that there is no epilogue,
; and that the interleave-groups are vectorized using proper masking (with
; shuffling of the mask feeding the wide masked load/store).
; The mask itself is an And of two masks: one that masks away the remainder
; iterations, and one that masks away the 'else' of the 'if' statement.
; The shuffled mask is also And-ed with the gaps mask.
;
; void masked_strided3_optsize_unknown_tc(const unsigned char* restrict p,
; unsigned char* restrict q,
; unsigned char guard,
; int n) {
; for(ix=0; ix < n; ++ix) {
; if (ix > guard) {
; char t = p[3*ix];
; q[ix] = t;
; }
; }
; }
; Same, but the load/store are not predicated. The interleave-group is
; invalidated here as well because we have gaps and we can't create an epilog.
; The access is thus scalarized.
; ENABLED_MASKED_STRIDED-LABEL: @masked_strided3_optsize_unknown_tc(
; ENABLED_MASKED_STRIDED-NEXT: entry:
; ENABLED_MASKED_STRIDED-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
; ENABLED_MASKED_STRIDED-NEXT: br i1 [[CMP9]], label [[VECTOR_PH:%.*]], label [[FOR_END:%.*]]
; ENABLED_MASKED_STRIDED: vector.ph:
; ENABLED_MASKED_STRIDED-NEXT: [[CONV:%.*]] = zext i8 [[GUARD:%.*]] to i32
; ENABLED_MASKED_STRIDED-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7
; ENABLED_MASKED_STRIDED-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
; ENABLED_MASKED_STRIDED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[CONV]], i32 0
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> undef, <8 x i32> zeroinitializer
; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]]
; ENABLED_MASKED_STRIDED: vector.body:
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = mul nsw i32 [[INDEX]], 3
; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = and <8 x i1> [[TMP0]], [[TMP3]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to <24 x i8>*
; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> undef, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = and <24 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false>
; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <24 x i8> @llvm.masked.load.v24i8.p0v24i8(<24 x i8>* [[TMP5]], i32 1, <24 x i1> [[TMP6]], <24 x i8> undef)
; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x i8> [[WIDE_MASKED_VEC]], <24 x i8> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <8 x i8>*
; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP8]], i32 1, <8 x i1> [[TMP4]])
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP9]], label [[FOR_END]], label [[VECTOR_BODY]]
; ENABLED_MASKED_STRIDED: for.end:
; ENABLED_MASKED_STRIDED-NEXT: ret void
;
define dso_local void @masked_strided3_optsize_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard, i32 %n) local_unnamed_addr optsize {
entry:
%cmp9 = icmp sgt i32 %n, 0
br i1 %cmp9, label %for.body.lr.ph, label %for.end
for.body.lr.ph:
%conv = zext i8 %guard to i32
br label %for.body
for.body:
%ix.010 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
%cmp1 = icmp ugt i32 %ix.010, %conv
br i1 %cmp1, label %if.then, label %for.inc
if.then:
%mul = mul nsw i32 %ix.010, 3
%arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
%0 = load i8, i8* %arrayidx, align 1
%arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.010
store i8 %0, i8* %arrayidx3, align 1
br label %for.inc
for.inc:
%inc = add nuw nsw i32 %ix.010, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.end.loopexit, label %for.body
for.end.loopexit:
br label %for.end
for.end:
ret void
}
; Back to stride 2 with gaps with a known trip count under opt for size,
; but this time the load/store are not predicated.
; When enable-masked-interleaved-access is disabled, the interleave-groups will
; be invalidated during cost-model checks because we have gaps and we can't
; create an epilog. The access is thus scalarized.
; (Before the fix that this test checks, we used to create an epilogue despite
; optsize, and vectorized the access as an interleaved-group. This is now fixed,
; and we make sure that a scalar epilogue does not exist).
; Since enable-masked-interleaved-accesses currently only affects predicated
; accesses, the behavior is the same with this switch set/unset.
; When enable-masked-interleaved-access is enabled, the interleave-groups will
; be vectorized with masked wide-loads (masking away the gaps).
;
; void unconditional_strided1_optsize(const unsigned char* restrict p,
; unsigned char* restrict q,
; unsigned char guard) {
@ -259,11 +391,25 @@ for.end:
;DISABLED_MASKED_STRIDED: for.end:
;ENABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize(
;ENABLED_MASKED_STRIDED: vector.body:
;ENABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.v16i8.p0v16i8
;ENABLED_MASKED_STRIDED: %{{.*}} = extractelement <8 x i32> %{{.*}}, i32 0
;ENABLED_MASKED_STRIDED-NOT: for.body:
;ENABLED_MASKED_STRIDED: for.end:
;ENABLED_MASKED_STRIDED-NEXT: entry:
;ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]]
;ENABLED_MASKED_STRIDED: vector.body:
;ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
;ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw i32 [[INDEX]], 1
;ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP0]]
;ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <16 x i8>*
;ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP2]], i32 1, <16 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <16 x i8> undef)
;ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
;ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
;ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <8 x i8>*
;ENABLED_MASKED_STRIDED-NEXT: store <8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP4]], align 1
;ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
;ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
;ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP5]], label [[FOR_END:%.*]], label [[VECTOR_BODY]]
;ENABLED_MASKED_STRIDED-NOT: for.body:
;ENABLED_MASKED_STRIDED: for.end:
;ENABLED_MASKED_STRIDED-NEXT: ret void
define dso_local void @unconditional_strided1_optsize(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr optsize {
entry:
@ -289,13 +435,17 @@ for.end:
; Unconditional accesses with gaps under Optsize scenario again, with unknown
; trip-count this time, in order to check the behavior of folding-the-tail
; (folding the remainder loop into the main loop using masking) together with
; interleaved-groups.
; The interleave-groups will be invalidated during cost-model checks, because
; we don't have a way to support interleave-groups with gaps that require an
; epilogue using masking (even when interleaved-masking is enabled; this
; is not yet supported).
; So we check for no epilogue and for scalarized conditional accesses.
; interleaved-groups. Folding-the-tail turns the accesses into conditional
; accesses, which require proper masking. In addition we need to mask out the
; gaps (all because we are not allowed to use an epilog due to optsize).
; When enable-masked-interleaved-access is disabled, the interleave-groups will
; be invalidated during cost-model checks. So there we check for no epilogue
; and for scalarized conditional accesses.
; When masked-interleave-group is enabled we check that there is no epilogue,
; and that the interleave-groups are vectorized using proper masking (with
; shuffling of the mask feeding the wide masked load/store).
; The shuffled mask is also And-ed with the gaps mask.
;
; for(ix=0; ix < n; ++ix) {
; char t = p[2*ix];
; q[ix] = t;
@ -319,21 +469,36 @@ for.end:
; DISABLED_MASKED_STRIDED: for.end:
; DISABLED_MASKED_STRIDED-NEXT: ret void
; ENABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize_unknown_tc(
; ENABLED_MASKED_STRIDED-NEXT: entry:
; ENABLED_MASKED_STRIDED-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
; ENABLED_MASKED_STRIDED-NEXT: br i1 [[CMP6]], label [[VECTOR_PH:%.*]], label [[FOR_END:%.*]]
; ENABLED_MASKED_STRIDED: vector.ph:
; ENABLED_MASKED_STRIDED-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7
; ENABLED_MASKED_STRIDED-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
; ENABLED_MASKED_STRIDED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> undef, <8 x i32> zeroinitializer
; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]]
; ENABLED_MASKED_STRIDED: vector.body:
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32
; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}}
; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0
; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; ENABLED_MASKED_STRIDED: pred.load.if:
; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP0]], i32 0
; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP3]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = load i8, i8* [[TMP4]], align 1
; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = insertelement <8 x i8> undef, i8 [[TMP5]], i32 0
; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE]]
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[INDEX]], i32 0
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
; ENABLED_MASKED_STRIDED-NEXT: [[INDUCTION:%.*]] = add <8 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw i32 [[INDEX]], 1
; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP0]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = icmp ule <8 x i32> [[INDUCTION]], [[BROADCAST_SPLAT2]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP1]] to <16 x i8>*
; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>
; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[TMP4]], <16 x i8> undef)
; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <8 x i8>*
; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP6]], i32 1, <8 x i1> [[TMP2]])
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP7]], label [[FOR_END]], label [[VECTOR_BODY]]
; ENABLED_MASKED_STRIDED-NOT: for.body:
; ENABLED_MASKED_STRIDED: for.end:
; ENABLED_MASKED_STRIDED-NEXT: ret void