[ValueTracking] Add DemandedElts support to computeKnownBits/ComputeNumSignBits (PR36319)

This patch adds initial support for a DemandedElts mask to the internal computeKnownBits/ComputeNumSignBits methods, matching the SelectionDAG and GlobalISel equivalents.

So far only a couple of instructions have been set up to handle the DemandedElts mask; the remainder still use the existing 'all elements' default. The plan is to extend support as we add test coverage.
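
For illustration, here is a minimal sketch of how the new DemandedElts overloads are meant to be driven, demanding only lanes 0 and 2 of a <4 x i32> value. This is a sketch, not code from the patch: the helper exampleQuery and the lane choice are hypothetical, and Q stands for the ValueTracking-internal Query object used throughout the file.

#include "llvm/ADT/APInt.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/KnownBits.h"

// Hypothetical caller: ask only about lanes 0 and 2 of a <4 x i32> value V.
static void exampleQuery(const llvm::Value *V, const Query &Q) {
  llvm::APInt DemandedElts = llvm::APInt::getNullValue(4); // 4 lanes, none demanded yet
  DemandedElts.setBit(0);
  DemandedElts.setBit(2);
  llvm::KnownBits Known(32); // element bit width
  computeKnownBits(V, DemandedElts, Known, /*Depth=*/0, Q);
  // Known now holds only the bits common to the demanded lanes; lanes 1
  // and 3 place no constraint on the result.
}

Scalar values, and callers that don't care about individual lanes, keep going through the default wrappers added below, which pass an all-ones mask (or APInt(1, 1) for scalars).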

Differential Revision: https://reviews.llvm.org/D73435
Simon Pilgrim 2020-02-01 12:45:46 +00:00
parent 1acf129bcf
commit 105e5c940c
5 changed files with 193 additions and 60 deletions

@@ -163,8 +163,43 @@ static const Instruction *safeCxtI(const Value *V, const Instruction *CxtI) {
return nullptr;
}
static void computeKnownBits(const Value *V, KnownBits &Known,
unsigned Depth, const Query &Q);
static bool getShuffleDemandedElts(const ShuffleVectorInst *Shuf,
const APInt &DemandedElts,
APInt &DemandedLHS, APInt &DemandedRHS) {
int NumElts = Shuf->getOperand(0)->getType()->getVectorNumElements();
int NumMaskElts = Shuf->getMask()->getType()->getVectorNumElements();
DemandedLHS = DemandedRHS = APInt::getNullValue(NumElts);
for (int i = 0; i != NumMaskElts; ++i) {
if (!DemandedElts[i])
continue;
int M = Shuf->getMaskValue(i);
assert(M < (NumElts * 2) && "Invalid shuffle mask constant");
// For undef elements, we don't know anything about the common state of
// the shuffle result.
if (M == -1)
return false;
if (M < NumElts)
DemandedLHS.setBit(M % NumElts);
else
DemandedRHS.setBit(M % NumElts);
}
return true;
}
static void computeKnownBits(const Value *V, const APInt &DemandedElts,
KnownBits &Known, unsigned Depth, const Query &Q);
static void computeKnownBits(const Value *V, KnownBits &Known, unsigned Depth,
const Query &Q) {
Type *Ty = V->getType();
APInt DemandedElts = Ty->isVectorTy()
? APInt::getAllOnesValue(Ty->getVectorNumElements())
: APInt(1, 1);
computeKnownBits(V, DemandedElts, Known, Depth, Q);
}
void llvm::computeKnownBits(const Value *V, KnownBits &Known,
const DataLayout &DL, unsigned Depth,
@@ -295,8 +330,17 @@ bool llvm::MaskedValueIsZero(const Value *V, const APInt &Mask,
V, Mask, Depth, Query(DL, AC, safeCxtI(V, CxtI), DT, UseInstrInfo));
}
static unsigned ComputeNumSignBits(const Value *V, const APInt &DemandedElts,
unsigned Depth, const Query &Q);
static unsigned ComputeNumSignBits(const Value *V, unsigned Depth,
const Query &Q);
const Query &Q) {
Type *Ty = V->getType();
APInt DemandedElts = Ty->isVectorTy()
? APInt::getAllOnesValue(Ty->getVectorNumElements())
: APInt(1, 1);
return ComputeNumSignBits(V, DemandedElts, Depth, Q);
}
unsigned llvm::ComputeNumSignBits(const Value *V, const DataLayout &DL,
unsigned Depth, AssumptionCache *AC,
@@ -1039,8 +1083,10 @@ static void computeKnownBitsFromShiftOperator(
Known.setAllZero();
}
static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
unsigned Depth, const Query &Q) {
static void computeKnownBitsFromOperator(const Operator *I,
const APInt &DemandedElts,
KnownBits &Known, unsigned Depth,
const Query &Q) {
unsigned BitWidth = Known.getBitWidth();
KnownBits Known2(Known);
@@ -1654,6 +1700,63 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
}
}
break;
case Instruction::ShuffleVector: {
auto *Shuf = cast<ShuffleVectorInst>(I);
// For undef elements, we don't know anything about the common state of
// the shuffle result.
APInt DemandedLHS, DemandedRHS;
if (!getShuffleDemandedElts(Shuf, DemandedElts, DemandedLHS, DemandedRHS)) {
Known.resetAll();
return;
}
Known.One.setAllBits();
Known.Zero.setAllBits();
if (!!DemandedLHS) {
const Value *LHS = Shuf->getOperand(0);
computeKnownBits(LHS, DemandedLHS, Known, Depth + 1, Q);
// If we don't know any bits, early out.
if (Known.isUnknown())
break;
}
if (!!DemandedRHS) {
const Value *RHS = Shuf->getOperand(1);
computeKnownBits(RHS, DemandedRHS, Known2, Depth + 1, Q);
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
}
break;
}
case Instruction::InsertElement: {
auto *IEI = cast<InsertElementInst>(I);
Value *Vec = IEI->getOperand(0);
Value *Elt = IEI->getOperand(1);
auto *CIdx = dyn_cast<ConstantInt>(IEI->getOperand(2));
// Early out if the index is non-constant or out-of-range.
unsigned NumElts = DemandedElts.getBitWidth();
if (!CIdx || CIdx->getValue().uge(NumElts)) {
Known.resetAll();
return;
}
Known.One.setAllBits();
Known.Zero.setAllBits();
unsigned EltIdx = CIdx->getZExtValue();
// Do we demand the inserted element?
if (DemandedElts[EltIdx]) {
computeKnownBits(Elt, Known, Depth + 1, Q);
// If we don't know any bits, early out.
if (Known.isUnknown())
break;
}
// We don't need the base vector element that has been inserted.
APInt DemandedVecElts = DemandedElts;
DemandedVecElts.clearBit(EltIdx);
if (!!DemandedVecElts) {
computeKnownBits(Vec, DemandedVecElts, Known2, Depth + 1, Q);
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
}
break;
}
case Instruction::ExtractElement:
// Look through extract element. At the moment we keep this simple and skip
// tracking the specific element. But at least we might find information
@@ -1688,6 +1791,7 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
}
}
}
break;
}
}
@@ -1713,24 +1817,34 @@ KnownBits computeKnownBits(const Value *V, unsigned Depth, const Query &Q) {
/// type, and vectors of integers. In the case
/// where V is a vector, known zero, and known one values are the
/// same width as the vector element, and the bit is set only if it is true
/// for all of the elements in the vector.
void computeKnownBits(const Value *V, KnownBits &Known, unsigned Depth,
const Query &Q) {
/// for all of the demanded elements in the vector specified by DemandedElts.
void computeKnownBits(const Value *V, const APInt &DemandedElts,
KnownBits &Known, unsigned Depth, const Query &Q) {
assert(V && "No Value?");
assert(Depth <= MaxDepth && "Limit Search Depth");
unsigned BitWidth = Known.getBitWidth();
assert((V->getType()->isIntOrIntVectorTy(BitWidth) ||
V->getType()->isPtrOrPtrVectorTy()) &&
Type *Ty = V->getType();
assert((Ty->isIntOrIntVectorTy(BitWidth) || Ty->isPtrOrPtrVectorTy()) &&
"Not integer or pointer type!");
assert(((Ty->isVectorTy() &&
Ty->getVectorNumElements() == DemandedElts.getBitWidth()) ||
(!Ty->isVectorTy() && DemandedElts == APInt(1, 1))) &&
"Unexpected vector size");
Type *ScalarTy = V->getType()->getScalarType();
Type *ScalarTy = Ty->getScalarType();
unsigned ExpectedWidth = ScalarTy->isPointerTy() ?
Q.DL.getPointerTypeSizeInBits(ScalarTy) : Q.DL.getTypeSizeInBits(ScalarTy);
assert(ExpectedWidth == BitWidth && "V and Known should have same BitWidth");
(void)BitWidth;
(void)ExpectedWidth;
if (!DemandedElts) {
// No demanded elts, better to assume we don't know anything.
Known.resetAll();
return;
}
const APInt *C;
if (match(V, m_APInt(C))) {
// We know all of the bits for a scalar constant or a splat vector constant!
@@ -1746,10 +1860,15 @@ void computeKnownBits(const Value *V, KnownBits &Known, unsigned Depth,
// Handle a constant vector by taking the intersection of the known bits of
// each element.
if (const ConstantDataSequential *CDS = dyn_cast<ConstantDataSequential>(V)) {
assert((!Ty->isVectorTy() ||
CDS->getNumElements() == DemandedElts.getBitWidth()) &&
"Unexpected vector size");
// We know that CDS must be a vector of integers. Take the intersection of
// each element.
Known.Zero.setAllBits(); Known.One.setAllBits();
for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i) {
if (Ty->isVectorTy() && !DemandedElts[i])
continue;
APInt Elt = CDS->getElementAsAPInt(i);
Known.Zero &= ~Elt;
Known.One &= Elt;
@@ -1758,10 +1877,14 @@ void computeKnownBits(const Value *V, KnownBits &Known, unsigned Depth,
}
if (const auto *CV = dyn_cast<ConstantVector>(V)) {
assert(CV->getNumOperands() == DemandedElts.getBitWidth() &&
"Unexpected vector size");
// We know that CV must be a vector of integers. Take the intersection of
// each element.
Known.Zero.setAllBits(); Known.One.setAllBits();
for (unsigned i = 0, e = CV->getNumOperands(); i != e; ++i) {
if (!DemandedElts[i])
continue;
Constant *Element = CV->getAggregateElement(i);
auto *ElementCI = dyn_cast_or_null<ConstantInt>(Element);
if (!ElementCI) {
@@ -1800,10 +1923,10 @@ void computeKnownBits(const Value *V, KnownBits &Known, unsigned Depth,
}
if (const Operator *I = dyn_cast<Operator>(V))
computeKnownBitsFromOperator(I, Known, Depth, Q);
computeKnownBitsFromOperator(I, DemandedElts, Known, Depth, Q);
// Aligned pointers have trailing zeros - refine Known.Zero set
if (V->getType()->isPointerTy()) {
if (Ty->isPointerTy()) {
const MaybeAlign Align = V->getPointerAlignment(Q.DL);
if (Align)
Known.Zero.setLowBits(countTrailingZeros(Align->value()));
@@ -2429,6 +2552,7 @@ static bool isSignedMinMaxClamp(const Value *Select, const Value *&In,
/// or if any element was not analyzed; otherwise, return the count for the
/// element with the minimum number of sign bits.
static unsigned computeNumSignBitsVectorConstant(const Value *V,
const APInt &DemandedElts,
unsigned TyBits) {
const auto *CV = dyn_cast<Constant>(V);
if (!CV || !CV->getType()->isVectorTy())
@@ -2437,6 +2561,8 @@ static unsigned computeNumSignBitsVectorConstant(const Value *V,
unsigned MinSignBits = TyBits;
unsigned NumElts = CV->getType()->getVectorNumElements();
for (unsigned i = 0; i != NumElts; ++i) {
if (!DemandedElts[i])
continue;
// If we find a non-ConstantInt, bail out.
auto *Elt = dyn_cast_or_null<ConstantInt>(CV->getAggregateElement(i));
if (!Elt)
@@ -2448,12 +2574,22 @@ static unsigned computeNumSignBitsVectorConstant(const Value *V,
return MinSignBits;
}
static unsigned ComputeNumSignBitsImpl(const Value *V, unsigned Depth,
const Query &Q);
static unsigned ComputeNumSignBitsImpl(const Value *V,
const APInt &DemandedElts,
unsigned Depth, const Query &Q);
static unsigned ComputeNumSignBits(const Value *V, unsigned Depth,
const Query &Q) {
unsigned Result = ComputeNumSignBitsImpl(V, Depth, Q);
static unsigned ComputeNumSignBitsImpl(const Value *V, unsigned Depth,
const Query &Q) {
Type *Ty = V->getType();
APInt DemandedElts = Ty->isVectorTy()
? APInt::getAllOnesValue(Ty->getVectorNumElements())
: APInt(1, 1);
return ComputeNumSignBitsImpl(V, DemandedElts, Depth, Q);
}
static unsigned ComputeNumSignBits(const Value *V, const APInt &DemandedElts,
unsigned Depth, const Query &Q) {
unsigned Result = ComputeNumSignBitsImpl(V, DemandedElts, Depth, Q);
assert(Result > 0 && "At least one sign bit needs to be present!");
return Result;
}
@@ -2463,16 +2599,24 @@ static unsigned ComputeNumSignBits(const Value *V, unsigned Depth,
/// (itself), but other cases can give us information. For example, immediately
/// after an "ashr X, 2", we know that the top 3 bits are all equal to each
/// other, so we return 3. For vectors, return the number of sign bits for the
/// vector element with the minimum number of known sign bits.
static unsigned ComputeNumSignBitsImpl(const Value *V, unsigned Depth,
const Query &Q) {
/// vector element with the minimum number of known sign bits of the demanded
/// elements in the vector specified by DemandedElts.
static unsigned ComputeNumSignBitsImpl(const Value *V,
const APInt &DemandedElts,
unsigned Depth, const Query &Q) {
assert(Depth <= MaxDepth && "Limit Search Depth");
// We return the minimum number of sign bits that are guaranteed to be present
// in V, so for undef we have to conservatively return 1. We don't have the
// same behavior for poison though -- that's a FIXME today.
Type *ScalarTy = V->getType()->getScalarType();
Type *Ty = V->getType();
assert(((Ty->isVectorTy() &&
Ty->getVectorNumElements() == DemandedElts.getBitWidth()) ||
(!Ty->isVectorTy() && DemandedElts == APInt(1, 1))) &&
"Unexpected vector size");
Type *ScalarTy = Ty->getScalarType();
unsigned TyBits = ScalarTy->isPointerTy() ?
Q.DL.getPointerTypeSizeInBits(ScalarTy) :
Q.DL.getTypeSizeInBits(ScalarTy);
@@ -2698,40 +2842,33 @@ static unsigned ComputeNumSignBitsImpl(const Value *V, unsigned Depth,
return ComputeNumSignBits(U->getOperand(0), Depth + 1, Q);
case Instruction::ShuffleVector: {
// TODO: This is copied almost directly from the SelectionDAG version of
// ComputeNumSignBits. It would be better if we could share common
// code. If not, make sure that changes are translated to the DAG.
// Collect the minimum number of sign bits that are shared by every vector
// element referenced by the shuffle.
auto *Shuf = cast<ShuffleVectorInst>(U);
int NumElts = Shuf->getOperand(0)->getType()->getVectorNumElements();
int NumMaskElts = Shuf->getMask()->getType()->getVectorNumElements();
APInt DemandedLHS(NumElts, 0), DemandedRHS(NumElts, 0);
for (int i = 0; i != NumMaskElts; ++i) {
int M = Shuf->getMaskValue(i);
assert(M < NumElts * 2 && "Invalid shuffle mask constant");
// For undef elements, we don't know anything about the common state of
// the shuffle result.
if (M == -1)
return 1;
if (M < NumElts)
DemandedLHS.setBit(M % NumElts);
else
DemandedRHS.setBit(M % NumElts);
}
APInt DemandedLHS, DemandedRHS;
// For undef elements, we don't know anything about the common state of
// the shuffle result.
if (!getShuffleDemandedElts(Shuf, DemandedElts, DemandedLHS, DemandedRHS))
return 1;
Tmp = std::numeric_limits<unsigned>::max();
if (!!DemandedLHS)
Tmp = ComputeNumSignBits(Shuf->getOperand(0), Depth + 1, Q);
if (!!DemandedLHS) {
const Value *LHS = Shuf->getOperand(0);
Tmp = ComputeNumSignBits(LHS, DemandedLHS, Depth + 1, Q);
}
// If we don't know anything, early out and try computeKnownBits
// fall-back.
if (Tmp == 1)
break;
if (!!DemandedRHS) {
Tmp2 = ComputeNumSignBits(Shuf->getOperand(1), Depth + 1, Q);
const Value *RHS = Shuf->getOperand(1);
Tmp2 = ComputeNumSignBits(RHS, DemandedRHS, Depth + 1, Q);
Tmp = std::min(Tmp, Tmp2);
}
// If we don't know anything, early out and try computeKnownBits
// fall-back.
if (Tmp == 1)
break;
assert(Tmp <= V->getType()->getScalarSizeInBits() &&
assert(Tmp <= Ty->getScalarSizeInBits() &&
"Failed to determine minimum sign bits");
return Tmp;
}
@@ -2743,11 +2880,12 @@ static unsigned ComputeNumSignBitsImpl(const Value *V, unsigned Depth,
// If we can examine all elements of a vector constant successfully, we're
// done (we can't do any better than that). If not, keep trying.
if (unsigned VecSignBits = computeNumSignBitsVectorConstant(V, TyBits))
if (unsigned VecSignBits =
computeNumSignBitsVectorConstant(V, DemandedElts, TyBits))
return VecSignBits;
KnownBits Known(TyBits);
computeKnownBits(V, Known, Depth, Q);
computeKnownBits(V, DemandedElts, Known, Depth, Q);
// If we know that the sign bit is either zero or one, determine the number of
// identical bits in the top of the input value.

@@ -99,13 +99,11 @@ define i8 @nopreserve4(i8 %A, i8 %B) {
ret i8 %add
}
; TODO: computeKnownBits() should look through a shufflevector.
define <3 x i32> @shl_nuw_nsw_shuffle_splat_vec(<2 x i8> %x) {
; CHECK-LABEL: @shl_nuw_nsw_shuffle_splat_vec(
; CHECK-NEXT: [[T2:%.*]] = zext <2 x i8> [[X:%.*]] to <2 x i32>
; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <2 x i32> [[T2]], <2 x i32> undef, <3 x i32> <i32 1, i32 0, i32 1>
; CHECK-NEXT: [[T3:%.*]] = shl nsw <3 x i32> [[SHUF]], <i32 17, i32 17, i32 17>
; CHECK-NEXT: [[T3:%.*]] = shl nuw nsw <3 x i32> [[SHUF]], <i32 17, i32 17, i32 17>
; CHECK-NEXT: ret <3 x i32> [[T3]]
;
%t2 = zext <2 x i8> %x to <2 x i32>

@@ -78,8 +78,7 @@ define <4 x i32> @shl_C1_add_A_C2_v4i32_splat(i16 %I) {
; CHECK-NEXT: [[A:%.*]] = zext i16 [[I:%.*]] to i32
; CHECK-NEXT: [[B:%.*]] = insertelement <4 x i32> undef, i32 [[A]], i32 0
; CHECK-NEXT: [[C:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: [[D:%.*]] = add <4 x i32> [[C]], <i32 0, i32 1, i32 50, i32 16>
; CHECK-NEXT: [[E:%.*]] = shl <4 x i32> <i32 6, i32 2, i32 1, i32 -7>, [[D]]
; CHECK-NEXT: [[E:%.*]] = shl <4 x i32> <i32 6, i32 4, i32 undef, i32 -458752>, [[C]]
; CHECK-NEXT: ret <4 x i32> [[E]]
;
%A = zext i16 %I to i32
@@ -95,8 +94,7 @@ define <4 x i32> @ashr_C1_add_A_C2_v4i32_splat(i16 %I) {
; CHECK-NEXT: [[A:%.*]] = zext i16 [[I:%.*]] to i32
; CHECK-NEXT: [[B:%.*]] = insertelement <4 x i32> undef, i32 [[A]], i32 0
; CHECK-NEXT: [[C:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: [[D:%.*]] = add <4 x i32> [[C]], <i32 0, i32 1, i32 50, i32 16>
; CHECK-NEXT: [[E:%.*]] = ashr <4 x i32> <i32 6, i32 2, i32 1, i32 -7>, [[D]]
; CHECK-NEXT: [[E:%.*]] = ashr <4 x i32> <i32 6, i32 1, i32 undef, i32 -1>, [[C]]
; CHECK-NEXT: ret <4 x i32> [[E]]
;
%A = zext i16 %I to i32
@@ -112,8 +110,7 @@ define <4 x i32> @lshr_C1_add_A_C2_v4i32_splat(i16 %I) {
; CHECK-NEXT: [[A:%.*]] = zext i16 [[I:%.*]] to i32
; CHECK-NEXT: [[B:%.*]] = insertelement <4 x i32> undef, i32 [[A]], i32 0
; CHECK-NEXT: [[C:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: [[D:%.*]] = add <4 x i32> [[C]], <i32 0, i32 1, i32 50, i32 16>
; CHECK-NEXT: [[E:%.*]] = lshr <4 x i32> <i32 6, i32 2, i32 1, i32 -7>, [[D]]
; CHECK-NEXT: [[E:%.*]] = lshr <4 x i32> <i32 6, i32 1, i32 undef, i32 65535>, [[C]]
; CHECK-NEXT: ret <4 x i32> [[E]]
;
%A = zext i16 %I to i32

@@ -87,7 +87,7 @@ define void @example2(i32 %n, i32 %x) optsize {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE8:%.*]] ]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
; CHECK-NEXT: [[INDUCTION:%.*]] = or <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP7:%.*]] = or i64 [[INDEX]], 3
@@ -275,7 +275,7 @@ define void @example23c(i16* noalias nocapture %src, i32* noalias nocapture %dst
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE22:%.*]] ]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
; CHECK-NEXT: [[INDUCTION:%.*]] = or <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i64> [[INDUCTION]], <i64 257, i64 257, i64 257, i64 257>
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0
; CHECK-NEXT: br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]

@@ -484,7 +484,7 @@ for.end:
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[INDEX]], i32 0
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
; ENABLED_MASKED_STRIDED-NEXT: [[INDUCTION:%.*]] = add <8 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; ENABLED_MASKED_STRIDED-NEXT: [[INDUCTION:%.*]] = or <8 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw i32 [[INDEX]], 1
; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP0]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = icmp ule <8 x i32> [[INDUCTION]], [[BROADCAST_SPLAT2]]
@@ -766,7 +766,7 @@ for.end:
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[INDEX]], i32 0
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
; ENABLED_MASKED_STRIDED-NEXT: [[INDUCTION:%.*]] = add <8 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; ENABLED_MASKED_STRIDED-NEXT: [[INDUCTION:%.*]] = or <8 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw i32 [[INDEX]], 1
; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP0]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = icmp ule <8 x i32> {{.*}}, {{.*}}