[ARM][LV] Add a preferPredicatedReductionSelect target hook

As part of D84741, this adds a target hook for the
preferPredicatedReductionSelect option and makes use
of it under MVE, allowing us to tail predicate most
reduction loops.

Differential Revision: https://reviews.llvm.org/D85980
David Green 2020-08-21 08:48:12 +01:00
parent 91039784b3
commit 2b69efded0
9 changed files with 356 additions and 354 deletions


@@ -1283,6 +1283,20 @@ public:
bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
ReductionFlags Flags) const;
/// \returns True if the target prefers reduction selects to be kept in the
/// loop when tail folding, i.e.
/// loop:
///   p = phi (0, s)
///   a = add (p, x)
///   s = select (mask, a, p)
/// vecreduce.add(s)
///
/// As opposed to the normal scheme of p = phi (0, a), which allows the select
/// to be pulled out of the loop. If the select(.., add, ..) can be predicated
/// by the target, this can lead to cleaner code generation.
bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty,
ReductionFlags Flags) const;
/// \returns True if the target wants to expand the given reduction intrinsic
/// into a shuffle sequence.
bool shouldExpandReduction(const IntrinsicInst *II) const;
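To make the two schemes concrete, here is a minimal hand-written IR sketch (editorial, not taken from this patch; %x and %mask are illustrative names) of a tail-folded add reduction under each scheme:

; Default scheme: the phi takes the add directly and the select is sunk
; out of the loop into the middle block.
vector.body:
  %p = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %a, %vector.body ]
  %a = add <4 x i32> %p, %x
  ...
middle.block:
  %s = select <4 x i1> %mask, <4 x i32> %a, <4 x i32> %p
  %r = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s)

; Predicated scheme (hook returns true): the select stays in the loop and
; feeds the phi, so a target that can predicate select(mask, add, phi)
; folds it into a single predicated operation.
vector.body:
  %p = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %s, %vector.body ]
  %a = add <4 x i32> %p, %x
  %s = select <4 x i1> %mask, <4 x i32> %a, <4 x i32> %p
  ...
middle.block:
  %r = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s)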
@@ -1573,6 +1587,8 @@ public:
VectorType *VecTy) const = 0;
virtual bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
ReductionFlags) const = 0;
virtual bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty,
ReductionFlags) const = 0;
virtual bool shouldExpandReduction(const IntrinsicInst *II) const = 0;
virtual unsigned getGISelRematGlobalCost() const = 0;
virtual bool hasActiveVectorLength() const = 0;
@@ -2073,6 +2089,10 @@ public:
ReductionFlags Flags) const override {
return Impl.useReductionIntrinsic(Opcode, Ty, Flags);
}
bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty,
ReductionFlags Flags) const override {
return Impl.preferPredicatedReductionSelect(Opcode, Ty, Flags);
}
bool shouldExpandReduction(const IntrinsicInst *II) const override {
return Impl.shouldExpandReduction(II);
}


@@ -658,6 +658,11 @@ public:
return false;
}
bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const {
return false;
}
bool shouldExpandReduction(const IntrinsicInst *II) const { return true; }
unsigned getGISelRematGlobalCost() const { return 1; }


@@ -1013,6 +1013,11 @@ bool TargetTransformInfo::useReductionIntrinsic(unsigned Opcode, Type *Ty,
return TTIImpl->useReductionIntrinsic(Opcode, Ty, Flags);
}
bool TargetTransformInfo::preferPredicatedReductionSelect(
unsigned Opcode, Type *Ty, ReductionFlags Flags) const {
return TTIImpl->preferPredicatedReductionSelect(Opcode, Ty, Flags);
}
bool TargetTransformInfo::shouldExpandReduction(const IntrinsicInst *II) const {
return TTIImpl->shouldExpandReduction(II);
}


@@ -1589,35 +1589,28 @@ static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
const LoopAccessInfo *LAI) {
LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
// If there are live-out values, it is probably a reduction, which needs a
// final reduction step after the loop. MVE has a VADDV instruction to reduce
// integer vectors, but doesn't have an equivalent one for float vectors. A
// live-out value that is not recognised as a reduction will result in the
// tail-predicated loop to be reverted to a non-predicated loop and this is
// very expensive, i.e. it has a significant performance impact. So, in this
// case it's better not to tail-predicate the loop, which is what we check
// here. Thus, we allow only 1 live-out value, which has to be an integer
// reduction, which matches the loops supported by ARMLowOverheadLoops.
// It is important to keep ARMLowOverheadLoops and canTailPredicateLoop in
// sync with each other.
// If there are live-out values, it is probably a reduction. We can predicate
// most reduction operations freely under MVE using a combination of
// prefer-predicated-reduction-select and inloop reductions. We limit this to
// floating point and integer reductions, but don't check for operators
// specifically here. If the value ends up not being a reduction (and so the
// vectorizer cannot tail-fold the loop), we should fall back to standard
// vectorization automatically.
SmallVector< Instruction *, 8 > LiveOuts;
LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
bool IntReductionsDisabled =
bool ReductionsDisabled =
EnableTailPredication == TailPredication::EnabledNoReductions ||
EnableTailPredication == TailPredication::ForceEnabledNoReductions;
for (auto *I : LiveOuts) {
if (!I->getType()->isIntegerTy()) {
LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer "
if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
!I->getType()->isHalfTy()) {
LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
"live-out value\n");
return false;
}
if (I->getOpcode() != Instruction::Add) {
LLVM_DEBUG(dbgs() << "Only add reductions supported\n");
return false;
}
if (IntReductionsDisabled) {
LLVM_DEBUG(dbgs() << "Integer add reductions not enabled\n");
if (ReductionsDisabled) {
LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
return false;
}
}
@@ -1811,3 +1804,10 @@ bool ARMTTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const {
return ST->hasMVEIntegerOps();
}
bool ARMTTIImpl::preferPredicatedReductionSelect(
unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const {
if (!ST->hasMVEIntegerOps())
return false;
return true;
}
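For MVE the rationale (an editorial note, not part of the patch) is that under tail predication the in-loop select is expected to be free; a sketch of the pattern the backend sees:

; The in-loop pair
;   %a = add <4 x i32> %p, %x
;   %s = select <4 x i1> %mask, <4 x i32> %a, <4 x i32> %p
; writes the new accumulator only in the active lanes, which matches the
; semantics of a single predicated MVE add inside a VPT block, whereas the
; out-of-loop select form would need an extra lane-select after the loop.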


@@ -186,6 +186,9 @@ public:
bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const;
bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const;
bool shouldExpandReduction(const IntrinsicInst *II) const {
switch (II->getIntrinsicID()) {
case Intrinsic::experimental_vector_reduce_v2_fadd:


@@ -3929,7 +3929,10 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
// and so use the select value for the phi instead of the old
// LoopExitValue.
RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
if (PreferPredicatedReductionSelect) {
if (PreferPredicatedReductionSelect ||
TTI->preferPredicatedReductionSelect(
RdxDesc.getRecurrenceBinOp(RdxDesc.getRecurrenceKind()),
Phi->getType(), TargetTransformInfo::ReductionFlags())) {
auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part));
VecRdxPhi->setIncomingValueForBlock(
LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
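Schematically, the rewiring this performs on the vector loop (simplified from the updated tests below; %add, %sel and %mask are illustrative names):

; Before: the reduction phi takes the add; the select only appears in
; middle.block.
  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ]
; After: the reduction phi takes the in-loop select instead.
  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %sel, %vector.body ]
  ...
  %sel = select <4 x i1> %mask, <4 x i32> %add, <4 x i32> %vec.phi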


@@ -12,18 +12,18 @@ define i32 @reduction_sum_single(i32* noalias nocapture %A) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 256)
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
; CHECK-NEXT: [[TMP2]] = add <4 x i32> [[VEC_PHI]], [[WIDE_MASKED_LOAD]]
; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP3]] = add <4 x i32> [[VEC_PHI]], [[TMP2]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
; CHECK: middle.block:
; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP2]], <4 x i32> [[VEC_PHI]]
; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP3]])
; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[DOTLR_PH:%.*]]
@@ -60,7 +60,7 @@ define i32 @reduction_sum(i32* noalias nocapture %A, i32* noalias nocapture %B)
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 256)
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
@@ -70,14 +70,14 @@ define i32 @reduction_sum(i32* noalias nocapture %A, i32* noalias nocapture %B)
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[VEC_PHI]], [[VEC_IND]]
; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[WIDE_MASKED_LOAD]]
; CHECK-NEXT: [[TMP6]] = add <4 x i32> [[TMP5]], [[WIDE_MASKED_LOAD1]]
; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP5]], [[WIDE_MASKED_LOAD1]]
; CHECK-NEXT: [[TMP7]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP6]], <4 x i32> [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
; CHECK: middle.block:
; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP6]], <4 x i32> [[VEC_PHI]]
; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP8]])
; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP7]])
; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[DOTLR_PH:%.*]]
@@ -117,39 +117,29 @@ define i32 @reduction_prod(i32* noalias nocapture %A, i32* noalias nocapture %B)
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 1, i32 1, i32 1, i32 1>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 1, i32 1, i32 1, i32 1>, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 256)
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP5]] = mul <4 x i32> [[TMP4]], [[WIDE_LOAD1]]
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i32> [[VEC_PHI]], [[WIDE_MASKED_LOAD]]
; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[TMP4]], [[WIDE_MASKED_LOAD1]]
; CHECK-NEXT: [[TMP6]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP5]], <4 x i32> [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6
; CHECK: middle.block:
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[TMP5]])
; CHECK-NEXT: br i1 false, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[TMP6]])
; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
; CHECK-NEXT: br label [[DOTLR_PH:%.*]]
; CHECK: .lr.ph:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[DOTLR_PH]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[PROD_02:%.*]] = phi i32 [ [[L9:%.*]], [[DOTLR_PH]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[L2:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[INDVARS_IV]]
; CHECK-NEXT: [[L3:%.*]] = load i32, i32* [[L2]], align 4
; CHECK-NEXT: [[L4:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[INDVARS_IV]]
; CHECK-NEXT: [[L5:%.*]] = load i32, i32* [[L4]], align 4
; CHECK-NEXT: [[L8:%.*]] = mul i32 [[PROD_02]], [[L3]]
; CHECK-NEXT: [[L9]] = mul i32 [[L8]], [[L5]]
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], 257
; CHECK-NEXT: br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop !7
; CHECK-NEXT: br i1 undef, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop !7
; CHECK: ._crit_edge:
; CHECK-NEXT: [[PROD_0_LCSSA:%.*]] = phi i32 [ [[L9]], [[DOTLR_PH]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: [[PROD_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[PROD_0_LCSSA]]
;
entry:
@@ -181,39 +171,29 @@ define i32 @reduction_and(i32* nocapture %A, i32* nocapture %B) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 256)
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = and <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP5]] = and <4 x i32> [[TMP4]], [[WIDE_LOAD1]]
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
; CHECK-NEXT: [[TMP4:%.*]] = and <4 x i32> [[VEC_PHI]], [[WIDE_MASKED_LOAD]]
; CHECK-NEXT: [[TMP5:%.*]] = and <4 x i32> [[TMP4]], [[WIDE_MASKED_LOAD1]]
; CHECK-NEXT: [[TMP6]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP5]], <4 x i32> [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8
; CHECK: middle.block:
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> [[TMP5]])
; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> [[TMP6]])
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ -1, [[ENTRY]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[RESULT_08:%.*]] = phi i32 [ [[AND:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[INDVARS_IV]]
; CHECK-NEXT: [[L0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[INDVARS_IV]]
; CHECK-NEXT: [[L1:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
; CHECK-NEXT: [[ADD:%.*]] = and i32 [[RESULT_08]], [[L0]]
; CHECK-NEXT: [[AND]] = and i32 [[ADD]], [[L1]]
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], 257
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !9
; CHECK-NEXT: br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !9
; CHECK: for.end:
; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ [[AND]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]]
;
entry:
@@ -245,39 +225,29 @@ define i32 @reduction_or(i32* nocapture %A, i32* nocapture %B) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 256)
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP5]] = or <4 x i32> [[TMP4]], [[VEC_PHI]]
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]]
; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP4]], <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP6]] = or <4 x i32> [[VEC_PHI]], [[TMP5]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !10
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !10
; CHECK: middle.block:
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> [[TMP5]])
; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> [[TMP6]])
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[RESULT_08:%.*]] = phi i32 [ [[OR:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[INDVARS_IV]]
; CHECK-NEXT: [[L0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[INDVARS_IV]]
; CHECK-NEXT: [[L1:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[L1]], [[L0]]
; CHECK-NEXT: [[OR]] = or i32 [[ADD]], [[RESULT_08]]
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], 257
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !11
; CHECK-NEXT: br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !11
; CHECK: for.end:
; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ [[OR]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]]
;
entry:
@@ -309,39 +279,29 @@ define i32 @reduction_xor(i32* nocapture %A, i32* nocapture %B) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 256)
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP5]] = xor <4 x i32> [[TMP4]], [[VEC_PHI]]
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]]
; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP4]], <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP6]] = xor <4 x i32> [[VEC_PHI]], [[TMP5]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !12
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !12
; CHECK: middle.block:
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> [[TMP5]])
; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> [[TMP6]])
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[RESULT_08:%.*]] = phi i32 [ [[XOR:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[INDVARS_IV]]
; CHECK-NEXT: [[L0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[INDVARS_IV]]
; CHECK-NEXT: [[L1:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[L1]], [[L0]]
; CHECK-NEXT: [[XOR]] = xor i32 [[ADD]], [[RESULT_08]]
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], 257
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !13
; CHECK-NEXT: br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !13
; CHECK: for.end:
; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ [[XOR]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]]
;
entry:
@@ -373,39 +333,29 @@ define float @reduction_fadd(float* nocapture %A, float* nocapture %B) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 256)
; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[TMP0]] to <4 x float>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> undef)
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP5]] = fadd fast <4 x float> [[TMP4]], [[WIDE_LOAD1]]
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> undef)
; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[WIDE_MASKED_LOAD]]
; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <4 x float> [[TMP4]], [[WIDE_MASKED_LOAD1]]
; CHECK-NEXT: [[TMP6]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> [[TMP5]], <4 x float> [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !14
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !14
; CHECK: middle.block:
; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP5]])
; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP6]])
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[RESULT_08:%.*]] = phi float [ [[FADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i32 [[INDVARS_IV]]
; CHECK-NEXT: [[L0:%.*]] = load float, float* [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i32 [[INDVARS_IV]]
; CHECK-NEXT: [[L1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[RESULT_08]], [[L0]]
; CHECK-NEXT: [[FADD]] = fadd fast float [[ADD]], [[L1]]
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], 257
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !15
; CHECK-NEXT: br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !15
; CHECK: for.end:
; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi float [ [[FADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi float [ undef, [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret float [[RESULT_0_LCSSA]]
;
entry:
@@ -437,39 +387,29 @@ define float @reduction_fmul(float* nocapture %A, float* nocapture %B) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ <float 0.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ <float 0.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 256)
; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[TMP0]] to <4 x float>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> undef)
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP5]] = fmul fast <4 x float> [[TMP4]], [[WIDE_LOAD1]]
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> undef)
; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <4 x float> [[VEC_PHI]], [[WIDE_MASKED_LOAD]]
; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <4 x float> [[TMP4]], [[WIDE_MASKED_LOAD1]]
; CHECK-NEXT: [[TMP6]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> [[TMP5]], <4 x float> [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !16
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !16
; CHECK: middle.block:
; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> [[TMP5]])
; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> [[TMP6]])
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[RESULT_08:%.*]] = phi float [ [[FMUL:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i32 [[INDVARS_IV]]
; CHECK-NEXT: [[L0:%.*]] = load float, float* [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i32 [[INDVARS_IV]]
; CHECK-NEXT: [[L1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
; CHECK-NEXT: [[ADD:%.*]] = fmul fast float [[RESULT_08]], [[L0]]
; CHECK-NEXT: [[FMUL]] = fmul fast float [[ADD]], [[L1]]
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], 257
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !17
; CHECK-NEXT: br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !17
; CHECK: for.end:
; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi float [ [[FMUL]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi float [ undef, [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret float [[RESULT_0_LCSSA]]
;
entry:


@@ -34,3 +34,212 @@ for.body: ; preds = %for.body.preheader,
%exitcond.not = icmp eq i32 %inc, %N
br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
}
define dso_local void @f32_reduction(float* nocapture readonly %Input, i32 %N, float* nocapture %Output) {
; CHECK-LABEL: f32_reduction(
; CHECK: vector.body:
; CHECK: @llvm.masked.load
; CHECK: br i1 %{{.*}}, label {{.*}}, label %vector.body
entry:
%cmp6 = icmp eq i32 %N, 0
br i1 %cmp6, label %while.end, label %while.body.preheader
while.body.preheader: ; preds = %entry
br label %while.body
while.body: ; preds = %while.body.preheader, %while.body
%blkCnt.09 = phi i32 [ %dec, %while.body ], [ %N, %while.body.preheader ]
%sum.08 = phi float [ %add, %while.body ], [ 0.000000e+00, %while.body.preheader ]
%Input.addr.07 = phi float* [ %incdec.ptr, %while.body ], [ %Input, %while.body.preheader ]
%incdec.ptr = getelementptr inbounds float, float* %Input.addr.07, i32 1
%0 = load float, float* %Input.addr.07, align 4
%add = fadd fast float %0, %sum.08
%dec = add i32 %blkCnt.09, -1
%cmp = icmp eq i32 %dec, 0
br i1 %cmp, label %while.end.loopexit, label %while.body
while.end.loopexit: ; preds = %while.body
%add.lcssa = phi float [ %add, %while.body ]
br label %while.end
while.end: ; preds = %while.end.loopexit, %entry
%sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add.lcssa, %while.end.loopexit ]
%conv = uitofp i32 %N to float
%div = fdiv fast float %sum.0.lcssa, %conv
store float %div, float* %Output, align 4
ret void
}
define dso_local void @f16_reduction(half* nocapture readonly %Input, i32 %N, half* nocapture %Output) {
; CHECK-LABEL: f16_reduction(
; CHECK: vector.body:
; CHECK: @llvm.masked.load
; CHECK: br i1 %{{.*}}, label {{.*}}, label %vector.body
entry:
%cmp6 = icmp eq i32 %N, 0
br i1 %cmp6, label %while.end, label %while.body.preheader
while.body.preheader: ; preds = %entry
br label %while.body
while.body: ; preds = %while.body.preheader, %while.body
%blkCnt.09 = phi i32 [ %dec, %while.body ], [ %N, %while.body.preheader ]
%sum.08 = phi half [ %add, %while.body ], [ 0.000000e+00, %while.body.preheader ]
%Input.addr.07 = phi half* [ %incdec.ptr, %while.body ], [ %Input, %while.body.preheader ]
%incdec.ptr = getelementptr inbounds half, half* %Input.addr.07, i32 1
%0 = load half, half* %Input.addr.07, align 2
%add = fadd fast half %0, %sum.08
%dec = add i32 %blkCnt.09, -1
%cmp = icmp eq i32 %dec, 0
br i1 %cmp, label %while.end.loopexit, label %while.body
while.end.loopexit: ; preds = %while.body
%add.lcssa = phi half [ %add, %while.body ]
br label %while.end
while.end: ; preds = %while.end.loopexit, %entry
%sum.0.lcssa = phi half [ 0.000000e+00, %entry ], [ %add.lcssa, %while.end.loopexit ]
%conv = uitofp i32 %N to half
%div = fdiv fast half %sum.0.lcssa, %conv
store half %div, half* %Output, align 2
ret void
}
define dso_local void @mixed_f32_i32_reduction(float* nocapture readonly %fInput, i32* nocapture readonly %iInput, i32 %N, float* nocapture %fOutput, i32* nocapture %iOutput) {
; CHECK-LABEL: mixed_f32_i32_reduction(
; CHECK: vector.body:
; CHECK: @llvm.masked.load
; CHECK: br i1 %{{.*}}, label {{.*}}, label %vector.body
entry:
%cmp15 = icmp eq i32 %N, 0
br i1 %cmp15, label %while.end, label %while.body.preheader
while.body.preheader:
br label %while.body
while.body:
%blkCnt.020 = phi i32 [ %dec, %while.body ], [ %N, %while.body.preheader ]
%isum.019 = phi i32 [ %add2, %while.body ], [ 0, %while.body.preheader ]
%fsum.018 = phi float [ %add, %while.body ], [ 0.000000e+00, %while.body.preheader ]
%fInput.addr.017 = phi float* [ %incdec.ptr, %while.body ], [ %fInput, %while.body.preheader ]
%iInput.addr.016 = phi i32* [ %incdec.ptr1, %while.body ], [ %iInput, %while.body.preheader ]
%incdec.ptr = getelementptr inbounds float, float* %fInput.addr.017, i32 1
%incdec.ptr1 = getelementptr inbounds i32, i32* %iInput.addr.016, i32 1
%0 = load i32, i32* %iInput.addr.016, align 4
%add2 = add nsw i32 %0, %isum.019
%1 = load float, float* %fInput.addr.017, align 4
%add = fadd fast float %1, %fsum.018
%dec = add i32 %blkCnt.020, -1
%cmp = icmp eq i32 %dec, 0
br i1 %cmp, label %while.end.loopexit, label %while.body
while.end.loopexit:
%add.lcssa = phi float [ %add, %while.body ]
%add2.lcssa = phi i32 [ %add2, %while.body ]
%phitmp = sitofp i32 %add2.lcssa to float
br label %while.end
while.end:
%fsum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add.lcssa, %while.end.loopexit ]
%isum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %phitmp, %while.end.loopexit ]
%conv = uitofp i32 %N to float
%div = fdiv fast float %fsum.0.lcssa, %conv
store float %div, float* %fOutput, align 4
%div5 = fdiv fast float %isum.0.lcssa, %conv
%conv6 = fptosi float %div5 to i32
store i32 %conv6, i32* %iOutput, align 4
ret void
}
define dso_local i32 @i32_mul_reduction(i32* noalias nocapture readonly %B, i32 %N) {
; CHECK-LABEL: i32_mul_reduction(
; CHECK: vector.body:
; CHECK: @llvm.masked.load
; CHECK: br i1 %{{.*}}, label {{.*}}, label %vector.body
entry:
%cmp6 = icmp sgt i32 %N, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader:
br label %for.body
for.cond.cleanup.loopexit:
%mul.lcssa = phi i32 [ %mul, %for.body ]
br label %for.cond.cleanup
for.cond.cleanup:
%S.0.lcssa = phi i32 [ 1, %entry ], [ %mul.lcssa, %for.cond.cleanup.loopexit ]
ret i32 %S.0.lcssa
for.body:
%i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
%S.07 = phi i32 [ %mul, %for.body ], [ 1, %for.body.preheader ]
%arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.08
%0 = load i32, i32* %arrayidx, align 4
%mul = mul nsw i32 %0, %S.07
%inc = add nuw nsw i32 %i.08, 1
%exitcond = icmp eq i32 %inc, %N
br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
}
define dso_local i32 @i32_or_reduction(i32* noalias nocapture readonly %B, i32 %N) {
; CHECK-LABEL: i32_or_reduction(
; CHECK: vector.body:
; CHECK: @llvm.masked.load
; CHECK: br i1 %{{.*}}, label {{.*}}, label %vector.body
entry:
%cmp6 = icmp sgt i32 %N, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
br label %for.body
for.cond.cleanup.loopexit: ; preds = %for.body
%or.lcssa = phi i32 [ %or, %for.body ]
br label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
%S.0.lcssa = phi i32 [ 1, %entry ], [ %or.lcssa, %for.cond.cleanup.loopexit ]
ret i32 %S.0.lcssa
for.body: ; preds = %for.body.preheader, %for.body
%i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
%S.07 = phi i32 [ %or, %for.body ], [ 1, %for.body.preheader ]
%arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.08
%0 = load i32, i32* %arrayidx, align 4
%or = or i32 %0, %S.07
%inc = add nuw nsw i32 %i.08, 1
%exitcond = icmp eq i32 %inc, %N
br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
}
define dso_local i32 @i32_and_reduction(i32* noalias nocapture readonly %A, i32 %N, i32 %S) {
; CHECK-LABEL: i32_and_reduction(
; CHECK: vector.body:
; CHECK: @llvm.masked.load
; CHECK: br i1 %{{.*}}, label {{.*}}, label %vector.body
entry:
%cmp5 = icmp sgt i32 %N, 0
br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
br label %for.body
for.cond.cleanup.loopexit: ; preds = %for.body
%and.lcssa = phi i32 [ %and, %for.body ]
br label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
%S.addr.0.lcssa = phi i32 [ %S, %entry ], [ %and.lcssa, %for.cond.cleanup.loopexit ]
ret i32 %S.addr.0.lcssa
for.body: ; preds = %for.body.preheader, %for.body
%i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
%S.addr.06 = phi i32 [ %and, %for.body ], [ %S, %for.body.preheader ]
%arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.07
%0 = load i32, i32* %arrayidx, align 4
%and = and i32 %0, %S.addr.06
%inc = add nuw nsw i32 %i.07, 1
%exitcond = icmp eq i32 %inc, %N
br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
}


@@ -266,189 +266,6 @@ for.body: ; preds = %for.body.preheader,
br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
}
; Don't tail-fold float reductions.
;
define dso_local void @f32_reduction(float* nocapture readonly %Input, i32 %N, float* nocapture %Output) local_unnamed_addr #0 {
; CHECK-LABEL: f32_reduction(
; CHECK: vector.body:
; CHECK-NOT: @llvm.masked.load
; CHECK-NOT: @llvm.masked.store
; CHECK: br i1 %{{.*}}, label {{.*}}, label %vector.body
entry:
%cmp6 = icmp eq i32 %N, 0
br i1 %cmp6, label %while.end, label %while.body.preheader
while.body.preheader: ; preds = %entry
br label %while.body
while.body: ; preds = %while.body.preheader, %while.body
%blkCnt.09 = phi i32 [ %dec, %while.body ], [ %N, %while.body.preheader ]
%sum.08 = phi float [ %add, %while.body ], [ 0.000000e+00, %while.body.preheader ]
%Input.addr.07 = phi float* [ %incdec.ptr, %while.body ], [ %Input, %while.body.preheader ]
%incdec.ptr = getelementptr inbounds float, float* %Input.addr.07, i32 1
%0 = load float, float* %Input.addr.07, align 4
%add = fadd fast float %0, %sum.08
%dec = add i32 %blkCnt.09, -1
%cmp = icmp eq i32 %dec, 0
br i1 %cmp, label %while.end.loopexit, label %while.body
while.end.loopexit: ; preds = %while.body
%add.lcssa = phi float [ %add, %while.body ]
br label %while.end
while.end: ; preds = %while.end.loopexit, %entry
%sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add.lcssa, %while.end.loopexit ]
%conv = uitofp i32 %N to float
%div = fdiv fast float %sum.0.lcssa, %conv
store float %div, float* %Output, align 4
ret void
}
; Don't tail-fold float reductions.
;
define dso_local void @mixed_f32_i32_reduction(float* nocapture readonly %fInput, i32* nocapture readonly %iInput, i32 %N, float* nocapture %fOutput, i32* nocapture %iOutput) local_unnamed_addr #0 {
; CHECK-LABEL: mixed_f32_i32_reduction(
; CHECK: vector.body:
; CHECK-NOT: @llvm.masked.load
; CHECK-NOT: @llvm.masked.store
; CHECK: br i1 %{{.*}}, label {{.*}}, label %vector.body
entry:
%cmp15 = icmp eq i32 %N, 0
br i1 %cmp15, label %while.end, label %while.body.preheader
while.body.preheader:
br label %while.body
while.body:
%blkCnt.020 = phi i32 [ %dec, %while.body ], [ %N, %while.body.preheader ]
%isum.019 = phi i32 [ %add2, %while.body ], [ 0, %while.body.preheader ]
%fsum.018 = phi float [ %add, %while.body ], [ 0.000000e+00, %while.body.preheader ]
%fInput.addr.017 = phi float* [ %incdec.ptr, %while.body ], [ %fInput, %while.body.preheader ]
%iInput.addr.016 = phi i32* [ %incdec.ptr1, %while.body ], [ %iInput, %while.body.preheader ]
%incdec.ptr = getelementptr inbounds float, float* %fInput.addr.017, i32 1
%incdec.ptr1 = getelementptr inbounds i32, i32* %iInput.addr.016, i32 1
%0 = load i32, i32* %iInput.addr.016, align 4
%add2 = add nsw i32 %0, %isum.019
%1 = load float, float* %fInput.addr.017, align 4
%add = fadd fast float %1, %fsum.018
%dec = add i32 %blkCnt.020, -1
%cmp = icmp eq i32 %dec, 0
br i1 %cmp, label %while.end.loopexit, label %while.body
while.end.loopexit:
%add.lcssa = phi float [ %add, %while.body ]
%add2.lcssa = phi i32 [ %add2, %while.body ]
%phitmp = sitofp i32 %add2.lcssa to float
br label %while.end
while.end:
%fsum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add.lcssa, %while.end.loopexit ]
%isum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %phitmp, %while.end.loopexit ]
%conv = uitofp i32 %N to float
%div = fdiv fast float %fsum.0.lcssa, %conv
store float %div, float* %fOutput, align 4
%div5 = fdiv fast float %isum.0.lcssa, %conv
%conv6 = fptosi float %div5 to i32
store i32 %conv6, i32* %iOutput, align 4
ret void
}
define dso_local i32 @i32_mul_reduction(i32* noalias nocapture readonly %B, i32 %N) local_unnamed_addr #0 {
; CHECK-LABEL: i32_mul_reduction(
; CHECK: vector.body:
; CHECK-NOT: @llvm.masked.load
; CHECK-NOT: @llvm.masked.store
; CHECK: br i1 %{{.*}}, label {{.*}}, label %vector.body
entry:
%cmp6 = icmp sgt i32 %N, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader:
br label %for.body
for.cond.cleanup.loopexit:
%mul.lcssa = phi i32 [ %mul, %for.body ]
br label %for.cond.cleanup
for.cond.cleanup:
%S.0.lcssa = phi i32 [ 1, %entry ], [ %mul.lcssa, %for.cond.cleanup.loopexit ]
ret i32 %S.0.lcssa
for.body:
%i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
%S.07 = phi i32 [ %mul, %for.body ], [ 1, %for.body.preheader ]
%arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.08
%0 = load i32, i32* %arrayidx, align 4
%mul = mul nsw i32 %0, %S.07
%inc = add nuw nsw i32 %i.08, 1
%exitcond = icmp eq i32 %inc, %N
br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
}
define dso_local i32 @i32_or_reduction(i32* noalias nocapture readonly %B, i32 %N) local_unnamed_addr #0 {
; CHECK-LABEL: i32_or_reduction(
; CHECK: vector.body:
; CHECK-NOT: @llvm.masked.load
; CHECK-NOT: @llvm.masked.store
; CHECK: br i1 %{{.*}}, label {{.*}}, label %vector.body
entry:
%cmp6 = icmp sgt i32 %N, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
br label %for.body
for.cond.cleanup.loopexit: ; preds = %for.body
%or.lcssa = phi i32 [ %or, %for.body ]
br label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
%S.0.lcssa = phi i32 [ 1, %entry ], [ %or.lcssa, %for.cond.cleanup.loopexit ]
ret i32 %S.0.lcssa
for.body: ; preds = %for.body.preheader, %for.body
%i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
%S.07 = phi i32 [ %or, %for.body ], [ 1, %for.body.preheader ]
%arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.08
%0 = load i32, i32* %arrayidx, align 4
%or = or i32 %0, %S.07
%inc = add nuw nsw i32 %i.08, 1
%exitcond = icmp eq i32 %inc, %N
br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
}
define dso_local i32 @i32_and_reduction(i32* noalias nocapture readonly %A, i32 %N, i32 %S) local_unnamed_addr #0 {
; CHECK-LABEL: i32_and_reduction(
; CHECK: vector.body:
; CHECK-NOT: @llvm.masked.load
; CHECK-NOT: @llvm.masked.store
; CHECK: br i1 %{{.*}}, label {{.*}}, label %vector.body
entry:
%cmp5 = icmp sgt i32 %N, 0
br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
br label %for.body
for.cond.cleanup.loopexit: ; preds = %for.body
%and.lcssa = phi i32 [ %and, %for.body ]
br label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
%S.addr.0.lcssa = phi i32 [ %S, %entry ], [ %and.lcssa, %for.cond.cleanup.loopexit ]
ret i32 %S.addr.0.lcssa
for.body: ; preds = %for.body.preheader, %for.body
%i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
%S.addr.06 = phi i32 [ %and, %for.body ], [ %S, %for.body.preheader ]
%arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.07
%0 = load i32, i32* %arrayidx, align 4
%and = and i32 %0, %S.addr.06
%inc = add nuw nsw i32 %i.07, 1
%exitcond = icmp eq i32 %inc, %N
br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
}
define i32 @i32_smin_reduction(i32* nocapture readonly %x, i32 %n) #0 {
; CHECK-LABEL: i32_smin_reduction(
; CHECK: vector.body: