[LV] Vectorize first-order recurrences

This patch enables the vectorization of first-order recurrences. A first-order
recurrence is a non-reduction recurrence relation in which the value of the
recurrence in the current loop iteration equals a value defined in the previous
iteration. The load PRE of the GVN pass often creates these recurrences by
hoisting loads from within loops.

In this patch, we add a new recurrence kind for first-order phi nodes and
attempt to vectorize them if possible. Vectorization is performed by shuffling
the values for the current and previous iterations. The vectorization cost
estimate is updated to account for the added shuffle instruction.

Contributed-by: Matthew Simpson and Chad Rosier <mcrosier@codeaurora.org>
Differential Revision: http://reviews.llvm.org/D16197

llvm-svn: 261346
This commit is contained in:
Matthew Simpson 2016-02-19 17:56:08 +00:00
parent 615a807ee8
commit 29c997c1a1
4 changed files with 450 additions and 6 deletions

View File

@ -175,6 +175,13 @@ public:
static bool isReductionPHI(PHINode *Phi, Loop *TheLoop,
RecurrenceDescriptor &RedDes);
/// Returns true if Phi is a first-order recurrence. A first-order recurrence
/// is a non-reduction recurrence relation in which the value of the
/// recurrence in the current loop iteration equals a value defined in the
/// previous iteration.
static bool isFirstOrderRecurrence(PHINode *Phi, Loop *TheLoop,
DominatorTree *DT);
RecurrenceKind getRecurrenceKind() { return Kind; }
MinMaxRecurrenceKind getMinMaxRecurrenceKind() { return MinMaxKind; }

View File

@ -520,6 +520,43 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop,
return false;
}
bool RecurrenceDescriptor::isFirstOrderRecurrence(PHINode *Phi, Loop *TheLoop,
DominatorTree *DT) {
// Ensure the phi node is in the loop header and has two incoming values.
if (Phi->getParent() != TheLoop->getHeader() ||
Phi->getNumIncomingValues() != 2)
return false;
// Ensure the loop has a preheader and a single latch block. The loop
// vectorizer will need the latch to set up the next iteration of the loop.
auto *Preheader = TheLoop->getLoopPreheader();
auto *Latch = TheLoop->getLoopLatch();
if (!Preheader || !Latch)
return false;
// Ensure the phi node's incoming blocks are the loop preheader and latch.
if (Phi->getBasicBlockIndex(Preheader) < 0 ||
Phi->getBasicBlockIndex(Latch) < 0)
return false;
// Get the previous value. The previous value comes from the latch edge while
// the initial value comes form the preheader edge.
auto *Previous = dyn_cast<Instruction>(Phi->getIncomingValueForBlock(Latch));
if (!Previous)
return false;
// Ensure every user of the phi node is dominated by the previous value. The
// dominance requirement ensures the loop vectorizer will not need to
// vectorize the initial value prior to the first iteration of the loop.
for (User *U : Phi->users())
if (auto *I = dyn_cast<Instruction>(U))
if (!DT->dominates(Previous, I))
return false;
return true;
}
/// This function returns the identity element (or neutral element) for
/// the operation K.
Constant *RecurrenceDescriptor::getRecurrenceIdentity(RecurrenceKind K,

View File

@ -364,6 +364,10 @@ protected:
/// Copy and widen the instructions from the old loop.
virtual void vectorizeLoop();
/// Fix a first-order recurrence. This is the second phase of vectorizing
/// this phi node.
void fixFirstOrderRecurrence(PHINode *Phi);
/// \brief The Loop exit block may have single value PHI nodes where the
/// incoming value is 'Undef'. While vectorizing we only handled real values
/// that were defined inside the loop. Here we fix the 'undef case'.
@ -1201,6 +1205,10 @@ public:
/// induction descriptor.
typedef MapVector<PHINode*, InductionDescriptor> InductionList;
/// RecurrenceSet contains the phi nodes that are recurrences other than
/// inductions and reductions.
typedef SmallPtrSet<const PHINode *, 8> RecurrenceSet;
/// Returns true if it is legal to vectorize this loop.
/// This does not mean that it is profitable to vectorize this
/// loop, only that it is legal to do so.
@ -1215,6 +1223,9 @@ public:
/// Returns the induction variables found in the loop.
InductionList *getInductionVars() { return &Inductions; }
/// Return the first-order recurrences found in the loop.
RecurrenceSet *getFirstOrderRecurrences() { return &FirstOrderRecurrences; }
/// Returns the widest induction type.
Type *getWidestInductionType() { return WidestIndTy; }
@ -1224,6 +1235,9 @@ public:
/// Returns True if PN is a reduction variable in this loop.
bool isReductionVariable(PHINode *PN) { return Reductions.count(PN); }
/// Returns True if Phi is a first-order recurrence in this loop.
bool isFirstOrderRecurrence(const PHINode *Phi);
/// Return true if the block BB needs to be predicated in order for the loop
/// to be vectorized.
bool blockNeedsPredication(BasicBlock *BB);
@ -1384,6 +1398,8 @@ private:
/// Notice that inductions don't need to start at zero and that induction
/// variables can be pointers.
InductionList Inductions;
/// Holds the phi nodes that are first-order recurrences.
RecurrenceSet FirstOrderRecurrences;
/// Holds the widest induction type encountered.
Type *WidestIndTy;
@ -3386,8 +3402,14 @@ void InnerLoopVectorizer::vectorizeLoop() {
for (PHINode *Phi : PHIsToFix) {
assert(Phi && "Unable to recover vectorized PHI");
// We currently only handle reductions. Ensure the PHI node to be fixed is
// a reduction, and get its reduction variable descriptor.
// Handle first-order recurrences that need to be fixed.
if (Legal->isFirstOrderRecurrence(Phi)) {
fixFirstOrderRecurrence(Phi);
continue;
}
// If the phi node is not a first-order recurrence, it must be a reduction.
// Get it's reduction variable descriptor.
assert(Legal->isReductionVariable(Phi) &&
"Unable to find the reduction variable");
RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
@ -3613,6 +3635,158 @@ void InnerLoopVectorizer::vectorizeLoop() {
cse(LoopVectorBody);
}
void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
// This is the second phase of vectorizing first-order rececurrences. An
// overview of the transformation is described below. Suppose we have the
// following loop.
//
// for (int i = 0; i < n; ++i)
// b[i] = a[i] - a[i - 1];
//
// There is a first-order recurrence on "a". For this loop, the shorthand
// scalar IR looks like:
//
// scalar.ph:
// s_init = a[-1]
// br scalar.body
//
// scalar.body:
// i = phi [0, scalar.ph], [i+1, scalar.body]
// s1 = phi [s_init, scalar.ph], [s2, scalar.body]
// s2 = a[i]
// b[i] = s2 - s1
// br cond, scalar.body, ...
//
// In this example, s1 is a recurrence because it's value depends on the
// previous iteration. In the first phase of vectorization, we created a
// temporary value for s1. We now complete the vectorization and produce the
// shorthand vector IR shown below (for VF = 4, UF = 1).
//
// vector.ph:
// v_init = vector(..., ..., ..., a[-1])
// br vector.body
//
// vector.body
// i = phi [0, vector.ph], [i+4, vector.body]
// v1 = phi [v_init, vector.ph], [v2, vector.body]
// v2 = a[i, i+1, i+2, i+3];
// v3 = vector(v1(3), v2(0, 1, 2))
// b[i, i+1, i+2, i+3] = v2 - v3
// br cond, vector.body, middle.block
//
// middle.block:
// x = v2(3)
// br scalar.ph
//
// scalar.ph:
// s_init = phi [x, middle.block], [a[-1], otherwise]
// br scalar.body
//
// After execution completes the vector loop, we extract the next value of
// the recurrence (x) to use as the initial value in the scalar loop.
// Get the original loop preheader and single loop latch.
auto *Preheader = OrigLoop->getLoopPreheader();
auto *Latch = OrigLoop->getLoopLatch();
// Get the initial and previous values of the scalar recurrence.
auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
auto *Previous = Phi->getIncomingValueForBlock(Latch);
// Create a vector from the initial value.
auto *VectorInit = ScalarInit;
if (VF > 1) {
Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
VectorInit = Builder.CreateInsertElement(
UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
Builder.getInt32(VF - 1), "vector.recur.init");
}
// We constructed a temporary phi node in the first phase of vectorization.
// This phi node will eventually be deleted.
auto &PhiParts = getVectorValue(Phi);
Builder.SetInsertPoint(cast<Instruction>(PhiParts[0]));
// Create a phi node for the new recurrence. The current value will either be
// the initial value inserted into a vector or loop-varying vector value.
auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
// Get the vectorized previous value. We ensured the previous values was an
// instruction when detecting the recurrence.
auto &PreviousParts = getVectorValue(Previous);
// Set the insertion point to be after this instruction. We ensured the
// previous value dominated all uses of the phi when detecting the
// recurrence.
Builder.SetInsertPoint(
&*++BasicBlock::iterator(cast<Instruction>(PreviousParts[UF - 1])));
// We will construct a vector for the recurrence by combining the values for
// the current and previous iterations. This is the required shuffle mask.
SmallVector<Constant *, 8> ShuffleMask(VF);
ShuffleMask[0] = Builder.getInt32(VF - 1);
for (unsigned I = 1; I < VF; ++I)
ShuffleMask[I] = Builder.getInt32(I + VF - 1);
// The vector from which to take the initial value for the current iteration
// (actual or unrolled). Initially, this is the vector phi node.
Value *Incoming = VecPhi;
// Shuffle the current and previous vector and update the vector parts.
for (unsigned Part = 0; Part < UF; ++Part) {
auto *Shuffle =
VF > 1
? Builder.CreateShuffleVector(Incoming, PreviousParts[Part],
ConstantVector::get(ShuffleMask))
: Incoming;
PhiParts[Part]->replaceAllUsesWith(Shuffle);
cast<Instruction>(PhiParts[Part])->eraseFromParent();
PhiParts[Part] = Shuffle;
Incoming = PreviousParts[Part];
}
// Fix the latch value of the new recurrence in the vector loop.
VecPhi->addIncoming(Incoming,
LI->getLoopFor(LoopVectorBody[0])->getLoopLatch());
// Extract the last vector element in the middle block. This will be the
// initial value for the recurrence when jumping to the scalar loop.
auto *Extract = Incoming;
if (VF > 1) {
Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
Extract = Builder.CreateExtractElement(Extract, Builder.getInt32(VF - 1),
"vector.recur.extract");
}
// Fix the initial value of the original recurrence in the scalar loop.
Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
for (auto *BB : predecessors(LoopScalarPreHeader)) {
auto *Incoming = BB == LoopMiddleBlock ? Extract : ScalarInit;
Start->addIncoming(Incoming, BB);
}
Phi->setIncomingValue(Phi->getBasicBlockIndex(LoopScalarPreHeader), Start);
Phi->setName("scalar.recur");
// Finally, fix users of the recurrence outside the loop. The users will need
// either the last value of the scalar recurrence or the last value of the
// vector recurrence we extracted in the middle block. Since the loop is in
// LCSSA form, we just need to find the phi node for the original scalar
// recurrence in the exit block, and then add an edge for the middle block.
for (auto &I : *LoopExitBlock) {
auto *LCSSAPhi = dyn_cast<PHINode>(&I);
if (!LCSSAPhi)
break;
if (LCSSAPhi->getIncomingValue(0) == Phi) {
LCSSAPhi->addIncoming(Extract, LoopMiddleBlock);
break;
}
}
}
void InnerLoopVectorizer::fixLCSSAPHIs() {
for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
@ -3687,8 +3861,8 @@ void InnerLoopVectorizer::widenPHIInstruction(
Instruction *PN, InnerLoopVectorizer::VectorParts &Entry, unsigned UF,
unsigned VF, PhiVector *PV) {
PHINode* P = cast<PHINode>(PN);
// Handle reduction variables:
if (Legal->isReductionVariable(P)) {
// Handle recurrences.
if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
for (unsigned part = 0; part < UF; ++part) {
// This is phase one of vectorizing PHIs.
Type *VecTy = (VF == 1) ? PN->getType() :
@ -4384,6 +4558,11 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
continue;
}
if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop, DT)) {
FirstOrderRecurrences.insert(Phi);
continue;
}
emitAnalysis(VectorizationReport(&*it) <<
"value that could not be identified as "
"reduction is used outside the loop");
@ -4561,6 +4740,10 @@ bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
return Inductions.count(PN);
}
bool LoopVectorizationLegality::isFirstOrderRecurrence(const PHINode *Phi) {
return FirstOrderRecurrences.count(Phi);
}
bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
}
@ -5450,9 +5633,17 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
case Instruction::Br: {
return TTI.getCFInstrCost(I->getOpcode());
}
case Instruction::PHI:
//TODO: IF-converted IFs become selects.
case Instruction::PHI: {
auto *Phi = cast<PHINode>(I);
// First-order recurrences are replaced by vector shuffles inside the loop.
if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
VectorTy, VF - 1, VectorTy);
// TODO: IF-converted IFs become selects.
return 0;
}
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:

View File

@ -0,0 +1,209 @@
; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -dce -instcombine -S | FileCheck %s
; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -dce -instcombine -S | FileCheck %s --check-prefix=UNROLL
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
; CHECK-LABEL: @recurrence_1
;
; void recurrence_1(int *a, int *b, int n) {
; for(int i = 0; i < n; i++)
; b[i] = a[i] + a[i - 1]
; }
;
; CHECK: vector.ph:
; CHECK: %vector.recur.init = insertelement <4 x i32> undef, i32 %pre_load, i32 3
;
; CHECK: vector.body:
; CHECK: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L1:%[a-zA-Z0-9.]+]], %vector.body ]
; CHECK: [[L1]] = load <4 x i32>
; CHECK: {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
;
; CHECK: middle.block:
; CHECK: %vector.recur.extract = extractelement <4 x i32> [[L1]], i32 3
;
; CHECK: scalar.ph:
; CHECK: %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %pre_load, %vector.memcheck ], [ %pre_load, %min.iters.checked ], [ %pre_load, %for.preheader ]
;
; CHECK: scalar.body:
; CHECK: %scalar.recur = phi i32 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ]
;
; UNROLL: vector.body:
; UNROLL: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L2:%[a-zA-Z0-9.]+]], %vector.body ]
; UNROLL: [[L1:%[a-zA-Z0-9.]+]] = load <4 x i32>
; UNROLL: [[L2]] = load <4 x i32>
; UNROLL: {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL: {{.*}} = shufflevector <4 x i32> [[L1]], <4 x i32> [[L2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
;
; UNROLL: middle.block:
; UNROLL: %vector.recur.extract = extractelement <4 x i32> [[L2]], i32 3
;
define void @recurrence_1(i32* nocapture readonly %a, i32* nocapture %b, i32 %n) {
entry:
br label %for.preheader
for.preheader:
%arrayidx.phi.trans.insert = getelementptr inbounds i32, i32* %a, i64 0
%pre_load = load i32, i32* %arrayidx.phi.trans.insert
br label %scalar.body
scalar.body:
%0 = phi i32 [ %pre_load, %for.preheader ], [ %1, %scalar.body ]
%indvars.iv = phi i64 [ 0, %for.preheader ], [ %indvars.iv.next, %scalar.body ]
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%arrayidx32 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv.next
%1 = load i32, i32* %arrayidx32
%arrayidx34 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
%add35 = add i32 %1, %0
store i32 %add35, i32* %arrayidx34
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %n
br i1 %exitcond, label %for.exit, label %scalar.body
for.exit:
ret void
}
; CHECK-LABEL: @recurrence_2
;
; int recurrence_2(int *a, int n) {
; int minmax;
; for (int i = 0; i < n; ++i)
; minmax = min(minmax, max(a[i] - a[i-1], 0));
; return minmax;
; }
;
; CHECK: vector.ph:
; CHECK: %vector.recur.init = insertelement <4 x i32> undef, i32 %.pre, i32 3
;
; CHECK: vector.body:
; CHECK: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L1:%[a-zA-Z0-9.]+]], %vector.body ]
; CHECK: [[L1]] = load <4 x i32>
; CHECK: {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
;
; CHECK: middle.block:
; CHECK: %vector.recur.extract = extractelement <4 x i32> [[L1]], i32 3
;
; CHECK: scalar.ph:
; CHECK: %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %.pre, %min.iters.checked ], [ %.pre, %for.preheader ]
;
; CHECK: scalar.body:
; CHECK: %scalar.recur = phi i32 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ]
;
; UNROLL: vector.body:
; UNROLL: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L2:%[a-zA-Z0-9.]+]], %vector.body ]
; UNROLL: [[L1:%[a-zA-Z0-9.]+]] = load <4 x i32>
; UNROLL: [[L2]] = load <4 x i32>
; UNROLL: {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL: {{.*}} = shufflevector <4 x i32> [[L1]], <4 x i32> [[L2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
;
; UNROLL: middle.block:
; UNROLL: %vector.recur.extract = extractelement <4 x i32> [[L2]], i32 3
;
define i32 @recurrence_2(i32* nocapture readonly %a, i32 %n) {
entry:
%cmp27 = icmp sgt i32 %n, 0
br i1 %cmp27, label %for.preheader, label %for.cond.cleanup
for.preheader:
%arrayidx2.phi.trans.insert = getelementptr inbounds i32, i32* %a, i64 -1
%.pre = load i32, i32* %arrayidx2.phi.trans.insert, align 4
br label %scalar.body
for.cond.cleanup.loopexit:
%minmax.0.cond.lcssa = phi i32 [ %minmax.0.cond, %scalar.body ]
br label %for.cond.cleanup
for.cond.cleanup:
%minmax.0.lcssa = phi i32 [ undef, %entry ], [ %minmax.0.cond.lcssa, %for.cond.cleanup.loopexit ]
ret i32 %minmax.0.lcssa
scalar.body:
%0 = phi i32 [ %.pre, %for.preheader ], [ %1, %scalar.body ]
%indvars.iv = phi i64 [ 0, %for.preheader ], [ %indvars.iv.next, %scalar.body ]
%minmax.028 = phi i32 [ undef, %for.preheader ], [ %minmax.0.cond, %scalar.body ]
%arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
%1 = load i32, i32* %arrayidx, align 4
%sub3 = sub nsw i32 %1, %0
%cmp4 = icmp sgt i32 %sub3, 0
%cond = select i1 %cmp4, i32 %sub3, i32 0
%cmp5 = icmp slt i32 %minmax.028, %cond
%minmax.0.cond = select i1 %cmp5, i32 %minmax.028, i32 %cond
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %n
br i1 %exitcond, label %for.cond.cleanup.loopexit, label %scalar.body
}
; CHECK-LABEL: @recurrence_3
;
; void recurrence_3(short *a, double *b, int n, float f, short p) {
; b[0] = (double)a[0] - f * (double)p;
; for (int i = 1; i < n; i++)
; b[i] = (double)a[i] - f * (double)a[i - 1];
; }
;
;
; CHECK: vector.ph:
; CHECK: %vector.recur.init = insertelement <4 x i16> undef, i16 %0, i32 3
;
; CHECK: vector.body:
; CHECK: %vector.recur = phi <4 x i16> [ %vector.recur.init, %vector.ph ], [ [[L1:%[a-zA-Z0-9.]+]], %vector.body ]
; CHECK: [[L1]] = load <4 x i16>
; CHECK: {{.*}} = shufflevector <4 x i16> %vector.recur, <4 x i16> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
;
; CHECK: middle.block:
; CHECK: %vector.recur.extract = extractelement <4 x i16> [[L1]], i32 3
;
; CHECK: scalar.ph:
; CHECK: %scalar.recur.init = phi i16 [ %vector.recur.extract, %middle.block ], [ %0, %vector.memcheck ], [ %0, %min.iters.checked ], [ %0, %for.preheader ]
;
; CHECK: scalar.body:
; CHECK: %scalar.recur = phi i16 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ]
;
; UNROLL: vector.body:
; UNROLL: %vector.recur = phi <4 x i16> [ %vector.recur.init, %vector.ph ], [ [[L2:%[a-zA-Z0-9.]+]], %vector.body ]
; UNROLL: [[L1:%[a-zA-Z0-9.]+]] = load <4 x i16>
; UNROLL: [[L2]] = load <4 x i16>
; UNROLL: {{.*}} = shufflevector <4 x i16> %vector.recur, <4 x i16> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL: {{.*}} = shufflevector <4 x i16> [[L1]], <4 x i16> [[L2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
;
; UNROLL: middle.block:
; UNROLL: %vector.recur.extract = extractelement <4 x i16> [[L2]], i32 3
;
define void @recurrence_3(i16* nocapture readonly %a, double* nocapture %b, i32 %n, float %f, i16 %p) {
entry:
%0 = load i16, i16* %a, align 2
%conv = sitofp i16 %0 to double
%conv1 = fpext float %f to double
%conv2 = sitofp i16 %p to double
%mul = fmul fast double %conv2, %conv1
%sub = fsub fast double %conv, %mul
store double %sub, double* %b, align 8
%cmp25 = icmp sgt i32 %n, 1
br i1 %cmp25, label %for.preheader, label %for.end
for.preheader:
br label %scalar.body
scalar.body:
%1 = phi i16 [ %0, %for.preheader ], [ %2, %scalar.body ]
%advars.iv = phi i64 [ %advars.iv.next, %scalar.body ], [ 1, %for.preheader ]
%arrayidx5 = getelementptr inbounds i16, i16* %a, i64 %advars.iv
%2 = load i16, i16* %arrayidx5, align 2
%conv6 = sitofp i16 %2 to double
%conv11 = sitofp i16 %1 to double
%mul12 = fmul fast double %conv11, %conv1
%sub13 = fsub fast double %conv6, %mul12
%arrayidx15 = getelementptr inbounds double, double* %b, i64 %advars.iv
store double %sub13, double* %arrayidx15, align 8
%advars.iv.next = add nuw nsw i64 %advars.iv, 1
%lftr.wideiv = trunc i64 %advars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %n
br i1 %exitcond, label %for.end.loopexit, label %scalar.body
for.end.loopexit:
br label %for.end
for.end:
ret void
}