[Loop Vectorizer] Handling loops with FP induction variables.

Allow vectorization of loops with secondary FP IVs, such as:
float *A;
float x = init;
for (int i=0; i < N; ++i) {
  A[i] = x;
  x -= fp_inc;
}

Auto-vectorization is possible when the induction binary operator is "fast" or the function has the "unsafe" attribute.

Differential Revision: https://reviews.llvm.org/D21330

llvm-svn: 276554
This commit is contained in:
Elena Demikhovsky 2016-07-24 07:24:54 +00:00
parent 93ea19b9a6
commit 376a18bd92
6 changed files with 561 additions and 56 deletions

View File

@ -263,13 +263,15 @@ public:
enum InductionKind {
IK_NoInduction, ///< Not an induction variable.
IK_IntInduction, ///< Integer induction variable. Step = C.
IK_PtrInduction ///< Pointer induction var. Step = C / sizeof(elem).
IK_PtrInduction, ///< Pointer induction var. Step = C / sizeof(elem).
IK_FpInduction ///< Floating point induction variable. Step is an FP value.
};
public:
/// Default constructor - creates an invalid induction: the kind is
/// IK_NoInduction and every pointer member is initialized to null.
InductionDescriptor()
: StartValue(nullptr), IK(IK_NoInduction), Step(nullptr) {}
: StartValue(nullptr), IK(IK_NoInduction), Step(nullptr),
InductionBinOp(nullptr) {}
/// Get the consecutive direction. Returns:
/// 0 - unknown or non-consecutive.
@ -291,26 +293,58 @@ public:
const SCEV *getStep() const { return Step; }
ConstantInt *getConstIntStepValue() const;
/// Returns true if \p Phi is an induction. If \p Phi is an induction,
/// the induction descriptor \p D will contain the data describing this
/// induction. If by some other means the caller has a better SCEV
/// Returns true if \p Phi is an induction in the loop \p L. If \p Phi is an
/// induction, the induction descriptor \p D will contain the data describing
/// this induction. If by some other means the caller has a better SCEV
/// expression for \p Phi than the one returned by the ScalarEvolution
/// analysis, it can be passed through \p Expr.
static bool isInductionPHI(PHINode *Phi, ScalarEvolution *SE,
static bool isInductionPHI(PHINode *Phi, const Loop* L, ScalarEvolution *SE,
InductionDescriptor &D,
const SCEV *Expr = nullptr);
/// Returns true if \p Phi is an induction, in the context associated with
/// the run-time predicate of PSE. If \p Assume is true, this can add further
/// SCEV predicates to \p PSE in order to prove that \p Phi is an induction.
/// Returns true if \p Phi is a floating point induction in the loop \p L.
/// If \p Phi is an induction, the induction descriptor \p D will contain
/// the data describing this induction.
static bool isFPInductionPHI(PHINode *Phi, const Loop* L,
ScalarEvolution *SE, InductionDescriptor &D);
/// Returns true if \p Phi is a loop \p L induction, in the context associated
/// with the run-time predicate of PSE. If \p Assume is true, this can add
/// further SCEV predicates to \p PSE in order to prove that \p Phi is an
/// induction.
/// If \p Phi is an induction, \p D will contain the data describing this
/// induction.
static bool isInductionPHI(PHINode *Phi, PredicatedScalarEvolution &PSE,
static bool isInductionPHI(PHINode *Phi, const Loop* L,
PredicatedScalarEvolution &PSE,
InductionDescriptor &D, bool Assume = false);
/// Returns true if the induction type is FP and the binary operator does
/// not have the "fast-math" property. Such operation requires a relaxed FP
/// mode.
bool hasUnsafeAlgebra() {
return InductionBinOp &&
!cast<FPMathOperator>(InductionBinOp)->hasUnsafeAlgebra();
}
/// Returns the induction binary operator when it does not have the
/// "fast-math" unsafe-algebra flag, i.e. when it requires FP unsafe mode to
/// be vectorized. Returns nullptr when no binary operator is recorded or
/// when the recorded operator already has the flag.
///
/// Declared const because it only inspects the descriptor, matching the
/// other read-only accessors of this class.
Instruction *getUnsafeAlgebraInst() const {
  if (!InductionBinOp ||
      cast<FPMathOperator>(InductionBinOp)->hasUnsafeAlgebra())
    return nullptr;
  return InductionBinOp;
}
/// Returns the opcode of the binary operator that advances the induction,
/// or Instruction::BinaryOpsEnd when no such operator is recorded.
Instruction::BinaryOps getInductionOpcode() const {
  if (!InductionBinOp)
    return Instruction::BinaryOpsEnd;
  return InductionBinOp->getOpcode();
}
private:
/// Private constructor - used by \c isInductionPHI.
InductionDescriptor(Value *Start, InductionKind K, const SCEV *Step);
InductionDescriptor(Value *Start, InductionKind K, const SCEV *Step,
BinaryOperator *InductionBinOp = nullptr);
/// Start value.
TrackingVH<Value> StartValue;
@ -318,6 +352,8 @@ private:
InductionKind IK;
/// Step value.
const SCEV *Step;
// Instruction that advances induction variable.
BinaryOperator *InductionBinOp;
};
BasicBlock *InsertPreheaderForLoop(Loop *L, DominatorTree *DT, LoopInfo *LI,

View File

@ -703,7 +703,7 @@ bool LoopInterchangeLegality::findInductionAndReductions(
RecurrenceDescriptor RD;
InductionDescriptor ID;
PHINode *PHI = cast<PHINode>(I);
if (InductionDescriptor::isInductionPHI(PHI, SE, ID))
if (InductionDescriptor::isInductionPHI(PHI, L, SE, ID))
Inductions.push_back(PHI);
else if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD))
Reductions.push_back(PHI);

View File

@ -654,8 +654,8 @@ Value *RecurrenceDescriptor::createMinMaxOp(IRBuilder<> &Builder,
}
InductionDescriptor::InductionDescriptor(Value *Start, InductionKind K,
const SCEV *Step)
: StartValue(Start), IK(K), Step(Step) {
const SCEV *Step, BinaryOperator *BOp)
: StartValue(Start), IK(K), Step(Step), InductionBinOp(BOp) {
assert(IK != IK_NoInduction && "Not an induction");
// Start value type should match the induction kind and the value
@ -672,7 +672,15 @@ InductionDescriptor::InductionDescriptor(Value *Start, InductionKind K,
assert((IK != IK_PtrInduction || getConstIntStepValue()) &&
"Step value should be constant for pointer induction");
assert(Step->getType()->isIntegerTy() && "StepValue is not an integer");
assert((IK == IK_FpInduction || Step->getType()->isIntegerTy()) &&
"StepValue is not an integer");
assert((IK != IK_FpInduction || Step->getType()->isFloatingPointTy()) &&
"StepValue is not FP for FpInduction");
assert((IK != IK_FpInduction || (InductionBinOp &&
(InductionBinOp->getOpcode() == Instruction::FAdd ||
InductionBinOp->getOpcode() == Instruction::FSub))) &&
"Binary opcode should be specified for FP induction");
}
int InductionDescriptor::getConsecutiveDirection() const {
@ -693,6 +701,8 @@ Value *InductionDescriptor::transform(IRBuilder<> &B, Value *Index,
const DataLayout& DL) const {
SCEVExpander Exp(*SE, DL, "induction");
assert(Index->getType() == Step->getType() &&
"Index type does not match StepValue type");
switch (IK) {
case IK_IntInduction: {
assert(Index->getType() == StartValue->getType() &&
@ -717,29 +727,113 @@ Value *InductionDescriptor::transform(IRBuilder<> &B, Value *Index,
return Exp.expandCodeFor(S, StartValue->getType(), &*B.GetInsertPoint());
}
case IK_PtrInduction: {
assert(Index->getType() == Step->getType() &&
"Index type does not match StepValue type");
assert(isa<SCEVConstant>(Step) &&
"Expected constant step for pointer induction");
const SCEV *S = SE->getMulExpr(SE->getSCEV(Index), Step);
Index = Exp.expandCodeFor(S, Index->getType(), &*B.GetInsertPoint());
return B.CreateGEP(nullptr, StartValue, Index);
}
case IK_FpInduction: {
assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
assert(InductionBinOp &&
(InductionBinOp->getOpcode() == Instruction::FAdd ||
InductionBinOp->getOpcode() == Instruction::FSub) &&
"Original bin op should be defined for FP induction");
Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
// Floating point operations had to be 'fast' to enable the induction.
FastMathFlags Flags;
Flags.setUnsafeAlgebra();
Value *MulExp = B.CreateFMul(StepValue, Index);
if (isa<Instruction>(MulExp))
// We have to check, the MulExp may be a constant.
cast<Instruction>(MulExp)->setFastMathFlags(Flags);
Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode() , StartValue,
MulExp, "induction");
if (isa<Instruction>(BOp))
cast<Instruction>(BOp)->setFastMathFlags(Flags);
return BOp;
}
case IK_NoInduction:
return nullptr;
}
llvm_unreachable("invalid enum");
}
bool InductionDescriptor::isInductionPHI(PHINode *Phi,
/// Recognizes a floating point induction PHI in \p TheLoop.
///
/// \p Phi must have a floating point type (asserted). The PHI is accepted
/// when it lives in the loop header, has exactly two incoming values, and its
/// in-loop incoming value is an FAdd with \p Phi as either operand, or an
/// FSub with \p Phi as the first operand, whose other operand is not an
/// instruction contained in the loop. On success \p D is set to an
/// IK_FpInduction descriptor whose step is the SCEVUnknown of that operand,
/// and true is returned; otherwise false is returned and \p D is untouched.
bool InductionDescriptor::isFPInductionPHI(PHINode *Phi, const Loop *TheLoop,
                                           ScalarEvolution *SE,
                                           InductionDescriptor &D) {
  // Only FP induction variables are handled here.
  assert(Phi->getType()->isFloatingPointTy() && "Unexpected Phi type");

  if (Phi->getParent() != TheLoop->getHeader())
    return false;

  // The loop may have several entrances or exits; the PHI is analyzable only
  // with one entry value and one backedge value.
  if (Phi->getNumIncomingValues() != 2)
    return false;

  // Work out which incoming edge comes from inside the loop.
  const bool FirstFromLoop = TheLoop->contains(Phi->getIncomingBlock(0));
  assert((FirstFromLoop || TheLoop->contains(Phi->getIncomingBlock(1))) &&
         "Unexpected Phi node in the loop");
  const unsigned LoopIdx = FirstFromLoop ? 0 : 1;
  Value *Next = Phi->getIncomingValue(LoopIdx);
  Value *Init = Phi->getIncomingValue(1 - LoopIdx);

  auto *Update = dyn_cast<BinaryOperator>(Next);
  if (!Update)
    return false;

  // FAdd accepts the PHI on either side; FSub only as the minuend.
  Value *Increment = nullptr;
  switch (Update->getOpcode()) {
  case Instruction::FAdd:
    if (Update->getOperand(0) == Phi)
      Increment = Update->getOperand(1);
    else if (Update->getOperand(1) == Phi)
      Increment = Update->getOperand(0);
    break;
  case Instruction::FSub:
    if (Update->getOperand(0) == Phi)
      Increment = Update->getOperand(1);
    break;
  default:
    break;
  }
  if (!Increment)
    return false;

  // The increment must not be computed inside the loop.
  auto *IncrementInst = dyn_cast<Instruction>(Increment);
  if (IncrementInst && TheLoop->contains(IncrementInst))
    return false;

  // An FP step is represented as an unknown SCEV.
  D = InductionDescriptor(Init, IK_FpInduction, SE->getUnknown(Increment),
                          Update);
  return true;
}
bool InductionDescriptor::isInductionPHI(PHINode *Phi, const Loop *TheLoop,
PredicatedScalarEvolution &PSE,
InductionDescriptor &D,
bool Assume) {
Type *PhiTy = Phi->getType();
// We only handle integer and pointer inductions variables.
if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy())
// Handle integer and pointer induction variables.
// FP inductions are handled as well, but without trying to build a
// recurrence expression from the PHI node in place.
if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy() &&
!PhiTy->isFloatTy() && !PhiTy->isDoubleTy() && !PhiTy->isHalfTy())
return false;
if (PhiTy->isFloatingPointTy())
return isFPInductionPHI(Phi, TheLoop, PSE.getSE(), D);
const SCEV *PhiScev = PSE.getSCEV(Phi);
const auto *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
@ -752,10 +846,10 @@ bool InductionDescriptor::isInductionPHI(PHINode *Phi,
return false;
}
return isInductionPHI(Phi, PSE.getSE(), D, AR);
return isInductionPHI(Phi, TheLoop, PSE.getSE(), D, AR);
}
bool InductionDescriptor::isInductionPHI(PHINode *Phi,
bool InductionDescriptor::isInductionPHI(PHINode *Phi, const Loop *TheLoop,
ScalarEvolution *SE,
InductionDescriptor &D,
const SCEV *Expr) {
@ -773,7 +867,7 @@ bool InductionDescriptor::isInductionPHI(PHINode *Phi,
return false;
}
assert(AR->getLoop()->getHeader() == Phi->getParent() &&
assert(TheLoop->getHeader() == Phi->getParent() &&
"PHI is an AddRec for a different loop?!");
Value *StartValue =
Phi->getIncomingValueForBlock(AR->getLoop()->getLoopPreheader());
@ -781,7 +875,7 @@ bool InductionDescriptor::isInductionPHI(PHINode *Phi,
// Calculate the pointer stride and check if it is consecutive.
// The stride may be a constant or a loop invariant integer value.
const SCEVConstant *ConstStep = dyn_cast<SCEVConstant>(Step);
if (!ConstStep && !SE->isLoopInvariant(Step, AR->getLoop()))
if (!ConstStep && !SE->isLoopInvariant(Step, TheLoop))
return false;
if (PhiTy->isIntegerTy()) {

View File

@ -402,7 +402,10 @@ protected:
/// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
/// to each vector element of Val. The sequence starts at StartIndex.
virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step);
/// \p Opcode is relevant for FP induction variable.
virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
Instruction::BinaryOps Opcode =
Instruction::BinaryOpsEnd);
/// Compute scalar induction steps. \p ScalarIV is the scalar induction
/// variable on which to base the steps, \p Step is the size of the step, and
@ -625,7 +628,9 @@ private:
bool IfPredicateStore = false) override;
void vectorizeMemoryInstruction(Instruction *Instr) override;
Value *getBroadcastInstrs(Value *V) override;
Value *getStepVector(Value *Val, int StartIdx, Value *Step) override;
Value *getStepVector(Value *Val, int StartIdx, Value *Step,
Instruction::BinaryOps Opcode =
Instruction::BinaryOpsEnd) override;
Value *reverseVector(Value *Vec) override;
};
@ -2000,32 +2005,60 @@ void InnerLoopVectorizer::widenIntInduction(PHINode *IV, VectorParts &Entry,
}
}
// Combines the vector Val lane-wise with (StartIdx + lane) * Step.
// Integer element types: returns Val + Cv * splat(Step) using mul/add.
// Floating point element types: the product is an fmul and it is combined
// with Val using BinOp (asserted to be FAdd or FSub); both instructions are
// tagged with unsafe-algebra fast-math flags.
Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx,
Value *Step) {
Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
Instruction::BinaryOps BinOp) {
// Create and check the types.
assert(Val->getType()->isVectorTy() && "Must be a vector");
assert(Val->getType()->getScalarType()->isIntegerTy() &&
"Elem must be an integer");
assert(Step->getType() == Val->getType()->getScalarType() &&
"Step has wrong type");
// Create the types.
Type *ITy = Val->getType()->getScalarType();
VectorType *Ty = cast<VectorType>(Val->getType());
int VLen = Ty->getNumElements();
int VLen = Val->getType()->getVectorNumElements();
Type *STy = Val->getType()->getScalarType();
assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
"Induction Step must be an integer or FP");
assert(Step->getType() == STy && "Step has wrong type");
// Per-lane multipliers; filled with integer or FP constants below.
SmallVector<Constant *, 8> Indices;
if (STy->isIntegerTy()) {
// Create a vector of consecutive numbers StartIdx, StartIdx + 1, ...,
// StartIdx + VLen - 1.
for (int i = 0; i < VLen; ++i)
Indices.push_back(ConstantInt::get(STy, StartIdx + i));
// Add the consecutive indices to the vector value.
Constant *Cv = ConstantVector::get(Indices);
assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
Step = Builder.CreateVectorSplat(VLen, Step);
assert(Step->getType() == Val->getType() && "Invalid step vec");
// FIXME: The newly created binary instructions should contain nsw/nuw flags,
// which can be found from the original scalar operations.
Step = Builder.CreateMul(Cv, Step);
return Builder.CreateAdd(Val, Step, "induction");
}
// Floating point induction.
assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
"Binary Opcode should be specified for FP induction");
// Create a vector of consecutive numbers StartIdx, StartIdx + 1, ...,
// StartIdx + VLen - 1.
for (int i = 0; i < VLen; ++i)
Indices.push_back(ConstantInt::get(ITy, StartIdx + i));
Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
// Add the consecutive indices to the vector value.
Constant *Cv = ConstantVector::get(Indices);
assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
Step = Builder.CreateVectorSplat(VLen, Step);
assert(Step->getType() == Val->getType() && "Invalid step vec");
// FIXME: The newly created binary instructions should contain nsw/nuw flags,
// which can be found from the original scalar operations.
Step = Builder.CreateMul(Cv, Step);
return Builder.CreateAdd(Val, Step, "induction");
// Floating point operations had to be 'fast' to enable the induction.
FastMathFlags Flags;
Flags.setUnsafeAlgebra();
Value *MulOp = Builder.CreateFMul(Cv, Step);
if (isa<Instruction>(MulOp))
// Have to check, MulOp may be a constant
cast<Instruction>(MulOp)->setFastMathFlags(Flags);
// Combine with Val using the opcode of the original induction update.
Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
if (isa<Instruction>(BOp))
cast<Instruction>(BOp)->setFastMathFlags(Flags);
return BOp;
}
void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
@ -3099,8 +3132,10 @@ void InnerLoopVectorizer::createEmptyLoop() {
EndValue = CountRoundDown;
} else {
IRBuilder<> B(LoopBypassBlocks.back()->getTerminator());
Value *CRD = B.CreateSExtOrTrunc(CountRoundDown,
II.getStep()->getType(), "cast.crd");
Type *StepType = II.getStep()->getType();
Instruction::CastOps CastOp =
CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
EndValue = II.transform(B, CRD, PSE.getSE(), DL);
EndValue->setName("ind.end");
@ -4047,7 +4082,7 @@ void InnerLoopVectorizer::widenPHIInstruction(
llvm_unreachable("Unknown induction");
case InductionDescriptor::IK_IntInduction:
return widenIntInduction(P, Entry);
case InductionDescriptor::IK_PtrInduction:
case InductionDescriptor::IK_PtrInduction: {
// Handle the pointer induction variable case.
assert(P->getType()->isPointerTy() && "Unexpected type.");
// This is the normalized GEP that starts counting at zero.
@ -4080,6 +4115,29 @@ void InnerLoopVectorizer::widenPHIInstruction(
}
return;
}
case InductionDescriptor::IK_FpInduction: {
assert(P->getType() == II.getStartValue()->getType() &&
"Types must match");
// Handle other induction variables that are now based on the
// canonical one.
assert(P != OldInduction && "Primary induction can be integer only");
Value *V = Builder.CreateCast(Instruction::SIToFP, Induction, P->getType());
V = II.transform(Builder, V, PSE.getSE(), DL);
V->setName("fp.offset.idx");
// Now we have scalar op: %fp.offset.idx = StartVal +/- Induction*StepVal
Value *Broadcasted = getBroadcastInstrs(V);
// After broadcasting the induction variable we need to make the vector
// consecutive by adding StepVal*0, StepVal*1, StepVal*2, etc.
Value *StepVal = cast<SCEVUnknown>(II.getStep())->getValue();
for (unsigned part = 0; part < UF; ++part)
Entry[part] = getStepVector(Broadcasted, VF * part, StepVal,
II.getInductionOpcode());
return;
}
}
}
void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
@ -4565,10 +4623,12 @@ void LoopVectorizationLegality::addInductionPhi(
const DataLayout &DL = Phi->getModule()->getDataLayout();
// Get the widest type.
if (!WidestIndTy)
WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
else
WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
if (!PhiTy->isFloatingPointTy()) {
if (!WidestIndTy)
WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
else
WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
}
// Int inductions are special because we only allow one IV.
if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
@ -4649,8 +4709,10 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
}
InductionDescriptor ID;
if (InductionDescriptor::isInductionPHI(Phi, PSE, ID)) {
if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID)) {
addInductionPhi(Phi, ID, AllowedExit);
if (ID.hasUnsafeAlgebra() && !HasFunNoNaNAttr)
Requirements->addUnsafeAlgebraInst(ID.getUnsafeAlgebraInst());
continue;
}
@ -4661,7 +4723,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
// As a last resort, coerce the PHI to an AddRec expression
// and re-try classifying it as an induction PHI.
if (InductionDescriptor::isInductionPHI(Phi, PSE, ID, true)) {
if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true)) {
addInductionPhi(Phi, ID, AllowedExit);
continue;
}
@ -6348,11 +6410,20 @@ Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
// Scalar counterpart of the step computation: combines the scalar Val with
// StartIdx * Step. Floating point values use fmul plus BinOp, each passed
// through addFastMathFlag; all other values use mul/add named "induction".
Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step) {
Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
Instruction::BinaryOps BinOp) {
// When unrolling and the VF is 1, we only need to add a simple scalar.
Type *ITy = Val->getType();
assert(!ITy->isVectorTy() && "Val must be a scalar");
Constant *C = ConstantInt::get(ITy, StartIdx);
Type *Ty = Val->getType();
assert(!Ty->isVectorTy() && "Val must be a scalar");
if (Ty->isFloatingPointTy()) {
// The multiplier StartIdx is materialized as an FP constant of Val's type.
Constant *C = ConstantFP::get(Ty, (double)StartIdx);
// Floating point operations had to be 'fast' to enable the unrolling.
Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
}
Constant *C = ConstantInt::get(Ty, StartIdx);
return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
}

View File

@ -0,0 +1,86 @@
; RUN: opt < %s -O3 -mcpu=core-avx2 -mtriple=x86_64-unknown-linux-gnu -S | FileCheck --check-prefix AUTO_VEC %s
; This test checks auto-vectorization of a loop with an FP induction variable.
; The FP operation is not "fast", so vectorization depends on the function
; attribute in group #0 ("no-nans-fp-math"="true").
;void fp_iv_loop1(float * __restrict__ A, int N) {
; float x = 1.0;
; for (int i=0; i < N; ++i) {
; A[i] = x;
; x += 0.5;
; }
;}
; AUTO_VEC-LABEL: @fp_iv_loop1(
; AUTO_VEC: vector.body
; AUTO_VEC: store <8 x float>
; Stores the float IV %x.06 to A[i] and advances it with a plain (non-fast)
; fadd of 0.5. The function uses attribute group #0.
define void @fp_iv_loop1(float* noalias nocapture %A, i32 %N) #0 {
entry:
%cmp4 = icmp sgt i32 %N, 0
br i1 %cmp4, label %for.body.preheader, label %for.end
for.body.preheader: ; preds = %entry
br label %for.body
for.body: ; preds = %for.body.preheader, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
; FP induction PHI: starts at 1.0 and is updated by %conv1 below.
%x.06 = phi float [ %conv1, %for.body ], [ 1.000000e+00, %for.body.preheader ]
%arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
store float %x.06, float* %arrayidx, align 4
%conv1 = fadd float %x.06, 5.000000e-01
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %N
br i1 %exitcond, label %for.end.loopexit, label %for.body
for.end.loopexit: ; preds = %for.body
br label %for.end
for.end: ; preds = %for.end.loopexit, %entry
ret void
}
; Same as the previous test: the FP operation is not "fast", but the function
; attribute differs ("no-nans-fp-math"="false"), so vectorization should be
; rejected.
;void fp_iv_loop2(float * __restrict__ A, int N) {
; float x = 1.0;
; for (int i=0; i < N; ++i) {
; A[i] = x;
; x += 0.5;
; }
;}
; AUTO_VEC-LABEL: @fp_iv_loop2(
; AUTO_VEC-NOT: vector.body
; AUTO_VEC-NOT: store <{{.*}} x float>
; Same loop body as @fp_iv_loop1 in this file (non-fast fadd of 0.5 on the
; float IV), but the function uses attribute group #1.
define void @fp_iv_loop2(float* noalias nocapture %A, i32 %N) #1 {
entry:
%cmp4 = icmp sgt i32 %N, 0
br i1 %cmp4, label %for.body.preheader, label %for.end
for.body.preheader: ; preds = %entry
br label %for.body
for.body: ; preds = %for.body.preheader, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
; FP induction PHI: starts at 1.0 and is updated by %conv1 below.
%x.06 = phi float [ %conv1, %for.body ], [ 1.000000e+00, %for.body.preheader ]
%arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
store float %x.06, float* %arrayidx, align 4
%conv1 = fadd float %x.06, 5.000000e-01
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %N
br i1 %exitcond, label %for.end.loopexit, label %for.body
for.end.loopexit: ; preds = %for.body
br label %for.end
for.end: ; preds = %for.end.loopexit, %entry
ret void
}
attributes #0 = { "no-nans-fp-math"="true" }
attributes #1 = { "no-nans-fp-math"="false" }

View File

@ -0,0 +1,218 @@
; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck --check-prefix VEC4_INTERL1 %s
; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=4 -dce -instcombine -S | FileCheck --check-prefix VEC4_INTERL2 %s
; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=1 -dce -instcombine -S | FileCheck --check-prefix VEC1_INTERL2 %s
; VEC4_INTERL1-LABEL: @fp_iv_loop1(
; VEC4_INTERL1: %[[FP_INC:.*]] = load float, float* @fp_inc
; VEC4_INTERL1: vector.body:
; VEC4_INTERL1: %[[FP_INDEX:.*]] = sitofp i64 {{.*}} to float
; VEC4_INTERL1: %[[VEC_INCR:.*]] = fmul fast float {{.*}}, %[[FP_INDEX]]
; VEC4_INTERL1: %[[FP_OFFSET_IDX:.*]] = fsub fast float %init, %[[VEC_INCR]]
; VEC4_INTERL1: %[[BRCT_INSERT:.*]] = insertelement <4 x float> undef, float %[[FP_OFFSET_IDX]], i32 0
; VEC4_INTERL1-NEXT: %[[BRCT_SPLAT:.*]] = shufflevector <4 x float> %[[BRCT_INSERT]], <4 x float> undef, <4 x i32> zeroinitializer
; VEC4_INTERL1: %[[BRCT_INSERT:.*]] = insertelement {{.*}} %[[FP_INC]]
; VEC4_INTERL1-NEXT: %[[FP_INC_BCST:.*]] = shufflevector <4 x float> %[[BRCT_INSERT]], {{.*}} zeroinitializer
; VEC4_INTERL1: %[[VSTEP:.*]] = fmul fast <4 x float> %[[FP_INC_BCST]], <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
; VEC4_INTERL1-NEXT: %[[VEC_INDUCTION:.*]] = fsub fast <4 x float> %[[BRCT_SPLAT]], %[[VSTEP]]
; VEC4_INTERL1: store <4 x float> %[[VEC_INDUCTION]]
; VEC4_INTERL2-LABEL: @fp_iv_loop1(
; VEC4_INTERL2: %[[FP_INC:.*]] = load float, float* @fp_inc
; VEC4_INTERL2: vector.body:
; VEC4_INTERL2: %[[INDEX:.*]] = sitofp i64 {{.*}} to float
; VEC4_INTERL2: %[[VEC_INCR:.*]] = fmul fast float %{{.*}}, %[[INDEX]]
; VEC4_INTERL2: fsub fast float %init, %[[VEC_INCR]]
; VEC4_INTERL2: %[[VSTEP1:.*]] = fmul fast <4 x float> %{{.*}}, <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
; VEC4_INTERL2-NEXT: %[[VEC_INDUCTION1:.*]] = fsub fast <4 x float> {{.*}}, %[[VSTEP1]]
; VEC4_INTERL2: %[[VSTEP2:.*]] = fmul fast <4 x float> %{{.*}}, <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>
; VEC4_INTERL2-NEXT: %[[VEC_INDUCTION2:.*]] = fsub fast <4 x float> {{.*}}, %[[VSTEP2]]
; VEC4_INTERL2: store <4 x float> %[[VEC_INDUCTION1]]
; VEC4_INTERL2: store <4 x float> %[[VEC_INDUCTION2]]
; VEC1_INTERL2-LABEL: @fp_iv_loop1(
; VEC1_INTERL2: %[[FP_INC:.*]] = load float, float* @fp_inc
; VEC1_INTERL2: vector.body:
; VEC1_INTERL2: %[[INDEX:.*]] = sitofp i64 {{.*}} to float
; VEC1_INTERL2: %[[STEP:.*]] = fmul fast float %{{.*}}, %[[INDEX]]
; VEC1_INTERL2: %[[FP_OFFSET_IDX:.*]] = fsub fast float %init, %[[STEP]]
; VEC1_INTERL2: %[[SCALAR_INDUCTION2:.*]] = fsub fast float %[[FP_OFFSET_IDX]], %[[FP_INC]]
; VEC1_INTERL2: store float %[[FP_OFFSET_IDX]]
; VEC1_INTERL2: store float %[[SCALAR_INDUCTION2]]
@fp_inc = common global float 0.000000e+00, align 4
;void fp_iv_loop1(float init, float * __restrict__ A, int N) {
; float x = init;
; for (int i=0; i < N; ++i) {
; A[i] = x;
; x -= fp_inc;
; }
;}
; Stores the float IV %x.05 to A[i] and advances it with a fast fsub of
; %fpinc, which is loaded from @fp_inc before the loop.
define void @fp_iv_loop1(float %init, float* noalias nocapture %A, i32 %N) #1 {
entry:
%cmp4 = icmp sgt i32 %N, 0
br i1 %cmp4, label %for.body.lr.ph, label %for.end
for.body.lr.ph: ; preds = %entry
; The step is loaded outside the loop body.
%fpinc = load float, float* @fp_inc, align 4
br label %for.body
for.body: ; preds = %for.body, %for.body.lr.ph
%indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
; FP induction PHI: starts at %init and is updated by %add below.
%x.05 = phi float [ %init, %for.body.lr.ph ], [ %add, %for.body ]
%arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
store float %x.05, float* %arrayidx, align 4
%add = fsub fast float %x.05, %fpinc
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %N
br i1 %exitcond, label %for.end.loopexit, label %for.body
for.end.loopexit: ; preds = %for.body
br label %for.end
for.end: ; preds = %for.end.loopexit, %entry
ret void
}
;void fp_iv_loop2(float init, float * __restrict__ A, int N) {
; float x = init;
; for (int i=0; i < N; ++i) {
; A[i] = x;
; x += 0.5;
; }
;}
; VEC4_INTERL1-LABEL: @fp_iv_loop2(
; VEC4_INTERL1: vector.body
; VEC4_INTERL1: %[[index:.*]] = phi i64 [ 0, %vector.ph ]
; VEC4_INTERL1: sitofp i64 %[[index]] to float
; VEC4_INTERL1: %[[VAR1:.*]] = fmul fast float {{.*}}, 5.000000e-01
; VEC4_INTERL1: %[[VAR2:.*]] = fadd fast float %[[VAR1]]
; VEC4_INTERL1: insertelement <4 x float> undef, float %[[VAR2]], i32 0
; VEC4_INTERL1: shufflevector <4 x float> {{.*}}, <4 x float> undef, <4 x i32> zeroinitializer
; VEC4_INTERL1: fadd fast <4 x float> {{.*}}, <float 0.000000e+00, float 5.000000e-01, float 1.000000e+00, float 1.500000e+00>
; VEC4_INTERL1: store <4 x float>
; Stores the float IV %x.06 to A[i] and advances it with a fast fadd of the
; constant 0.5; the start value is the argument %init.
define void @fp_iv_loop2(float %init, float* noalias nocapture %A, i32 %N) #0 {
entry:
%cmp4 = icmp sgt i32 %N, 0
br i1 %cmp4, label %for.body.preheader, label %for.end
for.body.preheader: ; preds = %entry
br label %for.body
for.body: ; preds = %for.body.preheader, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
; FP induction PHI: starts at %init and is updated by %conv1 below.
%x.06 = phi float [ %conv1, %for.body ], [ %init, %for.body.preheader ]
%arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
store float %x.06, float* %arrayidx, align 4
%conv1 = fadd fast float %x.06, 5.000000e-01
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %N
br i1 %exitcond, label %for.end.loopexit, label %for.body
for.end.loopexit: ; preds = %for.body
br label %for.end
for.end: ; preds = %for.end.loopexit, %entry
ret void
}
;void fp_iv_loop3(float init, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, int N) {
; int i = 0;
; float x = init;
; float y = 0.1;
; for (; i < N; ++i) {
; A[i] = x;
; x += fp_inc;
; y -= 0.5;
; B[i] = x + y;
; C[i] = y;
; }
;}
; VEC4_INTERL1-LABEL: @fp_iv_loop3(
; VEC4_INTERL1: vector.body
; VEC4_INTERL1: %[[index:.*]] = phi i64 [ 0, %vector.ph ]
; VEC4_INTERL1: sitofp i64 %[[index]] to float
; VEC4_INTERL1: %[[VAR1:.*]] = fmul fast float {{.*}}, -5.000000e-01
; VEC4_INTERL1: fadd fast float %[[VAR1]]
; VEC4_INTERL1: fadd fast <4 x float> {{.*}}, <float -5.000000e-01, float -1.000000e+00, float -1.500000e+00, float -2.000000e+00>
; VEC4_INTERL1: store <4 x float>
; Loop with two float IVs: %x.011 (start %init, fast fadd of the value loaded
; from @fp_inc) and %y.012 (constant start, fast fadd of -0.5). It stores
; %x.011 to A[i], the sum of both updated values to B[i], and the updated
; %y value to C[i].
define void @fp_iv_loop3(float %init, float* noalias nocapture %A, float* noalias nocapture %B, float* noalias nocapture %C, i32 %N) #1 {
entry:
%cmp9 = icmp sgt i32 %N, 0
br i1 %cmp9, label %for.body.lr.ph, label %for.end
for.body.lr.ph: ; preds = %entry
; The step of the first FP IV is loaded outside the loop body.
%0 = load float, float* @fp_inc, align 4
br label %for.body
for.body: ; preds = %for.body, %for.body.lr.ph
%indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
%y.012 = phi float [ 0x3FB99999A0000000, %for.body.lr.ph ], [ %conv1, %for.body ]
%x.011 = phi float [ %init, %for.body.lr.ph ], [ %add, %for.body ]
%arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
store float %x.011, float* %arrayidx, align 4
%add = fadd fast float %x.011, %0
%conv1 = fadd fast float %y.012, -5.000000e-01
%add2 = fadd fast float %conv1, %add
%arrayidx4 = getelementptr inbounds float, float* %B, i64 %indvars.iv
store float %add2, float* %arrayidx4, align 4
%arrayidx6 = getelementptr inbounds float, float* %C, i64 %indvars.iv
store float %conv1, float* %arrayidx6, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %N
br i1 %exitcond, label %for.end.loopexit, label %for.body
for.end.loopexit:
br label %for.end
for.end:
ret void
}
; Start and step values are constants. There is no 'fmul' operation in this case
;void fp_iv_loop4(float * __restrict__ A, int N) {
; float x = 1.0;
; for (int i=0; i < N; ++i) {
; A[i] = x;
; x += 0.5;
; }
;}
; VEC4_INTERL1-LABEL: @fp_iv_loop4(
; VEC4_INTERL1: vector.body
; VEC4_INTERL1-NOT: fmul fast <4 x float>
; VEC4_INTERL1: %[[induction:.*]] = fadd fast <4 x float> %{{.*}}, <float 0.000000e+00, float 5.000000e-01, float 1.000000e+00, float 1.500000e+00>
; VEC4_INTERL1: store <4 x float> %[[induction]]
; Stores the float IV %x.06 to A[i]; both its start value (1.0) and its step
; (fast fadd of 0.5) are constants. The function has no attribute group.
define void @fp_iv_loop4(float* noalias nocapture %A, i32 %N) {
entry:
%cmp4 = icmp sgt i32 %N, 0
br i1 %cmp4, label %for.body.preheader, label %for.end
for.body.preheader: ; preds = %entry
br label %for.body
for.body: ; preds = %for.body.preheader, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
; FP induction PHI: starts at 1.0 and is updated by %conv1 below.
%x.06 = phi float [ %conv1, %for.body ], [ 1.000000e+00, %for.body.preheader ]
%arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
store float %x.06, float* %arrayidx, align 4
%conv1 = fadd fast float %x.06, 5.000000e-01
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %N
br i1 %exitcond, label %for.end.loopexit, label %for.body
for.end.loopexit: ; preds = %for.body
br label %for.end
for.end: ; preds = %for.end.loopexit, %entry
ret void
}