forked from OSchip/llvm-project
Revert "SLPVectorizer: Handle more horizontal reductions (disabled)"
This reverts commit r191108. The horizontal.ll test case fails under libgmalloc. Thanks Shuxin for pointing this out to me. llvm-svn: 191121
This commit is contained in:
parent
9b04f41899
commit
f1dfbfdde1
|
@ -49,11 +49,6 @@ static cl::opt<int>
|
|||
SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
|
||||
cl::desc("Only vectorize if you gain more than this "
|
||||
"number "));
|
||||
|
||||
static cl::opt<bool>
|
||||
ShouldVectorizeHor("slp-vectorize-hor", cl::init(false), cl::Hidden,
|
||||
cl::desc("Attempt to vectorize horizontal reductions"));
|
||||
|
||||
namespace {
|
||||
|
||||
static const unsigned MinVecRegSize = 128;
|
||||
|
@ -243,21 +238,17 @@ public:
|
|||
}
|
||||
|
||||
/// \brief Vectorize the tree that starts with the elements in \p VL.
|
||||
/// Returns the vectorized root and the scalar operations the root was based
|
||||
/// on.
|
||||
std::pair<Value *, ValueList *> vectorizeTree();
|
||||
void vectorizeTree();
|
||||
|
||||
/// \returns the vectorization cost of the subtree that starts at \p VL.
|
||||
/// A negative number means that this is profitable.
|
||||
int getTreeCost();
|
||||
|
||||
/// Construct a vectorizable tree that starts at \p Roots and is possibly
|
||||
/// used by a reduction of \p RdxOps.
|
||||
void buildTree(ArrayRef<Value *> Roots, ValueSet *RdxOps = 0);
|
||||
/// Construct a vectorizable tree that starts at \p Roots.
|
||||
void buildTree(ArrayRef<Value *> Roots);
|
||||
|
||||
/// Clear the internal data structures that are created by 'buildTree'.
|
||||
void deleteTree() {
|
||||
RdxOps = 0;
|
||||
VectorizableTree.clear();
|
||||
ScalarToTreeEntry.clear();
|
||||
MustGather.clear();
|
||||
|
@ -410,9 +401,6 @@ private:
|
|||
/// Numbers instructions in different blocks.
|
||||
DenseMap<BasicBlock *, BlockNumbering> BlocksNumbers;
|
||||
|
||||
/// Reduction operators.
|
||||
ValueSet *RdxOps;
|
||||
|
||||
// Analysis and block reference.
|
||||
Function *F;
|
||||
ScalarEvolution *SE;
|
||||
|
@ -425,9 +413,8 @@ private:
|
|||
IRBuilder<> Builder;
|
||||
};
|
||||
|
||||
void BoUpSLP::buildTree(ArrayRef<Value *> Roots, ValueSet *Rdx) {
|
||||
void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
|
||||
deleteTree();
|
||||
RdxOps = Rdx;
|
||||
if (!getSameType(Roots))
|
||||
return;
|
||||
buildTree_rec(Roots, 0);
|
||||
|
@ -458,12 +445,8 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, ValueSet *Rdx) {
|
|||
assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
|
||||
continue;
|
||||
}
|
||||
Instruction *UserInst = dyn_cast<Instruction>(*User);
|
||||
if (!UserInst)
|
||||
continue;
|
||||
|
||||
// Ignore uses that are part of the reduction.
|
||||
if (Rdx && std::find(Rdx->begin(), Rdx->end(), UserInst) != Rdx->end())
|
||||
if (!isa<Instruction>(*User))
|
||||
continue;
|
||||
|
||||
DEBUG(dbgs() << "SLP: Need to extract:" << **User << " from lane " <<
|
||||
|
@ -595,10 +578,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
|
|||
continue;
|
||||
}
|
||||
|
||||
// This user is part of the reduction.
|
||||
if (RdxOps && RdxOps->count(User))
|
||||
continue;
|
||||
|
||||
// Make sure that we can schedule this unknown user.
|
||||
BlockNumbering &BN = BlocksNumbers[BB];
|
||||
int UserIndex = BN.getIndex(User);
|
||||
|
@ -1393,7 +1372,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
|
|||
return 0;
|
||||
}
|
||||
|
||||
std::pair<Value *, BoUpSLP::ValueList *> BoUpSLP::vectorizeTree() {
|
||||
void BoUpSLP::vectorizeTree() {
|
||||
Builder.SetInsertPoint(F->getEntryBlock().begin());
|
||||
vectorizeTree(&VectorizableTree[0]);
|
||||
|
||||
|
@ -1470,10 +1449,7 @@ std::pair<Value *, BoUpSLP::ValueList *> BoUpSLP::vectorizeTree() {
|
|||
DEBUG(dbgs() << "SLP: \tvalidating user:" << **User << ".\n");
|
||||
assert(!MustGather.count(*User) &&
|
||||
"Replacing gathered value with undef");
|
||||
|
||||
assert((ScalarToTreeEntry.count(*User) ||
|
||||
// It is legal to replace the reduction users by undef.
|
||||
(RdxOps && RdxOps->count(*User))) &&
|
||||
assert(ScalarToTreeEntry.count(*User) &&
|
||||
"Replacing out-of-tree value with undef");
|
||||
}
|
||||
Value *Undef = UndefValue::get(Ty);
|
||||
|
@ -1488,9 +1464,6 @@ std::pair<Value *, BoUpSLP::ValueList *> BoUpSLP::vectorizeTree() {
|
|||
BlocksNumbers[it].forget();
|
||||
}
|
||||
Builder.ClearInsertionPoint();
|
||||
|
||||
return std::make_pair(VectorizableTree[0].VectorizedValue,
|
||||
&VectorizableTree[0].Scalars);
|
||||
}
|
||||
|
||||
void BoUpSLP::optimizeGatherSequence() {
|
||||
|
@ -1914,310 +1887,6 @@ bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
|
|||
return 0;
|
||||
}
|
||||
|
||||
/// \brief Generate a shuffle mask to be used in a reduction tree.
|
||||
///
|
||||
/// \param VecLen The length of the vector to be reduced.
|
||||
/// \param NumEltsToRdx The number of elements that should be reduced in the
|
||||
/// vector.
|
||||
/// \param IsPairwise Whether the reduction is a pairwise or splitting
|
||||
/// reduction. A pairwise reduction will generate a mask of
|
||||
/// <0,2,...> or <1,3,..> while a splitting reduction will generate
|
||||
/// <2,3, undef,undef> for a vector of 4 and NumElts = 2.
|
||||
/// \param IsLeft True will generate a mask of even elements, odd otherwise.
|
||||
static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
|
||||
bool IsPairwise, bool IsLeft,
|
||||
IRBuilder<> &Builder) {
|
||||
assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");
|
||||
|
||||
SmallVector<Constant *, 32> ShuffleMask(
|
||||
VecLen, UndefValue::get(Builder.getInt32Ty()));
|
||||
|
||||
if (IsPairwise)
|
||||
// Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
|
||||
for (unsigned i = 0; i != NumEltsToRdx; ++i)
|
||||
ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft);
|
||||
else
|
||||
// Move the upper half of the vector to the lower half.
|
||||
for (unsigned i = 0; i != NumEltsToRdx; ++i)
|
||||
ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i);
|
||||
|
||||
return ConstantVector::get(ShuffleMask);
|
||||
}
|
||||
|
||||
|
||||
/// Model horizontal reductions.
|
||||
///
|
||||
/// A horizontal reduction is a tree of reduction operations (currently add and
|
||||
/// fadd) that has operations that can be put into a vector as its leaf.
|
||||
/// For example, this tree:
|
||||
///
|
||||
/// mul mul mul mul
|
||||
/// \ / \ /
|
||||
/// + +
|
||||
/// \ /
|
||||
/// +
|
||||
/// This tree has "mul" as its reduced values and "+" as its reduction
|
||||
/// operations. A reduction might be feeding into a store or a binary operation
|
||||
/// feeding a phi.
|
||||
/// ...
|
||||
/// \ /
|
||||
/// +
|
||||
/// \
|
||||
/// phi +=
|
||||
///
|
||||
/// Or:
|
||||
/// ...
|
||||
/// \ /
|
||||
/// +
|
||||
/// \
|
||||
/// *p =
|
||||
///
|
||||
class HorizontalReduction {
|
||||
SmallPtrSet<Value *, 16> ReductionOps;
|
||||
SmallVector<Value *, 32> ReducedVals;
|
||||
|
||||
BinaryOperator *ReductionRoot;
|
||||
PHINode *ReductionPHI;
|
||||
|
||||
/// The opcode of the reduction.
|
||||
unsigned ReductionOpcode;
|
||||
/// The opcode of the values we perform a reduction on.
|
||||
unsigned ReducedValueOpcode;
|
||||
/// The width of one full horizontal reduction operation.
|
||||
unsigned ReduxWidth;
|
||||
/// Should we model this reduction as a pairwise reduction tree or a tree that
|
||||
/// splits the vector in halves and adds those halves.
|
||||
bool IsPairwiseReduction;
|
||||
|
||||
public:
|
||||
HorizontalReduction()
|
||||
: ReductionRoot(0), ReductionPHI(0), ReductionOpcode(0),
|
||||
ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {}
|
||||
|
||||
/// \brief Try to find a reduction tree.
|
||||
bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B,
|
||||
DataLayout *DL) {
|
||||
assert((!Phi ||
|
||||
std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) &&
|
||||
"Thi phi needs to use the binary operator");
|
||||
|
||||
// We could have a initial reductions that is not an add.
|
||||
// r *= v1 + v2 + v3 + v4
|
||||
// In such a case start looking for a tree rooted in the first '+'.
|
||||
if (Phi) {
|
||||
if (B->getOperand(0) == Phi) {
|
||||
Phi = 0;
|
||||
B = dyn_cast<BinaryOperator>(B->getOperand(1));
|
||||
} else if (B->getOperand(1) == Phi) {
|
||||
Phi = 0;
|
||||
B = dyn_cast<BinaryOperator>(B->getOperand(0));
|
||||
}
|
||||
}
|
||||
|
||||
if (!B)
|
||||
return false;
|
||||
|
||||
Type *Ty = B->getType();
|
||||
if (Ty->isVectorTy())
|
||||
return false;
|
||||
|
||||
ReductionOpcode = B->getOpcode();
|
||||
ReducedValueOpcode = 0;
|
||||
ReduxWidth = MinVecRegSize / DL->getTypeSizeInBits(Ty);
|
||||
ReductionRoot = B;
|
||||
ReductionPHI = Phi;
|
||||
|
||||
if (ReduxWidth < 4)
|
||||
return false;
|
||||
|
||||
// We currently only support adds.
|
||||
if (ReductionOpcode != Instruction::Add &&
|
||||
ReductionOpcode != Instruction::FAdd)
|
||||
return false;
|
||||
|
||||
// Post order traverse the reduction tree starting at B. We only handle true
|
||||
// trees containing only binary operators.
|
||||
SmallVector<std::pair<BinaryOperator *, unsigned>, 32> Stack;
|
||||
Stack.push_back(std::make_pair(B, 0));
|
||||
while (!Stack.empty()) {
|
||||
BinaryOperator *TreeN = Stack.back().first;
|
||||
unsigned EdgeToVist = Stack.back().second++;
|
||||
bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;
|
||||
|
||||
// Only handle trees in the current basic block.
|
||||
if (TreeN->getParent() != B->getParent())
|
||||
return false;
|
||||
|
||||
// Each tree node needs to have one user except for the ultimate
|
||||
// reduction.
|
||||
if (!TreeN->hasOneUse() && TreeN != B)
|
||||
return false;
|
||||
|
||||
// Postorder vist.
|
||||
if (EdgeToVist == 2 || IsReducedValue) {
|
||||
if (IsReducedValue) {
|
||||
// Make sure that the opcodes of the operations that we are going to
|
||||
// reduce match.
|
||||
if (!ReducedValueOpcode)
|
||||
ReducedValueOpcode = TreeN->getOpcode();
|
||||
else if (ReducedValueOpcode != TreeN->getOpcode())
|
||||
return false;
|
||||
ReducedVals.push_back(TreeN);
|
||||
} else {
|
||||
// We need to be able to reassociate the adds.
|
||||
if (!TreeN->isAssociative())
|
||||
return false;
|
||||
ReductionOps.insert(TreeN);
|
||||
}
|
||||
// Retract.
|
||||
Stack.pop_back();
|
||||
continue;
|
||||
}
|
||||
|
||||
// Visit left or right.
|
||||
Value *NextV = TreeN->getOperand(EdgeToVist);
|
||||
BinaryOperator *Next = dyn_cast<BinaryOperator>(NextV);
|
||||
if (Next)
|
||||
Stack.push_back(std::make_pair(Next, 0));
|
||||
else if (NextV != Phi)
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/// \brief Attempt to vectorize the tree found by
|
||||
/// matchAssociativeReduction.
|
||||
bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
|
||||
if (ReducedVals.empty())
|
||||
return false;
|
||||
|
||||
unsigned NumReducedVals = ReducedVals.size();
|
||||
if (NumReducedVals < ReduxWidth)
|
||||
return false;
|
||||
|
||||
Value *VectorizedTree = 0;
|
||||
IRBuilder<> Builder(ReductionRoot);
|
||||
FastMathFlags Unsafe;
|
||||
Unsafe.setUnsafeAlgebra();
|
||||
Builder.SetFastMathFlags(Unsafe);
|
||||
unsigned i = 0;
|
||||
|
||||
for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
|
||||
ArrayRef<Value *> ValsToReduce(&ReducedVals[i], ReduxWidth);
|
||||
V.buildTree(ValsToReduce, &ReductionOps);
|
||||
|
||||
// Estimate cost.
|
||||
int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
|
||||
if (Cost >= -SLPCostThreshold)
|
||||
break;
|
||||
|
||||
DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
|
||||
<< ". (HorRdx)\n");
|
||||
|
||||
// Vectorize a tree.
|
||||
Value *VectorizedRoot;
|
||||
BoUpSLP::ValueList *Scalars;
|
||||
tie(VectorizedRoot, Scalars) = V.vectorizeTree();
|
||||
|
||||
// Emit a reduction.
|
||||
Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder);
|
||||
if (VectorizedTree) {
|
||||
Builder.SetCurrentDebugLocation(
|
||||
cast<Instruction>((*Scalars)[0])->getDebugLoc());
|
||||
VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
|
||||
ReducedSubTree, "bin.rdx");
|
||||
} else
|
||||
VectorizedTree = ReducedSubTree;
|
||||
}
|
||||
|
||||
if (VectorizedTree) {
|
||||
// Finish the reduction.
|
||||
for (; i < NumReducedVals; ++i) {
|
||||
Builder.SetCurrentDebugLocation(
|
||||
cast<Instruction>(ReducedVals[i])->getDebugLoc());
|
||||
VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
|
||||
ReducedVals[i]);
|
||||
}
|
||||
// Update users.
|
||||
if (ReductionPHI) {
|
||||
assert(ReductionRoot != NULL && "Need a reduction operation");
|
||||
ReductionRoot->setOperand(0, VectorizedTree);
|
||||
ReductionRoot->setOperand(1, ReductionPHI);
|
||||
} else
|
||||
ReductionRoot->replaceAllUsesWith(VectorizedTree);
|
||||
}
|
||||
return VectorizedTree != 0;
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
/// \brief Calcuate the cost of a reduction.
|
||||
int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) {
|
||||
Type *ScalarTy = FirstReducedVal->getType();
|
||||
Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
|
||||
|
||||
int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true);
|
||||
int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false);
|
||||
|
||||
IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
|
||||
int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
|
||||
|
||||
int ScalarReduxCost =
|
||||
ReduxWidth * TTI->getArithmeticInstrCost(ReductionOpcode, VecTy);
|
||||
|
||||
DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
|
||||
<< " for reduction that starts with " << *FirstReducedVal
|
||||
<< " (It is a "
|
||||
<< (IsPairwiseReduction ? "pairwise" : "splitting")
|
||||
<< " reduction)\n");
|
||||
|
||||
return VecReduxCost - ScalarReduxCost;
|
||||
}
|
||||
|
||||
static Value *createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value *L,
|
||||
Value *R, const Twine &Name = "") {
|
||||
if (Opcode == Instruction::FAdd)
|
||||
return Builder.CreateFAdd(L, R, Name);
|
||||
return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name);
|
||||
}
|
||||
|
||||
/// \brief Emit a horizontal reduction of the vectorized value.
|
||||
Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) {
|
||||
assert(VectorizedValue && "Need to have a vectorized tree node");
|
||||
Instruction *ValToReduce = dyn_cast<Instruction>(VectorizedValue);
|
||||
assert(isPowerOf2_32(ReduxWidth) &&
|
||||
"We only handle power-of-two reductions for now");
|
||||
|
||||
SmallVector<Constant *, 32> ShuffleMask(ReduxWidth, 0);
|
||||
Value *TmpVec = ValToReduce;
|
||||
for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
|
||||
if (IsPairwiseReduction) {
|
||||
Value *LeftMask =
|
||||
createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
|
||||
Value *RightMask =
|
||||
createRdxShuffleMask(ReduxWidth, i, true, false, Builder);
|
||||
|
||||
Value *LeftShuf = Builder.CreateShuffleVector(
|
||||
TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
|
||||
Value *RightShuf = Builder.CreateShuffleVector(
|
||||
TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
|
||||
"rdx.shuf.r");
|
||||
TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf,
|
||||
"bin.rdx");
|
||||
} else {
|
||||
Value *UpperHalf =
|
||||
createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
|
||||
Value *Shuf = Builder.CreateShuffleVector(
|
||||
TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
|
||||
TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx");
|
||||
}
|
||||
}
|
||||
|
||||
// The result is in the first element of the vector.
|
||||
return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
|
||||
}
|
||||
};
|
||||
|
||||
/// \brief Recognize construction of vectors like
|
||||
/// %ra = insertelement <4 x float> undef, float %s0, i32 0
|
||||
/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
|
||||
|
@ -2312,17 +1981,6 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
|
|||
if (!BI)
|
||||
continue;
|
||||
|
||||
// Try to match and vectorize a horizontal reduction.
|
||||
HorizontalReduction HorRdx;
|
||||
if (ShouldVectorizeHor &&
|
||||
HorRdx.matchAssociativeReduction(P, BI, DL) &&
|
||||
HorRdx.tryToReduce(R, TTI)) {
|
||||
Changed = true;
|
||||
it = BB->begin();
|
||||
e = BB->end();
|
||||
continue;
|
||||
}
|
||||
|
||||
Value *Inst = BI->getOperand(0);
|
||||
if (Inst == P)
|
||||
Inst = BI->getOperand(1);
|
||||
|
@ -2333,28 +1991,10 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
|
|||
Changed = true;
|
||||
it = BB->begin();
|
||||
e = BB->end();
|
||||
continue;
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Try to vectorize horizontal reductions feeding into a store.
|
||||
if (StoreInst *SI = dyn_cast<StoreInst>(it))
|
||||
if (BinaryOperator *BinOp =
|
||||
dyn_cast<BinaryOperator>(SI->getValueOperand())) {
|
||||
HorizontalReduction HorRdx;
|
||||
if (ShouldVectorizeHor &&
|
||||
((HorRdx.matchAssociativeReduction(0, BinOp, DL) &&
|
||||
HorRdx.tryToReduce(R, TTI)) ||
|
||||
tryToVectorize(BinOp, R))) {
|
||||
Changed = true;
|
||||
it = BB->begin();
|
||||
e = BB->end();
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// Try to vectorize trees that start at compare instructions.
|
||||
if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
|
||||
if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
|
||||
|
|
|
@ -1,415 +0,0 @@
|
|||
; RUN: opt -slp-vectorizer -slp-vectorize-hor -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s
|
||||
|
||||
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
|
||||
|
||||
; #include <stdint.h>
|
||||
;
|
||||
; int foo(float *A, int n) {
|
||||
; float sum = 0;
|
||||
; for (intptr_t i=0; i < n; ++i) {
|
||||
; sum += 7*A[i*4 ] +
|
||||
; 7*A[i*4+1] +
|
||||
; 7*A[i*4+2] +
|
||||
; 7*A[i*4+3];
|
||||
; }
|
||||
; return sum;
|
||||
; }
|
||||
|
||||
; CHECK-LABEL: add_red
|
||||
; CHECK: fmul <4 x float>
|
||||
; CHECK: shufflevector <4 x float>
|
||||
|
||||
define i32 @add_red(float* %A, i32 %n) {
|
||||
entry:
|
||||
%cmp31 = icmp sgt i32 %n, 0
|
||||
br i1 %cmp31, label %for.body.lr.ph, label %for.end
|
||||
|
||||
for.body.lr.ph:
|
||||
%0 = sext i32 %n to i64
|
||||
br label %for.body
|
||||
|
||||
for.body:
|
||||
%i.033 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
|
||||
%sum.032 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add17, %for.body ]
|
||||
%mul = shl nsw i64 %i.033, 2
|
||||
%arrayidx = getelementptr inbounds float* %A, i64 %mul
|
||||
%1 = load float* %arrayidx, align 4
|
||||
%mul2 = fmul float %1, 7.000000e+00
|
||||
%add28 = or i64 %mul, 1
|
||||
%arrayidx4 = getelementptr inbounds float* %A, i64 %add28
|
||||
%2 = load float* %arrayidx4, align 4
|
||||
%mul5 = fmul float %2, 7.000000e+00
|
||||
%add6 = fadd fast float %mul2, %mul5
|
||||
%add829 = or i64 %mul, 2
|
||||
%arrayidx9 = getelementptr inbounds float* %A, i64 %add829
|
||||
%3 = load float* %arrayidx9, align 4
|
||||
%mul10 = fmul float %3, 7.000000e+00
|
||||
%add11 = fadd fast float %add6, %mul10
|
||||
%add1330 = or i64 %mul, 3
|
||||
%arrayidx14 = getelementptr inbounds float* %A, i64 %add1330
|
||||
%4 = load float* %arrayidx14, align 4
|
||||
%mul15 = fmul float %4, 7.000000e+00
|
||||
%add16 = fadd fast float %add11, %mul15
|
||||
%add17 = fadd fast float %sum.032, %add16
|
||||
%inc = add nsw i64 %i.033, 1
|
||||
%exitcond = icmp eq i64 %inc, %0
|
||||
br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body
|
||||
|
||||
for.cond.for.end_crit_edge:
|
||||
%phitmp = fptosi float %add17 to i32
|
||||
br label %for.end
|
||||
|
||||
for.end:
|
||||
%sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
|
||||
ret i32 %sum.0.lcssa
|
||||
}
|
||||
|
||||
; int foo(float * restrict A, float * restrict B, int n) {
|
||||
; float sum = 0;
|
||||
; for (intptr_t i=0; i < n; ++i) {
|
||||
; sum *= B[0]*A[i*4 ] +
|
||||
; B[1]*A[i*4+1] +
|
||||
; B[2]*A[i*4+2] +
|
||||
; B[3]*A[i*4+3];
|
||||
; }
|
||||
; return sum;
|
||||
; }
|
||||
|
||||
; CHECK-LABEL: mul_red
|
||||
; CHECK: fmul <4 x float>
|
||||
; CHECK: shufflevector <4 x float>
|
||||
|
||||
define i32 @mul_red(float* noalias %A, float* noalias %B, i32 %n) {
|
||||
entry:
|
||||
%cmp38 = icmp sgt i32 %n, 0
|
||||
br i1 %cmp38, label %for.body.lr.ph, label %for.end
|
||||
|
||||
for.body.lr.ph:
|
||||
%0 = load float* %B, align 4
|
||||
%arrayidx4 = getelementptr inbounds float* %B, i64 1
|
||||
%1 = load float* %arrayidx4, align 4
|
||||
%arrayidx9 = getelementptr inbounds float* %B, i64 2
|
||||
%2 = load float* %arrayidx9, align 4
|
||||
%arrayidx15 = getelementptr inbounds float* %B, i64 3
|
||||
%3 = load float* %arrayidx15, align 4
|
||||
%4 = sext i32 %n to i64
|
||||
br label %for.body
|
||||
|
||||
for.body:
|
||||
%i.040 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
|
||||
%sum.039 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %mul21, %for.body ]
|
||||
%mul = shl nsw i64 %i.040, 2
|
||||
%arrayidx2 = getelementptr inbounds float* %A, i64 %mul
|
||||
%5 = load float* %arrayidx2, align 4
|
||||
%mul3 = fmul float %0, %5
|
||||
%add35 = or i64 %mul, 1
|
||||
%arrayidx6 = getelementptr inbounds float* %A, i64 %add35
|
||||
%6 = load float* %arrayidx6, align 4
|
||||
%mul7 = fmul float %1, %6
|
||||
%add8 = fadd fast float %mul3, %mul7
|
||||
%add1136 = or i64 %mul, 2
|
||||
%arrayidx12 = getelementptr inbounds float* %A, i64 %add1136
|
||||
%7 = load float* %arrayidx12, align 4
|
||||
%mul13 = fmul float %2, %7
|
||||
%add14 = fadd fast float %add8, %mul13
|
||||
%add1737 = or i64 %mul, 3
|
||||
%arrayidx18 = getelementptr inbounds float* %A, i64 %add1737
|
||||
%8 = load float* %arrayidx18, align 4
|
||||
%mul19 = fmul float %3, %8
|
||||
%add20 = fadd fast float %add14, %mul19
|
||||
%mul21 = fmul float %sum.039, %add20
|
||||
%inc = add nsw i64 %i.040, 1
|
||||
%exitcond = icmp eq i64 %inc, %4
|
||||
br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body
|
||||
|
||||
for.cond.for.end_crit_edge:
|
||||
%phitmp = fptosi float %mul21 to i32
|
||||
br label %for.end
|
||||
|
||||
for.end:
|
||||
%sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
|
||||
ret i32 %sum.0.lcssa
|
||||
}
|
||||
|
||||
; int foo(float * restrict A, float * restrict B, int n) {
|
||||
; float sum = 0;
|
||||
; for (intptr_t i=0; i < n; ++i) {
|
||||
; sum += B[0]*A[i*6 ] +
|
||||
; B[1]*A[i*6+1] +
|
||||
; B[2]*A[i*6+2] +
|
||||
; B[3]*A[i*6+3] +
|
||||
; B[4]*A[i*6+4] +
|
||||
; B[5]*A[i*6+5] +
|
||||
; B[6]*A[i*6+6] +
|
||||
; B[7]*A[i*6+7] +
|
||||
; B[8]*A[i*6+8];
|
||||
; }
|
||||
; return sum;
|
||||
; }
|
||||
|
||||
; CHECK-LABEL: long_red
|
||||
; CHECK: fmul <4 x float>
|
||||
; CHECK: shufflevector <4 x float>
|
||||
|
||||
define i32 @long_red(float* noalias %A, float* noalias %B, i32 %n) {
|
||||
entry:
|
||||
%cmp81 = icmp sgt i32 %n, 0
|
||||
br i1 %cmp81, label %for.body.lr.ph, label %for.end
|
||||
|
||||
for.body.lr.ph:
|
||||
%0 = load float* %B, align 4
|
||||
%arrayidx4 = getelementptr inbounds float* %B, i64 1
|
||||
%1 = load float* %arrayidx4, align 4
|
||||
%arrayidx9 = getelementptr inbounds float* %B, i64 2
|
||||
%2 = load float* %arrayidx9, align 4
|
||||
%arrayidx15 = getelementptr inbounds float* %B, i64 3
|
||||
%3 = load float* %arrayidx15, align 4
|
||||
%arrayidx21 = getelementptr inbounds float* %B, i64 4
|
||||
%4 = load float* %arrayidx21, align 4
|
||||
%arrayidx27 = getelementptr inbounds float* %B, i64 5
|
||||
%5 = load float* %arrayidx27, align 4
|
||||
%arrayidx33 = getelementptr inbounds float* %B, i64 6
|
||||
%6 = load float* %arrayidx33, align 4
|
||||
%arrayidx39 = getelementptr inbounds float* %B, i64 7
|
||||
%7 = load float* %arrayidx39, align 4
|
||||
%arrayidx45 = getelementptr inbounds float* %B, i64 8
|
||||
%8 = load float* %arrayidx45, align 4
|
||||
%9 = sext i32 %n to i64
|
||||
br label %for.body
|
||||
|
||||
for.body:
|
||||
%i.083 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
|
||||
%sum.082 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add51, %for.body ]
|
||||
%mul = mul nsw i64 %i.083, 6
|
||||
%arrayidx2 = getelementptr inbounds float* %A, i64 %mul
|
||||
%10 = load float* %arrayidx2, align 4
|
||||
%mul3 = fmul fast float %0, %10
|
||||
%add80 = or i64 %mul, 1
|
||||
%arrayidx6 = getelementptr inbounds float* %A, i64 %add80
|
||||
%11 = load float* %arrayidx6, align 4
|
||||
%mul7 = fmul fast float %1, %11
|
||||
%add8 = fadd fast float %mul3, %mul7
|
||||
%add11 = add nsw i64 %mul, 2
|
||||
%arrayidx12 = getelementptr inbounds float* %A, i64 %add11
|
||||
%12 = load float* %arrayidx12, align 4
|
||||
%mul13 = fmul fast float %2, %12
|
||||
%add14 = fadd fast float %add8, %mul13
|
||||
%add17 = add nsw i64 %mul, 3
|
||||
%arrayidx18 = getelementptr inbounds float* %A, i64 %add17
|
||||
%13 = load float* %arrayidx18, align 4
|
||||
%mul19 = fmul fast float %3, %13
|
||||
%add20 = fadd fast float %add14, %mul19
|
||||
%add23 = add nsw i64 %mul, 4
|
||||
%arrayidx24 = getelementptr inbounds float* %A, i64 %add23
|
||||
%14 = load float* %arrayidx24, align 4
|
||||
%mul25 = fmul fast float %4, %14
|
||||
%add26 = fadd fast float %add20, %mul25
|
||||
%add29 = add nsw i64 %mul, 5
|
||||
%arrayidx30 = getelementptr inbounds float* %A, i64 %add29
|
||||
%15 = load float* %arrayidx30, align 4
|
||||
%mul31 = fmul fast float %5, %15
|
||||
%add32 = fadd fast float %add26, %mul31
|
||||
%add35 = add nsw i64 %mul, 6
|
||||
%arrayidx36 = getelementptr inbounds float* %A, i64 %add35
|
||||
%16 = load float* %arrayidx36, align 4
|
||||
%mul37 = fmul fast float %6, %16
|
||||
%add38 = fadd fast float %add32, %mul37
|
||||
%add41 = add nsw i64 %mul, 7
|
||||
%arrayidx42 = getelementptr inbounds float* %A, i64 %add41
|
||||
%17 = load float* %arrayidx42, align 4
|
||||
%mul43 = fmul fast float %7, %17
|
||||
%add44 = fadd fast float %add38, %mul43
|
||||
%add47 = add nsw i64 %mul, 8
|
||||
%arrayidx48 = getelementptr inbounds float* %A, i64 %add47
|
||||
%18 = load float* %arrayidx48, align 4
|
||||
%mul49 = fmul fast float %8, %18
|
||||
%add50 = fadd fast float %add44, %mul49
|
||||
%add51 = fadd fast float %sum.082, %add50
|
||||
%inc = add nsw i64 %i.083, 1
|
||||
%exitcond = icmp eq i64 %inc, %9
|
||||
br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body
|
||||
|
||||
for.cond.for.end_crit_edge:
|
||||
%phitmp = fptosi float %add51 to i32
|
||||
br label %for.end
|
||||
|
||||
for.end:
|
||||
%sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
|
||||
ret i32 %sum.0.lcssa
|
||||
}
|
||||
|
||||
; int foo(float * restrict A, float * restrict B, int n) {
|
||||
; float sum = 0;
|
||||
; for (intptr_t i=0; i < n; ++i) {
|
||||
; sum += B[0]*A[i*4 ];
|
||||
; sum += B[1]*A[i*4+1];
|
||||
; sum += B[2]*A[i*4+2];
|
||||
; sum += B[3]*A[i*4+3];
|
||||
; }
|
||||
; return sum;
|
||||
; }
|
||||
|
||||
; CHECK-LABEL: chain_red
|
||||
; CHECK: fmul <4 x float>
|
||||
; CHECK: shufflevector <4 x float>
|
||||
|
||||
define i32 @chain_red(float* noalias %A, float* noalias %B, i32 %n) {
|
||||
entry:
|
||||
%cmp41 = icmp sgt i32 %n, 0
|
||||
br i1 %cmp41, label %for.body.lr.ph, label %for.end
|
||||
|
||||
for.body.lr.ph:
|
||||
%0 = load float* %B, align 4
|
||||
%arrayidx4 = getelementptr inbounds float* %B, i64 1
|
||||
%1 = load float* %arrayidx4, align 4
|
||||
%arrayidx10 = getelementptr inbounds float* %B, i64 2
|
||||
%2 = load float* %arrayidx10, align 4
|
||||
%arrayidx16 = getelementptr inbounds float* %B, i64 3
|
||||
%3 = load float* %arrayidx16, align 4
|
||||
%4 = sext i32 %n to i64
|
||||
br label %for.body
|
||||
|
||||
for.body:
|
||||
%i.043 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
|
||||
%sum.042 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add21, %for.body ]
|
||||
%mul = shl nsw i64 %i.043, 2
|
||||
%arrayidx2 = getelementptr inbounds float* %A, i64 %mul
|
||||
%5 = load float* %arrayidx2, align 4
|
||||
%mul3 = fmul fast float %0, %5
|
||||
%add = fadd fast float %sum.042, %mul3
|
||||
%add638 = or i64 %mul, 1
|
||||
%arrayidx7 = getelementptr inbounds float* %A, i64 %add638
|
||||
%6 = load float* %arrayidx7, align 4
|
||||
%mul8 = fmul fast float %1, %6
|
||||
%add9 = fadd fast float %add, %mul8
|
||||
%add1239 = or i64 %mul, 2
|
||||
%arrayidx13 = getelementptr inbounds float* %A, i64 %add1239
|
||||
%7 = load float* %arrayidx13, align 4
|
||||
%mul14 = fmul fast float %2, %7
|
||||
%add15 = fadd fast float %add9, %mul14
|
||||
%add1840 = or i64 %mul, 3
|
||||
%arrayidx19 = getelementptr inbounds float* %A, i64 %add1840
|
||||
%8 = load float* %arrayidx19, align 4
|
||||
%mul20 = fmul fast float %3, %8
|
||||
%add21 = fadd fast float %add15, %mul20
|
||||
%inc = add nsw i64 %i.043, 1
|
||||
%exitcond = icmp eq i64 %inc, %4
|
||||
br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body
|
||||
|
||||
for.cond.for.end_crit_edge:
|
||||
%phitmp = fptosi float %add21 to i32
|
||||
br label %for.end
|
||||
|
||||
for.end:
|
||||
%sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
|
||||
ret i32 %sum.0.lcssa
|
||||
}
|
||||
|
||||
; int foo(float * restrict A, float * restrict B, float * restrict C, int n) {
|
||||
; float sum = 0;
|
||||
; for (intptr_t i=0; i < n; ++i) {
|
||||
; C[i] = B[0] *A[i*4 ] +
|
||||
; B[1] *A[i*4+1] +
|
||||
; B[2] *A[i*4+2] +
|
||||
; B[3] *A[i*4+3];
|
||||
; }
|
||||
; return sum;
|
||||
; }
|
||||
|
||||
; CHECK-LABEL: store_red
|
||||
; CHECK: fmul <4 x float>
|
||||
; CHECK: shufflevector <4 x float>
|
||||
|
||||
define i32 @store_red(float* noalias %A, float* noalias %B, float* noalias %C, i32 %n) {
|
||||
entry:
|
||||
%cmp37 = icmp sgt i32 %n, 0
|
||||
br i1 %cmp37, label %for.body.lr.ph, label %for.end
|
||||
|
||||
for.body.lr.ph:
|
||||
%arrayidx4 = getelementptr inbounds float* %B, i64 1
|
||||
%arrayidx9 = getelementptr inbounds float* %B, i64 2
|
||||
%arrayidx15 = getelementptr inbounds float* %B, i64 3
|
||||
%0 = sext i32 %n to i64
|
||||
br label %for.body
|
||||
|
||||
for.body:
|
||||
%i.039 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
|
||||
%C.addr.038 = phi float* [ %C, %for.body.lr.ph ], [ %incdec.ptr, %for.body ]
|
||||
%1 = load float* %B, align 4
|
||||
%mul = shl nsw i64 %i.039, 2
|
||||
%arrayidx2 = getelementptr inbounds float* %A, i64 %mul
|
||||
%2 = load float* %arrayidx2, align 4
|
||||
%mul3 = fmul fast float %1, %2
|
||||
%3 = load float* %arrayidx4, align 4
|
||||
%add34 = or i64 %mul, 1
|
||||
%arrayidx6 = getelementptr inbounds float* %A, i64 %add34
|
||||
%4 = load float* %arrayidx6, align 4
|
||||
%mul7 = fmul fast float %3, %4
|
||||
%add8 = fadd fast float %mul3, %mul7
|
||||
%5 = load float* %arrayidx9, align 4
|
||||
%add1135 = or i64 %mul, 2
|
||||
%arrayidx12 = getelementptr inbounds float* %A, i64 %add1135
|
||||
%6 = load float* %arrayidx12, align 4
|
||||
%mul13 = fmul fast float %5, %6
|
||||
%add14 = fadd fast float %add8, %mul13
|
||||
%7 = load float* %arrayidx15, align 4
|
||||
%add1736 = or i64 %mul, 3
|
||||
%arrayidx18 = getelementptr inbounds float* %A, i64 %add1736
|
||||
%8 = load float* %arrayidx18, align 4
|
||||
%mul19 = fmul fast float %7, %8
|
||||
%add20 = fadd fast float %add14, %mul19
|
||||
store float %add20, float* %C.addr.038, align 4
|
||||
%incdec.ptr = getelementptr inbounds float* %C.addr.038, i64 1
|
||||
%inc = add nsw i64 %i.039, 1
|
||||
%exitcond = icmp eq i64 %inc, %0
|
||||
br i1 %exitcond, label %for.end, label %for.body
|
||||
|
||||
for.end:
|
||||
ret i32 0
|
||||
}
|
||||
|
||||
|
||||
; void foo(double * restrict A, double * restrict B, double * restrict C,
|
||||
; int n) {
|
||||
; for (intptr_t i=0; i < n; ++i) {
|
||||
; C[i] = B[0] *A[i*4 ] + B[1] *A[i*4+1];
|
||||
; }
|
||||
; }
|
||||
|
||||
; CHECK-LABEL: store_red_double
|
||||
; CHECK: fmul <2 x double>
|
||||
; CHECK: extractelement <2 x double>
|
||||
; CHECK: extractelement <2 x double>
|
||||
|
||||
define void @store_red_double(double* noalias %A, double* noalias %B, double* noalias %C, i32 %n) {
|
||||
entry:
|
||||
%cmp17 = icmp sgt i32 %n, 0
|
||||
br i1 %cmp17, label %for.body.lr.ph, label %for.end
|
||||
|
||||
for.body.lr.ph:
|
||||
%0 = load double* %B, align 8
|
||||
%arrayidx4 = getelementptr inbounds double* %B, i64 1
|
||||
%1 = load double* %arrayidx4, align 8
|
||||
%2 = sext i32 %n to i64
|
||||
br label %for.body
|
||||
|
||||
for.body:
|
||||
%i.018 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
|
||||
%mul = shl nsw i64 %i.018, 2
|
||||
%arrayidx2 = getelementptr inbounds double* %A, i64 %mul
|
||||
%3 = load double* %arrayidx2, align 8
|
||||
%mul3 = fmul fast double %0, %3
|
||||
%add16 = or i64 %mul, 1
|
||||
%arrayidx6 = getelementptr inbounds double* %A, i64 %add16
|
||||
%4 = load double* %arrayidx6, align 8
|
||||
%mul7 = fmul fast double %1, %4
|
||||
%add8 = fadd fast double %mul3, %mul7
|
||||
%arrayidx9 = getelementptr inbounds double* %C, i64 %i.018
|
||||
store double %add8, double* %arrayidx9, align 8
|
||||
%inc = add nsw i64 %i.018, 1
|
||||
%exitcond = icmp eq i64 %inc, %2
|
||||
br i1 %exitcond, label %for.end, label %for.body
|
||||
|
||||
for.end:
|
||||
ret void
|
||||
}
|
Loading…
Reference in New Issue