SLPVectorizer: add initial support for reduction variable vectorization.

llvm-svn: 179470
This commit is contained in:
Nadav Rotem 2013-04-14 03:22:20 +00:00
parent dc1ed57858
commit 0b9cf8567b
4 changed files with 147 additions and 7 deletions

View File

@ -26,6 +26,7 @@
#include "llvm/Analysis/Verifier.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
@ -38,7 +39,7 @@
using namespace llvm;
static cl::opt<int>
SLPCostThreshold("slp-threshold", cl::init(1), cl::Hidden,
SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
cl::desc("Only vectorize trees if the gain is above this "
"number. (gain = -cost of vectorization)"));
namespace {
@ -63,7 +64,7 @@ struct SLPVectorizer : public BasicBlockPass {
/// object. We sort the stores to their base objects to reduce the cost of the
/// quadratic search on the stores. TODO: We can further reduce this cost
/// if we flush the chain creation every time we run into a memory barrier.
bool CollectStores(BasicBlock *BB, BoUpSLP &R) {
bool collectStores(BasicBlock *BB, BoUpSLP &R) {
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
StoreInst *SI = dyn_cast<StoreInst>(it);
if (!SI)
@ -84,7 +85,79 @@ struct SLPVectorizer : public BasicBlockPass {
return true;
bool RollStoreChains(BoUpSLP &R) {
/// Cost-gated helper for a candidate pair of binary operators.
/// \returns false when either \p A or \p B is null, or when the cost reported
/// by R.getTreeCost is not below -SLPCostThreshold; returns true otherwise.
bool tryToVectorizePair(BinaryOperator *A, BinaryOperator *B, BoUpSLP &R) {
// Callers pass dyn_cast results, so either operand may legitimately be null.
if (!A || !B) return false;
// NOTE(review): in the lines visible here VL is handed to getTreeCost without
// A or B ever being added to it, and nothing is rewritten before the final
// 'return true' -- confirm against the full source that no statements were lost.
BoUpSLP::ValueList VL;
int Cost = R.getTreeCost(VL);
DEBUG(dbgs()<<"SLP: Cost of pair:" << Cost << ".\n");
// The option text defines gain as -cost, so bail out unless the gain is
// strictly above the threshold.
if (Cost >= -SLPCostThreshold) return false;
DEBUG(dbgs()<<"SLP: Vectorizing pair.\n");
return true;
/// Search the operands of \p V for a pair accepted by tryToVectorizePair.
/// The two operands of \p V are tried first. If that fails, each operand that
/// is a single-use binary operator is looked through, and its own operands
/// are paired with the other operand of \p V.
/// \returns true as soon as one tryToVectorizePair call succeeds; false when
/// \p V is null or no pair is accepted.
bool tryToVectorizeCandidate(BinaryOperator *V, BoUpSLP &R) {
if (!V) return false;
// Either cast may yield null; tryToVectorizePair rejects null operands.
BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0));
BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1));
// Try to vectorize V.
if (tryToVectorizePair(A, B, R)) return true;
// Try to skip B.
// Only look through B when V is its sole user.
if (B && B->hasOneUse()) {
BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
if (tryToVectorizePair(A, B0, R)) {
return true;
if (tryToVectorizePair(A, B1, R)) {
return true;
// Try to skip A.
// Mirror image of the case above, with the roles of A and B swapped.
if (A && A->hasOneUse()) {
BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
if (tryToVectorizePair(A0, B, R)) {
return true;
if (tryToVectorizePair(A1, B, R)) {
return true;
// No pair was accepted; the integer 0 converts to false.
return 0;
/// Walk the PHI nodes at the start of \p BB and, for each two-input PHI whose
/// value coming from \p BB itself is a binary operator, hand the non-PHI
/// operand of that operator to tryToVectorizeCandidate.
/// \returns true if any tryToVectorizeCandidate call returned true.
bool vectorizeReductions(BasicBlock *BB, BoUpSLP &R) {
bool Changed = false;
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
// Debug intrinsics are skipped rather than treated as the end of the PHIs.
if (isa<DbgInfoIntrinsic>(it)) continue;
// The first other non-PHI instruction ends the scan.
PHINode *P = dyn_cast<PHINode>(it);
if (!P) return Changed;
// Check that the PHI is a reduction PHI.
// Note that a PHI with a different number of inputs ends the whole scan,
// not just the handling of this PHI.
if (P->getNumIncomingValues() != 2) return Changed;
// Pick the value that flows in from BB itself (the value carried around a
// single-block loop); null when neither incoming block is BB.
Value *Rdx = (P->getIncomingBlock(0) == BB ? P->getIncomingValue(0) :
(P->getIncomingBlock(1) == BB ? P->getIncomingValue(1) : 0));
// Check if this is a Binary Operator.
BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
if (!BI) continue;
// Take the operand of BI that is not the PHI.
// NOTE(review): when operand 0 is not P, operand 1 is never checked to be P.
Value *Inst = BI->getOperand(0);
if (Inst == P) Inst = BI->getOperand(1);
// A non-BinaryOperator operand becomes null here and is rejected by the callee.
Changed |= tryToVectorizeCandidate(dyn_cast<BinaryOperator>(Inst), R);
return Changed;
bool rollStoreChains(BoUpSLP &R) {
bool Changed = false;
// Attempt to sort and vectorize each of the store-groups.
for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end();
@ -116,13 +189,15 @@ struct SLPVectorizer : public BasicBlockPass {
// the store instructions.
if (!CollectStores(&BB, R))
return false;
bool Changed = vectorizeReductions(&BB, R);
bool Changed = RollStoreChains(R);
if (Changed) {
if (!collectStores(&BB, R))
return Changed;
if (rollStoreChains(R)) {
DEBUG(dbgs()<<"SLP: vectorized in \""<<BB.getParent()->getName()<<"\"\n");
Changed |= true;
return Changed;

View File

@ -208,6 +208,16 @@ Value *BoUpSLP::isUnsafeToSink(Instruction *Src, Instruction *Dst) {
return 0;
/// Vectorize the scalars in \p Operands with vectorizeTree, then emit one
/// extractelement per lane of the resulting vector.
void BoUpSLP::vectorizeArith(ValueList &Operands) {
Value *Vec = vectorizeTree(Operands, Operands.size());
// cast<> assumes vectorizeTree returned an Instruction; the builder is
// positioned at the instruction that follows it.
BasicBlock::iterator Loc = cast<Instruction>(Vec);
IRBuilder<> Builder(++Loc);
for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
// Lane i of the vector corresponds to Operands[i].
// NOTE(review): S is not used in the lines visible here -- confirm against
// the full source how the extracted scalar replaces the original value.
Value *S = Builder.CreateExtractElement(Vec, Builder.getInt32(i));
int BoUpSLP::getTreeCost(ValueList &VL) {
// Get rid of the list of stores that were removed, and from the
// lists of instructions with multiple users.

View File

@ -66,6 +66,9 @@ struct BoUpSLP {
/// \returns true if the basic block was modified.
bool vectorizeStores(StoreList &Stores, int costThreshold);
/// \brief Vectorize a group of scalars into a vector tree.
void vectorizeArith(ValueList &Operands);
/// \brief This method contains the recursive part of getTreeCost.
int getTreeCost_rec(ValueList &VL, unsigned Depth);

View File

@ -0,0 +1,52 @@
; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
target triple = "i386-apple-macosx10.8.0"
; int foo(double *A, int n, int m) {
; double sum = 0, v1 = 2, v0 = 3;
; for (int i=0; i < n; ++i)
; sum += 7*A[i*2] + 7*A[i*2+1];
; return sum;
; }
;CHECK: reduce
;CHECK: load <2 x double>
;CHECK: fmul <2 x double>
;CHECK: ret
; Test input: a single-block loop in which %sum.014 is the reduction PHI fed
; by %add6, and %add5 adds two products loaded from A[2*i] and A[2*i+1].
define i32 @reduce(double* nocapture %A, i32 %n, i32 %m) #0 {
; Skip the loop entirely when n <= 0.
%cmp13 = icmp sgt i32 %n, 0
br i1 %cmp13, label %for.body, label %for.end
for.body: ; preds = %entry, %for.body
; Induction variable and running sum.
%i.015 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
%sum.014 = phi double [ %add6, %for.body ], [ 0.000000e+00, %entry ]
; First element: A[2*i] * 7.0 (the shift by one doubles the index).
%mul = shl nsw i32 %i.015, 1
%arrayidx = getelementptr inbounds double* %A, i32 %mul
%0 = load double* %arrayidx, align 4, !tbaa !0
%mul1 = fmul double %0, 7.000000e+00
; Second element: A[(2*i)|1] * 7.0, i.e. the adjacent odd index.
%add12 = or i32 %mul, 1
%arrayidx3 = getelementptr inbounds double* %A, i32 %add12
%1 = load double* %arrayidx3, align 4, !tbaa !0
%mul4 = fmul double %1, 7.000000e+00
; Add the two products, then accumulate into the reduction value.
%add5 = fadd double %mul1, %mul4
%add6 = fadd double %sum.014, %add5
%inc = add nsw i32 %i.015, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body
for.cond.for.end_crit_edge: ; preds = %for.body
; Convert the final sum to i32 for the return value.
%phitmp = fptosi double %add6 to i32
br label %for.end
for.end: ; preds = %for.cond.for.end_crit_edge, %entry
%sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
ret i32 %sum.0.lcssa
attributes #0 = { nounwind readonly ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
!0 = metadata !{metadata !"double", metadata !1}
!1 = metadata !{metadata !"omnipotent char", metadata !2}
!2 = metadata !{metadata !"Simple C/C++ TBAA"}