forked from OSchip/llvm-project
Add support for bottom-up SLP vectorization infrastructure.
This commit adds the infrastructure for performing bottom-up SLP vectorization (and other optimizations) on parallel computations. The infrastructure has three potential users: 1. The loop vectorizer needs to be able to vectorize AOS data structures such as (sum += A[i] + A[i+1]). 2. The BB-vectorizer needs this infrastructure for bottom-up SLP vectorization, because bottom-up vectorization is faster to compute. 3. A loop-roller needs to be able to analyze consecutive chains and roll them into a loop, in order to reduce code size. A loop roller does not need to create vector instructions, and this infrastructure separates the chain analysis from the vectorization. This patch also includes a simple (100 LOC) bottom up SLP vectorizer that uses the infrastructure, and can vectorize this code: void SAXPY(int *x, int *y, int a, int i) { x[i] = a * x[i] + y[i]; x[i+1] = a * x[i+1] + y[i+1]; x[i+2] = a * x[i+2] + y[i+2]; x[i+3] = a * x[i+3] + y[i+3]; } llvm-svn: 179117
This commit is contained in:
parent
caeddf5a96
commit
2d9dec322e
|
@ -271,6 +271,7 @@ void initializeInstSimplifierPass(PassRegistry&);
|
|||
void initializeUnpackMachineBundlesPass(PassRegistry&);
|
||||
void initializeFinalizeMachineBundlesPass(PassRegistry&);
|
||||
void initializeLoopVectorizePass(PassRegistry&);
|
||||
void initializeSLPVectorizerPass(PassRegistry&);
|
||||
void initializeBBVectorizePass(PassRegistry&);
|
||||
void initializeMachineFunctionPrinterPassPass(PassRegistry&);
|
||||
}
|
||||
|
|
|
@ -161,6 +161,7 @@ namespace {
|
|||
(void) llvm::createMemDepPrinter();
|
||||
(void) llvm::createInstructionSimplifierPass();
|
||||
(void) llvm::createLoopVectorizePass();
|
||||
(void) llvm::createSLPVectorizerPass();
|
||||
(void) llvm::createBBVectorizePass();
|
||||
|
||||
(void)new llvm::IntervalPartition();
|
||||
|
|
|
@ -116,6 +116,12 @@ createBBVectorizePass(const VectorizeConfig &C = VectorizeConfig());
|
|||
//
|
||||
Pass *createLoopVectorizePass();
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// SLPVectorizer - Create a bottom-up SLP vectorizer pass.
|
||||
//
|
||||
Pass *createSLPVectorizerPass();
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
/// @brief Vectorize the BasicBlock.
|
||||
///
|
||||
|
|
|
@ -2,6 +2,8 @@ add_llvm_library(LLVMVectorize
|
|||
BBVectorize.cpp
|
||||
Vectorize.cpp
|
||||
LoopVectorize.cpp
|
||||
SLPVectorizer.cpp
|
||||
VecUtils.cpp
|
||||
)
|
||||
|
||||
add_dependencies(LLVMVectorize intrinsics_gen)
|
||||
|
|
|
@ -0,0 +1,153 @@
|
|||
//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
|
||||
// stores that can be put together into vector-stores. Next, it attempts to
|
||||
// construct vectorizable tree using the use-def chains. If a profitable tree
|
||||
// was found, the SLP vectorizer performs vectorization on the tree.
|
||||
//
|
||||
// The pass is inspired by the work described in the paper:
|
||||
// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
#define SV_NAME "slp-vectorizer"
|
||||
#define DEBUG_TYPE SV_NAME
|
||||
|
||||
#include "VecUtils.h"
|
||||
#include "llvm/Transforms/Vectorize.h"
|
||||
#include "llvm/Analysis/AliasAnalysis.h"
|
||||
#include "llvm/Analysis/ScalarEvolution.h"
|
||||
#include "llvm/Analysis/TargetTransformInfo.h"
|
||||
#include "llvm/Analysis/Verifier.h"
|
||||
#include "llvm/IR/DataLayout.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/IR/Type.h"
|
||||
#include "llvm/IR/Value.h"
|
||||
#include "llvm/Pass.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
#include <map>
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
/// Hidden command-line knob "-slp-threshold" (default 1). The pass negates
/// this value and hands it to BoUpSLP::vectorizeStores() as the highest cost
/// at which a chain of stores is still vectorized.
static cl::opt<int> SLPCostThreshold(
    "slp-threshold",
    cl::desc("Only vectorize trees if the gain is above this "
             "number. (gain = -cost of vectorization)"),
    cl::init(1), cl::Hidden);
|
||||
namespace {
|
||||
|
||||
/// The SLPVectorizer Pass.
|
||||
struct SLPVectorizer : public BasicBlockPass {
|
||||
typedef std::map<Value*, BoUpSLP::StoreList> StoreListMap;
|
||||
|
||||
/// Pass identification, replacement for typeid
|
||||
static char ID;
|
||||
|
||||
explicit SLPVectorizer() : BasicBlockPass(ID) {
|
||||
initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
|
||||
}
|
||||
|
||||
ScalarEvolution *SE;
|
||||
DataLayout *DL;
|
||||
TargetTransformInfo *TTI;
|
||||
AliasAnalysis *AA;
|
||||
|
||||
/// \brief Collect memory references and sort them according to their base
|
||||
/// object. We sort the stores to their base objects to reduce the cost of the
|
||||
/// quadratic search on the stores. TODO: We can further reduce this cost
|
||||
/// if we flush the chain creation every time we run into a memory barrier.
|
||||
bool CollectStores(BasicBlock *BB, BoUpSLP &R) {
|
||||
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
|
||||
// Can't vectorize instructions with side effects.
|
||||
if (it->mayThrow())
|
||||
return false;
|
||||
|
||||
StoreInst *SI = dyn_cast<StoreInst>(it);
|
||||
if (!SI)
|
||||
continue;
|
||||
|
||||
// Check that the pointer points to scalars.
|
||||
if (SI->getValueOperand()->getType()->isAggregateType())
|
||||
return false;
|
||||
|
||||
// Find the base of the GEP.
|
||||
Value *Ptr = SI->getPointerOperand();
|
||||
if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr))
|
||||
Ptr = GEP->getPointerOperand();
|
||||
|
||||
// Save the store locations.
|
||||
StoreRefs[Ptr].push_back(SI);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool RollStoreChains(BoUpSLP &R) {
|
||||
bool Changed = false;
|
||||
// Attempt to sort and vectorize each of the store-groups.
|
||||
for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end();
|
||||
it != e; ++it) {
|
||||
if (it->second.size() < 2)
|
||||
continue;
|
||||
Changed |= R.vectorizeStores(it->second, -SLPCostThreshold);
|
||||
}
|
||||
return Changed;
|
||||
}
|
||||
|
||||
virtual bool runOnBasicBlock(BasicBlock &BB) {
|
||||
SE = &getAnalysis<ScalarEvolution>();
|
||||
DL = getAnalysisIfAvailable<DataLayout>();
|
||||
TTI = &getAnalysis<TargetTransformInfo>();
|
||||
AA = &getAnalysis<AliasAnalysis>();
|
||||
StoreRefs.clear();
|
||||
|
||||
// Use the bollom up slp vectorizer to construct chains that start with
|
||||
// he store instructions.
|
||||
BoUpSLP R(&BB, SE, DL, TTI, AA);
|
||||
|
||||
if (!CollectStores(&BB, R))
|
||||
return false;
|
||||
|
||||
bool Changed = RollStoreChains(R);
|
||||
if (Changed) {
|
||||
DEBUG(dbgs()<<"Rolled chains in \""<<BB.getParent()->getName()<<"\"\n");
|
||||
DEBUG(verifyFunction(*BB.getParent()));
|
||||
}
|
||||
|
||||
return Changed;
|
||||
}
|
||||
|
||||
virtual void getAnalysisUsage(AnalysisUsage &AU) const {
|
||||
BasicBlockPass::getAnalysisUsage(AU);
|
||||
AU.addRequired<ScalarEvolution>();
|
||||
AU.addRequired<AliasAnalysis>();
|
||||
AU.addRequired<TargetTransformInfo>();
|
||||
}
|
||||
|
||||
private:
|
||||
StoreListMap StoreRefs;
|
||||
};
|
||||
|
||||
} // end anonymous namespace
|
||||
|
||||
// Storage for the pass identifier declared in SLPVectorizer.
char SLPVectorizer::ID = 0;
|
||||
// Human-readable pass name handed to the registration macros below.
static const char lv_name[] = "SLP Vectorizer";
|
||||
// Register the pass under the SV_NAME command-line name together with the
// passes it depends on.
INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
|
||||
INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
|
||||
INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
|
||||
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
|
||||
// NOTE(review): LoopSimplify is listed here but is not requested in
// SLPVectorizer::getAnalysisUsage() - confirm whether it is needed.
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
|
||||
INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
|
||||
|
||||
namespace llvm {
|
||||
Pass *createSLPVectorizerPass() {
|
||||
return new SLPVectorizer();
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,439 @@
|
|||
//===- VecUtils.cpp - Vectorization Utilities -----------------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
#define DEBUG_TYPE "VecUtils"
|
||||
|
||||
#include "VecUtils.h"
|
||||
#include "llvm/ADT/DenseMap.h"
|
||||
#include "llvm/ADT/SmallPtrSet.h"
|
||||
#include "llvm/ADT/SmallSet.h"
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
#include "llvm/Analysis/AliasAnalysis.h"
|
||||
#include "llvm/Analysis/ScalarEvolution.h"
|
||||
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
|
||||
#include "llvm/Analysis/TargetTransformInfo.h"
|
||||
#include "llvm/Analysis/Verifier.h"
|
||||
#include "llvm/IR/Constants.h"
|
||||
#include "llvm/IR/DataLayout.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/IR/Type.h"
|
||||
#include "llvm/IR/Value.h"
|
||||
#include "llvm/Pass.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
#include "llvm/Target/TargetLibraryInfo.h"
|
||||
#include "llvm/Transforms/Scalar.h"
|
||||
#include "llvm/Transforms/Utils/Local.h"
|
||||
#include <algorithm>
|
||||
#include <map>
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
namespace llvm {
|
||||
|
||||
/// Bind the utility to the block \p Bb and to the analyses it consults, then
/// assign an index to every instruction of the block.
BoUpSLP::BoUpSLP(BasicBlock *Bb, ScalarEvolution *S, DataLayout *Dl,
                 TargetTransformInfo *Tti, AliasAnalysis *Aa)
    : BB(Bb),
      SE(S),
      DL(Dl),
      TTI(Tti),
      AA(Aa) {
  this->numberInstructions();
}
|
||||
|
||||
void BoUpSLP::numberInstructions() {
|
||||
int Loc = 0;
|
||||
InstrIdx.clear();
|
||||
InstrVec.clear();
|
||||
// Number the instructions in the block.
|
||||
for (BasicBlock::iterator it=BB->begin(), e=BB->end(); it != e; ++it) {
|
||||
InstrIdx[it] = Loc++;
|
||||
InstrVec.push_back(it);
|
||||
assert(InstrVec[InstrIdx[it]] == it && "Invalid allocation");
|
||||
}
|
||||
}
|
||||
|
||||
/// \returns the address operand of \p I when it is a load or a store, and
/// null for any other value.
Value *BoUpSLP::getPointerOperand(Value *I) {
  Value *Ptr = 0;
  if (isa<LoadInst>(I))
    Ptr = cast<LoadInst>(I)->getPointerOperand();
  else if (isa<StoreInst>(I))
    Ptr = cast<StoreInst>(I)->getPointerOperand();
  return Ptr;
}
|
||||
|
||||
/// \returns the address space of the pointer accessed by \p I when it is a
/// load or a store; otherwise -1 converted to unsigned.
unsigned BoUpSLP::getAddressSpaceOperand(Value *I) {
  unsigned AddrSpace = static_cast<unsigned>(-1);
  if (isa<LoadInst>(I))
    AddrSpace = cast<LoadInst>(I)->getPointerAddressSpace();
  else if (isa<StoreInst>(I))
    AddrSpace = cast<StoreInst>(I)->getPointerAddressSpace();
  return AddrSpace;
}
|
||||
|
||||
/// \returns true if \p A and \p B are loads/stores in the same address space,
/// through pointers of the same type, and the address accessed by \p B lies
/// exactly one element (the store size of the pointee type) after the address
/// accessed by \p A.
bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) {
  Value *PtrA = getPointerOperand(A);
  Value *PtrB = getPointerOperand(B);
  unsigned ASA = getAddressSpaceOperand(A);
  unsigned ASB = getAddressSpaceOperand(B);

  // Check that the address spaces match and that the pointers are valid.
  if (!PtrA || !PtrB || (ASA != ASB)) return false;

  // Check that A and B are of the same type.
  if (PtrA->getType() != PtrB->getType()) return false;

  // Calculate the distance (A - B).
  const SCEV *PtrSCEVA = SE->getSCEV(PtrA);
  const SCEV *PtrSCEVB = SE->getSCEV(PtrB);
  const SCEV *OffsetSCEV = SE->getMinusSCEV(PtrSCEVA, PtrSCEVB);
  const SCEVConstant *ConstOffSCEV = dyn_cast<SCEVConstant>(OffsetSCEV);

  // Non constant distance.
  if (!ConstOffSCEV) return false;

  // Keep the full signed 64-bit distance. Narrowing it to 'unsigned' would
  // make distances that differ by a multiple of 2^32 compare equal.
  int64_t Offset = ConstOffSCEV->getValue()->getSExtValue();
  Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
  // The instructions are consecutive if the size of the first load/store is
  // the same as the offset. Without a DataLayout fall back to the scalar bit
  // width of the type.
  int64_t Sz = (DL ? DL->getTypeStoreSize(Ty) : Ty->getScalarSizeInBits()/8);
  // A - B == -Sz  <=>  B == A + Sz.
  return Offset == -Sz;
}
|
||||
|
||||
/// Link the stores in \p Stores into chains of consecutive accesses and, for
/// every chain, vectorize the power-of-two prefix with the lowest tree cost
/// provided that cost does not exceed \p costThreshold.
/// \returns true if at least one chain was vectorized.
bool BoUpSLP::vectorizeStores(StoreList &Stores, int costThreshold) {
  ValueSet ChainHeads, ChainTails;
  SmallDenseMap<Value*, Value*> NextInChain;
  bool MadeChange = false;

  // Quadratic scan: record every ordered pair (From, To) where the store To
  // directly follows the store From in memory.
  const unsigned NumStores = Stores.size();
  for (unsigned From = 0; From < NumStores; ++From) {
    for (unsigned To = 0; To < NumStores; ++To) {
      if (From == To || !isConsecutiveAccess(Stores[From], Stores[To]))
        continue;
      ChainTails.insert(Stores[To]);
      ChainHeads.insert(Stores[From]);
      NextInChain[Stores[From]] = Stores[To];
    }
  }

  for (ValueSet::iterator HI = ChainHeads.begin(), HE = ChainHeads.end();
       HI != HE; ++HI) {
    // A store that also follows another store is not the start of a chain.
    if (ChainTails.count(*HI)) continue;

    // Walk the chain from its first store, costing every power-of-two prefix
    // and remembering the cheapest one.
    ValueList Chain;
    int BestCost = 0, BestVF = 0;
    for (Value *Cur = *HI; ChainTails.count(Cur) || ChainHeads.count(Cur);
         Cur = NextInChain[Cur]) {
      Chain.push_back(Cur);
      unsigned VF = Chain.size();
      if (VF <= 1 || !isPowerOf2_32(VF))
        continue;
      int cost = getTreeRollCost(Chain, 0);
      DEBUG(dbgs() << "Found cost=" << cost << " for VF=" << VF << "\n");
      if (cost < BestCost) {
        BestCost = cost;
        BestVF = VF;
      }
    }

    // Nothing profitable enough was found for this chain.
    if (BestVF <= 1 || BestCost > costThreshold)
      continue;

    DEBUG(dbgs() << "Decided to vectorize cost=" << BestCost << "\n");
    vectorizeTree(Chain, BestVF);
    Stores.clear();
    // The current numbering is invalid because we added and removed instrs.
    numberInstructions();
    MadeChange = true;
  }

  return MadeChange;
}
|
||||
|
||||
/// \returns the summed target cost of one InsertElement per lane of the
/// vector type \p Ty, i.e. the cost of assembling the vector from scalars.
int BoUpSLP::getScalarizationCost(Type *Ty) {
  const unsigned NumElts = cast<VectorType>(Ty)->getNumElements();
  int Total = 0;
  for (unsigned Lane = 0; Lane < NumElts; ++Lane)
    Total += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, Lane);
  return Total;
}
|
||||
|
||||
/// \returns the alias-analysis location accessed by \p I when it is a load or
/// a store, and a default-constructed location for anything else.
AliasAnalysis::Location BoUpSLP::getLocation(Instruction *I) {
  switch (I->getOpcode()) {
  case Instruction::Store:
    return AA->getLocation(cast<StoreInst>(I));
  case Instruction::Load:
    return AA->getLocation(cast<LoadInst>(I));
  default:
    return AliasAnalysis::Location();
  }
}
|
||||
|
||||
/// Check whether \p Src can be moved down to \p Dst (both in the same block)
/// by scanning the instructions strictly between them.
/// \returns the first instruction that blocks the move, or null when none of
/// the scanned instructions does.
Value *BoUpSLP::isUnsafeToSink(Instruction *Src, Instruction *Dst) {
  assert(Src->getParent() == Dst->getParent() && "Not the same BB");
  const bool SrcWrites = Src->mayWriteToMemory();

  BasicBlock::iterator Cur = Src, End = Dst;
  for (++Cur; Cur != End; ++Cur) {
    // Instructions on the ignore list never act as barriers.
    if (MemBarrierIgnoreList.count(Cur)) continue;

    // A write conflicts with any memory access in between; a read conflicts
    // only with writes.
    const bool Touches =
        SrcWrites ? Cur->mayReadOrWriteMemory() : Cur->mayWriteToMemory();
    if (!Touches) continue;

    AliasAnalysis::Location CurLoc = getLocation(&*Cur);
    AliasAnalysis::Location SrcLoc = getLocation(Src);

    // An unknown location is treated as a conflict.
    if (!CurLoc.Ptr || !SrcLoc.Ptr || AA->alias(CurLoc, SrcLoc))
      return Cur;
  }
  return 0;
}
|
||||
|
||||
/// Estimate the cost of replacing the scalars in \p VL, and recursively the
/// operand trees that feed them, with one vector operation per tree node.
///
/// \param VL    the scalar values that would become the lanes of one vector.
/// \param Depth current recursion depth; the walk gives up at depth 6.
/// \returns (vector cost - scalar cost) summed over the tree, so a negative
/// number means that vectorization is profitable; max_cost when the tree
/// cannot be vectorized.
int BoUpSLP::getTreeRollCost(ValueList &VL, unsigned Depth) {
  // Bound the recursion.
  if (Depth == 6) return max_cost;
  Type *ScalarTy = VL[0]->getType();

  // For stores the lane type is the type of the stored value.
  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
    ScalarTy = SI->getValueOperand()->getType();

  /// Don't mess with vectors.
  if (ScalarTy->isVectorTy()) return max_cost;

  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());

  // Check if all of the operands are constants.
  bool AllConst = true;
  bool AllSameScalar = true;
  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
    AllConst &= isa<Constant>(VL[i]);
    AllSameScalar &= (VL[0] == VL[i]);
    // Must have a single use.
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    // Need to scalarize instructions with multiple users or from other BBs.
    if (I && ((I->getNumUses() > 1) || (I->getParent() != BB)))
      return getScalarizationCost(VecTy);
  }

  // Is this a simple vector constant.
  if (AllConst) return 0;

  // If all of the operands are identical we can broadcast them.
  if (AllSameScalar)
    return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);

  // Scalarize unknown structures.
  Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
  if (!VL0) return getScalarizationCost(VecTy);
  assert(VL0->getParent() == BB && "Wrong BB");

  unsigned Opcode = VL0->getOpcode();
  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    // If not all of the instructions are identical then we have to scalarize.
    if (!I || Opcode != I->getOpcode()) return getScalarizationCost(VecTy);
  }

  // Check if it is safe to sink the loads or the stores.
  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
    // Find the member of VL that appears last in the block.
    int MaxIdx = InstrIdx[VL0];
    for (unsigned i = 1, e = VL.size(); i < e; ++i )
      MaxIdx = std::max(MaxIdx, InstrIdx[VL[i]]);

    Instruction *Last = InstrVec[MaxIdx];
    for (unsigned i = 0, e = VL.size(); i < e; ++i ) {
      if (VL[i] == Last) continue;
      Value *Barrier = isUnsafeToSink(cast<Instruction>(VL[i]), Last);
      if (Barrier) {
        DEBUG(dbgs() << "LR: Can't sink " << *VL[i] << "\n down to " <<
              *Last << "\n because of " << *Barrier << "\n");
        return max_cost;
      }
    }
  }

  switch (Opcode) {
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    ValueList Operands;
    int Cost = 0;
    // Calculate the cost of all of the operands.
    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
      // Prepare the operand vector.
      for (unsigned j = 0; j < VL.size(); ++j)
        Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
      Cost += getTreeRollCost(Operands, Depth+1);
      Operands.clear();
    }

    // Calculate the cost of this instruction.
    int ScalarCost = VecTy->getNumElements() *
      TTI->getArithmeticInstrCost(Opcode, ScalarTy);
    int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy);
    Cost += (VecCost - ScalarCost);
    return Cost;
  }
  case Instruction::Load: {
    // If we have to scalarize the loads, add the cost of forming the vector.
    for (unsigned i = 0, e = VL.size()-1; i < e; ++i)
      if (!isConsecutiveAccess(VL[i], VL[i+1]))
        return getScalarizationCost(VecTy);

    // Cost of wide load - cost of scalar loads.
    int ScalarLdCost = VecTy->getNumElements() *
      TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
    // The wide load operates on the vector type, so that is the type to cost.
    int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, 1, 0);
    return VecLdCost - ScalarLdCost;
  }
  case Instruction::Store: {
    // We know that we can merge the stores. Calculate the cost.
    int ScalarStCost = VecTy->getNumElements() *
      TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
    // The wide store operates on the vector type, so that is the type to cost.
    int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0);
    int StoreCost = VecStCost - ScalarStCost;

    // Cost the tree that computes the stored values. While doing so, the
    // stores of this bundle must not count as barriers for sinking.
    ValueList Operands;
    for (unsigned j = 0; j < VL.size(); ++j) {
      Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
      MemBarrierIgnoreList.insert(VL[j]);
    }

    int TotalCost = StoreCost + getTreeRollCost(Operands, Depth + 1);
    MemBarrierIgnoreList.clear();
    return TotalCost;
  }
  default:
    // Unable to vectorize unknown instructions.
    return getScalarizationCost(VecTy);
  }
}
|
||||
|
||||
/// Find the highest block position among the first non-PHI instruction of BB
/// and the first \p VF entries of \p VL.
/// \returns the instruction numbered one past that position.
Instruction *BoUpSLP::GetLastInstr(ValueList &VL, unsigned VF) {
  int LastIdx = InstrIdx[BB->getFirstNonPHI()];
  for (unsigned Lane = 0; Lane < VF; ++Lane) {
    int Idx = InstrIdx[VL[Lane]];
    if (Idx > LastIdx)
      LastIdx = Idx;
  }
  return InstrVec[LastIdx + 1];
}
|
||||
|
||||
/// Build a vector of type \p Ty by inserting the leading scalars of \p VL,
/// one per lane, into an undef vector. The insertelement instructions are
/// emitted at the position reported by GetLastInstr().
/// \returns the fully populated vector value.
Value *BoUpSLP::Scalarize(ValueList &VL, VectorType *Ty) {
  IRBuilder<> Builder(GetLastInstr(VL, Ty->getNumElements()));
  Value *Result = UndefValue::get(Ty);
  for (unsigned Lane = 0; Lane < Ty->getNumElements(); ++Lane) {
    Value *LaneIdx = Builder.getInt32(Lane);
    Result = Builder.CreateInsertElement(Result, VL[Lane], LaneIdx);
  }
  return Result;
}
|
||||
|
||||
/// Emit vector code for the first \p VF scalars of \p VL and, recursively,
/// for the operand trees that feed them.
///
/// Values that cannot be handled as one vector operation (instructions with
/// several uses or from another block, mixed opcodes, non-consecutive loads,
/// unsupported opcodes) are gathered into a vector with Scalarize().
///
/// \returns the vector value that replaces the scalars, or null when \p VL is
/// a bundle of stores (the scalar stores are erased and replaced by one wide
/// store).
Value *BoUpSLP::vectorizeTree(ValueList &VL, int VF) {
  Type *ScalarTy = VL[0]->getType();
  // For stores the lane type is the type of the stored value.
  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
    ScalarTy = SI->getValueOperand()->getType();
  VectorType *VecTy = VectorType::get(ScalarTy, VF);

  // Check if all of the operands are constants or identical.
  bool AllConst = true;
  bool AllSameScalar = true;
  for (unsigned i = 0, e = VF; i < e; ++i) {
    AllConst &= isa<Constant>(VL[i]);
    AllSameScalar &= (VL[0] == VL[i]);
    // Must have a single use.
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    if (I && (I->getNumUses() > 1 || I->getParent() != BB))
      return Scalarize(VL, VecTy);
  }

  // Is this a simple vector constant.
  if (AllConst || AllSameScalar) return Scalarize(VL, VecTy);

  // Scalarize unknown structures.
  Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
  if (!VL0) return Scalarize(VL, VecTy);

  unsigned Opcode = VL0->getOpcode();
  for (unsigned i = 0, e = VF; i < e; ++i) {
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    // If not all of the instructions are identical then we have to scalarize.
    if (!I || Opcode != I->getOpcode()) return Scalarize(VL, VecTy);
  }

  switch (Opcode) {
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Gather the first and second operand of every lane.
    ValueList Op0VL, Op1VL;
    for (int i = 0; i < VF; ++i) {
      Op0VL.push_back(cast<Instruction>(VL[i])->getOperand(0));
      Op1VL.push_back(cast<Instruction>(VL[i])->getOperand(1));
    }

    // Vectorize the operand trees first (operand 0 before operand 1), then
    // emit the vector operation with the operands in their original order.
    Value *Op0 = vectorizeTree(Op0VL, VF);
    Value *Op1 = vectorizeTree(Op1VL, VF);
    IRBuilder<> Builder(GetLastInstr(VL, VF));
    // Every opcode handled by this case is a binary operator.
    BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
    return Builder.CreateBinOp(BinOp->getOpcode(), Op0, Op1);
  }
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(VL0);
    unsigned Alignment = LI->getAlignment();

    // Check if all of the loads are consecutive.
    for (unsigned i = 1, e = VF; i < e; ++i)
      if (!isConsecutiveAccess(VL[i-1], VL[i]))
        return Scalarize(VL, VecTy);

    // Load the whole vector through the address of the first scalar load,
    // keeping that load's alignment.
    IRBuilder<> Builder(GetLastInstr(VL, VF));
    Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
                                          VecTy->getPointerTo());
    LI = Builder.CreateLoad(VecPtr);
    LI->setAlignment(Alignment);
    return LI;
  }
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(VL0);
    unsigned Alignment = SI->getAlignment();

    ValueList ValueOp;
    for (int i = 0; i < VF; ++i)
      ValueOp.push_back(cast<StoreInst>(VL[i])->getValueOperand());

    Value *VecValue = vectorizeTree(ValueOp, VF);

    // Store the whole vector through the address of the first scalar store,
    // keeping that store's alignment.
    IRBuilder<> Builder(GetLastInstr(VL, VF));
    Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
                                          VecTy->getPointerTo());
    Builder.CreateStore(VecValue, VecPtr)->setAlignment(Alignment);

    // The scalar stores are now redundant.
    for (int i = 0; i < VF; ++i)
      cast<Instruction>(VL[i])->eraseFromParent();
    return 0;
  }
  default:
    return Scalarize(VL, VecTy);
  }
}
|
||||
|
||||
} // end of namespace
|
|
@ -0,0 +1,108 @@
|
|||
//===- VecUtils.h --- Vectorization Utilities -----------------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This family of classes and functions manipulate vectors and chains of
|
||||
// vectors.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef LLVM_TRANSFORMS_VECTORIZE_AOSVECTORIZER_H
|
||||
#define LLVM_TRANSFORMS_VECTORIZE_AOSVECTORIZER_H
|
||||
|
||||
#include "llvm/ADT/DenseMap.h"
|
||||
#include "llvm/ADT/SmallPtrSet.h"
|
||||
#include "llvm/ADT/SmallSet.h"
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
#include "llvm/Analysis/AliasAnalysis.h"
|
||||
#include <vector>
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
namespace llvm {
|
||||
|
||||
class BasicBlock; class Instruction; class Type;
|
||||
class VectorType; class StoreInst; class Value;
|
||||
class ScalarEvolution; class DataLayout;
|
||||
class TargetTransformInfo; class AliasAnalysis;
|
||||
|
||||
/// Bottom Up SLP vectorization utility class.
|
||||
struct BoUpSLP {
|
||||
typedef SmallVector<Value*, 8> ValueList;
|
||||
typedef SmallPtrSet<Value*, 16> ValueSet;
|
||||
typedef SmallVector<StoreInst*, 8> StoreList;
|
||||
static const int max_cost = 1<<20;
|
||||
|
||||
// \brief C'tor.
|
||||
BoUpSLP(BasicBlock *Bb, ScalarEvolution *Se, DataLayout *Dl,
|
||||
TargetTransformInfo *Tti, AliasAnalysis *Aa);
|
||||
|
||||
/// \returns true if the memory operations A and B are consecutive.
|
||||
bool isConsecutiveAccess(Value *A, Value *B);
|
||||
|
||||
/// \brief Vectorize the tree that starts with the elements in \p VL.
|
||||
/// \returns the vectorized value.
|
||||
Value *vectorizeTree(ValueList &VL, int VF);
|
||||
|
||||
/// \returns the vectorization cost of the subtree that starts at \p VL.
|
||||
/// A negative number means that this is profitable.
|
||||
int getTreeRollCost(ValueList &VL, unsigned Depth);
|
||||
|
||||
/// \brief Take the pointer operand from the Load/Store instruction.
|
||||
/// \returns NULL if this is not a valid Load/Store instruction.
|
||||
static Value *getPointerOperand(Value *I);
|
||||
|
||||
/// \brief Take the address space operand from the Load/Store instruction.
|
||||
/// \returns -1 if this is not a valid Load/Store instruction.
|
||||
static unsigned getAddressSpaceOperand(Value *I);
|
||||
|
||||
/// \brief Attempts to order and vectorize a sequence of stores. This
|
||||
/// function does a quadratic scan of the given stores.
|
||||
/// \returns true if the basic block was modified.
|
||||
bool vectorizeStores(StoreList &Stores, int costThreshold);
|
||||
|
||||
/// \brief Number all of the instructions in the block.
|
||||
void numberInstructions();
|
||||
|
||||
private:
|
||||
/// \returns the scalarization cost for this type. Scalarization in this
|
||||
/// context means the creation of vectors from a group of scalars.
|
||||
int getScalarizationCost(Type *Ty);
|
||||
|
||||
/// \returns the AA location that is being access by the instruction.
|
||||
AliasAnalysis::Location getLocation(Instruction *I);
|
||||
|
||||
/// \brief Checks if it is possible to sink an instruction from
|
||||
/// \p Src to \p Dst.
|
||||
/// \returns the pointer to the barrier instruction if we can't sink.
|
||||
Value *isUnsafeToSink(Instruction *Src, Instruction *Dst);
|
||||
|
||||
/// \returns the instruction that appears last in the BB from \p VL.
|
||||
/// Only consider the first \p VF elements.
|
||||
Instruction *GetLastInstr(ValueList &VL, unsigned VF);
|
||||
|
||||
/// \returns a vector from a collection of scalars in \p VL.
|
||||
Value *Scalarize(ValueList &VL, VectorType *Ty);
|
||||
|
||||
// Maps instructions to numbers and back.
|
||||
SmallDenseMap<Value*, int> InstrIdx;
|
||||
std::vector<Instruction*> InstrVec;
|
||||
// A list of instructions to ignore while sinking
|
||||
// memory instructions.
|
||||
SmallSet<Value*, 8> MemBarrierIgnoreList;
|
||||
// Analysis and block reference.
|
||||
BasicBlock *BB;
|
||||
ScalarEvolution *SE;
|
||||
DataLayout *DL;
|
||||
TargetTransformInfo *TTI;
|
||||
AliasAnalysis *AA;
|
||||
};
|
||||
|
||||
} // end of namespace
|
||||
# endif //LLVM_TRANSFORMS_VECTORIZE_AOSVECTORIZER_H
|
||||
|
|
@ -28,6 +28,7 @@ using namespace llvm;
|
|||
// Registers the three vectorizer passes (BB, loop and SLP) with \p Registry.
void llvm::initializeVectorization(PassRegistry &Registry) {
|
||||
initializeBBVectorizePass(Registry);
|
||||
initializeLoopVectorizePass(Registry);
|
||||
initializeSLPVectorizerPass(Registry);
|
||||
}
|
||||
|
||||
void LLVMInitializeVectorization(LLVMPassRegistryRef R) {
|
||||
|
@ -41,3 +42,7 @@ void LLVMAddBBVectorizePass(LLVMPassManagerRef PM) {
|
|||
// C API: appends a newly created loop vectorizer pass to the pass manager PM.
void LLVMAddLoopVectorizePass(LLVMPassManagerRef PM) {
|
||||
unwrap(PM)->add(createLoopVectorizePass());
|
||||
}
|
||||
|
||||
// C API: appends a newly created SLP vectorizer pass to the pass manager PM.
// NOTE(review): the function is named "LoopRoller" but what it adds is the
// SLP vectorizer pass - confirm the intended public name.
void LLVMAddLoopRollerPass(LLVMPassManagerRef PM) {
|
||||
unwrap(PM)->add(createSLPVectorizerPass());
|
||||
}
|
||||
|
|
|
@ -0,0 +1,51 @@
|
|||
; RUN: opt < %s -basicaa -slp-vectorizer -slp-threshold=1000 -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
|
||||
|
||||
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
|
||||
target triple = "x86_64-apple-macosx10.8.0"
|
||||
|
||||
; Check that the -slp-threshold flag works: with the threshold raised to 1000 the loads below must not be turned into a <4 x i32> load.
|
||||
;CHECK:rollable
|
||||
;CHECK-NOT:load <4 x i32>
|
||||
;CHECK: ret
|
||||
|
||||
; Loop body: loads in[4*i .. 4*i+3], computes x*7 + {7, 14, 21, 28} per element
; and stores the four results to out[4*i .. 4*i+3].
define i32 @rollable(i32* noalias nocapture %in, i32* noalias nocapture %out, i64 %n) nounwind ssp uwtable {
|
||||
%1 = icmp eq i64 %n, 0
|
||||
br i1 %1, label %._crit_edge, label %.lr.ph
|
||||
|
||||
.lr.ph: ; preds = %0, %.lr.ph
|
||||
%i.019 = phi i64 [ %26, %.lr.ph ], [ 0, %0 ]
|
||||
%2 = shl i64 %i.019, 2
|
||||
%3 = getelementptr inbounds i32* %in, i64 %2
|
||||
%4 = load i32* %3, align 4
|
||||
%5 = or i64 %2, 1
|
||||
%6 = getelementptr inbounds i32* %in, i64 %5
|
||||
%7 = load i32* %6, align 4
|
||||
%8 = or i64 %2, 2
|
||||
%9 = getelementptr inbounds i32* %in, i64 %8
|
||||
%10 = load i32* %9, align 4
|
||||
%11 = or i64 %2, 3
|
||||
%12 = getelementptr inbounds i32* %in, i64 %11
|
||||
%13 = load i32* %12, align 4
|
||||
%14 = mul i32 %4, 7
|
||||
%15 = add i32 %14, 7
|
||||
%16 = mul i32 %7, 7
|
||||
%17 = add i32 %16, 14
|
||||
%18 = mul i32 %10, 7
|
||||
%19 = add i32 %18, 21
|
||||
%20 = mul i32 %13, 7
|
||||
%21 = add i32 %20, 28
|
||||
%22 = getelementptr inbounds i32* %out, i64 %2
|
||||
store i32 %15, i32* %22, align 4
|
||||
%23 = getelementptr inbounds i32* %out, i64 %5
|
||||
store i32 %17, i32* %23, align 4
|
||||
%24 = getelementptr inbounds i32* %out, i64 %8
|
||||
store i32 %19, i32* %24, align 4
|
||||
%25 = getelementptr inbounds i32* %out, i64 %11
|
||||
store i32 %21, i32* %25, align 4
|
||||
%26 = add i64 %i.019, 1
|
||||
%exitcond = icmp eq i64 %26, %n
|
||||
br i1 %exitcond, label %._crit_edge, label %.lr.ph
|
||||
|
||||
._crit_edge: ; preds = %.lr.ph, %0
|
||||
ret i32 undef
|
||||
}
|
|
@ -0,0 +1,6 @@
|
|||
# File suffixes that lit should treat as tests (lit's `suffixes` config field).
config.suffixes = ['.ll', '.c', '.cpp']

# Mark these tests as unsupported unless the X86 target is among the
# targets listed in the root config's `targets_to_build`.
targets = set(config.root.targets_to_build.split())
if 'X86' not in targets:
    config.unsupported = True
|
||||
|
|
@ -0,0 +1,47 @@
|
|||
; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
|
||||
|
||||
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
|
||||
target triple = "x86_64-apple-macosx10.7.0"
|
||||
|
||||
;int foo (int *A, int n) {
|
||||
; A[0] += n * 5 + 7;
|
||||
; A[1] += n * 5 + 8;
|
||||
; A[2] += n * 5 + 9;
|
||||
; A[3] += n * 5 + 10;
|
||||
; A[4] += n * 5 + 11;
|
||||
;}
|
||||
|
||||
;CHECK: @foo
|
||||
;CHECK: insertelement <4 x i32>
|
||||
;CHECK: load <4 x i32>
|
||||
;CHECK: add <4 x i32>
|
||||
;CHECK: store <4 x i32>
|
||||
;CHECK: ret
|
||||
; IR for the C sketch in the comment above: computes t = n * 5 once, then for
; k = 0..4 performs A[k] += t + (7 + k). Returns undef.
; There are five scalar load/add/store groups; the FileCheck directives above
; expect <4 x i32> insertelement, load, add and store in the output.
define i32 @foo(i32* nocapture %A, i32 %n) nounwind ssp uwtable {
|
||||
; %1 = n * 5 is shared by all five groups.
%1 = mul nsw i32 %n, 5
|
||||
; Group 0: A[0] += %1 + 7.
%2 = add nsw i32 %1, 7
|
||||
%3 = load i32* %A, align 4
|
||||
%4 = add nsw i32 %2, %3
|
||||
store i32 %4, i32* %A, align 4
|
||||
; Group 1: A[1] += %1 + 8.
%5 = add nsw i32 %1, 8
|
||||
%6 = getelementptr inbounds i32* %A, i64 1
|
||||
%7 = load i32* %6, align 4
|
||||
%8 = add nsw i32 %5, %7
|
||||
store i32 %8, i32* %6, align 4
|
||||
; Group 2: A[2] += %1 + 9.
%9 = add nsw i32 %1, 9
|
||||
%10 = getelementptr inbounds i32* %A, i64 2
|
||||
%11 = load i32* %10, align 4
|
||||
%12 = add nsw i32 %9, %11
|
||||
store i32 %12, i32* %10, align 4
|
||||
; Group 3: A[3] += %1 + 10.
%13 = add nsw i32 %1, 10
|
||||
%14 = getelementptr inbounds i32* %A, i64 3
|
||||
%15 = load i32* %14, align 4
|
||||
%16 = add nsw i32 %13, %15
|
||||
store i32 %16, i32* %14, align 4
|
||||
; Group 4: A[4] += %1 + 11.
%17 = add nsw i32 %1, 11
|
||||
%18 = getelementptr inbounds i32* %A, i64 4
|
||||
%19 = load i32* %18, align 4
|
||||
%20 = add nsw i32 %17, %19
|
||||
store i32 %20, i32* %18, align 4
|
||||
ret i32 undef
|
||||
}
|
|
@ -0,0 +1,50 @@
|
|||
; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
|
||||
|
||||
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
|
||||
target triple = "x86_64-apple-macosx10.8.0"
|
||||
|
||||
; SLP vectorization example from http://cs.stanford.edu/people/eschkufz/research/asplos291-schkufza.pdf
|
||||
;CHECK: SAXPY
|
||||
;CHECK: mul <4 x i32>
|
||||
;CHECK: ret
|
||||
|
||||
; For k = 0..3 computes x[i+k] = x[i+k] * a + y[i+k]. %x and %y are marked
; noalias, and every load and store carries the !tbaa !0 tag.
; The FileCheck directives above expect a <4 x i32> mul in the output.
define void @SAXPY(i32* noalias nocapture %x, i32* noalias nocapture %y, i32 %a, i64 %i) #0 {
|
||||
; Lane 0: x[i] = x[i] * a + y[i].
%1 = getelementptr inbounds i32* %x, i64 %i
|
||||
%2 = load i32* %1, align 4, !tbaa !0
|
||||
%3 = mul nsw i32 %2, %a
|
||||
%4 = getelementptr inbounds i32* %y, i64 %i
|
||||
%5 = load i32* %4, align 4, !tbaa !0
|
||||
%6 = add nsw i32 %3, %5
|
||||
store i32 %6, i32* %1, align 4, !tbaa !0
|
||||
; Lane 1: same computation at index i + 1.
%7 = add i64 %i, 1
|
||||
%8 = getelementptr inbounds i32* %x, i64 %7
|
||||
%9 = load i32* %8, align 4, !tbaa !0
|
||||
%10 = mul nsw i32 %9, %a
|
||||
%11 = getelementptr inbounds i32* %y, i64 %7
|
||||
%12 = load i32* %11, align 4, !tbaa !0
|
||||
%13 = add nsw i32 %10, %12
|
||||
store i32 %13, i32* %8, align 4, !tbaa !0
|
||||
; Lane 2: same computation at index i + 2.
%14 = add i64 %i, 2
|
||||
%15 = getelementptr inbounds i32* %x, i64 %14
|
||||
%16 = load i32* %15, align 4, !tbaa !0
|
||||
%17 = mul nsw i32 %16, %a
|
||||
%18 = getelementptr inbounds i32* %y, i64 %14
|
||||
%19 = load i32* %18, align 4, !tbaa !0
|
||||
%20 = add nsw i32 %17, %19
|
||||
store i32 %20, i32* %15, align 4, !tbaa !0
|
||||
; Lane 3: same computation at index i + 3.
%21 = add i64 %i, 3
|
||||
%22 = getelementptr inbounds i32* %x, i64 %21
|
||||
%23 = load i32* %22, align 4, !tbaa !0
|
||||
%24 = mul nsw i32 %23, %a
|
||||
%25 = getelementptr inbounds i32* %y, i64 %21
|
||||
%26 = load i32* %25, align 4, !tbaa !0
|
||||
%27 = add nsw i32 %24, %26
|
||||
store i32 %27, i32* %22, align 4, !tbaa !0
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
|
||||
!0 = metadata !{metadata !"int", metadata !1}
|
||||
!1 = metadata !{metadata !"omnipotent char", metadata !2}
|
||||
!2 = metadata !{metadata !"Simple C/C++ TBAA"}
|
|
@ -0,0 +1,100 @@
|
|||
; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
|
||||
|
||||
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
|
||||
target triple = "x86_64-apple-macosx10.8.0"
|
||||
|
||||
;CHECK:rollable
|
||||
; Loop over i in [0, n): loads in[4*i .. 4*i+3], multiplies each element by 7,
; adds 7, 14, 21 and 28 respectively, and stores the results to
; out[4*i .. 4*i+3]. Both pointer arguments are marked noalias.
; The FileCheck directives interleaved with the body expect <4 x i32> load,
; mul, add and store, in that order, followed by the ret.
define i32 @rollable(i32* noalias nocapture %in, i32* noalias nocapture %out, i64 %n) nounwind ssp uwtable {
|
||||
; Skip the loop entirely when n == 0.
%1 = icmp eq i64 %n, 0
|
||||
br i1 %1, label %._crit_edge, label %.lr.ph
|
||||
|
||||
.lr.ph: ; preds = %0, %.lr.ph
|
||||
%i.019 = phi i64 [ %26, %.lr.ph ], [ 0, %0 ]
|
||||
; %2 = 4 * i: index of the first element of this group of four.
%2 = shl i64 %i.019, 2
|
||||
%3 = getelementptr inbounds i32* %in, i64 %2
|
||||
;CHECK:load <4 x i32>
|
||||
%4 = load i32* %3, align 4
|
||||
; The low two bits of %2 are zero, so 'or' with 1, 2, 3 equals %2 + 1, + 2, + 3.
%5 = or i64 %2, 1
|
||||
%6 = getelementptr inbounds i32* %in, i64 %5
|
||||
%7 = load i32* %6, align 4
|
||||
%8 = or i64 %2, 2
|
||||
%9 = getelementptr inbounds i32* %in, i64 %8
|
||||
%10 = load i32* %9, align 4
|
||||
%11 = or i64 %2, 3
|
||||
%12 = getelementptr inbounds i32* %in, i64 %11
|
||||
%13 = load i32* %12, align 4
|
||||
;CHECK:mul <4 x i32>
|
||||
; Scalar arithmetic: element k becomes in[4*i + k] * 7 + 7 * (k + 1).
%14 = mul i32 %4, 7
|
||||
;CHECK:add <4 x i32>
|
||||
%15 = add i32 %14, 7
|
||||
%16 = mul i32 %7, 7
|
||||
%17 = add i32 %16, 14
|
||||
%18 = mul i32 %10, 7
|
||||
%19 = add i32 %18, 21
|
||||
%20 = mul i32 %13, 7
|
||||
%21 = add i32 %20, 28
|
||||
; Store the four results to the matching indices of out.
%22 = getelementptr inbounds i32* %out, i64 %2
|
||||
;CHECK:store <4 x i32>
|
||||
store i32 %15, i32* %22, align 4
|
||||
%23 = getelementptr inbounds i32* %out, i64 %5
|
||||
store i32 %17, i32* %23, align 4
|
||||
%24 = getelementptr inbounds i32* %out, i64 %8
|
||||
store i32 %19, i32* %24, align 4
|
||||
%25 = getelementptr inbounds i32* %out, i64 %11
|
||||
store i32 %21, i32* %25, align 4
|
||||
; Advance the induction variable and leave the loop once it equals n.
%26 = add i64 %i.019, 1
|
||||
%exitcond = icmp eq i64 %26, %n
|
||||
br i1 %exitcond, label %._crit_edge, label %.lr.ph
|
||||
|
||||
._crit_edge: ; preds = %.lr.ph, %0
|
||||
;CHECK: ret
|
||||
ret i32 undef
|
||||
}
|
||||
|
||||
;CHECK:unrollable
|
||||
;CHECK-NOT: <4 x i32>
|
||||
;CHECK: ret
|
||||
; Same arithmetic as @rollable (out[4*i + k] = in[4*i + k] * 7 + 7 * (k + 1)
; for k = 0..3), with two differences visible here: %in and %out are not
; marked noalias, and a call to @goo sits between the second and third store.
; The FileCheck directives above expect no <4 x i32> anywhere in the output
; for this function.
define i32 @unrollable(i32* %in, i32* %out, i64 %n) nounwind ssp uwtable {
|
||||
; Skip the loop entirely when n == 0.
%1 = icmp eq i64 %n, 0
|
||||
br i1 %1, label %._crit_edge, label %.lr.ph
|
||||
|
||||
.lr.ph: ; preds = %0, %.lr.ph
|
||||
%i.019 = phi i64 [ %26, %.lr.ph ], [ 0, %0 ]
|
||||
; %2 = 4 * i: index of the first element of this group of four.
%2 = shl i64 %i.019, 2
|
||||
%3 = getelementptr inbounds i32* %in, i64 %2
|
||||
%4 = load i32* %3, align 4
|
||||
; The low two bits of %2 are zero, so 'or' with 1, 2, 3 equals %2 + 1, + 2, + 3.
%5 = or i64 %2, 1
|
||||
%6 = getelementptr inbounds i32* %in, i64 %5
|
||||
%7 = load i32* %6, align 4
|
||||
%8 = or i64 %2, 2
|
||||
%9 = getelementptr inbounds i32* %in, i64 %8
|
||||
%10 = load i32* %9, align 4
|
||||
%11 = or i64 %2, 3
|
||||
%12 = getelementptr inbounds i32* %in, i64 %11
|
||||
%13 = load i32* %12, align 4
|
||||
%14 = mul i32 %4, 7
|
||||
%15 = add i32 %14, 7
|
||||
%16 = mul i32 %7, 7
|
||||
%17 = add i32 %16, 14
|
||||
%18 = mul i32 %10, 7
|
||||
%19 = add i32 %18, 21
|
||||
%20 = mul i32 %13, 7
|
||||
%21 = add i32 %20, 28
|
||||
%22 = getelementptr inbounds i32* %out, i64 %2
|
||||
store i32 %15, i32* %22, align 4
|
||||
%23 = getelementptr inbounds i32* %out, i64 %5
|
||||
store i32 %17, i32* %23, align 4
|
||||
; The call splits the four stores into two groups of two.
%barrier = call i32 @goo(i32 0) ; <---------------- memory barrier.
|
||||
%24 = getelementptr inbounds i32* %out, i64 %8
|
||||
store i32 %19, i32* %24, align 4
|
||||
%25 = getelementptr inbounds i32* %out, i64 %11
|
||||
store i32 %21, i32* %25, align 4
|
||||
; Advance the induction variable and leave the loop once it equals n.
%26 = add i64 %i.019, 1
|
||||
%exitcond = icmp eq i64 %26, %n
|
||||
br i1 %exitcond, label %._crit_edge, label %.lr.ph
|
||||
|
||||
._crit_edge: ; preds = %.lr.ph, %0
|
||||
ret i32 undef
|
||||
}
|
||||
|
||||
; External function declared without any attributes; @unrollable calls it
; between its second and third store.
declare i32 @goo(i32)
|
|
@ -0,0 +1,25 @@
|
|||
; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
|
||||
|
||||
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
|
||||
target triple = "x86_64-apple-macosx10.8.0"
|
||||
|
||||
; Simple 3-pair chain with loads and stores
|
||||
; CHECK: test1
|
||||
; CHECK: store <2 x double>
|
||||
; CHECK: ret
|
||||
; Loads a[0], a[1], b[0] and b[1], multiplies them pairwise, and stores
; a[0]*b[0] to c[0] and a[1]*b[1] to c[1].
; Not marked readonly: the function writes through %c, which a readonly
; function must not do.
define void @test1(double* %a, double* %b, double* %c) nounwind uwtable {
entry:
  ; First lane: a[0] * b[0].
  %i0 = load double* %a, align 8
  %i1 = load double* %b, align 8
  %mul = fmul double %i0, %i1
  ; Second lane: a[1] * b[1].
  %arrayidx3 = getelementptr inbounds double* %a, i64 1
  %i3 = load double* %arrayidx3, align 8
  %arrayidx4 = getelementptr inbounds double* %b, i64 1
  %i4 = load double* %arrayidx4, align 8
  %mul5 = fmul double %i3, %i4
  ; Two adjacent stores to c[0] and c[1].
  store double %mul, double* %c, align 8
  %arrayidx5 = getelementptr inbounds double* %c, i64 1
  store double %mul5, double* %arrayidx5, align 8
  ret void
}
|
||||
|
|
@ -0,0 +1 @@
|
|||
# File suffixes that lit should treat as tests (lit's `suffixes` config field).
config.suffixes = ['.ll', '.c', '.cpp']
|
Loading…
Reference in New Issue