2016-07-01 07:11:38 +08:00
|
|
|
//===----- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer ----------===//
|
|
|
|
//
|
|
|
|
// The LLVM Compiler Infrastructure
|
|
|
|
//
|
|
|
|
// This file is distributed under the University of Illinois Open Source
|
|
|
|
// License. See LICENSE.TXT for details.
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
#include "llvm/ADT/MapVector.h"
|
|
|
|
#include "llvm/ADT/PostOrderIterator.h"
|
|
|
|
#include "llvm/ADT/SetVector.h"
|
|
|
|
#include "llvm/ADT/Statistic.h"
|
|
|
|
#include "llvm/ADT/Triple.h"
|
|
|
|
#include "llvm/Analysis/AliasAnalysis.h"
|
|
|
|
#include "llvm/Analysis/ScalarEvolution.h"
|
|
|
|
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
|
|
|
|
#include "llvm/Analysis/TargetTransformInfo.h"
|
|
|
|
#include "llvm/Analysis/ValueTracking.h"
|
|
|
|
#include "llvm/Analysis/VectorUtils.h"
|
|
|
|
#include "llvm/IR/DataLayout.h"
|
|
|
|
#include "llvm/IR/Dominators.h"
|
|
|
|
#include "llvm/IR/IRBuilder.h"
|
|
|
|
#include "llvm/IR/Instructions.h"
|
|
|
|
#include "llvm/IR/Module.h"
|
|
|
|
#include "llvm/IR/Type.h"
|
|
|
|
#include "llvm/IR/Value.h"
|
|
|
|
#include "llvm/Support/CommandLine.h"
|
|
|
|
#include "llvm/Support/Debug.h"
|
|
|
|
#include "llvm/Support/raw_ostream.h"
|
2016-07-08 04:10:35 +08:00
|
|
|
#include "llvm/Transforms/Vectorize.h"
|
2016-07-01 07:11:38 +08:00
|
|
|
|
|
|
|
using namespace llvm;
|
|
|
|
|
|
|
|
#define DEBUG_TYPE "load-store-vectorizer"
|
|
|
|
STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
|
|
|
|
STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
|
|
|
|
// TODO: Remove this
|
|
|
|
static const unsigned TargetBaseAlign = 4;
|
|
|
|
|
2016-07-20 07:19:16 +08:00
|
|
|
typedef SmallVector<Value *, 8> ValueList;
|
|
|
|
typedef MapVector<Value *, ValueList> ValueListMap;
|
2016-07-01 07:11:38 +08:00
|
|
|
|
2016-07-20 07:19:16 +08:00
|
|
|
class Vectorizer {
|
2016-07-01 07:11:38 +08:00
|
|
|
Function &F;
|
|
|
|
AliasAnalysis &AA;
|
|
|
|
DominatorTree &DT;
|
|
|
|
ScalarEvolution &SE;
|
2016-07-01 10:07:22 +08:00
|
|
|
TargetTransformInfo &TTI;
|
2016-07-01 07:11:38 +08:00
|
|
|
const DataLayout &DL;
|
|
|
|
IRBuilder<> Builder;
|
|
|
|
|
|
|
|
public:
|
|
|
|
Vectorizer(Function &F, AliasAnalysis &AA, DominatorTree &DT,
|
2016-07-01 10:07:22 +08:00
|
|
|
ScalarEvolution &SE, TargetTransformInfo &TTI)
|
2016-07-08 04:10:35 +08:00
|
|
|
: F(F), AA(AA), DT(DT), SE(SE), TTI(TTI),
|
|
|
|
DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {}
|
2016-07-01 07:11:38 +08:00
|
|
|
|
|
|
|
bool run();
|
|
|
|
|
|
|
|
private:
|
|
|
|
Value *getPointerOperand(Value *I);
|
|
|
|
|
|
|
|
unsigned getPointerAddressSpace(Value *I);
|
|
|
|
|
2016-07-01 10:09:38 +08:00
|
|
|
unsigned getAlignment(LoadInst *LI) const {
|
|
|
|
unsigned Align = LI->getAlignment();
|
|
|
|
if (Align != 0)
|
|
|
|
return Align;
|
|
|
|
|
|
|
|
return DL.getABITypeAlignment(LI->getType());
|
|
|
|
}
|
|
|
|
|
|
|
|
unsigned getAlignment(StoreInst *SI) const {
|
|
|
|
unsigned Align = SI->getAlignment();
|
|
|
|
if (Align != 0)
|
|
|
|
return Align;
|
|
|
|
|
|
|
|
return DL.getABITypeAlignment(SI->getValueOperand()->getType());
|
|
|
|
}
|
|
|
|
|
2016-07-01 07:11:38 +08:00
|
|
|
bool isConsecutiveAccess(Value *A, Value *B);
|
|
|
|
|
Correct ordering of loads/stores.
Summary:
Aiming to correct the ordering of loads/stores. This patch changes the
insert point for loads to the position of the first load.
It updates the ordering method for loads to insert before, rather than after.
Before this patch the following sequence:
"load a[1], store a[1], store a[0], load a[2]"
Would incorrectly vectorize to "store a[0,1], load a[1,2]".
The correctness check was assuming the insertion point for loads is at
the position of the first load, when in practice it was at the last
load. An alternative fix would have been to invert the correctness check.
The current fix changes insert position but also requires reordering of
instructions before the vectorized load.
Updated testcases to reflect the changes.
Reviewers: tstellarAMD, llvm-commits, jlebar, arsenm
Subscribers: mzolotukhin
Differential Revision: http://reviews.llvm.org/D22071
llvm-svn: 275117
2016-07-12 06:34:29 +08:00
|
|
|
/// After vectorization, reorder the instructions that I depends on
|
|
|
|
/// (the instructions defining its operands), to ensure they dominate I.
|
2016-07-01 07:11:38 +08:00
|
|
|
void reorder(Instruction *I);
|
|
|
|
|
|
|
|
/// Returns the first and the last instructions in Chain.
|
|
|
|
std::pair<BasicBlock::iterator, BasicBlock::iterator>
|
|
|
|
getBoundaryInstrs(ArrayRef<Value *> Chain);
|
|
|
|
|
|
|
|
/// Erases the original instructions after vectorizing.
|
|
|
|
void eraseInstructions(ArrayRef<Value *> Chain);
|
|
|
|
|
|
|
|
/// "Legalize" the vector type that would be produced by combining \p
|
|
|
|
/// ElementSizeBits elements in \p Chain. Break into two pieces such that the
|
|
|
|
/// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is
|
|
|
|
/// expected to have more than 4 elements.
|
|
|
|
std::pair<ArrayRef<Value *>, ArrayRef<Value *>>
|
|
|
|
splitOddVectorElts(ArrayRef<Value *> Chain, unsigned ElementSizeBits);
|
|
|
|
|
2016-07-20 07:19:20 +08:00
|
|
|
/// Finds the largest prefix of Chain that's vectorizable, checking for
|
|
|
|
/// intervening instructions which may affect the memory accessed by the
|
|
|
|
/// instructions within Chain.
|
|
|
|
///
|
2016-07-20 08:55:12 +08:00
|
|
|
/// The elements of \p Chain must be all loads or all stores and must be in
|
|
|
|
/// address order.
|
2016-07-20 07:19:20 +08:00
|
|
|
ArrayRef<Value *> getVectorizablePrefix(ArrayRef<Value *> Chain);
|
2016-07-01 07:11:38 +08:00
|
|
|
|
|
|
|
/// Collects load and store instructions to vectorize.
|
2016-07-20 07:19:16 +08:00
|
|
|
std::pair<ValueListMap, ValueListMap> collectInstructions(BasicBlock *BB);
|
2016-07-01 07:11:38 +08:00
|
|
|
|
|
|
|
/// Processes the collected instructions, the \p Map. The elements of \p Map
|
|
|
|
/// should be all loads or all stores.
|
|
|
|
bool vectorizeChains(ValueListMap &Map);
|
|
|
|
|
|
|
|
/// Finds the load/stores to consecutive memory addresses and vectorizes them.
|
|
|
|
bool vectorizeInstructions(ArrayRef<Value *> Instrs);
|
|
|
|
|
|
|
|
/// Vectorizes the load instructions in Chain.
|
2016-07-14 05:20:01 +08:00
|
|
|
bool vectorizeLoadChain(ArrayRef<Value *> Chain,
|
|
|
|
SmallPtrSet<Value *, 16> *InstructionsProcessed);
|
2016-07-01 07:11:38 +08:00
|
|
|
|
|
|
|
/// Vectorizes the store instructions in Chain.
|
2016-07-14 05:20:01 +08:00
|
|
|
bool vectorizeStoreChain(ArrayRef<Value *> Chain,
|
|
|
|
SmallPtrSet<Value *, 16> *InstructionsProcessed);
|
2016-07-12 04:46:17 +08:00
|
|
|
|
|
|
|
/// Check if this load/store access is misaligned accesses
|
|
|
|
bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
|
|
|
|
unsigned Alignment);
|
2016-07-01 07:11:38 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
class LoadStoreVectorizer : public FunctionPass {
|
|
|
|
public:
|
|
|
|
static char ID;
|
|
|
|
|
2016-07-01 10:07:22 +08:00
|
|
|
LoadStoreVectorizer() : FunctionPass(ID) {
|
2016-07-01 07:11:38 +08:00
|
|
|
initializeLoadStoreVectorizerPass(*PassRegistry::getPassRegistry());
|
|
|
|
}
|
|
|
|
|
|
|
|
bool runOnFunction(Function &F) override;
|
|
|
|
|
|
|
|
const char *getPassName() const override {
|
|
|
|
return "GPU Load and Store Vectorizer";
|
|
|
|
}
|
|
|
|
|
|
|
|
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
|
|
|
AU.addRequired<AAResultsWrapperPass>();
|
|
|
|
AU.addRequired<ScalarEvolutionWrapperPass>();
|
|
|
|
AU.addRequired<DominatorTreeWrapperPass>();
|
2016-07-01 10:07:22 +08:00
|
|
|
AU.addRequired<TargetTransformInfoWrapperPass>();
|
2016-07-01 07:11:38 +08:00
|
|
|
AU.setPreservesCFG();
|
|
|
|
}
|
|
|
|
};
|
|
|
|
}
|
|
|
|
|
|
|
|
// Pass registration. The description string is kept identical in the BEGIN
// and END macros (it previously differed in the capitalization of "store").
// NOTE(review): the END macro's string is presumably the one that gets
// registered -- confirm against PassSupport.h.
INITIALIZE_PASS_BEGIN(LoadStoreVectorizer, DEBUG_TYPE,
                      "Vectorize load and store instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoadStoreVectorizer, DEBUG_TYPE,
                    "Vectorize load and store instructions", false, false)

// Storage for the pass ID declared in the class.
char LoadStoreVectorizer::ID = 0;
|
|
|
|
|
2016-07-01 10:07:22 +08:00
|
|
|
/// Factory entry point: returns a newly heap-allocated LoadStoreVectorizer.
/// NOTE(review): ownership presumably passes to the caller / pass manager --
/// confirm.
Pass *llvm::createLoadStoreVectorizerPass() {
  Pass *NewPass = new LoadStoreVectorizer();
  return NewPass;
}
|
|
|
|
|
|
|
|
/// Pass entry point. Bails out without changes when the function is to be
/// skipped or carries NoImplicitFloat; otherwise fetches the required
/// analyses and runs a Vectorizer over \p F.
///
/// \returns whatever Vectorizer::run() reports.
bool LoadStoreVectorizer::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
    return false;

  AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  TargetTransformInfo &TTI =
      getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);

  Vectorizer Worker(F, AA, DT, SE, TTI);
  const bool Changed = Worker.run();
  return Changed;
}
|
|
|
|
|
|
|
|
// Vectorizer Implementation
|
|
|
|
bool Vectorizer::run() {
|
|
|
|
bool Changed = false;
|
|
|
|
|
|
|
|
// Scan the blocks in the function in post order.
|
|
|
|
for (BasicBlock *BB : post_order(&F)) {
|
2016-07-20 07:19:16 +08:00
|
|
|
ValueListMap LoadRefs, StoreRefs;
|
|
|
|
std::tie(LoadRefs, StoreRefs) = collectInstructions(BB);
|
2016-07-01 07:11:38 +08:00
|
|
|
Changed |= vectorizeChains(LoadRefs);
|
|
|
|
Changed |= vectorizeChains(StoreRefs);
|
|
|
|
}
|
|
|
|
|
|
|
|
return Changed;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Returns the address operand of \p I when it is a LoadInst or a StoreInst;
/// returns nullptr for every other kind of value.
Value *Vectorizer::getPointerOperand(Value *I) {
  if (auto *Load = dyn_cast<LoadInst>(I))
    return Load->getPointerOperand();

  auto *Store = dyn_cast<StoreInst>(I);
  return Store ? Store->getPointerOperand() : nullptr;
}
|
|
|
|
|
|
|
|
/// Returns the address space of the pointer accessed by \p I when it is a
/// LoadInst or a StoreInst.
///
/// For any other value the function returns -1, which, given the unsigned
/// return type, is the largest unsigned value.
unsigned Vectorizer::getPointerAddressSpace(Value *I) {
  if (auto *Load = dyn_cast<LoadInst>(I))
    return Load->getPointerAddressSpace();

  if (auto *Store = dyn_cast<StoreInst>(I))
    return Store->getPointerAddressSpace();

  // Neither a load nor a store.
  return -1;
}
|
|
|
|
|
|
|
|
// FIXME: Merge with llvm::isConsecutiveAccess
|
|
|
|
bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
|
|
|
|
Value *PtrA = getPointerOperand(A);
|
|
|
|
Value *PtrB = getPointerOperand(B);
|
|
|
|
unsigned ASA = getPointerAddressSpace(A);
|
|
|
|
unsigned ASB = getPointerAddressSpace(B);
|
|
|
|
|
|
|
|
// Check that the address spaces match and that the pointers are valid.
|
|
|
|
if (!PtrA || !PtrB || (ASA != ASB))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
// Make sure that A and B are different pointers of the same size type.
|
|
|
|
unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
|
|
|
|
Type *PtrATy = PtrA->getType()->getPointerElementType();
|
|
|
|
Type *PtrBTy = PtrB->getType()->getPointerElementType();
|
|
|
|
if (PtrA == PtrB ||
|
|
|
|
DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) ||
|
|
|
|
DL.getTypeStoreSize(PtrATy->getScalarType()) !=
|
2016-07-08 04:10:35 +08:00
|
|
|
DL.getTypeStoreSize(PtrBTy->getScalarType()))
|
2016-07-01 07:11:38 +08:00
|
|
|
return false;
|
|
|
|
|
|
|
|
APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));
|
|
|
|
|
|
|
|
APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0);
|
|
|
|
PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
|
|
|
|
PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
|
|
|
|
|
|
|
|
APInt OffsetDelta = OffsetB - OffsetA;
|
|
|
|
|
|
|
|
// Check if they are based on the same pointer. That makes the offsets
|
|
|
|
// sufficient.
|
|
|
|
if (PtrA == PtrB)
|
|
|
|
return OffsetDelta == Size;
|
|
|
|
|
|
|
|
// Compute the necessary base pointer delta to have the necessary final delta
|
|
|
|
// equal to the size.
|
|
|
|
APInt BaseDelta = Size - OffsetDelta;
|
|
|
|
|
|
|
|
// Compute the distance with SCEV between the base pointers.
|
|
|
|
const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
|
|
|
|
const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
|
|
|
|
const SCEV *C = SE.getConstant(BaseDelta);
|
|
|
|
const SCEV *X = SE.getAddExpr(PtrSCEVA, C);
|
|
|
|
if (X == PtrSCEVB)
|
|
|
|
return true;
|
|
|
|
|
|
|
|
// Sometimes even this doesn't work, because SCEV can't always see through
|
|
|
|
// patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
|
|
|
|
// things the hard way.
|
|
|
|
|
|
|
|
// Look through GEPs after checking they're the same except for the last
|
|
|
|
// index.
|
|
|
|
GetElementPtrInst *GEPA = dyn_cast<GetElementPtrInst>(getPointerOperand(A));
|
|
|
|
GetElementPtrInst *GEPB = dyn_cast<GetElementPtrInst>(getPointerOperand(B));
|
|
|
|
if (!GEPA || !GEPB || GEPA->getNumOperands() != GEPB->getNumOperands())
|
|
|
|
return false;
|
|
|
|
unsigned FinalIndex = GEPA->getNumOperands() - 1;
|
|
|
|
for (unsigned i = 0; i < FinalIndex; i++)
|
|
|
|
if (GEPA->getOperand(i) != GEPB->getOperand(i))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
Instruction *OpA = dyn_cast<Instruction>(GEPA->getOperand(FinalIndex));
|
|
|
|
Instruction *OpB = dyn_cast<Instruction>(GEPB->getOperand(FinalIndex));
|
|
|
|
if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
|
|
|
|
OpA->getType() != OpB->getType())
|
|
|
|
return false;
|
|
|
|
|
|
|
|
// Only look through a ZExt/SExt.
|
|
|
|
if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
|
|
|
|
return false;
|
|
|
|
|
2016-07-01 10:16:24 +08:00
|
|
|
bool Signed = isa<SExtInst>(OpA);
|
|
|
|
|
2016-07-01 07:11:38 +08:00
|
|
|
OpA = dyn_cast<Instruction>(OpA->getOperand(0));
|
|
|
|
OpB = dyn_cast<Instruction>(OpB->getOperand(0));
|
|
|
|
if (!OpA || !OpB || OpA->getType() != OpB->getType())
|
|
|
|
return false;
|
|
|
|
|
|
|
|
// Now we need to prove that adding 1 to OpA won't overflow.
|
2016-07-01 10:16:24 +08:00
|
|
|
bool Safe = false;
|
|
|
|
// First attempt: if OpB is an add with NSW/NUW, and OpB is 1 added to OpA,
|
|
|
|
// we're okay.
|
|
|
|
if (OpB->getOpcode() == Instruction::Add &&
|
|
|
|
isa<ConstantInt>(OpB->getOperand(1)) &&
|
|
|
|
cast<ConstantInt>(OpB->getOperand(1))->getSExtValue() > 0) {
|
|
|
|
if (Signed)
|
|
|
|
Safe = cast<BinaryOperator>(OpB)->hasNoSignedWrap();
|
|
|
|
else
|
|
|
|
Safe = cast<BinaryOperator>(OpB)->hasNoUnsignedWrap();
|
|
|
|
}
|
|
|
|
|
2016-07-01 07:11:38 +08:00
|
|
|
unsigned BitWidth = OpA->getType()->getScalarSizeInBits();
|
2016-07-01 10:16:24 +08:00
|
|
|
|
|
|
|
// Second attempt:
|
2016-07-01 07:11:38 +08:00
|
|
|
// If any bits are known to be zero other than the sign bit in OpA, we can
|
|
|
|
// add 1 to it while guaranteeing no overflow of any sort.
|
2016-07-01 10:16:24 +08:00
|
|
|
if (!Safe) {
|
|
|
|
APInt KnownZero(BitWidth, 0);
|
|
|
|
APInt KnownOne(BitWidth, 0);
|
|
|
|
computeKnownBits(OpA, KnownZero, KnownOne, DL, 0, nullptr, OpA, &DT);
|
|
|
|
KnownZero &= ~APInt::getHighBitsSet(BitWidth, 1);
|
|
|
|
if (KnownZero != 0)
|
|
|
|
Safe = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!Safe)
|
2016-07-01 07:11:38 +08:00
|
|
|
return false;
|
|
|
|
|
|
|
|
const SCEV *OffsetSCEVA = SE.getSCEV(OpA);
|
|
|
|
const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
|
|
|
|
const SCEV *One = SE.getConstant(APInt(BitWidth, 1));
|
|
|
|
const SCEV *X2 = SE.getAddExpr(OffsetSCEVA, One);
|
|
|
|
return X2 == OffsetSCEVB;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Moves the transitive operand-defining instructions of \p I that do not
/// already dominate it so that they are placed before \p I.
///
/// Phase 1 walks the operands of \p I (and, transitively, of every
/// instruction found this way), collecting non-PHI instructions that do not
/// dominate \p I. Phase 2 walks the block from \p I to its end and re-inserts
/// every collected instruction immediately before \p I, which keeps their
/// relative order.
///
/// \param I the instruction whose operands must dominate it.
void Vectorizer::reorder(Instruction *I) {
  SmallPtrSet<Instruction *, 16> InstructionsToMove;
  SmallVector<Instruction *, 16> Worklist;

  Worklist.push_back(I);
  while (!Worklist.empty()) {
    Instruction *IW = Worklist.pop_back_val();
    for (unsigned Idx = 0, NumOperands = IW->getNumOperands();
         Idx != NumOperands; ++Idx) {
      Instruction *IM = dyn_cast<Instruction>(IW->getOperand(Idx));
      if (!IM || IM->getOpcode() == Instruction::PHI)
        continue;

      if (DT.dominates(IM, I))
        continue;

      assert(IM->getParent() == IW->getParent() &&
             "Instructions to move should be in the same basic block");

      // Enqueue each instruction only the first time it is discovered;
      // operands shared by several users would otherwise be re-walked once
      // per user.
      if (InstructionsToMove.insert(IM).second)
        Worklist.push_back(IM);
    }
  }

  // All instructions to move should follow I. Start from I, not from begin().
  for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;
       ++BBI) {
    // Use the set's own lookup rather than a linear scan over it.
    if (!InstructionsToMove.count(&*BBI))
      continue;

    Instruction *IM = &*BBI;
    // Step back first so the iterator stays valid once IM is unlinked; the
    // loop increment then lands on IM's former successor.
    --BBI;
    IM->removeFromParent();
    IM->insertBefore(I);
  }
}
|
|
|
|
|
|
|
|
std::pair<BasicBlock::iterator, BasicBlock::iterator>
|
|
|
|
Vectorizer::getBoundaryInstrs(ArrayRef<Value *> Chain) {
|
|
|
|
Instruction *C0 = cast<Instruction>(Chain[0]);
|
|
|
|
BasicBlock::iterator FirstInstr = C0->getIterator();
|
|
|
|
BasicBlock::iterator LastInstr = C0->getIterator();
|
|
|
|
|
|
|
|
BasicBlock *BB = C0->getParent();
|
|
|
|
unsigned NumFound = 0;
|
|
|
|
for (Instruction &I : *BB) {
|
|
|
|
if (!is_contained(Chain, &I))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
++NumFound;
|
|
|
|
if (NumFound == 1) {
|
|
|
|
FirstInstr = I.getIterator();
|
2016-07-02 05:44:12 +08:00
|
|
|
}
|
|
|
|
if (NumFound == Chain.size()) {
|
2016-07-01 07:11:38 +08:00
|
|
|
LastInstr = I.getIterator();
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-07-02 05:44:12 +08:00
|
|
|
// Range is [first, last).
|
|
|
|
return std::make_pair(FirstInstr, ++LastInstr);
|
2016-07-01 07:11:38 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Removes the now-dead scalar accesses of \p Chain from the IR.
///
/// For each chain member the instruction itself and, if its pointer operand
/// is a GEP, that GEP are queued (in this order). Queued instructions are then
/// erased one by one, but only those without remaining uses at that moment.
void Vectorizer::eraseInstructions(ArrayRef<Value *> Chain) {
  SmallVector<Instruction *, 16> Candidates;
  for (Value *Access : Chain) {
    Value *PtrOperand = getPointerOperand(Access);
    assert(PtrOperand && "Instruction must have a pointer operand.");
    Candidates.push_back(cast<Instruction>(Access));
    if (auto *GEP = dyn_cast<GetElementPtrInst>(PtrOperand))
      Candidates.push_back(GEP);
  }

  // Erase instructions.
  for (Instruction *Candidate : Candidates) {
    if (!Candidate->use_empty())
      continue;
    Candidate->eraseFromParent();
  }
}
|
|
|
|
|
|
|
|
/// Splits \p Chain into a left part whose total byte size is a multiple of 4
/// and a right part holding the remaining trailing elements.
///
/// \param Chain           the elements to split.
/// \param ElementSizeBits size of one element in bits; it is divided by 8 and
///                        the result is used as a divisor, so it must be at
///                        least 8.
/// \returns {first NumLeft elements, remaining elements}.
std::pair<ArrayRef<Value *>, ArrayRef<Value *>>
Vectorizer::splitOddVectorElts(ArrayRef<Value *> Chain,
                               unsigned ElementSizeBits) {
  const unsigned BytesPerElt = ElementSizeBits / 8;
  const unsigned TotalBytes = BytesPerElt * Chain.size();

  // Elements that make up the bytes left over beyond the last multiple of 4.
  const unsigned TailCount = (TotalBytes % 4) / BytesPerElt;
  const unsigned HeadCount = Chain.size() - TailCount;

  ArrayRef<Value *> Head = Chain.slice(0, HeadCount);
  ArrayRef<Value *> Tail = Chain.slice(HeadCount);
  return std::make_pair(Head, Tail);
}
|
|
|
|
|
2016-07-20 07:19:20 +08:00
|
|
|
/// Returns the longest prefix of \p Chain (which is in address order) that can
/// be vectorized without being affected by other memory operations located
/// between the chain's first and last instruction.
///
/// Steps:
///   1. Walk the block range covered by the chain, sorting loads/stores into
///      chain members and other memory instructions, each tagged with its
///      position. The walk stops at the first non-load/store instruction that
///      may throw or may write memory (for store chains: may read or write).
///   2. In block order, find the first chain member that may alias one of the
///      other memory instructions in a way that matters for reordering.
///   3. Return the longest prefix of \p Chain whose members all come before
///      that point.
ArrayRef<Value *> Vectorizer::getVectorizablePrefix(ArrayRef<Value *> Chain) {
  // These are in BB order, unlike Chain, which is in address order.
  SmallVector<std::pair<Value *, unsigned>, 16> MemoryInstrs;
  SmallVector<std::pair<Value *, unsigned>, 16> ChainInstrs;

  bool IsLoadChain = isa<LoadInst>(Chain[0]);
  DEBUG({
    for (Value *V : Chain) {
      if (IsLoadChain)
        assert(isa<LoadInst>(V) &&
               "All elements of Chain must be loads, or all must be stores.");
      else
        assert(isa<StoreInst>(V) &&
               "All elements of Chain must be loads, or all must be stores.");
    }
  });

  unsigned InstrIdx = 0;
  for (Instruction &I : make_range(getBoundaryInstrs(Chain))) {
    ++InstrIdx;

    if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
      if (is_contained(Chain, &I))
        ChainInstrs.push_back({&I, InstrIdx});
      else
        MemoryInstrs.push_back({&I, InstrIdx});
      continue;
    }

    if (IsLoadChain) {
      if (I.mayWriteToMemory() || I.mayThrow()) {
        DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I << '\n');
        break;
      }
    } else if (I.mayReadOrWriteMemory() || I.mayThrow()) {
      DEBUG(dbgs() << "LSV: Found may-read/write/throw operation: " << I
                   << '\n');
      break;
    }
  }

  // Loop until we find an instruction in ChainInstrs that we can't vectorize.
  unsigned ChainInstrIdx = 0;
  for (unsigned ChainInstrsLen = ChainInstrs.size();
       ChainInstrIdx < ChainInstrsLen; ++ChainInstrIdx) {
    Value *ChainInstr = ChainInstrs[ChainInstrIdx].first;
    unsigned ChainInstrLoc = ChainInstrs[ChainInstrIdx].second;
    const bool ChainIsLoad = isa<LoadInst>(ChainInstr);
    const bool ChainIsStore = isa<StoreInst>(ChainInstr);

    bool AliasFound = false;
    for (const std::pair<Value *, unsigned> &EntryMem : MemoryInstrs) {
      Value *MemInstr = EntryMem.first;
      unsigned MemInstrLoc = EntryMem.second;
      const bool MemIsLoad = isa<LoadInst>(MemInstr);
      const bool MemIsStore = isa<StoreInst>(MemInstr);

      // Two loads never conflict.
      if (MemIsLoad && ChainIsLoad)
        continue;

      // We can ignore the alias as long as the load comes before the store,
      // because that means we won't be moving the load past the store to
      // vectorize it (the vectorized load is inserted at the location of the
      // first load in the chain).
      if (MemIsStore && ChainIsLoad && ChainInstrLoc < MemInstrLoc)
        continue;

      // Same case, but in reverse.
      if (MemIsLoad && ChainIsStore && ChainInstrLoc > MemInstrLoc)
        continue;

      Instruction *M0 = cast<Instruction>(MemInstr);
      Instruction *M1 = cast<Instruction>(ChainInstr);

      if (AA.isNoAlias(MemoryLocation::get(M0), MemoryLocation::get(M1)))
        continue;

      DEBUG({
        Value *Ptr0 = getPointerOperand(M0);
        Value *Ptr1 = getPointerOperand(M1);
        dbgs() << "LSV: Found alias:\n"
                  " Aliasing instruction and pointer:\n"
               << " " << *MemInstr << '\n'
               << " " << *Ptr0 << '\n'
               << " Aliased instruction and pointer:\n"
               << " " << *ChainInstr << '\n'
               << " " << *Ptr1 << '\n';
      });
      AliasFound = true;
      break;
    }

    if (AliasFound)
      break;
  }

  // Find the largest prefix of Chain whose elements are all in
  // ChainInstrs[0, ChainInstrIdx). This is the largest vectorizable prefix of
  // Chain. (Recall that Chain is in address order, but ChainInstrs is in BB
  // order.)
  auto VectorizableChainInstrs =
      makeArrayRef(ChainInstrs.data(), ChainInstrIdx);
  unsigned ChainIdx = 0;
  for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) {
    Value *V = Chain[ChainIdx];
    const bool IsVectorizable =
        any_of(VectorizableChainInstrs,
               [V](std::pair<Value *, unsigned> CI) { return V == CI.first; });
    if (!IsVectorizable)
      break;
  }
  return Chain.slice(0, ChainIdx);
}
|
|
|
|
|
2016-07-20 07:19:16 +08:00
|
|
|
std::pair<ValueListMap, ValueListMap>
|
|
|
|
Vectorizer::collectInstructions(BasicBlock *BB) {
|
|
|
|
ValueListMap LoadRefs;
|
|
|
|
ValueListMap StoreRefs;
|
2016-07-01 07:11:38 +08:00
|
|
|
|
|
|
|
for (Instruction &I : *BB) {
|
|
|
|
if (!I.mayReadOrWriteMemory())
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
|
|
|
|
if (!LI->isSimple())
|
|
|
|
continue;
|
|
|
|
|
|
|
|
Type *Ty = LI->getType();
|
|
|
|
if (!VectorType::isValidElementType(Ty->getScalarType()))
|
|
|
|
continue;
|
|
|
|
|
2016-07-01 08:36:54 +08:00
|
|
|
// Skip weird non-byte sizes. They probably aren't worth the effort of
|
|
|
|
// handling correctly.
|
|
|
|
unsigned TySize = DL.getTypeSizeInBits(Ty);
|
|
|
|
if (TySize < 8)
|
|
|
|
continue;
|
|
|
|
|
2016-07-01 10:07:22 +08:00
|
|
|
Value *Ptr = LI->getPointerOperand();
|
|
|
|
unsigned AS = Ptr->getType()->getPointerAddressSpace();
|
|
|
|
unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
|
|
|
|
|
2016-07-01 07:11:38 +08:00
|
|
|
// No point in looking at these if they're too big to vectorize.
|
2016-07-01 08:36:54 +08:00
|
|
|
if (TySize > VecRegSize / 2)
|
2016-07-01 07:11:38 +08:00
|
|
|
continue;
|
|
|
|
|
|
|
|
// Make sure all the users of a vector are constant-index extracts.
|
2016-07-08 04:10:35 +08:00
|
|
|
if (isa<VectorType>(Ty) && !all_of(LI->users(), [LI](const User *U) {
|
2016-07-01 07:11:38 +08:00
|
|
|
const Instruction *UI = cast<Instruction>(U);
|
|
|
|
return isa<ExtractElementInst>(UI) &&
|
|
|
|
isa<ConstantInt>(UI->getOperand(1));
|
|
|
|
}))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
// TODO: Target hook to filter types.
|
|
|
|
|
|
|
|
// Save the load locations.
|
2016-07-01 10:07:22 +08:00
|
|
|
Value *ObjPtr = GetUnderlyingObject(Ptr, DL);
|
|
|
|
LoadRefs[ObjPtr].push_back(LI);
|
2016-07-01 07:11:38 +08:00
|
|
|
|
|
|
|
} else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
|
|
|
|
if (!SI->isSimple())
|
|
|
|
continue;
|
|
|
|
|
|
|
|
Type *Ty = SI->getValueOperand()->getType();
|
|
|
|
if (!VectorType::isValidElementType(Ty->getScalarType()))
|
|
|
|
continue;
|
|
|
|
|
2016-07-01 08:36:54 +08:00
|
|
|
// Skip weird non-byte sizes. They probably aren't worth the effort of
|
|
|
|
// handling correctly.
|
|
|
|
unsigned TySize = DL.getTypeSizeInBits(Ty);
|
|
|
|
if (TySize < 8)
|
|
|
|
continue;
|
|
|
|
|
2016-07-01 10:07:22 +08:00
|
|
|
Value *Ptr = SI->getPointerOperand();
|
|
|
|
unsigned AS = Ptr->getType()->getPointerAddressSpace();
|
|
|
|
unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
|
2016-07-01 08:36:54 +08:00
|
|
|
if (TySize > VecRegSize / 2)
|
2016-07-01 07:11:38 +08:00
|
|
|
continue;
|
|
|
|
|
2016-07-08 04:10:35 +08:00
|
|
|
if (isa<VectorType>(Ty) && !all_of(SI->users(), [SI](const User *U) {
|
2016-07-01 07:11:38 +08:00
|
|
|
const Instruction *UI = cast<Instruction>(U);
|
|
|
|
return isa<ExtractElementInst>(UI) &&
|
|
|
|
isa<ConstantInt>(UI->getOperand(1));
|
|
|
|
}))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
// Save store location.
|
2016-07-01 10:07:22 +08:00
|
|
|
Value *ObjPtr = GetUnderlyingObject(Ptr, DL);
|
|
|
|
StoreRefs[ObjPtr].push_back(SI);
|
2016-07-01 07:11:38 +08:00
|
|
|
}
|
|
|
|
}
|
2016-07-20 07:19:16 +08:00
|
|
|
|
|
|
|
return {LoadRefs, StoreRefs};
|
2016-07-01 07:11:38 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Runs vectorizeInstructions over every instruction list in \p Map that has
/// at least two entries, feeding it at most 64 instructions at a time.
///
/// \param Map lists of loads or lists of stores.
/// \returns true if any chunk reported a change.
bool Vectorizer::vectorizeChains(ValueListMap &Map) {
  // Maximum number of instructions handed to vectorizeInstructions at once.
  const unsigned ChunkSize = 64;
  bool AnyChange = false;

  for (const std::pair<Value *, ValueList> &Chain : Map) {
    const ValueList &Accesses = Chain.second;
    unsigned Size = Accesses.size();
    if (Size < 2)
      continue;

    DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n");

    // Process the stores in chunks of 64.
    for (unsigned Begin = 0; Begin < Size; Begin += ChunkSize) {
      unsigned Len = std::min<unsigned>(Size - Begin, ChunkSize);
      ArrayRef<Value *> Chunk(&Accesses[Begin], Len);
      if (vectorizeInstructions(Chunk))
        AnyChange = true;
    }
  }

  return AnyChange;
}
|
|
|
|
|
|
|
|
/// Finds runs of consecutive accesses among \p Instrs and hands each run to
/// vectorizeLoadChain / vectorizeStoreChain.
///
/// \param Instrs all loads or all stores; at most 64 entries, since the
///               successor table below has 64 slots.
/// \returns true if any chain was reported as vectorized.
bool Vectorizer::vectorizeInstructions(ArrayRef<Value *> Instrs) {
  DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size() << " instructions.\n");
  assert(Instrs.size() <= 64 &&
         "At most 64 instructions can be processed at once");

  SmallSetVector<int, 16> Heads, Tails;
  // Every (head, tail) pair recorded below. Heads and Tails de-duplicate
  // independently, so their positions do not line up and cannot be used to
  // recover which head led to which tail; this list keeps that pairing.
  SmallVector<std::pair<int, int>, 16> Links;
  // ConsecutiveChain[i] is the index of the access chosen to follow i, or -1.
  int ConsecutiveChain[64];

  // Do a quadratic search on all of the given stores and find all of the pairs
  // of stores that follow each other.
  for (int i = 0, e = Instrs.size(); i < e; ++i) {
    ConsecutiveChain[i] = -1;
    for (int j = e - 1; j >= 0; --j) {
      if (i == j)
        continue;

      if (!isConsecutiveAccess(Instrs[i], Instrs[j]))
        continue;

      if (ConsecutiveChain[i] != -1) {
        int CurDistance = std::abs(ConsecutiveChain[i] - i);
        int NewDistance = std::abs(ConsecutiveChain[i] - j);
        if (j < i || NewDistance > CurDistance)
          continue; // Should not insert.
      }

      Tails.insert(j);
      Heads.insert(i);
      Links.push_back(std::make_pair(i, j));
      ConsecutiveChain[i] = j;
    }
  }

  bool Changed = false;
  SmallPtrSet<Value *, 16> InstructionsProcessed;

  for (int Head : Heads) {
    if (InstructionsProcessed.count(Instrs[Head]))
      continue;

    // If some not-yet-processed access has this one as its tail, a longer
    // chain starting there exists; leave this head for that chain.
    bool LongerChainExists =
        any_of(Links, [&](const std::pair<int, int> &Link) {
          return Link.second == Head &&
                 !InstructionsProcessed.count(Instrs[Link.first]);
        });
    if (LongerChainExists)
      continue;

    // We found an instr that starts a chain. Now follow the chain and try to
    // vectorize it.
    SmallVector<Value *, 16> Operands;
    int I = Head;
    while (I != -1 && (Tails.count(I) || Heads.count(I))) {
      if (InstructionsProcessed.count(Instrs[I]))
        break;

      Operands.push_back(Instrs[I]);
      I = ConsecutiveChain[I];
    }

    bool Vectorized = false;
    if (isa<LoadInst>(*Operands.begin()))
      Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed);
    else
      Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed);

    Changed |= Vectorized;
  }

  return Changed;
}
|
|
|
|
|
2016-07-14 05:20:01 +08:00
|
|
|
bool Vectorizer::vectorizeStoreChain(
|
|
|
|
ArrayRef<Value *> Chain, SmallPtrSet<Value *, 16> *InstructionsProcessed) {
|
2016-07-01 07:11:38 +08:00
|
|
|
StoreInst *S0 = cast<StoreInst>(Chain[0]);
|
2016-07-01 08:37:01 +08:00
|
|
|
|
|
|
|
// If the vector has an int element, default to int for the whole load.
|
|
|
|
Type *StoreTy;
|
|
|
|
for (const auto &V : Chain) {
|
|
|
|
StoreTy = cast<StoreInst>(V)->getValueOperand()->getType();
|
|
|
|
if (StoreTy->isIntOrIntVectorTy())
|
|
|
|
break;
|
2016-07-01 09:55:52 +08:00
|
|
|
|
|
|
|
if (StoreTy->isPtrOrPtrVectorTy()) {
|
|
|
|
StoreTy = Type::getIntNTy(F.getParent()->getContext(),
|
|
|
|
DL.getTypeSizeInBits(StoreTy));
|
|
|
|
break;
|
|
|
|
}
|
2016-07-01 08:37:01 +08:00
|
|
|
}
|
|
|
|
|
2016-07-01 07:11:38 +08:00
|
|
|
unsigned Sz = DL.getTypeSizeInBits(StoreTy);
|
2016-07-01 10:07:22 +08:00
|
|
|
unsigned AS = S0->getPointerAddressSpace();
|
|
|
|
unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
|
2016-07-01 07:11:38 +08:00
|
|
|
unsigned VF = VecRegSize / Sz;
|
|
|
|
unsigned ChainSize = Chain.size();
|
|
|
|
|
2016-07-14 05:20:01 +08:00
|
|
|
if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
|
|
|
|
InstructionsProcessed->insert(Chain.begin(), Chain.end());
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2016-07-20 07:19:20 +08:00
|
|
|
ArrayRef<Value *> NewChain = getVectorizablePrefix(Chain);
|
|
|
|
if (NewChain.empty()) {
|
2016-07-21 04:07:34 +08:00
|
|
|
// No vectorization possible.
|
2016-07-14 05:20:01 +08:00
|
|
|
InstructionsProcessed->insert(Chain.begin(), Chain.end());
|
2016-07-01 07:11:38 +08:00
|
|
|
return false;
|
2016-07-14 05:20:01 +08:00
|
|
|
}
|
2016-07-20 07:19:20 +08:00
|
|
|
if (NewChain.size() == 1) {
|
2016-07-14 05:20:01 +08:00
|
|
|
// Failed after the first instruction. Discard it and try the smaller chain.
|
2016-07-20 07:19:20 +08:00
|
|
|
InstructionsProcessed->insert(NewChain.front());
|
2016-07-14 05:20:01 +08:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Update Chain to the valid vectorizable subchain.
|
2016-07-20 07:19:20 +08:00
|
|
|
Chain = NewChain;
|
2016-07-14 05:20:01 +08:00
|
|
|
ChainSize = Chain.size();
|
2016-07-01 07:11:38 +08:00
|
|
|
|
|
|
|
// Store size should be 1B, 2B or multiple of 4B.
|
|
|
|
// TODO: Target hook for size constraint?
|
|
|
|
unsigned SzInBytes = (Sz / 8) * ChainSize;
|
|
|
|
if (SzInBytes > 2 && SzInBytes % 4 != 0) {
|
|
|
|
DEBUG(dbgs() << "LSV: Size should be 1B, 2B "
|
|
|
|
"or multiple of 4B. Splitting.\n");
|
|
|
|
if (SzInBytes == 3)
|
2016-07-14 05:20:01 +08:00
|
|
|
return vectorizeStoreChain(Chain.slice(0, ChainSize - 1),
|
|
|
|
InstructionsProcessed);
|
2016-07-01 07:11:38 +08:00
|
|
|
|
|
|
|
auto Chains = splitOddVectorElts(Chain, Sz);
|
2016-07-14 05:20:01 +08:00
|
|
|
return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
|
|
|
|
vectorizeStoreChain(Chains.second, InstructionsProcessed);
|
2016-07-01 07:11:38 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
VectorType *VecTy;
|
|
|
|
VectorType *VecStoreTy = dyn_cast<VectorType>(StoreTy);
|
|
|
|
if (VecStoreTy)
|
|
|
|
VecTy = VectorType::get(StoreTy->getScalarType(),
|
|
|
|
Chain.size() * VecStoreTy->getNumElements());
|
|
|
|
else
|
|
|
|
VecTy = VectorType::get(StoreTy, Chain.size());
|
|
|
|
|
|
|
|
// If it's more than the max vector size, break it into two pieces.
|
|
|
|
// TODO: Target hook to control types to split to.
|
|
|
|
if (ChainSize > VF) {
|
|
|
|
DEBUG(dbgs() << "LSV: Vector factor is too big."
|
|
|
|
" Creating two separate arrays.\n");
|
2016-07-14 05:20:01 +08:00
|
|
|
return vectorizeStoreChain(Chain.slice(0, VF), InstructionsProcessed) |
|
|
|
|
vectorizeStoreChain(Chain.slice(VF), InstructionsProcessed);
|
2016-07-01 07:11:38 +08:00
|
|
|
}
|
|
|
|
|
2016-07-08 04:10:35 +08:00
|
|
|
DEBUG({
|
2016-07-01 07:11:38 +08:00
|
|
|
dbgs() << "LSV: Stores to vectorize:\n";
|
|
|
|
for (Value *V : Chain)
|
2016-07-20 07:19:18 +08:00
|
|
|
dbgs() << " " << *V << "\n";
|
2016-07-08 04:10:35 +08:00
|
|
|
});
|
2016-07-01 07:11:38 +08:00
|
|
|
|
2016-07-14 05:20:01 +08:00
|
|
|
// We won't try again to vectorize the elements of the chain, regardless of
|
|
|
|
// whether we succeed below.
|
|
|
|
InstructionsProcessed->insert(Chain.begin(), Chain.end());
|
|
|
|
|
2016-07-01 07:11:38 +08:00
|
|
|
// Check alignment restrictions.
|
2016-07-01 10:09:38 +08:00
|
|
|
unsigned Alignment = getAlignment(S0);
|
2016-07-01 07:11:38 +08:00
|
|
|
|
|
|
|
// If the store is going to be misaligned, don't vectorize it.
|
2016-07-12 04:46:17 +08:00
|
|
|
if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
|
|
|
|
if (S0->getPointerAddressSpace() != 0)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
// If we're storing to an object on the stack, we control its alignment,
|
|
|
|
// so we can cheat and change it!
|
|
|
|
Value *V = GetUnderlyingObject(S0->getPointerOperand(), DL);
|
|
|
|
if (AllocaInst *AI = dyn_cast_or_null<AllocaInst>(V)) {
|
|
|
|
AI->setAlignment(TargetBaseAlign);
|
|
|
|
Alignment = TargetBaseAlign;
|
2016-07-01 07:11:38 +08:00
|
|
|
} else {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-07-20 07:19:20 +08:00
|
|
|
BasicBlock::iterator First, Last;
|
|
|
|
std::tie(First, Last) = getBoundaryInstrs(Chain);
|
2016-07-01 07:11:38 +08:00
|
|
|
Builder.SetInsertPoint(&*Last);
|
|
|
|
|
|
|
|
Value *Vec = UndefValue::get(VecTy);
|
|
|
|
|
|
|
|
if (VecStoreTy) {
|
|
|
|
unsigned VecWidth = VecStoreTy->getNumElements();
|
|
|
|
for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
|
|
|
|
StoreInst *Store = cast<StoreInst>(Chain[I]);
|
|
|
|
for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) {
|
|
|
|
unsigned NewIdx = J + I * VecWidth;
|
|
|
|
Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(),
|
|
|
|
Builder.getInt32(J));
|
|
|
|
if (Extract->getType() != StoreTy->getScalarType())
|
|
|
|
Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType());
|
|
|
|
|
2016-07-08 04:10:35 +08:00
|
|
|
Value *Insert =
|
|
|
|
Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx));
|
2016-07-01 07:11:38 +08:00
|
|
|
Vec = Insert;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
|
|
|
|
StoreInst *Store = cast<StoreInst>(Chain[I]);
|
|
|
|
Value *Extract = Store->getValueOperand();
|
|
|
|
if (Extract->getType() != StoreTy->getScalarType())
|
2016-07-08 04:10:35 +08:00
|
|
|
Extract =
|
|
|
|
Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType());
|
2016-07-01 07:11:38 +08:00
|
|
|
|
2016-07-08 04:10:35 +08:00
|
|
|
Value *Insert =
|
|
|
|
Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I));
|
2016-07-01 07:11:38 +08:00
|
|
|
Vec = Insert;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
Value *Bitcast =
|
2016-07-08 04:10:35 +08:00
|
|
|
Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS));
|
2016-07-01 07:11:38 +08:00
|
|
|
StoreInst *SI = cast<StoreInst>(Builder.CreateStore(Vec, Bitcast));
|
|
|
|
propagateMetadata(SI, Chain);
|
|
|
|
SI->setAlignment(Alignment);
|
|
|
|
|
|
|
|
eraseInstructions(Chain);
|
|
|
|
++NumVectorInstructions;
|
|
|
|
NumScalarsVectorized += Chain.size();
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2016-07-14 05:20:01 +08:00
|
|
|
bool Vectorizer::vectorizeLoadChain(
|
|
|
|
ArrayRef<Value *> Chain, SmallPtrSet<Value *, 16> *InstructionsProcessed) {
|
2016-07-01 07:11:38 +08:00
|
|
|
LoadInst *L0 = cast<LoadInst>(Chain[0]);
|
2016-07-01 08:37:01 +08:00
|
|
|
|
|
|
|
// If the vector has an int element, default to int for the whole load.
|
|
|
|
Type *LoadTy;
|
|
|
|
for (const auto &V : Chain) {
|
|
|
|
LoadTy = cast<LoadInst>(V)->getType();
|
|
|
|
if (LoadTy->isIntOrIntVectorTy())
|
|
|
|
break;
|
2016-07-01 09:55:52 +08:00
|
|
|
|
|
|
|
if (LoadTy->isPtrOrPtrVectorTy()) {
|
|
|
|
LoadTy = Type::getIntNTy(F.getParent()->getContext(),
|
|
|
|
DL.getTypeSizeInBits(LoadTy));
|
|
|
|
break;
|
|
|
|
}
|
2016-07-01 08:37:01 +08:00
|
|
|
}
|
|
|
|
|
2016-07-01 07:11:38 +08:00
|
|
|
unsigned Sz = DL.getTypeSizeInBits(LoadTy);
|
2016-07-01 10:07:22 +08:00
|
|
|
unsigned AS = L0->getPointerAddressSpace();
|
|
|
|
unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
|
2016-07-01 07:11:38 +08:00
|
|
|
unsigned VF = VecRegSize / Sz;
|
|
|
|
unsigned ChainSize = Chain.size();
|
|
|
|
|
2016-07-14 05:20:01 +08:00
|
|
|
if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
|
|
|
|
InstructionsProcessed->insert(Chain.begin(), Chain.end());
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2016-07-20 07:19:20 +08:00
|
|
|
ArrayRef<Value *> NewChain = getVectorizablePrefix(Chain);
|
|
|
|
if (NewChain.empty()) {
|
2016-07-21 04:07:34 +08:00
|
|
|
// No vectorization possible.
|
2016-07-14 05:20:01 +08:00
|
|
|
InstructionsProcessed->insert(Chain.begin(), Chain.end());
|
|
|
|
return false;
|
|
|
|
}
|
2016-07-20 07:19:20 +08:00
|
|
|
if (NewChain.size() == 1) {
|
2016-07-14 05:20:01 +08:00
|
|
|
// Failed after the first instruction. Discard it and try the smaller chain.
|
2016-07-20 07:19:20 +08:00
|
|
|
InstructionsProcessed->insert(NewChain.front());
|
2016-07-01 07:11:38 +08:00
|
|
|
return false;
|
2016-07-14 05:20:01 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// Update Chain to the valid vectorizable subchain.
|
2016-07-20 07:19:20 +08:00
|
|
|
Chain = NewChain;
|
2016-07-14 05:20:01 +08:00
|
|
|
ChainSize = Chain.size();
|
2016-07-01 07:11:38 +08:00
|
|
|
|
|
|
|
// Load size should be 1B, 2B or multiple of 4B.
|
|
|
|
// TODO: Should size constraint be a target hook?
|
|
|
|
unsigned SzInBytes = (Sz / 8) * ChainSize;
|
|
|
|
if (SzInBytes > 2 && SzInBytes % 4 != 0) {
|
2016-07-08 04:10:35 +08:00
|
|
|
DEBUG(dbgs() << "LSV: Size should be 1B, 2B "
|
|
|
|
"or multiple of 4B. Splitting.\n");
|
2016-07-01 07:11:38 +08:00
|
|
|
if (SzInBytes == 3)
|
2016-07-14 05:20:01 +08:00
|
|
|
return vectorizeLoadChain(Chain.slice(0, ChainSize - 1),
|
|
|
|
InstructionsProcessed);
|
2016-07-01 07:11:38 +08:00
|
|
|
auto Chains = splitOddVectorElts(Chain, Sz);
|
2016-07-14 05:20:01 +08:00
|
|
|
return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
|
|
|
|
vectorizeLoadChain(Chains.second, InstructionsProcessed);
|
2016-07-01 07:11:38 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
VectorType *VecTy;
|
|
|
|
VectorType *VecLoadTy = dyn_cast<VectorType>(LoadTy);
|
|
|
|
if (VecLoadTy)
|
|
|
|
VecTy = VectorType::get(LoadTy->getScalarType(),
|
|
|
|
Chain.size() * VecLoadTy->getNumElements());
|
|
|
|
else
|
|
|
|
VecTy = VectorType::get(LoadTy, Chain.size());
|
|
|
|
|
|
|
|
// If it's more than the max vector size, break it into two pieces.
|
|
|
|
// TODO: Target hook to control types to split to.
|
|
|
|
if (ChainSize > VF) {
|
|
|
|
DEBUG(dbgs() << "LSV: Vector factor is too big. "
|
|
|
|
"Creating two separate arrays.\n");
|
2016-07-14 05:20:01 +08:00
|
|
|
return vectorizeLoadChain(Chain.slice(0, VF), InstructionsProcessed) |
|
|
|
|
vectorizeLoadChain(Chain.slice(VF), InstructionsProcessed);
|
2016-07-01 07:11:38 +08:00
|
|
|
}
|
|
|
|
|
2016-07-14 05:20:01 +08:00
|
|
|
// We won't try again to vectorize the elements of the chain, regardless of
|
|
|
|
// whether we succeed below.
|
|
|
|
InstructionsProcessed->insert(Chain.begin(), Chain.end());
|
|
|
|
|
2016-07-01 07:11:38 +08:00
|
|
|
// Check alignment restrictions.
|
2016-07-01 10:09:38 +08:00
|
|
|
unsigned Alignment = getAlignment(L0);
|
2016-07-01 07:11:38 +08:00
|
|
|
|
|
|
|
// If the load is going to be misaligned, don't vectorize it.
|
2016-07-12 04:46:17 +08:00
|
|
|
if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
|
|
|
|
if (L0->getPointerAddressSpace() != 0)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
// If we're loading from an object on the stack, we control its alignment,
|
|
|
|
// so we can cheat and change it!
|
|
|
|
Value *V = GetUnderlyingObject(L0->getPointerOperand(), DL);
|
|
|
|
if (AllocaInst *AI = dyn_cast_or_null<AllocaInst>(V)) {
|
|
|
|
AI->setAlignment(TargetBaseAlign);
|
|
|
|
Alignment = TargetBaseAlign;
|
2016-07-01 07:11:38 +08:00
|
|
|
} else {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-07-08 04:10:35 +08:00
|
|
|
DEBUG({
|
2016-07-01 07:11:38 +08:00
|
|
|
dbgs() << "LSV: Loads to vectorize:\n";
|
|
|
|
for (Value *V : Chain)
|
|
|
|
V->dump();
|
2016-07-08 04:10:35 +08:00
|
|
|
});
|
2016-07-01 07:11:38 +08:00
|
|
|
|
2016-07-20 07:19:20 +08:00
|
|
|
// getVectorizablePrefix already computed getBoundaryInstrs. The value of
|
|
|
|
// Last may have changed since then, but the value of First won't have. If it
|
|
|
|
// matters, we could compute getBoundaryInstrs only once and reuse it here.
|
|
|
|
BasicBlock::iterator First, Last;
|
|
|
|
std::tie(First, Last) = getBoundaryInstrs(Chain);
|
Correct ordering of loads/stores.
Summary:
Aiming to correct the ordering of loads/stores. This patch changes the
insert point for loads to the position of the first load.
It updates the ordering method for loads to insert before, rather than after.
Before this patch the following sequence:
"load a[1], store a[1], store a[0], load a[2]"
Would incorrectly vectorize to "store a[0,1], load a[1,2]".
The correctness check was assuming the insertion point for loads is at
the position of the first load, when in practice it was at the last
load. An alternative fix would have been to invert the correctness check.
The current fix changes insert position but also requires reordering of
instructions before the vectorized load.
Updated testcases to reflect the changes.
Reviewers: tstellarAMD, llvm-commits, jlebar, arsenm
Subscribers: mzolotukhin
Differential Revision: http://reviews.llvm.org/D22071
llvm-svn: 275117
2016-07-12 06:34:29 +08:00
|
|
|
Builder.SetInsertPoint(&*First);
|
2016-07-01 07:11:38 +08:00
|
|
|
|
|
|
|
Value *Bitcast =
|
2016-07-08 04:10:35 +08:00
|
|
|
Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
|
2016-07-01 07:11:38 +08:00
|
|
|
|
|
|
|
LoadInst *LI = cast<LoadInst>(Builder.CreateLoad(Bitcast));
|
|
|
|
propagateMetadata(LI, Chain);
|
|
|
|
LI->setAlignment(Alignment);
|
|
|
|
|
|
|
|
if (VecLoadTy) {
|
|
|
|
SmallVector<Instruction *, 16> InstrsToErase;
|
|
|
|
|
|
|
|
unsigned VecWidth = VecLoadTy->getNumElements();
|
|
|
|
for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
|
|
|
|
for (auto Use : Chain[I]->users()) {
|
|
|
|
Instruction *UI = cast<Instruction>(Use);
|
|
|
|
unsigned Idx = cast<ConstantInt>(UI->getOperand(1))->getZExtValue();
|
|
|
|
unsigned NewIdx = Idx + I * VecWidth;
|
|
|
|
Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(NewIdx));
|
|
|
|
Instruction *Extracted = cast<Instruction>(V);
|
|
|
|
if (Extracted->getType() != UI->getType())
|
2016-07-08 04:10:35 +08:00
|
|
|
Extracted = cast<Instruction>(
|
|
|
|
Builder.CreateBitCast(Extracted, UI->getType()));
|
2016-07-01 07:11:38 +08:00
|
|
|
|
|
|
|
// Replace the old instruction.
|
|
|
|
UI->replaceAllUsesWith(Extracted);
|
|
|
|
InstrsToErase.push_back(UI);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-07-28 05:45:48 +08:00
|
|
|
// Bitcast might not be an Instruction, if the value being loaded is a
|
|
|
|
// constant. In that case, no need to reorder anything.
|
|
|
|
if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
|
|
|
|
reorder(BitcastInst);
|
2016-07-01 07:11:38 +08:00
|
|
|
|
|
|
|
for (auto I : InstrsToErase)
|
|
|
|
I->eraseFromParent();
|
|
|
|
} else {
|
|
|
|
for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
|
|
|
|
Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(I));
|
|
|
|
Instruction *Extracted = cast<Instruction>(V);
|
|
|
|
Instruction *UI = cast<Instruction>(Chain[I]);
|
2016-07-01 09:55:52 +08:00
|
|
|
if (Extracted->getType() != UI->getType()) {
|
2016-07-08 04:10:35 +08:00
|
|
|
Extracted = cast<Instruction>(
|
|
|
|
Builder.CreateBitOrPointerCast(Extracted, UI->getType()));
|
2016-07-01 09:55:52 +08:00
|
|
|
}
|
2016-07-01 07:11:38 +08:00
|
|
|
|
|
|
|
// Replace the old instruction.
|
|
|
|
UI->replaceAllUsesWith(Extracted);
|
|
|
|
}
|
|
|
|
|
2016-07-28 05:45:48 +08:00
|
|
|
if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
|
|
|
|
reorder(BitcastInst);
|
2016-07-01 07:11:38 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
eraseInstructions(Chain);
|
|
|
|
|
|
|
|
++NumVectorInstructions;
|
|
|
|
NumScalarsVectorized += Chain.size();
|
|
|
|
return true;
|
|
|
|
}
|
2016-07-12 04:46:17 +08:00
|
|
|
|
|
|
|
bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
|
|
|
|
unsigned Alignment) {
|
|
|
|
bool Fast = false;
|
|
|
|
bool Allows = TTI.allowsMisalignedMemoryAccesses(SzInBytes * 8, AddressSpace,
|
|
|
|
Alignment, &Fast);
|
|
|
|
// TODO: Remove TargetBaseAlign
|
|
|
|
return !(Allows && Fast) && (Alignment % SzInBytes) != 0 &&
|
|
|
|
(Alignment % TargetBaseAlign) != 0;
|
|
|
|
}
|