2012-10-18 02:25:06 +08:00
|
|
|
//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
|
|
|
|
//
|
|
|
|
// The LLVM Compiler Infrastructure
|
|
|
|
//
|
|
|
|
// This file is distributed under the University of Illinois Open Source
|
|
|
|
// License. See LICENSE.TXT for details.
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
2013-01-07 18:44:06 +08:00
|
|
|
//
|
|
|
|
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
|
2013-05-06 11:06:36 +08:00
|
|
|
// and generates target-independent LLVM-IR.
|
|
|
|
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
|
|
|
|
// of instructions in order to estimate the profitability of vectorization.
|
2013-01-07 18:44:06 +08:00
|
|
|
//
|
2013-02-09 01:43:32 +08:00
|
|
|
// The loop vectorizer combines consecutive loop iterations into a single
|
2013-01-07 18:44:06 +08:00
|
|
|
// 'wide' iteration. After this transformation the index is incremented
|
|
|
|
// by the SIMD vector width, and not by one.
|
|
|
|
//
|
|
|
|
// This pass has four parts:
|
|
|
|
// 1. The main loop pass that drives the different parts.
|
|
|
|
// 2. LoopVectorizationLegality - A unit that checks for the legality
|
|
|
|
// of the vectorization.
|
|
|
|
// 3. InnerLoopVectorizer - A unit that performs the actual
|
|
|
|
// widening of instructions.
|
|
|
|
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
|
|
|
|
// of vectorization. It decides on the optimal vector width, which
|
|
|
|
// can be one, if vectorization is not profitable.
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
//
|
|
|
|
// The reduction-variable vectorization is based on the paper:
|
|
|
|
// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
|
|
|
|
//
|
|
|
|
// Variable uniformity checks are inspired by:
|
2013-02-08 20:58:29 +08:00
|
|
|
// Karrenberg, R. and Hack, S. Whole Function Vectorization.
|
2013-01-07 18:44:06 +08:00
|
|
|
//
|
|
|
|
// Other ideas/concepts are from:
|
|
|
|
// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
|
|
|
|
//
|
|
|
|
// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
|
|
|
|
// Vectorizing Compilers.
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
#include "llvm/Transforms/Vectorize.h"
|
|
|
|
#include "llvm/ADT/DenseMap.h"
|
2013-06-24 11:55:44 +08:00
|
|
|
#include "llvm/ADT/EquivalenceClasses.h"
|
2013-11-02 21:39:00 +08:00
|
|
|
#include "llvm/ADT/Hashing.h"
|
2013-01-07 18:44:06 +08:00
|
|
|
#include "llvm/ADT/MapVector.h"
|
2013-06-24 20:09:12 +08:00
|
|
|
#include "llvm/ADT/SetVector.h"
|
2013-01-07 18:44:06 +08:00
|
|
|
#include "llvm/ADT/SmallPtrSet.h"
|
2013-01-05 01:48:25 +08:00
|
|
|
#include "llvm/ADT/SmallSet.h"
|
2013-01-07 18:44:06 +08:00
|
|
|
#include "llvm/ADT/SmallVector.h"
|
2014-04-23 16:40:37 +08:00
|
|
|
#include "llvm/ADT/Statistic.h"
|
2012-10-18 02:25:06 +08:00
|
|
|
#include "llvm/ADT/StringExtras.h"
|
|
|
|
#include "llvm/Analysis/AliasAnalysis.h"
|
[LoopVectorize] Use AA to partition potential dependency checks
Prior to this change, the loop vectorizer did not make use of the alias
analysis infrastructure. Instead, it performed memory dependence analysis using
ScalarEvolution-based linear dependence checks within equivalence classes
derived from the results of ValueTracking's GetUnderlyingObjects.
Unfortunately, this meant that:
1. The loop vectorizer had logic that essentially duplicated that in BasicAA
for aliasing based on identified objects.
2. The loop vectorizer could not partition the space of dependency checks
based on information only easily available from within AA (TBAA metadata is
currently the prime example).
This means, for example, regardless of whether -fno-strict-aliasing was
provided, the vectorizer would only vectorize this loop with a runtime
memory-overlap check:
void foo(int *a, float *b) {
for (int i = 0; i < 1600; ++i)
a[i] = b[i];
}
This is suboptimal because the TBAA metadata already provides the information
necessary to show that this check unnecessary. Of course, the vectorizer has a
limit on the number of such checks it will insert, so in practice, ignoring
TBAA means not vectorizing more-complicated loops that we should.
This change causes the vectorizer to use an AliasSetTracker to keep track of
the pointers in the loop. The resulting alias sets are then used to partition
the space of dependency checks, and potential runtime checks; this results in
more-efficient vectorizations.
When pointer locations are added to the AliasSetTracker, two things are done:
1. The location size is set to UnknownSize (otherwise you'd not catch
inter-iteration dependencies)
2. For instructions in blocks that would need to be predicated, TBAA is
removed (because the metadata might have a control dependency on the condition
being speculated).
For non-predicated blocks, you can leave the TBAA metadata. This is safe
because you can't have an iteration dependency on the TBAA metadata (if you
did, and you unrolled sufficiently, you'd end up with the same pointer value
used by two accesses that TBAA says should not alias, and that would yield
undefined behavior).
llvm-svn: 213486
2014-07-21 07:07:52 +08:00
|
|
|
#include "llvm/Analysis/AliasSetTracker.h"
|
2015-01-04 20:03:27 +08:00
|
|
|
#include "llvm/Analysis/AssumptionCache.h"
|
2014-01-27 21:11:50 +08:00
|
|
|
#include "llvm/Analysis/BlockFrequencyInfo.h"
|
2014-10-15 06:59:49 +08:00
|
|
|
#include "llvm/Analysis/CodeMetrics.h"
|
2015-02-02 00:56:15 +08:00
|
|
|
#include "llvm/Analysis/LoopAccessAnalysis.h"
|
2012-10-18 02:25:06 +08:00
|
|
|
#include "llvm/Analysis/LoopInfo.h"
|
2012-12-04 14:15:11 +08:00
|
|
|
#include "llvm/Analysis/LoopIterator.h"
|
2012-12-04 00:50:05 +08:00
|
|
|
#include "llvm/Analysis/LoopPass.h"
|
2013-01-07 18:44:06 +08:00
|
|
|
#include "llvm/Analysis/ScalarEvolution.h"
|
2012-12-11 05:39:02 +08:00
|
|
|
#include "llvm/Analysis/ScalarEvolutionExpander.h"
|
2012-12-04 00:50:05 +08:00
|
|
|
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
|
2013-01-07 11:08:10 +08:00
|
|
|
#include "llvm/Analysis/TargetTransformInfo.h"
|
2012-10-25 04:36:32 +08:00
|
|
|
#include "llvm/Analysis/ValueTracking.h"
|
2013-01-02 19:36:10 +08:00
|
|
|
#include "llvm/IR/Constants.h"
|
|
|
|
#include "llvm/IR/DataLayout.h"
|
2014-04-07 20:32:17 +08:00
|
|
|
#include "llvm/IR/DebugInfo.h"
|
2013-01-02 19:36:10 +08:00
|
|
|
#include "llvm/IR/DerivedTypes.h"
|
2014-05-22 22:19:46 +08:00
|
|
|
#include "llvm/IR/DiagnosticInfo.h"
|
2014-01-13 17:26:24 +08:00
|
|
|
#include "llvm/IR/Dominators.h"
|
2013-01-02 19:36:10 +08:00
|
|
|
#include "llvm/IR/Function.h"
|
2013-01-07 18:44:06 +08:00
|
|
|
#include "llvm/IR/IRBuilder.h"
|
2013-01-02 19:36:10 +08:00
|
|
|
#include "llvm/IR/Instructions.h"
|
|
|
|
#include "llvm/IR/IntrinsicInst.h"
|
|
|
|
#include "llvm/IR/LLVMContext.h"
|
|
|
|
#include "llvm/IR/Module.h"
|
2014-03-04 19:08:18 +08:00
|
|
|
#include "llvm/IR/PatternMatch.h"
|
2013-01-02 19:36:10 +08:00
|
|
|
#include "llvm/IR/Type.h"
|
|
|
|
#include "llvm/IR/Value.h"
|
2014-03-04 19:17:44 +08:00
|
|
|
#include "llvm/IR/ValueHandle.h"
|
2014-01-13 17:26:24 +08:00
|
|
|
#include "llvm/IR/Verifier.h"
|
2012-12-04 00:50:05 +08:00
|
|
|
#include "llvm/Pass.h"
|
2014-01-27 21:11:50 +08:00
|
|
|
#include "llvm/Support/BranchProbability.h"
|
2012-10-18 02:25:06 +08:00
|
|
|
#include "llvm/Support/CommandLine.h"
|
|
|
|
#include "llvm/Support/Debug.h"
|
2014-01-07 19:48:04 +08:00
|
|
|
#include "llvm/Support/raw_ostream.h"
|
2012-12-04 00:50:05 +08:00
|
|
|
#include "llvm/Transforms/Scalar.h"
|
|
|
|
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
|
2012-10-18 02:25:06 +08:00
|
|
|
#include "llvm/Transforms/Utils/Local.h"
|
2014-04-09 22:20:47 +08:00
|
|
|
#include "llvm/Transforms/Utils/VectorUtils.h"
|
2015-03-27 11:44:15 +08:00
|
|
|
#include "llvm/Transforms/Utils/LoopUtils.h"
|
2013-01-07 18:44:06 +08:00
|
|
|
#include <algorithm>
|
|
|
|
#include <map>
|
2014-04-30 15:21:01 +08:00
|
|
|
#include <tuple>
|
2013-01-07 18:44:06 +08:00
|
|
|
|
|
|
|
using namespace llvm;
|
2013-04-20 05:03:36 +08:00
|
|
|
using namespace llvm::PatternMatch;
|
2012-10-18 02:25:06 +08:00
|
|
|
|
2014-04-22 10:55:47 +08:00
|
|
|
#define LV_NAME "loop-vectorize"
|
|
|
|
#define DEBUG_TYPE LV_NAME
|
|
|
|
|
2014-04-23 16:40:37 +08:00
|
|
|
STATISTIC(LoopsVectorized, "Number of loops vectorized");
|
|
|
|
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
|
|
|
|
|
2012-12-04 05:06:35 +08:00
|
|
|
static cl::opt<bool>
|
2012-12-21 12:47:54 +08:00
|
|
|
EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
|
2012-12-04 05:06:35 +08:00
|
|
|
cl::desc("Enable if-conversion during vectorization."));
|
|
|
|
|
2013-01-07 18:44:06 +08:00
|
|
|
/// We don't vectorize loops with a known constant trip count below this number.
|
2013-01-30 05:42:08 +08:00
|
|
|
static cl::opt<unsigned>
|
2013-02-08 06:34:07 +08:00
|
|
|
TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16),
|
|
|
|
cl::Hidden,
|
|
|
|
cl::desc("Don't vectorize loops with a constant "
|
|
|
|
"trip count that is smaller than this "
|
|
|
|
"value."));
|
2013-01-08 05:54:51 +08:00
|
|
|
|
2014-01-11 02:20:32 +08:00
|
|
|
/// This enables versioning on the strides of symbolically striding memory
|
|
|
|
/// accesses in code like the following.
|
|
|
|
/// for (i = 0; i < N; ++i)
|
|
|
|
/// A[i * Stride1] += B[i * Stride2] ...
|
|
|
|
///
|
|
|
|
/// Will be roughly translated to
|
|
|
|
/// if (Stride1 == 1 && Stride2 == 1) {
|
|
|
|
/// for (i = 0; i < N; i+=4)
|
|
|
|
/// A[i:i+3] += ...
|
|
|
|
/// } else
|
|
|
|
/// ...
|
|
|
|
/// Controlled by the hidden -enable-mem-access-versioning option, which is on
/// by default.
static cl::opt<bool> EnableMemAccessVersioning(
    "enable-mem-access-versioning", cl::init(true), cl::Hidden,
    cl::desc("Enable symbolic stride memory access versioning"));
|
|
|
|
|
2013-01-08 05:54:51 +08:00
|
|
|
/// We don't unroll loops with a known constant trip count below this number.
|
|
|
|
static const unsigned TinyTripCountUnrollThreshold = 128;
|
|
|
|
|
2014-01-27 19:12:19 +08:00
|
|
|
static cl::opt<unsigned> ForceTargetNumScalarRegs(
|
|
|
|
"force-target-num-scalar-regs", cl::init(0), cl::Hidden,
|
|
|
|
cl::desc("A flag that overrides the target's number of scalar registers."));
|
|
|
|
|
|
|
|
static cl::opt<unsigned> ForceTargetNumVectorRegs(
|
|
|
|
"force-target-num-vector-regs", cl::init(0), cl::Hidden,
|
|
|
|
cl::desc("A flag that overrides the target's number of vector registers."));
|
|
|
|
|
2014-09-11 01:58:16 +08:00
|
|
|
/// Maximum vectorization interleave count.
|
|
|
|
static const unsigned MaxInterleaveFactor = 16;
|
2013-03-02 09:33:49 +08:00
|
|
|
|
2014-09-11 01:58:16 +08:00
|
|
|
static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
|
|
|
|
"force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
|
|
|
|
cl::desc("A flag that overrides the target's max interleave factor for "
|
|
|
|
"scalar loops."));
|
2014-01-27 19:12:19 +08:00
|
|
|
|
2014-09-11 01:58:16 +08:00
|
|
|
static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
|
|
|
|
"force-target-max-vector-interleave", cl::init(0), cl::Hidden,
|
|
|
|
cl::desc("A flag that overrides the target's max interleave factor for "
|
2014-01-27 19:12:19 +08:00
|
|
|
"vectorized loops."));
|
|
|
|
|
2014-01-27 19:41:50 +08:00
|
|
|
static cl::opt<unsigned> ForceTargetInstructionCost(
|
|
|
|
"force-target-instruction-cost", cl::init(0), cl::Hidden,
|
|
|
|
cl::desc("A flag that overrides the target's expected cost for "
|
|
|
|
"an instruction to a single constant value. Mostly "
|
|
|
|
"useful for getting consistent testing."));
|
|
|
|
|
2014-01-27 19:12:19 +08:00
|
|
|
static cl::opt<unsigned> SmallLoopCost(
|
|
|
|
"small-loop-cost", cl::init(20), cl::Hidden,
|
|
|
|
cl::desc("The cost of a loop that is considered 'small' by the unroller."));
|
2013-08-27 06:33:26 +08:00
|
|
|
|
2014-01-28 17:10:41 +08:00
|
|
|
static cl::opt<bool> LoopVectorizeWithBlockFrequency(
|
|
|
|
"loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden,
|
|
|
|
cl::desc("Enable the use of the block frequency analysis to access PGO "
|
|
|
|
"heuristics minimizing code growth in cold regions and being more "
|
|
|
|
"aggressive in hot regions."));
|
|
|
|
|
2014-01-28 09:01:53 +08:00
|
|
|
// Runtime unroll loops for load/store throughput.
|
|
|
|
static cl::opt<bool> EnableLoadStoreRuntimeUnroll(
|
2014-02-02 11:12:34 +08:00
|
|
|
"enable-loadstore-runtime-unroll", cl::init(true), cl::Hidden,
|
2014-01-28 09:01:53 +08:00
|
|
|
cl::desc("Enable runtime unrolling until load/store ports are saturated"));
|
|
|
|
|
|
|
|
/// The number of stores in a loop that are allowed to need predication.
|
|
|
|
static cl::opt<unsigned> NumberOfStoresToPredicate(
|
2014-02-02 11:12:34 +08:00
|
|
|
"vectorize-num-stores-pred", cl::init(1), cl::Hidden,
|
2014-01-28 09:01:53 +08:00
|
|
|
cl::desc("Max number of stores to be predicated behind an if."));
|
|
|
|
|
2014-01-29 12:36:12 +08:00
|
|
|
static cl::opt<bool> EnableIndVarRegisterHeur(
|
2014-02-02 11:12:34 +08:00
|
|
|
"enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
|
2014-01-29 12:36:12 +08:00
|
|
|
cl::desc("Count the induction variable only once when unrolling"));
|
|
|
|
|
2014-01-28 09:01:53 +08:00
|
|
|
static cl::opt<bool> EnableCondStoresVectorization(
|
|
|
|
"enable-cond-stores-vec", cl::init(false), cl::Hidden,
|
|
|
|
cl::desc("Enable if predication of stores during vectorization."));
|
|
|
|
|
2014-08-21 07:53:52 +08:00
|
|
|
static cl::opt<unsigned> MaxNestedScalarReductionUF(
|
|
|
|
"max-nested-scalar-reduction-unroll", cl::init(2), cl::Hidden,
|
|
|
|
cl::desc("The maximum unroll factor to use when unrolling a scalar "
|
|
|
|
"reduction in a nested loop."));
|
|
|
|
|
2012-10-18 02:25:06 +08:00
|
|
|
namespace {
|
|
|
|
|
2013-01-07 18:44:06 +08:00
|
|
|
// Forward declarations.
|
|
|
|
class LoopVectorizationLegality;
|
|
|
|
class LoopVectorizationCostModel;
|
2014-08-02 08:14:03 +08:00
|
|
|
class LoopVectorizeHints;
|
2013-01-07 18:44:06 +08:00
|
|
|
|
2015-02-20 03:15:15 +08:00
|
|
|
/// \brief This modifies LoopAccessReport to initialize message with
|
|
|
|
/// loop-vectorizer-specific part.
|
|
|
|
class VectorizationReport : public LoopAccessReport {
|
|
|
|
public:
|
|
|
|
VectorizationReport(Instruction *I = nullptr)
|
|
|
|
: LoopAccessReport("loop not vectorized: ", I) {}
|
|
|
|
|
|
|
|
/// \brief This allows promotion of the loop-access analysis report into the
|
|
|
|
/// loop-vectorizer report. It modifies the message to add the
|
|
|
|
/// loop-vectorizer-specific part of the message.
|
|
|
|
explicit VectorizationReport(const LoopAccessReport &R)
|
|
|
|
: LoopAccessReport(Twine("loop not vectorized: ") + R.str(),
|
|
|
|
R.getInstr()) {}
|
|
|
|
};
|
|
|
|
|
2015-03-03 04:43:24 +08:00
|
|
|
/// A helper function for converting Scalar types to vector types.
|
|
|
|
/// If the incoming type is void, we return void. If the VF is 1, we return
|
|
|
|
/// the scalar type.
|
|
|
|
static Type* ToVectorTy(Type *Scalar, unsigned VF) {
|
|
|
|
if (Scalar->isVoidTy() || VF == 1)
|
|
|
|
return Scalar;
|
|
|
|
return VectorType::get(Scalar, VF);
|
|
|
|
}
|
|
|
|
|
2013-01-07 18:44:06 +08:00
|
|
|
/// InnerLoopVectorizer vectorizes loops which contain only one basic
|
|
|
|
/// block to a specified vectorization factor (VF).
|
|
|
|
/// This class performs the widening of scalars into vectors, or multiple
|
|
|
|
/// scalars. This class also implements the following features:
|
|
|
|
/// * It inserts an epilogue loop for handling loops that don't have iteration
|
|
|
|
/// counts that are known to be a multiple of the vectorization factor.
|
|
|
|
/// * It handles the code generation for reduction variables.
|
|
|
|
/// * Scalarization (implementation using scalars) of un-vectorizable
|
|
|
|
/// instructions.
|
|
|
|
/// InnerLoopVectorizer does not perform any vectorization-legality
|
|
|
|
/// checks, and relies on the caller to check for the different legality
|
|
|
|
/// aspects. The InnerLoopVectorizer relies on the
|
|
|
|
/// LoopVectorizationLegality class to provide information about the induction
|
|
|
|
/// and reduction variables that were found for a given vectorization factor.
|
|
|
|
class InnerLoopVectorizer {
|
|
|
|
public:
|
|
|
|
/// Set up a vectorizer for \p OrigLoop using a vector width of \p VecWidth
/// and an unroll factor of \p UnrollFactor.
///
/// \p SE is dereferenced here to obtain the LLVMContext for the IR builder,
/// so it must not be null. The remaining analysis pointers are only stored.
///
/// Every pointer member is given a defined value. Members that have no
/// constructor argument (the alias analysis handle, the blocks of the
/// not-yet-created loop skeleton and the extended start index) start out as
/// nullptr instead of being left indeterminate.
InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
                    DominatorTree *DT, const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, unsigned VecWidth,
                    unsigned UnrollFactor)
    // The initializers are listed in member declaration order.
    : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), AA(nullptr), TLI(TLI),
      TTI(TTI), VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()),
      LoopVectorPreHeader(nullptr), LoopScalarPreHeader(nullptr),
      LoopMiddleBlock(nullptr), LoopExitBlock(nullptr),
      LoopScalarBody(nullptr), Induction(nullptr), OldInduction(nullptr),
      ExtendedIdx(nullptr), WidenMap(UnrollFactor), Legal(nullptr),
      AddedSafetyChecks(false) {}
|
2013-01-07 18:44:06 +08:00
|
|
|
|
|
|
|
// Perform the actual loop widening (vectorization).
|
2014-01-11 02:20:32 +08:00
|
|
|
void vectorize(LoopVectorizationLegality *L) {
|
|
|
|
Legal = L;
|
2013-01-07 18:44:06 +08:00
|
|
|
// Create a new empty loop. Unlink the old loop and connect the new one.
|
2014-01-11 02:20:32 +08:00
|
|
|
createEmptyLoop();
|
2013-01-07 18:44:06 +08:00
|
|
|
// Widen each instruction in the old loop to a new one in the new loop.
|
|
|
|
// Use the Legality module to find the induction and reduction variables.
|
2014-01-11 02:20:32 +08:00
|
|
|
vectorizeLoop();
|
2013-01-07 18:44:06 +08:00
|
|
|
// Register the new loop and update the analysis passes.
|
|
|
|
updateAnalysis();
|
|
|
|
}
|
|
|
|
|
2015-03-09 14:14:18 +08:00
|
|
|
// Return true if any runtime check is added.
|
|
|
|
bool IsSafetyChecksAdded() {
|
|
|
|
return AddedSafetyChecks;
|
|
|
|
}
|
|
|
|
|
2013-08-27 06:33:26 +08:00
|
|
|
virtual ~InnerLoopVectorizer() {}
|
|
|
|
|
|
|
|
protected:
|
2013-01-07 18:44:06 +08:00
|
|
|
/// A small list of PHINodes.
|
|
|
|
typedef SmallVector<PHINode*, 4> PhiVector;
|
|
|
|
/// When we unroll loops we have multiple vector values for each scalar.
|
|
|
|
/// This data structure holds the unrolled and vectorized values that
|
|
|
|
/// originated from one scalar instruction.
|
|
|
|
typedef SmallVector<Value*, 2> VectorParts;
|
|
|
|
|
2015-04-24 08:10:27 +08:00
|
|
|
// When we if-convert we need to create edge masks. We have to cache values
|
|
|
|
// so that we don't end up with exponential recursion/IR.
|
2013-06-28 04:31:06 +08:00
|
|
|
typedef DenseMap<std::pair<BasicBlock*, BasicBlock*>,
|
|
|
|
VectorParts> EdgeMaskCache;
|
|
|
|
|
2015-04-24 08:10:27 +08:00
|
|
|
/// \brief Add checks for strides that were assumed to be 1.
|
2014-01-11 02:20:32 +08:00
|
|
|
///
|
|
|
|
/// Returns the first check instruction and the last check instruction in the
|
|
|
|
/// pair as (first, last).
|
|
|
|
std::pair<Instruction *, Instruction *> addStrideCheck(Instruction *Loc);
|
|
|
|
|
2013-01-07 18:44:06 +08:00
|
|
|
/// Create an empty loop, based on the loop ranges of the old loop.
|
2014-01-11 02:20:32 +08:00
|
|
|
void createEmptyLoop();
|
2013-01-07 18:44:06 +08:00
|
|
|
/// Copy and widen the instructions from the old loop.
|
2014-01-11 02:20:32 +08:00
|
|
|
virtual void vectorizeLoop();
|
2013-08-27 06:33:26 +08:00
|
|
|
|
|
|
|
/// \brief The Loop exit block may have single value PHI nodes where the
|
|
|
|
/// incoming value is 'Undef'. While vectorizing we only handled real values
|
|
|
|
/// that were defined inside the loop. Here we fix the 'undef case'.
|
|
|
|
/// See PR14725.
|
|
|
|
void fixLCSSAPHIs();
|
2013-01-07 18:44:06 +08:00
|
|
|
|
|
|
|
/// A helper function that computes the predicate of the block BB, assuming
|
|
|
|
/// that the header block of the loop is set to True. It returns the *entry*
|
|
|
|
/// mask for the block BB.
|
|
|
|
VectorParts createBlockInMask(BasicBlock *BB);
|
|
|
|
/// A helper function that computes the predicate of the edge between SRC
|
|
|
|
/// and DST.
|
|
|
|
VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
|
|
|
|
|
|
|
|
/// A helper function to vectorize a single BB within the innermost loop.
|
2014-01-11 02:20:32 +08:00
|
|
|
void vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV);
|
2013-01-07 18:44:06 +08:00
|
|
|
|
2013-08-27 06:33:26 +08:00
|
|
|
/// Vectorize a single PHINode in a block. This method handles the induction
|
|
|
|
/// variable canonicalization. It supports both VF = 1 for unrolled loops and
|
|
|
|
/// arbitrary length vectors.
|
|
|
|
void widenPHIInstruction(Instruction *PN, VectorParts &Entry,
|
|
|
|
unsigned UF, unsigned VF, PhiVector *PV);
|
|
|
|
|
2013-01-07 18:44:06 +08:00
|
|
|
/// Insert the new loop to the loop hierarchy and pass manager
|
|
|
|
/// and update the analysis passes.
|
|
|
|
void updateAnalysis();
|
|
|
|
|
|
|
|
/// This instruction is un-vectorizable. Implement it as a sequence
|
2014-01-28 09:01:53 +08:00
|
|
|
/// of scalars. If \p IfPredicateStore is true we need to 'hide' each
|
|
|
|
/// scalarized instruction behind an if block predicated on the control
|
|
|
|
/// dependence of the instruction.
|
|
|
|
virtual void scalarizeInstruction(Instruction *Instr,
|
|
|
|
bool IfPredicateStore=false);
|
2013-01-07 18:44:06 +08:00
|
|
|
|
2013-01-26 05:47:42 +08:00
|
|
|
/// Vectorize Load and Store instructions,
|
2014-01-11 02:20:32 +08:00
|
|
|
virtual void vectorizeMemoryInstruction(Instruction *Instr);
|
2013-01-26 05:47:42 +08:00
|
|
|
|
2013-01-07 18:44:06 +08:00
|
|
|
/// Create a broadcast instruction. This method generates a broadcast
|
|
|
|
/// instruction (shuffle) for loop invariant values and for the induction
|
|
|
|
/// value. If this is the induction variable then we extend it to N, N+1, ...
|
|
|
|
/// this is needed because each iteration in the loop corresponds to a SIMD
|
|
|
|
/// element.
|
2013-08-27 06:33:26 +08:00
|
|
|
virtual Value *getBroadcastInstrs(Value *V);
|
2013-01-07 18:44:06 +08:00
|
|
|
|
2015-01-30 13:02:21 +08:00
|
|
|
/// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
|
|
|
|
/// to each vector element of Val. The sequence starts at StartIdx.
|
|
|
|
virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step);
|
2013-01-07 18:44:06 +08:00
|
|
|
|
|
|
|
/// When we go over instructions in the basic block we rely on previous
|
|
|
|
/// values within the current basic block or on loop invariant values.
|
|
|
|
/// When we widen (vectorize) values we place them in the map. If the values
|
|
|
|
/// are not within the map, they have to be loop invariant, so we simply
|
|
|
|
/// broadcast them into a vector.
|
|
|
|
VectorParts &getVectorValue(Value *V);
|
|
|
|
|
|
|
|
/// Generate a shuffle sequence that will reverse the vector Vec.
|
2013-08-27 06:33:26 +08:00
|
|
|
virtual Value *reverseVector(Value *Vec);
|
2013-01-07 18:44:06 +08:00
|
|
|
|
|
|
|
/// This is a helper class that holds the vectorizer state. It maps scalar
|
|
|
|
/// instructions to vector instructions. When the code is 'unrolled' then
|
|
|
|
/// a single scalar value is mapped to multiple vector parts. The parts
|
|
|
|
/// are stored in the VectorPart type.
|
|
|
|
struct ValueMap {
|
|
|
|
/// C'tor. UnrollFactor controls the number of vectors ('parts') that
|
|
|
|
/// are mapped.
|
|
|
|
ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {}
|
|
|
|
|
|
|
|
/// \return True if 'Key' is saved in the Value Map.
|
2013-01-30 01:31:33 +08:00
|
|
|
bool has(Value *Key) const { return MapStorage.count(Key); }
|
2013-01-07 18:44:06 +08:00
|
|
|
|
|
|
|
/// Initializes a new entry in the map. Sets all of the vector parts to the
|
|
|
|
/// save value in 'Val'.
|
|
|
|
/// \return A reference to a vector with splat values.
|
|
|
|
VectorParts &splat(Value *Key, Value *Val) {
|
2013-01-30 01:31:33 +08:00
|
|
|
VectorParts &Entry = MapStorage[Key];
|
|
|
|
Entry.assign(UF, Val);
|
|
|
|
return Entry;
|
2013-01-07 18:44:06 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
///\return A reference to the value that is stored at 'Key'.
|
|
|
|
VectorParts &get(Value *Key) {
|
2013-01-30 01:31:33 +08:00
|
|
|
VectorParts &Entry = MapStorage[Key];
|
|
|
|
if (Entry.empty())
|
|
|
|
Entry.resize(UF);
|
|
|
|
assert(Entry.size() == UF);
|
|
|
|
return Entry;
|
2013-01-07 18:44:06 +08:00
|
|
|
}
|
|
|
|
|
2013-01-30 01:31:33 +08:00
|
|
|
private:
|
2013-01-07 18:44:06 +08:00
|
|
|
/// The unroll factor. Each entry in the map stores this number of vector
|
|
|
|
/// elements.
|
|
|
|
unsigned UF;
|
|
|
|
|
|
|
|
/// Map storage. We use std::map and not DenseMap because insertions to a
|
|
|
|
/// dense map invalidates its iterators.
|
2013-01-30 01:31:33 +08:00
|
|
|
std::map<Value *, VectorParts> MapStorage;
|
2013-01-07 18:44:06 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
/// The original loop.
|
|
|
|
Loop *OrigLoop;
|
|
|
|
/// Scev analysis to use.
|
|
|
|
ScalarEvolution *SE;
|
|
|
|
/// Loop Info.
|
|
|
|
LoopInfo *LI;
|
|
|
|
/// Dominator Tree.
|
|
|
|
DominatorTree *DT;
|
[LoopVectorize] Use AA to partition potential dependency checks
Prior to this change, the loop vectorizer did not make use of the alias
analysis infrastructure. Instead, it performed memory dependence analysis using
ScalarEvolution-based linear dependence checks within equivalence classes
derived from the results of ValueTracking's GetUnderlyingObjects.
Unfortunately, this meant that:
1. The loop vectorizer had logic that essentially duplicated that in BasicAA
for aliasing based on identified objects.
2. The loop vectorizer could not partition the space of dependency checks
based on information only easily available from within AA (TBAA metadata is
currently the prime example).
This means, for example, regardless of whether -fno-strict-aliasing was
provided, the vectorizer would only vectorize this loop with a runtime
memory-overlap check:
void foo(int *a, float *b) {
for (int i = 0; i < 1600; ++i)
a[i] = b[i];
}
This is suboptimal because the TBAA metadata already provides the information
necessary to show that this check unnecessary. Of course, the vectorizer has a
limit on the number of such checks it will insert, so in practice, ignoring
TBAA means not vectorizing more-complicated loops that we should.
This change causes the vectorizer to use an AliasSetTracker to keep track of
the pointers in the loop. The resulting alias sets are then used to partition
the space of dependency checks, and potential runtime checks; this results in
more-efficient vectorizations.
When pointer locations are added to the AliasSetTracker, two things are done:
1. The location size is set to UnknownSize (otherwise you'd not catch
inter-iteration dependencies)
2. For instructions in blocks that would need to be predicated, TBAA is
removed (because the metadata might have a control dependency on the condition
being speculated).
For non-predicated blocks, you can leave the TBAA metadata. This is safe
because you can't have an iteration dependency on the TBAA metadata (if you
did, and you unrolled sufficiently, you'd end up with the same pointer value
used by two accesses that TBAA says should not alias, and that would yield
undefined behavior).
llvm-svn: 213486
2014-07-21 07:07:52 +08:00
|
|
|
/// Alias Analysis.
|
|
|
|
AliasAnalysis *AA;
|
2013-02-27 23:24:19 +08:00
|
|
|
/// Target Library Info.
|
|
|
|
const TargetLibraryInfo *TLI;
|
2015-03-18 03:17:18 +08:00
|
|
|
/// Target Transform Info.
|
|
|
|
const TargetTransformInfo *TTI;
|
2013-02-27 23:24:19 +08:00
|
|
|
|
2013-01-07 18:44:06 +08:00
|
|
|
/// The vectorization SIMD factor to use. Each vector will have this many
|
|
|
|
/// vector elements.
|
|
|
|
unsigned VF;
|
2013-08-27 06:33:26 +08:00
|
|
|
|
|
|
|
protected:
|
2013-01-07 18:44:06 +08:00
|
|
|
/// The vectorization unroll factor to use. Each scalar is vectorized to this
|
|
|
|
/// many different vector instructions.
|
|
|
|
unsigned UF;
|
|
|
|
|
|
|
|
/// The builder that we use
|
|
|
|
IRBuilder<> Builder;
|
|
|
|
|
|
|
|
// --- Vectorization state ---
|
|
|
|
|
|
|
|
/// The vector-loop preheader.
|
|
|
|
BasicBlock *LoopVectorPreHeader;
|
|
|
|
/// The scalar-loop preheader.
|
|
|
|
BasicBlock *LoopScalarPreHeader;
|
|
|
|
/// Middle Block between the vector and the scalar.
|
|
|
|
BasicBlock *LoopMiddleBlock;
|
|
|
|
///The ExitBlock of the scalar loop.
|
|
|
|
BasicBlock *LoopExitBlock;
|
|
|
|
///The vector loop body.
|
2014-01-28 09:01:53 +08:00
|
|
|
SmallVector<BasicBlock *, 4> LoopVectorBody;
|
2013-01-07 18:44:06 +08:00
|
|
|
///The scalar loop body.
|
|
|
|
BasicBlock *LoopScalarBody;
|
2013-01-19 21:57:58 +08:00
|
|
|
/// A list of all bypass blocks. The first block is the entry of the loop.
|
|
|
|
SmallVector<BasicBlock *, 4> LoopBypassBlocks;
|
2013-01-07 18:44:06 +08:00
|
|
|
|
|
|
|
/// The new Induction variable which was added to the new block.
|
|
|
|
PHINode *Induction;
|
|
|
|
/// The induction variable of the old basic block.
|
|
|
|
PHINode *OldInduction;
|
2013-05-12 07:04:28 +08:00
|
|
|
/// Holds the extended (to the widest induction type) start index.
|
|
|
|
Value *ExtendedIdx;
|
2013-01-07 18:44:06 +08:00
|
|
|
/// Maps scalars to widened vectors.
|
|
|
|
ValueMap WidenMap;
|
2013-06-28 04:31:06 +08:00
|
|
|
EdgeMaskCache MaskCache;
|
2014-01-11 02:20:32 +08:00
|
|
|
|
|
|
|
LoopVectorizationLegality *Legal;
|
2015-03-09 14:14:18 +08:00
|
|
|
|
|
|
|
// Record whether runtime check is added.
|
|
|
|
bool AddedSafetyChecks;
|
2013-01-07 18:44:06 +08:00
|
|
|
};
|
|
|
|
|
2013-08-27 06:33:26 +08:00
|
|
|
class InnerLoopUnroller : public InnerLoopVectorizer {
|
|
|
|
public:
|
|
|
|
InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
|
2015-03-10 10:37:25 +08:00
|
|
|
DominatorTree *DT, const TargetLibraryInfo *TLI,
|
2015-03-18 03:17:18 +08:00
|
|
|
const TargetTransformInfo *TTI, unsigned UnrollFactor)
|
|
|
|
: InnerLoopVectorizer(OrigLoop, SE, LI, DT, TLI, TTI, 1, UnrollFactor) {}
|
2013-08-27 06:33:26 +08:00
|
|
|
|
|
|
|
private:
|
2014-03-05 17:10:37 +08:00
|
|
|
void scalarizeInstruction(Instruction *Instr,
|
|
|
|
bool IfPredicateStore = false) override;
|
|
|
|
void vectorizeMemoryInstruction(Instruction *Instr) override;
|
|
|
|
Value *getBroadcastInstrs(Value *V) override;
|
2015-01-30 13:02:21 +08:00
|
|
|
Value *getStepVector(Value *Val, int StartIdx, Value *Step) override;
|
2014-03-05 17:10:37 +08:00
|
|
|
Value *reverseVector(Value *Vec) override;
|
2013-08-27 06:33:26 +08:00
|
|
|
};
|
|
|
|
|
2013-06-28 08:38:54 +08:00
|
|
|
/// \brief Look for a meaningful debug location on the instruction or it's
|
|
|
|
/// operands.
|
|
|
|
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
|
|
|
|
if (!I)
|
|
|
|
return I;
|
|
|
|
|
|
|
|
DebugLoc Empty;
|
|
|
|
if (I->getDebugLoc() != Empty)
|
|
|
|
return I;
|
|
|
|
|
|
|
|
for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
|
|
|
|
if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
|
|
|
|
if (OpInst->getDebugLoc() != Empty)
|
|
|
|
return OpInst;
|
|
|
|
}
|
|
|
|
|
|
|
|
return I;
|
|
|
|
}
|
|
|
|
|
2013-06-29 00:26:54 +08:00
|
|
|
/// \brief Set the debug location in the builder using the debug location in the
|
|
|
|
/// instruction.
|
2013-06-29 01:14:48 +08:00
|
|
|
static void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
|
|
|
|
if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr))
|
2013-06-29 00:26:54 +08:00
|
|
|
B.SetCurrentDebugLocation(Inst->getDebugLoc());
|
|
|
|
else
|
|
|
|
B.SetCurrentDebugLocation(DebugLoc());
|
|
|
|
}
|
2014-04-07 20:46:30 +08:00
|
|
|
|
|
|
|
#ifndef NDEBUG
|
2014-05-20 16:26:20 +08:00
|
|
|
/// \return string containing a file name and a line # for the given loop.
|
|
|
|
static std::string getDebugLocString(const Loop *L) {
|
2014-06-27 06:52:05 +08:00
|
|
|
std::string Result;
|
|
|
|
if (L) {
|
|
|
|
raw_string_ostream OS(Result);
|
2015-03-31 03:49:49 +08:00
|
|
|
if (const DebugLoc LoopDbgLoc = L->getStartLoc())
|
2015-02-27 07:32:17 +08:00
|
|
|
LoopDbgLoc.print(OS);
|
2014-06-27 06:52:05 +08:00
|
|
|
else
|
|
|
|
// Just print the module name.
|
|
|
|
OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
|
|
|
|
OS.flush();
|
|
|
|
}
|
|
|
|
return Result;
|
2014-04-07 20:32:17 +08:00
|
|
|
}
|
2014-04-07 20:46:30 +08:00
|
|
|
#endif
|
|
|
|
|
2014-07-19 21:33:16 +08:00
|
|
|
/// \brief Propagate known metadata from one instruction to another.
|
|
|
|
static void propagateMetadata(Instruction *To, const Instruction *From) {
|
2014-11-12 05:30:22 +08:00
|
|
|
SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
|
2014-07-19 21:33:16 +08:00
|
|
|
From->getAllMetadataOtherThanDebugLoc(Metadata);
|
|
|
|
|
|
|
|
for (auto M : Metadata) {
|
|
|
|
unsigned Kind = M.first;
|
|
|
|
|
|
|
|
// These are safe to transfer (this is safe for TBAA, even when we
|
|
|
|
// if-convert, because should that metadata have had a control dependency
|
|
|
|
// on the condition, and thus actually aliased with some other
|
|
|
|
// non-speculated memory access when the condition was false, this would be
|
|
|
|
// caught by the runtime overlap checks).
|
|
|
|
if (Kind != LLVMContext::MD_tbaa &&
|
Add scoped-noalias metadata
This commit adds scoped noalias metadata. The primary motivations for this
feature are:
1. To preserve noalias function attribute information when inlining
2. To provide the ability to model block-scope C99 restrict pointers
Neither of these two abilities are added here, only the necessary
infrastructure. In fact, there should be no change to existing functionality,
only the addition of new features. The logic that converts noalias function
parameters into this metadata during inlining will come in a follow-up commit.
What is added here is the ability to generally specify noalias memory-access
sets. Regarding the metadata, alias-analysis scopes are defined similar to TBAA
nodes:
!scope0 = metadata !{ metadata !"scope of foo()" }
!scope1 = metadata !{ metadata !"scope 1", metadata !scope0 }
!scope2 = metadata !{ metadata !"scope 2", metadata !scope0 }
!scope3 = metadata !{ metadata !"scope 2.1", metadata !scope2 }
!scope4 = metadata !{ metadata !"scope 2.2", metadata !scope2 }
Loads and stores can be tagged with an alias-analysis scope, and also, with a
noalias tag for a specific scope:
... = load %ptr1, !alias.scope !{ !scope1 }
... = load %ptr2, !alias.scope !{ !scope1, !scope2 }, !noalias !{ !scope1 }
When evaluating an aliasing query, if one of the instructions is associated
with an alias.scope id that is identical to the noalias scope associated with
the other instruction, or is a descendant (in the scope hierarchy) of the
noalias scope associated with the other instruction, then the two memory
accesses are assumed not to alias.
Note that is the first element of the scope metadata is a string, then it can
be combined accross functions and translation units. The string can be replaced
by a self-reference to create globally unqiue scope identifiers.
[Note: This overview is slightly stylized, since the metadata nodes really need
to just be numbers (!0 instead of !scope0), and the scope lists are also global
unnamed metadata.]
Existing noalias metadata in a callee is "cloned" for use by the inlined code.
This is necessary because the aliasing scopes are unique to each call site
(because of possible control dependencies on the aliasing properties). For
example, consider a function: foo(noalias a, noalias b) { *a = *b; } that gets
inlined into bar() { ... if (...) foo(a1, b1); ... if (...) foo(a2, b2); } --
now just because we know that a1 does not alias with b1 at the first call site,
and a2 does not alias with b2 at the second call site, we cannot let inlining
these functons have the metadata imply that a1 does not alias with b2.
llvm-svn: 213864
2014-07-24 22:25:39 +08:00
|
|
|
Kind != LLVMContext::MD_alias_scope &&
|
|
|
|
Kind != LLVMContext::MD_noalias &&
|
2014-07-19 21:33:16 +08:00
|
|
|
Kind != LLVMContext::MD_fpmath)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
To->setMetadata(Kind, M.second);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// \brief Propagate known metadata from one instruction to a vector of others.
|
|
|
|
static void propagateMetadata(SmallVectorImpl<Value *> &To, const Instruction *From) {
|
|
|
|
for (Value *V : To)
|
|
|
|
if (Instruction *I = dyn_cast<Instruction>(V))
|
|
|
|
propagateMetadata(I, From);
|
|
|
|
}
|
|
|
|
|
2013-01-07 18:44:06 +08:00
|
|
|
/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
|
|
|
|
/// to what vectorization factor.
|
|
|
|
/// This class does not look at the profitability of vectorization, only the
|
|
|
|
/// legality. This class has two main kinds of checks:
|
|
|
|
/// * Memory checks - The code in canVectorizeMemory checks if vectorization
|
|
|
|
/// will change the order of memory accesses in a way that will change the
|
|
|
|
/// correctness of the program.
|
|
|
|
/// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory
|
|
|
|
/// checks for a number of different conditions, such as the availability of a
|
|
|
|
/// single induction variable, that all types are supported and vectorize-able,
|
|
|
|
/// etc. This code reflects the capabilities of InnerLoopVectorizer.
|
|
|
|
/// This class is also used by InnerLoopVectorizer for identifying
|
|
|
|
/// induction variable and the different reduction variables.
|
|
|
|
class LoopVectorizationLegality {
|
|
|
|
public:
|
2015-03-10 10:37:25 +08:00
|
|
|
LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DominatorTree *DT,
|
|
|
|
TargetLibraryInfo *TLI, AliasAnalysis *AA,
|
|
|
|
Function *F, const TargetTransformInfo *TTI,
|
2015-02-20 03:15:04 +08:00
|
|
|
LoopAccessAnalysis *LAA)
|
2015-03-10 10:37:25 +08:00
|
|
|
: NumPredStores(0), TheLoop(L), SE(SE), TLI(TLI), TheFunction(F),
|
|
|
|
TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), Induction(nullptr),
|
|
|
|
WidestIndTy(nullptr), HasFunNoNaNAttr(false) {}
|
2013-01-07 18:44:06 +08:00
|
|
|
|
|
|
|
/// This enum represents the kinds of inductions that we support.
|
|
|
|
enum InductionKind {
|
2015-01-30 13:02:21 +08:00
|
|
|
IK_NoInduction, ///< Not an induction variable.
|
|
|
|
IK_IntInduction, ///< Integer induction variable. Step = C.
|
|
|
|
IK_PtrInduction ///< Pointer induction var. Step = C / sizeof(elem).
|
2013-01-07 18:44:06 +08:00
|
|
|
};
|
|
|
|
|
2013-10-16 00:19:54 +08:00
|
|
|
/// A struct for saving information about induction variables.
|
2013-01-07 18:44:06 +08:00
|
|
|
struct InductionInfo {
|
2015-01-30 13:02:21 +08:00
|
|
|
InductionInfo(Value *Start, InductionKind K, ConstantInt *Step)
|
|
|
|
: StartValue(Start), IK(K), StepValue(Step) {
|
|
|
|
assert(IK != IK_NoInduction && "Not an induction");
|
|
|
|
assert(StartValue && "StartValue is null");
|
|
|
|
assert(StepValue && !StepValue->isZero() && "StepValue is zero");
|
|
|
|
assert((IK != IK_PtrInduction || StartValue->getType()->isPointerTy()) &&
|
|
|
|
"StartValue is not a pointer for pointer induction");
|
|
|
|
assert((IK != IK_IntInduction || StartValue->getType()->isIntegerTy()) &&
|
|
|
|
"StartValue is not an integer for integer induction");
|
|
|
|
assert(StepValue->getType()->isIntegerTy() &&
|
|
|
|
"StepValue is not an integer");
|
|
|
|
}
|
|
|
|
InductionInfo()
|
|
|
|
: StartValue(nullptr), IK(IK_NoInduction), StepValue(nullptr) {}
|
|
|
|
|
|
|
|
/// Get the consecutive direction. Returns:
|
|
|
|
/// 0 - unknown or non-consecutive.
|
|
|
|
/// 1 - consecutive and increasing.
|
|
|
|
/// -1 - consecutive and decreasing.
|
|
|
|
int getConsecutiveDirection() const {
|
|
|
|
if (StepValue && (StepValue->isOne() || StepValue->isMinusOne()))
|
|
|
|
return StepValue->getSExtValue();
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Compute the transformed value of Index at offset StartValue using step
|
|
|
|
/// StepValue.
|
|
|
|
/// For integer induction, returns StartValue + Index * StepValue.
|
|
|
|
/// For pointer induction, returns StartValue[Index * StepValue].
|
|
|
|
/// FIXME: The newly created binary instructions should contain nsw/nuw
|
|
|
|
/// flags, which can be found from the original scalar operations.
|
|
|
|
Value *transform(IRBuilder<> &B, Value *Index) const {
|
|
|
|
switch (IK) {
|
|
|
|
case IK_IntInduction:
|
|
|
|
assert(Index->getType() == StartValue->getType() &&
|
|
|
|
"Index type does not match StartValue type");
|
|
|
|
if (StepValue->isMinusOne())
|
|
|
|
return B.CreateSub(StartValue, Index);
|
|
|
|
if (!StepValue->isOne())
|
|
|
|
Index = B.CreateMul(Index, StepValue);
|
|
|
|
return B.CreateAdd(StartValue, Index);
|
|
|
|
|
|
|
|
case IK_PtrInduction:
|
|
|
|
if (StepValue->isMinusOne())
|
|
|
|
Index = B.CreateNeg(Index);
|
|
|
|
else if (!StepValue->isOne())
|
|
|
|
Index = B.CreateMul(Index, StepValue);
|
2015-04-04 03:41:44 +08:00
|
|
|
return B.CreateGEP(nullptr, StartValue, Index);
|
2015-01-30 13:02:21 +08:00
|
|
|
|
|
|
|
case IK_NoInduction:
|
|
|
|
return nullptr;
|
|
|
|
}
|
2015-01-31 05:30:57 +08:00
|
|
|
llvm_unreachable("invalid enum");
|
2015-01-30 13:02:21 +08:00
|
|
|
}
|
|
|
|
|
2013-01-07 18:44:06 +08:00
|
|
|
/// Start value.
|
2013-05-23 00:54:56 +08:00
|
|
|
TrackingVH<Value> StartValue;
|
2013-01-07 18:44:06 +08:00
|
|
|
/// Induction kind.
|
|
|
|
InductionKind IK;
|
2015-01-30 13:02:21 +08:00
|
|
|
/// Step value.
|
|
|
|
ConstantInt *StepValue;
|
2013-01-07 18:44:06 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
/// ReductionList contains the reduction descriptors for all
|
|
|
|
/// of the reductions that were found in the loop.
|
|
|
|
typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;
|
|
|
|
|
|
|
|
/// InductionList saves induction variables and maps them to the
|
|
|
|
/// induction descriptor.
|
|
|
|
typedef MapVector<PHINode*, InductionInfo> InductionList;
|
|
|
|
|
|
|
|
/// Returns true if it is legal to vectorize this loop.
|
|
|
|
/// This does not mean that it is profitable to vectorize this
|
|
|
|
/// loop, only that it is legal to do so.
|
|
|
|
bool canVectorize();
|
|
|
|
|
|
|
|
/// Returns the Induction variable.
|
|
|
|
PHINode *getInduction() { return Induction; }
|
|
|
|
|
|
|
|
/// Returns the reduction variables found in the loop.
|
|
|
|
ReductionList *getReductionVars() { return &Reductions; }
|
|
|
|
|
|
|
|
/// Returns the induction variables found in the loop.
|
|
|
|
InductionList *getInductionVars() { return &Inductions; }
|
|
|
|
|
2013-05-12 07:04:28 +08:00
|
|
|
/// Returns the widest induction type.
|
|
|
|
Type *getWidestInductionType() { return WidestIndTy; }
|
|
|
|
|
2013-01-07 18:44:06 +08:00
|
|
|
/// Returns True if V is an induction variable in this loop.
|
|
|
|
bool isInductionVariable(const Value *V);
|
|
|
|
|
|
|
|
/// Return true if the block BB needs to be predicated in order for the loop
|
|
|
|
/// to be vectorized.
|
|
|
|
bool blockNeedsPredication(BasicBlock *BB);
|
|
|
|
|
|
|
|
/// Check if this pointer is consecutive when vectorizing. This happens
|
|
|
|
/// when the last index of the GEP is the induction variable, or that the
|
|
|
|
/// pointer itself is an induction variable.
|
|
|
|
/// This check allows us to vectorize A[idx] into a wide load/store.
|
|
|
|
/// Returns:
|
2013-12-05 13:44:44 +08:00
|
|
|
/// 0 - Stride is unknown or non-consecutive.
|
2013-01-07 18:44:06 +08:00
|
|
|
/// 1 - Address is consecutive.
|
|
|
|
/// -1 - Address is consecutive, and decreasing.
|
|
|
|
int isConsecutivePtr(Value *Ptr);
|
|
|
|
|
|
|
|
/// Returns true if the value V is uniform within the loop.
|
|
|
|
bool isUniform(Value *V);
|
|
|
|
|
|
|
|
/// Returns true if this instruction will remain scalar after vectorization.
|
|
|
|
bool isUniformAfterVectorization(Instruction* I) { return Uniforms.count(I); }
|
|
|
|
|
|
|
|
/// Returns the information that we collected about runtime memory check.
|
2015-02-20 03:15:21 +08:00
|
|
|
const LoopAccessInfo::RuntimePointerCheck *getRuntimePointerCheck() const {
|
2015-02-20 03:15:04 +08:00
|
|
|
return LAI->getRuntimePointerCheck();
|
2015-02-02 00:56:04 +08:00
|
|
|
}
|
2013-04-20 05:03:36 +08:00
|
|
|
|
2015-02-20 03:15:21 +08:00
|
|
|
const LoopAccessInfo *getLAI() const {
|
2015-02-20 03:15:04 +08:00
|
|
|
return LAI;
|
2015-02-20 03:14:34 +08:00
|
|
|
}
|
2015-02-07 02:31:04 +08:00
|
|
|
|
2015-02-20 03:15:04 +08:00
|
|
|
unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); }
|
2013-06-24 20:09:15 +08:00
|
|
|
|
2014-01-11 02:20:32 +08:00
|
|
|
bool hasStride(Value *V) { return StrideSet.count(V); }
|
|
|
|
bool mustCheckStrides() { return !StrideSet.empty(); }
|
|
|
|
SmallPtrSet<Value *, 8>::iterator strides_begin() {
|
|
|
|
return StrideSet.begin();
|
|
|
|
}
|
|
|
|
SmallPtrSet<Value *, 8>::iterator strides_end() { return StrideSet.end(); }
|
|
|
|
|
2014-12-16 19:50:42 +08:00
|
|
|
/// Returns true if the target machine supports masked store operation
|
|
|
|
/// for the given \p DataType and kind of access to \p Ptr.
|
|
|
|
bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
|
|
|
|
return TTI->isLegalMaskedStore(DataType, isConsecutivePtr(Ptr));
|
|
|
|
}
|
|
|
|
/// Returns true if the target machine supports masked load operation
|
|
|
|
/// for the given \p DataType and kind of access to \p Ptr.
|
|
|
|
bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
|
|
|
|
return TTI->isLegalMaskedLoad(DataType, isConsecutivePtr(Ptr));
|
|
|
|
}
|
|
|
|
/// Returns true if vector representation of the instruction \p I
|
|
|
|
/// requires mask.
|
2015-02-20 03:14:34 +08:00
|
|
|
bool isMaskRequired(const Instruction* I) {
|
|
|
|
return (MaskedOp.count(I) != 0);
|
|
|
|
}
|
|
|
|
unsigned getNumStores() const {
|
2015-02-20 03:15:04 +08:00
|
|
|
return LAI->getNumStores();
|
2015-02-20 03:14:34 +08:00
|
|
|
}
|
|
|
|
unsigned getNumLoads() const {
|
2015-02-20 03:15:04 +08:00
|
|
|
return LAI->getNumLoads();
|
2015-02-20 03:14:34 +08:00
|
|
|
}
|
|
|
|
unsigned getNumPredStores() const {
|
|
|
|
return NumPredStores;
|
|
|
|
}
|
2013-01-07 18:44:06 +08:00
|
|
|
private:
|
|
|
|
/// Check if a single basic block loop is vectorizable.
|
|
|
|
/// At this point we know that this is a loop with a constant trip count
|
|
|
|
/// and we only need to check individual instructions.
|
|
|
|
bool canVectorizeInstrs();
|
|
|
|
|
|
|
|
/// When we vectorize loops we may change the order in which
|
|
|
|
/// we read and write from memory. This method checks if it is
|
|
|
|
/// legal to vectorize the code, considering only memory constrains.
|
|
|
|
/// Returns true if the loop is vectorizable
|
|
|
|
bool canVectorizeMemory();
|
|
|
|
|
|
|
|
/// Return true if we can vectorize this loop using the IF-conversion
|
|
|
|
/// transformation.
|
|
|
|
bool canVectorizeWithIfConvert();
|
|
|
|
|
|
|
|
/// Collect the variables that need to stay uniform after vectorization.
|
|
|
|
void collectLoopUniforms();
|
|
|
|
|
|
|
|
/// Return true if all of the instructions in the block can be speculatively
|
2013-06-29 04:46:27 +08:00
|
|
|
/// executed. \p SafePtrs is a list of addresses that are known to be legal
|
|
|
|
/// and we know that we can read from them without segfault.
|
2014-08-21 13:55:13 +08:00
|
|
|
bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs);
|
2013-01-07 18:44:06 +08:00
|
|
|
|
2015-01-30 13:02:21 +08:00
|
|
|
/// Returns the induction kind of Phi and record the step. This function may
|
|
|
|
/// return NoInduction if the PHI is not an induction variable.
|
|
|
|
InductionKind isInductionVariable(PHINode *Phi, ConstantInt *&StepValue);
|
2013-01-07 18:44:06 +08:00
|
|
|
|
2014-01-11 02:20:32 +08:00
|
|
|
/// \brief Collect memory access with loop invariant strides.
|
|
|
|
///
|
|
|
|
/// Looks for accesses like "a[i * StrideA]" where "StrideA" is loop
|
|
|
|
/// invariant.
|
2015-01-14 11:02:16 +08:00
|
|
|
void collectStridedAccess(Value *LoadOrStoreInst);
|
2014-01-11 02:20:32 +08:00
|
|
|
|
2014-06-26 01:50:15 +08:00
|
|
|
/// Report an analysis message to assist the user in diagnosing loops that are
|
2015-02-20 03:15:15 +08:00
|
|
|
/// not vectorized. These are handled as LoopAccessReport rather than
|
|
|
|
/// VectorizationReport because the << operator of VectorizationReport returns
|
|
|
|
/// LoopAccessReport.
|
|
|
|
void emitAnalysis(const LoopAccessReport &Message) {
|
|
|
|
LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, LV_NAME);
|
2014-06-26 01:50:15 +08:00
|
|
|
}
|
|
|
|
|
2015-02-02 00:56:02 +08:00
|
|
|
unsigned NumPredStores;
|
|
|
|
|
2013-01-07 18:44:06 +08:00
|
|
|
/// The loop that we evaluate.
|
|
|
|
Loop *TheLoop;
|
|
|
|
/// Scev analysis.
|
|
|
|
ScalarEvolution *SE;
|
2013-02-27 23:24:19 +08:00
|
|
|
/// Target Library Info.
|
|
|
|
TargetLibraryInfo *TLI;
|
2014-06-26 01:50:15 +08:00
|
|
|
/// Parent function
|
|
|
|
Function *TheFunction;
|
2014-12-16 19:50:42 +08:00
|
|
|
/// Target Transform Info
|
|
|
|
const TargetTransformInfo *TTI;
|
2015-02-18 11:43:19 +08:00
|
|
|
/// Dominator Tree.
|
|
|
|
DominatorTree *DT;
|
2015-02-20 03:15:04 +08:00
|
|
|
// LoopAccess analysis.
|
|
|
|
LoopAccessAnalysis *LAA;
|
|
|
|
// And the loop-accesses info corresponding to this loop. This pointer is
|
|
|
|
// null until canVectorizeMemory sets it up.
|
2015-02-20 03:15:21 +08:00
|
|
|
const LoopAccessInfo *LAI;
|
2013-01-07 18:44:06 +08:00
|
|
|
|
|
|
|
// --- vectorization state --- //
|
|
|
|
|
|
|
|
/// Holds the integer induction variable. This is the counter of the
|
|
|
|
/// loop.
|
|
|
|
PHINode *Induction;
|
|
|
|
/// Holds the reduction variables.
|
|
|
|
ReductionList Reductions;
|
|
|
|
/// Holds all of the induction variables that we found in the loop.
|
|
|
|
/// Notice that inductions don't need to start at zero and that induction
|
|
|
|
/// variables can be pointers.
|
|
|
|
InductionList Inductions;
|
2013-05-12 07:04:28 +08:00
|
|
|
/// Holds the widest induction type encountered.
|
|
|
|
Type *WidestIndTy;
|
2013-01-07 18:44:06 +08:00
|
|
|
|
|
|
|
/// Allowed outside users. This holds the reduction
|
|
|
|
/// vars which can be accessed from outside the loop.
|
|
|
|
SmallPtrSet<Value*, 4> AllowedExit;
|
|
|
|
/// This set holds the variables which are known to be uniform after
|
|
|
|
/// vectorization.
|
2015-02-20 03:14:34 +08:00
|
|
|
SmallPtrSet<Instruction*, 4> Uniforms;
|
2015-02-20 03:15:04 +08:00
|
|
|
|
2013-05-05 09:54:48 +08:00
|
|
|
/// Can we assume the absence of NaNs.
|
|
|
|
bool HasFunNoNaNAttr;
|
2013-05-15 09:44:30 +08:00
|
|
|
|
2014-01-11 02:20:32 +08:00
|
|
|
ValueToValueMap Strides;
|
|
|
|
SmallPtrSet<Value *, 8> StrideSet;
|
2015-03-09 14:14:18 +08:00
|
|
|
|
2014-12-16 19:50:42 +08:00
|
|
|
/// While vectorizing these instructions we have to generate a
|
|
|
|
/// call to the appropriate masked intrinsic
|
|
|
|
SmallPtrSet<const Instruction*, 8> MaskedOp;
|
2013-01-07 18:44:06 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
/// LoopVectorizationCostModel - estimates the expected speedups due to
|
|
|
|
/// vectorization.
|
|
|
|
/// In many cases vectorization is not profitable. This can happen because of
|
|
|
|
/// a number of reasons. In this class we mainly attempt to predict the
|
|
|
|
/// expected speedup/slowdowns due to the supported instruction set. We use the
|
|
|
|
/// TargetTransformInfo to query the different backends for the cost of
|
|
|
|
/// different operations.
|
|
|
|
class LoopVectorizationCostModel {
|
|
|
|
public:
|
|
|
|
LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI,
|
|
|
|
LoopVectorizationLegality *Legal,
|
2013-02-05 23:08:02 +08:00
|
|
|
const TargetTransformInfo &TTI,
|
2015-03-10 10:37:25 +08:00
|
|
|
const TargetLibraryInfo *TLI, AssumptionCache *AC,
|
|
|
|
const Function *F, const LoopVectorizeHints *Hints)
|
|
|
|
: TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI),
|
2014-10-15 07:58:51 +08:00
|
|
|
TheFunction(F), Hints(Hints) {
|
2015-01-04 20:03:27 +08:00
|
|
|
CodeMetrics::collectEphemeralValues(L, AC, EphValues);
|
2014-10-15 06:59:49 +08:00
|
|
|
}
|
2013-01-07 18:44:06 +08:00
|
|
|
|
2013-01-29 00:02:45 +08:00
|
|
|
/// Information about vectorization costs
|
|
|
|
struct VectorizationFactor {
|
|
|
|
unsigned Width; // Vector width with best cost
|
|
|
|
unsigned Cost; // Cost of the loop with that width
|
|
|
|
};
|
2013-01-20 13:24:29 +08:00
|
|
|
/// \return The most profitable vectorization factor and the cost of that VF.
|
2013-01-07 18:44:06 +08:00
|
|
|
/// This method checks every power of two up to VF. If UserVF is not ZERO
|
|
|
|
/// then this vectorization factor will be selected if vectorization is
|
|
|
|
/// possible.
|
2014-08-02 08:14:03 +08:00
|
|
|
VectorizationFactor selectVectorizationFactor(bool OptForSize);
|
2013-01-07 18:44:06 +08:00
|
|
|
|
2013-01-29 00:02:45 +08:00
|
|
|
/// \return The size (in bits) of the widest type in the code that
|
2013-01-10 06:29:00 +08:00
|
|
|
/// needs to be vectorized. We ignore values that remain scalar such as
|
|
|
|
/// 64 bit loop indices.
|
|
|
|
unsigned getWidestType();
|
2013-01-07 18:44:06 +08:00
|
|
|
|
|
|
|
/// \return The most profitable unroll factor.
|
|
|
|
/// If UserUF is non-zero then this method finds the best unroll-factor
|
|
|
|
/// based on register pressure and other parameters.
|
2013-01-20 13:24:29 +08:00
|
|
|
/// VF and LoopCost are the selected vectorization factor and the cost of the
|
|
|
|
/// selected VF.
|
2014-08-02 08:14:03 +08:00
|
|
|
unsigned selectUnrollFactor(bool OptForSize, unsigned VF, unsigned LoopCost);
|
2013-01-07 18:44:06 +08:00
|
|
|
|
|
|
|
/// \brief A struct that represents some properties of the register usage
|
|
|
|
/// of a loop.
|
|
|
|
struct RegisterUsage {
|
|
|
|
/// Holds the number of loop invariant values that are used in the loop.
|
|
|
|
unsigned LoopInvariantRegs;
|
|
|
|
/// Holds the maximum number of concurrent live intervals in the loop.
|
|
|
|
unsigned MaxLocalUsers;
|
|
|
|
/// Holds the number of instructions in the loop.
|
|
|
|
unsigned NumInstructions;
|
|
|
|
};
|
|
|
|
|
|
|
|
/// \return information about the register usage of the loop.
|
|
|
|
RegisterUsage calculateRegisterUsage();
|
|
|
|
|
|
|
|
private:
|
|
|
|
/// Returns the expected execution cost. The unit of the cost does
|
|
|
|
/// not matter because we use the 'cost' units to compare different
|
|
|
|
/// vector widths. The cost that is returned is *not* normalized by
|
|
|
|
/// the factor width.
|
|
|
|
unsigned expectedCost(unsigned VF);
|
|
|
|
|
|
|
|
/// Returns the execution time cost of an instruction for a given vector
|
|
|
|
/// width. Vector width of one means scalar.
|
|
|
|
unsigned getInstructionCost(Instruction *I, unsigned VF);
|
|
|
|
|
2013-02-05 23:08:02 +08:00
|
|
|
/// Returns whether the instruction is a load or store and will be a emitted
|
|
|
|
/// as a vector operation.
|
|
|
|
bool isConsecutiveLoadOrStore(Instruction *I);
|
|
|
|
|
2014-08-02 08:14:03 +08:00
|
|
|
/// Report an analysis message to assist the user in diagnosing loops that are
|
2015-02-20 03:15:15 +08:00
|
|
|
/// not vectorized. These are handled as LoopAccessReport rather than
|
|
|
|
/// VectorizationReport because the << operator of VectorizationReport returns
|
|
|
|
/// LoopAccessReport.
|
|
|
|
void emitAnalysis(const LoopAccessReport &Message) {
|
|
|
|
LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, LV_NAME);
|
2014-08-02 08:14:03 +08:00
|
|
|
}
|
|
|
|
|
2014-10-15 06:59:49 +08:00
|
|
|
/// Values used only by @llvm.assume calls.
|
|
|
|
SmallPtrSet<const Value *, 32> EphValues;
|
|
|
|
|
2013-01-07 18:44:06 +08:00
|
|
|
/// The loop that we evaluate.
|
|
|
|
Loop *TheLoop;
|
|
|
|
/// Scev analysis.
|
|
|
|
ScalarEvolution *SE;
|
|
|
|
/// Loop Info analysis.
|
|
|
|
LoopInfo *LI;
|
|
|
|
/// Vectorization legality.
|
|
|
|
LoopVectorizationLegality *Legal;
|
|
|
|
/// Vector target information.
|
2013-01-07 19:12:29 +08:00
|
|
|
const TargetTransformInfo &TTI;
|
2013-02-27 23:24:19 +08:00
|
|
|
/// Target Library Info.
|
|
|
|
const TargetLibraryInfo *TLI;
|
2014-08-02 08:14:03 +08:00
|
|
|
const Function *TheFunction;
|
|
|
|
// Loop Vectorize Hint.
|
|
|
|
const LoopVectorizeHints *Hints;
|
2013-01-07 18:44:06 +08:00
|
|
|
};
|
|
|
|
|
2013-05-29 04:00:34 +08:00
|
|
|
/// Utility class for getting and setting loop vectorizer hints in the form
|
|
|
|
/// of loop metadata.
|
Small refactor on VectorizerHint for deduplication
Previously, the hint mechanism relied on clean up passes to remove redundant
metadata, which still showed up if running opt at low levels of optimization.
That also has shown that multiple nodes of the same type, but with different
values could still coexist, even if temporary, and cause confusion if the
next pass got the wrong value.
This patch makes sure that, if metadata already exists in a loop, the hint
mechanism will never append a new node, but always replace the existing one.
It also enhances the algorithm to cope with more metadata types in the future
by just adding a new type, not a lot of code.
Re-applying again due to MSVC 2013 being minimum requirement, and this patch
having C++11 that MSVC 2012 didn't support.
Fixes PR20655.
llvm-svn: 216870
2014-09-01 18:00:17 +08:00
|
|
|
/// This class keeps a number of loop annotations locally (as member variables)
|
|
|
|
/// and can, upon request, write them back as metadata on the loop. It will
|
|
|
|
/// initially scan the loop for existing metadata, and will update the local
|
|
|
|
/// values based on information in the loop.
|
|
|
|
/// We cannot write all values to metadata, as the mere presence of some info,
|
|
|
|
/// for example 'force', means a decision has been made. So, we need to be
|
|
|
|
/// careful NOT to add them if the user hasn't specifically asked so.
|
2014-05-03 04:40:04 +08:00
|
|
|
class LoopVectorizeHints {
|
Small refactor on VectorizerHint for deduplication
Previously, the hint mechanism relied on clean up passes to remove redundant
metadata, which still showed up if running opt at low levels of optimization.
That also has shown that multiple nodes of the same type, but with different
values could still coexist, even if temporary, and cause confusion if the
next pass got the wrong value.
This patch makes sure that, if metadata already exists in a loop, the hint
mechanism will never append a new node, but always replace the existing one.
It also enhances the algorithm to cope with more metadata types in the future
by just adding a new type, not a lot of code.
Re-applying again due to MSVC 2013 being minimum requirement, and this patch
having C++11 that MSVC 2012 didn't support.
Fixes PR20655.
llvm-svn: 216870
2014-09-01 18:00:17 +08:00
|
|
|
enum HintKind {
|
|
|
|
HK_WIDTH,
|
|
|
|
HK_UNROLL,
|
|
|
|
HK_FORCE
|
|
|
|
};
|
|
|
|
|
|
|
|
/// Hint - associates name and validation with the hint value.
|
|
|
|
struct Hint {
|
|
|
|
const char * Name;
|
|
|
|
unsigned Value; // This may have to change for non-numeric values.
|
|
|
|
HintKind Kind;
|
|
|
|
|
|
|
|
Hint(const char * Name, unsigned Value, HintKind Kind)
|
|
|
|
: Name(Name), Value(Value), Kind(Kind) { }
|
|
|
|
|
|
|
|
bool validate(unsigned Val) {
|
|
|
|
switch (Kind) {
|
|
|
|
case HK_WIDTH:
|
2015-02-20 03:14:52 +08:00
|
|
|
return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
|
Small refactor on VectorizerHint for deduplication
Previously, the hint mechanism relied on clean up passes to remove redundant
metadata, which still showed up if running opt at low levels of optimization.
That also has shown that multiple nodes of the same type, but with different
values could still coexist, even if temporary, and cause confusion if the
next pass got the wrong value.
This patch makes sure that, if metadata already exists in a loop, the hint
mechanism will never append a new node, but always replace the existing one.
It also enhances the algorithm to cope with more metadata types in the future
by just adding a new type, not a lot of code.
Re-applying again due to MSVC 2013 being minimum requirement, and this patch
having C++11 that MSVC 2012 didn't support.
Fixes PR20655.
llvm-svn: 216870
2014-09-01 18:00:17 +08:00
|
|
|
case HK_UNROLL:
|
2014-09-11 01:58:16 +08:00
|
|
|
return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
|
Small refactor on VectorizerHint for deduplication
Previously, the hint mechanism relied on clean up passes to remove redundant
metadata, which still showed up if running opt at low levels of optimization.
That also has shown that multiple nodes of the same type, but with different
values could still coexist, even if temporary, and cause confusion if the
next pass got the wrong value.
This patch makes sure that, if metadata already exists in a loop, the hint
mechanism will never append a new node, but always replace the existing one.
It also enhances the algorithm to cope with more metadata types in the future
by just adding a new type, not a lot of code.
Re-applying again due to MSVC 2013 being minimum requirement, and this patch
having C++11 that MSVC 2012 didn't support.
Fixes PR20655.
llvm-svn: 216870
2014-09-01 18:00:17 +08:00
|
|
|
case HK_FORCE:
|
|
|
|
return (Val <= 1);
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
/// Vectorization width.
|
|
|
|
Hint Width;
|
2014-09-11 01:58:16 +08:00
|
|
|
/// Vectorization interleave factor.
|
|
|
|
Hint Interleave;
|
Small refactor on VectorizerHint for deduplication
Previously, the hint mechanism relied on clean up passes to remove redundant
metadata, which still showed up if running opt at low levels of optimization.
That also has shown that multiple nodes of the same type, but with different
values could still coexist, even if temporary, and cause confusion if the
next pass got the wrong value.
This patch makes sure that, if metadata already exists in a loop, the hint
mechanism will never append a new node, but always replace the existing one.
It also enhances the algorithm to cope with more metadata types in the future
by just adding a new type, not a lot of code.
Re-applying again due to MSVC 2013 being minimum requirement, and this patch
having C++11 that MSVC 2012 didn't support.
Fixes PR20655.
llvm-svn: 216870
2014-09-01 18:00:17 +08:00
|
|
|
/// Vectorization forced
|
|
|
|
Hint Force;
|
|
|
|
|
|
|
|
/// Return the loop metadata prefix.
|
|
|
|
static StringRef Prefix() { return "llvm.loop."; }
|
|
|
|
|
2014-05-03 04:40:04 +08:00
|
|
|
public:
|
2014-04-29 16:55:11 +08:00
|
|
|
enum ForceKind {
|
|
|
|
FK_Undefined = -1, ///< Not selected.
|
|
|
|
FK_Disabled = 0, ///< Forcing disabled.
|
|
|
|
FK_Enabled = 1, ///< Forcing enabled.
|
2014-05-03 04:40:04 +08:00
|
|
|
};
|
2013-05-29 04:00:34 +08:00
|
|
|
|
2014-09-11 01:58:16 +08:00
|
|
|
LoopVectorizeHints(const Loop *L, bool DisableInterleaving)
|
2015-02-20 03:14:52 +08:00
|
|
|
: Width("vectorize.width", VectorizerParams::VectorizationFactor,
|
|
|
|
HK_WIDTH),
|
2014-09-11 01:58:16 +08:00
|
|
|
Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
|
Small refactor on VectorizerHint for deduplication
Previously, the hint mechanism relied on clean up passes to remove redundant
metadata, which still showed up if running opt at low levels of optimization.
That also has shown that multiple nodes of the same type, but with different
values could still coexist, even if temporary, and cause confusion if the
next pass got the wrong value.
This patch makes sure that, if metadata already exists in a loop, the hint
mechanism will never append a new node, but always replace the existing one.
It also enhances the algorithm to cope with more metadata types in the future
by just adding a new type, not a lot of code.
Re-applying again due to MSVC 2013 being minimum requirement, and this patch
having C++11 that MSVC 2012 didn't support.
Fixes PR20655.
llvm-svn: 216870
2014-09-01 18:00:17 +08:00
|
|
|
Force("vectorize.enable", FK_Undefined, HK_FORCE),
|
|
|
|
TheLoop(L) {
|
|
|
|
// Populate values with existing loop metadata.
|
|
|
|
getHintsFromMetadata();
|
|
|
|
|
2014-09-11 01:58:16 +08:00
|
|
|
// force-vector-interleave overrides DisableInterleaving.
|
2015-02-20 03:14:52 +08:00
|
|
|
if (VectorizerParams::isInterleaveForced())
|
|
|
|
Interleave.Value = VectorizerParams::VectorizationInterleave;
|
2013-08-29 02:33:10 +08:00
|
|
|
|
2014-09-11 01:58:16 +08:00
|
|
|
DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs()
|
|
|
|
<< "LV: Interleaving disabled by the pass manager\n");
|
2013-05-29 04:00:34 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Mark the loop L as already vectorized by setting the width to 1.
|
Small refactor on VectorizerHint for deduplication
Previously, the hint mechanism relied on clean up passes to remove redundant
metadata, which still showed up if running opt at low levels of optimization.
That also has shown that multiple nodes of the same type, but with different
values could still coexist, even if temporary, and cause confusion if the
next pass got the wrong value.
This patch makes sure that, if metadata already exists in a loop, the hint
mechanism will never append a new node, but always replace the existing one.
It also enhances the algorithm to cope with more metadata types in the future
by just adding a new type, not a lot of code.
Re-applying again due to MSVC 2013 being minimum requirement, and this patch
having C++11 that MSVC 2012 didn't support.
Fixes PR20655.
llvm-svn: 216870
2014-09-01 18:00:17 +08:00
|
|
|
void setAlreadyVectorized() {
|
2014-09-11 01:58:16 +08:00
|
|
|
Width.Value = Interleave.Value = 1;
|
2014-10-23 03:13:54 +08:00
|
|
|
Hint Hints[] = {Width, Interleave};
|
|
|
|
writeHintsToMetadata(Hints);
|
2013-05-29 04:00:34 +08:00
|
|
|
}
|
|
|
|
|
Small refactor on VectorizerHint for deduplication
Previously, the hint mechanism relied on clean up passes to remove redundant
metadata, which still showed up if running opt at low levels of optimization.
That also has shown that multiple nodes of the same type, but with different
values could still coexist, even if temporary, and cause confusion if the
next pass got the wrong value.
This patch makes sure that, if metadata already exists in a loop, the hint
mechanism will never append a new node, but always replace the existing one.
It also enhances the algorithm to cope with more metadata types in the future
by just adding a new type, not a lot of code.
Re-applying again due to MSVC 2013 being minimum requirement, and this patch
having C++11 that MSVC 2012 didn't support.
Fixes PR20655.
llvm-svn: 216870
2014-09-01 18:00:17 +08:00
|
|
|
/// Dumps all the hint information.
|
2014-06-26 01:50:15 +08:00
|
|
|
std::string emitRemark() const {
|
2015-02-02 00:56:00 +08:00
|
|
|
VectorizationReport R;
|
Small refactor on VectorizerHint for deduplication
Previously, the hint mechanism relied on clean up passes to remove redundant
metadata, which still showed up if running opt at low levels of optimization.
That also has shown that multiple nodes of the same type, but with different
values could still coexist, even if temporary, and cause confusion if the
next pass got the wrong value.
This patch makes sure that, if metadata already exists in a loop, the hint
mechanism will never append a new node, but always replace the existing one.
It also enhances the algorithm to cope with more metadata types in the future
by just adding a new type, not a lot of code.
Re-applying again due to MSVC 2013 being minimum requirement, and this patch
having C++11 that MSVC 2012 didn't support.
Fixes PR20655.
llvm-svn: 216870
2014-09-01 18:00:17 +08:00
|
|
|
if (Force.Value == LoopVectorizeHints::FK_Disabled)
|
2014-08-01 05:22:22 +08:00
|
|
|
R << "vectorization is explicitly disabled";
|
|
|
|
else {
|
|
|
|
R << "use -Rpass-analysis=loop-vectorize for more info";
|
Small refactor on VectorizerHint for deduplication
Previously, the hint mechanism relied on clean up passes to remove redundant
metadata, which still showed up if running opt at low levels of optimization.
That also has shown that multiple nodes of the same type, but with different
values could still coexist, even if temporary, and cause confusion if the
next pass got the wrong value.
This patch makes sure that, if metadata already exists in a loop, the hint
mechanism will never append a new node, but always replace the existing one.
It also enhances the algorithm to cope with more metadata types in the future
by just adding a new type, not a lot of code.
Re-applying again due to MSVC 2013 being minimum requirement, and this patch
having C++11 that MSVC 2012 didn't support.
Fixes PR20655.
llvm-svn: 216870
2014-09-01 18:00:17 +08:00
|
|
|
if (Force.Value == LoopVectorizeHints::FK_Enabled) {
|
2014-08-01 05:22:22 +08:00
|
|
|
R << " (Force=true";
|
Small refactor on VectorizerHint for deduplication
Previously, the hint mechanism relied on clean up passes to remove redundant
metadata, which still showed up if running opt at low levels of optimization.
That also has shown that multiple nodes of the same type, but with different
values could still coexist, even if temporary, and cause confusion if the
next pass got the wrong value.
This patch makes sure that, if metadata already exists in a loop, the hint
mechanism will never append a new node, but always replace the existing one.
It also enhances the algorithm to cope with more metadata types in the future
by just adding a new type, not a lot of code.
Re-applying again due to MSVC 2013 being minimum requirement, and this patch
having C++11 that MSVC 2012 didn't support.
Fixes PR20655.
llvm-svn: 216870
2014-09-01 18:00:17 +08:00
|
|
|
if (Width.Value != 0)
|
|
|
|
R << ", Vector Width=" << Width.Value;
|
2014-09-11 01:58:16 +08:00
|
|
|
if (Interleave.Value != 0)
|
|
|
|
R << ", Interleave Count=" << Interleave.Value;
|
2014-08-01 05:22:22 +08:00
|
|
|
R << ")";
|
|
|
|
}
|
2014-06-26 01:50:15 +08:00
|
|
|
}
|
2014-08-01 05:22:22 +08:00
|
|
|
|
2014-06-26 01:50:15 +08:00
|
|
|
return R.str();
|
|
|
|
}
|
|
|
|
|
Small refactor on VectorizerHint for deduplication
Previously, the hint mechanism relied on clean up passes to remove redundant
metadata, which still showed up if running opt at low levels of optimization.
That also has shown that multiple nodes of the same type, but with different
values could still coexist, even if temporary, and cause confusion if the
next pass got the wrong value.
This patch makes sure that, if metadata already exists in a loop, the hint
mechanism will never append a new node, but always replace the existing one.
It also enhances the algorithm to cope with more metadata types in the future
by just adding a new type, not a lot of code.
Re-applying again due to MSVC 2013 being minimum requirement, and this patch
having C++11 that MSVC 2012 didn't support.
Fixes PR20655.
llvm-svn: 216870
2014-09-01 18:00:17 +08:00
|
|
|
/// \returns the current value of the width hint.
unsigned getWidth() const {
  return Width.Value;
}
|
2014-09-11 01:58:16 +08:00
|
|
|
/// \returns the current value of the interleave-count hint.
unsigned getInterleave() const {
  return Interleave.Value;
}
|
Small refactor on VectorizerHint for deduplication
Previously, the hint mechanism relied on clean up passes to remove redundant
metadata, which still showed up if running opt at low levels of optimization.
That also has shown that multiple nodes of the same type, but with different
values could still coexist, even if temporary, and cause confusion if the
next pass got the wrong value.
This patch makes sure that, if metadata already exists in a loop, the hint
mechanism will never append a new node, but always replace the existing one.
It also enhances the algorithm to cope with more metadata types in the future
by just adding a new type, not a lot of code.
Re-applying again due to MSVC 2013 being minimum requirement, and this patch
having C++11 that MSVC 2012 didn't support.
Fixes PR20655.
llvm-svn: 216870
2014-09-01 18:00:17 +08:00
|
|
|
enum ForceKind getForce() const { return (ForceKind)Force.Value; }
|
2013-05-29 04:00:34 +08:00
|
|
|
|
2014-05-03 04:40:04 +08:00
|
|
|
private:
|
Small refactor on VectorizerHint for deduplication
Previously, the hint mechanism relied on clean up passes to remove redundant
metadata, which still showed up if running opt at low levels of optimization.
That also has shown that multiple nodes of the same type, but with different
values could still coexist, even if temporary, and cause confusion if the
next pass got the wrong value.
This patch makes sure that, if metadata already exists in a loop, the hint
mechanism will never append a new node, but always replace the existing one.
It also enhances the algorithm to cope with more metadata types in the future
by just adding a new type, not a lot of code.
Re-applying again due to MSVC 2013 being minimum requirement, and this patch
having C++11 that MSVC 2012 didn't support.
Fixes PR20655.
llvm-svn: 216870
2014-09-01 18:00:17 +08:00
|
|
|
/// Find hints specified in the loop metadata and update local values.
|
|
|
|
void getHintsFromMetadata() {
|
|
|
|
MDNode *LoopID = TheLoop->getLoopID();
|
2013-05-29 04:00:34 +08:00
|
|
|
if (!LoopID)
|
|
|
|
return;
|
2013-05-29 11:13:41 +08:00
|
|
|
|
2013-05-29 04:00:34 +08:00
|
|
|
// First operand should refer to the loop id itself.
|
|
|
|
assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
|
|
|
|
assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
|
2013-05-29 11:13:41 +08:00
|
|
|
|
2013-05-29 04:00:34 +08:00
|
|
|
for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
|
2014-04-25 13:29:35 +08:00
|
|
|
const MDString *S = nullptr;
|
IR: Split Metadata from Value
Split `Metadata` away from the `Value` class hierarchy, as part of
PR21532. Assembly and bitcode changes are in the wings, but this is the
bulk of the change for the IR C++ API.
I have a follow-up patch prepared for `clang`. If this breaks other
sub-projects, I apologize in advance :(. Help me compile it on Darwin
I'll try to fix it. FWIW, the errors should be easy to fix, so it may
be simpler to just fix it yourself.
This breaks the build for all metadata-related code that's out-of-tree.
Rest assured the transition is mechanical and the compiler should catch
almost all of the problems.
Here's a quick guide for updating your code:
- `Metadata` is the root of a class hierarchy with three main classes:
`MDNode`, `MDString`, and `ValueAsMetadata`. It is distinct from
the `Value` class hierarchy. It is typeless -- i.e., instances do
*not* have a `Type`.
- `MDNode`'s operands are all `Metadata *` (instead of `Value *`).
- `TrackingVH<MDNode>` and `WeakVH` referring to metadata can be
replaced with `TrackingMDNodeRef` and `TrackingMDRef`, respectively.
If you're referring solely to resolved `MDNode`s -- post graph
construction -- just use `MDNode*`.
- `MDNode` (and the rest of `Metadata`) have only limited support for
`replaceAllUsesWith()`.
As long as an `MDNode` is pointing at a forward declaration -- the
result of `MDNode::getTemporary()` -- it maintains a side map of its
uses and can RAUW itself. Once the forward declarations are fully
resolved RAUW support is dropped on the ground. This means that
uniquing collisions on changing operands cause nodes to become
"distinct". (This already happened fairly commonly, whenever an
operand went to null.)
If you're constructing complex (non self-reference) `MDNode` cycles,
you need to call `MDNode::resolveCycles()` on each node (or on a
top-level node that somehow references all of the nodes). Also,
don't do that. Metadata cycles (and the RAUW machinery needed to
construct them) are expensive.
- An `MDNode` can only refer to a `Constant` through a bridge called
`ConstantAsMetadata` (one of the subclasses of `ValueAsMetadata`).
As a side effect, accessing an operand of an `MDNode` that is known
to be, e.g., `ConstantInt`, takes three steps: first, cast from
`Metadata` to `ConstantAsMetadata`; second, extract the `Constant`;
third, cast down to `ConstantInt`.
The eventual goal is to introduce `MDInt`/`MDFloat`/etc. and have
metadata schema owners transition away from using `Constant`s when
the type isn't important (and they don't care about referring to
`GlobalValue`s).
In the meantime, I've added transitional API to the `mdconst`
namespace that matches semantics with the old code, in order to
avoid adding the error-prone three-step equivalent to every call
site. If your old code was:
MDNode *N = foo();
bar(isa <ConstantInt>(N->getOperand(0)));
baz(cast <ConstantInt>(N->getOperand(1)));
bak(cast_or_null <ConstantInt>(N->getOperand(2)));
bat(dyn_cast <ConstantInt>(N->getOperand(3)));
bay(dyn_cast_or_null<ConstantInt>(N->getOperand(4)));
you can trivially match its semantics with:
MDNode *N = foo();
bar(mdconst::hasa <ConstantInt>(N->getOperand(0)));
baz(mdconst::extract <ConstantInt>(N->getOperand(1)));
bak(mdconst::extract_or_null <ConstantInt>(N->getOperand(2)));
bat(mdconst::dyn_extract <ConstantInt>(N->getOperand(3)));
bay(mdconst::dyn_extract_or_null<ConstantInt>(N->getOperand(4)));
and when you transition your metadata schema to `MDInt`:
MDNode *N = foo();
bar(isa <MDInt>(N->getOperand(0)));
baz(cast <MDInt>(N->getOperand(1)));
bak(cast_or_null <MDInt>(N->getOperand(2)));
bat(dyn_cast <MDInt>(N->getOperand(3)));
bay(dyn_cast_or_null<MDInt>(N->getOperand(4)));
- A `CallInst` -- specifically, intrinsic instructions -- can refer to
metadata through a bridge called `MetadataAsValue`. This is a
subclass of `Value` where `getType()->isMetadataTy()`.
`MetadataAsValue` is the *only* class that can legally refer to a
`LocalAsMetadata`, which is a bridged form of non-`Constant` values
like `Argument` and `Instruction`. It can also refer to any other
`Metadata` subclass.
(I'll break all your testcases in a follow-up commit, when I propagate
this change to assembly.)
llvm-svn: 223802
2014-12-10 02:38:53 +08:00
|
|
|
SmallVector<Metadata *, 4> Args;
|
2013-05-29 04:00:34 +08:00
|
|
|
|
|
|
|
// The expected hint is either a MDString or a MDNode with the first
|
|
|
|
// operand a MDString.
|
|
|
|
if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
|
|
|
|
if (!MD || MD->getNumOperands() == 0)
|
|
|
|
continue;
|
|
|
|
S = dyn_cast<MDString>(MD->getOperand(0));
|
|
|
|
for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
|
|
|
|
Args.push_back(MD->getOperand(i));
|
|
|
|
} else {
|
|
|
|
S = dyn_cast<MDString>(LoopID->getOperand(i));
|
|
|
|
assert(Args.size() == 0 && "too many arguments for MDString");
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!S)
|
|
|
|
continue;
|
|
|
|
|
2014-07-22 07:11:03 +08:00
|
|
|
// Check if the hint starts with the loop metadata prefix.
|
Small refactor on VectorizerHint for deduplication
Previously, the hint mechanism relied on clean up passes to remove redundant
metadata, which still showed up if running opt at low levels of optimization.
That also has shown that multiple nodes of the same type, but with different
values could still coexist, even if temporary, and cause confusion if the
next pass got the wrong value.
This patch makes sure that, if metadata already exists in a loop, the hint
mechanism will never append a new node, but always replace the existing one.
It also enhances the algorithm to cope with more metadata types in the future
by just adding a new type, not a lot of code.
Re-applying again due to MSVC 2013 being minimum requirement, and this patch
having C++11 that MSVC 2012 didn't support.
Fixes PR20655.
llvm-svn: 216870
2014-09-01 18:00:17 +08:00
|
|
|
StringRef Name = S->getString();
|
2013-05-29 04:00:34 +08:00
|
|
|
if (Args.size() == 1)
|
Small refactor on VectorizerHint for deduplication
Previously, the hint mechanism relied on clean up passes to remove redundant
metadata, which still showed up if running opt at low levels of optimization.
That also has shown that multiple nodes of the same type, but with different
values could still coexist, even if temporary, and cause confusion if the
next pass got the wrong value.
This patch makes sure that, if metadata already exists in a loop, the hint
mechanism will never append a new node, but always replace the existing one.
It also enhances the algorithm to cope with more metadata types in the future
by just adding a new type, not a lot of code.
Re-applying again due to MSVC 2013 being minimum requirement, and this patch
having C++11 that MSVC 2012 didn't support.
Fixes PR20655.
llvm-svn: 216870
2014-09-01 18:00:17 +08:00
|
|
|
setHint(Name, Args[0]);
|
2013-05-29 04:00:34 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Small refactor on VectorizerHint for deduplication
Previously, the hint mechanism relied on clean up passes to remove redundant
metadata, which still showed up if running opt at low levels of optimization.
That also has shown that multiple nodes of the same type, but with different
values could still coexist, even if temporary, and cause confusion if the
next pass got the wrong value.
This patch makes sure that, if metadata already exists in a loop, the hint
mechanism will never append a new node, but always replace the existing one.
It also enhances the algorithm to cope with more metadata types in the future
by just adding a new type, not a lot of code.
Re-applying again due to MSVC 2013 being minimum requirement, and this patch
having C++11 that MSVC 2012 didn't support.
Fixes PR20655.
llvm-svn: 216870
2014-09-01 18:00:17 +08:00
|
|
|
/// Checks string hint with one operand and set value if valid.
|
IR: Split Metadata from Value
Split `Metadata` away from the `Value` class hierarchy, as part of
PR21532. Assembly and bitcode changes are in the wings, but this is the
bulk of the change for the IR C++ API.
I have a follow-up patch prepared for `clang`. If this breaks other
sub-projects, I apologize in advance :(. Help me compile it on Darwin
I'll try to fix it. FWIW, the errors should be easy to fix, so it may
be simpler to just fix it yourself.
This breaks the build for all metadata-related code that's out-of-tree.
Rest assured the transition is mechanical and the compiler should catch
almost all of the problems.
Here's a quick guide for updating your code:
- `Metadata` is the root of a class hierarchy with three main classes:
`MDNode`, `MDString`, and `ValueAsMetadata`. It is distinct from
the `Value` class hierarchy. It is typeless -- i.e., instances do
*not* have a `Type`.
- `MDNode`'s operands are all `Metadata *` (instead of `Value *`).
- `TrackingVH<MDNode>` and `WeakVH` referring to metadata can be
replaced with `TrackingMDNodeRef` and `TrackingMDRef`, respectively.
If you're referring solely to resolved `MDNode`s -- post graph
construction -- just use `MDNode*`.
- `MDNode` (and the rest of `Metadata`) have only limited support for
`replaceAllUsesWith()`.
As long as an `MDNode` is pointing at a forward declaration -- the
result of `MDNode::getTemporary()` -- it maintains a side map of its
uses and can RAUW itself. Once the forward declarations are fully
resolved RAUW support is dropped on the ground. This means that
uniquing collisions on changing operands cause nodes to become
"distinct". (This already happened fairly commonly, whenever an
operand went to null.)
If you're constructing complex (non self-reference) `MDNode` cycles,
you need to call `MDNode::resolveCycles()` on each node (or on a
top-level node that somehow references all of the nodes). Also,
don't do that. Metadata cycles (and the RAUW machinery needed to
construct them) are expensive.
- An `MDNode` can only refer to a `Constant` through a bridge called
`ConstantAsMetadata` (one of the subclasses of `ValueAsMetadata`).
As a side effect, accessing an operand of an `MDNode` that is known
to be, e.g., `ConstantInt`, takes three steps: first, cast from
`Metadata` to `ConstantAsMetadata`; second, extract the `Constant`;
third, cast down to `ConstantInt`.
The eventual goal is to introduce `MDInt`/`MDFloat`/etc. and have
metadata schema owners transition away from using `Constant`s when
the type isn't important (and they don't care about referring to
`GlobalValue`s).
In the meantime, I've added transitional API to the `mdconst`
namespace that matches semantics with the old code, in order to
avoid adding the error-prone three-step equivalent to every call
site. If your old code was:
MDNode *N = foo();
bar(isa <ConstantInt>(N->getOperand(0)));
baz(cast <ConstantInt>(N->getOperand(1)));
bak(cast_or_null <ConstantInt>(N->getOperand(2)));
bat(dyn_cast <ConstantInt>(N->getOperand(3)));
bay(dyn_cast_or_null<ConstantInt>(N->getOperand(4)));
you can trivially match its semantics with:
MDNode *N = foo();
bar(mdconst::hasa <ConstantInt>(N->getOperand(0)));
baz(mdconst::extract <ConstantInt>(N->getOperand(1)));
bak(mdconst::extract_or_null <ConstantInt>(N->getOperand(2)));
bat(mdconst::dyn_extract <ConstantInt>(N->getOperand(3)));
bay(mdconst::dyn_extract_or_null<ConstantInt>(N->getOperand(4)));
and when you transition your metadata schema to `MDInt`:
MDNode *N = foo();
bar(isa <MDInt>(N->getOperand(0)));
baz(cast <MDInt>(N->getOperand(1)));
bak(cast_or_null <MDInt>(N->getOperand(2)));
bat(dyn_cast <MDInt>(N->getOperand(3)));
bay(dyn_cast_or_null<MDInt>(N->getOperand(4)));
- A `CallInst` -- specifically, intrinsic instructions -- can refer to
metadata through a bridge called `MetadataAsValue`. This is a
subclass of `Value` where `getType()->isMetadataTy()`.
`MetadataAsValue` is the *only* class that can legally refer to a
`LocalAsMetadata`, which is a bridged form of non-`Constant` values
like `Argument` and `Instruction`. It can also refer to any other
`Metadata` subclass.
(I'll break all your testcases in a follow-up commit, when I propagate
this change to assembly.)
llvm-svn: 223802
2014-12-10 02:38:53 +08:00
|
|
|
void setHint(StringRef Name, Metadata *Arg) {
|
Small refactor on VectorizerHint for deduplication
Previously, the hint mechanism relied on clean up passes to remove redundant
metadata, which still showed up if running opt at low levels of optimization.
That also has shown that multiple nodes of the same type, but with different
values could still coexist, even if temporary, and cause confusion if the
next pass got the wrong value.
This patch makes sure that, if metadata already exists in a loop, the hint
mechanism will never append a new node, but always replace the existing one.
It also enhances the algorithm to cope with more metadata types in the future
by just adding a new type, not a lot of code.
Re-applying again due to MSVC 2013 being minimum requirement, and this patch
having C++11 that MSVC 2012 didn't support.
Fixes PR20655.
llvm-svn: 216870
2014-09-01 18:00:17 +08:00
|
|
|
if (!Name.startswith(Prefix()))
|
|
|
|
return;
|
|
|
|
Name = Name.substr(Prefix().size(), StringRef::npos);
|
|
|
|
|
IR: Split Metadata from Value
Split `Metadata` away from the `Value` class hierarchy, as part of
PR21532. Assembly and bitcode changes are in the wings, but this is the
bulk of the change for the IR C++ API.
I have a follow-up patch prepared for `clang`. If this breaks other
sub-projects, I apologize in advance :(. Help me compile it on Darwin
I'll try to fix it. FWIW, the errors should be easy to fix, so it may
be simpler to just fix it yourself.
This breaks the build for all metadata-related code that's out-of-tree.
Rest assured the transition is mechanical and the compiler should catch
almost all of the problems.
Here's a quick guide for updating your code:
- `Metadata` is the root of a class hierarchy with three main classes:
`MDNode`, `MDString`, and `ValueAsMetadata`. It is distinct from
the `Value` class hierarchy. It is typeless -- i.e., instances do
*not* have a `Type`.
- `MDNode`'s operands are all `Metadata *` (instead of `Value *`).
- `TrackingVH<MDNode>` and `WeakVH` referring to metadata can be
replaced with `TrackingMDNodeRef` and `TrackingMDRef`, respectively.
If you're referring solely to resolved `MDNode`s -- post graph
construction -- just use `MDNode*`.
- `MDNode` (and the rest of `Metadata`) have only limited support for
`replaceAllUsesWith()`.
As long as an `MDNode` is pointing at a forward declaration -- the
result of `MDNode::getTemporary()` -- it maintains a side map of its
uses and can RAUW itself. Once the forward declarations are fully
resolved RAUW support is dropped on the ground. This means that
uniquing collisions on changing operands cause nodes to become
"distinct". (This already happened fairly commonly, whenever an
operand went to null.)
If you're constructing complex (non self-reference) `MDNode` cycles,
you need to call `MDNode::resolveCycles()` on each node (or on a
top-level node that somehow references all of the nodes). Also,
don't do that. Metadata cycles (and the RAUW machinery needed to
construct them) are expensive.
- An `MDNode` can only refer to a `Constant` through a bridge called
`ConstantAsMetadata` (one of the subclasses of `ValueAsMetadata`).
As a side effect, accessing an operand of an `MDNode` that is known
to be, e.g., `ConstantInt`, takes three steps: first, cast from
`Metadata` to `ConstantAsMetadata`; second, extract the `Constant`;
third, cast down to `ConstantInt`.
The eventual goal is to introduce `MDInt`/`MDFloat`/etc. and have
metadata schema owners transition away from using `Constant`s when
the type isn't important (and they don't care about referring to
`GlobalValue`s).
In the meantime, I've added transitional API to the `mdconst`
namespace that matches semantics with the old code, in order to
avoid adding the error-prone three-step equivalent to every call
site. If your old code was:
MDNode *N = foo();
bar(isa <ConstantInt>(N->getOperand(0)));
baz(cast <ConstantInt>(N->getOperand(1)));
bak(cast_or_null <ConstantInt>(N->getOperand(2)));
bat(dyn_cast <ConstantInt>(N->getOperand(3)));
bay(dyn_cast_or_null<ConstantInt>(N->getOperand(4)));
you can trivially match its semantics with:
MDNode *N = foo();
bar(mdconst::hasa <ConstantInt>(N->getOperand(0)));
baz(mdconst::extract <ConstantInt>(N->getOperand(1)));
bak(mdconst::extract_or_null <ConstantInt>(N->getOperand(2)));
bat(mdconst::dyn_extract <ConstantInt>(N->getOperand(3)));
bay(mdconst::dyn_extract_or_null<ConstantInt>(N->getOperand(4)));
and when you transition your metadata schema to `MDInt`:
MDNode *N = foo();
bar(isa <MDInt>(N->getOperand(0)));
baz(cast <MDInt>(N->getOperand(1)));
bak(cast_or_null <MDInt>(N->getOperand(2)));
bat(dyn_cast <MDInt>(N->getOperand(3)));
bay(dyn_cast_or_null<MDInt>(N->getOperand(4)));
- A `CallInst` -- specifically, intrinsic instructions -- can refer to
metadata through a bridge called `MetadataAsValue`. This is a
subclass of `Value` where `getType()->isMetadataTy()`.
`MetadataAsValue` is the *only* class that can legally refer to a
`LocalAsMetadata`, which is a bridged form of non-`Constant` values
like `Argument` and `Instruction`. It can also refer to any other
`Metadata` subclass.
(I'll break all your testcases in a follow-up commit, when I propagate
this change to assembly.)
llvm-svn: 223802
2014-12-10 02:38:53 +08:00
|
|
|
const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
|
2013-05-29 04:00:34 +08:00
|
|
|
if (!C) return;
|
|
|
|
unsigned Val = C->getZExtValue();
|
|
|
|
|
2014-10-23 03:13:54 +08:00
|
|
|
Hint *Hints[] = {&Width, &Interleave, &Force};
|
Small refactor on VectorizerHint for deduplication
Previously, the hint mechanism relied on clean up passes to remove redundant
metadata, which still showed up if running opt at low levels of optimization.
That also has shown that multiple nodes of the same type, but with different
values could still coexist, even if temporary, and cause confusion if the
next pass got the wrong value.
This patch makes sure that, if metadata already exists in a loop, the hint
mechanism will never append a new node, but always replace the existing one.
It also enhances the algorithm to cope with more metadata types in the future
by just adding a new type, not a lot of code.
Re-applying again due to MSVC 2013 being minimum requirement, and this patch
having C++11 that MSVC 2012 didn't support.
Fixes PR20655.
llvm-svn: 216870
2014-09-01 18:00:17 +08:00
|
|
|
for (auto H : Hints) {
|
|
|
|
if (Name == H->Name) {
|
|
|
|
if (H->validate(Val))
|
|
|
|
H->Value = Val;
|
|
|
|
else
|
|
|
|
DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
|
|
|
|
break;
|
|
|
|
}
|
2013-09-11 07:45:25 +08:00
|
|
|
}
|
2013-05-29 04:00:34 +08:00
|
|
|
}
|
2014-05-03 04:40:04 +08:00
|
|
|
|
Small refactor on VectorizerHint for deduplication
Previously, the hint mechanism relied on clean up passes to remove redundant
metadata, which still showed up if running opt at low levels of optimization.
That also has shown that multiple nodes of the same type, but with different
values could still coexist, even if temporary, and cause confusion if the
next pass got the wrong value.
This patch makes sure that, if metadata already exists in a loop, the hint
mechanism will never append a new node, but always replace the existing one.
It also enhances the algorithm to cope with more metadata types in the future
by just adding a new type, not a lot of code.
Re-applying again due to MSVC 2013 being minimum requirement, and this patch
having C++11 that MSVC 2012 didn't support.
Fixes PR20655.
llvm-svn: 216870
2014-09-01 18:00:17 +08:00
|
|
|
/// Create a new hint from name / value pair.
|
|
|
|
MDNode *createHintMetadata(StringRef Name, unsigned V) const {
|
|
|
|
LLVMContext &Context = TheLoop->getHeader()->getContext();
|
IR: Split Metadata from Value
Split `Metadata` away from the `Value` class hierarchy, as part of
PR21532. Assembly and bitcode changes are in the wings, but this is the
bulk of the change for the IR C++ API.
I have a follow-up patch prepared for `clang`. If this breaks other
sub-projects, I apologize in advance :(. Help me compile it on Darwin
I'll try to fix it. FWIW, the errors should be easy to fix, so it may
be simpler to just fix it yourself.
This breaks the build for all metadata-related code that's out-of-tree.
Rest assured the transition is mechanical and the compiler should catch
almost all of the problems.
Here's a quick guide for updating your code:
- `Metadata` is the root of a class hierarchy with three main classes:
`MDNode`, `MDString`, and `ValueAsMetadata`. It is distinct from
the `Value` class hierarchy. It is typeless -- i.e., instances do
*not* have a `Type`.
- `MDNode`'s operands are all `Metadata *` (instead of `Value *`).
- `TrackingVH<MDNode>` and `WeakVH` referring to metadata can be
replaced with `TrackingMDNodeRef` and `TrackingMDRef`, respectively.
If you're referring solely to resolved `MDNode`s -- post graph
construction -- just use `MDNode*`.
- `MDNode` (and the rest of `Metadata`) have only limited support for
`replaceAllUsesWith()`.
As long as an `MDNode` is pointing at a forward declaration -- the
result of `MDNode::getTemporary()` -- it maintains a side map of its
uses and can RAUW itself. Once the forward declarations are fully
resolved RAUW support is dropped on the ground. This means that
uniquing collisions on changing operands cause nodes to become
"distinct". (This already happened fairly commonly, whenever an
operand went to null.)
If you're constructing complex (non self-reference) `MDNode` cycles,
you need to call `MDNode::resolveCycles()` on each node (or on a
top-level node that somehow references all of the nodes). Also,
don't do that. Metadata cycles (and the RAUW machinery needed to
construct them) are expensive.
- An `MDNode` can only refer to a `Constant` through a bridge called
`ConstantAsMetadata` (one of the subclasses of `ValueAsMetadata`).
As a side effect, accessing an operand of an `MDNode` that is known
to be, e.g., `ConstantInt`, takes three steps: first, cast from
`Metadata` to `ConstantAsMetadata`; second, extract the `Constant`;
third, cast down to `ConstantInt`.
The eventual goal is to introduce `MDInt`/`MDFloat`/etc. and have
metadata schema owners transition away from using `Constant`s when
the type isn't important (and they don't care about referring to
`GlobalValue`s).
In the meantime, I've added transitional API to the `mdconst`
namespace that matches semantics with the old code, in order to
avoid adding the error-prone three-step equivalent to every call
site. If your old code was:
MDNode *N = foo();
bar(isa <ConstantInt>(N->getOperand(0)));
baz(cast <ConstantInt>(N->getOperand(1)));
bak(cast_or_null <ConstantInt>(N->getOperand(2)));
bat(dyn_cast <ConstantInt>(N->getOperand(3)));
bay(dyn_cast_or_null<ConstantInt>(N->getOperand(4)));
you can trivially match its semantics with:
MDNode *N = foo();
bar(mdconst::hasa <ConstantInt>(N->getOperand(0)));
baz(mdconst::extract <ConstantInt>(N->getOperand(1)));
bak(mdconst::extract_or_null <ConstantInt>(N->getOperand(2)));
bat(mdconst::dyn_extract <ConstantInt>(N->getOperand(3)));
bay(mdconst::dyn_extract_or_null<ConstantInt>(N->getOperand(4)));
and when you transition your metadata schema to `MDInt`:
MDNode *N = foo();
bar(isa <MDInt>(N->getOperand(0)));
baz(cast <MDInt>(N->getOperand(1)));
bak(cast_or_null <MDInt>(N->getOperand(2)));
bat(dyn_cast <MDInt>(N->getOperand(3)));
bay(dyn_cast_or_null<MDInt>(N->getOperand(4)));
- A `CallInst` -- specifically, intrinsic instructions -- can refer to
metadata through a bridge called `MetadataAsValue`. This is a
subclass of `Value` where `getType()->isMetadataTy()`.
`MetadataAsValue` is the *only* class that can legally refer to a
`LocalAsMetadata`, which is a bridged form of non-`Constant` values
like `Argument` and `Instruction`. It can also refer to any other
`Metadata` subclass.
(I'll break all your testcases in a follow-up commit, when I propagate
this change to assembly.)
llvm-svn: 223802
2014-12-10 02:38:53 +08:00
|
|
|
Metadata *MDs[] = {MDString::get(Context, Name),
|
|
|
|
ConstantAsMetadata::get(
|
|
|
|
ConstantInt::get(Type::getInt32Ty(Context), V))};
|
|
|
|
return MDNode::get(Context, MDs);
|
Small refactor on VectorizerHint for deduplication
Previously, the hint mechanism relied on clean up passes to remove redundant
metadata, which still showed up if running opt at low levels of optimization.
That also has shown that multiple nodes of the same type, but with different
values could still coexist, even if temporary, and cause confusion if the
next pass got the wrong value.
This patch makes sure that, if metadata already exists in a loop, the hint
mechanism will never append a new node, but always replace the existing one.
It also enhances the algorithm to cope with more metadata types in the future
by just adding a new type, not a lot of code.
Re-applying again due to MSVC 2013 being minimum requirement, and this patch
having C++11 that MSVC 2012 didn't support.
Fixes PR20655.
llvm-svn: 216870
2014-09-01 18:00:17 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Matches metadata with hint name.
|
2014-10-23 03:13:54 +08:00
|
|
|
bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) {
|
Small refactor on VectorizerHint for deduplication
Previously, the hint mechanism relied on clean up passes to remove redundant
metadata, which still showed up if running opt at low levels of optimization.
That also has shown that multiple nodes of the same type, but with different
values could still coexist, even if temporary, and cause confusion if the
next pass got the wrong value.
This patch makes sure that, if metadata already exists in a loop, the hint
mechanism will never append a new node, but always replace the existing one.
It also enhances the algorithm to cope with more metadata types in the future
by just adding a new type, not a lot of code.
Re-applying again due to MSVC 2013 being minimum requirement, and this patch
having C++11 that MSVC 2012 didn't support.
Fixes PR20655.
llvm-svn: 216870
2014-09-01 18:00:17 +08:00
|
|
|
MDString* Name = dyn_cast<MDString>(Node->getOperand(0));
|
|
|
|
if (!Name)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
for (auto H : HintTypes)
|
2014-11-14 07:59:16 +08:00
|
|
|
if (Name->getString().endswith(H.Name))
|
Small refactor on VectorizerHint for deduplication
Previously, the hint mechanism relied on clean up passes to remove redundant
metadata, which still showed up if running opt at low levels of optimization.
That also has shown that multiple nodes of the same type, but with different
values could still coexist, even if temporary, and cause confusion if the
next pass got the wrong value.
This patch makes sure that, if metadata already exists in a loop, the hint
mechanism will never append a new node, but always replace the existing one.
It also enhances the algorithm to cope with more metadata types in the future
by just adding a new type, not a lot of code.
Re-applying again due to MSVC 2013 being minimum requirement, and this patch
having C++11 that MSVC 2012 didn't support.
Fixes PR20655.
llvm-svn: 216870
2014-09-01 18:00:17 +08:00
|
|
|
return true;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Sets current hints into loop metadata, keeping other values intact.
|
2014-10-23 03:13:54 +08:00
|
|
|
void writeHintsToMetadata(ArrayRef<Hint> HintTypes) {
|
Small refactor on VectorizerHint for deduplication
Previously, the hint mechanism relied on clean up passes to remove redundant
metadata, which still showed up if running opt at low levels of optimization.
That also has shown that multiple nodes of the same type, but with different
values could still coexist, even if temporary, and cause confusion if the
next pass got the wrong value.
This patch makes sure that, if metadata already exists in a loop, the hint
mechanism will never append a new node, but always replace the existing one.
It also enhances the algorithm to cope with more metadata types in the future
by just adding a new type, not a lot of code.
Re-applying again due to MSVC 2013 being minimum requirement, and this patch
having C++11 that MSVC 2012 didn't support.
Fixes PR20655.
llvm-svn: 216870
2014-09-01 18:00:17 +08:00
|
|
|
if (HintTypes.size() == 0)
|
|
|
|
return;
|
|
|
|
|
|
|
|
// Reserve the first element to LoopID (see below).
|
IR: Split Metadata from Value
Split `Metadata` away from the `Value` class hierarchy, as part of
PR21532. Assembly and bitcode changes are in the wings, but this is the
bulk of the change for the IR C++ API.
I have a follow-up patch prepared for `clang`. If this breaks other
sub-projects, I apologize in advance :(. Help me compile it on Darwin
I'll try to fix it. FWIW, the errors should be easy to fix, so it may
be simpler to just fix it yourself.
This breaks the build for all metadata-related code that's out-of-tree.
Rest assured the transition is mechanical and the compiler should catch
almost all of the problems.
Here's a quick guide for updating your code:
- `Metadata` is the root of a class hierarchy with three main classes:
`MDNode`, `MDString`, and `ValueAsMetadata`. It is distinct from
the `Value` class hierarchy. It is typeless -- i.e., instances do
*not* have a `Type`.
- `MDNode`'s operands are all `Metadata *` (instead of `Value *`).
- `TrackingVH<MDNode>` and `WeakVH` referring to metadata can be
replaced with `TrackingMDNodeRef` and `TrackingMDRef`, respectively.
If you're referring solely to resolved `MDNode`s -- post graph
construction -- just use `MDNode*`.
- `MDNode` (and the rest of `Metadata`) have only limited support for
`replaceAllUsesWith()`.
As long as an `MDNode` is pointing at a forward declaration -- the
result of `MDNode::getTemporary()` -- it maintains a side map of its
uses and can RAUW itself. Once the forward declarations are fully
resolved RAUW support is dropped on the ground. This means that
uniquing collisions on changing operands cause nodes to become
"distinct". (This already happened fairly commonly, whenever an
operand went to null.)
If you're constructing complex (non self-reference) `MDNode` cycles,
you need to call `MDNode::resolveCycles()` on each node (or on a
top-level node that somehow references all of the nodes). Also,
don't do that. Metadata cycles (and the RAUW machinery needed to
construct them) are expensive.
- An `MDNode` can only refer to a `Constant` through a bridge called
`ConstantAsMetadata` (one of the subclasses of `ValueAsMetadata`).
As a side effect, accessing an operand of an `MDNode` that is known
to be, e.g., `ConstantInt`, takes three steps: first, cast from
`Metadata` to `ConstantAsMetadata`; second, extract the `Constant`;
third, cast down to `ConstantInt`.
The eventual goal is to introduce `MDInt`/`MDFloat`/etc. and have
metadata schema owners transition away from using `Constant`s when
the type isn't important (and they don't care about referring to
`GlobalValue`s).
In the meantime, I've added transitional API to the `mdconst`
namespace that matches semantics with the old code, in order to
avoid adding the error-prone three-step equivalent to every call
site. If your old code was:
MDNode *N = foo();
bar(isa <ConstantInt>(N->getOperand(0)));
baz(cast <ConstantInt>(N->getOperand(1)));
bak(cast_or_null <ConstantInt>(N->getOperand(2)));
bat(dyn_cast <ConstantInt>(N->getOperand(3)));
bay(dyn_cast_or_null<ConstantInt>(N->getOperand(4)));
you can trivially match its semantics with:
MDNode *N = foo();
bar(mdconst::hasa <ConstantInt>(N->getOperand(0)));
baz(mdconst::extract <ConstantInt>(N->getOperand(1)));
bak(mdconst::extract_or_null <ConstantInt>(N->getOperand(2)));
bat(mdconst::dyn_extract <ConstantInt>(N->getOperand(3)));
bay(mdconst::dyn_extract_or_null<ConstantInt>(N->getOperand(4)));
and when you transition your metadata schema to `MDInt`:
MDNode *N = foo();
bar(isa <MDInt>(N->getOperand(0)));
baz(cast <MDInt>(N->getOperand(1)));
bak(cast_or_null <MDInt>(N->getOperand(2)));
bat(dyn_cast <MDInt>(N->getOperand(3)));
bay(dyn_cast_or_null<MDInt>(N->getOperand(4)));
- A `CallInst` -- specifically, intrinsic instructions -- can refer to
metadata through a bridge called `MetadataAsValue`. This is a
subclass of `Value` where `getType()->isMetadataTy()`.
`MetadataAsValue` is the *only* class that can legally refer to a
`LocalAsMetadata`, which is a bridged form of non-`Constant` values
like `Argument` and `Instruction`. It can also refer to any other
`Metadata` subclass.
(I'll break all your testcases in a follow-up commit, when I propagate
this change to assembly.)
llvm-svn: 223802
2014-12-10 02:38:53 +08:00
|
|
|
SmallVector<Metadata *, 4> MDs(1);
|
Small refactor on VectorizerHint for deduplication
Previously, the hint mechanism relied on clean up passes to remove redundant
metadata, which still showed up if running opt at low levels of optimization.
That also has shown that multiple nodes of the same type, but with different
values could still coexist, even if temporary, and cause confusion if the
next pass got the wrong value.
This patch makes sure that, if metadata already exists in a loop, the hint
mechanism will never append a new node, but always replace the existing one.
It also enhances the algorithm to cope with more metadata types in the future
by just adding a new type, not a lot of code.
Re-applying again due to MSVC 2013 being minimum requirement, and this patch
having C++11 that MSVC 2012 didn't support.
Fixes PR20655.
llvm-svn: 216870
2014-09-01 18:00:17 +08:00
|
|
|
// If the loop already has metadata, then ignore the existing operands.
|
|
|
|
MDNode *LoopID = TheLoop->getLoopID();
|
|
|
|
if (LoopID) {
|
|
|
|
for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
|
|
|
|
MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
|
|
|
|
// If node in update list, ignore old value.
|
|
|
|
if (!matchesHintMetadataName(Node, HintTypes))
|
IR: Split Metadata from Value
Split `Metadata` away from the `Value` class hierarchy, as part of
PR21532. Assembly and bitcode changes are in the wings, but this is the
bulk of the change for the IR C++ API.
I have a follow-up patch prepared for `clang`. If this breaks other
sub-projects, I apologize in advance :(. Help me compile it on Darwin
I'll try to fix it. FWIW, the errors should be easy to fix, so it may
be simpler to just fix it yourself.
This breaks the build for all metadata-related code that's out-of-tree.
Rest assured the transition is mechanical and the compiler should catch
almost all of the problems.
Here's a quick guide for updating your code:
- `Metadata` is the root of a class hierarchy with three main classes:
`MDNode`, `MDString`, and `ValueAsMetadata`. It is distinct from
the `Value` class hierarchy. It is typeless -- i.e., instances do
*not* have a `Type`.
- `MDNode`'s operands are all `Metadata *` (instead of `Value *`).
- `TrackingVH<MDNode>` and `WeakVH` referring to metadata can be
replaced with `TrackingMDNodeRef` and `TrackingMDRef`, respectively.
If you're referring solely to resolved `MDNode`s -- post graph
construction -- just use `MDNode*`.
- `MDNode` (and the rest of `Metadata`) have only limited support for
`replaceAllUsesWith()`.
As long as an `MDNode` is pointing at a forward declaration -- the
result of `MDNode::getTemporary()` -- it maintains a side map of its
uses and can RAUW itself. Once the forward declarations are fully
resolved RAUW support is dropped on the ground. This means that
uniquing collisions on changing operands cause nodes to become
"distinct". (This already happened fairly commonly, whenever an
operand went to null.)
If you're constructing complex (non self-reference) `MDNode` cycles,
you need to call `MDNode::resolveCycles()` on each node (or on a
top-level node that somehow references all of the nodes). Also,
don't do that. Metadata cycles (and the RAUW machinery needed to
construct them) are expensive.
- An `MDNode` can only refer to a `Constant` through a bridge called
`ConstantAsMetadata` (one of the subclasses of `ValueAsMetadata`).
As a side effect, accessing an operand of an `MDNode` that is known
to be, e.g., `ConstantInt`, takes three steps: first, cast from
`Metadata` to `ConstantAsMetadata`; second, extract the `Constant`;
third, cast down to `ConstantInt`.
The eventual goal is to introduce `MDInt`/`MDFloat`/etc. and have
metadata schema owners transition away from using `Constant`s when
the type isn't important (and they don't care about referring to
`GlobalValue`s).
In the meantime, I've added transitional API to the `mdconst`
namespace that matches semantics with the old code, in order to
avoid adding the error-prone three-step equivalent to every call
site. If your old code was:
MDNode *N = foo();
bar(isa <ConstantInt>(N->getOperand(0)));
baz(cast <ConstantInt>(N->getOperand(1)));
bak(cast_or_null <ConstantInt>(N->getOperand(2)));
bat(dyn_cast <ConstantInt>(N->getOperand(3)));
bay(dyn_cast_or_null<ConstantInt>(N->getOperand(4)));
you can trivially match its semantics with:
MDNode *N = foo();
bar(mdconst::hasa <ConstantInt>(N->getOperand(0)));
baz(mdconst::extract <ConstantInt>(N->getOperand(1)));
bak(mdconst::extract_or_null <ConstantInt>(N->getOperand(2)));
bat(mdconst::dyn_extract <ConstantInt>(N->getOperand(3)));
bay(mdconst::dyn_extract_or_null<ConstantInt>(N->getOperand(4)));
and when you transition your metadata schema to `MDInt`:
MDNode *N = foo();
bar(isa <MDInt>(N->getOperand(0)));
baz(cast <MDInt>(N->getOperand(1)));
bak(cast_or_null <MDInt>(N->getOperand(2)));
bat(dyn_cast <MDInt>(N->getOperand(3)));
bay(dyn_cast_or_null<MDInt>(N->getOperand(4)));
- A `CallInst` -- specifically, intrinsic instructions -- can refer to
metadata through a bridge called `MetadataAsValue`. This is a
subclass of `Value` where `getType()->isMetadataTy()`.
`MetadataAsValue` is the *only* class that can legally refer to a
`LocalAsMetadata`, which is a bridged form of non-`Constant` values
like `Argument` and `Instruction`. It can also refer to any other
`Metadata` subclass.
(I'll break all your testcases in a follow-up commit, when I propagate
this change to assembly.)
llvm-svn: 223802
2014-12-10 02:38:53 +08:00
|
|
|
MDs.push_back(Node);
|
Small refactor on VectorizerHint for deduplication
Previously, the hint mechanism relied on clean up passes to remove redundant
metadata, which still showed up if running opt at low levels of optimization.
That also has shown that multiple nodes of the same type, but with different
values could still coexist, even if temporary, and cause confusion if the
next pass got the wrong value.
This patch makes sure that, if metadata already exists in a loop, the hint
mechanism will never append a new node, but always replace the existing one.
It also enhances the algorithm to cope with more metadata types in the future
by just adding a new type, not a lot of code.
Re-applying again due to MSVC 2013 being minimum requirement, and this patch
having C++11 that MSVC 2012 didn't support.
Fixes PR20655.
llvm-svn: 216870
2014-09-01 18:00:17 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Now, add the missing hints.
|
|
|
|
for (auto H : HintTypes)
|
IR: Split Metadata from Value
Split `Metadata` away from the `Value` class hierarchy, as part of
PR21532. Assembly and bitcode changes are in the wings, but this is the
bulk of the change for the IR C++ API.
I have a follow-up patch prepared for `clang`. If this breaks other
sub-projects, I apologize in advance :(. Help me compile it on Darwin
I'll try to fix it. FWIW, the errors should be easy to fix, so it may
be simpler to just fix it yourself.
This breaks the build for all metadata-related code that's out-of-tree.
Rest assured the transition is mechanical and the compiler should catch
almost all of the problems.
Here's a quick guide for updating your code:
- `Metadata` is the root of a class hierarchy with three main classes:
`MDNode`, `MDString`, and `ValueAsMetadata`. It is distinct from
the `Value` class hierarchy. It is typeless -- i.e., instances do
*not* have a `Type`.
- `MDNode`'s operands are all `Metadata *` (instead of `Value *`).
- `TrackingVH<MDNode>` and `WeakVH` referring to metadata can be
replaced with `TrackingMDNodeRef` and `TrackingMDRef`, respectively.
If you're referring solely to resolved `MDNode`s -- post graph
construction -- just use `MDNode*`.
- `MDNode` (and the rest of `Metadata`) have only limited support for
`replaceAllUsesWith()`.
As long as an `MDNode` is pointing at a forward declaration -- the
result of `MDNode::getTemporary()` -- it maintains a side map of its
uses and can RAUW itself. Once the forward declarations are fully
resolved RAUW support is dropped on the ground. This means that
uniquing collisions on changing operands cause nodes to become
"distinct". (This already happened fairly commonly, whenever an
operand went to null.)
If you're constructing complex (non self-reference) `MDNode` cycles,
you need to call `MDNode::resolveCycles()` on each node (or on a
top-level node that somehow references all of the nodes). Also,
don't do that. Metadata cycles (and the RAUW machinery needed to
construct them) are expensive.
- An `MDNode` can only refer to a `Constant` through a bridge called
`ConstantAsMetadata` (one of the subclasses of `ValueAsMetadata`).
As a side effect, accessing an operand of an `MDNode` that is known
to be, e.g., `ConstantInt`, takes three steps: first, cast from
`Metadata` to `ConstantAsMetadata`; second, extract the `Constant`;
third, cast down to `ConstantInt`.
The eventual goal is to introduce `MDInt`/`MDFloat`/etc. and have
metadata schema owners transition away from using `Constant`s when
the type isn't important (and they don't care about referring to
`GlobalValue`s).
In the meantime, I've added transitional API to the `mdconst`
namespace that matches semantics with the old code, in order to
avoid adding the error-prone three-step equivalent to every call
site. If your old code was:
MDNode *N = foo();
bar(isa <ConstantInt>(N->getOperand(0)));
baz(cast <ConstantInt>(N->getOperand(1)));
bak(cast_or_null <ConstantInt>(N->getOperand(2)));
bat(dyn_cast <ConstantInt>(N->getOperand(3)));
bay(dyn_cast_or_null<ConstantInt>(N->getOperand(4)));
you can trivially match its semantics with:
MDNode *N = foo();
bar(mdconst::hasa <ConstantInt>(N->getOperand(0)));
baz(mdconst::extract <ConstantInt>(N->getOperand(1)));
bak(mdconst::extract_or_null <ConstantInt>(N->getOperand(2)));
bat(mdconst::dyn_extract <ConstantInt>(N->getOperand(3)));
bay(mdconst::dyn_extract_or_null<ConstantInt>(N->getOperand(4)));
and when you transition your metadata schema to `MDInt`:
MDNode *N = foo();
bar(isa <MDInt>(N->getOperand(0)));
baz(cast <MDInt>(N->getOperand(1)));
bak(cast_or_null <MDInt>(N->getOperand(2)));
bat(dyn_cast <MDInt>(N->getOperand(3)));
bay(dyn_cast_or_null<MDInt>(N->getOperand(4)));
- A `CallInst` -- specifically, intrinsic instructions -- can refer to
metadata through a bridge called `MetadataAsValue`. This is a
subclass of `Value` where `getType()->isMetadataTy()`.
`MetadataAsValue` is the *only* class that can legally refer to a
`LocalAsMetadata`, which is a bridged form of non-`Constant` values
like `Argument` and `Instruction`. It can also refer to any other
`Metadata` subclass.
(I'll break all your testcases in a follow-up commit, when I propagate
this change to assembly.)
llvm-svn: 223802
2014-12-10 02:38:53 +08:00
|
|
|
MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value));
|
Small refactor on VectorizerHint for deduplication
Previously, the hint mechanism relied on clean up passes to remove redundant
metadata, which still showed up if running opt at low levels of optimization.
That also has shown that multiple nodes of the same type, but with different
values could still coexist, even if temporary, and cause confusion if the
next pass got the wrong value.
This patch makes sure that, if metadata already exists in a loop, the hint
mechanism will never append a new node, but always replace the existing one.
It also enhances the algorithm to cope with more metadata types in the future
by just adding a new type, not a lot of code.
Re-applying again due to MSVC 2013 being minimum requirement, and this patch
having C++11 that MSVC 2012 didn't support.
Fixes PR20655.
llvm-svn: 216870
2014-09-01 18:00:17 +08:00
|
|
|
|
|
|
|
// Replace current metadata node with new one.
|
|
|
|
LLVMContext &Context = TheLoop->getHeader()->getContext();
|
IR: Split Metadata from Value
Split `Metadata` away from the `Value` class hierarchy, as part of
PR21532. Assembly and bitcode changes are in the wings, but this is the
bulk of the change for the IR C++ API.
I have a follow-up patch prepared for `clang`. If this breaks other
sub-projects, I apologize in advance :(. Help me compile it on Darwin
I'll try to fix it. FWIW, the errors should be easy to fix, so it may
be simpler to just fix it yourself.
This breaks the build for all metadata-related code that's out-of-tree.
Rest assured the transition is mechanical and the compiler should catch
almost all of the problems.
Here's a quick guide for updating your code:
- `Metadata` is the root of a class hierarchy with three main classes:
`MDNode`, `MDString`, and `ValueAsMetadata`. It is distinct from
the `Value` class hierarchy. It is typeless -- i.e., instances do
*not* have a `Type`.
- `MDNode`'s operands are all `Metadata *` (instead of `Value *`).
- `TrackingVH<MDNode>` and `WeakVH` referring to metadata can be
replaced with `TrackingMDNodeRef` and `TrackingMDRef`, respectively.
If you're referring solely to resolved `MDNode`s -- post graph
construction -- just use `MDNode*`.
- `MDNode` (and the rest of `Metadata`) have only limited support for
`replaceAllUsesWith()`.
As long as an `MDNode` is pointing at a forward declaration -- the
result of `MDNode::getTemporary()` -- it maintains a side map of its
uses and can RAUW itself. Once the forward declarations are fully
resolved RAUW support is dropped on the ground. This means that
uniquing collisions on changing operands cause nodes to become
"distinct". (This already happened fairly commonly, whenever an
operand went to null.)
If you're constructing complex (non self-reference) `MDNode` cycles,
you need to call `MDNode::resolveCycles()` on each node (or on a
top-level node that somehow references all of the nodes). Also,
don't do that. Metadata cycles (and the RAUW machinery needed to
construct them) are expensive.
- An `MDNode` can only refer to a `Constant` through a bridge called
`ConstantAsMetadata` (one of the subclasses of `ValueAsMetadata`).
As a side effect, accessing an operand of an `MDNode` that is known
to be, e.g., `ConstantInt`, takes three steps: first, cast from
`Metadata` to `ConstantAsMetadata`; second, extract the `Constant`;
third, cast down to `ConstantInt`.
The eventual goal is to introduce `MDInt`/`MDFloat`/etc. and have
metadata schema owners transition away from using `Constant`s when
the type isn't important (and they don't care about referring to
`GlobalValue`s).
In the meantime, I've added transitional API to the `mdconst`
namespace that matches semantics with the old code, in order to
avoid adding the error-prone three-step equivalent to every call
site. If your old code was:
MDNode *N = foo();
bar(isa <ConstantInt>(N->getOperand(0)));
baz(cast <ConstantInt>(N->getOperand(1)));
bak(cast_or_null <ConstantInt>(N->getOperand(2)));
bat(dyn_cast <ConstantInt>(N->getOperand(3)));
bay(dyn_cast_or_null<ConstantInt>(N->getOperand(4)));
you can trivially match its semantics with:
MDNode *N = foo();
bar(mdconst::hasa <ConstantInt>(N->getOperand(0)));
baz(mdconst::extract <ConstantInt>(N->getOperand(1)));
bak(mdconst::extract_or_null <ConstantInt>(N->getOperand(2)));
bat(mdconst::dyn_extract <ConstantInt>(N->getOperand(3)));
bay(mdconst::dyn_extract_or_null<ConstantInt>(N->getOperand(4)));
and when you transition your metadata schema to `MDInt`:
MDNode *N = foo();
bar(isa <MDInt>(N->getOperand(0)));
baz(cast <MDInt>(N->getOperand(1)));
bak(cast_or_null <MDInt>(N->getOperand(2)));
bat(dyn_cast <MDInt>(N->getOperand(3)));
bay(dyn_cast_or_null<MDInt>(N->getOperand(4)));
- A `CallInst` -- specifically, intrinsic instructions -- can refer to
metadata through a bridge called `MetadataAsValue`. This is a
subclass of `Value` where `getType()->isMetadataTy()`.
`MetadataAsValue` is the *only* class that can legally refer to a
`LocalAsMetadata`, which is a bridged form of non-`Constant` values
like `Argument` and `Instruction`. It can also refer to any other
`Metadata` subclass.
(I'll break all your testcases in a follow-up commit, when I propagate
this change to assembly.)
llvm-svn: 223802
2014-12-10 02:38:53 +08:00
|
|
|
MDNode *NewLoopID = MDNode::get(Context, MDs);
|
Small refactor on VectorizerHint for deduplication
Previously, the hint mechanism relied on clean up passes to remove redundant
metadata, which still showed up if running opt at low levels of optimization.
That also has shown that multiple nodes of the same type, but with different
values could still coexist, even if temporary, and cause confusion if the
next pass got the wrong value.
This patch makes sure that, if metadata already exists in a loop, the hint
mechanism will never append a new node, but always replace the existing one.
It also enhances the algorithm to cope with more metadata types in the future
by just adding a new type, not a lot of code.
Re-applying again due to MSVC 2013 being minimum requirement, and this patch
having C++11 that MSVC 2012 didn't support.
Fixes PR20655.
llvm-svn: 216870
2014-09-01 18:00:17 +08:00
|
|
|
// Set operand 0 to refer to the loop id itself.
|
|
|
|
NewLoopID->replaceOperandWith(0, NewLoopID);
|
|
|
|
|
|
|
|
TheLoop->setLoopID(NewLoopID);
|
|
|
|
}
|
2014-05-03 04:40:04 +08:00
|
|
|
|
Small refactor on VectorizerHint for deduplication
Previously, the hint mechanism relied on clean up passes to remove redundant
metadata, which still showed up if running opt at low levels of optimization.
That also has shown that multiple nodes of the same type, but with different
values could still coexist, even if temporary, and cause confusion if the
next pass got the wrong value.
This patch makes sure that, if metadata already exists in a loop, the hint
mechanism will never append a new node, but always replace the existing one.
It also enhances the algorithm to cope with more metadata types in the future
by just adding a new type, not a lot of code.
Re-applying again due to MSVC 2013 being minimum requirement, and this patch
having C++11 that MSVC 2012 didn't support.
Fixes PR20655.
llvm-svn: 216870
2014-09-01 18:00:17 +08:00
|
|
|
/// The loop these hints belong to.
|
|
|
|
const Loop *TheLoop;
|
2013-05-29 04:00:34 +08:00
|
|
|
};
|
|
|
|
|
2014-07-16 08:36:00 +08:00
|
|
|
/// Reports that loop \p L in function \p F was not transformed.
///
/// A missed-optimization remark carrying LH.emitRemark() is always emitted.
/// When LH.getForce() is FK_Enabled, a warning is emitted as well: a
/// vectorization warning if LH.getWidth() is not 1, otherwise an interleaving
/// warning if LH.getInterleave() is not 1.
static void emitMissedWarning(Function *F, Loop *L,
                              const LoopVectorizeHints &LH) {
  emitOptimizationRemarkMissed(F->getContext(), DEBUG_TYPE, *F,
                               L->getStartLoc(), LH.emitRemark());

  // Only loops with FK_Enabled hints get a warning on top of the remark.
  if (LH.getForce() != LoopVectorizeHints::FK_Enabled)
    return;

  // A non-unit width takes precedence over the interleave count.
  if (LH.getWidth() != 1) {
    emitLoopVectorizeWarning(F->getContext(), *F, L->getStartLoc(),
                             "failed explicitly specified loop vectorization");
    return;
  }

  if (LH.getInterleave() != 1) {
    emitLoopInterleaveWarning(F->getContext(), *F, L->getStartLoc(),
                              "failed explicitly specified loop interleaving");
  }
}
|
|
|
|
|
2014-03-19 06:00:32 +08:00
|
|
|
/// Collects into \p V the loops reachable from \p L for which empty() holds.
///
/// If \p L itself is empty() it is appended to \p V. Otherwise each loop
/// obtained by iterating \p L is visited recursively, in iteration order.
static void addInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) {
  if (!L.empty()) {
    // Not a leaf: descend into every loop contained in L.
    for (Loop *Child : L)
      addInnerLoop(*Child, V);
    return;
  }

  // Leaf: record the loop itself.
  V.push_back(&L);
}
|
|
|
|
|
2012-12-11 05:39:02 +08:00
|
|
|
/// The LoopVectorize Pass.
|
2014-01-25 18:01:55 +08:00
|
|
|
struct LoopVectorize : public FunctionPass {
|
2012-12-12 09:11:46 +08:00
|
|
|
/// Pass identification, replacement for typeid
|
|
|
|
static char ID;
|
2012-10-18 02:25:06 +08:00
|
|
|
|
2013-12-06 05:20:02 +08:00
|
|
|
explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true)
|
2014-01-25 18:01:55 +08:00
|
|
|
: FunctionPass(ID),
|
2013-12-06 05:20:02 +08:00
|
|
|
DisableUnrolling(NoUnrolling),
|
|
|
|
AlwaysVectorize(AlwaysVectorize) {
|
2012-10-18 02:25:06 +08:00
|
|
|
initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
|
|
|
|
}
|
|
|
|
|
|
|
|
ScalarEvolution *SE;
|
|
|
|
LoopInfo *LI;
|
2012-10-25 04:36:32 +08:00
|
|
|
TargetTransformInfo *TTI;
|
2012-10-30 05:52:38 +08:00
|
|
|
DominatorTree *DT;
|
2014-01-27 21:11:50 +08:00
|
|
|
BlockFrequencyInfo *BFI;
|
2013-02-27 23:24:19 +08:00
|
|
|
TargetLibraryInfo *TLI;
|
[LoopVectorize] Use AA to partition potential dependency checks
Prior to this change, the loop vectorizer did not make use of the alias
analysis infrastructure. Instead, it performed memory dependence analysis using
ScalarEvolution-based linear dependence checks within equivalence classes
derived from the results of ValueTracking's GetUnderlyingObjects.
Unfortunately, this meant that:
1. The loop vectorizer had logic that essentially duplicated that in BasicAA
for aliasing based on identified objects.
2. The loop vectorizer could not partition the space of dependency checks
based on information only easily available from within AA (TBAA metadata is
currently the prime example).
This means, for example, regardless of whether -fno-strict-aliasing was
provided, the vectorizer would only vectorize this loop with a runtime
memory-overlap check:
void foo(int *a, float *b) {
for (int i = 0; i < 1600; ++i)
a[i] = b[i];
}
This is suboptimal because the TBAA metadata already provides the information
necessary to show that this check unnecessary. Of course, the vectorizer has a
limit on the number of such checks it will insert, so in practice, ignoring
TBAA means not vectorizing more-complicated loops that we should.
This change causes the vectorizer to use an AliasSetTracker to keep track of
the pointers in the loop. The resulting alias sets are then used to partition
the space of dependency checks, and potential runtime checks; this results in
more-efficient vectorizations.
When pointer locations are added to the AliasSetTracker, two things are done:
1. The location size is set to UnknownSize (otherwise you'd not catch
inter-iteration dependencies)
2. For instructions in blocks that would need to be predicated, TBAA is
removed (because the metadata might have a control dependency on the condition
being speculated).
For non-predicated blocks, you can leave the TBAA metadata. This is safe
because you can't have an iteration dependency on the TBAA metadata (if you
did, and you unrolled sufficiently, you'd end up with the same pointer value
used by two accesses that TBAA says should not alias, and that would yield
undefined behavior).
llvm-svn: 213486
2014-07-21 07:07:52 +08:00
|
|
|
AliasAnalysis *AA;
|
2015-01-04 20:03:27 +08:00
|
|
|
AssumptionCache *AC;
|
2015-02-20 03:15:04 +08:00
|
|
|
LoopAccessAnalysis *LAA;
|
2013-08-29 02:33:10 +08:00
|
|
|
bool DisableUnrolling;
|
2013-12-06 05:20:02 +08:00
|
|
|
bool AlwaysVectorize;
|
2012-10-18 02:25:06 +08:00
|
|
|
|
2014-01-27 21:11:50 +08:00
|
|
|
BlockFrequency ColdEntryFreq;
|
|
|
|
|
2014-03-05 17:10:37 +08:00
|
|
|
bool runOnFunction(Function &F) override {
|
2012-10-18 02:25:06 +08:00
|
|
|
SE = &getAnalysis<ScalarEvolution>();
|
2015-01-17 22:16:18 +08:00
|
|
|
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
|
2015-02-01 20:01:35 +08:00
|
|
|
TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
|
2014-01-13 21:07:17 +08:00
|
|
|
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
|
2014-01-27 21:11:50 +08:00
|
|
|
BFI = &getAnalysis<BlockFrequencyInfo>();
|
2015-01-15 18:41:28 +08:00
|
|
|
auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
|
|
|
|
TLI = TLIP ? &TLIP->getTLI() : nullptr;
|
[LoopVectorize] Use AA to partition potential dependency checks
Prior to this change, the loop vectorizer did not make use of the alias
analysis infrastructure. Instead, it performed memory dependence analysis using
ScalarEvolution-based linear dependence checks within equivalence classes
derived from the results of ValueTracking's GetUnderlyingObjects.
Unfortunately, this meant that:
1. The loop vectorizer had logic that essentially duplicated that in BasicAA
for aliasing based on identified objects.
2. The loop vectorizer could not partition the space of dependency checks
based on information only easily available from within AA (TBAA metadata is
currently the prime example).
This means, for example, regardless of whether -fno-strict-aliasing was
provided, the vectorizer would only vectorize this loop with a runtime
memory-overlap check:
void foo(int *a, float *b) {
for (int i = 0; i < 1600; ++i)
a[i] = b[i];
}
This is suboptimal because the TBAA metadata already provides the information
necessary to show that this check unnecessary. Of course, the vectorizer has a
limit on the number of such checks it will insert, so in practice, ignoring
TBAA means not vectorizing more-complicated loops that we should.
This change causes the vectorizer to use an AliasSetTracker to keep track of
the pointers in the loop. The resulting alias sets are then used to partition
the space of dependency checks, and potential runtime checks; this results in
more-efficient vectorizations.
When pointer locations are added to the AliasSetTracker, two things are done:
1. The location size is set to UnknownSize (otherwise you'd not catch
inter-iteration dependencies)
2. For instructions in blocks that would need to be predicated, TBAA is
removed (because the metadata might have a control dependency on the condition
being speculated).
For non-predicated blocks, you can leave the TBAA metadata. This is safe
because you can't have an iteration dependency on the TBAA metadata (if you
did, and you unrolled sufficiently, you'd end up with the same pointer value
used by two accesses that TBAA says should not alias, and that would yield
undefined behavior).
llvm-svn: 213486
2014-07-21 07:07:52 +08:00
|
|
|
AA = &getAnalysis<AliasAnalysis>();
|
2015-01-04 20:03:27 +08:00
|
|
|
AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
|
2015-02-20 03:15:04 +08:00
|
|
|
LAA = &getAnalysis<LoopAccessAnalysis>();
|
2012-10-18 02:25:06 +08:00
|
|
|
|
2014-01-27 21:11:50 +08:00
|
|
|
// Compute some weights outside of the loop over the loops. Compute this
|
|
|
|
// using a BranchProbability to re-use its scaling math.
|
|
|
|
const BranchProbability ColdProb(1, 5); // 20%
|
|
|
|
ColdEntryFreq = BlockFrequency(BFI->getEntryFreq()) * ColdProb;
|
|
|
|
|
2013-09-18 20:43:35 +08:00
|
|
|
// If the target claims to have no vector registers don't attempt
|
|
|
|
// vectorization.
|
|
|
|
if (!TTI->getNumberOfRegisters(true))
|
|
|
|
return false;
|
|
|
|
|
2014-01-25 18:01:55 +08:00
|
|
|
// Build up a worklist of inner-loops to vectorize. This is necessary as
|
|
|
|
// the act of vectorizing or partially unrolling a loop creates new loops
|
|
|
|
// and can invalidate iterators across the loops.
|
|
|
|
SmallVector<Loop *, 8> Worklist;
|
|
|
|
|
2014-03-19 06:00:32 +08:00
|
|
|
for (Loop *L : *LI)
|
|
|
|
addInnerLoop(*L, Worklist);
|
2014-01-25 18:01:55 +08:00
|
|
|
|
2014-04-23 16:40:37 +08:00
|
|
|
LoopsAnalyzed += Worklist.size();
|
|
|
|
|
2014-01-25 18:01:55 +08:00
|
|
|
// Now walk the identified inner loops.
|
|
|
|
bool Changed = false;
|
|
|
|
while (!Worklist.empty())
|
|
|
|
Changed |= processLoop(Worklist.pop_back_val());
|
|
|
|
|
|
|
|
// Process each loop nest in the function.
|
|
|
|
return Changed;
|
|
|
|
}
|
|
|
|
|
2015-03-09 14:14:18 +08:00
|
|
|
static void AddRuntimeUnrollDisableMetaData(Loop *L) {
|
|
|
|
SmallVector<Metadata *, 4> MDs;
|
|
|
|
// Reserve first location for self reference to the LoopID metadata node.
|
|
|
|
MDs.push_back(nullptr);
|
|
|
|
bool IsUnrollMetadata = false;
|
|
|
|
MDNode *LoopID = L->getLoopID();
|
|
|
|
if (LoopID) {
|
|
|
|
// First find existing loop unrolling disable metadata.
|
|
|
|
for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
|
|
|
|
MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
|
|
|
|
if (MD) {
|
|
|
|
const MDString *S = dyn_cast<MDString>(MD->getOperand(0));
|
|
|
|
IsUnrollMetadata =
|
|
|
|
S && S->getString().startswith("llvm.loop.unroll.disable");
|
|
|
|
}
|
|
|
|
MDs.push_back(LoopID->getOperand(i));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!IsUnrollMetadata) {
|
|
|
|
// Add runtime unroll disable metadata.
|
|
|
|
LLVMContext &Context = L->getHeader()->getContext();
|
|
|
|
SmallVector<Metadata *, 1> DisableOperands;
|
|
|
|
DisableOperands.push_back(
|
|
|
|
MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
|
|
|
|
MDNode *DisableNode = MDNode::get(Context, DisableOperands);
|
|
|
|
MDs.push_back(DisableNode);
|
|
|
|
MDNode *NewLoopID = MDNode::get(Context, MDs);
|
|
|
|
// Set operand 0 to refer to the loop id itself.
|
|
|
|
NewLoopID->replaceOperandWith(0, NewLoopID);
|
|
|
|
L->setLoopID(NewLoopID);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-01-25 18:01:55 +08:00
|
|
|
bool processLoop(Loop *L) {
|
2014-03-19 05:58:38 +08:00
|
|
|
assert(L->empty() && "Only process inner loops.");
|
2014-05-07 17:51:22 +08:00
|
|
|
|
|
|
|
#ifndef NDEBUG
|
2014-05-20 16:26:20 +08:00
|
|
|
const std::string DebugLocStr = getDebugLocString(L);
|
2014-05-07 17:51:22 +08:00
|
|
|
#endif /* NDEBUG */
|
|
|
|
|
2014-04-15 17:37:30 +08:00
|
|
|
DEBUG(dbgs() << "\nLV: Checking a loop in \""
|
2014-04-07 20:32:17 +08:00
|
|
|
<< L->getHeader()->getParent()->getName() << "\" from "
|
2014-05-07 17:51:22 +08:00
|
|
|
<< DebugLocStr << "\n");
|
2012-10-18 02:25:06 +08:00
|
|
|
|
2013-08-29 02:33:10 +08:00
|
|
|
LoopVectorizeHints Hints(L, DisableUnrolling);
|
2013-05-29 04:00:34 +08:00
|
|
|
|
2014-04-15 17:37:30 +08:00
|
|
|
DEBUG(dbgs() << "LV: Loop hints:"
|
2014-04-29 16:55:11 +08:00
|
|
|
<< " force="
|
2014-05-03 04:40:04 +08:00
|
|
|
<< (Hints.getForce() == LoopVectorizeHints::FK_Disabled
|
2014-04-29 16:55:11 +08:00
|
|
|
? "disabled"
|
2014-05-03 04:40:04 +08:00
|
|
|
: (Hints.getForce() == LoopVectorizeHints::FK_Enabled
|
2014-04-29 16:55:11 +08:00
|
|
|
? "enabled"
|
2014-05-03 04:40:04 +08:00
|
|
|
: "?")) << " width=" << Hints.getWidth()
|
2014-09-11 01:58:16 +08:00
|
|
|
<< " unroll=" << Hints.getInterleave() << "\n");
|
2014-04-29 16:55:11 +08:00
|
|
|
|
2014-06-26 01:50:15 +08:00
|
|
|
// Function containing loop
|
|
|
|
Function *F = L->getHeader()->getParent();
|
|
|
|
|
|
|
|
// Looking at the diagnostic output is the only way to determine if a loop
|
|
|
|
// was vectorized (other than looking at the IR or machine code), so it
|
|
|
|
// is important to generate an optimization remark for each loop. Most of
|
|
|
|
// these messages are generated by emitOptimizationRemarkAnalysis. Remarks
|
|
|
|
// generated by emitOptimizationRemark and emitOptimizationRemarkMissed are
|
|
|
|
// less verbose reporting vectorized loops and unvectorized loops that may
|
|
|
|
// benefit from vectorization, respectively.
|
|
|
|
|
2014-05-03 04:40:04 +08:00
|
|
|
if (Hints.getForce() == LoopVectorizeHints::FK_Disabled) {
|
2013-12-06 05:20:02 +08:00
|
|
|
DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
|
2014-06-26 01:50:15 +08:00
|
|
|
emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F,
|
|
|
|
L->getStartLoc(), Hints.emitRemark());
|
2013-12-06 05:20:02 +08:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2014-05-03 04:40:04 +08:00
|
|
|
if (!AlwaysVectorize && Hints.getForce() != LoopVectorizeHints::FK_Enabled) {
|
2013-12-06 05:20:02 +08:00
|
|
|
DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
|
2014-06-26 01:50:15 +08:00
|
|
|
emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F,
|
|
|
|
L->getStartLoc(), Hints.emitRemark());
|
2013-12-06 05:20:02 +08:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2014-09-11 01:58:16 +08:00
|
|
|
if (Hints.getWidth() == 1 && Hints.getInterleave() == 1) {
|
2013-12-06 05:20:02 +08:00
|
|
|
DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
|
2014-06-26 01:50:15 +08:00
|
|
|
emitOptimizationRemarkAnalysis(
|
|
|
|
F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
|
|
|
|
"loop not vectorized: vector width and interleave count are "
|
|
|
|
"explicitly set to 1");
|
2013-05-29 04:00:34 +08:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2014-04-29 16:55:11 +08:00
|
|
|
// Check the loop for a trip count threshold:
|
|
|
|
// do not vectorize loops with a tiny trip count.
|
2014-10-11 08:12:11 +08:00
|
|
|
const unsigned TC = SE->getSmallConstantTripCount(L);
|
2014-04-29 16:55:11 +08:00
|
|
|
if (TC > 0u && TC < TinyTripCountVectorThreshold) {
|
|
|
|
DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
|
|
|
|
<< "This loop is not worth vectorizing.");
|
2014-05-03 04:40:04 +08:00
|
|
|
if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
|
2014-04-29 16:55:11 +08:00
|
|
|
DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
|
|
|
|
else {
|
|
|
|
DEBUG(dbgs() << "\n");
|
2014-06-26 01:50:15 +08:00
|
|
|
emitOptimizationRemarkAnalysis(
|
|
|
|
F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
|
|
|
|
"vectorization is not beneficial and is not explicitly forced");
|
2014-04-29 16:55:11 +08:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-10-18 02:25:06 +08:00
|
|
|
// Check if it is legal to vectorize the loop.
|
2015-03-10 10:37:25 +08:00
|
|
|
LoopVectorizationLegality LVL(L, SE, DT, TLI, AA, F, TTI, LAA);
|
2012-10-25 04:36:32 +08:00
|
|
|
if (!LVL.canVectorize()) {
|
2013-12-06 05:20:02 +08:00
|
|
|
DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
|
2014-07-16 08:36:00 +08:00
|
|
|
emitMissedWarning(F, L, Hints);
|
2012-10-18 02:25:06 +08:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2012-12-12 09:11:46 +08:00
|
|
|
// Use the cost model.
|
2015-03-10 10:37:25 +08:00
|
|
|
LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, TLI, AC, F, &Hints);
|
2012-12-13 03:29:45 +08:00
|
|
|
|
2013-02-09 01:43:32 +08:00
|
|
|
// Check the function attributes to find out if this function should be
|
2012-12-13 03:29:45 +08:00
|
|
|
// optimized for size.
|
2014-05-03 04:40:04 +08:00
|
|
|
bool OptForSize = Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
|
2014-04-29 16:55:11 +08:00
|
|
|
F->hasFnAttribute(Attribute::OptimizeForSize);
|
2014-01-27 19:27:37 +08:00
|
|
|
|
2014-01-27 21:11:50 +08:00
|
|
|
// Compute the weighted frequency of this loop being executed and see if it
|
|
|
|
// is less than 20% of the function entry baseline frequency. Note that we
|
|
|
|
// always have a canonical loop here because we think we *can* vectoriez.
|
2014-01-28 17:10:41 +08:00
|
|
|
// FIXME: This is hidden behind a flag due to pervasive problems with
|
|
|
|
// exactly what block frequency models.
|
|
|
|
if (LoopVectorizeWithBlockFrequency) {
|
|
|
|
BlockFrequency LoopEntryFreq = BFI->getBlockFreq(L->getLoopPreheader());
|
2014-05-03 04:40:04 +08:00
|
|
|
if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
|
2014-04-29 16:55:11 +08:00
|
|
|
LoopEntryFreq < ColdEntryFreq)
|
2014-01-28 17:10:41 +08:00
|
|
|
OptForSize = true;
|
|
|
|
}
|
2014-01-27 21:11:50 +08:00
|
|
|
|
2014-01-27 19:27:37 +08:00
|
|
|
// Check the function attributes to see if implicit floats are allowed.a
|
|
|
|
// FIXME: This check doesn't seem possibly correct -- what if the loop is
|
|
|
|
// an integer loop and the vector instructions selected are purely integer
|
|
|
|
// vector instructions?
|
|
|
|
if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
|
2013-01-03 07:54:43 +08:00
|
|
|
DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
|
|
|
|
"attribute is used.\n");
|
2014-06-26 01:50:15 +08:00
|
|
|
emitOptimizationRemarkAnalysis(
|
|
|
|
F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
|
|
|
|
"loop not vectorized due to NoImplicitFloat attribute");
|
2014-07-16 08:36:00 +08:00
|
|
|
emitMissedWarning(F, L, Hints);
|
2013-01-03 07:54:43 +08:00
|
|
|
return false;
|
|
|
|
}
|
2012-12-13 03:29:45 +08:00
|
|
|
|
2013-01-20 13:24:29 +08:00
|
|
|
// Select the optimal vectorization factor.
|
2014-04-15 17:37:30 +08:00
|
|
|
const LoopVectorizationCostModel::VectorizationFactor VF =
|
2014-08-02 08:14:03 +08:00
|
|
|
CM.selectVectorizationFactor(OptForSize);
|
2014-04-29 16:55:11 +08:00
|
|
|
|
2013-01-20 13:24:29 +08:00
|
|
|
// Select the unroll factor.
|
2014-05-03 04:40:04 +08:00
|
|
|
const unsigned UF =
|
2014-08-02 08:14:03 +08:00
|
|
|
CM.selectUnrollFactor(OptForSize, VF.Width, VF.Cost);
|
2012-12-12 09:11:46 +08:00
|
|
|
|
2014-05-07 17:51:22 +08:00
|
|
|
DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
|
|
|
|
<< DebugLocStr << '\n');
|
2013-10-03 04:04:29 +08:00
|
|
|
DEBUG(dbgs() << "LV: Unroll Factor is " << UF << '\n');
|
2012-10-18 02:25:06 +08:00
|
|
|
|
2013-08-27 06:33:26 +08:00
|
|
|
if (VF.Width == 1) {
|
2014-06-26 01:50:15 +08:00
|
|
|
DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial\n");
|
|
|
|
|
|
|
|
if (UF == 1) {
|
|
|
|
emitOptimizationRemarkAnalysis(
|
|
|
|
F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
|
|
|
|
"not beneficial to vectorize and user disabled interleaving");
|
2013-08-27 06:33:26 +08:00
|
|
|
return false;
|
2014-06-26 01:50:15 +08:00
|
|
|
}
|
2013-12-06 05:20:02 +08:00
|
|
|
DEBUG(dbgs() << "LV: Trying to at least unroll the loops.\n");
|
2014-04-30 04:06:10 +08:00
|
|
|
|
|
|
|
// Report the unrolling decision.
|
2014-05-22 22:19:46 +08:00
|
|
|
emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
|
|
|
|
Twine("unrolled with interleaving factor " +
|
|
|
|
Twine(UF) +
|
|
|
|
" (vectorization not beneficial)"));
|
2014-04-30 04:06:10 +08:00
|
|
|
|
2013-08-27 06:33:26 +08:00
|
|
|
// We decided not to vectorize, but we may want to unroll.
|
2014-06-26 01:50:15 +08:00
|
|
|
|
2015-03-18 03:17:18 +08:00
|
|
|
InnerLoopUnroller Unroller(L, SE, LI, DT, TLI, TTI, UF);
|
2013-08-27 06:33:26 +08:00
|
|
|
Unroller.vectorize(&LVL);
|
|
|
|
} else {
|
|
|
|
// If we decided that it is *legal* to vectorize the loop then do it.
|
2015-03-18 03:17:18 +08:00
|
|
|
InnerLoopVectorizer LB(L, SE, LI, DT, TLI, TTI, VF.Width, UF);
|
2013-08-27 06:33:26 +08:00
|
|
|
LB.vectorize(&LVL);
|
2014-04-23 16:40:37 +08:00
|
|
|
++LoopsVectorized;
|
2014-04-30 04:06:10 +08:00
|
|
|
|
2015-03-09 14:14:18 +08:00
|
|
|
// Add metadata to disable runtime unrolling scalar loop when there's no
|
|
|
|
// runtime check about strides and memory. Because at this situation,
|
|
|
|
// scalar loop is rarely used not worthy to be unrolled.
|
|
|
|
if (!LB.IsSafetyChecksAdded())
|
|
|
|
AddRuntimeUnrollDisableMetaData(L);
|
|
|
|
|
2014-04-30 04:06:10 +08:00
|
|
|
// Report the vectorization decision.
|
2014-05-22 22:19:46 +08:00
|
|
|
emitOptimizationRemark(
|
|
|
|
F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
|
2014-04-30 04:06:10 +08:00
|
|
|
Twine("vectorized loop (vectorization factor: ") + Twine(VF.Width) +
|
|
|
|
", unrolling interleave factor: " + Twine(UF) + ")");
|
2013-08-27 06:33:26 +08:00
|
|
|
}
|
2012-10-18 02:25:06 +08:00
|
|
|
|
2013-05-29 04:00:34 +08:00
|
|
|
// Mark the loop as already vectorized to avoid vectorizing again.
|
Small refactor on VectorizerHint for deduplication
Previously, the hint mechanism relied on clean up passes to remove redundant
metadata, which still showed up if running opt at low levels of optimization.
That also has shown that multiple nodes of the same type, but with different
values could still coexist, even if temporary, and cause confusion if the
next pass got the wrong value.
This patch makes sure that, if metadata already exists in a loop, the hint
mechanism will never append a new node, but always replace the existing one.
It also enhances the algorithm to cope with more metadata types in the future
by just adding a new type, not a lot of code.
Re-applying again due to MSVC 2013 being minimum requirement, and this patch
having C++11 that MSVC 2012 didn't support.
Fixes PR20655.
llvm-svn: 216870
2014-09-01 18:00:17 +08:00
|
|
|
Hints.setAlreadyVectorized();
|
2013-05-29 04:00:34 +08:00
|
|
|
|
2012-10-18 13:29:12 +08:00
|
|
|
DEBUG(verifyFunction(*L->getHeader()->getParent()));
|
2012-10-18 02:25:06 +08:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2014-03-05 17:10:37 +08:00
|
|
|
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
2015-01-04 20:03:27 +08:00
|
|
|
AU.addRequired<AssumptionCacheTracker>();
|
2012-10-18 02:25:06 +08:00
|
|
|
AU.addRequiredID(LoopSimplifyID);
|
2012-10-20 07:05:40 +08:00
|
|
|
AU.addRequiredID(LCSSAID);
|
2014-01-27 21:11:50 +08:00
|
|
|
AU.addRequired<BlockFrequencyInfo>();
|
2014-01-13 21:07:17 +08:00
|
|
|
AU.addRequired<DominatorTreeWrapperPass>();
|
2015-01-17 22:16:18 +08:00
|
|
|
AU.addRequired<LoopInfoWrapperPass>();
|
2012-10-18 02:25:06 +08:00
|
|
|
AU.addRequired<ScalarEvolution>();
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason of course is that this is *much* more natural fit for
the new pass manager. This will lay the ground work for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
llvm-svn: 227669
2015-01-31 11:43:40 +08:00
|
|
|
AU.addRequired<TargetTransformInfoWrapperPass>();
|
[LoopVectorize] Use AA to partition potential dependency checks
Prior to this change, the loop vectorizer did not make use of the alias
analysis infrastructure. Instead, it performed memory dependence analysis using
ScalarEvolution-based linear dependence checks within equivalence classes
derived from the results of ValueTracking's GetUnderlyingObjects.
Unfortunately, this meant that:
1. The loop vectorizer had logic that essentially duplicated that in BasicAA
for aliasing based on identified objects.
2. The loop vectorizer could not partition the space of dependency checks
based on information only easily available from within AA (TBAA metadata is
currently the prime example).
This means, for example, regardless of whether -fno-strict-aliasing was
provided, the vectorizer would only vectorize this loop with a runtime
memory-overlap check:
void foo(int *a, float *b) {
for (int i = 0; i < 1600; ++i)
a[i] = b[i];
}
This is suboptimal because the TBAA metadata already provides the information
necessary to show that this check unnecessary. Of course, the vectorizer has a
limit on the number of such checks it will insert, so in practice, ignoring
TBAA means not vectorizing more-complicated loops that we should.
This change causes the vectorizer to use an AliasSetTracker to keep track of
the pointers in the loop. The resulting alias sets are then used to partition
the space of dependency checks, and potential runtime checks; this results in
more-efficient vectorizations.
When pointer locations are added to the AliasSetTracker, two things are done:
1. The location size is set to UnknownSize (otherwise you'd not catch
inter-iteration dependencies)
2. For instructions in blocks that would need to be predicated, TBAA is
removed (because the metadata might have a control dependency on the condition
being speculated).
For non-predicated blocks, you can leave the TBAA metadata. This is safe
because you can't have an iteration dependency on the TBAA metadata (if you
did, and you unrolled sufficiently, you'd end up with the same pointer value
used by two accesses that TBAA says should not alias, and that would yield
undefined behavior).
llvm-svn: 213486
2014-07-21 07:07:52 +08:00
|
|
|
AU.addRequired<AliasAnalysis>();
|
2015-02-20 03:15:04 +08:00
|
|
|
AU.addRequired<LoopAccessAnalysis>();
|
2015-01-17 22:16:18 +08:00
|
|
|
AU.addPreserved<LoopInfoWrapperPass>();
|
2014-01-13 21:07:17 +08:00
|
|
|
AU.addPreserved<DominatorTreeWrapperPass>();
|
[LoopVectorize] Use AA to partition potential dependency checks
Prior to this change, the loop vectorizer did not make use of the alias
analysis infrastructure. Instead, it performed memory dependence analysis using
ScalarEvolution-based linear dependence checks within equivalence classes
derived from the results of ValueTracking's GetUnderlyingObjects.
Unfortunately, this meant that:
1. The loop vectorizer had logic that essentially duplicated that in BasicAA
for aliasing based on identified objects.
2. The loop vectorizer could not partition the space of dependency checks
based on information only easily available from within AA (TBAA metadata is
currently the prime example).
This means, for example, regardless of whether -fno-strict-aliasing was
provided, the vectorizer would only vectorize this loop with a runtime
memory-overlap check:
void foo(int *a, float *b) {
for (int i = 0; i < 1600; ++i)
a[i] = b[i];
}
This is suboptimal because the TBAA metadata already provides the information
necessary to show that this check unnecessary. Of course, the vectorizer has a
limit on the number of such checks it will insert, so in practice, ignoring
TBAA means not vectorizing more-complicated loops that we should.
This change causes the vectorizer to use an AliasSetTracker to keep track of
the pointers in the loop. The resulting alias sets are then used to partition
the space of dependency checks, and potential runtime checks; this results in
more-efficient vectorizations.
When pointer locations are added to the AliasSetTracker, two things are done:
1. The location size is set to UnknownSize (otherwise you'd not catch
inter-iteration dependencies)
2. For instructions in blocks that would need to be predicated, TBAA is
removed (because the metadata might have a control dependency on the condition
being speculated).
For non-predicated blocks, you can leave the TBAA metadata. This is safe
because you can't have an iteration dependency on the TBAA metadata (if you
did, and you unrolled sufficiently, you'd end up with the same pointer value
used by two accesses that TBAA says should not alias, and that would yield
undefined behavior).
llvm-svn: 213486
2014-07-21 07:07:52 +08:00
|
|
|
AU.addPreserved<AliasAnalysis>();
|
2012-10-18 02:25:06 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
};
|
|
|
|
|
2013-01-07 18:44:06 +08:00
|
|
|
} // end anonymous namespace
|
2012-12-11 05:39:02 +08:00
|
|
|
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
|
|
|
|
// LoopVectorizationCostModel.
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
2012-12-04 05:33:08 +08:00
|
|
|
Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // A value defined by an instruction that lives in one of the newly created
  // vector body blocks must be splatted in place, even if the original loop
  // reports it as invariant.
  bool DefinedInVectorBody = false;
  if (Instruction *Def = dyn_cast<Instruction>(V))
    DefinedInVectorBody =
        std::find(LoopVectorBody.begin(), LoopVectorBody.end(),
                  Def->getParent()) != LoopVectorBody.end();

  // Only values invariant in the original loop and not produced by the
  // vector body can have their broadcast placed outside the loop.
  const bool HoistToPreheader =
      OrigLoop->isLoopInvariant(V) && !DefinedInVectorBody;

  // The guard restores the builder's insertion point on every exit path.
  IRBuilder<>::InsertPointGuard Guard(Builder);
  if (HoistToPreheader)
    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());

  // Splat the scalar into every lane of a VF-wide vector.
  return Builder.CreateVectorSplat(VF, V, "broadcast");
}
|
|
|
|
|
2015-01-30 13:02:21 +08:00
|
|
|
Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx,
                                          Value *Step) {
  Type *VecTy = Val->getType();
  assert(VecTy->isVectorTy() && "Must be a vector");
  Type *ElemTy = VecTy->getScalarType();
  assert(ElemTy->isIntegerTy() &&
         "Elem must be an integer");
  assert(Step->getType() == ElemTy &&
         "Step has wrong type");

  const int NumLanes = cast<VectorType>(VecTy)->getNumElements();

  // Build the constant vector <StartIdx, StartIdx + 1, ...>, one entry per
  // lane of Val.
  SmallVector<Constant *, 8> LaneIndices;
  for (int Lane = 0; Lane < NumLanes; ++Lane)
    LaneIndices.push_back(ConstantInt::get(ElemTy, StartIdx + Lane));
  Constant *IndexVec = ConstantVector::get(LaneIndices);
  assert(IndexVec->getType() == Val->getType() && "Invalid consecutive vec");

  // Splat the scalar step so it can be multiplied lane-wise.
  Value *StepSplat = Builder.CreateVectorSplat(NumLanes, Step);
  assert(StepSplat->getType() == Val->getType() && "Invalid step vec");

  // Result lane i is Val[i] + (StartIdx + i) * Step.
  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
  // which can be found from the original scalar operations.
  Value *ScaledIndices = Builder.CreateMul(IndexVec, StepSplat);
  return Builder.CreateAdd(Val, ScaledIndices, "induction");
}
|
|
|
|
|
2013-11-01 22:09:50 +08:00
|
|
|
/// \brief Find the operand of the GEP that should be checked for consecutive
|
|
|
|
/// stores. This ignores trailing indices that have no effect on the final
|
|
|
|
/// pointer.
|
2015-03-10 10:37:25 +08:00
|
|
|
static unsigned getGEPInductionOperand(const GetElementPtrInst *Gep) {
|
|
|
|
const DataLayout &DL = Gep->getModule()->getDataLayout();
|
2013-11-01 22:09:50 +08:00
|
|
|
unsigned LastOperand = Gep->getNumOperands() - 1;
|
2015-03-10 10:37:25 +08:00
|
|
|
unsigned GEPAllocSize = DL.getTypeAllocSize(
|
2013-11-01 22:09:50 +08:00
|
|
|
cast<PointerType>(Gep->getType()->getScalarType())->getElementType());
|
|
|
|
|
|
|
|
// Walk backwards and try to peel off zeros.
|
|
|
|
while (LastOperand > 1 && match(Gep->getOperand(LastOperand), m_Zero())) {
|
|
|
|
// Find the type we're currently indexing into.
|
|
|
|
gep_type_iterator GEPTI = gep_type_begin(Gep);
|
|
|
|
std::advance(GEPTI, LastOperand - 1);
|
|
|
|
|
|
|
|
// If it's a type with the same allocation size as the result of the GEP we
|
|
|
|
// can peel off the zero index.
|
2015-03-10 10:37:25 +08:00
|
|
|
if (DL.getTypeAllocSize(*GEPTI) != GEPAllocSize)
|
2013-11-01 22:09:50 +08:00
|
|
|
break;
|
|
|
|
--LastOperand;
|
|
|
|
}
|
|
|
|
|
|
|
|
return LastOperand;
|
|
|
|
}
|
|
|
|
|
2012-12-27 03:08:17 +08:00
|
|
|
/// Classify the access pattern of \p Ptr across loop iterations.
///
/// Returns 1 when the SCEV step of the relevant GEP index is one, -1 when it
/// is all-ones (i.e. minus one), the direction recorded for the induction
/// when \p Ptr is (or is a loop-invariant-indexed GEP of) a known pointer
/// induction, and 0 in every other case.
int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
  assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr");

  // Pointers to aggregates are never treated as consecutive.
  Type *PointeeTy = Ptr->getType()->getPointerElementType();
  if (PointeeTy->isAggregateType())
    return 0;

  // A registered pointer induction variable carries its own direction.
  PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr);
  if (Phi && Inductions.count(Phi)) {
    InductionInfo Info = Inductions[Phi];
    return Info.getConsecutiveDirection();
  }

  // Anything that is neither an induction PHI nor a GEP is not consecutive.
  GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr);
  if (!Gep)
    return 0;

  unsigned NumOperands = Gep->getNumOperands();
  Value *BasePtr = Gep->getPointerOperand();

  // Case 1: the GEP base is a pointer induction variable. The access follows
  // the induction's direction provided every index is loop invariant.
  Phi = dyn_cast<PHINode>(BasePtr);
  if (Phi && Inductions.count(Phi)) {
    // The base must not point to an aggregate either.
    PointerType *BasePtrTy = cast<PointerType>(BasePtr->getType());
    if (BasePtrTy->getElementType()->isAggregateType())
      return 0;

    // Every index operand (operands 1..N-1) has to be loop invariant.
    for (unsigned Idx = 1; Idx < NumOperands; ++Idx) {
      const SCEV *IdxSCEV = SE->getSCEV(Gep->getOperand(Idx));
      if (!SE->isLoopInvariant(IdxSCEV, TheLoop))
        return 0;
    }

    InductionInfo Info = Inductions[Phi];
    return Info.getConsecutiveDirection();
  }

  // Case 2: a GEP whose single varying operand is the induction operand.
  unsigned InductionOperand = getGEPInductionOperand(Gep);

  // All operands other than the induction operand must be loop invariant.
  for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
    if (Idx == InductionOperand)
      continue;
    if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(Idx)), TheLoop))
      return 0;
  }

  // We can emit wide load/stores only if the last non-zero index is the
  // induction variable.
  const SCEV *Last = nullptr;
  if (Strides.count(Gep)) {
    // Because of the multiplication by a stride we can have a s/zext cast.
    // We are going to replace this stride by 1 so the cast is safe to ignore.
    //
    //   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
    //   %0 = trunc i64 %indvars.iv to i32
    //   %mul = mul i32 %0, %Stride1
    //   %idxprom = zext i32 %mul to i64  << Safe cast.
    //   %arrayidx = getelementptr inbounds i32* %B, i64 %idxprom
    //
    Last = replaceSymbolicStrideSCEV(SE, Strides,
                                     Gep->getOperand(InductionOperand), Gep);
    if (const SCEVCastExpr *Cast = dyn_cast<SCEVCastExpr>(Last)) {
      // Look through sign/zero extensions only; other casts are kept.
      if (Cast->getSCEVType() == scSignExtend ||
          Cast->getSCEVType() == scZeroExtend)
        Last = Cast->getOperand();
    }
  } else {
    Last = SE->getSCEV(Gep->getOperand(InductionOperand));
  }

  // The memory is consecutive when the last index is an add-recurrence with
  // step +1 or -1 and all other indices are loop invariant.
  const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Last);
  if (!AddRec)
    return 0;

  const SCEV *Step = AddRec->getStepRecurrence(*SE);
  if (Step->isOne())
    return 1;
  if (Step->isAllOnesValue())
    return -1;

  return 0;
}
|
|
|
|
|
2015-02-20 03:14:34 +08:00
|
|
|
/// Forward the uniformity query for \p V to the loop access analysis (LAI).
bool LoopVectorizationLegality::isUniform(Value *V) {
  const bool Uniform = LAI->isUniform(V);
  return Uniform;
}
|
2015-02-02 00:56:04 +08:00
|
|
|
|
2013-01-03 08:52:27 +08:00
|
|
|
/// Return the widened parts recorded in WidenMap for the scalar \p V,
/// creating a broadcast entry when the map has none yet.
InnerLoopVectorizer::VectorParts&
InnerLoopVectorizer::getVectorValue(Value *V) {
  assert(V != Induction && "The new induction variable should not be used.");
  assert(!V->getType()->isVectorTy() && "Can't widen a vector");

  // A value that Legal reports as a stride is replaced by the constant one
  // of the same type before the lookup.
  Value *Scalar = V;
  if (Legal->hasStride(Scalar))
    Scalar = ConstantInt::get(Scalar->getType(), 1);

  // Unknown scalars are assumed to be constants or loop invariant: broadcast
  // them and remember the result for later uses.
  if (!WidenMap.has(Scalar)) {
    Value *Broadcast = getBroadcastInstrs(Scalar);
    return WidenMap.splat(Scalar, Broadcast);
  }

  // Already widened; hand back the cached parts.
  return WidenMap.get(Scalar);
}
|
|
|
|
|
2012-12-27 03:08:17 +08:00
|
|
|
/// Emit a shufflevector named "reverse" that reverses the first VF lanes of
/// \p Vec, using the mask <VF-1, VF-2, ..., 0>.
Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
  assert(Vec->getType()->isVectorTy() && "Invalid type");

  // Count down from VF so the mask entries come out as VF-1 ... 0.
  SmallVector<Constant*, 8> ReverseMask;
  for (unsigned Remaining = VF; Remaining != 0; --Remaining)
    ReverseMask.push_back(Builder.getInt32(Remaining - 1));

  Value *Undef = UndefValue::get(Vec->getType());
  Constant *MaskVec = ConstantVector::get(ReverseMask);
  return Builder.CreateShuffleVector(Vec, Undef, MaskVec, "reverse");
}
|
|
|
|
|
2014-01-11 02:20:32 +08:00
|
|
|
void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
|
2013-01-26 05:47:42 +08:00
|
|
|
// Attempt to issue a wide load.
|
|
|
|
LoadInst *LI = dyn_cast<LoadInst>(Instr);
|
|
|
|
StoreInst *SI = dyn_cast<StoreInst>(Instr);
|
|
|
|
|
|
|
|
assert((LI || SI) && "Invalid Load/Store instruction");
|
|
|
|
|
|
|
|
Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType();
|
|
|
|
Type *DataTy = VectorType::get(ScalarDataTy, VF);
|
|
|
|
Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
|
|
|
|
unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();
|
2013-11-16 07:09:33 +08:00
|
|
|
// An alignment of 0 means target abi alignment. We need to use the scalar's
|
|
|
|
// target abi alignment in such a case.
|
2015-03-10 10:37:25 +08:00
|
|
|
const DataLayout &DL = Instr->getModule()->getDataLayout();
|
2013-11-16 07:09:33 +08:00
|
|
|
if (!Alignment)
|
2015-03-10 10:37:25 +08:00
|
|
|
Alignment = DL.getABITypeAlignment(ScalarDataTy);
|
2013-06-18 02:49:06 +08:00
|
|
|
unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
|
2015-03-10 10:37:25 +08:00
|
|
|
unsigned ScalarAllocatedSize = DL.getTypeAllocSize(ScalarDataTy);
|
|
|
|
unsigned VectorElementSize = DL.getTypeStoreSize(DataTy) / VF;
|
2013-04-25 00:16:01 +08:00
|
|
|
|
2014-12-16 19:50:42 +08:00
|
|
|
if (SI && Legal->blockNeedsPredication(SI->getParent()) &&
|
|
|
|
!Legal->isMaskRequired(SI))
|
2014-01-28 09:01:53 +08:00
|
|
|
return scalarizeInstruction(Instr, true);
|
|
|
|
|
2013-04-25 00:16:01 +08:00
|
|
|
if (ScalarAllocatedSize != VectorElementSize)
|
|
|
|
return scalarizeInstruction(Instr);
|
|
|
|
|
2013-12-05 13:44:44 +08:00
|
|
|
// If the pointer is loop invariant or if it is non-consecutive,
|
2013-01-26 05:47:42 +08:00
|
|
|
// scalarize the load.
|
2013-04-25 00:16:03 +08:00
|
|
|
int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
|
|
|
|
bool Reverse = ConsecutiveStride < 0;
|
2013-01-26 05:47:42 +08:00
|
|
|
bool UniformLoad = LI && Legal->isUniform(Ptr);
|
2013-04-25 00:16:03 +08:00
|
|
|
if (!ConsecutiveStride || UniformLoad)
|
2013-01-26 05:47:42 +08:00
|
|
|
return scalarizeInstruction(Instr);
|
|
|
|
|
|
|
|
Constant *Zero = Builder.getInt32(0);
|
|
|
|
VectorParts &Entry = WidenMap.get(Instr);
|
|
|
|
|
|
|
|
// Handle consecutive loads/stores.
|
|
|
|
GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
|
|
|
|
if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) {
|
2013-06-29 00:26:54 +08:00
|
|
|
setDebugLocFromInst(Builder, Gep);
|
2013-01-26 05:47:42 +08:00
|
|
|
Value *PtrOperand = Gep->getPointerOperand();
|
|
|
|
Value *FirstBasePtr = getVectorValue(PtrOperand)[0];
|
|
|
|
FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero);
|
|
|
|
|
|
|
|
// Create the new GEP with the new induction variable.
|
|
|
|
GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
|
|
|
|
Gep2->setOperand(0, FirstBasePtr);
|
|
|
|
Gep2->setName("gep.indvar.base");
|
|
|
|
Ptr = Builder.Insert(Gep2);
|
|
|
|
} else if (Gep) {
|
2013-06-29 00:26:54 +08:00
|
|
|
setDebugLocFromInst(Builder, Gep);
|
2013-01-26 05:47:42 +08:00
|
|
|
assert(SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand()),
|
|
|
|
OrigLoop) && "Base ptr must be invariant");
|
|
|
|
|
|
|
|
// The last index does not have to be the induction. It can be
|
|
|
|
// consecutive and be a function of the index. For example A[I+1];
|
|
|
|
unsigned NumOperands = Gep->getNumOperands();
|
2015-03-10 10:37:25 +08:00
|
|
|
unsigned InductionOperand = getGEPInductionOperand(Gep);
|
2013-01-26 05:47:42 +08:00
|
|
|
// Create the new GEP with the new induction variable.
|
|
|
|
GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
|
2013-06-27 23:11:55 +08:00
|
|
|
|
|
|
|
for (unsigned i = 0; i < NumOperands; ++i) {
|
|
|
|
Value *GepOperand = Gep->getOperand(i);
|
|
|
|
Instruction *GepOperandInst = dyn_cast<Instruction>(GepOperand);
|
|
|
|
|
|
|
|
// Update last index or loop invariant instruction anchored in loop.
|
2013-11-01 22:09:50 +08:00
|
|
|
if (i == InductionOperand ||
|
2013-06-27 23:11:55 +08:00
|
|
|
(GepOperandInst && OrigLoop->contains(GepOperandInst))) {
|
2013-11-01 22:09:50 +08:00
|
|
|
assert((i == InductionOperand ||
|
2013-06-27 23:11:55 +08:00
|
|
|
SE->isLoopInvariant(SE->getSCEV(GepOperandInst), OrigLoop)) &&
|
|
|
|
"Must be last index or loop invariant");
|
|
|
|
|
|
|
|
VectorParts &GEPParts = getVectorValue(GepOperand);
|
|
|
|
Value *Index = GEPParts[0];
|
|
|
|
Index = Builder.CreateExtractElement(Index, Zero);
|
|
|
|
Gep2->setOperand(i, Index);
|
|
|
|
Gep2->setName("gep.indvar.idx");
|
|
|
|
}
|
|
|
|
}
|
2013-01-26 05:47:42 +08:00
|
|
|
Ptr = Builder.Insert(Gep2);
|
|
|
|
} else {
|
|
|
|
// Use the induction element ptr.
|
|
|
|
assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
|
2013-06-29 01:14:48 +08:00
|
|
|
setDebugLocFromInst(Builder, Ptr);
|
2013-01-26 05:47:42 +08:00
|
|
|
VectorParts &PtrVal = getVectorValue(Ptr);
|
|
|
|
Ptr = Builder.CreateExtractElement(PtrVal[0], Zero);
|
|
|
|
}
|
|
|
|
|
2014-12-30 22:28:14 +08:00
|
|
|
VectorParts Mask = createBlockInMask(Instr->getParent());
|
2013-01-26 05:47:42 +08:00
|
|
|
// Handle Stores:
|
|
|
|
if (SI) {
|
|
|
|
assert(!Legal->isUniform(SI->getPointerOperand()) &&
|
|
|
|
"We do not allow storing to uniform addresses");
|
2013-06-29 00:26:54 +08:00
|
|
|
setDebugLocFromInst(Builder, SI);
|
2013-06-27 08:45:41 +08:00
|
|
|
// We don't want to update the value in the map as it might be used in
|
|
|
|
// another expression. So don't use a reference type for "StoredVal".
|
|
|
|
VectorParts StoredVal = getVectorValue(SI->getValueOperand());
|
2014-12-30 22:28:14 +08:00
|
|
|
|
2013-01-26 05:47:42 +08:00
|
|
|
for (unsigned Part = 0; Part < UF; ++Part) {
|
|
|
|
// Calculate the pointer for the specific unroll-part.
|
2015-04-04 03:41:44 +08:00
|
|
|
Value *PartPtr =
|
|
|
|
Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));
|
2013-01-26 05:47:42 +08:00
|
|
|
|
|
|
|
if (Reverse) {
|
|
|
|
// If we store to reverse consecutive memory locations then we need
|
|
|
|
// to reverse the order of elements in the stored value.
|
|
|
|
StoredVal[Part] = reverseVector(StoredVal[Part]);
|
|
|
|
// If the address is consecutive but reversed, then the
|
|
|
|
// wide store needs to start at the last vector element.
|
2015-04-04 03:41:44 +08:00
|
|
|
PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
|
|
|
|
PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
|
2015-01-22 16:20:06 +08:00
|
|
|
Mask[Part] = reverseVector(Mask[Part]);
|
2013-01-26 05:47:42 +08:00
|
|
|
}
|
|
|
|
|
2013-08-01 06:17:45 +08:00
|
|
|
Value *VecPtr = Builder.CreateBitCast(PartPtr,
|
|
|
|
DataTy->getPointerTo(AddressSpace));
|
2014-12-16 19:50:42 +08:00
|
|
|
|
|
|
|
Instruction *NewSI;
|
2014-12-30 22:28:14 +08:00
|
|
|
if (Legal->isMaskRequired(SI))
|
|
|
|
NewSI = Builder.CreateMaskedStore(StoredVal[Part], VecPtr, Alignment,
|
|
|
|
Mask[Part]);
|
2014-12-16 19:50:42 +08:00
|
|
|
else
|
|
|
|
NewSI = Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment);
|
2014-07-19 21:33:16 +08:00
|
|
|
propagateMetadata(NewSI, SI);
|
2013-01-26 05:47:42 +08:00
|
|
|
}
|
2013-06-27 08:45:41 +08:00
|
|
|
return;
|
2013-01-26 05:47:42 +08:00
|
|
|
}
|
|
|
|
|
2013-06-28 08:38:54 +08:00
|
|
|
// Handle loads.
|
|
|
|
assert(LI && "Must have a load instruction");
|
2013-06-29 00:26:54 +08:00
|
|
|
setDebugLocFromInst(Builder, LI);
|
2013-01-26 05:47:42 +08:00
|
|
|
for (unsigned Part = 0; Part < UF; ++Part) {
|
|
|
|
// Calculate the pointer for the specific unroll-part.
|
2015-04-04 03:41:44 +08:00
|
|
|
Value *PartPtr =
|
|
|
|
Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));
|
2013-01-26 05:47:42 +08:00
|
|
|
|
|
|
|
if (Reverse) {
|
|
|
|
// If the address is consecutive but reversed, then the
|
2014-12-14 17:43:50 +08:00
|
|
|
// wide load needs to start at the last vector element.
|
2015-04-04 03:41:44 +08:00
|
|
|
PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
|
|
|
|
PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
|
2015-01-22 16:20:06 +08:00
|
|
|
Mask[Part] = reverseVector(Mask[Part]);
|
2013-01-26 05:47:42 +08:00
|
|
|
}
|
|
|
|
|
2014-12-16 19:50:42 +08:00
|
|
|
Instruction* NewLI;
|
2014-12-25 15:49:20 +08:00
|
|
|
Value *VecPtr = Builder.CreateBitCast(PartPtr,
|
|
|
|
DataTy->getPointerTo(AddressSpace));
|
2014-12-30 22:28:14 +08:00
|
|
|
if (Legal->isMaskRequired(LI))
|
|
|
|
NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
|
|
|
|
UndefValue::get(DataTy),
|
|
|
|
"wide.masked.load");
|
|
|
|
else
|
2014-12-16 19:50:42 +08:00
|
|
|
NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");
|
2014-07-19 21:33:16 +08:00
|
|
|
propagateMetadata(NewLI, LI);
|
|
|
|
Entry[Part] = Reverse ? reverseVector(NewLI) : NewLI;
|
2013-01-26 05:47:42 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-01-28 09:01:53 +08:00
|
|
|
void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredicateStore) {
|
2012-10-18 02:25:06 +08:00
|
|
|
assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
|
|
|
|
// Holds vector parameters or scalars, in case of uniform vals.
|
2013-01-03 08:52:27 +08:00
|
|
|
SmallVector<VectorParts, 4> Params;
|
2012-10-18 02:25:06 +08:00
|
|
|
|
2013-06-29 00:26:54 +08:00
|
|
|
setDebugLocFromInst(Builder, Instr);
|
2013-06-28 08:38:54 +08:00
|
|
|
|
2012-10-18 02:25:06 +08:00
|
|
|
// Find all of the vectorized parameters.
|
|
|
|
for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
|
|
|
|
Value *SrcOp = Instr->getOperand(op);
|
|
|
|
|
|
|
|
// If we are accessing the old induction variable, use the new one.
|
|
|
|
if (SrcOp == OldInduction) {
|
2012-11-30 03:25:41 +08:00
|
|
|
Params.push_back(getVectorValue(SrcOp));
|
2012-10-18 02:25:06 +08:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Try using previously calculated values.
|
|
|
|
Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);
|
|
|
|
|
|
|
|
// If the src is an instruction that appeared earlier in the basic block
|
2012-10-18 13:29:12 +08:00
|
|
|
// then it should already be vectorized.
|
2012-12-21 04:24:40 +08:00
|
|
|
if (SrcInst && OrigLoop->contains(SrcInst)) {
|
2013-01-03 08:52:27 +08:00
|
|
|
assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
|
2012-10-18 02:25:06 +08:00
|
|
|
// The parameter is a vector value from earlier.
|
2013-01-03 08:52:27 +08:00
|
|
|
Params.push_back(WidenMap.get(SrcInst));
|
2012-10-18 02:25:06 +08:00
|
|
|
} else {
|
|
|
|
// The parameter is a scalar from outside the loop. Maybe even a constant.
|
2013-01-03 08:52:27 +08:00
|
|
|
VectorParts Scalars;
|
|
|
|
Scalars.append(UF, SrcOp);
|
|
|
|
Params.push_back(Scalars);
|
2012-10-18 02:25:06 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
assert(Params.size() == Instr->getNumOperands() &&
|
|
|
|
"Invalid number of operands");
|
|
|
|
|
|
|
|
// Does this instruction return a value ?
|
|
|
|
bool IsVoidRetTy = Instr->getType()->isVoidTy();
|
|
|
|
|
2014-04-25 13:29:35 +08:00
|
|
|
Value *UndefVec = IsVoidRetTy ? nullptr :
|
2013-01-03 08:52:27 +08:00
|
|
|
UndefValue::get(VectorType::get(Instr->getType(), VF));
|
|
|
|
// Create a new entry in the WidenMap and initialize it to Undef or Null.
|
|
|
|
VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
|
2012-10-18 02:25:06 +08:00
|
|
|
|
2014-01-28 09:01:53 +08:00
|
|
|
Instruction *InsertPt = Builder.GetInsertPoint();
|
|
|
|
BasicBlock *IfBlock = Builder.GetInsertBlock();
|
2014-04-25 13:29:35 +08:00
|
|
|
BasicBlock *CondBlock = nullptr;
|
2014-01-28 09:01:53 +08:00
|
|
|
|
|
|
|
VectorParts Cond;
|
2014-04-25 13:29:35 +08:00
|
|
|
Loop *VectorLp = nullptr;
|
2014-01-28 09:01:53 +08:00
|
|
|
if (IfPredicateStore) {
|
|
|
|
assert(Instr->getParent()->getSinglePredecessor() &&
|
|
|
|
"Only support single predecessor blocks");
|
|
|
|
Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
|
|
|
|
Instr->getParent());
|
|
|
|
VectorLp = LI->getLoopFor(IfBlock);
|
|
|
|
assert(VectorLp && "Must have a loop for this block");
|
|
|
|
}
|
|
|
|
|
2013-04-24 01:12:42 +08:00
|
|
|
// For each vector unroll 'part':
|
|
|
|
for (unsigned Part = 0; Part < UF; ++Part) {
|
|
|
|
// For each scalar that we create:
|
|
|
|
for (unsigned Width = 0; Width < VF; ++Width) {
|
2014-01-28 09:01:53 +08:00
|
|
|
|
|
|
|
// Start if-block.
|
2014-04-25 13:29:35 +08:00
|
|
|
Value *Cmp = nullptr;
|
2014-01-28 09:01:53 +08:00
|
|
|
if (IfPredicateStore) {
|
|
|
|
Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width));
|
|
|
|
Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, ConstantInt::get(Cmp->getType(), 1));
|
|
|
|
CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
|
2014-02-09 04:41:13 +08:00
|
|
|
LoopVectorBody.push_back(CondBlock);
|
2015-01-18 09:25:51 +08:00
|
|
|
VectorLp->addBasicBlockToLoop(CondBlock, *LI);
|
2014-01-28 09:01:53 +08:00
|
|
|
// Update Builder with newly created basic block.
|
|
|
|
Builder.SetInsertPoint(InsertPt);
|
|
|
|
}
|
|
|
|
|
2013-01-03 08:52:27 +08:00
|
|
|
Instruction *Cloned = Instr->clone();
|
|
|
|
if (!IsVoidRetTy)
|
|
|
|
Cloned->setName(Instr->getName() + ".cloned");
|
2013-09-28 21:42:22 +08:00
|
|
|
// Replace the operands of the cloned instructions with extracted scalars.
|
2013-01-03 08:52:27 +08:00
|
|
|
for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
|
|
|
|
Value *Op = Params[op][Part];
|
|
|
|
// Param is a vector. Need to extract the right lane.
|
|
|
|
if (Op->getType()->isVectorTy())
|
|
|
|
Op = Builder.CreateExtractElement(Op, Builder.getInt32(Width));
|
|
|
|
Cloned->setOperand(op, Op);
|
|
|
|
}
|
2012-10-18 02:25:06 +08:00
|
|
|
|
2013-01-03 08:52:27 +08:00
|
|
|
// Place the cloned scalar in the new loop.
|
|
|
|
Builder.Insert(Cloned);
|
2012-10-18 02:25:06 +08:00
|
|
|
|
2013-01-03 08:52:27 +08:00
|
|
|
// If the original scalar returns a value we need to place it in a vector
|
|
|
|
// so that future users will be able to use it.
|
|
|
|
if (!IsVoidRetTy)
|
|
|
|
VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned,
|
|
|
|
Builder.getInt32(Width));
|
2014-01-28 09:01:53 +08:00
|
|
|
// End if-block.
|
|
|
|
if (IfPredicateStore) {
|
|
|
|
BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
|
2014-02-09 04:41:13 +08:00
|
|
|
LoopVectorBody.push_back(NewIfBlock);
|
2015-01-18 09:25:51 +08:00
|
|
|
VectorLp->addBasicBlockToLoop(NewIfBlock, *LI);
|
2014-01-28 09:01:53 +08:00
|
|
|
Builder.SetInsertPoint(InsertPt);
|
|
|
|
Instruction *OldBr = IfBlock->getTerminator();
|
|
|
|
BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
|
|
|
|
OldBr->eraseFromParent();
|
|
|
|
IfBlock = NewIfBlock;
|
|
|
|
}
|
2013-01-03 08:52:27 +08:00
|
|
|
}
|
2012-10-18 02:25:06 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-01-11 02:20:32 +08:00
|
|
|
/// Track the first instruction created by a sequence of checks.
///
/// Returns \p FirstInst unchanged once it is set. Otherwise returns \p V if
/// it is an instruction located in the same basic block as \p Loc, and null
/// in all other cases (e.g. when \p V is not an instruction).
static Instruction *getFirstInst(Instruction *FirstInst, Value *V,
                                 Instruction *Loc) {
  if (FirstInst)
    return FirstInst;

  Instruction *Candidate = dyn_cast<Instruction>(V);
  if (!Candidate || Candidate->getParent() != Loc->getParent())
    return nullptr;
  return Candidate;
}
|
|
|
|
|
|
|
|
/// Emit, before \p Loc, a check that is true when any symbolic stride
/// recorded by Legal differs from one.
///
/// Returns the pair (first instruction created in Loc's block, final check
/// instruction), or a pair of nulls when Legal reports that no stride check
/// is needed.
std::pair<Instruction *, Instruction *>
InnerLoopVectorizer::addStrideCheck(Instruction *Loc) {
  if (!Legal->mustCheckStrides()) {
    // Typed null so that make_pair deduces Instruction* for both members.
    Instruction *const NoInst = nullptr;
    return std::make_pair(NoInst, NoInst);
  }

  IRBuilder<> ChkBuilder(Loc);

  // Emit one "stride != 1" comparison per stride and OR them together.
  Value *AnyStrideNotOne = nullptr;
  Instruction *FirstInst = nullptr;
  SmallPtrSet<Value *, 8>::iterator StrideIt = Legal->strides_begin();
  SmallPtrSet<Value *, 8>::iterator StrideEnd = Legal->strides_end();
  for (; StrideIt != StrideEnd; ++StrideIt) {
    Value *Stride = stripIntegerCast(*StrideIt);
    Value *NotOne = ChkBuilder.CreateICmpNE(
        Stride, ConstantInt::get(Stride->getType(), 1), "stride.chk");
    // Store the first instruction we create.
    FirstInst = getFirstInst(FirstInst, NotOne, Loc);
    AnyStrideNotOne =
        AnyStrideNotOne ? ChkBuilder.CreateOr(AnyStrideNotOne, NotOne) : NotOne;
  }

  // We have to do this trickery because the IRBuilder might fold the check to
  // a constant expression, in which case there is no Instruction anchored in
  // the block.
  LLVMContext &Ctx = Loc->getContext();
  Instruction *TheCheck =
      BinaryOperator::CreateAnd(AnyStrideNotOne, ConstantInt::getTrue(Ctx));
  ChkBuilder.Insert(TheCheck, "stride.not.one");
  FirstInst = getFirstInst(FirstInst, TheCheck, Loc);

  return std::make_pair(FirstInst, TheCheck);
}
|
|
|
|
|
|
|
|
void InnerLoopVectorizer::createEmptyLoop() {
|
2012-10-18 13:29:12 +08:00
|
|
|
/*
|
|
|
|
In this function we generate a new loop. The new loop will contain
|
|
|
|
the vectorized instructions while the old loop will continue to run the
|
|
|
|
scalar remainder.
|
|
|
|
|
2014-05-30 06:10:01 +08:00
|
|
|
[ ] <-- Back-edge taken count overflow check.
|
|
|
|
/ |
|
|
|
|
/ v
|
|
|
|
| [ ] <-- vector loop bypass (may consist of multiple blocks).
|
|
|
|
| / |
|
|
|
|
| / v
|
|
|
|
|| [ ] <-- vector pre header.
|
|
|
|
|| |
|
|
|
|
|| v
|
|
|
|
|| [ ] \
|
|
|
|
|| [ ]_| <-- vector loop.
|
|
|
|
|| |
|
|
|
|
| \ v
|
|
|
|
| >[ ] <--- middle-block.
|
|
|
|
| / |
|
|
|
|
| / v
|
|
|
|
-|- >[ ] <--- new preheader.
|
2012-12-11 05:39:02 +08:00
|
|
|
| |
|
|
|
|
| v
|
|
|
|
| [ ] \
|
|
|
|
| [ ]_| <-- old scalar loop to handle remainder.
|
2012-12-12 09:33:47 +08:00
|
|
|
\ |
|
|
|
|
\ v
|
|
|
|
>[ ] <-- exit block.
|
2012-10-18 13:29:12 +08:00
|
|
|
...
|
|
|
|
*/
|
|
|
|
|
2012-11-27 03:51:46 +08:00
|
|
|
BasicBlock *OldBasicBlock = OrigLoop->getHeader();
|
|
|
|
BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
|
|
|
|
BasicBlock *ExitBlock = OrigLoop->getExitBlock();
|
2014-05-30 06:10:01 +08:00
|
|
|
assert(BypassBlock && "Invalid loop structure");
|
2012-11-27 03:51:46 +08:00
|
|
|
assert(ExitBlock && "Must have an exit block");
|
|
|
|
|
2012-11-25 16:41:35 +08:00
|
|
|
// Some loops have a single integer induction variable, while other loops
|
|
|
|
// don't. One example is c++ iterators that often have multiple pointer
|
|
|
|
// induction variables. In the code below we also support a case where we
|
|
|
|
// don't have a single induction variable.
|
2012-11-09 15:09:44 +08:00
|
|
|
OldInduction = Legal->getInduction();
|
2013-05-12 07:04:28 +08:00
|
|
|
Type *IdxTy = Legal->getWidestInductionType();
|
2012-11-09 15:09:44 +08:00
|
|
|
|
|
|
|
// Find the loop boundaries.
|
2013-06-01 05:48:56 +08:00
|
|
|
const SCEV *ExitCount = SE->getBackedgeTakenCount(OrigLoop);
|
2012-11-09 15:09:44 +08:00
|
|
|
assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
|
|
|
|
|
2013-11-27 06:11:23 +08:00
|
|
|
// The exit count might have the type of i64 while the phi is i32. This can
|
|
|
|
// happen if we have an induction variable that is sign extended before the
|
|
|
|
// compare. The only way that we get a backedge taken count is that the
|
|
|
|
// induction variable was signed and as such will not overflow. In such a case
|
|
|
|
// truncation is legal.
|
|
|
|
if (ExitCount->getType()->getPrimitiveSizeInBits() >
|
|
|
|
IdxTy->getPrimitiveSizeInBits())
|
|
|
|
ExitCount = SE->getTruncateOrNoop(ExitCount, IdxTy);
|
|
|
|
|
2014-05-30 06:10:01 +08:00
|
|
|
const SCEV *BackedgeTakeCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy);
|
2012-11-09 15:09:44 +08:00
|
|
|
// Get the total trip count from the count by adding 1.
|
2014-05-30 06:10:01 +08:00
|
|
|
ExitCount = SE->getAddExpr(BackedgeTakeCount,
|
|
|
|
SE->getConstant(BackedgeTakeCount->getType(), 1));
|
2012-11-09 15:09:44 +08:00
|
|
|
|
2015-03-10 10:37:25 +08:00
|
|
|
const DataLayout &DL = OldBasicBlock->getModule()->getDataLayout();
|
|
|
|
|
2012-11-27 03:51:46 +08:00
|
|
|
// Expand the trip count and place the new instructions in the preheader.
|
|
|
|
// Notice that the pre-header does not change, only the loop body.
|
2015-03-10 10:37:25 +08:00
|
|
|
SCEVExpander Exp(*SE, DL, "induction");
|
2012-11-27 03:51:46 +08:00
|
|
|
|
2014-05-30 06:10:01 +08:00
|
|
|
// We need to test whether the backedge-taken count is uint##_max. Adding one
|
|
|
|
// to it will cause overflow and an incorrect loop trip count in the vector
|
|
|
|
// body. In case of overflow we want to directly jump to the scalar remainder
|
|
|
|
// loop.
|
|
|
|
Value *BackedgeCount =
|
|
|
|
Exp.expandCodeFor(BackedgeTakeCount, BackedgeTakeCount->getType(),
|
|
|
|
BypassBlock->getTerminator());
|
|
|
|
if (BackedgeCount->getType()->isPointerTy())
|
|
|
|
BackedgeCount = CastInst::CreatePointerCast(BackedgeCount, IdxTy,
|
|
|
|
"backedge.ptrcnt.to.int",
|
|
|
|
BypassBlock->getTerminator());
|
|
|
|
Instruction *CheckBCOverflow =
|
|
|
|
CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, BackedgeCount,
|
|
|
|
Constant::getAllOnesValue(BackedgeCount->getType()),
|
|
|
|
"backedge.overflow", BypassBlock->getTerminator());
|
|
|
|
|
2012-11-25 16:41:35 +08:00
|
|
|
// The loop index does not have to start at Zero. Find the original start
|
|
|
|
// value from the induction PHI node. If we don't have an induction variable
|
|
|
|
// then we know that it starts at zero.
|
2013-05-12 07:04:28 +08:00
|
|
|
Builder.SetInsertPoint(BypassBlock->getTerminator());
|
|
|
|
Value *StartIdx = ExtendedIdx = OldInduction ?
|
|
|
|
Builder.CreateZExt(OldInduction->getIncomingValueForBlock(BypassBlock),
|
|
|
|
IdxTy):
|
|
|
|
ConstantInt::get(IdxTy, 0);
|
2012-10-31 08:45:26 +08:00
|
|
|
|
2014-06-22 11:38:59 +08:00
|
|
|
// We need an instruction to anchor the overflow check on. StartIdx needs to
|
|
|
|
// be defined before the overflow check branch. Because the scalar preheader
|
|
|
|
// is going to merge the start index and so the overflow branch block needs to
|
|
|
|
// contain a definition of the start index.
|
|
|
|
Instruction *OverflowCheckAnchor = BinaryOperator::CreateAdd(
|
|
|
|
StartIdx, ConstantInt::get(IdxTy, 0), "overflow.check.anchor",
|
|
|
|
BypassBlock->getTerminator());
|
|
|
|
|
|
|
|
// Count holds the overall loop count (N).
|
|
|
|
Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
|
|
|
|
BypassBlock->getTerminator());
|
|
|
|
|
2013-01-19 21:57:58 +08:00
|
|
|
LoopBypassBlocks.push_back(BypassBlock);
|
2012-11-27 03:51:46 +08:00
|
|
|
|
|
|
|
// Split the single block loop into the two loop structure described above.
|
2012-10-18 13:29:12 +08:00
|
|
|
BasicBlock *VectorPH =
|
2012-12-11 05:39:02 +08:00
|
|
|
BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
|
2012-11-27 03:51:46 +08:00
|
|
|
BasicBlock *VecBody =
|
2012-12-11 05:39:02 +08:00
|
|
|
VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
|
2012-11-27 03:51:46 +08:00
|
|
|
BasicBlock *MiddleBlock =
|
2012-12-11 05:39:02 +08:00
|
|
|
VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
|
2012-10-18 13:29:12 +08:00
|
|
|
BasicBlock *ScalarPH =
|
2012-12-11 05:39:02 +08:00
|
|
|
MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
|
2012-11-27 03:51:46 +08:00
|
|
|
|
2013-07-13 14:20:06 +08:00
|
|
|
// Create and register the new vector loop.
|
|
|
|
Loop* Lp = new Loop();
|
|
|
|
Loop *ParentLoop = OrigLoop->getParentLoop();
|
|
|
|
|
|
|
|
// Insert the new loop into the loop nest and register the new basic blocks
|
|
|
|
// before calling any utilities such as SCEV that require valid LoopInfo.
|
|
|
|
if (ParentLoop) {
|
|
|
|
ParentLoop->addChildLoop(Lp);
|
2015-01-18 09:25:51 +08:00
|
|
|
ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
|
|
|
|
ParentLoop->addBasicBlockToLoop(VectorPH, *LI);
|
|
|
|
ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
|
2013-07-13 14:20:06 +08:00
|
|
|
} else {
|
|
|
|
LI->addTopLevelLoop(Lp);
|
|
|
|
}
|
2015-01-18 09:25:51 +08:00
|
|
|
Lp->addBasicBlockToLoop(VecBody, *LI);
|
2013-07-13 14:20:06 +08:00
|
|
|
|
2012-10-18 02:25:06 +08:00
|
|
|
// Use this IR builder to create the loop instructions (Phi, Br, Cmp)
|
|
|
|
// inside the loop.
|
2013-09-27 23:30:25 +08:00
|
|
|
Builder.SetInsertPoint(VecBody->getFirstNonPHI());
|
2012-10-18 02:25:06 +08:00
|
|
|
|
|
|
|
// Generate the induction variable.
|
2013-06-29 00:26:54 +08:00
|
|
|
setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction));
|
2012-10-20 07:27:19 +08:00
|
|
|
Induction = Builder.CreatePHI(IdxTy, 2, "index");
|
2013-01-03 08:52:27 +08:00
|
|
|
// The loop step is equal to the vectorization factor (num of SIMD elements)
|
|
|
|
// times the unroll factor (num of SIMD instructions).
|
|
|
|
Constant *Step = ConstantInt::get(IdxTy, VF * UF);
|
2012-10-18 02:25:06 +08:00
|
|
|
|
2013-01-26 03:26:23 +08:00
|
|
|
// This is the IR builder that we use to add all of the logic for bypassing
|
|
|
|
// the new vector loop.
|
2013-01-26 03:43:15 +08:00
|
|
|
IRBuilder<> BypassBuilder(BypassBlock->getTerminator());
|
2013-06-29 00:26:54 +08:00
|
|
|
setDebugLocFromInst(BypassBuilder,
|
|
|
|
getDebugLocFromInstOrOperands(OldInduction));
|
2013-01-26 03:26:23 +08:00
|
|
|
|
2012-11-25 16:41:35 +08:00
|
|
|
// We may need to extend the index in case there is a type mismatch.
|
|
|
|
// We know that the count starts at zero and does not overflow.
|
|
|
|
if (Count->getType() != IdxTy) {
|
|
|
|
// The exit count can be of pointer type. Convert it to the correct
|
|
|
|
// integer type.
|
|
|
|
if (ExitCount->getType()->isPointerTy())
|
2013-01-26 03:26:23 +08:00
|
|
|
Count = BypassBuilder.CreatePointerCast(Count, IdxTy, "ptrcnt.to.int");
|
2012-11-25 16:41:35 +08:00
|
|
|
else
|
2013-01-26 03:26:23 +08:00
|
|
|
Count = BypassBuilder.CreateZExtOrTrunc(Count, IdxTy, "cnt.cast");
|
2012-11-25 16:41:35 +08:00
|
|
|
}
|
2012-10-31 08:45:26 +08:00
|
|
|
|
|
|
|
// Add the start index to the loop count to get the new end index.
|
2013-01-26 03:26:23 +08:00
|
|
|
Value *IdxEnd = BypassBuilder.CreateAdd(Count, StartIdx, "end.idx");
|
2012-10-31 08:45:26 +08:00
|
|
|
|
2012-10-18 13:29:12 +08:00
|
|
|
// Now we need to generate the expression for N - (N % VF), which is
|
|
|
|
// the part that the vectorized body will execute.
|
2013-01-26 03:26:23 +08:00
|
|
|
Value *R = BypassBuilder.CreateURem(Count, Step, "n.mod.vf");
|
|
|
|
Value *CountRoundDown = BypassBuilder.CreateSub(Count, R, "n.vec");
|
|
|
|
Value *IdxEndRoundDown = BypassBuilder.CreateAdd(CountRoundDown, StartIdx,
|
|
|
|
"end.idx.rnd.down");
|
2012-10-18 13:29:12 +08:00
|
|
|
|
2012-11-25 16:41:35 +08:00
|
|
|
// Now, compare the new count to zero. If it is zero skip the vector loop and
|
|
|
|
// jump to the scalar loop.
|
2014-06-22 11:38:59 +08:00
|
|
|
Value *Cmp =
|
|
|
|
BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx, "cmp.zero");
|
2013-01-26 03:26:23 +08:00
|
|
|
|
|
|
|
BasicBlock *LastBypassBlock = BypassBlock;
|
2012-11-09 15:09:44 +08:00
|
|
|
|
2014-05-30 06:10:01 +08:00
|
|
|
// Generate code to check that the loop's trip count that we computed by adding
|
|
|
|
// one to the backedge-taken count will not overflow.
|
|
|
|
{
|
2014-06-22 11:38:59 +08:00
|
|
|
auto PastOverflowCheck =
|
|
|
|
std::next(BasicBlock::iterator(OverflowCheckAnchor));
|
2014-05-30 06:10:01 +08:00
|
|
|
BasicBlock *CheckBlock =
|
2014-06-22 11:38:59 +08:00
|
|
|
LastBypassBlock->splitBasicBlock(PastOverflowCheck, "overflow.checked");
|
2014-05-30 06:10:01 +08:00
|
|
|
if (ParentLoop)
|
2015-01-18 09:25:51 +08:00
|
|
|
ParentLoop->addBasicBlockToLoop(CheckBlock, *LI);
|
2014-05-30 06:10:01 +08:00
|
|
|
LoopBypassBlocks.push_back(CheckBlock);
|
|
|
|
Instruction *OldTerm = LastBypassBlock->getTerminator();
|
|
|
|
BranchInst::Create(ScalarPH, CheckBlock, CheckBCOverflow, OldTerm);
|
|
|
|
OldTerm->eraseFromParent();
|
|
|
|
LastBypassBlock = CheckBlock;
|
|
|
|
}
|
|
|
|
|
2014-01-11 02:20:32 +08:00
|
|
|
// Generate the code to check that the strides we assumed to be one are really
|
|
|
|
// one. We want the new basic block to start at the first instruction in a
|
|
|
|
// sequence of instructions that form a check.
|
|
|
|
Instruction *StrideCheck;
|
|
|
|
Instruction *FirstCheckInst;
|
2014-03-02 21:30:33 +08:00
|
|
|
std::tie(FirstCheckInst, StrideCheck) =
|
2014-05-30 06:10:01 +08:00
|
|
|
addStrideCheck(LastBypassBlock->getTerminator());
|
2014-01-11 02:20:32 +08:00
|
|
|
if (StrideCheck) {
|
2015-03-09 14:14:18 +08:00
|
|
|
AddedSafetyChecks = true;
|
2014-01-11 02:20:32 +08:00
|
|
|
// Create a new block containing the stride check.
|
|
|
|
BasicBlock *CheckBlock =
|
2014-05-30 06:10:01 +08:00
|
|
|
LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.stridecheck");
|
2014-01-11 02:20:32 +08:00
|
|
|
if (ParentLoop)
|
2015-01-18 09:25:51 +08:00
|
|
|
ParentLoop->addBasicBlockToLoop(CheckBlock, *LI);
|
2014-01-11 02:20:32 +08:00
|
|
|
LoopBypassBlocks.push_back(CheckBlock);
|
|
|
|
|
|
|
|
// Replace the branch into the memory check block with a conditional branch
|
|
|
|
// for the "few elements case".
|
2014-05-30 06:10:01 +08:00
|
|
|
Instruction *OldTerm = LastBypassBlock->getTerminator();
|
2014-01-11 02:20:32 +08:00
|
|
|
BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm);
|
|
|
|
OldTerm->eraseFromParent();
|
|
|
|
|
|
|
|
Cmp = StrideCheck;
|
|
|
|
LastBypassBlock = CheckBlock;
|
|
|
|
}
|
|
|
|
|
2013-01-19 21:57:58 +08:00
|
|
|
// Generate the code that checks in runtime if arrays overlap. We put the
|
|
|
|
// checks into a separate block to make the more common case of few elements
|
|
|
|
// faster.
|
2014-01-11 02:20:32 +08:00
|
|
|
Instruction *MemRuntimeCheck;
|
2014-03-02 21:30:33 +08:00
|
|
|
std::tie(FirstCheckInst, MemRuntimeCheck) =
|
2015-02-18 11:42:35 +08:00
|
|
|
Legal->getLAI()->addRuntimeCheck(LastBypassBlock->getTerminator());
|
2013-01-26 03:26:23 +08:00
|
|
|
if (MemRuntimeCheck) {
|
2015-03-09 14:14:18 +08:00
|
|
|
AddedSafetyChecks = true;
|
2013-01-19 21:57:58 +08:00
|
|
|
// Create a new block containing the memory check.
|
2014-01-11 02:20:32 +08:00
|
|
|
BasicBlock *CheckBlock =
|
2015-02-04 06:45:39 +08:00
|
|
|
LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.memcheck");
|
2013-07-13 14:20:06 +08:00
|
|
|
if (ParentLoop)
|
2015-01-18 09:25:51 +08:00
|
|
|
ParentLoop->addBasicBlockToLoop(CheckBlock, *LI);
|
2013-01-19 21:57:58 +08:00
|
|
|
LoopBypassBlocks.push_back(CheckBlock);
|
|
|
|
|
|
|
|
// Replace the branch into the memory check block with a conditional branch
|
|
|
|
// for the "few elements case".
|
2014-01-11 02:20:32 +08:00
|
|
|
Instruction *OldTerm = LastBypassBlock->getTerminator();
|
2013-01-19 21:57:58 +08:00
|
|
|
BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm);
|
|
|
|
OldTerm->eraseFromParent();
|
|
|
|
|
2013-01-26 03:26:23 +08:00
|
|
|
Cmp = MemRuntimeCheck;
|
|
|
|
LastBypassBlock = CheckBlock;
|
2013-01-19 21:57:58 +08:00
|
|
|
}
|
2012-11-09 15:09:44 +08:00
|
|
|
|
2013-01-26 03:26:23 +08:00
|
|
|
LastBypassBlock->getTerminator()->eraseFromParent();
|
|
|
|
BranchInst::Create(MiddleBlock, VectorPH, Cmp,
|
|
|
|
LastBypassBlock);
|
2012-10-18 13:29:12 +08:00
|
|
|
|
2012-11-09 15:09:44 +08:00
|
|
|
// We are going to resume the execution of the scalar loop.
|
2012-11-17 08:27:03 +08:00
|
|
|
// Go over all of the induction variables that we found and fix the
|
|
|
|
// PHIs that are left in the scalar version of the loop.
|
|
|
|
// The starting values of PHI nodes depend on the counter of the last
|
|
|
|
// iteration in the vectorized loop.
|
2012-12-11 05:39:02 +08:00
|
|
|
// If we come from a bypass edge then we need to start from the original
|
|
|
|
// start value.
|
2012-11-17 08:27:03 +08:00
|
|
|
|
|
|
|
// This variable saves the new starting index for the scalar loop.
|
2014-04-25 13:29:35 +08:00
|
|
|
PHINode *ResumeIndex = nullptr;
|
2012-11-17 08:27:03 +08:00
|
|
|
LoopVectorizationLegality::InductionList::iterator I, E;
|
|
|
|
LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
|
2013-05-12 07:04:24 +08:00
|
|
|
// Set builder to point to last bypass block.
|
|
|
|
BypassBuilder.SetInsertPoint(LoopBypassBlocks.back()->getTerminator());
|
2012-11-17 08:27:03 +08:00
|
|
|
for (I = List->begin(), E = List->end(); I != E; ++I) {
|
|
|
|
PHINode *OrigPhi = I->first;
|
2012-12-11 03:25:06 +08:00
|
|
|
LoopVectorizationLegality::InductionInfo II = I->second;
|
2013-05-12 07:04:28 +08:00
|
|
|
|
|
|
|
Type *ResumeValTy = (OrigPhi == OldInduction) ? IdxTy : OrigPhi->getType();
|
|
|
|
PHINode *ResumeVal = PHINode::Create(ResumeValTy, 2, "resume.val",
|
2012-12-11 05:39:02 +08:00
|
|
|
MiddleBlock->getTerminator());
|
2013-05-12 07:04:28 +08:00
|
|
|
// We might have extended the type of the induction variable but we need a
|
|
|
|
// truncated version for the scalar loop.
|
|
|
|
PHINode *TruncResumeVal = (OrigPhi == OldInduction) ?
|
|
|
|
PHINode::Create(OrigPhi->getType(), 2, "trunc.resume.val",
|
2014-04-25 13:29:35 +08:00
|
|
|
MiddleBlock->getTerminator()) : nullptr;
|
2013-05-12 07:04:28 +08:00
|
|
|
|
2014-05-30 06:10:01 +08:00
|
|
|
// Create phi nodes to merge from the backedge-taken check block.
|
|
|
|
PHINode *BCResumeVal = PHINode::Create(ResumeValTy, 3, "bc.resume.val",
|
|
|
|
ScalarPH->getTerminator());
|
|
|
|
BCResumeVal->addIncoming(ResumeVal, MiddleBlock);
|
|
|
|
|
|
|
|
PHINode *BCTruncResumeVal = nullptr;
|
|
|
|
if (OrigPhi == OldInduction) {
|
|
|
|
BCTruncResumeVal =
|
|
|
|
PHINode::Create(OrigPhi->getType(), 2, "bc.trunc.resume.val",
|
|
|
|
ScalarPH->getTerminator());
|
|
|
|
BCTruncResumeVal->addIncoming(TruncResumeVal, MiddleBlock);
|
|
|
|
}
|
|
|
|
|
2014-04-25 13:29:35 +08:00
|
|
|
Value *EndValue = nullptr;
|
2012-12-11 03:25:06 +08:00
|
|
|
switch (II.IK) {
|
2013-01-09 01:23:17 +08:00
|
|
|
case LoopVectorizationLegality::IK_NoInduction:
|
2012-12-11 03:25:06 +08:00
|
|
|
llvm_unreachable("Unknown induction");
|
2013-01-09 01:23:17 +08:00
|
|
|
case LoopVectorizationLegality::IK_IntInduction: {
|
2013-05-14 08:21:18 +08:00
|
|
|
// Handle the integer induction counter.
|
2012-12-11 03:25:06 +08:00
|
|
|
assert(OrigPhi->getType()->isIntegerTy() && "Invalid type");
|
2013-05-14 08:21:18 +08:00
|
|
|
|
|
|
|
// We have the canonical induction variable.
|
2013-05-12 07:04:28 +08:00
|
|
|
if (OrigPhi == OldInduction) {
|
|
|
|
// Create a truncated version of the resume value for the scalar loop,
|
|
|
|
// we might have promoted the type to a larger width.
|
|
|
|
EndValue =
|
|
|
|
BypassBuilder.CreateTrunc(IdxEndRoundDown, OrigPhi->getType());
|
|
|
|
// The new PHI merges the original incoming value, in case of a bypass,
|
|
|
|
// or the value at the end of the vectorized loop.
|
2014-05-30 06:10:01 +08:00
|
|
|
for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
|
2013-05-12 07:04:28 +08:00
|
|
|
TruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
|
|
|
|
TruncResumeVal->addIncoming(EndValue, VecBody);
|
2013-05-14 08:21:18 +08:00
|
|
|
|
2014-05-30 06:10:01 +08:00
|
|
|
BCTruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]);
|
|
|
|
|
2013-05-14 08:21:18 +08:00
|
|
|
// We know what the end value is.
|
|
|
|
EndValue = IdxEndRoundDown;
|
|
|
|
// We also know which PHI node holds it.
|
|
|
|
ResumeIndex = ResumeVal;
|
|
|
|
break;
|
2013-05-12 07:04:28 +08:00
|
|
|
}
|
2013-05-14 08:21:18 +08:00
|
|
|
|
|
|
|
// Not the canonical induction variable - add the vector loop count to the
|
|
|
|
// start value.
|
|
|
|
Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown,
|
|
|
|
II.StartValue->getType(),
|
|
|
|
"cast.crd");
|
2015-01-30 13:02:21 +08:00
|
|
|
EndValue = II.transform(BypassBuilder, CRD);
|
|
|
|
EndValue->setName("ind.end");
|
2012-12-11 03:25:06 +08:00
|
|
|
break;
|
|
|
|
}
|
2013-01-09 01:23:17 +08:00
|
|
|
case LoopVectorizationLegality::IK_PtrInduction: {
|
2015-01-30 13:02:21 +08:00
|
|
|
EndValue = II.transform(BypassBuilder, CountRoundDown);
|
|
|
|
EndValue->setName("ptr.ind.end");
|
2013-01-23 09:35:00 +08:00
|
|
|
break;
|
|
|
|
}
|
2012-12-11 03:25:06 +08:00
|
|
|
}// end of case
|
2012-11-17 08:27:03 +08:00
|
|
|
|
|
|
|
// The new PHI merges the original incoming value, in case of a bypass,
|
|
|
|
// or the value at the end of the vectorized loop.
|
2014-05-30 06:10:01 +08:00
|
|
|
for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) {
|
2013-05-12 07:04:28 +08:00
|
|
|
if (OrigPhi == OldInduction)
|
|
|
|
ResumeVal->addIncoming(StartIdx, LoopBypassBlocks[I]);
|
|
|
|
else
|
|
|
|
ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
|
|
|
|
}
|
2012-11-17 08:27:03 +08:00
|
|
|
ResumeVal->addIncoming(EndValue, VecBody);
|
|
|
|
|
|
|
|
// Fix the scalar body counter (PHI node).
|
2012-11-25 16:41:35 +08:00
|
|
|
unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
|
2014-05-30 06:10:01 +08:00
|
|
|
|
|
|
|
// The old induction's phi node in the scalar body needs the truncated
|
|
|
|
// value.
|
|
|
|
if (OrigPhi == OldInduction) {
|
|
|
|
BCResumeVal->addIncoming(StartIdx, LoopBypassBlocks[0]);
|
|
|
|
OrigPhi->setIncomingValue(BlockIdx, BCTruncResumeVal);
|
|
|
|
} else {
|
|
|
|
BCResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]);
|
|
|
|
OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
|
|
|
|
}
|
2012-11-17 08:27:03 +08:00
|
|
|
}
|
|
|
|
|
2012-11-25 16:41:35 +08:00
|
|
|
// If we are generating a new induction variable then we also need to
|
|
|
|
// generate the code that calculates the exit value. This value is not
|
|
|
|
// simply the end of the counter because we may skip the vectorized body
|
|
|
|
// in case of a runtime check.
|
|
|
|
if (!OldInduction){
|
|
|
|
assert(!ResumeIndex && "Unexpected resume value found");
|
|
|
|
ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val",
|
|
|
|
MiddleBlock->getTerminator());
|
2014-05-30 06:10:01 +08:00
|
|
|
for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
|
2013-01-19 21:57:58 +08:00
|
|
|
ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]);
|
2012-11-25 16:41:35 +08:00
|
|
|
ResumeIndex->addIncoming(IdxEndRoundDown, VecBody);
|
|
|
|
}
|
|
|
|
|
2012-11-17 08:27:03 +08:00
|
|
|
// Make sure that we found the index where scalar loop needs to continue.
|
|
|
|
assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() &&
|
|
|
|
"Invalid resume Index");
|
2012-11-09 15:09:44 +08:00
|
|
|
|
2012-10-18 13:29:12 +08:00
|
|
|
// Add a check in the middle block to see if we have completed
|
|
|
|
// all of the iterations in the first vector loop.
|
|
|
|
// If (N - N%VF) == N, then we *don't* need to run the remainder.
|
2012-10-31 08:45:26 +08:00
|
|
|
Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd,
|
2012-11-09 15:09:44 +08:00
|
|
|
ResumeIndex, "cmp.n",
|
2012-10-18 13:29:12 +08:00
|
|
|
MiddleBlock->getTerminator());
|
|
|
|
|
|
|
|
BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator());
|
|
|
|
// Remove the old terminator.
|
|
|
|
MiddleBlock->getTerminator()->eraseFromParent();
|
|
|
|
|
2012-10-18 02:25:06 +08:00
|
|
|
// Create i+1 and fill the PHINode.
|
2012-10-20 07:27:19 +08:00
|
|
|
Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next");
|
2012-10-31 08:45:26 +08:00
|
|
|
Induction->addIncoming(StartIdx, VectorPH);
|
2012-10-18 13:29:12 +08:00
|
|
|
Induction->addIncoming(NextIdx, VecBody);
|
2012-10-18 02:25:06 +08:00
|
|
|
// Create the compare.
|
2012-10-31 08:45:26 +08:00
|
|
|
Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown);
|
2012-10-20 07:27:19 +08:00
|
|
|
Builder.CreateCondBr(ICmp, MiddleBlock, VecBody);
|
2012-10-18 13:29:12 +08:00
|
|
|
|
|
|
|
// Now we have two terminators. Remove the old one from the block.
|
|
|
|
VecBody->getTerminator()->eraseFromParent();
|
|
|
|
|
|
|
|
// Get ready to start creating new instructions into the vectorized body.
|
2012-10-20 07:27:19 +08:00
|
|
|
Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
|
2012-10-18 13:29:12 +08:00
|
|
|
|
2012-10-20 07:05:40 +08:00
|
|
|
// Save the state.
|
2012-10-30 05:52:38 +08:00
|
|
|
LoopVectorPreHeader = VectorPH;
|
|
|
|
LoopScalarPreHeader = ScalarPH;
|
2012-10-20 07:05:40 +08:00
|
|
|
LoopMiddleBlock = MiddleBlock;
|
|
|
|
LoopExitBlock = ExitBlock;
|
2014-01-28 09:01:53 +08:00
|
|
|
LoopVectorBody.push_back(VecBody);
|
2012-10-20 07:05:40 +08:00
|
|
|
LoopScalarBody = OldBasicBlock;
|
2013-10-24 22:50:51 +08:00
|
|
|
|
|
|
|
LoopVectorizeHints Hints(Lp, true);
|
Small refactor on VectorizerHint for deduplication
Previously, the hint mechanism relied on clean up passes to remove redundant
metadata, which still showed up if running opt at low levels of optimization.
That also has shown that multiple nodes of the same type, but with different
values could still coexist, even if temporary, and cause confusion if the
next pass got the wrong value.
This patch makes sure that, if metadata already exists in a loop, the hint
mechanism will never append a new node, but always replace the existing one.
It also enhances the algorithm to cope with more metadata types in the future
by just adding a new type, not a lot of code.
Re-applying again due to MSVC 2013 being minimum requirement, and this patch
having C++11 that MSVC 2012 didn't support.
Fixes PR20655.
llvm-svn: 216870
2014-09-01 18:00:17 +08:00
|
|
|
Hints.setAlreadyVectorized();
|
2012-10-18 02:25:06 +08:00
|
|
|
}
|
|
|
|
|
2013-11-02 21:39:00 +08:00
|
|
|
namespace {
|
|
|
|
struct CSEDenseMapInfo {
|
|
|
|
static bool canHandle(Instruction *I) {
|
|
|
|
return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
|
|
|
|
isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
|
|
|
|
}
|
|
|
|
static inline Instruction *getEmptyKey() {
|
|
|
|
return DenseMapInfo<Instruction *>::getEmptyKey();
|
|
|
|
}
|
|
|
|
static inline Instruction *getTombstoneKey() {
|
|
|
|
return DenseMapInfo<Instruction *>::getTombstoneKey();
|
|
|
|
}
|
|
|
|
static unsigned getHashValue(Instruction *I) {
|
|
|
|
assert(canHandle(I) && "Unknown instruction!");
|
|
|
|
return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
|
|
|
|
I->value_op_end()));
|
|
|
|
}
|
|
|
|
static bool isEqual(Instruction *LHS, Instruction *RHS) {
|
|
|
|
if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
|
|
|
|
LHS == getTombstoneKey() || RHS == getTombstoneKey())
|
|
|
|
return LHS == RHS;
|
|
|
|
return LHS->isIdenticalTo(RHS);
|
|
|
|
}
|
|
|
|
};
|
|
|
|
}
|
|
|
|
|
2014-01-28 09:01:53 +08:00
|
|
|
/// \brief Tell whether the block with index \p BlockNum is a predicated block.
/// If-predication of stores may produce a sequence of "if (pred) a[i] = ...;"
/// blocks. Vectorization starts from a single vectorized basic block and that
/// block is split once per conditional block, so the blocks at odd indices
/// are the predicated ones.
static bool isPredicatedBlock(unsigned BlockNum) {
  const bool IsOddIndex = (BlockNum & 1u) != 0;
  return IsOddIndex;
}
|
|
|
|
|
2013-11-02 07:28:54 +08:00
|
|
|
///\brief Perform cse of induction variable instructions.
|
2014-01-28 09:01:53 +08:00
|
|
|
static void cse(SmallVector<BasicBlock *, 4> &BBs) {
|
2013-11-02 07:28:54 +08:00
|
|
|
// Perform simple cse.
|
2013-11-02 21:39:00 +08:00
|
|
|
SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
|
2014-01-28 09:01:53 +08:00
|
|
|
for (unsigned i = 0, e = BBs.size(); i != e; ++i) {
|
|
|
|
BasicBlock *BB = BBs[i];
|
|
|
|
for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
|
|
|
|
Instruction *In = I++;
|
2013-11-02 07:28:54 +08:00
|
|
|
|
2014-01-28 09:01:53 +08:00
|
|
|
if (!CSEDenseMapInfo::canHandle(In))
|
|
|
|
continue;
|
2013-11-02 07:28:54 +08:00
|
|
|
|
2014-01-28 09:01:53 +08:00
|
|
|
// Check if we can replace this instruction with any of the
|
|
|
|
// visited instructions.
|
|
|
|
if (Instruction *V = CSEMap.lookup(In)) {
|
|
|
|
In->replaceAllUsesWith(V);
|
|
|
|
In->eraseFromParent();
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
// Ignore instructions in conditional blocks. We create "if (pred) a[i] =
|
|
|
|
// ...;" blocks for predicated stores. Every second block is a predicated
|
|
|
|
// block.
|
|
|
|
if (isPredicatedBlock(i))
|
|
|
|
continue;
|
2013-11-02 07:28:54 +08:00
|
|
|
|
2014-01-28 09:01:53 +08:00
|
|
|
CSEMap[In] = In;
|
|
|
|
}
|
2013-11-02 07:28:54 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-03-06 05:10:47 +08:00
|
|
|
/// \brief Adds a 'fast' flag to floating point operations.
|
|
|
|
static Value *addFastMathFlag(Value *V) {
|
|
|
|
if (isa<FPMathOperator>(V)){
|
|
|
|
FastMathFlags Flags;
|
|
|
|
Flags.setUnsafeAlgebra();
|
|
|
|
cast<Instruction>(V)->setFastMathFlags(Flags);
|
|
|
|
}
|
|
|
|
return V;
|
|
|
|
}
|
|
|
|
|
2015-03-18 03:46:50 +08:00
|
|
|
/// Estimate the overhead of scalarizing a value. Insert and Extract are set if
|
|
|
|
/// the result needs to be inserted and/or extracted from vectors.
|
|
|
|
static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract,
|
|
|
|
const TargetTransformInfo &TTI) {
|
2015-03-18 04:31:56 +08:00
|
|
|
if (Ty->isVoidTy())
|
|
|
|
return 0;
|
|
|
|
|
2015-03-18 03:46:50 +08:00
|
|
|
assert(Ty->isVectorTy() && "Can only scalarize vectors");
|
|
|
|
unsigned Cost = 0;
|
|
|
|
|
|
|
|
for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
|
|
|
|
if (Insert)
|
|
|
|
Cost += TTI.getVectorInstrCost(Instruction::InsertElement, Ty, i);
|
|
|
|
if (Extract)
|
|
|
|
Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, Ty, i);
|
|
|
|
}
|
|
|
|
|
|
|
|
return Cost;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Estimate cost of a call instruction CI if it were vectorized with factor VF.
|
|
|
|
// Return the cost of the instruction, including scalarization overhead if it's
|
|
|
|
// needed. The flag NeedToScalarize shows if the call needs to be scalarized -
|
|
|
|
// i.e. either vector version isn't available, or is too expensive.
|
|
|
|
static unsigned getVectorCallCost(CallInst *CI, unsigned VF,
|
|
|
|
const TargetTransformInfo &TTI,
|
|
|
|
const TargetLibraryInfo *TLI,
|
|
|
|
bool &NeedToScalarize) {
|
|
|
|
Function *F = CI->getCalledFunction();
|
|
|
|
StringRef FnName = CI->getCalledFunction()->getName();
|
|
|
|
Type *ScalarRetTy = CI->getType();
|
|
|
|
SmallVector<Type *, 4> Tys, ScalarTys;
|
|
|
|
for (auto &ArgOp : CI->arg_operands())
|
|
|
|
ScalarTys.push_back(ArgOp->getType());
|
|
|
|
|
|
|
|
// Estimate cost of scalarized vector call. The source operands are assumed
|
|
|
|
// to be vectors, so we need to extract individual elements from there,
|
|
|
|
// execute VF scalar calls, and then gather the result into the vector return
|
|
|
|
// value.
|
|
|
|
unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
|
|
|
|
if (VF == 1)
|
|
|
|
return ScalarCallCost;
|
|
|
|
|
|
|
|
// Compute corresponding vector type for return value and arguments.
|
|
|
|
Type *RetTy = ToVectorTy(ScalarRetTy, VF);
|
|
|
|
for (unsigned i = 0, ie = ScalarTys.size(); i != ie; ++i)
|
|
|
|
Tys.push_back(ToVectorTy(ScalarTys[i], VF));
|
|
|
|
|
|
|
|
// Compute costs of unpacking argument values for the scalar calls and
|
|
|
|
// packing the return values to a vector.
|
|
|
|
unsigned ScalarizationCost =
|
|
|
|
getScalarizationOverhead(RetTy, true, false, TTI);
|
|
|
|
for (unsigned i = 0, ie = Tys.size(); i != ie; ++i)
|
|
|
|
ScalarizationCost += getScalarizationOverhead(Tys[i], false, true, TTI);
|
|
|
|
|
|
|
|
unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
|
|
|
|
|
|
|
|
// If we can't emit a vector call for this function, then the currently found
|
|
|
|
// cost is the cost we need to return.
|
|
|
|
NeedToScalarize = true;
|
|
|
|
if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
|
|
|
|
return Cost;
|
|
|
|
|
|
|
|
// If the corresponding vector cost is cheaper, return its cost.
|
|
|
|
unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
|
|
|
|
if (VectorCallCost < Cost) {
|
|
|
|
NeedToScalarize = false;
|
|
|
|
return VectorCallCost;
|
|
|
|
}
|
|
|
|
return Cost;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Estimate cost of an intrinsic call instruction CI if it were vectorized with
|
|
|
|
// factor VF. Return the cost of the instruction, including scalarization
|
|
|
|
// overhead if it's needed.
|
|
|
|
static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF,
|
|
|
|
const TargetTransformInfo &TTI,
|
|
|
|
const TargetLibraryInfo *TLI) {
|
|
|
|
Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
|
|
|
|
assert(ID && "Expected intrinsic call!");
|
|
|
|
|
|
|
|
Type *RetTy = ToVectorTy(CI->getType(), VF);
|
|
|
|
SmallVector<Type *, 4> Tys;
|
|
|
|
for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
|
|
|
|
Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF));
|
|
|
|
|
|
|
|
return TTI.getIntrinsicInstrCost(ID, RetTy, Tys);
|
|
|
|
}
|
|
|
|
|
2014-01-11 02:20:32 +08:00
|
|
|
void InnerLoopVectorizer::vectorizeLoop() {
|
2012-10-25 04:36:32 +08:00
|
|
|
//===------------------------------------------------===//
|
|
|
|
//
|
|
|
|
// Notice: any optimization or new instruction that go
|
|
|
|
// into the code below should also be implemented in
|
|
|
|
// the cost-model.
|
|
|
|
//
|
|
|
|
//===------------------------------------------------===//
|
2013-01-26 05:47:42 +08:00
|
|
|
Constant *Zero = Builder.getInt32(0);
|
2012-10-18 02:25:06 +08:00
|
|
|
|
2012-10-20 07:05:40 +08:00
|
|
|
// In order to support reduction variables we need to be able to vectorize
|
|
|
|
// Phi nodes. Phi nodes have cycles, so we need to vectorize them in two
|
2012-11-30 03:25:41 +08:00
|
|
|
// stages. First, we create a new vector PHI node with no incoming edges.
|
2012-10-20 07:05:40 +08:00
|
|
|
// We use this value when we vectorize all of the instructions that use the
|
|
|
|
// PHI. Next, after all of the instructions in the block are complete we
|
|
|
|
// add the new incoming edges to the PHI. At this point all of the
|
|
|
|
// instructions in the basic block are vectorized, so we can use them to
|
|
|
|
// construct the PHI.
|
2012-11-17 08:27:03 +08:00
|
|
|
PhiVector RdxPHIsToFix;
|
2012-10-20 07:05:40 +08:00
|
|
|
|
2012-12-04 14:15:11 +08:00
|
|
|
// Scan the loop in a topological order to ensure that defs are vectorized
|
|
|
|
// before users.
|
|
|
|
LoopBlocksDFS DFS(OrigLoop);
|
|
|
|
DFS.perform(LI);
|
|
|
|
|
|
|
|
// Vectorize all of the blocks in the original loop.
|
|
|
|
for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
|
|
|
|
be = DFS.endRPO(); bb != be; ++bb)
|
2014-01-11 02:20:32 +08:00
|
|
|
vectorizeBlockInLoop(*bb, &RdxPHIsToFix);
|
2012-12-04 14:15:11 +08:00
|
|
|
|
|
|
|
// At this point every instruction in the original loop is widened to
|
|
|
|
// a vector form. We are almost done. Now, we need to fix the PHI nodes
|
|
|
|
// that we vectorized. The PHI nodes are currently empty because we did
|
|
|
|
// not want to introduce cycles. Notice that the remaining PHI nodes
|
|
|
|
// that we need to fix are reduction variables.
|
|
|
|
|
|
|
|
// Create the 'reduced' values for each of the induction vars.
|
|
|
|
// The reduced values are the vector values that we scalarize and combine
|
|
|
|
// after the loop is finished.
|
|
|
|
for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end();
|
|
|
|
it != e; ++it) {
|
|
|
|
PHINode *RdxPhi = *it;
|
|
|
|
assert(RdxPhi && "Unable to recover vectorized PHI");
|
|
|
|
|
|
|
|
// Find the reduction variable descriptor.
|
|
|
|
assert(Legal->getReductionVars()->count(RdxPhi) &&
|
|
|
|
"Unable to find the reduction variable");
|
2015-04-20 12:38:33 +08:00
|
|
|
ReductionDescriptor RdxDesc = (*Legal->getReductionVars())[RdxPhi];
|
2012-12-04 14:15:11 +08:00
|
|
|
|
2015-04-20 12:38:33 +08:00
|
|
|
ReductionDescriptor::ReductionKind RK = RdxDesc.getReductionKind();
|
|
|
|
TrackingVH<Value> ReductionStartValue = RdxDesc.getReductionStartValue();
|
|
|
|
Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
|
|
|
|
ReductionInstDesc::MinMaxReductionKind MinMaxKind =
|
|
|
|
RdxDesc.getMinMaxReductionKind();
|
|
|
|
setDebugLocFromInst(Builder, ReductionStartValue);
|
2013-06-29 00:26:54 +08:00
|
|
|
|
2012-12-04 14:15:11 +08:00
|
|
|
// We need to generate a reduction vector from the incoming scalar.
|
2014-01-25 01:20:08 +08:00
|
|
|
// To do so, we need to generate the 'identity' vector and override
|
2012-12-04 14:15:11 +08:00
|
|
|
// one of the elements with the incoming scalar reduction. We need
|
|
|
|
// to do it in the vector-loop preheader.
|
2014-05-30 06:10:01 +08:00
|
|
|
Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator());
|
2012-12-04 14:15:11 +08:00
|
|
|
|
|
|
|
// This is the vector-clone of the value that leaves the loop.
|
2015-04-20 12:38:33 +08:00
|
|
|
VectorParts &VectorExit = getVectorValue(LoopExitInst);
|
2013-01-03 08:52:27 +08:00
|
|
|
Type *VecTy = VectorExit[0]->getType();
|
2012-12-04 14:15:11 +08:00
|
|
|
|
|
|
|
// Find the reduction identity variable. Zero for addition, or, xor,
|
|
|
|
// one for multiplication, -1 for And.
|
2013-05-05 09:54:42 +08:00
|
|
|
Value *Identity;
|
|
|
|
Value *VectorStart;
|
2015-04-20 12:38:33 +08:00
|
|
|
if (RK == ReductionDescriptor::RK_IntegerMinMax ||
|
|
|
|
RK == ReductionDescriptor::RK_FloatMinMax) {
|
2013-05-05 09:54:42 +08:00
|
|
|
// MinMax reductions have the start value as their identity.
|
2013-08-28 02:52:47 +08:00
|
|
|
if (VF == 1) {
|
2015-04-20 12:38:33 +08:00
|
|
|
VectorStart = Identity = ReductionStartValue;
|
2013-08-28 02:52:47 +08:00
|
|
|
} else {
|
2015-04-20 12:38:33 +08:00
|
|
|
VectorStart = Identity =
|
|
|
|
Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
|
2013-08-28 02:52:47 +08:00
|
|
|
}
|
2013-05-05 09:54:48 +08:00
|
|
|
} else {
|
2013-08-28 02:52:47 +08:00
|
|
|
// Handle other reduction kinds:
|
2013-05-05 09:54:42 +08:00
|
|
|
Constant *Iden =
|
2015-04-20 12:38:33 +08:00
|
|
|
ReductionDescriptor::getReductionIdentity(RK, VecTy->getScalarType());
|
2013-08-28 02:52:47 +08:00
|
|
|
if (VF == 1) {
|
|
|
|
Identity = Iden;
|
|
|
|
// This vector is the Identity vector where the first element is the
|
|
|
|
// incoming scalar reduction.
|
2015-04-20 12:38:33 +08:00
|
|
|
VectorStart = ReductionStartValue;
|
2013-08-28 02:52:47 +08:00
|
|
|
} else {
|
|
|
|
Identity = ConstantVector::getSplat(VF, Iden);
|
|
|
|
|
|
|
|
// This vector is the Identity vector where the first element is the
|
|
|
|
// incoming scalar reduction.
|
2015-04-20 12:38:33 +08:00
|
|
|
VectorStart =
|
|
|
|
Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
|
2013-08-28 02:52:47 +08:00
|
|
|
}
|
2013-05-05 09:54:42 +08:00
|
|
|
}
|
2012-12-04 14:15:11 +08:00
|
|
|
|
|
|
|
// Fix the vector-loop phi.
|
|
|
|
|
|
|
|
// Reductions do not have to start at zero. They can start with
|
|
|
|
// any loop invariant values.
|
2013-01-03 08:52:27 +08:00
|
|
|
VectorParts &VecRdxPhi = WidenMap.get(RdxPhi);
|
|
|
|
BasicBlock *Latch = OrigLoop->getLoopLatch();
|
|
|
|
Value *LoopVal = RdxPhi->getIncomingValueForBlock(Latch);
|
|
|
|
VectorParts &Val = getVectorValue(LoopVal);
|
|
|
|
for (unsigned part = 0; part < UF; ++part) {
|
2013-05-29 11:13:41 +08:00
|
|
|
// Make sure to add the reduction start value only to the
|
2013-01-03 08:52:27 +08:00
|
|
|
// first unroll part.
|
|
|
|
Value *StartVal = (part == 0) ? VectorStart : Identity;
|
2014-12-10 06:45:07 +08:00
|
|
|
cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal,
|
|
|
|
LoopVectorPreHeader);
|
2014-01-28 09:01:53 +08:00
|
|
|
cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part],
|
|
|
|
LoopVectorBody.back());
|
2013-01-03 08:52:27 +08:00
|
|
|
}
|
2012-12-04 14:15:11 +08:00
|
|
|
|
|
|
|
// Before each round, move the insertion point right between
|
|
|
|
// the PHIs and the values we are going to write.
|
|
|
|
// This allows us to write both PHINodes and the extractelement
|
|
|
|
// instructions.
|
|
|
|
Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());
|
|
|
|
|
2013-01-03 08:52:27 +08:00
|
|
|
VectorParts RdxParts;
|
2015-04-20 12:38:33 +08:00
|
|
|
setDebugLocFromInst(Builder, LoopExitInst);
|
2013-01-03 08:52:27 +08:00
|
|
|
for (unsigned part = 0; part < UF; ++part) {
|
|
|
|
// This PHINode contains the vectorized reduction variable, or
|
|
|
|
// the initial value vector, if we bypass the vector loop.
|
2015-04-20 12:38:33 +08:00
|
|
|
VectorParts &RdxExitVal = getVectorValue(LoopExitInst);
|
2013-01-03 08:52:27 +08:00
|
|
|
PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
|
|
|
|
Value *StartVal = (part == 0) ? VectorStart : Identity;
|
2014-05-30 06:10:01 +08:00
|
|
|
for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
|
2013-01-19 21:57:58 +08:00
|
|
|
NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]);
|
2014-01-28 09:01:53 +08:00
|
|
|
NewPhi->addIncoming(RdxExitVal[part],
|
|
|
|
LoopVectorBody.back());
|
2013-01-03 08:52:27 +08:00
|
|
|
RdxParts.push_back(NewPhi);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Reduce all of the unrolled parts into a single vector.
|
|
|
|
Value *ReducedPartRdx = RdxParts[0];
|
2015-04-20 12:38:33 +08:00
|
|
|
unsigned Op = ReductionDescriptor::getReductionBinOp(RK);
|
2013-06-29 01:14:48 +08:00
|
|
|
setDebugLocFromInst(Builder, ReducedPartRdx);
|
2013-01-03 08:52:27 +08:00
|
|
|
for (unsigned part = 1; part < UF; ++part) {
|
2013-05-05 09:54:48 +08:00
|
|
|
if (Op != Instruction::ICmp && Op != Instruction::FCmp)
|
2014-03-06 05:10:47 +08:00
|
|
|
// Floating point operations had to be 'fast' to enable the reduction.
|
|
|
|
ReducedPartRdx = addFastMathFlag(
|
|
|
|
Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxParts[part],
|
|
|
|
ReducedPartRdx, "bin.rdx"));
|
2013-04-19 01:22:34 +08:00
|
|
|
else
|
2015-04-20 12:38:33 +08:00
|
|
|
ReducedPartRdx = ReductionDescriptor::createMinMaxOp(
|
|
|
|
Builder, MinMaxKind, ReducedPartRdx, RdxParts[part]);
|
2013-01-03 08:52:27 +08:00
|
|
|
}
|
2012-12-04 14:15:11 +08:00
|
|
|
|
2013-08-28 02:52:47 +08:00
|
|
|
if (VF > 1) {
|
|
|
|
// VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
|
|
|
|
// and vector ops, reducing the set of values being computed by half each
|
|
|
|
// round.
|
|
|
|
assert(isPowerOf2_32(VF) &&
|
|
|
|
"Reduction emission only supported for pow2 vectors!");
|
|
|
|
Value *TmpVec = ReducedPartRdx;
|
2014-04-25 13:29:35 +08:00
|
|
|
SmallVector<Constant*, 32> ShuffleMask(VF, nullptr);
|
2013-08-28 02:52:47 +08:00
|
|
|
for (unsigned i = VF; i != 1; i >>= 1) {
|
|
|
|
// Move the upper half of the vector to the lower half.
|
|
|
|
for (unsigned j = 0; j != i/2; ++j)
|
|
|
|
ShuffleMask[j] = Builder.getInt32(i/2 + j);
|
|
|
|
|
|
|
|
// Fill the rest of the mask with undef.
|
|
|
|
std::fill(&ShuffleMask[i/2], ShuffleMask.end(),
|
|
|
|
UndefValue::get(Builder.getInt32Ty()));
|
|
|
|
|
|
|
|
Value *Shuf =
|
LoopVectorize: Emit reductions as log2(vectorsize) shuffles + vector ops instead of scalar operations.
For example on x86 with SSE4.2 a <8 x i8> add reduction becomes
movdqa %xmm0, %xmm1
movhlps %xmm1, %xmm1 ## xmm1 = xmm1[1,1]
paddw %xmm0, %xmm1
pshufd $1, %xmm1, %xmm0 ## xmm0 = xmm1[1,0,0,0]
paddw %xmm1, %xmm0
phaddw %xmm0, %xmm0
pextrb $0, %xmm0, %edx
instead of
pextrb $2, %xmm0, %esi
pextrb $0, %xmm0, %edx
addb %sil, %dl
pextrb $4, %xmm0, %esi
addb %dl, %sil
pextrb $6, %xmm0, %edx
addb %sil, %dl
pextrb $8, %xmm0, %esi
addb %dl, %sil
pextrb $10, %xmm0, %edi
pextrb $14, %xmm0, %edx
addb %sil, %dil
pextrb $12, %xmm0, %esi
addb %dil, %sil
addb %sil, %dl
llvm-svn: 170439
2012-12-19 02:40:20 +08:00
|
|
|
Builder.CreateShuffleVector(TmpVec,
|
|
|
|
UndefValue::get(TmpVec->getType()),
|
|
|
|
ConstantVector::get(ShuffleMask),
|
|
|
|
"rdx.shuf");
|
|
|
|
|
2013-08-28 02:52:47 +08:00
|
|
|
if (Op != Instruction::ICmp && Op != Instruction::FCmp)
|
2014-03-06 05:10:47 +08:00
|
|
|
// Floating point operations had to be 'fast' to enable the reduction.
|
|
|
|
TmpVec = addFastMathFlag(Builder.CreateBinOp(
|
|
|
|
(Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx"));
|
2013-08-28 02:52:47 +08:00
|
|
|
else
|
2015-04-20 12:38:33 +08:00
|
|
|
TmpVec = ReductionDescriptor::createMinMaxOp(Builder, MinMaxKind,
|
|
|
|
TmpVec, Shuf);
|
2013-08-28 02:52:47 +08:00
|
|
|
}
|
2012-12-04 14:15:11 +08:00
|
|
|
|
2013-08-28 02:52:47 +08:00
|
|
|
// The result is in the first element of the vector.
|
|
|
|
ReducedPartRdx = Builder.CreateExtractElement(TmpVec,
|
|
|
|
Builder.getInt32(0));
|
|
|
|
}
|
LoopVectorize: Emit reductions as log2(vectorsize) shuffles + vector ops instead of scalar operations.
For example on x86 with SSE4.2 a <8 x i8> add reduction becomes
movdqa %xmm0, %xmm1
movhlps %xmm1, %xmm1 ## xmm1 = xmm1[1,1]
paddw %xmm0, %xmm1
pshufd $1, %xmm1, %xmm0 ## xmm0 = xmm1[1,0,0,0]
paddw %xmm1, %xmm0
phaddw %xmm0, %xmm0
pextrb $0, %xmm0, %edx
instead of
pextrb $2, %xmm0, %esi
pextrb $0, %xmm0, %edx
addb %sil, %dl
pextrb $4, %xmm0, %esi
addb %dl, %sil
pextrb $6, %xmm0, %edx
addb %sil, %dl
pextrb $8, %xmm0, %esi
addb %dl, %sil
pextrb $10, %xmm0, %edi
pextrb $14, %xmm0, %edx
addb %sil, %dil
pextrb $12, %xmm0, %esi
addb %dil, %sil
addb %sil, %dl
llvm-svn: 170439
2012-12-19 02:40:20 +08:00
|
|
|
|
2014-05-30 06:10:01 +08:00
|
|
|
// Create a phi node that merges control-flow from the backedge-taken check
|
|
|
|
// block and the middle block.
|
|
|
|
PHINode *BCBlockPhi = PHINode::Create(RdxPhi->getType(), 2, "bc.merge.rdx",
|
|
|
|
LoopScalarPreHeader->getTerminator());
|
2015-04-20 12:38:33 +08:00
|
|
|
BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[0]);
|
2014-05-30 06:10:01 +08:00
|
|
|
BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
|
|
|
|
|
2012-12-04 14:15:11 +08:00
|
|
|
// Now, we need to fix the users of the reduction variable
|
|
|
|
// inside and outside of the scalar remainder loop.
|
|
|
|
// We know that the loop is in LCSSA form. We need to update the
|
|
|
|
// PHI nodes in the exit blocks.
|
|
|
|
for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
|
|
|
|
LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
|
|
|
|
PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
|
2013-09-11 02:46:15 +08:00
|
|
|
if (!LCSSAPhi) break;
|
2012-12-04 14:15:11 +08:00
|
|
|
|
|
|
|
// All PHINodes need to have a single entry edge, or two if
|
|
|
|
// we already fixed them.
|
|
|
|
assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
|
|
|
|
|
|
|
|
// We found our reduction value exit-PHI. Update it with the
|
|
|
|
// incoming bypass edge.
|
2015-04-20 12:38:33 +08:00
|
|
|
if (LCSSAPhi->getIncomingValue(0) == LoopExitInst) {
|
2012-12-04 14:15:11 +08:00
|
|
|
// Add an edge coming from the bypass.
|
2013-08-28 02:52:47 +08:00
|
|
|
LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
|
2012-12-04 14:15:11 +08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}// end of the LCSSA phi scan.
|
|
|
|
|
|
|
|
// Fix the scalar loop reduction variable with the incoming reduction sum
|
|
|
|
// from the vector body and from the backedge value.
|
2012-12-05 02:17:33 +08:00
|
|
|
int IncomingEdgeBlockIdx =
|
|
|
|
(RdxPhi)->getBasicBlockIndex(OrigLoop->getLoopLatch());
|
|
|
|
assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
|
|
|
|
// Pick the other block.
|
|
|
|
int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
|
2014-05-30 06:10:01 +08:00
|
|
|
(RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
|
2015-04-20 12:38:33 +08:00
|
|
|
(RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
|
2012-12-04 14:15:11 +08:00
|
|
|
}// end of for each redux variable.
|
2013-11-02 06:18:19 +08:00
|
|
|
|
2013-08-27 06:33:26 +08:00
|
|
|
fixLCSSAPHIs();
|
2013-11-02 06:18:19 +08:00
|
|
|
|
2013-11-02 07:28:54 +08:00
|
|
|
// Remove redundant induction instructions.
|
|
|
|
cse(LoopVectorBody);
|
2013-08-27 06:33:26 +08:00
|
|
|
}
|
2012-12-30 15:47:00 +08:00
|
|
|
|
2013-08-27 06:33:26 +08:00
|
|
|
void InnerLoopVectorizer::fixLCSSAPHIs() {
|
2012-12-30 15:47:00 +08:00
|
|
|
for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
|
|
|
|
LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
|
|
|
|
PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
|
2013-09-11 02:46:15 +08:00
|
|
|
if (!LCSSAPhi) break;
|
2012-12-30 15:47:00 +08:00
|
|
|
if (LCSSAPhi->getNumIncomingValues() == 1)
|
|
|
|
LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()),
|
|
|
|
LoopMiddleBlock);
|
|
|
|
}
|
2014-10-28 19:53:30 +08:00
|
|
|
}
|
2012-12-04 14:15:11 +08:00
|
|
|
|
2013-01-03 08:52:27 +08:00
|
|
|
/// \brief Build the mask (one value per unroll part) for the control-flow
/// edge \p Src -> \p Dst.
///
/// The result is the block-in mask of \p Src, and-ed with the (possibly
/// negated) branch condition when \p Src ends in a conditional branch.
/// Results are memoized in MaskCache, keyed by the (Src, Dst) pair.
InnerLoopVectorizer::VectorParts
InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
  assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) &&
         "Invalid edge");

  // Return the memoized mask if this edge was already processed.
  std::pair<BasicBlock *, BasicBlock *> Key(Src, Dst);
  EdgeMaskCache::iterator Cached = MaskCache.find(Key);
  if (Cached != MaskCache.end())
    return Cached->second;

  VectorParts BlockInMask = createBlockInMask(Src);

  // The terminator has to be a branch inst!
  BranchInst *Term = dyn_cast<BranchInst>(Src->getTerminator());
  assert(Term && "Unexpected terminator found");

  // An unconditional branch does not restrict the mask any further.
  if (!Term->isConditional()) {
    MaskCache[Key] = BlockInMask;
    return BlockInMask;
  }

  // Start from a copy of the widened branch condition.
  VectorParts CondMask = getVectorValue(Term->getCondition());

  // The edge is taken when the condition is false: negate every part first.
  const bool IsFalseEdge = Term->getSuccessor(0) != Dst;
  if (IsFalseEdge)
    for (unsigned Idx = 0; Idx < UF; ++Idx)
      CondMask[Idx] = Builder.CreateNot(CondMask[Idx]);

  // Combine with the mask under which the source block itself executes.
  for (unsigned Idx = 0; Idx < UF; ++Idx)
    CondMask[Idx] = Builder.CreateAnd(CondMask[Idx], BlockInMask[Idx]);

  MaskCache[Key] = CondMask;
  return CondMask;
}
|
|
|
|
|
2013-01-03 08:52:27 +08:00
|
|
|
/// \brief Build the mask (one value per unroll part) under which block
/// \p BB of the original loop executes.
///
/// The loop header gets the widened i1 constant 1. Any other block gets the
/// OR of the masks of all of its incoming edges, starting from the widened
/// i1 constant 0.
InnerLoopVectorizer::VectorParts
InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");

  IntegerType *BoolTy = IntegerType::getInt1Ty(BB->getContext());

  // Loop incoming mask is all-one.
  if (OrigLoop->getHeader() == BB) {
    Value *AllOnes = ConstantInt::get(BoolTy, 1);
    return getVectorValue(AllOnes);
  }

  // Seed the accumulator with zero and OR in each incoming edge mask.
  Value *AllZeros = ConstantInt::get(BoolTy, 0);
  VectorParts Result = getVectorValue(AllZeros);

  for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) {
    VectorParts InEdge = createEdgeMask(*PI, BB);
    for (unsigned Idx = 0; Idx < UF; ++Idx)
      Result[Idx] = Builder.CreateOr(Result[Idx], InEdge[Idx]);
  }

  return Result;
}
|
|
|
|
|
2013-08-27 06:33:26 +08:00
|
|
|
void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
|
|
|
|
InnerLoopVectorizer::VectorParts &Entry,
|
|
|
|
unsigned UF, unsigned VF, PhiVector *PV) {
|
|
|
|
PHINode* P = cast<PHINode>(PN);
|
|
|
|
// Handle reduction variables:
|
|
|
|
if (Legal->getReductionVars()->count(P)) {
|
|
|
|
for (unsigned part = 0; part < UF; ++part) {
|
|
|
|
// This is phase one of vectorizing PHIs.
|
|
|
|
Type *VecTy = (VF == 1) ? PN->getType() :
|
|
|
|
VectorType::get(PN->getType(), VF);
|
|
|
|
Entry[part] = PHINode::Create(VecTy, 2, "vec.phi",
|
2014-01-28 09:01:53 +08:00
|
|
|
LoopVectorBody.back()-> getFirstInsertionPt());
|
2013-08-27 06:33:26 +08:00
|
|
|
}
|
|
|
|
PV->push_back(P);
|
|
|
|
return;
|
|
|
|
}
|
2012-11-25 16:41:35 +08:00
|
|
|
|
2013-08-27 06:33:26 +08:00
|
|
|
setDebugLocFromInst(Builder, P);
|
|
|
|
// Check for PHI nodes that are lowered to vector selects.
|
|
|
|
if (P->getParent() != OrigLoop->getHeader()) {
|
2013-12-05 13:44:44 +08:00
|
|
|
// We know that all PHIs in non-header blocks are converted into
|
2013-08-27 06:33:26 +08:00
|
|
|
// selects, so we don't have to worry about the insertion order and we
|
|
|
|
// can just use the builder.
|
|
|
|
// At this point we generate the predication tree. There may be
|
|
|
|
// duplications since this is a simple recursive scan, but future
|
|
|
|
// optimizations will clean it up.
|
|
|
|
|
|
|
|
unsigned NumIncoming = P->getNumIncomingValues();
|
|
|
|
|
|
|
|
// Generate a sequence of selects of the form:
|
|
|
|
// SELECT(Mask3, In3,
|
|
|
|
// SELECT(Mask2, In2,
|
|
|
|
// ( ...)))
|
|
|
|
for (unsigned In = 0; In < NumIncoming; In++) {
|
|
|
|
VectorParts Cond = createEdgeMask(P->getIncomingBlock(In),
|
|
|
|
P->getParent());
|
|
|
|
VectorParts &In0 = getVectorValue(P->getIncomingValue(In));
|
|
|
|
|
|
|
|
for (unsigned part = 0; part < UF; ++part) {
|
|
|
|
// We might have single edge PHIs (blocks) - use an identity
|
|
|
|
// 'select' for the first PHI operand.
|
|
|
|
if (In == 0)
|
|
|
|
Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
|
|
|
|
In0[part]);
|
|
|
|
else
|
|
|
|
// Select between the current value and the previous incoming edge
|
|
|
|
// based on the incoming mask.
|
|
|
|
Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
|
|
|
|
Entry[part], "predphi");
|
2012-12-11 05:39:02 +08:00
|
|
|
}
|
2013-08-27 06:33:26 +08:00
|
|
|
}
|
|
|
|
return;
|
|
|
|
}
|
2012-12-11 03:25:06 +08:00
|
|
|
|
2013-08-27 06:33:26 +08:00
|
|
|
// This PHINode must be an induction variable.
|
|
|
|
// Make sure that we know about it.
|
|
|
|
assert(Legal->getInductionVars()->count(P) &&
|
|
|
|
"Not an induction variable");
|
|
|
|
|
|
|
|
LoopVectorizationLegality::InductionInfo II =
|
|
|
|
Legal->getInductionVars()->lookup(P);
|
|
|
|
|
2015-01-30 13:02:21 +08:00
|
|
|
// FIXME: The newly created binary instructions should contain nsw/nuw flags,
|
|
|
|
// which can be found from the original scalar operations.
|
2013-08-27 06:33:26 +08:00
|
|
|
switch (II.IK) {
|
|
|
|
case LoopVectorizationLegality::IK_NoInduction:
|
|
|
|
llvm_unreachable("Unknown induction");
|
|
|
|
case LoopVectorizationLegality::IK_IntInduction: {
|
|
|
|
assert(P->getType() == II.StartValue->getType() && "Types must match");
|
|
|
|
Type *PhiTy = P->getType();
|
|
|
|
Value *Broadcasted;
|
|
|
|
if (P == OldInduction) {
|
|
|
|
// Handle the canonical induction variable. We might have had to
|
|
|
|
// extend the type.
|
|
|
|
Broadcasted = Builder.CreateTrunc(Induction, PhiTy);
|
|
|
|
} else {
|
|
|
|
// Handle other induction variables that are now based on the
|
|
|
|
// canonical one.
|
|
|
|
Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx,
|
|
|
|
"normalized.idx");
|
|
|
|
NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy);
|
2015-01-30 13:02:21 +08:00
|
|
|
Broadcasted = II.transform(Builder, NormalizedIdx);
|
|
|
|
Broadcasted->setName("offset.idx");
|
2013-08-27 06:33:26 +08:00
|
|
|
}
|
|
|
|
Broadcasted = getBroadcastInstrs(Broadcasted);
|
|
|
|
// After broadcasting the induction variable we need to make the vector
|
|
|
|
// consecutive by adding 0, 1, 2, etc.
|
|
|
|
for (unsigned part = 0; part < UF; ++part)
|
2015-01-30 13:02:21 +08:00
|
|
|
Entry[part] = getStepVector(Broadcasted, VF * part, II.StepValue);
|
2013-08-27 06:33:26 +08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
case LoopVectorizationLegality::IK_PtrInduction:
|
|
|
|
// Handle the pointer induction variable case.
|
|
|
|
assert(P->getType()->isPointerTy() && "Unexpected type.");
|
2015-01-30 13:02:21 +08:00
|
|
|
// This is the normalized GEP that starts counting at zero.
|
|
|
|
Value *NormalizedIdx =
|
|
|
|
Builder.CreateSub(Induction, ExtendedIdx, "normalized.idx");
|
2013-08-27 06:33:26 +08:00
|
|
|
// This is the vector of results. Notice that we don't generate
|
|
|
|
// vector geps because scalar geps result in better code.
|
|
|
|
for (unsigned part = 0; part < UF; ++part) {
|
|
|
|
if (VF == 1) {
|
2015-01-30 13:02:21 +08:00
|
|
|
int EltIndex = part;
|
2013-08-27 06:33:26 +08:00
|
|
|
Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex);
|
2015-01-30 13:02:21 +08:00
|
|
|
Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx);
|
|
|
|
Value *SclrGep = II.transform(Builder, GlobalIdx);
|
|
|
|
SclrGep->setName("next.gep");
|
2013-08-27 06:33:26 +08:00
|
|
|
Entry[part] = SclrGep;
|
2012-11-17 08:27:03 +08:00
|
|
|
continue;
|
|
|
|
}
|
2012-12-11 03:25:06 +08:00
|
|
|
|
2013-08-27 06:33:26 +08:00
|
|
|
Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
|
|
|
|
for (unsigned int i = 0; i < VF; ++i) {
|
2015-01-30 13:02:21 +08:00
|
|
|
int EltIndex = i + part * VF;
|
2013-08-27 06:33:26 +08:00
|
|
|
Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex);
|
2015-01-30 13:02:21 +08:00
|
|
|
Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx);
|
|
|
|
Value *SclrGep = II.transform(Builder, GlobalIdx);
|
|
|
|
SclrGep->setName("next.gep");
|
2013-08-27 06:33:26 +08:00
|
|
|
VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
|
|
|
|
Builder.getInt32(i),
|
|
|
|
"insert.gep");
|
2012-11-25 16:41:35 +08:00
|
|
|
}
|
2013-08-27 06:33:26 +08:00
|
|
|
Entry[part] = VecVal;
|
2012-10-18 02:25:06 +08:00
|
|
|
}
|
2013-08-27 06:33:26 +08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
2012-10-18 02:25:06 +08:00
|
|
|
|
2014-01-11 02:20:32 +08:00
|
|
|
void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
|
2013-08-27 06:33:26 +08:00
|
|
|
// For each instruction in the old loop.
|
|
|
|
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
|
|
|
|
VectorParts &Entry = WidenMap.get(it);
|
|
|
|
switch (it->getOpcode()) {
|
|
|
|
case Instruction::Br:
|
|
|
|
// Nothing to do for PHIs and BR, since we already took care of the
|
|
|
|
// loop control flow instructions.
|
|
|
|
continue;
|
2015-01-30 13:02:21 +08:00
|
|
|
case Instruction::PHI: {
|
2013-08-27 06:33:26 +08:00
|
|
|
// Vectorize PHINodes.
|
2014-01-11 02:20:32 +08:00
|
|
|
widenPHIInstruction(it, Entry, UF, VF, PV);
|
2013-08-27 06:33:26 +08:00
|
|
|
continue;
|
2012-12-11 05:39:02 +08:00
|
|
|
}// End of PHI.
|
|
|
|
|
|
|
|
case Instruction::Add:
|
|
|
|
case Instruction::FAdd:
|
|
|
|
case Instruction::Sub:
|
|
|
|
case Instruction::FSub:
|
|
|
|
case Instruction::Mul:
|
|
|
|
case Instruction::FMul:
|
|
|
|
case Instruction::UDiv:
|
|
|
|
case Instruction::SDiv:
|
|
|
|
case Instruction::FDiv:
|
|
|
|
case Instruction::URem:
|
|
|
|
case Instruction::SRem:
|
|
|
|
case Instruction::FRem:
|
|
|
|
case Instruction::Shl:
|
|
|
|
case Instruction::LShr:
|
|
|
|
case Instruction::AShr:
|
|
|
|
case Instruction::And:
|
|
|
|
case Instruction::Or:
|
|
|
|
case Instruction::Xor: {
|
|
|
|
// Just widen binops.
|
|
|
|
BinaryOperator *BinOp = dyn_cast<BinaryOperator>(it);
|
2013-06-29 00:26:54 +08:00
|
|
|
setDebugLocFromInst(Builder, BinOp);
|
2013-01-03 08:52:27 +08:00
|
|
|
VectorParts &A = getVectorValue(it->getOperand(0));
|
|
|
|
VectorParts &B = getVectorValue(it->getOperand(1));
|
2012-12-11 05:39:02 +08:00
|
|
|
|
|
|
|
// Use this vector value for all users of the original instruction.
|
2013-01-03 08:52:27 +08:00
|
|
|
for (unsigned Part = 0; Part < UF; ++Part) {
|
|
|
|
Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]);
|
|
|
|
|
2014-09-02 02:44:57 +08:00
|
|
|
if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
|
2014-09-03 09:06:50 +08:00
|
|
|
VecOp->copyIRFlags(BinOp);
|
2014-10-28 19:53:30 +08:00
|
|
|
|
2013-01-03 08:52:27 +08:00
|
|
|
Entry[Part] = V;
|
2012-10-18 02:25:06 +08:00
|
|
|
}
|
2014-07-19 21:33:16 +08:00
|
|
|
|
|
|
|
propagateMetadata(Entry, it);
|
2012-12-11 05:39:02 +08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
case Instruction::Select: {
|
|
|
|
// Widen selects.
|
|
|
|
// If the selector is loop invariant we can create a select
|
|
|
|
// instruction with a scalar condition. Otherwise, use vector-select.
|
2013-01-03 08:52:27 +08:00
|
|
|
bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(it->getOperand(0)),
|
|
|
|
OrigLoop);
|
2013-06-29 00:26:54 +08:00
|
|
|
setDebugLocFromInst(Builder, it);
|
2012-12-11 05:39:02 +08:00
|
|
|
|
|
|
|
// The condition can be loop invariant but still defined inside the
|
|
|
|
// loop. This means that we can't just use the original 'cond' value.
|
|
|
|
// We have to take the 'vectorized' value and pick the first lane.
|
|
|
|
// Instcombine will make this a no-op.
|
2013-01-03 08:52:27 +08:00
|
|
|
VectorParts &Cond = getVectorValue(it->getOperand(0));
|
|
|
|
VectorParts &Op0 = getVectorValue(it->getOperand(1));
|
|
|
|
VectorParts &Op1 = getVectorValue(it->getOperand(2));
|
2013-08-27 06:33:26 +08:00
|
|
|
|
|
|
|
Value *ScalarCond = (VF == 1) ? Cond[0] :
|
|
|
|
Builder.CreateExtractElement(Cond[0], Builder.getInt32(0));
|
|
|
|
|
2013-01-03 08:52:27 +08:00
|
|
|
for (unsigned Part = 0; Part < UF; ++Part) {
|
|
|
|
Entry[Part] = Builder.CreateSelect(
|
|
|
|
InvariantCond ? ScalarCond : Cond[Part],
|
|
|
|
Op0[Part],
|
|
|
|
Op1[Part]);
|
|
|
|
}
|
2014-07-19 21:33:16 +08:00
|
|
|
|
|
|
|
propagateMetadata(Entry, it);
|
2012-12-11 05:39:02 +08:00
|
|
|
break;
|
|
|
|
}
|
2012-10-18 02:25:06 +08:00
|
|
|
|
2012-12-11 05:39:02 +08:00
|
|
|
case Instruction::ICmp:
|
|
|
|
case Instruction::FCmp: {
|
|
|
|
// Widen compares. Generate vector compares.
|
|
|
|
bool FCmp = (it->getOpcode() == Instruction::FCmp);
|
|
|
|
CmpInst *Cmp = dyn_cast<CmpInst>(it);
|
2013-06-29 00:26:54 +08:00
|
|
|
setDebugLocFromInst(Builder, it);
|
2013-01-03 08:52:27 +08:00
|
|
|
VectorParts &A = getVectorValue(it->getOperand(0));
|
|
|
|
VectorParts &B = getVectorValue(it->getOperand(1));
|
|
|
|
for (unsigned Part = 0; Part < UF; ++Part) {
|
2014-04-25 13:29:35 +08:00
|
|
|
Value *C = nullptr;
|
2013-01-03 08:52:27 +08:00
|
|
|
if (FCmp)
|
|
|
|
C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]);
|
|
|
|
else
|
|
|
|
C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]);
|
|
|
|
Entry[Part] = C;
|
|
|
|
}
|
2014-07-19 21:33:16 +08:00
|
|
|
|
|
|
|
propagateMetadata(Entry, it);
|
2012-12-11 05:39:02 +08:00
|
|
|
break;
|
|
|
|
}
|
2012-11-09 15:09:44 +08:00
|
|
|
|
2013-01-26 05:47:42 +08:00
|
|
|
case Instruction::Store:
|
|
|
|
case Instruction::Load:
|
2014-01-11 02:20:32 +08:00
|
|
|
vectorizeMemoryInstruction(it);
|
2012-10-18 02:25:06 +08:00
|
|
|
break;
|
2012-12-11 05:39:02 +08:00
|
|
|
case Instruction::ZExt:
|
|
|
|
case Instruction::SExt:
|
|
|
|
case Instruction::FPToUI:
|
|
|
|
case Instruction::FPToSI:
|
|
|
|
case Instruction::FPExt:
|
|
|
|
case Instruction::PtrToInt:
|
|
|
|
case Instruction::IntToPtr:
|
|
|
|
case Instruction::SIToFP:
|
|
|
|
case Instruction::UIToFP:
|
|
|
|
case Instruction::Trunc:
|
|
|
|
case Instruction::FPTrunc:
|
|
|
|
case Instruction::BitCast: {
|
|
|
|
CastInst *CI = dyn_cast<CastInst>(it);
|
2013-06-29 00:26:54 +08:00
|
|
|
setDebugLocFromInst(Builder, it);
|
2012-12-12 02:58:10 +08:00
|
|
|
/// Optimize the special case where the source is the induction
|
|
|
|
/// variable. Notice that we can only optimize the 'trunc' case
|
|
|
|
/// because: a. FP conversions lose precision, b. sext/zext may wrap,
|
|
|
|
/// c. other casts depend on pointer size.
|
|
|
|
if (CI->getOperand(0) == OldInduction &&
|
|
|
|
it->getOpcode() == Instruction::Trunc) {
|
|
|
|
Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction,
|
|
|
|
CI->getType());
|
|
|
|
Value *Broadcasted = getBroadcastInstrs(ScalarCast);
|
2015-01-30 13:02:21 +08:00
|
|
|
LoopVectorizationLegality::InductionInfo II =
|
|
|
|
Legal->getInductionVars()->lookup(OldInduction);
|
|
|
|
Constant *Step =
|
|
|
|
ConstantInt::getSigned(CI->getType(), II.StepValue->getSExtValue());
|
2013-01-03 08:52:27 +08:00
|
|
|
for (unsigned Part = 0; Part < UF; ++Part)
|
2015-01-30 13:02:21 +08:00
|
|
|
Entry[Part] = getStepVector(Broadcasted, VF * Part, Step);
|
2014-07-19 21:33:16 +08:00
|
|
|
propagateMetadata(Entry, it);
|
2012-12-12 02:58:10 +08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
/// Vectorize casts.
|
2013-08-27 06:33:26 +08:00
|
|
|
Type *DestTy = (VF == 1) ? CI->getType() :
|
|
|
|
VectorType::get(CI->getType(), VF);
|
2013-01-03 08:52:27 +08:00
|
|
|
|
|
|
|
VectorParts &A = getVectorValue(it->getOperand(0));
|
|
|
|
for (unsigned Part = 0; Part < UF; ++Part)
|
|
|
|
Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy);
|
2014-07-19 21:33:16 +08:00
|
|
|
propagateMetadata(Entry, it);
|
2012-12-11 05:39:02 +08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
case Instruction::Call: {
|
2013-03-10 00:27:27 +08:00
|
|
|
// Ignore dbg intrinsics.
|
|
|
|
if (isa<DbgInfoIntrinsic>(it))
|
2013-03-09 23:56:34 +08:00
|
|
|
break;
|
2013-06-29 00:26:54 +08:00
|
|
|
setDebugLocFromInst(Builder, it);
|
2013-06-28 08:38:54 +08:00
|
|
|
|
2012-12-11 05:39:02 +08:00
|
|
|
Module *M = BB->getParent()->getParent();
|
2013-02-27 23:24:19 +08:00
|
|
|
CallInst *CI = cast<CallInst>(it);
|
2015-03-18 03:46:50 +08:00
|
|
|
|
|
|
|
StringRef FnName = CI->getCalledFunction()->getName();
|
|
|
|
Function *F = CI->getCalledFunction();
|
|
|
|
Type *RetTy = ToVectorTy(CI->getType(), VF);
|
|
|
|
SmallVector<Type *, 4> Tys;
|
|
|
|
for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
|
|
|
|
Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF));
|
|
|
|
|
2013-02-27 23:24:19 +08:00
|
|
|
Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
|
2015-03-18 03:46:50 +08:00
|
|
|
if (ID &&
|
|
|
|
(ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
|
|
|
|
ID == Intrinsic::lifetime_start)) {
|
2013-08-07 06:37:52 +08:00
|
|
|
scalarizeInstruction(it);
|
|
|
|
break;
|
2015-03-18 03:46:50 +08:00
|
|
|
}
|
|
|
|
// The flag shows whether we use Intrinsic or a usual Call for vectorized
|
|
|
|
// version of the instruction.
|
|
|
|
// Is it beneficial to perform intrinsic call compared to lib call?
|
|
|
|
bool NeedToScalarize;
|
|
|
|
unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
|
|
|
|
bool UseVectorIntrinsic =
|
|
|
|
ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
|
|
|
|
if (!UseVectorIntrinsic && NeedToScalarize) {
|
|
|
|
scalarizeInstruction(it);
|
|
|
|
break;
|
|
|
|
}
|
2013-08-27 06:33:26 +08:00
|
|
|
|
2015-03-18 03:46:50 +08:00
|
|
|
for (unsigned Part = 0; Part < UF; ++Part) {
|
|
|
|
SmallVector<Value *, 4> Args;
|
|
|
|
for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
|
|
|
|
Value *Arg = CI->getArgOperand(i);
|
|
|
|
// Some intrinsics have a scalar argument - don't replace it with a
|
|
|
|
// vector.
|
|
|
|
if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) {
|
|
|
|
VectorParts &VectorArg = getVectorValue(CI->getArgOperand(i));
|
|
|
|
Arg = VectorArg[Part];
|
|
|
|
}
|
|
|
|
Args.push_back(Arg);
|
2013-01-03 08:52:27 +08:00
|
|
|
}
|
2014-07-19 21:33:16 +08:00
|
|
|
|
2015-03-18 03:46:50 +08:00
|
|
|
Function *VectorF;
|
|
|
|
if (UseVectorIntrinsic) {
|
|
|
|
// Use vector version of the intrinsic.
|
|
|
|
Type *TysForDecl[] = {CI->getType()};
|
|
|
|
if (VF > 1)
|
|
|
|
TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
|
|
|
|
VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
|
|
|
|
} else {
|
|
|
|
// Use vector version of the library call.
|
|
|
|
StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
|
|
|
|
assert(!VFnName.empty() && "Vector function name is empty.");
|
|
|
|
VectorF = M->getFunction(VFnName);
|
|
|
|
if (!VectorF) {
|
|
|
|
// Generate a declaration
|
|
|
|
FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
|
|
|
|
VectorF =
|
|
|
|
Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
|
|
|
|
VectorF->copyAttributesFrom(F);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
assert(VectorF && "Can't create vector function.");
|
|
|
|
Entry[Part] = Builder.CreateCall(VectorF, Args);
|
2013-01-03 08:52:27 +08:00
|
|
|
}
|
2015-03-18 03:46:50 +08:00
|
|
|
|
|
|
|
propagateMetadata(Entry, it);
|
2012-12-11 05:39:02 +08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
default:
|
|
|
|
// All other instructions are unsupported. Scalarize them.
|
|
|
|
scalarizeInstruction(it);
|
|
|
|
break;
|
2012-10-18 02:25:06 +08:00
|
|
|
}// end of switch.
|
|
|
|
}// end of for_each instr.
|
|
|
|
}
|
|
|
|
|
2012-12-04 05:33:08 +08:00
|
|
|
void InnerLoopVectorizer::updateAnalysis() {
|
2012-11-30 03:25:41 +08:00
|
|
|
// Forget the original basic block.
|
2012-10-22 12:53:05 +08:00
|
|
|
SE->forgetLoop(OrigLoop);
|
2012-10-30 05:52:38 +08:00
|
|
|
|
|
|
|
// Update the dominator tree information.
|
2013-01-19 21:57:58 +08:00
|
|
|
assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
|
2012-10-30 05:52:38 +08:00
|
|
|
"Entry does not dominate exit.");
|
|
|
|
|
2013-01-19 21:57:58 +08:00
|
|
|
for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
|
|
|
|
DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]);
|
|
|
|
DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back());
|
2014-01-28 09:01:53 +08:00
|
|
|
|
|
|
|
// Due to if predication of stores we might create a sequence of "if(pred)
|
|
|
|
// a[i] = ...; " blocks.
|
|
|
|
for (unsigned i = 0, e = LoopVectorBody.size(); i != e; ++i) {
|
|
|
|
if (i == 0)
|
|
|
|
DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader);
|
|
|
|
else if (isPredicatedBlock(i)) {
|
|
|
|
DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-1]);
|
|
|
|
} else {
|
|
|
|
DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-2]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-05-30 06:10:01 +08:00
|
|
|
DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks[1]);
|
|
|
|
DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
|
2012-10-30 05:52:38 +08:00
|
|
|
DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
|
2014-11-01 06:28:03 +08:00
|
|
|
DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
|
2012-10-30 05:52:38 +08:00
|
|
|
|
2014-01-13 21:07:17 +08:00
|
|
|
DEBUG(DT->verifyDomTree());
|
2012-10-18 02:25:06 +08:00
|
|
|
}
|
|
|
|
|
2013-12-17 09:11:01 +08:00
|
|
|
/// \brief Check whether it is safe to if-convert this phi node.
|
|
|
|
///
|
|
|
|
/// Phi nodes with constant expressions that can trap are not safe to if
|
|
|
|
/// convert.
|
|
|
|
static bool canIfConvertPHINodes(BasicBlock *BB) {
|
|
|
|
for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
|
|
|
|
PHINode *Phi = dyn_cast<PHINode>(I);
|
|
|
|
if (!Phi)
|
|
|
|
return true;
|
|
|
|
for (unsigned p = 0, e = Phi->getNumIncomingValues(); p != e; ++p)
|
|
|
|
if (Constant *C = dyn_cast<Constant>(Phi->getIncomingValue(p)))
|
|
|
|
if (C->canTrap())
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2012-12-04 05:06:35 +08:00
|
|
|
bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
|
2014-06-26 01:50:15 +08:00
|
|
|
if (!EnableIfConversion) {
|
2015-02-02 00:56:00 +08:00
|
|
|
emitAnalysis(VectorizationReport() << "if-conversion is disabled");
|
2012-12-04 05:06:35 +08:00
|
|
|
return false;
|
2014-06-26 01:50:15 +08:00
|
|
|
}
|
2012-12-04 05:06:35 +08:00
|
|
|
|
|
|
|
assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
|
|
|
|
|
2013-06-29 04:46:27 +08:00
|
|
|
// A list of pointers that we can safely read and write to.
|
|
|
|
SmallPtrSet<Value *, 8> SafePointes;
|
|
|
|
|
|
|
|
// Collect safe addresses.
|
2013-10-26 11:08:02 +08:00
|
|
|
for (Loop::block_iterator BI = TheLoop->block_begin(),
|
|
|
|
BE = TheLoop->block_end(); BI != BE; ++BI) {
|
|
|
|
BasicBlock *BB = *BI;
|
2013-06-29 04:46:27 +08:00
|
|
|
|
|
|
|
if (blockNeedsPredication(BB))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
|
|
|
|
if (LoadInst *LI = dyn_cast<LoadInst>(I))
|
|
|
|
SafePointes.insert(LI->getPointerOperand());
|
|
|
|
else if (StoreInst *SI = dyn_cast<StoreInst>(I))
|
|
|
|
SafePointes.insert(SI->getPointerOperand());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-12-04 05:06:35 +08:00
|
|
|
// Collect the blocks that need predication.
|
2013-12-17 09:11:01 +08:00
|
|
|
BasicBlock *Header = TheLoop->getHeader();
|
2013-10-26 11:08:02 +08:00
|
|
|
for (Loop::block_iterator BI = TheLoop->block_begin(),
|
|
|
|
BE = TheLoop->block_end(); BI != BE; ++BI) {
|
|
|
|
BasicBlock *BB = *BI;
|
2012-12-04 05:06:35 +08:00
|
|
|
|
2012-12-11 12:55:10 +08:00
|
|
|
// We don't support switch statements inside loops.
|
2014-06-26 01:50:15 +08:00
|
|
|
if (!isa<BranchInst>(BB->getTerminator())) {
|
2015-02-02 00:56:00 +08:00
|
|
|
emitAnalysis(VectorizationReport(BB->getTerminator())
|
2014-06-26 01:50:15 +08:00
|
|
|
<< "loop contains a switch statement");
|
2012-12-11 12:55:10 +08:00
|
|
|
return false;
|
2014-06-26 01:50:15 +08:00
|
|
|
}
|
2012-12-11 12:55:10 +08:00
|
|
|
|
2012-12-04 06:57:09 +08:00
|
|
|
// We must be able to predicate all blocks that need to be predicated.
|
2013-12-17 09:11:01 +08:00
|
|
|
if (blockNeedsPredication(BB)) {
|
2014-06-26 01:50:15 +08:00
|
|
|
if (!blockCanBePredicated(BB, SafePointes)) {
|
2015-02-02 00:56:00 +08:00
|
|
|
emitAnalysis(VectorizationReport(BB->getTerminator())
|
2014-06-26 01:50:15 +08:00
|
|
|
<< "control flow cannot be substituted for a select");
|
2013-12-17 09:11:01 +08:00
|
|
|
return false;
|
2014-06-26 01:50:15 +08:00
|
|
|
}
|
|
|
|
} else if (BB != Header && !canIfConvertPHINodes(BB)) {
|
2015-02-02 00:56:00 +08:00
|
|
|
emitAnalysis(VectorizationReport(BB->getTerminator())
|
2014-06-26 01:50:15 +08:00
|
|
|
<< "control flow cannot be substituted for a select");
|
2012-12-04 05:06:35 +08:00
|
|
|
return false;
|
2014-06-26 01:50:15 +08:00
|
|
|
}
|
2012-12-04 05:06:35 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// We can if-convert this loop.
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2012-10-25 04:36:32 +08:00
|
|
|
bool LoopVectorizationLegality::canVectorize() {
|
2013-05-25 02:05:35 +08:00
|
|
|
// We must have a loop in canonical form. Loops with indirectbr in them cannot
|
|
|
|
// be canonicalized.
|
2014-06-26 01:50:15 +08:00
|
|
|
if (!TheLoop->getLoopPreheader()) {
|
|
|
|
emitAnalysis(
|
2015-02-02 00:56:00 +08:00
|
|
|
VectorizationReport() <<
|
|
|
|
"loop control flow is not understood by vectorizer");
|
2013-05-25 02:05:35 +08:00
|
|
|
return false;
|
2014-06-26 01:50:15 +08:00
|
|
|
}
|
2012-10-18 02:25:06 +08:00
|
|
|
|
2012-12-04 05:06:35 +08:00
|
|
|
// We can only vectorize innermost loops.
|
2015-01-15 19:41:30 +08:00
|
|
|
if (!TheLoop->getSubLoopsVector().empty()) {
|
2015-02-02 00:56:00 +08:00
|
|
|
emitAnalysis(VectorizationReport() << "loop is not the innermost loop");
|
2012-12-04 05:06:35 +08:00
|
|
|
return false;
|
2014-06-26 01:50:15 +08:00
|
|
|
}
|
2012-12-04 05:06:35 +08:00
|
|
|
|
|
|
|
// We must have a single backedge.
|
2014-06-26 01:50:15 +08:00
|
|
|
if (TheLoop->getNumBackEdges() != 1) {
|
|
|
|
emitAnalysis(
|
2015-02-02 00:56:00 +08:00
|
|
|
VectorizationReport() <<
|
|
|
|
"loop control flow is not understood by vectorizer");
|
2012-12-04 05:06:35 +08:00
|
|
|
return false;
|
2014-06-26 01:50:15 +08:00
|
|
|
}
|
2012-12-04 05:06:35 +08:00
|
|
|
|
|
|
|
// We must have a single exiting block.
|
2014-06-26 01:50:15 +08:00
|
|
|
if (!TheLoop->getExitingBlock()) {
|
|
|
|
emitAnalysis(
|
2015-02-02 00:56:00 +08:00
|
|
|
VectorizationReport() <<
|
|
|
|
"loop control flow is not understood by vectorizer");
|
2012-12-04 05:06:35 +08:00
|
|
|
return false;
|
2014-06-26 01:50:15 +08:00
|
|
|
}
|
2012-12-04 05:06:35 +08:00
|
|
|
|
2014-12-03 06:59:06 +08:00
|
|
|
// We only handle bottom-tested loops, i.e. loop in which the condition is
|
|
|
|
// checked at the end of each iteration. With that we can assume that all
|
|
|
|
// instructions in the loop are executed the same number of times.
|
|
|
|
if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
|
|
|
|
emitAnalysis(
|
2015-02-02 00:56:00 +08:00
|
|
|
VectorizationReport() <<
|
|
|
|
"loop control flow is not understood by vectorizer");
|
2014-12-03 06:59:06 +08:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2013-10-12 00:14:39 +08:00
|
|
|
// We need to have a loop header.
|
|
|
|
DEBUG(dbgs() << "LV: Found a loop: " <<
|
|
|
|
TheLoop->getHeader()->getName() << '\n');
|
2012-12-04 05:06:35 +08:00
|
|
|
|
2013-12-05 13:44:44 +08:00
|
|
|
// Check if we can if-convert non-single-bb loops.
|
2013-10-12 00:14:39 +08:00
|
|
|
unsigned NumBlocks = TheLoop->getNumBlocks();
|
2012-12-04 05:06:35 +08:00
|
|
|
if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
|
|
|
|
DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
|
2012-10-25 04:36:32 +08:00
|
|
|
return false;
|
2012-10-18 02:25:06 +08:00
|
|
|
}
|
|
|
|
|
2012-10-18 13:29:12 +08:00
|
|
|
// ScalarEvolution needs to be able to find the exit count.
|
2013-06-01 05:48:56 +08:00
|
|
|
const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop);
|
2012-10-18 13:29:12 +08:00
|
|
|
if (ExitCount == SE->getCouldNotCompute()) {
|
2015-02-02 00:56:00 +08:00
|
|
|
emitAnalysis(VectorizationReport() <<
|
|
|
|
"could not determine number of loop iterations");
|
2012-10-18 13:29:12 +08:00
|
|
|
DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
|
2012-10-25 04:36:32 +08:00
|
|
|
return false;
|
2012-10-18 13:29:12 +08:00
|
|
|
}
|
|
|
|
|
2012-12-04 05:06:35 +08:00
|
|
|
// Check if we can vectorize the instructions and CFG in this loop.
|
2012-12-04 06:57:09 +08:00
|
|
|
if (!canVectorizeInstrs()) {
|
2012-12-04 05:06:35 +08:00
|
|
|
DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2012-11-09 15:09:44 +08:00
|
|
|
// Go over each instruction and look at memory deps.
|
2012-12-04 05:06:35 +08:00
|
|
|
if (!canVectorizeMemory()) {
|
|
|
|
DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
|
2012-11-09 15:09:44 +08:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2012-12-04 06:57:09 +08:00
|
|
|
// Collect all of the variables that remain uniform after vectorization.
|
2012-12-04 05:06:35 +08:00
|
|
|
collectLoopUniforms();
|
|
|
|
|
2015-02-20 03:14:34 +08:00
|
|
|
DEBUG(dbgs() << "LV: We can vectorize this loop" <<
|
2015-02-20 03:15:04 +08:00
|
|
|
(LAI->getRuntimePointerCheck()->Need ? " (with a runtime bound check)" :
|
2015-02-20 03:14:34 +08:00
|
|
|
"")
|
|
|
|
<<"!\n");
|
2012-10-18 13:29:12 +08:00
|
|
|
|
|
|
|
// Okay! We can vectorize. At this point we don't have any other mem analysis
|
2012-10-25 04:36:32 +08:00
|
|
|
// which may limit our maximum vectorization factor, so just return true with
|
|
|
|
// no restrictions.
|
|
|
|
return true;
|
2012-10-18 02:25:06 +08:00
|
|
|
}
|
|
|
|
|
2014-02-25 07:12:18 +08:00
|
|
|
/// Map \p Ty to the integer type used for induction-width computations.
///
/// Pointer types become the DataLayout's pointer-sized integer type. Types
/// whose scalar size is below 32 bits are widened to i32, since char- or
/// short-sized values may overflow when the loop trip count is computed.
/// Any other type is returned unchanged.
static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
  if (Ty->isPointerTy())
    return DL.getIntPtrType(Ty);

  const bool IsNarrow = Ty->getScalarSizeInBits() < 32;
  if (IsNarrow)
    return Type::getInt32Ty(Ty->getContext());

  return Ty;
}
|
|
|
|
|
2014-02-25 07:12:18 +08:00
|
|
|
/// Return whichever of \p Ty0 and \p Ty1 has the larger scalar bit width after
/// both have been passed through convertPointerToIntegerType(). When the
/// widths are equal the converted \p Ty1 is returned.
static Type* getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
  Type *Lhs = convertPointerToIntegerType(DL, Ty0);
  Type *Rhs = convertPointerToIntegerType(DL, Ty1);
  const bool LhsIsWider =
      Lhs->getScalarSizeInBits() > Rhs->getScalarSizeInBits();
  return LhsIsWider ? Lhs : Rhs;
}
|
|
|
|
|
2013-06-01 03:53:50 +08:00
|
|
|
/// \brief Check that the instruction has outside loop users and is not an
|
|
|
|
/// identified reduction variable.
|
|
|
|
static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
|
2014-08-21 13:55:13 +08:00
|
|
|
SmallPtrSetImpl<Value *> &Reductions) {
|
2013-06-01 03:53:50 +08:00
|
|
|
// Reduction instructions are allowed to have exit users. All other
|
|
|
|
// instructions must not have external users.
|
|
|
|
if (!Reductions.count(Inst))
|
|
|
|
//Check that all of the users of the loop are inside the BB.
|
2014-03-09 11:16:01 +08:00
|
|
|
for (User *U : Inst->users()) {
|
|
|
|
Instruction *UI = cast<Instruction>(U);
|
2013-06-01 03:53:50 +08:00
|
|
|
// This user may be a reduction exit value.
|
2014-03-09 11:16:01 +08:00
|
|
|
if (!TheLoop->contains(UI)) {
|
|
|
|
DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n');
|
2013-06-01 03:53:50 +08:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2012-12-04 06:57:09 +08:00
|
|
|
bool LoopVectorizationLegality::canVectorizeInstrs() {
|
2012-11-17 08:27:03 +08:00
|
|
|
BasicBlock *PreHeader = TheLoop->getLoopPreheader();
|
2012-12-04 05:06:35 +08:00
|
|
|
BasicBlock *Header = TheLoop->getHeader();
|
2012-11-17 08:27:03 +08:00
|
|
|
|
2013-05-05 09:54:48 +08:00
|
|
|
// Look for the attribute signaling the absence of NaNs.
|
|
|
|
Function &F = *Header->getParent();
|
2015-03-10 10:37:25 +08:00
|
|
|
const DataLayout &DL = F.getParent()->getDataLayout();
|
2013-05-05 09:54:48 +08:00
|
|
|
if (F.hasFnAttribute("no-nans-fp-math"))
|
2015-02-14 09:11:29 +08:00
|
|
|
HasFunNoNaNAttr =
|
|
|
|
F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
|
2013-05-05 09:54:48 +08:00
|
|
|
|
2012-12-04 06:57:09 +08:00
|
|
|
// For each block in the loop.
|
2012-12-04 05:06:35 +08:00
|
|
|
for (Loop::block_iterator bb = TheLoop->block_begin(),
|
|
|
|
be = TheLoop->block_end(); bb != be; ++bb) {
|
2012-10-18 02:25:06 +08:00
|
|
|
|
2012-12-04 05:06:35 +08:00
|
|
|
// Scan the instructions in the block and look for hazards.
|
2012-12-04 06:57:09 +08:00
|
|
|
for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
|
|
|
|
++it) {
|
2012-11-17 08:27:03 +08:00
|
|
|
|
2012-12-04 06:57:09 +08:00
|
|
|
if (PHINode *Phi = dyn_cast<PHINode>(it)) {
|
2013-05-12 07:04:26 +08:00
|
|
|
Type *PhiTy = Phi->getType();
|
2012-12-11 03:25:06 +08:00
|
|
|
// Check that this PHI type is allowed.
|
2013-05-12 07:04:26 +08:00
|
|
|
if (!PhiTy->isIntegerTy() &&
|
|
|
|
!PhiTy->isFloatingPointTy() &&
|
|
|
|
!PhiTy->isPointerTy()) {
|
2015-02-02 00:56:00 +08:00
|
|
|
emitAnalysis(VectorizationReport(it)
|
2014-06-26 01:50:15 +08:00
|
|
|
<< "loop control flow is not understood by vectorizer");
|
2012-12-11 03:25:06 +08:00
|
|
|
DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2012-12-04 05:06:35 +08:00
|
|
|
// If this PHINode is not in the header block, then we know that we
|
2012-12-11 03:25:06 +08:00
|
|
|
// can convert it to select during if-conversion. No need to check if
|
|
|
|
// the PHIs in this block are induction or reduction variables.
|
2013-06-01 03:53:50 +08:00
|
|
|
if (*bb != Header) {
|
|
|
|
// Check that this instruction has no outside users or is an
|
|
|
|
// identified reduction value with an outside user.
|
2014-06-26 01:50:15 +08:00
|
|
|
if (!hasOutsideLoopUser(TheLoop, it, AllowedExit))
|
2013-06-01 03:53:50 +08:00
|
|
|
continue;
|
2015-02-02 00:56:00 +08:00
|
|
|
emitAnalysis(VectorizationReport(it) <<
|
|
|
|
"value could not be identified as "
|
|
|
|
"an induction or reduction variable");
|
2013-06-01 03:53:50 +08:00
|
|
|
return false;
|
|
|
|
}
|
2012-10-20 16:26:33 +08:00
|
|
|
|
2015-01-14 11:02:16 +08:00
|
|
|
// We only allow if-converted PHIs with exactly two incoming values.
|
2013-05-04 01:42:55 +08:00
|
|
|
if (Phi->getNumIncomingValues() != 2) {
|
2015-02-02 00:56:00 +08:00
|
|
|
emitAnalysis(VectorizationReport(it)
|
2014-06-26 01:50:15 +08:00
|
|
|
<< "control flow not understood by vectorizer");
|
2013-05-04 01:42:55 +08:00
|
|
|
DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2012-12-04 05:06:35 +08:00
|
|
|
// This is the value coming from the preheader.
|
|
|
|
Value *StartValue = Phi->getIncomingValueForBlock(PreHeader);
|
2015-01-30 13:02:21 +08:00
|
|
|
ConstantInt *StepValue = nullptr;
|
2012-12-11 03:25:06 +08:00
|
|
|
// Check if this is an induction variable.
|
2015-01-30 13:02:21 +08:00
|
|
|
InductionKind IK = isInductionVariable(Phi, StepValue);
|
2012-12-11 03:25:06 +08:00
|
|
|
|
2013-01-09 01:23:17 +08:00
|
|
|
if (IK_NoInduction != IK) {
|
2013-05-12 07:04:28 +08:00
|
|
|
// Get the widest type.
|
|
|
|
if (!WidestIndTy)
|
2015-03-10 10:37:25 +08:00
|
|
|
WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
|
2013-05-12 07:04:28 +08:00
|
|
|
else
|
2015-03-10 10:37:25 +08:00
|
|
|
WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
|
2013-05-12 07:04:28 +08:00
|
|
|
|
2012-12-11 03:25:06 +08:00
|
|
|
// Int inductions are special because we only allow one IV.
|
2015-01-30 13:02:21 +08:00
|
|
|
if (IK == IK_IntInduction && StepValue->isOne()) {
|
2013-05-14 08:21:18 +08:00
|
|
|
// Use the phi node with the widest type as induction. Use the last
|
|
|
|
// one if there are multiple (no good reason for doing this other
|
|
|
|
// than it is expedient).
|
|
|
|
if (!Induction || PhiTy == WidestIndTy)
|
|
|
|
Induction = Phi;
|
2012-12-11 03:25:06 +08:00
|
|
|
}
|
2012-12-04 05:06:35 +08:00
|
|
|
|
2012-12-11 03:25:06 +08:00
|
|
|
DEBUG(dbgs() << "LV: Found an induction variable.\n");
|
2015-01-30 13:02:21 +08:00
|
|
|
Inductions[Phi] = InductionInfo(StartValue, IK, StepValue);
|
2013-09-17 00:17:24 +08:00
|
|
|
|
|
|
|
// Until we explicitly handle the case of an induction variable with
|
|
|
|
// an outside loop user we have to give up vectorizing this loop.
|
2014-06-26 01:50:15 +08:00
|
|
|
if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) {
|
2015-02-02 00:56:00 +08:00
|
|
|
emitAnalysis(VectorizationReport(it) <<
|
|
|
|
"use of induction value outside of the "
|
|
|
|
"loop is not handled by vectorizer");
|
2013-09-17 00:17:24 +08:00
|
|
|
return false;
|
2014-06-26 01:50:15 +08:00
|
|
|
}
|
2013-09-17 00:17:24 +08:00
|
|
|
|
2012-12-04 05:06:35 +08:00
|
|
|
continue;
|
2012-10-20 16:26:33 +08:00
|
|
|
}
|
2012-10-21 10:38:01 +08:00
|
|
|
|
2015-04-20 12:38:33 +08:00
|
|
|
if (ReductionDescriptor::isReductionPHI(Phi, TheLoop,
|
|
|
|
Reductions[Phi])) {
|
|
|
|
AllowedExit.insert(Reductions[Phi].getLoopExitInstr());
|
2013-05-05 09:54:48 +08:00
|
|
|
continue;
|
|
|
|
}
|
2012-10-18 02:25:06 +08:00
|
|
|
|
2015-02-02 00:56:00 +08:00
|
|
|
emitAnalysis(VectorizationReport(it) <<
|
|
|
|
"value that could not be identified as "
|
|
|
|
"reduction is used outside the loop");
|
2012-12-04 05:06:35 +08:00
|
|
|
DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
|
|
|
|
return false;
|
|
|
|
}// end of PHI handling
|
2012-10-18 02:25:06 +08:00
|
|
|
|
2015-03-18 03:46:50 +08:00
|
|
|
// We handle calls that:
|
|
|
|
// * Are debug info intrinsics.
|
|
|
|
// * Have a mapping to an IR intrinsic.
|
|
|
|
// * Have a vector version available.
|
2012-12-04 06:57:09 +08:00
|
|
|
CallInst *CI = dyn_cast<CallInst>(it);
|
2015-03-18 03:46:50 +08:00
|
|
|
if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI) &&
|
|
|
|
!(CI->getCalledFunction() && TLI &&
|
|
|
|
TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) {
|
2015-02-02 00:56:00 +08:00
|
|
|
emitAnalysis(VectorizationReport(it) <<
|
|
|
|
"call instruction cannot be vectorized");
|
2015-03-18 03:46:50 +08:00
|
|
|
DEBUG(dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n");
|
2013-09-23 22:54:39 +08:00
|
|
|
return false;
|
2012-12-04 05:06:35 +08:00
|
|
|
}
|
2012-10-20 07:05:40 +08:00
|
|
|
|
2014-05-30 12:31:24 +08:00
|
|
|
// Intrinsics such as powi,cttz and ctlz are legal to vectorize if the
|
|
|
|
// second argument is the same (i.e. loop invariant)
|
|
|
|
if (CI &&
|
|
|
|
hasVectorInstrinsicScalarOpd(getIntrinsicIDForCall(CI, TLI), 1)) {
|
|
|
|
if (!SE->isLoopInvariant(SE->getSCEV(CI->getOperand(1)), TheLoop)) {
|
2015-02-02 00:56:00 +08:00
|
|
|
emitAnalysis(VectorizationReport(it)
|
2014-06-26 01:50:15 +08:00
|
|
|
<< "intrinsic instruction cannot be vectorized");
|
2014-05-30 12:31:24 +08:00
|
|
|
DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n");
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-12-24 17:14:18 +08:00
|
|
|
// Check that the instruction return type is vectorizable.
|
2013-10-26 04:40:15 +08:00
|
|
|
// Also, we can't vectorize extractelement instructions.
|
|
|
|
if ((!VectorType::isValidElementType(it->getType()) &&
|
|
|
|
!it->getType()->isVoidTy()) || isa<ExtractElementInst>(it)) {
|
2015-02-02 00:56:00 +08:00
|
|
|
emitAnalysis(VectorizationReport(it)
|
2014-06-26 01:50:15 +08:00
|
|
|
<< "instruction return type cannot be vectorized");
|
2013-10-03 04:04:29 +08:00
|
|
|
DEBUG(dbgs() << "LV: Found unvectorizable type.\n");
|
2012-12-04 05:06:35 +08:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2012-12-24 17:14:18 +08:00
|
|
|
// Check that the stored type is vectorizable.
|
|
|
|
if (StoreInst *ST = dyn_cast<StoreInst>(it)) {
|
|
|
|
Type *T = ST->getValueOperand()->getType();
|
2014-06-26 01:50:15 +08:00
|
|
|
if (!VectorType::isValidElementType(T)) {
|
2015-02-02 00:56:00 +08:00
|
|
|
emitAnalysis(VectorizationReport(ST) <<
|
|
|
|
"store instruction cannot be vectorized");
|
2012-12-24 17:14:18 +08:00
|
|
|
return false;
|
2014-06-26 01:50:15 +08:00
|
|
|
}
|
2014-01-11 02:20:32 +08:00
|
|
|
if (EnableMemAccessVersioning)
|
2015-01-14 11:02:16 +08:00
|
|
|
collectStridedAccess(ST);
|
2012-12-24 17:14:18 +08:00
|
|
|
}
|
|
|
|
|
2014-01-11 02:20:32 +08:00
|
|
|
if (EnableMemAccessVersioning)
|
|
|
|
if (LoadInst *LI = dyn_cast<LoadInst>(it))
|
2015-01-14 11:02:16 +08:00
|
|
|
collectStridedAccess(LI);
|
2014-01-11 02:20:32 +08:00
|
|
|
|
2012-12-04 05:06:35 +08:00
|
|
|
// Reduction instructions are allowed to have exit users.
|
|
|
|
// All other instructions must not have external users.
|
2014-06-26 01:50:15 +08:00
|
|
|
if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) {
|
2015-02-02 00:56:00 +08:00
|
|
|
emitAnalysis(VectorizationReport(it) <<
|
|
|
|
"value cannot be used outside the loop");
|
2013-06-01 03:53:50 +08:00
|
|
|
return false;
|
2014-06-26 01:50:15 +08:00
|
|
|
}
|
2013-06-01 03:53:50 +08:00
|
|
|
|
2012-12-04 05:06:35 +08:00
|
|
|
} // next instr.
|
|
|
|
|
|
|
|
}
|
2012-10-18 02:25:06 +08:00
|
|
|
|
2012-10-20 07:05:40 +08:00
|
|
|
if (!Induction) {
|
2012-11-25 16:41:35 +08:00
|
|
|
DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
|
2014-06-26 01:50:15 +08:00
|
|
|
if (Inductions.empty()) {
|
2015-02-02 00:56:00 +08:00
|
|
|
emitAnalysis(VectorizationReport()
|
2014-06-26 01:50:15 +08:00
|
|
|
<< "loop induction variable could not be identified");
|
2013-05-09 08:32:18 +08:00
|
|
|
return false;
|
2014-06-26 01:50:15 +08:00
|
|
|
}
|
2012-10-18 13:29:12 +08:00
|
|
|
}
|
|
|
|
|
2012-12-04 05:06:35 +08:00
|
|
|
return true;
|
|
|
|
}
|
2012-10-27 07:49:28 +08:00
|
|
|
|
2014-01-11 02:20:32 +08:00
|
|
|
///\brief Remove GEPs whose indices but the last one are loop invariant and
|
|
|
|
/// return the induction operand of the gep pointer.
|
2015-03-10 10:37:25 +08:00
|
|
|
static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {
|
2014-01-11 02:20:32 +08:00
|
|
|
GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
|
|
|
|
if (!GEP)
|
|
|
|
return Ptr;
|
|
|
|
|
2015-03-10 10:37:25 +08:00
|
|
|
unsigned InductionOperand = getGEPInductionOperand(GEP);
|
2014-01-11 02:20:32 +08:00
|
|
|
|
|
|
|
// Check that all of the gep indices are uniform except for our induction
|
|
|
|
// operand.
|
|
|
|
for (unsigned i = 0, e = GEP->getNumOperands(); i != e; ++i)
|
|
|
|
if (i != InductionOperand &&
|
|
|
|
!SE->isLoopInvariant(SE->getSCEV(GEP->getOperand(i)), Lp))
|
|
|
|
return Ptr;
|
|
|
|
return GEP->getOperand(InductionOperand);
|
|
|
|
}
|
|
|
|
|
|
|
|
///\brief Look for a cast use of the passed value.
|
|
|
|
static Value *getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty) {
|
2014-04-25 13:29:35 +08:00
|
|
|
Value *UniqueCast = nullptr;
|
2014-03-09 11:16:01 +08:00
|
|
|
for (User *U : Ptr->users()) {
|
|
|
|
CastInst *CI = dyn_cast<CastInst>(U);
|
2014-01-11 02:20:32 +08:00
|
|
|
if (CI && CI->getType() == Ty) {
|
|
|
|
if (!UniqueCast)
|
|
|
|
UniqueCast = CI;
|
|
|
|
else
|
2014-04-25 13:29:35 +08:00
|
|
|
return nullptr;
|
2014-01-11 02:20:32 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
return UniqueCast;
|
|
|
|
}
|
|
|
|
|
|
|
|
///\brief Get the stride of a pointer access in a loop.
|
|
|
|
/// Looks for symbolic strides "a[i*stride]". Returns the symbolic stride as a
|
|
|
|
/// pointer to the Value, or null otherwise.
|
2015-03-10 10:37:25 +08:00
|
|
|
static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {
|
2014-01-11 02:20:32 +08:00
|
|
|
const PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
|
|
|
|
if (!PtrTy || PtrTy->isAggregateType())
|
2014-04-25 13:29:35 +08:00
|
|
|
return nullptr;
|
2014-01-11 02:20:32 +08:00
|
|
|
|
|
|
|
// Try to remove a gep instruction to make the pointer (actually index at this
|
|
|
|
// point) easier analyzable. If OrigPtr is equal to Ptr we are analzying the
|
|
|
|
// pointer, otherwise, we are analyzing the index.
|
|
|
|
Value *OrigPtr = Ptr;
|
|
|
|
|
|
|
|
// The size of the pointer access.
|
|
|
|
int64_t PtrAccessSize = 1;
|
|
|
|
|
2015-03-10 10:37:25 +08:00
|
|
|
Ptr = stripGetElementPtr(Ptr, SE, Lp);
|
2014-01-11 02:20:32 +08:00
|
|
|
const SCEV *V = SE->getSCEV(Ptr);
|
|
|
|
|
|
|
|
if (Ptr != OrigPtr)
|
|
|
|
// Strip off casts.
|
|
|
|
while (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(V))
|
|
|
|
V = C->getOperand();
|
|
|
|
|
|
|
|
const SCEVAddRecExpr *S = dyn_cast<SCEVAddRecExpr>(V);
|
|
|
|
if (!S)
|
2014-04-25 13:29:35 +08:00
|
|
|
return nullptr;
|
2014-01-11 02:20:32 +08:00
|
|
|
|
|
|
|
V = S->getStepRecurrence(*SE);
|
|
|
|
if (!V)
|
2014-04-25 13:29:35 +08:00
|
|
|
return nullptr;
|
2014-01-11 02:20:32 +08:00
|
|
|
|
|
|
|
// Strip off the size of access multiplication if we are still analyzing the
|
|
|
|
// pointer.
|
|
|
|
if (OrigPtr == Ptr) {
|
2015-03-10 10:37:25 +08:00
|
|
|
const DataLayout &DL = Lp->getHeader()->getModule()->getDataLayout();
|
|
|
|
DL.getTypeAllocSize(PtrTy->getElementType());
|
2014-01-11 02:20:32 +08:00
|
|
|
if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(V)) {
|
|
|
|
if (M->getOperand(0)->getSCEVType() != scConstant)
|
2014-04-25 13:29:35 +08:00
|
|
|
return nullptr;
|
2014-01-11 02:20:32 +08:00
|
|
|
|
|
|
|
const APInt &APStepVal =
|
|
|
|
cast<SCEVConstant>(M->getOperand(0))->getValue()->getValue();
|
|
|
|
|
|
|
|
// Huge step value - give up.
|
|
|
|
if (APStepVal.getBitWidth() > 64)
|
2014-04-25 13:29:35 +08:00
|
|
|
return nullptr;
|
2014-01-11 02:20:32 +08:00
|
|
|
|
|
|
|
int64_t StepVal = APStepVal.getSExtValue();
|
|
|
|
if (PtrAccessSize != StepVal)
|
2014-04-25 13:29:35 +08:00
|
|
|
return nullptr;
|
2014-01-11 02:20:32 +08:00
|
|
|
V = M->getOperand(1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Strip off casts.
|
2014-04-25 13:29:35 +08:00
|
|
|
Type *StripedOffRecurrenceCast = nullptr;
|
2014-01-11 02:20:32 +08:00
|
|
|
if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(V)) {
|
|
|
|
StripedOffRecurrenceCast = C->getType();
|
|
|
|
V = C->getOperand();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Look for the loop invariant symbolic value.
|
|
|
|
const SCEVUnknown *U = dyn_cast<SCEVUnknown>(V);
|
|
|
|
if (!U)
|
2014-04-25 13:29:35 +08:00
|
|
|
return nullptr;
|
2014-01-11 02:20:32 +08:00
|
|
|
|
|
|
|
Value *Stride = U->getValue();
|
|
|
|
if (!Lp->isLoopInvariant(Stride))
|
2014-04-25 13:29:35 +08:00
|
|
|
return nullptr;
|
2014-01-11 02:20:32 +08:00
|
|
|
|
|
|
|
// If we have stripped off the recurrence cast we have to make sure that we
|
|
|
|
// return the value that is used in this loop so that we can replace it later.
|
|
|
|
if (StripedOffRecurrenceCast)
|
|
|
|
Stride = getUniqueCastUse(Stride, Lp, StripedOffRecurrenceCast);
|
|
|
|
|
|
|
|
return Stride;
|
|
|
|
}
|
|
|
|
|
2015-01-14 11:02:16 +08:00
|
|
|
/// Record the symbolic stride of a load or store, if it has one.
///
/// For a LoadInst or StoreInst \p MemAccess, asks getStrideFromPointer() for
/// the stride of its pointer operand and, when one is found, stores it in
/// Strides (keyed by the pointer) and in StrideSet. Any other kind of value
/// is ignored.
void LoopVectorizationLegality::collectStridedAccess(Value *MemAccess) {
  Value *Ptr = nullptr;
  if (LoadInst *Load = dyn_cast<LoadInst>(MemAccess))
    Ptr = Load->getPointerOperand();
  else if (StoreInst *Store = dyn_cast<StoreInst>(MemAccess))
    Ptr = Store->getPointerOperand();

  // Not a load or store: nothing to collect.
  if (!Ptr)
    return;

  if (Value *Stride = getStrideFromPointer(Ptr, SE, TheLoop)) {
    DEBUG(dbgs() << "LV: Found a strided access that we can version");
    DEBUG(dbgs() << "  Ptr: " << *Ptr << " Stride: " << *Stride << "\n");
    Strides[Ptr] = Stride;
    StrideSet.insert(Stride);
  }
}
|
|
|
|
|
2012-12-04 05:06:35 +08:00
|
|
|
void LoopVectorizationLegality::collectLoopUniforms() {
|
2012-10-27 07:49:28 +08:00
|
|
|
// We now know that the loop is vectorizable!
|
|
|
|
// Collect variables that will remain uniform after vectorization.
|
|
|
|
std::vector<Value*> Worklist;
|
2012-12-04 05:06:35 +08:00
|
|
|
BasicBlock *Latch = TheLoop->getLoopLatch();
|
|
|
|
|
2012-10-27 07:49:28 +08:00
|
|
|
// Start with the conditional branch and walk up the block.
|
2012-12-04 05:06:35 +08:00
|
|
|
Worklist.push_back(Latch->getTerminator()->getOperand(0));
|
2012-10-27 07:49:28 +08:00
|
|
|
|
2014-04-02 10:34:49 +08:00
|
|
|
// Also add all consecutive pointer values; these values will be uniform
|
|
|
|
// after vectorization (and subsequent cleanup) and, until revectorization is
|
|
|
|
// supported, all dependencies must also be uniform.
|
|
|
|
for (Loop::block_iterator B = TheLoop->block_begin(),
|
|
|
|
BE = TheLoop->block_end(); B != BE; ++B)
|
|
|
|
for (BasicBlock::iterator I = (*B)->begin(), IE = (*B)->end();
|
|
|
|
I != IE; ++I)
|
|
|
|
if (I->getType()->isPointerTy() && isConsecutivePtr(I))
|
|
|
|
Worklist.insert(Worklist.end(), I->op_begin(), I->op_end());
|
|
|
|
|
2015-01-15 19:41:30 +08:00
|
|
|
while (!Worklist.empty()) {
|
2012-10-27 07:49:28 +08:00
|
|
|
Instruction *I = dyn_cast<Instruction>(Worklist.back());
|
|
|
|
Worklist.pop_back();
|
|
|
|
|
2012-12-04 05:06:35 +08:00
|
|
|
// Look at instructions inside this loop.
|
|
|
|
// Stop when reaching PHI nodes.
|
2012-12-04 06:57:09 +08:00
|
|
|
// TODO: we need to follow values all over the loop, not only in this block.
|
2012-12-04 05:06:35 +08:00
|
|
|
if (!I || !TheLoop->contains(I) || isa<PHINode>(I))
|
2012-11-25 16:41:35 +08:00
|
|
|
continue;
|
2012-10-27 07:49:28 +08:00
|
|
|
|
|
|
|
// This is a known uniform.
|
|
|
|
Uniforms.insert(I);
|
|
|
|
|
|
|
|
// Insert all operands.
|
2013-05-17 22:48:17 +08:00
|
|
|
Worklist.insert(Worklist.end(), I->op_begin(), I->op_end());
|
2012-10-27 07:49:28 +08:00
|
|
|
}
|
2012-10-20 12:59:06 +08:00
|
|
|
}
|
|
|
|
|
2015-02-02 00:56:04 +08:00
|
|
|
bool LoopVectorizationLegality::canVectorizeMemory() {
|
2015-02-20 03:15:04 +08:00
|
|
|
LAI = &LAA->getInfo(TheLoop, Strides);
|
|
|
|
auto &OptionalReport = LAI->getReport();
|
2015-02-20 03:14:56 +08:00
|
|
|
if (OptionalReport)
|
2015-02-20 03:15:15 +08:00
|
|
|
emitAnalysis(VectorizationReport(*OptionalReport));
|
2015-03-11 02:54:23 +08:00
|
|
|
if (!LAI->canVectorizeMemory())
|
|
|
|
return false;
|
|
|
|
|
2015-04-09 01:48:40 +08:00
|
|
|
if (LAI->hasStoreToLoopInvariantAddress()) {
|
|
|
|
emitAnalysis(
|
|
|
|
VectorizationReport()
|
|
|
|
<< "write to a loop invariant address could not be vectorized");
|
|
|
|
DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2015-03-11 02:54:23 +08:00
|
|
|
if (LAI->getNumRuntimePointerChecks() >
|
|
|
|
VectorizerParams::RuntimeMemoryCheckThreshold) {
|
|
|
|
emitAnalysis(VectorizationReport()
|
|
|
|
<< LAI->getNumRuntimePointerChecks() << " exceeds limit of "
|
|
|
|
<< VectorizerParams::RuntimeMemoryCheckThreshold
|
|
|
|
<< " dependent memory operations checked at runtime");
|
|
|
|
DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
return true;
|
2015-02-02 00:56:04 +08:00
|
|
|
}
|
|
|
|
|
2015-03-27 11:44:15 +08:00
|
|
|
/// Classify \p Phi as an induction variable.
///
/// \param Phi       the phi node to classify.
/// \param StepValue passed through to isInductionPHI(), which receives it by
///                  reference.
/// \returns IK_NoInduction when isInductionPHI() rejects the phi; otherwise
/// IK_IntInduction for integer-typed phis and IK_PtrInduction for all others.
LoopVectorizationLegality::InductionKind
LoopVectorizationLegality::isInductionVariable(PHINode *Phi,
                                               ConstantInt *&StepValue) {
  if (!isInductionPHI(Phi, SE, StepValue))
    return IK_NoInduction;

  // An induction phi is either an integer or a pointer induction.
  const bool IsInteger = Phi->getType()->isIntegerTy();
  return IsInteger ? IK_IntInduction : IK_PtrInduction;
}
|
|
|
|
|
2012-12-13 08:21:03 +08:00
|
|
|
bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
|
|
|
|
Value *In0 = const_cast<Value*>(V);
|
|
|
|
PHINode *PN = dyn_cast_or_null<PHINode>(In0);
|
|
|
|
if (!PN)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
return Inductions.count(PN);
|
|
|
|
}
|
|
|
|
|
2015-02-02 00:56:04 +08:00
|
|
|
/// Return whether \p BB needs predication, as decided by
/// LoopAccessInfo::blockNeedsPredication() for TheLoop and DT.
bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
  const bool NeedsPredication =
      LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
  return NeedsPredication;
}
|
|
|
|
|
2013-06-29 04:46:27 +08:00
|
|
|
bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,
|
2014-08-21 13:55:13 +08:00
|
|
|
SmallPtrSetImpl<Value *> &SafePtrs) {
|
2014-12-16 19:50:42 +08:00
|
|
|
|
2012-12-04 05:06:35 +08:00
|
|
|
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
|
2014-12-16 19:50:42 +08:00
|
|
|
// Check that we don't have a constant expression that can trap as operand.
|
|
|
|
for (Instruction::op_iterator OI = it->op_begin(), OE = it->op_end();
|
|
|
|
OI != OE; ++OI) {
|
|
|
|
if (Constant *C = dyn_cast<Constant>(*OI))
|
|
|
|
if (C->canTrap())
|
|
|
|
return false;
|
|
|
|
}
|
2013-05-15 09:44:30 +08:00
|
|
|
// We might be able to hoist the load.
|
2013-06-29 04:46:27 +08:00
|
|
|
if (it->mayReadFromMemory()) {
|
|
|
|
LoadInst *LI = dyn_cast<LoadInst>(it);
|
2014-12-16 19:50:42 +08:00
|
|
|
if (!LI)
|
|
|
|
return false;
|
|
|
|
if (!SafePtrs.count(LI->getPointerOperand())) {
|
|
|
|
if (isLegalMaskedLoad(LI->getType(), LI->getPointerOperand())) {
|
|
|
|
MaskedOp.insert(LI);
|
|
|
|
continue;
|
|
|
|
}
|
2013-06-29 04:46:27 +08:00
|
|
|
return false;
|
2014-12-16 19:50:42 +08:00
|
|
|
}
|
2013-06-29 04:46:27 +08:00
|
|
|
}
|
2013-05-15 09:44:30 +08:00
|
|
|
|
2013-05-15 10:02:45 +08:00
|
|
|
// We don't predicate stores at the moment.
|
2014-01-28 09:01:53 +08:00
|
|
|
if (it->mayWriteToMemory()) {
|
|
|
|
StoreInst *SI = dyn_cast<StoreInst>(it);
|
|
|
|
// We only support predication of stores in basic blocks with one
|
|
|
|
// predecessor.
|
2014-12-16 19:50:42 +08:00
|
|
|
if (!SI)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
bool isSafePtr = (SafePtrs.count(SI->getPointerOperand()) != 0);
|
|
|
|
bool isSinglePredecessor = SI->getParent()->getSinglePredecessor();
|
|
|
|
|
|
|
|
if (++NumPredStores > NumberOfStoresToPredicate || !isSafePtr ||
|
|
|
|
!isSinglePredecessor) {
|
|
|
|
// Build a masked store if it is legal for the target, otherwise scalarize
|
|
|
|
// the block.
|
|
|
|
bool isLegalMaskedOp =
|
|
|
|
isLegalMaskedStore(SI->getValueOperand()->getType(),
|
|
|
|
SI->getPointerOperand());
|
|
|
|
if (isLegalMaskedOp) {
|
|
|
|
--NumPredStores;
|
|
|
|
MaskedOp.insert(SI);
|
|
|
|
continue;
|
|
|
|
}
|
2014-01-28 09:01:53 +08:00
|
|
|
return false;
|
2014-12-16 19:50:42 +08:00
|
|
|
}
|
2014-01-28 09:01:53 +08:00
|
|
|
}
|
|
|
|
if (it->mayThrow())
|
2012-12-04 05:06:35 +08:00
|
|
|
return false;
|
|
|
|
|
2012-12-12 09:31:10 +08:00
|
|
|
// The instructions below can trap.
|
2012-12-04 05:06:35 +08:00
|
|
|
switch (it->getOpcode()) {
|
2012-12-11 05:39:02 +08:00
|
|
|
default: continue;
|
|
|
|
case Instruction::UDiv:
|
|
|
|
case Instruction::SDiv:
|
|
|
|
case Instruction::URem:
|
|
|
|
case Instruction::SRem:
|
2014-12-14 17:43:50 +08:00
|
|
|
return false;
|
2012-12-04 05:06:35 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2013-01-29 00:02:45 +08:00
|
|
|
LoopVectorizationCostModel::VectorizationFactor
|
2014-08-02 08:14:03 +08:00
|
|
|
LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
|
2013-01-29 00:02:45 +08:00
|
|
|
// Width 1 means no vectorize
|
|
|
|
VectorizationFactor Factor = { 1U, 0U };
|
2012-12-12 09:11:46 +08:00
|
|
|
if (OptForSize && Legal->getRuntimePointerCheck()->Need) {
|
2015-02-02 00:56:00 +08:00
|
|
|
emitAnalysis(VectorizationReport() <<
|
|
|
|
"runtime pointer checks needed. Enable vectorization of this "
|
|
|
|
"loop with '#pragma clang loop vectorize(enable)' when "
|
|
|
|
"compiling with -Os");
|
2012-12-12 09:11:46 +08:00
|
|
|
DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n");
|
2013-01-29 00:02:45 +08:00
|
|
|
return Factor;
|
2012-12-12 09:11:46 +08:00
|
|
|
}
|
|
|
|
|
2015-02-02 00:56:02 +08:00
|
|
|
if (!EnableCondStoresVectorization && Legal->getNumPredStores()) {
|
2015-02-02 00:56:00 +08:00
|
|
|
emitAnalysis(VectorizationReport() <<
|
|
|
|
"store that is conditionally executed prevents vectorization");
|
2014-01-28 09:01:53 +08:00
|
|
|
DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
|
|
|
|
return Factor;
|
|
|
|
}
|
|
|
|
|
2012-12-12 09:11:46 +08:00
|
|
|
// Find the trip count.
|
2014-10-11 08:12:11 +08:00
|
|
|
unsigned TC = SE->getSmallConstantTripCount(TheLoop);
|
2013-10-03 04:04:29 +08:00
|
|
|
DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
|
2012-12-12 09:11:46 +08:00
|
|
|
|
2013-01-10 06:29:00 +08:00
|
|
|
unsigned WidestType = getWidestType();
|
|
|
|
unsigned WidestRegister = TTI.getRegisterBitWidth(true);
|
2013-06-24 20:09:15 +08:00
|
|
|
unsigned MaxSafeDepDist = -1U;
|
|
|
|
if (Legal->getMaxSafeDepDistBytes() != -1U)
|
|
|
|
MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
|
2013-08-01 06:17:45 +08:00
|
|
|
WidestRegister = ((WidestRegister < MaxSafeDepDist) ?
|
|
|
|
WidestRegister : MaxSafeDepDist);
|
2013-01-10 06:29:00 +08:00
|
|
|
unsigned MaxVectorSize = WidestRegister / WidestType;
|
|
|
|
DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n");
|
2013-10-03 04:04:29 +08:00
|
|
|
DEBUG(dbgs() << "LV: The Widest register is: "
|
|
|
|
<< WidestRegister << " bits.\n");
|
2013-01-10 06:29:00 +08:00
|
|
|
|
|
|
|
if (MaxVectorSize == 0) {
|
|
|
|
DEBUG(dbgs() << "LV: The target has no vector registers.\n");
|
2013-01-16 02:25:16 +08:00
|
|
|
MaxVectorSize = 1;
|
2013-01-10 06:29:00 +08:00
|
|
|
}
|
|
|
|
|
2014-12-14 17:43:50 +08:00
|
|
|
assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements"
|
2013-01-20 13:24:29 +08:00
|
|
|
" into one vector!");
|
2013-01-11 15:11:59 +08:00
|
|
|
|
2012-12-12 09:11:46 +08:00
|
|
|
unsigned VF = MaxVectorSize;
|
|
|
|
|
|
|
|
// If we optimize the program for size, avoid creating the tail loop.
|
|
|
|
if (OptForSize) {
|
|
|
|
// If we are unable to calculate the trip count then don't try to vectorize.
|
|
|
|
if (TC < 2) {
|
2015-02-02 00:56:00 +08:00
|
|
|
emitAnalysis
|
|
|
|
(VectorizationReport() <<
|
|
|
|
"unable to calculate the loop count due to complex control flow");
|
2012-12-12 09:11:46 +08:00
|
|
|
DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
|
2013-01-29 00:02:45 +08:00
|
|
|
return Factor;
|
2012-12-12 09:11:46 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// Find the maximum SIMD width that can fit within the trip count.
|
|
|
|
VF = TC % MaxVectorSize;
|
|
|
|
|
|
|
|
if (VF == 0)
|
|
|
|
VF = MaxVectorSize;
|
|
|
|
|
|
|
|
// If the trip count that we found modulo the vectorization factor is not
|
|
|
|
// zero then we require a tail.
|
|
|
|
if (VF < 2) {
|
2015-02-02 00:56:00 +08:00
|
|
|
emitAnalysis(VectorizationReport() <<
|
|
|
|
"cannot optimize for size and vectorize at the "
|
|
|
|
"same time. Enable vectorization of this loop "
|
|
|
|
"with '#pragma clang loop vectorize(enable)' "
|
|
|
|
"when compiling with -Os");
|
2012-12-12 09:11:46 +08:00
|
|
|
DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
|
2013-01-29 00:02:45 +08:00
|
|
|
return Factor;
|
2012-12-12 09:11:46 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-08-02 08:14:03 +08:00
|
|
|
int UserVF = Hints->getWidth();
|
2012-12-12 09:11:46 +08:00
|
|
|
if (UserVF != 0) {
|
|
|
|
assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
|
2013-10-03 04:04:29 +08:00
|
|
|
DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
|
2012-12-12 09:11:46 +08:00
|
|
|
|
2013-01-29 00:02:45 +08:00
|
|
|
Factor.Width = UserVF;
|
|
|
|
return Factor;
|
2012-12-12 09:11:46 +08:00
|
|
|
}
|
|
|
|
|
2012-10-25 04:36:32 +08:00
|
|
|
float Cost = expectedCost(1);
|
2014-04-29 17:45:08 +08:00
|
|
|
#ifndef NDEBUG
|
2014-04-29 16:55:11 +08:00
|
|
|
const float ScalarCost = Cost;
|
2014-04-29 17:45:08 +08:00
|
|
|
#endif /* NDEBUG */
|
2012-10-25 04:36:32 +08:00
|
|
|
unsigned Width = 1;
|
2014-04-29 17:45:08 +08:00
|
|
|
DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
|
2014-04-29 16:55:11 +08:00
|
|
|
|
2014-08-02 08:14:03 +08:00
|
|
|
bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
|
2014-04-29 16:55:11 +08:00
|
|
|
// Ignore scalar width, because the user explicitly wants vectorization.
|
|
|
|
if (ForceVectorization && VF > 1) {
|
|
|
|
Width = 2;
|
|
|
|
Cost = expectedCost(Width) / (float)Width;
|
|
|
|
}
|
|
|
|
|
2012-10-25 04:36:32 +08:00
|
|
|
for (unsigned i=2; i <= VF; i*=2) {
|
|
|
|
// Notice that the vector loop needs to be executed less times, so
|
|
|
|
// we need to divide the cost of the vector loops by the width of
|
|
|
|
// the vector elements.
|
|
|
|
float VectorCost = expectedCost(i) / (float)i;
|
2013-10-03 04:04:29 +08:00
|
|
|
DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " <<
|
2014-11-29 05:29:14 +08:00
|
|
|
(int)VectorCost << ".\n");
|
2012-10-25 04:36:32 +08:00
|
|
|
if (VectorCost < Cost) {
|
|
|
|
Cost = VectorCost;
|
|
|
|
Width = i;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-04-29 16:55:11 +08:00
|
|
|
DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
|
|
|
|
<< "LV: Vectorization seems to be not beneficial, "
|
|
|
|
<< "but was forced by a user.\n");
|
2014-04-15 17:37:30 +08:00
|
|
|
DEBUG(dbgs() << "LV: Selecting VF: "<< Width << ".\n");
|
2013-01-29 00:02:45 +08:00
|
|
|
Factor.Width = Width;
|
|
|
|
Factor.Cost = Width * Cost;
|
|
|
|
return Factor;
|
2012-10-25 04:36:32 +08:00
|
|
|
}
|
|
|
|
|
2013-01-10 06:29:00 +08:00
|
|
|
unsigned LoopVectorizationCostModel::getWidestType() {
|
|
|
|
unsigned MaxWidth = 8;
|
2015-03-10 10:37:25 +08:00
|
|
|
const DataLayout &DL = TheFunction->getParent()->getDataLayout();
|
2013-01-10 06:29:00 +08:00
|
|
|
|
|
|
|
// For each block.
|
|
|
|
for (Loop::block_iterator bb = TheLoop->block_begin(),
|
|
|
|
be = TheLoop->block_end(); bb != be; ++bb) {
|
|
|
|
BasicBlock *BB = *bb;
|
|
|
|
|
|
|
|
// For each instruction in the loop.
|
|
|
|
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
|
2013-01-11 15:11:59 +08:00
|
|
|
Type *T = it->getType();
|
|
|
|
|
2014-10-15 06:59:49 +08:00
|
|
|
// Ignore ephemeral values.
|
|
|
|
if (EphValues.count(it))
|
|
|
|
continue;
|
|
|
|
|
2013-01-11 15:11:59 +08:00
|
|
|
// Only examine Loads, Stores and PHINodes.
|
|
|
|
if (!isa<LoadInst>(it) && !isa<StoreInst>(it) && !isa<PHINode>(it))
|
2013-01-10 06:29:00 +08:00
|
|
|
continue;
|
|
|
|
|
2013-01-11 15:11:59 +08:00
|
|
|
// Examine PHI nodes that are reduction variables.
|
|
|
|
if (PHINode *PN = dyn_cast<PHINode>(it))
|
|
|
|
if (!Legal->getReductionVars()->count(PN))
|
|
|
|
continue;
|
2013-01-10 06:29:00 +08:00
|
|
|
|
2013-01-11 15:11:59 +08:00
|
|
|
// Examine the stored values.
|
2013-02-14 05:12:29 +08:00
|
|
|
if (StoreInst *ST = dyn_cast<StoreInst>(it))
|
2013-01-10 06:29:00 +08:00
|
|
|
T = ST->getValueOperand()->getType();
|
|
|
|
|
2013-02-05 23:08:02 +08:00
|
|
|
// Ignore loaded pointer types and stored pointer types that are not
|
|
|
|
// consecutive. However, we do want to take consecutive stores/loads of
|
|
|
|
// pointer vectors into account.
|
2013-02-14 05:12:29 +08:00
|
|
|
if (T->isPointerTy() && !isConsecutiveLoadOrStore(it))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
MaxWidth = std::max(MaxWidth,
|
2015-03-10 10:37:25 +08:00
|
|
|
(unsigned)DL.getTypeSizeInBits(T->getScalarType()));
|
2013-01-10 06:29:00 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return MaxWidth;
|
|
|
|
}
|
|
|
|
|
2013-01-05 01:48:25 +08:00
|
|
|
unsigned
|
|
|
|
LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
|
2013-01-20 13:24:29 +08:00
|
|
|
unsigned VF,
|
|
|
|
unsigned LoopCost) {
|
|
|
|
|
|
|
|
// -- The unroll heuristics --
|
|
|
|
// We unroll the loop in order to expose ILP and reduce the loop overhead.
|
|
|
|
// There are many micro-architectural considerations that we can't predict
|
2014-08-26 08:59:15 +08:00
|
|
|
// at this level. For example, frontend pressure (on decode or fetch) due to
|
2013-01-20 13:24:29 +08:00
|
|
|
// code size, or the number and capabilities of the execution ports.
|
|
|
|
//
|
|
|
|
// We use the following heuristics to select the unroll factor:
|
2014-08-26 08:59:15 +08:00
|
|
|
// 1. If the code has reductions, then we unroll in order to break the cross
|
2013-01-20 13:24:29 +08:00
|
|
|
// iteration dependency.
|
2014-08-26 08:59:15 +08:00
|
|
|
// 2. If the loop is really small, then we unroll in order to reduce the loop
|
2013-01-20 13:24:29 +08:00
|
|
|
// overhead.
|
|
|
|
// 3. We don't unroll if we think that we will spill registers to memory due
|
|
|
|
// to the increased register pressure.
|
|
|
|
|
2013-01-05 01:48:25 +08:00
|
|
|
// Use the user preference, unless 'auto' is selected.
|
2014-09-11 01:58:16 +08:00
|
|
|
int UserUF = Hints->getInterleave();
|
2013-01-05 01:48:25 +08:00
|
|
|
if (UserUF != 0)
|
|
|
|
return UserUF;
|
|
|
|
|
2014-08-26 08:59:15 +08:00
|
|
|
// When we optimize for size, we don't unroll.
|
2013-01-05 01:48:25 +08:00
|
|
|
if (OptForSize)
|
|
|
|
return 1;
|
|
|
|
|
2013-06-24 20:09:15 +08:00
|
|
|
// We used the distance for the unroll factor.
|
|
|
|
if (Legal->getMaxSafeDepDistBytes() != -1U)
|
|
|
|
return 1;
|
|
|
|
|
2013-01-08 05:54:51 +08:00
|
|
|
// Do not unroll loops with a relatively small trip count.
|
2014-10-11 13:28:30 +08:00
|
|
|
unsigned TC = SE->getSmallConstantTripCount(TheLoop);
|
2013-01-08 05:54:51 +08:00
|
|
|
if (TC > 1 && TC < TinyTripCountUnrollThreshold)
|
|
|
|
return 1;
|
|
|
|
|
2014-01-27 19:12:14 +08:00
|
|
|
unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
|
|
|
|
DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters <<
|
|
|
|
" registers\n");
|
2013-01-05 01:48:25 +08:00
|
|
|
|
2014-01-27 19:12:19 +08:00
|
|
|
if (VF == 1) {
|
|
|
|
if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
|
|
|
|
TargetNumRegisters = ForceTargetNumScalarRegs;
|
|
|
|
} else {
|
|
|
|
if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
|
|
|
|
TargetNumRegisters = ForceTargetNumVectorRegs;
|
|
|
|
}
|
|
|
|
|
2013-01-05 01:48:25 +08:00
|
|
|
LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage();
|
|
|
|
// We divide by these constants so assume that we have at least one
|
|
|
|
// instruction that uses at least one register.
|
|
|
|
R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
|
|
|
|
R.NumInstructions = std::max(R.NumInstructions, 1U);
|
|
|
|
|
|
|
|
// We calculate the unroll factor using the following formula.
|
|
|
|
// Subtract the number of loop invariants from the number of available
|
|
|
|
// registers. These registers are used by all of the unrolled instances.
|
|
|
|
// Next, divide the remaining registers by the number of registers that is
|
|
|
|
// required by the loop, in order to estimate how many parallel instances
|
2014-01-27 19:12:24 +08:00
|
|
|
// fit without causing spills. All of this is rounded down if necessary to be
|
|
|
|
// a power of two. We want power of two unroll factors to simplify any
|
|
|
|
// addressing operations or alignment considerations.
|
|
|
|
unsigned UF = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
|
|
|
|
R.MaxLocalUsers);
|
2013-01-05 01:48:25 +08:00
|
|
|
|
2014-01-29 12:36:12 +08:00
|
|
|
// Don't count the induction variable as unrolled.
|
|
|
|
if (EnableIndVarRegisterHeur)
|
|
|
|
UF = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
|
|
|
|
std::max(1U, (R.MaxLocalUsers - 1)));
|
|
|
|
|
2013-01-05 01:48:25 +08:00
|
|
|
// Clamp the unroll factor ranges to reasonable factors.
|
2015-05-07 01:12:25 +08:00
|
|
|
unsigned MaxInterleaveSize = TTI.getMaxInterleaveFactor(VF);
|
2013-01-20 13:24:29 +08:00
|
|
|
|
2014-01-27 19:12:19 +08:00
|
|
|
// Check if the user has overridden the unroll max.
|
|
|
|
if (VF == 1) {
|
2014-09-11 01:58:16 +08:00
|
|
|
if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
|
|
|
|
MaxInterleaveSize = ForceTargetMaxScalarInterleaveFactor;
|
2014-01-27 19:12:19 +08:00
|
|
|
} else {
|
2014-09-11 01:58:16 +08:00
|
|
|
if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
|
|
|
|
MaxInterleaveSize = ForceTargetMaxVectorInterleaveFactor;
|
2014-01-27 19:12:19 +08:00
|
|
|
}
|
|
|
|
|
2013-01-20 13:24:29 +08:00
|
|
|
// If we did not calculate the cost for VF (because the user selected the VF)
|
|
|
|
// then we calculate the cost of VF here.
|
|
|
|
if (LoopCost == 0)
|
|
|
|
LoopCost = expectedCost(VF);
|
|
|
|
|
|
|
|
// Clamp the calculated UF to be between the 1 and the max unroll factor
|
|
|
|
// that the target allows.
|
2014-09-11 01:58:16 +08:00
|
|
|
if (UF > MaxInterleaveSize)
|
|
|
|
UF = MaxInterleaveSize;
|
2013-01-05 01:48:25 +08:00
|
|
|
else if (UF < 1)
|
|
|
|
UF = 1;
|
|
|
|
|
2014-01-27 16:17:58 +08:00
|
|
|
// Unroll if we vectorized this loop and there is a reduction that could
|
|
|
|
// benefit from unrolling.
|
|
|
|
if (VF > 1 && Legal->getReductionVars()->size()) {
|
2013-10-03 04:04:29 +08:00
|
|
|
DEBUG(dbgs() << "LV: Unrolling because of reductions.\n");
|
2013-01-20 13:24:29 +08:00
|
|
|
return UF;
|
|
|
|
}
|
|
|
|
|
2014-01-31 18:51:08 +08:00
|
|
|
// Note that if we've already vectorized the loop we will have done the
|
|
|
|
// runtime check and so unrolling won't require further checks.
|
|
|
|
bool UnrollingRequiresRuntimePointerCheck =
|
|
|
|
(VF == 1 && Legal->getRuntimePointerCheck()->Need);
|
|
|
|
|
|
|
|
// We want to unroll small loops in order to reduce the loop overhead and
|
|
|
|
// potentially expose ILP opportunities.
|
|
|
|
DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
|
|
|
|
if (!UnrollingRequiresRuntimePointerCheck &&
|
2014-01-28 09:01:53 +08:00
|
|
|
LoopCost < SmallLoopCost) {
|
2014-01-31 18:51:08 +08:00
|
|
|
// We assume that the cost overhead is 1 and we use the cost model
|
|
|
|
// to estimate the cost of the loop and unroll until the cost of the
|
|
|
|
// loop overhead is about 5% of the cost of the loop.
|
|
|
|
unsigned SmallUF = std::min(UF, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
|
|
|
|
|
2014-01-28 09:01:53 +08:00
|
|
|
// Unroll until store/load ports (estimated by max unroll factor) are
|
|
|
|
// saturated.
|
2015-02-02 00:56:02 +08:00
|
|
|
unsigned NumStores = Legal->getNumStores();
|
|
|
|
unsigned NumLoads = Legal->getNumLoads();
|
|
|
|
unsigned StoresUF = UF / (NumStores ? NumStores : 1);
|
|
|
|
unsigned LoadsUF = UF / (NumLoads ? NumLoads : 1);
|
2014-01-31 18:51:08 +08:00
|
|
|
|
2014-08-21 07:53:52 +08:00
|
|
|
// If we have a scalar reduction (vector reductions are already dealt with
|
|
|
|
// by this point), we can increase the critical path length if the loop
|
|
|
|
// we're unrolling is inside another loop. Limit, by default to 2, so the
|
|
|
|
// critical path only gets increased by one reduction operation.
|
|
|
|
if (Legal->getReductionVars()->size() &&
|
|
|
|
TheLoop->getLoopDepth() > 1) {
|
|
|
|
unsigned F = static_cast<unsigned>(MaxNestedScalarReductionUF);
|
|
|
|
SmallUF = std::min(SmallUF, F);
|
|
|
|
StoresUF = std::min(StoresUF, F);
|
|
|
|
LoadsUF = std::min(LoadsUF, F);
|
|
|
|
}
|
|
|
|
|
2014-01-31 18:51:08 +08:00
|
|
|
if (EnableLoadStoreRuntimeUnroll && std::max(StoresUF, LoadsUF) > SmallUF) {
|
|
|
|
DEBUG(dbgs() << "LV: Unrolling to saturate store or load ports.\n");
|
|
|
|
return std::max(StoresUF, LoadsUF);
|
|
|
|
}
|
2014-01-28 09:01:53 +08:00
|
|
|
|
2013-10-03 04:04:29 +08:00
|
|
|
DEBUG(dbgs() << "LV: Unrolling to reduce branch cost.\n");
|
2014-01-31 18:51:08 +08:00
|
|
|
return SmallUF;
|
2013-01-20 13:24:29 +08:00
|
|
|
}
|
|
|
|
|
2015-03-07 07:12:04 +08:00
|
|
|
// Unroll if this is a large loop (small loops are already dealt with by this
|
|
|
|
// point) that could benefit from interleaved unrolling.
|
|
|
|
bool HasReductions = (Legal->getReductionVars()->size() > 0);
|
|
|
|
if (TTI.enableAggressiveInterleaving(HasReductions)) {
|
|
|
|
DEBUG(dbgs() << "LV: Unrolling to expose ILP.\n");
|
|
|
|
return UF;
|
|
|
|
}
|
|
|
|
|
2013-10-03 04:04:29 +08:00
|
|
|
DEBUG(dbgs() << "LV: Not Unrolling.\n");
|
2013-01-20 13:24:29 +08:00
|
|
|
return 1;
|
2013-01-05 01:48:25 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Estimate the register pressure of the loop.
///
/// The loop is numbered instruction by instruction in reverse post-order. A
/// value defined in the loop and used in the loop opens an interval at its
/// definition and closes it at the index recorded for its last visited use.
/// The largest number of simultaneously open intervals is reported as
/// MaxLocalUsers. Instructions defined outside the loop but used inside it are
/// counted separately as LoopInvariantRegs, and NumInstructions is the sum of
/// the sizes of the loop's blocks. This is a rough estimate only.
LoopVectorizationCostModel::RegisterUsage
LoopVectorizationCostModel::calculateRegisterUsage() {
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage R;
  R.NumInstructions = 0;

  // Keyed by the defining instruction; the value is the index recorded at the
  // most recently visited use of that instruction.
  typedef DenseMap<Instruction*, unsigned> IntervalMap;
  // Maps an index to the instruction that was assigned that index.
  DenseMap<unsigned, Instruction*> InstrAtIndex;
  // Where each interval ends.
  IntervalMap LastUse;
  // In-loop instructions that have at least one use inside the loop.
  SmallSet<Instruction*, 8> UsedInLoop;
  // Instructions defined outside of the loop that are used inside it.
  SmallPtrSet<Value*, 8> Invariants;

  unsigned NextIndex = 0;
  for (LoopBlocksDFS::RPOIterator BlockIt = DFS.beginRPO(),
                                  BlockEnd = DFS.endRPO();
       BlockIt != BlockEnd; ++BlockIt) {
    BasicBlock *Block = *BlockIt;
    R.NumInstructions += Block->size();

    for (BasicBlock::iterator InstIt = Block->begin(), InstEnd = Block->end();
         InstIt != InstEnd; ++InstIt) {
      Instruction *User = InstIt;
      InstrAtIndex[NextIndex++] = User;

      // Record, for every operand, that it is used at this point.
      for (unsigned OpNo = 0; OpNo < User->getNumOperands(); ++OpNo) {
        Instruction *Def = dyn_cast<Instruction>(User->getOperand(OpNo));

        // Arguments, constants and other non-instructions are not tracked.
        if (!Def)
          continue;

        if (TheLoop->contains(Def)) {
          // A later use replaces the end point stored by an earlier one.
          LastUse[Def] = NextIndex;
          UsedInLoop.insert(Def);
        } else {
          // Defined outside of the loop: remember it as an invariant.
          Invariants.insert(Def);
        }
      }
    }
  }

  // For each index, the intervals that end there.
  typedef SmallVector<Instruction*, 2> InstrList;
  DenseMap<unsigned, InstrList> EndingAt;
  for (IntervalMap::iterator EndIt = LastUse.begin(), EndE = LastUse.end();
       EndIt != EndE; ++EndIt)
    EndingAt[EndIt->second].push_back(EndIt->first);

  SmallSet<Instruction*, 8> Live;
  unsigned PeakLive = 0;

  DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
  for (unsigned int Pos = 0; Pos < NextIndex; ++Pos) {
    Instruction *Cur = InstrAtIndex[Pos];

    // Skip instructions without an in-loop use, and ephemeral values.
    if (!UsedInLoop.count(Cur) || EphValues.count(Cur))
      continue;

    // Close every interval that ends at this position.
    InstrList &Closing = EndingAt[Pos];
    for (unsigned int K = 0, NumClosing = Closing.size(); K < NumClosing; ++K)
      Live.erase(Closing[K]);

    // Track the largest number of live intervals seen so far.
    if (Live.size() > PeakLive)
      PeakLive = Live.size();

    DEBUG(dbgs() << "LV(REG): At #" << Pos << " Interval # " <<
          Live.size() << '\n');

    // The current instruction opens a new interval.
    Live.insert(Cur);
  }

  unsigned Invariant = Invariants.size();
  DEBUG(dbgs() << "LV(REG): Found max usage: " << PeakLive << '\n');
  DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n');
  DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << '\n');

  R.LoopInvariantRegs = Invariant;
  R.MaxLocalUsers = PeakLive;
  return R;
}
|
|
|
|
|
2012-10-25 04:36:32 +08:00
|
|
|
unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
|
|
|
|
unsigned Cost = 0;
|
|
|
|
|
2012-12-04 06:46:31 +08:00
|
|
|
// For each block.
|
|
|
|
for (Loop::block_iterator bb = TheLoop->block_begin(),
|
|
|
|
be = TheLoop->block_end(); bb != be; ++bb) {
|
|
|
|
unsigned BlockCost = 0;
|
|
|
|
BasicBlock *BB = *bb;
|
2012-12-04 08:49:28 +08:00
|
|
|
|
2012-12-04 06:46:31 +08:00
|
|
|
// For each instruction in the old loop.
|
|
|
|
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
|
2013-03-10 00:27:27 +08:00
|
|
|
// Skip dbg intrinsics.
|
|
|
|
if (isa<DbgInfoIntrinsic>(it))
|
2013-03-09 23:56:34 +08:00
|
|
|
continue;
|
|
|
|
|
2014-10-15 06:59:49 +08:00
|
|
|
// Ignore ephemeral values.
|
|
|
|
if (EphValues.count(it))
|
|
|
|
continue;
|
|
|
|
|
2012-12-04 06:46:31 +08:00
|
|
|
unsigned C = getInstructionCost(it, VF);
|
2014-01-27 19:41:50 +08:00
|
|
|
|
|
|
|
// Check if we should override the cost.
|
|
|
|
if (ForceTargetInstructionCost.getNumOccurrences() > 0)
|
|
|
|
C = ForceTargetInstructionCost;
|
|
|
|
|
2013-07-23 01:10:48 +08:00
|
|
|
BlockCost += C;
|
2013-10-03 04:04:29 +08:00
|
|
|
DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF " <<
|
|
|
|
VF << " For instruction: " << *it << '\n');
|
2012-12-04 06:46:31 +08:00
|
|
|
}
|
|
|
|
|
2012-12-04 15:11:52 +08:00
|
|
|
// We assume that if-converted blocks have a 50% chance of being executed.
|
|
|
|
// When the code is scalar then some of the blocks are avoided due to CF.
|
|
|
|
// When the code is vectorized we execute all code paths.
|
2013-07-23 01:10:48 +08:00
|
|
|
if (VF == 1 && Legal->blockNeedsPredication(*bb))
|
2012-12-04 15:11:52 +08:00
|
|
|
BlockCost /= 2;
|
2012-12-04 08:49:28 +08:00
|
|
|
|
2012-12-04 06:46:31 +08:00
|
|
|
Cost += BlockCost;
|
2012-10-25 04:36:32 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return Cost;
|
|
|
|
}
|
|
|
|
|
2013-07-13 03:16:02 +08:00
|
|
|
/// \brief Check whether the address computation for a non-consecutive memory
|
|
|
|
/// access looks like an unlikely candidate for being merged into the indexing
|
|
|
|
/// mode.
|
|
|
|
///
|
|
|
|
/// We look for a GEP which has one index that is an induction variable and all
|
|
|
|
/// other indices are loop invariant. If the stride of this access is also
|
|
|
|
/// within a small bound we decide that this address computation can likely be
|
|
|
|
/// merged into the addressing mode.
|
|
|
|
/// In all other cases, we identify the address computation as complex.
|
|
|
|
static bool isLikelyComplexAddressComputation(Value *Ptr,
|
|
|
|
LoopVectorizationLegality *Legal,
|
|
|
|
ScalarEvolution *SE,
|
|
|
|
const Loop *TheLoop) {
|
|
|
|
GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
|
|
|
|
if (!Gep)
|
|
|
|
return true;
|
|
|
|
|
|
|
|
// We are looking for a gep with all loop invariant indices except for one
|
|
|
|
// which should be an induction variable.
|
|
|
|
unsigned NumOperands = Gep->getNumOperands();
|
|
|
|
for (unsigned i = 1; i < NumOperands; ++i) {
|
|
|
|
Value *Opd = Gep->getOperand(i);
|
|
|
|
if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
|
|
|
|
!Legal->isInductionVariable(Opd))
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Now we know we have a GEP ptr, %inv, %ind, %inv. Make sure that the step
|
|
|
|
// can likely be merged into the address computation.
|
|
|
|
unsigned MaxMergeDistance = 64;
|
|
|
|
|
|
|
|
const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Ptr));
|
|
|
|
if (!AddRec)
|
|
|
|
return true;
|
|
|
|
|
|
|
|
// Check the step is constant.
|
|
|
|
const SCEV *Step = AddRec->getStepRecurrence(*SE);
|
|
|
|
// Calculate the pointer stride and check if it is consecutive.
|
|
|
|
const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
|
|
|
|
if (!C)
|
|
|
|
return true;
|
|
|
|
|
|
|
|
const APInt &APStepVal = C->getValue()->getValue();
|
|
|
|
|
|
|
|
// Huge step value - give up.
|
|
|
|
if (APStepVal.getBitWidth() > 64)
|
|
|
|
return true;
|
|
|
|
|
|
|
|
int64_t StepVal = APStepVal.getSExtValue();
|
|
|
|
|
|
|
|
return StepVal > MaxMergeDistance;
|
|
|
|
}
|
|
|
|
|
2014-01-11 02:20:32 +08:00
|
|
|
/// Return true if operand 0 or operand 1 of \p I is known to \p Legal as a
/// stride.
static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
  if (Legal->hasStride(I->getOperand(0)))
    return true;
  return Legal->hasStride(I->getOperand(1));
}
|
|
|
|
|
2012-10-25 04:36:32 +08:00
|
|
|
unsigned
|
|
|
|
LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
|
2012-10-27 07:49:28 +08:00
|
|
|
// If we know that this instruction will remain uniform, check the cost of
|
|
|
|
// the scalar version.
|
|
|
|
if (Legal->isUniformAfterVectorization(I))
|
|
|
|
VF = 1;
|
|
|
|
|
2012-10-26 05:03:48 +08:00
|
|
|
Type *RetTy = I->getType();
|
|
|
|
Type *VectorTy = ToVectorTy(RetTy, VF);
|
|
|
|
|
|
|
|
// TODO: We need to estimate the cost of intrinsic calls.
|
2012-10-25 04:36:32 +08:00
|
|
|
switch (I->getOpcode()) {
|
2012-12-11 05:39:02 +08:00
|
|
|
case Instruction::GetElementPtr:
|
2013-02-08 22:50:48 +08:00
|
|
|
// We mark this instruction as zero-cost because the cost of GEPs in
|
|
|
|
// vectorized code depends on whether the corresponding memory instruction
|
|
|
|
// is scalarized or not. Therefore, we handle GEPs with the memory
|
|
|
|
// instruction cost.
|
2012-12-11 05:39:02 +08:00
|
|
|
return 0;
|
|
|
|
case Instruction::Br: {
|
2013-01-07 19:12:29 +08:00
|
|
|
return TTI.getCFInstrCost(I->getOpcode());
|
2012-12-11 05:39:02 +08:00
|
|
|
}
|
|
|
|
case Instruction::PHI:
|
|
|
|
//TODO: IF-converted IFs become selects.
|
|
|
|
return 0;
|
|
|
|
case Instruction::Add:
|
|
|
|
case Instruction::FAdd:
|
|
|
|
case Instruction::Sub:
|
|
|
|
case Instruction::FSub:
|
|
|
|
case Instruction::Mul:
|
|
|
|
case Instruction::FMul:
|
|
|
|
case Instruction::UDiv:
|
|
|
|
case Instruction::SDiv:
|
|
|
|
case Instruction::FDiv:
|
|
|
|
case Instruction::URem:
|
|
|
|
case Instruction::SRem:
|
|
|
|
case Instruction::FRem:
|
|
|
|
case Instruction::Shl:
|
|
|
|
case Instruction::LShr:
|
|
|
|
case Instruction::AShr:
|
|
|
|
case Instruction::And:
|
|
|
|
case Instruction::Or:
|
2013-04-05 07:26:27 +08:00
|
|
|
case Instruction::Xor: {
|
2014-01-11 02:20:32 +08:00
|
|
|
// Since we will replace the stride by 1 the multiplication should go away.
|
|
|
|
if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
|
|
|
|
return 0;
|
2013-04-05 07:26:27 +08:00
|
|
|
// Certain instructions can be cheaper to vectorize if they have a constant
|
|
|
|
// second vector operand. One example of this are shifts on x86.
|
|
|
|
TargetTransformInfo::OperandValueKind Op1VK =
|
|
|
|
TargetTransformInfo::OK_AnyValue;
|
|
|
|
TargetTransformInfo::OperandValueKind Op2VK =
|
|
|
|
TargetTransformInfo::OK_AnyValue;
|
2014-08-25 12:56:54 +08:00
|
|
|
TargetTransformInfo::OperandValueProperties Op1VP =
|
|
|
|
TargetTransformInfo::OP_None;
|
|
|
|
TargetTransformInfo::OperandValueProperties Op2VP =
|
|
|
|
TargetTransformInfo::OP_None;
|
2014-02-13 07:43:47 +08:00
|
|
|
Value *Op2 = I->getOperand(1);
|
2013-04-05 07:26:27 +08:00
|
|
|
|
2014-02-13 07:43:47 +08:00
|
|
|
// Check for a splat of a constant or for a non uniform vector of constants.
|
2014-08-25 12:56:54 +08:00
|
|
|
if (isa<ConstantInt>(Op2)) {
|
|
|
|
ConstantInt *CInt = cast<ConstantInt>(Op2);
|
|
|
|
if (CInt && CInt->getValue().isPowerOf2())
|
|
|
|
Op2VP = TargetTransformInfo::OP_PowerOf2;
|
2013-04-05 07:26:27 +08:00
|
|
|
Op2VK = TargetTransformInfo::OK_UniformConstantValue;
|
2014-08-25 12:56:54 +08:00
|
|
|
} else if (isa<ConstantVector>(Op2) || isa<ConstantDataVector>(Op2)) {
|
2014-02-13 07:43:47 +08:00
|
|
|
Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
|
2014-08-25 12:56:54 +08:00
|
|
|
Constant *SplatValue = cast<Constant>(Op2)->getSplatValue();
|
|
|
|
if (SplatValue) {
|
|
|
|
ConstantInt *CInt = dyn_cast<ConstantInt>(SplatValue);
|
|
|
|
if (CInt && CInt->getValue().isPowerOf2())
|
|
|
|
Op2VP = TargetTransformInfo::OP_PowerOf2;
|
2014-02-13 07:43:47 +08:00
|
|
|
Op2VK = TargetTransformInfo::OK_UniformConstantValue;
|
2014-08-25 12:56:54 +08:00
|
|
|
}
|
2014-02-13 07:43:47 +08:00
|
|
|
}
|
2013-04-05 07:26:27 +08:00
|
|
|
|
2014-08-25 12:56:54 +08:00
|
|
|
return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK,
|
|
|
|
Op1VP, Op2VP);
|
2013-04-05 07:26:27 +08:00
|
|
|
}
|
2012-12-11 05:39:02 +08:00
|
|
|
case Instruction::Select: {
|
|
|
|
SelectInst *SI = cast<SelectInst>(I);
|
|
|
|
const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
|
|
|
|
bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
|
|
|
|
Type *CondTy = SI->getCondition()->getType();
|
2013-03-15 02:54:36 +08:00
|
|
|
if (!ScalarCond)
|
2012-12-11 05:39:02 +08:00
|
|
|
CondTy = VectorType::get(CondTy, VF);
|
|
|
|
|
2013-01-07 19:12:29 +08:00
|
|
|
return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
|
2012-12-11 05:39:02 +08:00
|
|
|
}
|
|
|
|
case Instruction::ICmp:
|
|
|
|
case Instruction::FCmp: {
|
|
|
|
Type *ValTy = I->getOperand(0)->getType();
|
|
|
|
VectorTy = ToVectorTy(ValTy, VF);
|
2013-01-07 19:12:29 +08:00
|
|
|
return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy);
|
2012-12-11 05:39:02 +08:00
|
|
|
}
|
2013-02-08 03:05:21 +08:00
|
|
|
case Instruction::Store:
|
|
|
|
case Instruction::Load: {
|
|
|
|
StoreInst *SI = dyn_cast<StoreInst>(I);
|
|
|
|
LoadInst *LI = dyn_cast<LoadInst>(I);
|
|
|
|
Type *ValTy = (SI ? SI->getValueOperand()->getType() :
|
|
|
|
LI->getType());
|
|
|
|
VectorTy = ToVectorTy(ValTy, VF);
|
|
|
|
|
|
|
|
unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment();
|
|
|
|
unsigned AS = SI ? SI->getPointerAddressSpace() :
|
|
|
|
LI->getPointerAddressSpace();
|
|
|
|
Value *Ptr = SI ? SI->getPointerOperand() : LI->getPointerOperand();
|
2013-02-08 22:50:48 +08:00
|
|
|
// We add the cost of address computation here instead of with the gep
|
|
|
|
// instruction because only here we know whether the operation is
|
|
|
|
// scalarized.
|
2013-02-08 03:05:21 +08:00
|
|
|
if (VF == 1)
|
2013-02-08 22:50:48 +08:00
|
|
|
return TTI.getAddressComputationCost(VectorTy) +
|
|
|
|
TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
|
2013-02-06 02:46:41 +08:00
|
|
|
|
2013-02-08 03:05:21 +08:00
|
|
|
// Scalarized loads/stores.
|
2013-04-25 00:16:03 +08:00
|
|
|
int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
|
|
|
|
bool Reverse = ConsecutiveStride < 0;
|
2015-03-10 10:37:25 +08:00
|
|
|
const DataLayout &DL = I->getModule()->getDataLayout();
|
|
|
|
unsigned ScalarAllocatedSize = DL.getTypeAllocSize(ValTy);
|
|
|
|
unsigned VectorElementSize = DL.getTypeStoreSize(VectorTy) / VF;
|
2013-04-25 00:16:03 +08:00
|
|
|
if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) {
|
2013-07-13 03:16:02 +08:00
|
|
|
bool IsComplexComputation =
|
|
|
|
isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop);
|
2013-02-08 03:05:21 +08:00
|
|
|
unsigned Cost = 0;
|
|
|
|
// The cost of extracting from the value vector and pointer vector.
|
|
|
|
Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
|
|
|
|
for (unsigned i = 0; i < VF; ++i) {
|
|
|
|
// The cost of extracting the pointer operand.
|
|
|
|
Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i);
|
|
|
|
// In case of STORE, the cost of ExtractElement from the vector.
|
|
|
|
// In case of LOAD, the cost of InsertElement into the returned
|
|
|
|
// vector.
|
|
|
|
Cost += TTI.getVectorInstrCost(SI ? Instruction::ExtractElement :
|
|
|
|
Instruction::InsertElement,
|
|
|
|
VectorTy, i);
|
|
|
|
}
|
|
|
|
|
2013-02-08 22:50:48 +08:00
|
|
|
// The cost of the scalar loads/stores.
|
2013-07-13 03:16:02 +08:00
|
|
|
Cost += VF * TTI.getAddressComputationCost(PtrTy, IsComplexComputation);
|
2013-02-08 03:05:21 +08:00
|
|
|
Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
|
|
|
|
Alignment, AS);
|
|
|
|
return Cost;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Wide load/stores.
|
2013-02-08 22:50:48 +08:00
|
|
|
unsigned Cost = TTI.getAddressComputationCost(VectorTy);
|
2015-02-15 16:08:48 +08:00
|
|
|
if (Legal->isMaskRequired(I))
|
|
|
|
Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment,
|
|
|
|
AS);
|
|
|
|
else
|
|
|
|
Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
|
2013-02-08 22:50:48 +08:00
|
|
|
|
2013-02-08 03:05:21 +08:00
|
|
|
if (Reverse)
|
|
|
|
Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
|
|
|
|
VectorTy, 0);
|
|
|
|
return Cost;
|
|
|
|
}
|
2012-12-11 05:39:02 +08:00
|
|
|
case Instruction::ZExt:
|
|
|
|
case Instruction::SExt:
|
|
|
|
case Instruction::FPToUI:
|
|
|
|
case Instruction::FPToSI:
|
|
|
|
case Instruction::FPExt:
|
|
|
|
case Instruction::PtrToInt:
|
|
|
|
case Instruction::IntToPtr:
|
|
|
|
case Instruction::SIToFP:
|
|
|
|
case Instruction::UIToFP:
|
|
|
|
case Instruction::Trunc:
|
|
|
|
case Instruction::FPTrunc:
|
|
|
|
case Instruction::BitCast: {
|
2012-12-13 08:21:03 +08:00
|
|
|
// We optimize the truncation of induction variable.
|
|
|
|
// The cost of these is the same as the scalar operation.
|
|
|
|
if (I->getOpcode() == Instruction::Trunc &&
|
|
|
|
Legal->isInductionVariable(I->getOperand(0)))
|
2013-01-07 19:12:29 +08:00
|
|
|
return TTI.getCastInstrCost(I->getOpcode(), I->getType(),
|
|
|
|
I->getOperand(0)->getType());
|
2012-12-13 08:21:03 +08:00
|
|
|
|
2012-12-11 05:39:02 +08:00
|
|
|
Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
|
2013-01-07 19:12:29 +08:00
|
|
|
return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
|
2012-12-11 05:39:02 +08:00
|
|
|
}
|
|
|
|
case Instruction::Call: {
|
2015-03-18 03:46:50 +08:00
|
|
|
bool NeedToScalarize;
|
2013-02-27 23:24:19 +08:00
|
|
|
CallInst *CI = cast<CallInst>(I);
|
2015-03-18 03:46:50 +08:00
|
|
|
unsigned CallCost = getVectorCallCost(CI, VF, TTI, TLI, NeedToScalarize);
|
|
|
|
if (getIntrinsicIDForCall(CI, TLI))
|
|
|
|
return std::min(CallCost, getVectorIntrinsicCost(CI, VF, TTI, TLI));
|
|
|
|
return CallCost;
|
2012-12-11 05:39:02 +08:00
|
|
|
}
|
|
|
|
default: {
|
|
|
|
// We are scalarizing the instruction. Return the cost of the scalar
|
|
|
|
// instruction, plus the cost of insert and extract into vector
|
|
|
|
// elements, times the vector width.
|
|
|
|
unsigned Cost = 0;
|
2012-10-25 04:36:32 +08:00
|
|
|
|
2012-12-23 21:21:41 +08:00
|
|
|
if (!RetTy->isVoidTy() && VF != 1) {
|
2013-01-07 19:12:29 +08:00
|
|
|
unsigned InsCost = TTI.getVectorInstrCost(Instruction::InsertElement,
|
|
|
|
VectorTy);
|
|
|
|
unsigned ExtCost = TTI.getVectorInstrCost(Instruction::ExtractElement,
|
|
|
|
VectorTy);
|
2012-12-23 21:19:18 +08:00
|
|
|
|
|
|
|
// The cost of inserting the results plus extracting each one of the
|
|
|
|
// operands.
|
|
|
|
Cost += VF * (InsCost + ExtCost * I->getNumOperands());
|
|
|
|
}
|
2012-12-11 05:39:02 +08:00
|
|
|
|
2012-12-23 15:23:55 +08:00
|
|
|
// The cost of executing VF copies of the scalar instruction. This opcode
|
|
|
|
// is unknown. Assume that it is the same as 'mul'.
|
2013-01-07 19:12:29 +08:00
|
|
|
Cost += VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy);
|
2012-12-11 05:39:02 +08:00
|
|
|
return Cost;
|
|
|
|
}
|
2012-10-25 04:36:32 +08:00
|
|
|
}// end of switch.
|
|
|
|
}
|
|
|
|
|
2012-10-18 02:25:06 +08:00
|
|
|
// Definition of the static ID member of the LoopVectorize pass.
char LoopVectorize::ID = 0;
|
|
|
|
// Descriptive pass name; handed to both INITIALIZE_PASS_BEGIN and
// INITIALIZE_PASS_END for LoopVectorize.
static const char lv_name[] = "Loop Vectorization";
|
|
|
|
// Opens the pass registration for LoopVectorize. The matching
// INITIALIZE_PASS_END takes the same argument list.
INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting this into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason of course is that this is *much* more natural fit for
the new pass manager. This will lay the ground work for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
llvm-svn: 227669
2015-01-31 11:43:40 +08:00
|
|
|
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
|
[LoopVectorize] Use AA to partition potential dependency checks
Prior to this change, the loop vectorizer did not make use of the alias
analysis infrastructure. Instead, it performed memory dependence analysis using
ScalarEvolution-based linear dependence checks within equivalence classes
derived from the results of ValueTracking's GetUnderlyingObjects.
Unfortunately, this meant that:
1. The loop vectorizer had logic that essentially duplicated that in BasicAA
for aliasing based on identified objects.
2. The loop vectorizer could not partition the space of dependency checks
based on information only easily available from within AA (TBAA metadata is
currently the prime example).
This means, for example, regardless of whether -fno-strict-aliasing was
provided, the vectorizer would only vectorize this loop with a runtime
memory-overlap check:
void foo(int *a, float *b) {
for (int i = 0; i < 1600; ++i)
a[i] = b[i];
}
This is suboptimal because the TBAA metadata already provides the information
necessary to show that this check is unnecessary. Of course, the vectorizer has a
limit on the number of such checks it will insert, so in practice, ignoring
TBAA means not vectorizing more-complicated loops that we should.
This change causes the vectorizer to use an AliasSetTracker to keep track of
the pointers in the loop. The resulting alias sets are then used to partition
the space of dependency checks, and potential runtime checks; this results in
more-efficient vectorizations.
When pointer locations are added to the AliasSetTracker, two things are done:
1. The location size is set to UnknownSize (otherwise you'd not catch
inter-iteration dependencies)
2. For instructions in blocks that would need to be predicated, TBAA is
removed (because the metadata might have a control dependency on the condition
being speculated).
For non-predicated blocks, you can leave the TBAA metadata. This is safe
because you can't have an iteration dependency on the TBAA metadata (if you
did, and you unrolled sufficiently, you'd end up with the same pointer value
used by two accesses that TBAA says should not alias, and that would yield
undefined behavior).
llvm-svn: 213486
2014-07-21 07:07:52 +08:00
|
|
|
// Dependencies declared for the LoopVectorize pass. AliasAnalysis is declared
// with the analysis-group (AG) form of the macro; the others use the plain
// pass-dependency form.
INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
|
2015-01-04 20:03:27 +08:00
|
|
|
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
|
2014-01-27 21:11:50 +08:00
|
|
|
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfo)
|
2014-01-13 21:07:17 +08:00
|
|
|
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
|
2012-10-18 02:25:06 +08:00
|
|
|
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
|
2013-10-13 02:29:15 +08:00
|
|
|
INITIALIZE_PASS_DEPENDENCY(LCSSA)
|
2015-01-17 22:16:18 +08:00
|
|
|
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
|
2012-10-18 02:25:06 +08:00
|
|
|
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
|
2015-02-20 03:15:04 +08:00
|
|
|
INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis)
|
2012-10-18 02:25:06 +08:00
|
|
|
// Closes the registration opened by INITIALIZE_PASS_BEGIN; the argument list
// is the same as the one passed there.
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
|
|
|
|
|
|
|
|
namespace llvm {
|
2013-12-06 05:20:02 +08:00
|
|
|
Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) {
|
|
|
|
return new LoopVectorize(NoUnrolling, AlwaysVectorize);
|
2012-10-18 02:25:06 +08:00
|
|
|
}
|
|
|
|
}
|
2012-12-11 05:39:02 +08:00
|
|
|
|
2013-02-05 23:08:02 +08:00
|
|
|
/// Returns true when \p Inst is a load or a store whose pointer operand is
/// reported as consecutive (non-zero result) by Legal->isConsecutivePtr().
/// Any other kind of instruction yields false.
bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
  // Pick out the address operand if this is a memory access.
  Value *Address = nullptr;
  if (StoreInst *Store = dyn_cast<StoreInst>(Inst))
    Address = Store->getPointerOperand();
  else if (LoadInst *Load = dyn_cast<LoadInst>(Inst))
    Address = Load->getPointerOperand();

  // Neither a load nor a store.
  if (!Address)
    return false;

  return Legal->isConsecutivePtr(Address) != 0;
}
|
2013-08-27 06:33:26 +08:00
|
|
|
|
|
|
|
|
2014-01-28 09:01:53 +08:00
|
|
|
/// Emits UF scalar clones of \p Instr, one per unroll part, and records the
/// clones in WidenMap when \p Instr produces a value.
///
/// \param Instr            The instruction to replicate.
/// \param IfPredicateStore When true, every clone is placed in its own
///                         "cond.store" block that is only entered when the
///                         edge mask for that part compares equal to 1.
void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
                                             bool IfPredicateStore) {
  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");

  const unsigned NumOperands = Instr->getNumOperands();

  // One entry per operand of Instr; each entry holds one value per unroll
  // part.
  SmallVector<VectorParts, 4> Params;

  setDebugLocFromInst(Builder, Instr);

  // Collect the per-part values of every operand.
  for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
    Value *Operand = Instr->getOperand(OpIdx);

    if (Operand == OldInduction) {
      // Uses of the old induction variable are redirected to the new one.
      Params.push_back(getVectorValue(Operand));
    } else {
      Instruction *OperandInst = dyn_cast<Instruction>(Operand);
      if (OperandInst && OrigLoop->contains(OperandInst)) {
        // Defined inside the original loop: it must already have an entry in
        // the WidenMap, so reuse those values.
        assert(WidenMap.has(OperandInst) && "Source operand is unavailable");
        Params.push_back(WidenMap.get(OperandInst));
      } else {
        // Not an instruction of the original loop (possibly a constant): the
        // same value is used for every part.
        VectorParts Replicated;
        Replicated.append(UF, Operand);
        Params.push_back(Replicated);
      }
    }
  }

  assert(Params.size() == NumOperands && "Invalid number of operands");

  // Instructions of void type produce nothing to record.
  const bool IsVoidRetTy = Instr->getType()->isVoidTy();

  // Seed the WidenMap entry for Instr with undef, or with null when Instr
  // does not return a value.
  Value *UndefVec = nullptr;
  if (!IsVoidRetTy)
    UndefVec = UndefValue::get(Instr->getType());
  VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);

  Instruction *InsertPt = Builder.GetInsertPoint();
  BasicBlock *IfBlock = Builder.GetInsertBlock();
  BasicBlock *CondBlock = nullptr;

  VectorParts Cond;
  Loop *VectorLp = nullptr;
  if (IfPredicateStore) {
    BasicBlock *StoreBB = Instr->getParent();
    BasicBlock *StorePred = StoreBB->getSinglePredecessor();
    assert(StorePred && "Only support single predecessor blocks");
    Cond = createEdgeMask(StorePred, StoreBB);
    VectorLp = LI->getLoopFor(IfBlock);
    assert(VectorLp && "Must have a loop for this block");
  }

  // Emit one clone per unroll part.
  for (unsigned Part = 0; Part != UF; ++Part) {
    // Open an "if (pred) a[i] = ..." region when predicating.
    Value *Cmp = nullptr;
    if (IfPredicateStore) {
      Value *&Mask = Cond[Part];
      // A vector mask is reduced to its element 0.
      if (Mask->getType()->isVectorTy())
        Mask = Builder.CreateExtractElement(Mask, Builder.getInt32(0));
      Constant *One = ConstantInt::get(Mask->getType(), 1);
      Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Mask, One);
      CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
      LoopVectorBody.push_back(CondBlock);
      VectorLp->addBasicBlockToLoop(CondBlock, *LI);
      // Point the builder back at the insertion instruction after the split.
      Builder.SetInsertPoint(InsertPt);
    }

    Instruction *Clone = Instr->clone();
    if (!IsVoidRetTy)
      Clone->setName(Instr->getName() + ".cloned");

    // Rewire the clone to the operand values gathered for this part.
    for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx)
      Clone->setOperand(OpIdx, Params[OpIdx][Part]);

    // Insert the clone at the builder's current position.
    Builder.Insert(Clone);

    // Record the result so that later users can look it up.
    if (!IsVoidRetTy)
      VecResults[Part] = Clone;

    // Close the conditional region and branch around it.
    if (IfPredicateStore) {
      BasicBlock *ContBlock = CondBlock->splitBasicBlock(InsertPt, "else");
      LoopVectorBody.push_back(ContBlock);
      VectorLp->addBasicBlockToLoop(ContBlock, *LI);
      Builder.SetInsertPoint(InsertPt);
      // Replace IfBlock's terminator with a conditional branch on Cmp.
      Instruction *OldTerm = IfBlock->getTerminator();
      BranchInst::Create(CondBlock, ContBlock, Cmp, OldTerm);
      OldTerm->eraseFromParent();
      IfBlock = ContBlock;
    }
  }
}
|
|
|
|
|
2014-01-11 02:20:32 +08:00
|
|
|
/// Handles a memory instruction by delegating to scalarizeInstruction().
/// Predication is requested only when \p Instr is a store whose parent block
/// is reported by Legal->blockNeedsPredication() as needing it.
void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) {
  bool PredicateStore = false;
  if (StoreInst *Store = dyn_cast<StoreInst>(Instr))
    PredicateStore = Legal->blockNeedsPredication(Store->getParent());

  scalarizeInstruction(Instr, PredicateStore);
}
|
|
|
|
|
|
|
|
// Returns Vec unchanged; this implementation emits no instructions.
// NOTE(review): presumably a no-op because the unroller works on scalar
// values -- confirm against the class declaration.
Value *InnerLoopUnroller::reverseVector(Value *Vec) {
|
|
|
|
return Vec;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Returns V unchanged; this implementation emits no broadcast instructions.
// NOTE(review): presumably a no-op because the unroller works on scalar
// values -- confirm against the class declaration.
Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) {
|
|
|
|
return V;
|
|
|
|
}
|
|
|
|
|
2015-01-30 13:02:21 +08:00
|
|
|
/// Computes Val + StartIdx * Step as a scalar and names the sum "induction".
///
/// \param Val      Scalar base value; asserted not to be of vector type.
/// \param StartIdx Multiplier applied to \p Step, materialised as a constant
///                 of the same type as \p Val.
/// \param Step     The step value that is scaled by \p StartIdx.
/// \returns The value produced by the builder for the final add.
Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step) {
  // When unrolling with a VF of 1 a plain scalar add is all that is needed.
  Type *ScalarTy = Val->getType();
  assert(!ScalarTy->isVectorTy() && "Val must be a scalar");

  Constant *StartC = ConstantInt::get(ScalarTy, StartIdx);
  Value *Offset = Builder.CreateMul(StartC, Step);
  return Builder.CreateAdd(Val, Offset, "induction");
}
|