Refactor vectorization patterns

This CL removes the vectorize pass's reliance on a `fastestVaryingDim` parameter. That parameter is a restriction meant to make it easier to target a particular loop/memref combination for vectorization and is mainly used for testing.

It also had the side effect of restricting vectorization patterns to those in which all memrefs are contiguous along the same loop dimension. That restriction prevented matmul from vectorizing in 2-D.

This CL removes the restriction and adds a matmul test that vectorizes in 2-D along the two parallel loops. Support for reduction loops is left for future work.

PiperOrigin-RevId: 240993827
Nicolas Vasilache 2019-03-29 09:34:06 -07:00 committed by jpienaar
parent 3ddd0411d0
commit 094ca64ab0
5 changed files with 222 additions and 186 deletions
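
To make the API change concrete, here is a minimal before/after sketch of a caller, using only the signatures that appear in the diff below; `vectorizeAlongMemRefDim` is a hypothetical stand-in for whatever the caller does with the result.

  // Before this CL: the caller had to name the memref dimension up front.
  //   if (isVectorizableLoopBodyAlongFastestVaryingMemRefDim(loop, /*fastestVaryingDim=*/0))
  //     vectorizeAlongMemRefDim(loop, /*memRefDim=*/0);  // hypothetical helper
  // After this CL: the analysis infers the unique varying memref dimension,
  // or -1 when all accesses are invariant along `loop`.
  int memRefDim = -1;
  if (isVectorizableLoopBody(loop, &memRefDim))
    vectorizeAlongMemRefDim(loop, memRefDim);  // hypothetical helper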


@@ -71,34 +71,34 @@ uint64_t getLargestDivisorOfTripCount(AffineForOp forOp);
///
/// Returns false in cases with more than one AffineApplyOp, this is
/// conservative.
bool isAccessInvariant(Value &iv, Value &index);
bool isAccessInvariant(Value *iv, Value *index);
/// Given an induction variable `iv` of type AffineForOp and `indices` of type
/// IndexType, returns the set of `indices` that are independent of `iv`.
///
/// Prerequisites (inherited from `isAccessInvariant` above):
/// 1. `iv` and `indices` of the proper type;
/// 2. at most one reachable AffineApplyOp from index;
/// 2. at most one affine.apply is reachable from each index in `indices`;
///
/// Returns false in cases with more than one AffineApplyOp, this is
/// conservative.
/// Emits a note if it encounters a chain of affine.apply and conservatively
/// excludes those indices from the returned set.
llvm::DenseSet<Value *, llvm::DenseMapInfo<Value *>>
getInvariantAccesses(Value &iv, llvm::ArrayRef<Value *> indices);
getInvariantAccesses(Value *iv, llvm::ArrayRef<Value *> indices);
using VectorizableLoopFun = std::function<bool(AffineForOp)>;
/// Checks whether the loop is structurally vectorizable; i.e.:
/// 1. no conditionals are nested under the loop;
/// 2. all nested load/stores are to scalar MemRefs.
/// TODO(ntv): relax the no-conditionals restriction
bool isVectorizableLoopBody(AffineForOp loop);
/// Checks whether the loop is structurally vectorizable and that all the LoadOp
/// and StoreOp matched have access indexing functions that are either:
/// 1. invariant along the loop induction variable created by 'loop';
/// 2. varying along the 'fastestVaryingDim' memory dimension.
bool isVectorizableLoopBodyAlongFastestVaryingMemRefDim(
AffineForOp loop, unsigned fastestVaryingDim);
/// 2. varying along at most one memory dimension. If such a unique dimension
/// is found, it is written into `memRefDim`.
bool isVectorizableLoopBody(AffineForOp loop, int *memRefDim);
/// Checks whether SSA dominance would be violated if a for op's body
/// operations are shifted by the specified shifts. This method checks if a


@@ -174,36 +174,36 @@ uint64_t mlir::getLargestDivisorOfTripCount(AffineForOp forOp) {
return gcd.getValue();
}
bool mlir::isAccessInvariant(Value &iv, Value &index) {
assert(isForInductionVar(&iv) && "iv must be a AffineForOp");
assert(index.getType().isa<IndexType>() && "index must be of IndexType");
bool mlir::isAccessInvariant(Value *iv, Value *index) {
assert(isForInductionVar(iv) && "iv must be a AffineForOp");
assert(index->getType().isa<IndexType>() && "index must be of IndexType");
SmallVector<Operation *, 4> affineApplyOps;
getReachableAffineApplyOps({&index}, affineApplyOps);
getReachableAffineApplyOps({index}, affineApplyOps);
if (affineApplyOps.empty()) {
// Pointer equality test because of Value pointer semantics.
return &index != &iv;
return index != iv;
}
if (affineApplyOps.size() > 1) {
affineApplyOps[0]->emitError(
affineApplyOps[0]->emitNote(
"CompositionAffineMapsPass must have been run: there should be at most "
"one AffineApplyOp");
"one AffineApplyOp, returning false conservatively.");
return false;
}
auto composeOp = affineApplyOps[0]->cast<AffineApplyOp>();
// We need yet another level of indirection because the `dim` index of the
// access may not correspond to the `dim` index of composeOp.
return !(AffineValueMap(composeOp).isFunctionOf(0, &iv));
return !(AffineValueMap(composeOp).isFunctionOf(0, iv));
}
llvm::DenseSet<Value *>
mlir::getInvariantAccesses(Value &iv, llvm::ArrayRef<Value *> indices) {
mlir::getInvariantAccesses(Value *iv, llvm::ArrayRef<Value *> indices) {
llvm::DenseSet<Value *> res;
for (unsigned idx = 0, n = indices.size(); idx < n; ++idx) {
auto *val = indices[idx];
if (isAccessInvariant(iv, *val)) {
if (isAccessInvariant(iv, val)) {
res.insert(val);
}
}
@@ -213,31 +213,30 @@ mlir::getInvariantAccesses(Value &iv, llvm::ArrayRef<Value *> indices) {
/// Given:
/// 1. an induction variable `iv` of type AffineForOp;
/// 2. a `memoryOp` of type const LoadOp& or const StoreOp&;
/// 3. the index of the `fastestVaryingDim` along which to check;
/// determines whether `memoryOp`[`fastestVaryingDim`] is a contiguous access
/// along `iv`.
/// Contiguous is defined as either invariant or varying only along
/// `fastestVaryingDim`.
/// determines whether `memoryOp` has a contiguous access along `iv`. Contiguous
/// is defined as either invariant or varying only along a unique MemRef dim.
/// Upon success, the unique MemRef dim is written in `memRefDim` (or -1 to
/// convey the memRef access is invariant along `iv`).
///
/// Prerequisites:
/// 1. `iv` of the proper type;
/// 2. the MemRef accessed by `memoryOp` has no layout map or at most an
/// 1. `memRefDim` != nullptr;
/// 2. `iv` of the proper type;
/// 3. the MemRef accessed by `memoryOp` has no layout map or at most an
/// identity layout map.
///
/// Currently only supports no layoutMap or identity layoutMap in the MemRef.
/// Returns false if the MemRef has a non-identity layoutMap or more than
/// 1 layoutMap. This is conservative.
/// Returns false if the MemRef has a non-identity layoutMap or more than 1
/// layoutMap. This is conservative.
///
// TODO(ntv): check strides.
template <typename LoadOrStoreOp>
static bool isContiguousAccess(Value &iv, LoadOrStoreOp memoryOp,
unsigned fastestVaryingDim) {
static bool isContiguousAccess(Value *iv, LoadOrStoreOp memoryOp,
int *memRefDim) {
static_assert(std::is_same<LoadOrStoreOp, LoadOp>::value ||
std::is_same<LoadOrStoreOp, StoreOp>::value,
"Must be called on either const LoadOp & or const StoreOp &");
assert(memRefDim && "memRefDim == nullptr");
auto memRefType = memoryOp.getMemRefType();
if (fastestVaryingDim >= memRefType.getRank())
return false;
auto layoutMap = memRefType.getAffineMaps();
// TODO(ntv): remove dependence on Builder once we support non-identity
@@ -250,17 +249,26 @@ static bool isContiguousAccess(Value &iv, LoadOrStoreOp memoryOp,
return memoryOp.emitError("NYI: non-trivial layoutMap"), false;
}
int uniqueVaryingIndexAlongIv = -1;
auto indices = memoryOp.getIndices();
auto numIndices = llvm::size(indices);
unsigned d = 0;
for (auto index : indices) {
if (fastestVaryingDim == (numIndices - 1) - d++) {
continue;
}
if (!isAccessInvariant(iv, *index)) {
return false;
unsigned numIndices = llvm::size(indices);
unsigned dim = 0;
for (auto *index : indices) {
if (!isAccessInvariant(iv, index)) {
if (uniqueVaryingIndexAlongIv != -1) {
// 2+ varying indices -> do not vectorize along iv.
return false;
}
uniqueVaryingIndexAlongIv = dim;
}
++dim;
}
if (uniqueVaryingIndexAlongIv == -1)
*memRefDim = -1;
else
*memRefDim = numIndices - (uniqueVaryingIndexAlongIv + 1);
return true;
}
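
For reference, a worked example of the `memRefDim` convention computed above (illustration only, not part of the diff):

  // Given the 2-D access  load %A[%i, %j]  analysed against induction variable %j:
  //   index 0 (%i) is invariant along %j, index 1 (%j) varies, so
  //   uniqueVaryingIndexAlongIv = 1, numIndices = 2, and
  //   *memRefDim = numIndices - (uniqueVaryingIndexAlongIv + 1) = 0,
  //   i.e. the access varies along the fastest varying (innermost) memref dim.
  // Analysed against %i instead: uniqueVaryingIndexAlongIv = 0, *memRefDim = 1.
  // If no index varies along the loop, *memRefDim is set to -1.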
@@ -328,15 +336,12 @@ isVectorizableLoopBodyWithOpCond(AffineForOp loop,
return true;
}
bool mlir::isVectorizableLoopBodyAlongFastestVaryingMemRefDim(
AffineForOp loop, unsigned fastestVaryingDim) {
VectorizableOpFun fun([fastestVaryingDim](AffineForOp loop, Operation &op) {
bool mlir::isVectorizableLoopBody(AffineForOp loop, int *memRefDim) {
VectorizableOpFun fun([memRefDim](AffineForOp loop, Operation &op) {
auto load = op.dyn_cast<LoadOp>();
auto store = op.dyn_cast<StoreOp>();
return load ? isContiguousAccess(*loop.getInductionVar(), load,
fastestVaryingDim)
: isContiguousAccess(*loop.getInductionVar(), store,
fastestVaryingDim);
return load ? isContiguousAccess(loop.getInductionVar(), load, memRefDim)
: isContiguousAccess(loop.getInductionVar(), store, memRefDim);
});
return isVectorizableLoopBodyWithOpCond(loop, fun);
}
@@ -348,8 +353,8 @@ bool mlir::isVectorizableLoopBody(AffineForOp loop) {
/// Checks whether SSA dominance would be violated if a for op's body
/// operations are shifted by the specified shifts. This method checks if a
/// 'def' and all its uses have the same shift factor.
// TODO(mlir-team): extend this to check for memory-based dependence
// violation when we have the support.
// TODO(mlir-team): extend this to check for memory-based dependence violation
// when we have the support.
bool mlir::isInstwiseShiftValid(AffineForOp forOp, ArrayRef<uint64_t> shifts) {
auto *forBody = forOp.getBody();
assert(shifts.size() == forBody->getOperations().size());


@@ -101,25 +101,33 @@ Optional<SmallVector<unsigned, 4>> mlir::shapeRatio(VectorType superVectorType,
/// If no index is found to be invariant, 0 is added to the permutation_map and
/// corresponds to a vector broadcast along that dimension.
///
/// Returns an empty AffineMap if `enclosingLoopToVectorDim` is empty,
/// signalling that no permutation map can be constructed given
/// `enclosingLoopToVectorDim`.
///
/// Examples can be found in the documentation of `makePermutationMap`, in the
/// header file.
static AffineMap makePermutationMap(
MLIRContext *context,
llvm::iterator_range<Operation::operand_iterator> indices,
llvm::iterator_range<Operation::operand_iterator> operands,
const DenseMap<Operation *, unsigned> &enclosingLoopToVectorDim) {
if (enclosingLoopToVectorDim.empty())
return AffineMap();
MLIRContext *context =
enclosingLoopToVectorDim.begin()->getFirst()->getContext();
using functional::makePtrDynCaster;
using functional::map;
auto unwrappedIndices = map(makePtrDynCaster<Value, Value>(), indices);
SmallVector<Value *, 8> indices(operands);
SmallVector<AffineExpr, 4> perm(enclosingLoopToVectorDim.size(),
getAffineConstantExpr(0, context));
for (auto kvp : enclosingLoopToVectorDim) {
assert(kvp.second < perm.size());
auto invariants = getInvariantAccesses(
*kvp.first->cast<AffineForOp>().getInductionVar(), unwrappedIndices);
unsigned numIndices = unwrappedIndices.size();
kvp.first->cast<AffineForOp>().getInductionVar(), indices);
unsigned numIndices = indices.size();
unsigned countInvariantIndices = 0;
for (unsigned dim = 0; dim < numIndices; ++dim) {
if (!invariants.count(unwrappedIndices[dim])) {
if (!invariants.count(indices[dim])) {
assert(perm[kvp.second] == getAffineConstantExpr(0, context) &&
"permutationMap already has an entry along dim");
perm[kvp.second] = getAffineDimExpr(dim, context);
@@ -132,7 +140,7 @@ static AffineMap makePermutationMap(
"Vectorization prerequisite violated: at most 1 index may be "
"invariant wrt a vectorized loop");
}
return AffineMap::get(unwrappedIndices.size(), 0, perm, {});
return AffineMap::get(indices.size(), 0, perm, {});
}
/// Implementation detail that walks up the parents and records the ones with
@@ -170,13 +178,11 @@ AffineMap mlir::makePermutationMap(
}
if (auto load = op->dyn_cast<LoadOp>()) {
return ::makePermutationMap(op->getContext(), load.getIndices(),
enclosingLoopToVectorDim);
return ::makePermutationMap(load.getIndices(), enclosingLoopToVectorDim);
}
auto store = op->cast<StoreOp>();
return ::makePermutationMap(op->getContext(), store.getIndices(),
enclosingLoopToVectorDim);
return ::makePermutationMap(store.getIndices(), enclosingLoopToVectorDim);
}
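
As a concrete illustration of the map construction above (not part of the diff; the values come from the matmul test added at the end of this CL):

  // For  load %arg1[%i4, %i3]  with loop-to-vector-dim map {i2 -> 0, i3 -> 1}:
  //   neither index varies along i2, so perm[0] stays the constant 0;
  //   index 1 (%i3) varies along i3, so perm[1] = d1;
  // which yields the permutation map (d0, d1) -> (0, d1), i.e. a broadcast
  // along the vector dimension assigned to i2 (#map_proj_d0d1_zerod1 in the
  // test file).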
bool mlir::matcher::operatesOnSuperVectors(Operation &op,


@@ -553,7 +553,7 @@ static llvm::cl::OptionCategory clOptionsCategory("vectorize options");
static llvm::cl::list<int> clVirtualVectorSize(
"virtual-vector-size",
llvm::cl::desc("Specify n-D virtual vector size for vectorization"),
llvm::cl::desc("Specify an n-D virtual vector size for vectorization"),
llvm::cl::ZeroOrMore, llvm::cl::cat(clOptionsCategory));
static llvm::cl::list<int> clFastestVaryingPattern(
@@ -567,124 +567,84 @@ static llvm::cl::list<int> clFastestVaryingPattern(
/// Forward declaration.
static FilterFunctionType
isVectorizableLoopPtrFactory(const llvm::DenseSet<Operation *> &parallelLoops,
unsigned fastestVaryingMemRefDimension);
// Build a bunch of predetermined patterns that will be traversed in order.
// Due to the recursive nature of NestedPatterns, this captures
// arbitrarily nested pairs of loops at any position in the tree.
/// Note that this currently only matches 2 nested loops and will be extended.
// TODO(ntv): support 3-D loop patterns with a common reduction loop that can
// be matched to GEMMs.
static std::vector<NestedPattern>
defaultPatterns(const llvm::DenseSet<Operation *> &parallelLoops) {
using matcher::For;
return std::vector<NestedPattern>{
// 3-D patterns
For(isVectorizableLoopPtrFactory(parallelLoops, 2),
For(isVectorizableLoopPtrFactory(parallelLoops, 1),
For(isVectorizableLoopPtrFactory(parallelLoops, 0)))),
// for i { for j { A[??f(not i, not j), f(i, not j), f(not i, j)];}}
// test independently with:
// --test-fastest-varying=1 --test-fastest-varying=0
For(isVectorizableLoopPtrFactory(parallelLoops, 1),
For(isVectorizableLoopPtrFactory(parallelLoops, 0))),
// for i { for j { A[??f(not i, not j), f(i, not j), ?, f(not i, j)];}}
// test independently with:
// --test-fastest-varying=2 --test-fastest-varying=0
For(isVectorizableLoopPtrFactory(parallelLoops, 2),
For(isVectorizableLoopPtrFactory(parallelLoops, 0))),
// for i { for j { A[??f(not i, not j), f(i, not j), ?, ?, f(not i, j)];}}
// test independently with:
// --test-fastest-varying=3 --test-fastest-varying=0
For(isVectorizableLoopPtrFactory(parallelLoops, 3),
For(isVectorizableLoopPtrFactory(parallelLoops, 0))),
// for i { for j { A[??f(not i, not j), f(not i, j), f(i, not j)];}}
// test independently with:
// --test-fastest-varying=0 --test-fastest-varying=1
For(isVectorizableLoopPtrFactory(parallelLoops, 0),
For(isVectorizableLoopPtrFactory(parallelLoops, 1))),
// for i { for j { A[??f(not i, not j), f(not i, j), ?, f(i, not j)];}}
// test independently with:
// --test-fastest-varying=0 --test-fastest-varying=2
For(isVectorizableLoopPtrFactory(parallelLoops, 0),
For(isVectorizableLoopPtrFactory(parallelLoops, 2))),
// for i { for j { A[??f(not i, not j), f(not i, j), ?, ?, f(i, not j)];}}
// test independently with:
// --test-fastest-varying=0 --test-fastest-varying=3
For(isVectorizableLoopPtrFactory(parallelLoops, 0),
For(isVectorizableLoopPtrFactory(parallelLoops, 3))),
// for i { A[??f(not i) , f(i)];}
// test independently with: --test-fastest-varying=0
For(isVectorizableLoopPtrFactory(parallelLoops, 0)),
// for i { A[??f(not i) , f(i), ?];}
// test independently with: --test-fastest-varying=1
For(isVectorizableLoopPtrFactory(parallelLoops, 1)),
// for i { A[??f(not i) , f(i), ?, ?];}
// test independently with: --test-fastest-varying=2
For(isVectorizableLoopPtrFactory(parallelLoops, 2)),
// for i { A[??f(not i) , f(i), ?, ?, ?];}
// test independently with: --test-fastest-varying=3
For(isVectorizableLoopPtrFactory(parallelLoops, 3))};
}
int fastestVaryingMemRefDimension);
/// Creates a vectorization pattern from the command line arguments.
/// Up to 3-D patterns are supported.
/// If the command line argument requests a pattern of higher order, returns an
/// empty pattern list which will conservatively result in no vectorization.
static std::vector<NestedPattern>
makePatterns(const llvm::DenseSet<Operation *> &parallelLoops) {
makePatterns(const llvm::DenseSet<Operation *> &parallelLoops, int vectorRank,
ArrayRef<int64_t> fastestVaryingPattern) {
using matcher::For;
if (clFastestVaryingPattern.empty()) {
return defaultPatterns(parallelLoops);
}
switch (clFastestVaryingPattern.size()) {
int64_t d0 = fastestVaryingPattern.empty() ? -1 : fastestVaryingPattern[0];
int64_t d1 = fastestVaryingPattern.size() < 2 ? -1 : fastestVaryingPattern[1];
int64_t d2 = fastestVaryingPattern.size() < 3 ? -1 : fastestVaryingPattern[2];
switch (vectorRank) {
case 1:
return {For(isVectorizableLoopPtrFactory(parallelLoops,
clFastestVaryingPattern[0]))};
return {For(isVectorizableLoopPtrFactory(parallelLoops, d0))};
case 2:
return {For(
isVectorizableLoopPtrFactory(parallelLoops, clFastestVaryingPattern[0]),
For(isVectorizableLoopPtrFactory(parallelLoops,
clFastestVaryingPattern[1])))};
return {For(isVectorizableLoopPtrFactory(parallelLoops, d0),
For(isVectorizableLoopPtrFactory(parallelLoops, d1)))};
case 3:
return {For(
isVectorizableLoopPtrFactory(parallelLoops, clFastestVaryingPattern[0]),
For(isVectorizableLoopPtrFactory(parallelLoops,
clFastestVaryingPattern[1]),
For(isVectorizableLoopPtrFactory(parallelLoops,
clFastestVaryingPattern[2]))))};
default:
return {For(isVectorizableLoopPtrFactory(parallelLoops, d0),
For(isVectorizableLoopPtrFactory(parallelLoops, d1),
For(isVectorizableLoopPtrFactory(parallelLoops, d2))))};
default: {
return std::vector<NestedPattern>();
}
}
}
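
To illustrate how the new makePatterns arguments drive matching (not part of the diff; the flag values are taken from the RUN lines of the test file below):

  // With -virtual-vector-size 4 -virtual-vector-size 8 and no
  // --test-fastest-varying flags: vectorRank = 2, fastestVaryingPattern = {},
  // so d0 = d1 = -1 and the pattern built is
  //   For(isVectorizableLoopPtrFactory(parallelLoops, -1),
  //       For(isVectorizableLoopPtrFactory(parallelLoops, -1)))
  // A dimension of -1 accepts any loop whose body is vectorizable, regardless
  // of which unique memref dimension varies; this is what lets the matmul test
  // vectorize its two parallel loops in 2-D.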
namespace {
/// Base state for the vectorize pass.
/// Command line arguments are preempted by non-empty pass arguments.
struct Vectorize : public FunctionPass<Vectorize> {
Vectorize() {
if (!clVirtualVectorSize.empty()) {
vectorSizes.reserve(clVirtualVectorSize.size());
this->vectorSizes.assign(clVirtualVectorSize.begin(),
clVirtualVectorSize.end());
}
}
Vectorize(ArrayRef<int64_t> virtualVectorSize) {
if (clVirtualVectorSize.empty()) {
this->vectorSizes.assign(virtualVectorSize.begin(),
virtualVectorSize.end());
} else {
vectorSizes.reserve(clVirtualVectorSize.size());
this->vectorSizes.assign(clVirtualVectorSize.begin(),
clVirtualVectorSize.end());
}
}
Vectorize();
Vectorize(ArrayRef<int64_t> virtualVectorSize);
Vectorize(ArrayRef<int64_t> virtualVectorSize,
ArrayRef<int64_t> fastestVaryingPattern);
void runOnFunction() override;
// The virtual vector size that we vectorize to.
SmallVector<int64_t, 4> vectorSizes;
// Optionally, the fixed mapping from loop to fastest varying MemRef dimension
// for all the MemRefs within a loop pattern:
// the index represents the loop depth, the value represents the k^th
// fastest varying memory dimension.
// This is voluntarily restrictive and is meant to precisely target a
// particular loop/op pair, for testing purposes.
SmallVector<int64_t, 4> fastestVaryingPattern;
};
} // end anonymous namespace
/////// TODO(ntv): Hoist to a VectorizationStrategy.cpp when appropriate. //////
Vectorize::Vectorize() {
this->vectorSizes.assign(clVirtualVectorSize.begin(),
clVirtualVectorSize.end());
this->fastestVaryingPattern.assign(clFastestVaryingPattern.begin(),
clFastestVaryingPattern.end());
}
Vectorize::Vectorize(ArrayRef<int64_t> virtualVectorSize) : Vectorize() {
if (!virtualVectorSize.empty()) {
this->vectorSizes.assign(virtualVectorSize.begin(),
virtualVectorSize.end());
}
}
Vectorize::Vectorize(ArrayRef<int64_t> virtualVectorSize,
ArrayRef<int64_t> fastestVaryingPattern)
: Vectorize(virtualVectorSize) {
if (!fastestVaryingPattern.empty()) {
this->fastestVaryingPattern.assign(fastestVaryingPattern.begin(),
fastestVaryingPattern.end());
}
}
/////// TODO(ntv): Hoist to a VectorizationStrategy.cpp when appropriate.
/////////
namespace {
struct VectorizationStrategy {
@@ -833,12 +793,12 @@ void VectorizationState::registerReplacement(Value *key, Value *value) {
/// vectorized immediately. The resulting vector_transfer_read is immediately
/// registered to replace all uses of the LoadOp in this pattern's scope.
///
/// StoreOp are the terminals of the vectorizeNonTerminals call. They need
/// to be vectorized late once all the use-def chains have been traversed.
/// Additionally, they may have ssa-values operands which come from outside
/// the scope of the current pattern.
/// Such special cases force us to delay the vectorization of the stores
/// until the last step. Here we merely register the store operation.
/// StoreOp are the terminals of the vectorizeNonTerminals call. They need to be
/// vectorized late once all the use-def chains have been traversed.
/// Additionally, they may have ssa-values operands which come from outside the
/// scope of the current pattern.
/// Such special cases force us to delay the vectorization of the stores until
/// the last step. Here we merely register the store operation.
template <typename LoadOrStoreOpPointer>
static LogicalResult vectorizeRootOrTerminal(Value *iv,
LoadOrStoreOpPointer memoryOp,
@@ -860,6 +820,8 @@ static LogicalResult vectorizeRootOrTerminal(Value *iv,
if (opInst->template isa<LoadOp>()) {
auto permutationMap =
makePermutationMap(opInst, state->strategy->loopToVectorDim);
if (!permutationMap)
return LogicalResult::Failure;
LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ permutationMap: ");
LLVM_DEBUG(permutationMap.print(dbgs()));
FuncBuilder b(opInst);
@@ -907,22 +869,23 @@ static LogicalResult vectorizeAffineForOp(AffineForOp loop, int64_t step,
return success();
}
/// Returns a FilterFunctionType that can be used in NestedPattern to
/// match a loop whose underlying load/store accesses are all varying along the
/// `fastestVaryingMemRefDimension`.
/// TODO(ntv): In the future, allow more interesting mixed layout permutation
/// once we understand better the performance implications and we are confident
/// we can build a cost model and a search procedure.
/// Returns a FilterFunctionType that can be used in NestedPattern to match a
/// loop whose underlying load/store accesses are either invariant or all
/// varying along the `fastestVaryingMemRefDimension`.
static FilterFunctionType
isVectorizableLoopPtrFactory(const llvm::DenseSet<Operation *> &parallelLoops,
unsigned fastestVaryingMemRefDimension) {
int fastestVaryingMemRefDimension) {
return [&parallelLoops, fastestVaryingMemRefDimension](Operation &forOp) {
auto loop = forOp.cast<AffineForOp>();
auto parallelIt = parallelLoops.find(loop);
if (parallelIt == parallelLoops.end())
return false;
return isVectorizableLoopBodyAlongFastestVaryingMemRefDim(
loop, fastestVaryingMemRefDimension);
int memRefDim = -1;
auto vectorizableBody = isVectorizableLoopBody(loop, &memRefDim);
if (!vectorizableBody)
return false;
return memRefDim == -1 || fastestVaryingMemRefDimension == -1 ||
memRefDim == fastestVaryingMemRefDimension;
};
}
@@ -1047,15 +1010,15 @@ static Value *vectorizeOperand(Value *operand, Operation *op,
return nullptr;
};
/// Encodes Operation-specific behavior for vectorization. In general we
/// assume that all operands of an op must be vectorized but this is not always
/// true. In the future, it would be nice to have a trait that describes how a
/// Encodes Operation-specific behavior for vectorization. In general we assume
/// that all operands of an op must be vectorized but this is not always true.
/// In the future, it would be nice to have a trait that describes how a
/// particular operation vectorizes. For now we implement the case distinction
/// here.
/// Returns a vectorized form of an operation or nullptr if vectorization fails.
/// TODO(ntv): consider adding a trait to Op to describe how it gets vectorized.
/// Maybe some Ops are not vectorizable or require some tricky logic, we cannot
/// do one-off logic here; ideally it would be TableGen'd.
// TODO(ntv): consider adding a trait to Op to describe how it gets vectorized.
// Maybe some Ops are not vectorizable or require some tricky logic, we cannot
// do one-off logic here; ideally it would be TableGen'd.
static Operation *vectorizeOneOperation(Operation *opInst,
VectorizationState *state) {
// Sanity checks.
@@ -1074,6 +1037,8 @@ static Operation *vectorizeOneOperation(Operation *opInst,
FuncBuilder b(opInst);
auto permutationMap =
makePermutationMap(opInst, state->strategy->loopToVectorDim);
if (!permutationMap)
return nullptr;
LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ permutationMap: ");
LLVM_DEBUG(permutationMap.print(dbgs()));
auto transfer = b.create<VectorTransferWriteOp>(
@@ -1249,10 +1214,18 @@ static LogicalResult vectorizeRootMatch(NestedMatch m,
/// Applies vectorization to the current Function by searching over a bunch of
/// predetermined patterns.
void Vectorize::runOnFunction() {
Function &f = getFunction();
if (!fastestVaryingPattern.empty() &&
fastestVaryingPattern.size() != vectorSizes.size()) {
f.emitNote("Fastest varying pattern specified with different size than the "
"vector size.");
this->signalPassFailure();
return;
}
// Thread-safe RAII local context, BumpPtrAllocator freed on exit.
NestedPatternContext mlContext;
Function &f = getFunction();
llvm::DenseSet<Operation *> parallelLoops;
f.walkPostOrder([&parallelLoops](Operation *op) {
if (auto loop = op->dyn_cast<AffineForOp>()) {
@@ -1262,7 +1235,8 @@ void Vectorize::runOnFunction() {
}
});
for (auto &pat : makePatterns(parallelLoops)) {
for (auto &pat :
makePatterns(parallelLoops, vectorSizes.size(), fastestVaryingPattern)) {
LLVM_DEBUG(dbgs() << "\n******************************************");
LLVM_DEBUG(dbgs() << "\n******************************************");
LLVM_DEBUG(dbgs() << "\n[early-vect] new pattern on Function\n");


@@ -1,7 +1,15 @@
// RUN: mlir-opt %s -vectorize -virtual-vector-size 4 -virtual-vector-size 8 | FileCheck %s -check-prefix=VECT
// RUN: mlir-opt %s -vectorize -virtual-vector-size 32 -virtual-vector-size 256 --test-fastest-varying=1 --test-fastest-varying=0 | FileCheck %s
// Permutation maps used in vectorization.
// CHECK: #[[map_proj_d0d1_d0d1:map[0-9]+]] = (d0, d1) -> (d0, d1)
// CHECK-DAG: #[[map_id1:map[0-9]+]] = (d0) -> (d0)
// CHECK-DAG: #[[map_id2:map[0-9]+]] = (d0, d1) -> (d0, d1)
// CHECK-DAG: #[[map_proj_d0d1_zerod1:map[0-9]+]] = (d0, d1) -> (0, d1)
// CHECK-DAG: #[[map_proj_d0d1_d0zero:map[0-9]+]] = (d0, d1) -> (d0, 0)
// VECT-DAG: #[[map_id1:map[0-9]+]] = (d0) -> (d0)
// VECT-DAG: #[[map_id2:map[0-9]+]] = (d0, d1) -> (d0, d1)
// VECT-DAG: #[[map_proj_d0d1_zerod1:map[0-9]+]] = (d0, d1) -> (0, d1)
// VECT-DAG: #[[map_proj_d0d1_d0zero:map[0-9]+]] = (d0, d1) -> (d0, 0)
func @vec2d(%A : memref<?x?x?xf32>) {
%M = dim %A, 0 : memref<?x?x?xf32>
@@ -46,7 +54,7 @@ func @vector_add_2d(%M : index, %N : index) -> f32 {
affine.for %i0 = 0 to %M {
affine.for %i1 = 0 to %N {
// CHECK: [[C1:%.*]] = constant splat<vector<32x256xf32>, 1.000000e+00> : vector<32x256xf32>
// CHECK: vector_transfer_write [[C1]], {{.*}} {permutation_map: #[[map_proj_d0d1_d0d1]]} : vector<32x256xf32>, memref<?x?xf32>, index, index
// CHECK: vector_transfer_write [[C1]], {{.*}} {permutation_map: #[[map_id2]]} : vector<32x256xf32>, memref<?x?xf32>, index, index
// non-scoped %f1
store %f1, %A[%i0, %i1] : memref<?x?xf32, 0>
}
@@ -54,22 +62,22 @@ func @vector_add_2d(%M : index, %N : index) -> f32 {
affine.for %i2 = 0 to %M {
affine.for %i3 = 0 to %N {
// CHECK: [[C3:%.*]] = constant splat<vector<32x256xf32>, 2.000000e+00> : vector<32x256xf32>
// CHECK: vector_transfer_write [[C3]], {{.*}} {permutation_map: #[[map_proj_d0d1_d0d1]]} : vector<32x256xf32>, memref<?x?xf32>, index, index
// CHECK: vector_transfer_write [[C3]], {{.*}} {permutation_map: #[[map_id2]]} : vector<32x256xf32>, memref<?x?xf32>, index, index
// non-scoped %f2
store %f2, %B[%i2, %i3] : memref<?x?xf32, 0>
}
}
affine.for %i4 = 0 to %M {
affine.for %i5 = 0 to %N {
// CHECK: [[A5:%.*]] = vector_transfer_read %0, {{.*}} {permutation_map: #[[map_proj_d0d1_d0d1]]} : (memref<?x?xf32>, index, index) -> vector<32x256xf32>
// CHECK: [[B5:%.*]] = vector_transfer_read %1, {{.*}} {permutation_map: #[[map_proj_d0d1_d0d1]]} : (memref<?x?xf32>, index, index) -> vector<32x256xf32>
// CHECK: [[A5:%.*]] = vector_transfer_read %0, {{.*}} {permutation_map: #[[map_id2]]} : (memref<?x?xf32>, index, index) -> vector<32x256xf32>
// CHECK: [[B5:%.*]] = vector_transfer_read %1, {{.*}} {permutation_map: #[[map_id2]]} : (memref<?x?xf32>, index, index) -> vector<32x256xf32>
// CHECK: [[S5:%.*]] = addf [[A5]], [[B5]] : vector<32x256xf32>
// CHECK: [[SPLAT1:%.*]] = constant splat<vector<32x256xf32>, 1.000000e+00> : vector<32x256xf32>
// CHECK: [[S6:%.*]] = addf [[S5]], [[SPLAT1]] : vector<32x256xf32>
// CHECK: [[SPLAT2:%.*]] = constant splat<vector<32x256xf32>, 2.000000e+00> : vector<32x256xf32>
// CHECK: [[S7:%.*]] = addf [[S5]], [[SPLAT2]] : vector<32x256xf32>
// CHECK: [[S8:%.*]] = addf [[S7]], [[S6]] : vector<32x256xf32>
// CHECK: vector_transfer_write [[S8]], {{.*}} {permutation_map: #[[map_proj_d0d1_d0d1]]} : vector<32x256xf32>, memref<?x?xf32>, index, index
// CHECK: vector_transfer_write [[S8]], {{.*}} {permutation_map: #[[map_id2]]} : vector<32x256xf32>, memref<?x?xf32>, index, index
//
%a5 = load %A[%i4, %i5] : memref<?x?xf32, 0>
%b5 = load %B[%i4, %i5] : memref<?x?xf32, 0>
@@ -89,3 +97,46 @@ func @vector_add_2d(%M : index, %N : index) -> f32 {
return %res : f32
}
// VECT-LABEL: func @vectorize_matmul
func @vectorize_matmul(%arg0: memref<?x?xf32>, %arg1: memref<?x?xf32>, %arg2: memref<?x?xf32>) {
%c0 = constant 0 : index
%M = dim %arg0, 0 : memref<?x?xf32>
%K = dim %arg0, 1 : memref<?x?xf32>
%N = dim %arg2, 1 : memref<?x?xf32>
// VECT: %[[C0:.*]] = constant 0 : index
// VECT-NEXT: %[[M:.*]] = dim %arg0, 0 : memref<?x?xf32>
// VECT-NEXT: %[[K:.*]] = dim %arg0, 1 : memref<?x?xf32>
// VECT-NEXT: %[[N:.*]] = dim %arg2, 1 : memref<?x?xf32>
// VECT: {{.*}} #[[map_id1]](%[[M]]) step 4 {
// VECT-NEXT: {{.*}} #[[map_id1]](%[[N]]) step 8 {
// VECT: %[[VC0:.*]] = constant splat<vector<4x8xf32>, 0.000000e+00> : vector<4x8xf32>
// VECT-NEXT: vector_transfer_write %[[VC0]], %arg2, %{{.*}}, %{{.*}} {permutation_map: #[[map_id2]]}
affine.for %i0 = (d0) -> (d0)(%c0) to (d0) -> (d0)(%M) {
affine.for %i1 = (d0) -> (d0)(%c0) to (d0) -> (d0)(%N) {
%cst = constant 0.000000e+00 : f32
store %cst, %arg2[%i0, %i1] : memref<?x?xf32>
}
}
// VECT: affine.for %[[I2:.*]] = #[[map_id1]](%[[C0]]) to #[[map_id1]](%[[M]]) step 4 {
// VECT-NEXT: affine.for %[[I3:.*]] = #[[map_id1]](%[[C0]]) to #[[map_id1]](%[[N]]) step 8 {
// VECT-NEXT: affine.for %[[I4:.*]] = #map5(%[[C0]]) to #[[map_id1]](%[[K]]) {
// VECT-NEXT: %[[A:.*]] = vector_transfer_read %arg1, %[[I4]], %[[I3]] {permutation_map: #[[map_proj_d0d1_zerod1]]}
// VECT-NEXT: %[[B:.*]] = vector_transfer_read %arg0, %[[I2]], %[[I4]] {permutation_map: #[[map_proj_d0d1_d0zero]]}
// VECT-NEXT: %[[C:.*]] = mulf %[[B]], %[[A]] : vector<4x8xf32>
// VECT-NEXT: %[[D:.*]] = vector_transfer_read %arg2, %[[I2]], %[[I3]] {permutation_map: #[[map_id2]]}
// VECT-NEXT: %[[E:.*]] = addf %[[D]], %[[C]] : vector<4x8xf32>
// VECT-NEXT: vector_transfer_write %[[E]], %arg2, %[[I2]], %[[I3]] {permutation_map: #[[map_id2]]} : vector<4x8xf32>, memref<?x?xf32>, index, index
affine.for %i2 = (d0) -> (d0)(%c0) to (d0) -> (d0)(%M) {
affine.for %i3 = (d0) -> (d0)(%c0) to (d0) -> (d0)(%N) {
affine.for %i4 = (d0) -> (d0)(%c0) to (d0) -> (d0)(%K) {
%6 = load %arg1[%i4, %i3] : memref<?x?xf32>
%7 = load %arg0[%i2, %i4] : memref<?x?xf32>
%8 = mulf %7, %6 : f32
%9 = load %arg2[%i2, %i3] : memref<?x?xf32>
%10 = addf %9, %8 : f32
store %10, %arg2[%i2, %i3] : memref<?x?xf32>
}
}
}
return
}