Refactor vectorization patterns
This CL removes the vectorize pass's reliance on a `fastestVaryingDim` parameter. That parameter is a restriction meant to make it easier to target a particular loop/memref combination for vectorization and is mainly used for testing. It also had the side effect of restricting vectorization patterns to only those in which all memrefs were contiguous along the same loop dimension, which prevented matmul from vectorizing in 2-D. This CL removes the restriction and adds a matmul test that vectorizes in 2-D along the parallel loops. Support for reduction loops is left for future work.

PiperOrigin-RevId: 240993827
This commit is contained in:
parent 3ddd0411d0
commit 094ca64ab0
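To make the relaxed matching rule concrete, here is a small standalone sketch (plain C++, not MLIR code; the function name `loopMatchesPattern` is made up for illustration). It mirrors the predicate returned by the updated `isVectorizableLoopPtrFactory` in the diff below: the analysis reports a single varying MemRef dimension per loop (`memRefDim`, or -1 for an invariant access), and the per-loop dimension requested by the testing flags (-1 when unspecified) only constrains the match when both sides are concrete.

```cpp
#include <cassert>

// Toy stand-in for the decision made by the new isVectorizableLoopPtrFactory:
// `memRefDim` is the unique varying MemRef dimension found by the analysis
// (-1 if every access is invariant along the loop); `requestedDim` is the
// dimension pinned by --test-fastest-varying (-1 when nothing is pinned).
static bool loopMatchesPattern(int memRefDim, int requestedDim) {
  return memRefDim == -1 || requestedDim == -1 || memRefDim == requestedDim;
}

int main() {
  assert(loopMatchesPattern(0, -1));  // no pin requested: any varying dim is fine
  assert(loopMatchesPattern(-1, 1));  // invariant access: matches any request
  assert(loopMatchesPattern(1, 1));   // pinned dimension agrees with the analysis
  assert(!loopMatchesPattern(0, 1));  // pinned dimension disagrees: no match
  return 0;
}
```
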
@@ -71,34 +71,34 @@ uint64_t getLargestDivisorOfTripCount(AffineForOp forOp);
 ///
 /// Returns false in cases with more than one AffineApplyOp, this is
 /// conservative.
-bool isAccessInvariant(Value &iv, Value &index);
+bool isAccessInvariant(Value *iv, Value *index);
 
 /// Given an induction variable `iv` of type AffineForOp and `indices` of type
 /// IndexType, returns the set of `indices` that are independent of `iv`.
 ///
 /// Prerequisites (inherited from `isAccessInvariant` above):
 /// 1. `iv` and `indices` of the proper type;
-/// 2. at most one reachable AffineApplyOp from index;
+/// 2. at most one affine.apply is reachable from each index in `indices`;
 ///
-/// Returns false in cases with more than one AffineApplyOp, this is
-/// conservative.
+/// Emits a note if it encounters a chain of affine.apply and conservatively
+/// those cases.
 llvm::DenseSet<Value *, llvm::DenseMapInfo<Value *>>
-getInvariantAccesses(Value &iv, llvm::ArrayRef<Value *> indices);
+getInvariantAccesses(Value *iv, llvm::ArrayRef<Value *> indices);
 
 using VectorizableLoopFun = std::function<bool(AffineForOp)>;
 
 /// Checks whether the loop is structurally vectorizable; i.e.:
 /// 1. no conditionals are nested under the loop;
 /// 2. all nested load/stores are to scalar MemRefs.
 /// TODO(ntv): relax the no-conditionals restriction
 bool isVectorizableLoopBody(AffineForOp loop);
 
 /// Checks whether the loop is structurally vectorizable and that all the LoadOp
 /// and StoreOp matched have access indexing functions that are are either:
 /// 1. invariant along the loop induction variable created by 'loop';
-/// 2. varying along the 'fastestVaryingDim' memory dimension.
-bool isVectorizableLoopBodyAlongFastestVaryingMemRefDim(
-    AffineForOp loop, unsigned fastestVaryingDim);
+/// 2. varying along at most one memory dimension. If such a unique dimension
+/// is found, it is written into `memRefDim`.
+bool isVectorizableLoopBody(AffineForOp loop, int *memRefDim);
 
 /// Checks where SSA dominance would be violated if a for op's body
 /// operations are shifted by the specified shifts. This method checks if a

@@ -174,36 +174,36 @@ uint64_t mlir::getLargestDivisorOfTripCount(AffineForOp forOp) {
   return gcd.getValue();
 }
 
-bool mlir::isAccessInvariant(Value &iv, Value &index) {
-  assert(isForInductionVar(&iv) && "iv must be a AffineForOp");
-  assert(index.getType().isa<IndexType>() && "index must be of IndexType");
+bool mlir::isAccessInvariant(Value *iv, Value *index) {
+  assert(isForInductionVar(iv) && "iv must be a AffineForOp");
+  assert(index->getType().isa<IndexType>() && "index must be of IndexType");
   SmallVector<Operation *, 4> affineApplyOps;
-  getReachableAffineApplyOps({&index}, affineApplyOps);
+  getReachableAffineApplyOps({index}, affineApplyOps);
 
   if (affineApplyOps.empty()) {
     // Pointer equality test because of Value pointer semantics.
-    return &index != &iv;
+    return index != iv;
   }
 
   if (affineApplyOps.size() > 1) {
-    affineApplyOps[0]->emitError(
+    affineApplyOps[0]->emitNote(
         "CompositionAffineMapsPass must have been run: there should be at most "
-        "one AffineApplyOp");
+        "one AffineApplyOp, returning false conservatively.");
     return false;
   }
 
   auto composeOp = affineApplyOps[0]->cast<AffineApplyOp>();
   // We need yet another level of indirection because the `dim` index of the
   // access may not correspond to the `dim` index of composeOp.
-  return !(AffineValueMap(composeOp).isFunctionOf(0, &iv));
+  return !(AffineValueMap(composeOp).isFunctionOf(0, iv));
 }
 
 llvm::DenseSet<Value *>
-mlir::getInvariantAccesses(Value &iv, llvm::ArrayRef<Value *> indices) {
+mlir::getInvariantAccesses(Value *iv, llvm::ArrayRef<Value *> indices) {
   llvm::DenseSet<Value *> res;
   for (unsigned idx = 0, n = indices.size(); idx < n; ++idx) {
     auto *val = indices[idx];
-    if (isAccessInvariant(iv, *val)) {
+    if (isAccessInvariant(iv, val)) {
       res.insert(val);
     }
   }

@@ -213,31 +213,30 @@ mlir::getInvariantAccesses(Value &iv, llvm::ArrayRef<Value *> indices) {
 /// Given:
 /// 1. an induction variable `iv` of type AffineForOp;
 /// 2. a `memoryOp` of type const LoadOp& or const StoreOp&;
-/// 3. the index of the `fastestVaryingDim` along which to check;
-/// determines whether `memoryOp`[`fastestVaryingDim`] is a contiguous access
-/// along `iv`.
-/// Contiguous is defined as either invariant or varying only along
-/// `fastestVaryingDim`.
+/// determines whether `memoryOp` has a contiguous access along `iv`. Contiguous
+/// is defined as either invariant or varying only along a unique MemRef dim.
+/// Upon success, the unique MemRef dim is written in `memRefDim` (or -1 to
+/// convey the memRef access is invariant along `iv`).
 ///
 /// Prerequisites:
-/// 1. `iv` of the proper type;
-/// 2. the MemRef accessed by `memoryOp` has no layout map or at most an
+/// 1. `memRefDim` ~= nullptr;
+/// 2. `iv` of the proper type;
+/// 3. the MemRef accessed by `memoryOp` has no layout map or at most an
 /// identity layout map.
 ///
 /// Currently only supports no layoutMap or identity layoutMap in the MemRef.
-/// Returns false if the MemRef has a non-identity layoutMap or more than
-/// 1 layoutMap. This is conservative.
+/// Returns false if the MemRef has a non-identity layoutMap or more than 1
+/// layoutMap. This is conservative.
 ///
 // TODO(ntv): check strides.
 template <typename LoadOrStoreOp>
-static bool isContiguousAccess(Value &iv, LoadOrStoreOp memoryOp,
-                               unsigned fastestVaryingDim) {
+static bool isContiguousAccess(Value *iv, LoadOrStoreOp memoryOp,
+                               int *memRefDim) {
   static_assert(std::is_same<LoadOrStoreOp, LoadOp>::value ||
                     std::is_same<LoadOrStoreOp, StoreOp>::value,
                 "Must be called on either const LoadOp & or const StoreOp &");
+  assert(memRefDim && "memRefDim == nullptr");
   auto memRefType = memoryOp.getMemRefType();
-  if (fastestVaryingDim >= memRefType.getRank())
-    return false;
-
   auto layoutMap = memRefType.getAffineMaps();
   // TODO(ntv): remove dependence on Builder once we support non-identity

@@ -250,17 +249,26 @@ static bool isContiguousAccess(Value &iv, LoadOrStoreOp memoryOp,
     return memoryOp.emitError("NYI: non-trivial layoutMap"), false;
   }
 
+  int uniqueVaryingIndexAlongIv = -1;
   auto indices = memoryOp.getIndices();
-  auto numIndices = llvm::size(indices);
-  unsigned d = 0;
-  for (auto index : indices) {
-    if (fastestVaryingDim == (numIndices - 1) - d++) {
-      continue;
-    }
-    if (!isAccessInvariant(iv, *index)) {
-      return false;
+  unsigned numIndices = llvm::size(indices);
+  unsigned dim = 0;
+  for (auto *index : indices) {
+    if (!isAccessInvariant(iv, index)) {
+      if (uniqueVaryingIndexAlongIv != -1) {
+        // 2+ varying indices -> do not vectorize along iv.
+        return false;
+      }
+      uniqueVaryingIndexAlongIv = dim;
     }
+    ++dim;
   }
+
+  if (uniqueVaryingIndexAlongIv == -1)
+    *memRefDim = -1;
+  else
+    *memRefDim = numIndices - (uniqueVaryingIndexAlongIv + 1);
+
   return true;
 }

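As a quick sanity check of the dimension numbering used above, the sketch below (plain C++, not MLIR code; `toMemRefDim` is a hypothetical helper) reproduces the arithmetic `numIndices - (uniqueVaryingIndexAlongIv + 1)`: the reported `memRefDim` counts from the fastest-varying (last) index of the access, and -1 means the access is invariant along the loop.

```cpp
#include <cassert>

// Mirrors the *memRefDim computation in the new isContiguousAccess:
// index positions are counted left-to-right, but the reported dimension is
// counted from the fastest-varying (last) index of the access.
static int toMemRefDim(int numIndices, int uniqueVaryingIndexAlongIv) {
  if (uniqueVaryingIndexAlongIv == -1)
    return -1; // access is invariant along the loop induction variable
  return numIndices - (uniqueVaryingIndexAlongIv + 1);
}

int main() {
  assert(toMemRefDim(2, 1) == 0);   // A[i, j] with j varying: fastest-varying dim
  assert(toMemRefDim(2, 0) == 1);   // A[i, j] with i varying: next-outer dim
  assert(toMemRefDim(3, 2) == 0);   // A[i, j, k] with k varying
  assert(toMemRefDim(3, -1) == -1); // invariant access
  return 0;
}
```
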
@@ -328,15 +336,12 @@ isVectorizableLoopBodyWithOpCond(AffineForOp loop,
   return true;
 }
 
-bool mlir::isVectorizableLoopBodyAlongFastestVaryingMemRefDim(
-    AffineForOp loop, unsigned fastestVaryingDim) {
-  VectorizableOpFun fun([fastestVaryingDim](AffineForOp loop, Operation &op) {
+bool mlir::isVectorizableLoopBody(AffineForOp loop, int *memRefDim) {
+  VectorizableOpFun fun([memRefDim](AffineForOp loop, Operation &op) {
     auto load = op.dyn_cast<LoadOp>();
     auto store = op.dyn_cast<StoreOp>();
-    return load ? isContiguousAccess(*loop.getInductionVar(), load,
-                                     fastestVaryingDim)
-                : isContiguousAccess(*loop.getInductionVar(), store,
-                                     fastestVaryingDim);
+    return load ? isContiguousAccess(loop.getInductionVar(), load, memRefDim)
+                : isContiguousAccess(loop.getInductionVar(), store, memRefDim);
   });
   return isVectorizableLoopBodyWithOpCond(loop, fun);
 }

@@ -348,8 +353,8 @@ bool mlir::isVectorizableLoopBody(AffineForOp loop) {
 /// Checks whether SSA dominance would be violated if a for op's body
 /// operations are shifted by the specified shifts. This method checks if a
 /// 'def' and all its uses have the same shift factor.
-// TODO(mlir-team): extend this to check for memory-based dependence
-// violation when we have the support.
+// TODO(mlir-team): extend this to check for memory-based dependence violation
+// when we have the support.
 bool mlir::isInstwiseShiftValid(AffineForOp forOp, ArrayRef<uint64_t> shifts) {
   auto *forBody = forOp.getBody();
   assert(shifts.size() == forBody->getOperations().size());

@@ -101,25 +101,33 @@ Optional<SmallVector<unsigned, 4>> mlir::shapeRatio(VectorType superVectorType,
 /// If no index is found to be invariant, 0 is added to the permutation_map and
 /// corresponds to a vector broadcast along that dimension.
 ///
+/// Returns an empty AffineMap if `enclosingLoopToVectorDim` is empty,
+/// signalling that no permutation map can be constructed given
+/// `enclosingLoopToVectorDim`.
+///
 /// Examples can be found in the documentation of `makePermutationMap`, in the
 /// header file.
 static AffineMap makePermutationMap(
-    MLIRContext *context,
-    llvm::iterator_range<Operation::operand_iterator> indices,
+    llvm::iterator_range<Operation::operand_iterator> operands,
     const DenseMap<Operation *, unsigned> &enclosingLoopToVectorDim) {
-  using functional::makePtrDynCaster;
-  using functional::map;
-  auto unwrappedIndices = map(makePtrDynCaster<Value, Value>(), indices);
+  if (enclosingLoopToVectorDim.empty())
+    return AffineMap();
+  MLIRContext *context =
+      enclosingLoopToVectorDim.begin()->getFirst()->getContext();
+  SmallVector<Value *, 8> indices(operands);
   SmallVector<AffineExpr, 4> perm(enclosingLoopToVectorDim.size(),
                                   getAffineConstantExpr(0, context));
 
   for (auto kvp : enclosingLoopToVectorDim) {
     assert(kvp.second < perm.size());
     auto invariants = getInvariantAccesses(
-        *kvp.first->cast<AffineForOp>().getInductionVar(), unwrappedIndices);
-    unsigned numIndices = unwrappedIndices.size();
+        kvp.first->cast<AffineForOp>().getInductionVar(), indices);
+    unsigned numIndices = indices.size();
     unsigned countInvariantIndices = 0;
     for (unsigned dim = 0; dim < numIndices; ++dim) {
-      if (!invariants.count(unwrappedIndices[dim])) {
+      if (!invariants.count(indices[dim])) {
         assert(perm[kvp.second] == getAffineConstantExpr(0, context) &&
                "permutationMap already has an entry along dim");
         perm[kvp.second] = getAffineDimExpr(dim, context);

@@ -132,7 +140,7 @@ static AffineMap makePermutationMap(
                "Vectorization prerequisite violated: at most 1 index may be "
                "invariant wrt a vectorized loop");
   }
-  return AffineMap::get(unwrappedIndices.size(), 0, perm, {});
+  return AffineMap::get(indices.size(), 0, perm, {});
 }
 
 /// Implementation detail that walks up the parents and records the ones with

@@ -170,13 +178,11 @@ AffineMap mlir::makePermutationMap(
   }
 
   if (auto load = op->dyn_cast<LoadOp>()) {
-    return ::makePermutationMap(op->getContext(), load.getIndices(),
-                                enclosingLoopToVectorDim);
+    return ::makePermutationMap(load.getIndices(), enclosingLoopToVectorDim);
   }
 
   auto store = op->cast<StoreOp>();
-  return ::makePermutationMap(op->getContext(), store.getIndices(),
-                              enclosingLoopToVectorDim);
+  return ::makePermutationMap(store.getIndices(), enclosingLoopToVectorDim);
 }
 
 bool mlir::matcher::operatesOnSuperVectors(Operation &op,

@@ -553,7 +553,7 @@ static llvm::cl::OptionCategory clOptionsCategory("vectorize options");
 
 static llvm::cl::list<int> clVirtualVectorSize(
     "virtual-vector-size",
-    llvm::cl::desc("Specify n-D virtual vector size for vectorization"),
+    llvm::cl::desc("Specify an n-D virtual vector size for vectorization"),
    llvm::cl::ZeroOrMore, llvm::cl::cat(clOptionsCategory));
 
 static llvm::cl::list<int> clFastestVaryingPattern(

@@ -567,124 +567,84 @@ static llvm::cl::list<int> clFastestVaryingPattern(
 /// Forward declaration.
 static FilterFunctionType
 isVectorizableLoopPtrFactory(const llvm::DenseSet<Operation *> &parallelLoops,
-                             unsigned fastestVaryingMemRefDimension);
-
-// Build a bunch of predetermined patterns that will be traversed in order.
-// Due to the recursive nature of NestedPatterns, this captures
-// arbitrarily nested pairs of loops at any position in the tree.
-/// Note that this currently only matches 2 nested loops and will be extended.
-// TODO(ntv): support 3-D loop patterns with a common reduction loop that can
-// be matched to GEMMs.
-static std::vector<NestedPattern>
-defaultPatterns(const llvm::DenseSet<Operation *> &parallelLoops) {
-  using matcher::For;
-  return std::vector<NestedPattern>{
-      // 3-D patterns
-      For(isVectorizableLoopPtrFactory(parallelLoops, 2),
-          For(isVectorizableLoopPtrFactory(parallelLoops, 1),
-              For(isVectorizableLoopPtrFactory(parallelLoops, 0)))),
-      // for i { for j { A[??f(not i, not j), f(i, not j), f(not i, j)];}}
-      // test independently with:
-      // --test-fastest-varying=1 --test-fastest-varying=0
-      For(isVectorizableLoopPtrFactory(parallelLoops, 1),
-          For(isVectorizableLoopPtrFactory(parallelLoops, 0))),
-      // for i { for j { A[??f(not i, not j), f(i, not j), ?, f(not i, j)];}}
-      // test independently with:
-      // --test-fastest-varying=2 --test-fastest-varying=0
-      For(isVectorizableLoopPtrFactory(parallelLoops, 2),
-          For(isVectorizableLoopPtrFactory(parallelLoops, 0))),
-      // for i { for j { A[??f(not i, not j), f(i, not j), ?, ?, f(not i, j)];}}
-      // test independently with:
-      // --test-fastest-varying=3 --test-fastest-varying=0
-      For(isVectorizableLoopPtrFactory(parallelLoops, 3),
-          For(isVectorizableLoopPtrFactory(parallelLoops, 0))),
-      // for i { for j { A[??f(not i, not j), f(not i, j), f(i, not j)];}}
-      // test independently with:
-      // --test-fastest-varying=0 --test-fastest-varying=1
-      For(isVectorizableLoopPtrFactory(parallelLoops, 0),
-          For(isVectorizableLoopPtrFactory(parallelLoops, 1))),
-      // for i { for j { A[??f(not i, not j), f(not i, j), ?, f(i, not j)];}}
-      // test independently with:
-      // --test-fastest-varying=0 --test-fastest-varying=2
-      For(isVectorizableLoopPtrFactory(parallelLoops, 0),
-          For(isVectorizableLoopPtrFactory(parallelLoops, 2))),
-      // for i { for j { A[??f(not i, not j), f(not i, j), ?, ?, f(i, not j)];}}
-      // test independently with:
-      // --test-fastest-varying=0 --test-fastest-varying=3
-      For(isVectorizableLoopPtrFactory(parallelLoops, 0),
-          For(isVectorizableLoopPtrFactory(parallelLoops, 3))),
-      // for i { A[??f(not i) , f(i)];}
-      // test independently with: --test-fastest-varying=0
-      For(isVectorizableLoopPtrFactory(parallelLoops, 0)),
-      // for i { A[??f(not i) , f(i), ?];}
-      // test independently with: --test-fastest-varying=1
-      For(isVectorizableLoopPtrFactory(parallelLoops, 1)),
-      // for i { A[??f(not i) , f(i), ?, ?];}
-      // test independently with: --test-fastest-varying=2
-      For(isVectorizableLoopPtrFactory(parallelLoops, 2)),
-      // for i { A[??f(not i) , f(i), ?, ?, ?];}
-      // test independently with: --test-fastest-varying=3
-      For(isVectorizableLoopPtrFactory(parallelLoops, 3))};
-}
+                             int fastestVaryingMemRefDimension);
 
 /// Creates a vectorization pattern from the command line arguments.
 /// Up to 3-D patterns are supported.
 /// If the command line argument requests a pattern of higher order, returns an
 /// empty pattern list which will conservatively result in no vectorization.
 static std::vector<NestedPattern>
-makePatterns(const llvm::DenseSet<Operation *> &parallelLoops) {
+makePatterns(const llvm::DenseSet<Operation *> &parallelLoops, int vectorRank,
+             ArrayRef<int64_t> fastestVaryingPattern) {
   using matcher::For;
-  if (clFastestVaryingPattern.empty()) {
-    return defaultPatterns(parallelLoops);
-  }
-  switch (clFastestVaryingPattern.size()) {
+  int64_t d0 = fastestVaryingPattern.empty() ? -1 : fastestVaryingPattern[0];
+  int64_t d1 = fastestVaryingPattern.size() < 2 ? -1 : fastestVaryingPattern[1];
+  int64_t d2 = fastestVaryingPattern.size() < 3 ? -1 : fastestVaryingPattern[2];
+  switch (vectorRank) {
   case 1:
-    return {For(isVectorizableLoopPtrFactory(parallelLoops,
-                                             clFastestVaryingPattern[0]))};
+    return {For(isVectorizableLoopPtrFactory(parallelLoops, d0))};
   case 2:
-    return {For(
-        isVectorizableLoopPtrFactory(parallelLoops, clFastestVaryingPattern[0]),
-        For(isVectorizableLoopPtrFactory(parallelLoops,
-                                         clFastestVaryingPattern[1])))};
+    return {For(isVectorizableLoopPtrFactory(parallelLoops, d0),
+                For(isVectorizableLoopPtrFactory(parallelLoops, d1)))};
   case 3:
-    return {For(
-        isVectorizableLoopPtrFactory(parallelLoops, clFastestVaryingPattern[0]),
-        For(isVectorizableLoopPtrFactory(parallelLoops,
-                                         clFastestVaryingPattern[1]),
-            For(isVectorizableLoopPtrFactory(parallelLoops,
-                                             clFastestVaryingPattern[2]))))};
-  default:
+    return {For(isVectorizableLoopPtrFactory(parallelLoops, d0),
+                For(isVectorizableLoopPtrFactory(parallelLoops, d1),
+                    For(isVectorizableLoopPtrFactory(parallelLoops, d2))))};
+  default: {
     return std::vector<NestedPattern>();
+  }
   }
 }
 
 namespace {
 
 /// Base state for the vectorize pass.
+/// Command line arguments are preempted by non-empty pass arguments.
 struct Vectorize : public FunctionPass<Vectorize> {
-  Vectorize() {
-    if (!clVirtualVectorSize.empty()) {
-      vectorSizes.reserve(clVirtualVectorSize.size());
-      this->vectorSizes.assign(clVirtualVectorSize.begin(),
-                               clVirtualVectorSize.end());
-    }
-  }
-  Vectorize(ArrayRef<int64_t> virtualVectorSize) {
-    if (clVirtualVectorSize.empty()) {
-      this->vectorSizes.assign(virtualVectorSize.begin(),
-                               virtualVectorSize.end());
-    } else {
-      vectorSizes.reserve(clVirtualVectorSize.size());
-      this->vectorSizes.assign(clVirtualVectorSize.begin(),
-                               clVirtualVectorSize.end());
-    }
-  }
+  Vectorize();
+  Vectorize(ArrayRef<int64_t> virtualVectorSize);
+  Vectorize(ArrayRef<int64_t> virtualVectorSize,
+            ArrayRef<int64_t> fastestVaryingPattern);
   void runOnFunction() override;
 
   // The virtual vector size that we vectorize to.
   SmallVector<int64_t, 4> vectorSizes;
+  // Optionally, the fixed mapping from loop to fastest varying MemRef dimension
+  // for all the MemRefs within a loop pattern:
+  // the index represents the loop depth, the value represents the k^th
+  // fastest varying memory dimension.
+  // This is voluntarily restrictive and is meant to precisely target a
+  // particular loop/op pair, for testing purposes.
+  SmallVector<int64_t, 4> fastestVaryingPattern;
 };
 
 } // end anonymous namespace
 
-/////// TODO(ntv): Hoist to a VectorizationStrategy.cpp when appropriate. //////
+Vectorize::Vectorize() {
+  this->vectorSizes.assign(clVirtualVectorSize.begin(),
+                           clVirtualVectorSize.end());
+  this->fastestVaryingPattern.assign(clFastestVaryingPattern.begin(),
+                                     clFastestVaryingPattern.end());
+}
+
+Vectorize::Vectorize(ArrayRef<int64_t> virtualVectorSize) : Vectorize() {
+  if (!virtualVectorSize.empty()) {
+    this->vectorSizes.assign(virtualVectorSize.begin(),
+                             virtualVectorSize.end());
+  }
+}
+
+Vectorize::Vectorize(ArrayRef<int64_t> virtualVectorSize,
+                     ArrayRef<int64_t> fastestVaryingPattern)
+    : Vectorize(virtualVectorSize) {
+  if (!fastestVaryingPattern.empty()) {
+    this->fastestVaryingPattern.assign(fastestVaryingPattern.begin(),
+                                       fastestVaryingPattern.end());
+  }
+}
+
+/////// TODO(ntv): Hoist to a VectorizationStrategy.cpp when appropriate.
+/////////
 namespace {
 
 struct VectorizationStrategy {

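The defaulting in the new `makePatterns` is easy to miss, so here is a minimal sketch (plain C++, not MLIR code; `requestedDims` is a hypothetical helper) of how the per-depth requests d0/d1/d2 are derived: any loop depth not covered by the command-line `fastestVaryingPattern` falls back to -1, which `isVectorizableLoopPtrFactory` (declared above, defined in a later hunk) treats as "any MemRef dimension".

```cpp
#include <array>
#include <cstdint>
#include <vector>

// Toy model of the d0/d1/d2 computation in the new makePatterns: entry i of
// the command-line pattern pins the fastest-varying dimension for loop depth
// i, and missing entries default to -1 (unconstrained).
static std::array<int64_t, 3>
requestedDims(const std::vector<int64_t> &fastestVaryingPattern) {
  auto at = [&](size_t i) {
    return i < fastestVaryingPattern.size() ? fastestVaryingPattern[i]
                                            : int64_t(-1);
  };
  return {at(0), at(1), at(2)};
}

int main() {
  // e.g. --test-fastest-varying=1 --test-fastest-varying=0 pins depths 0 and 1.
  auto pinned = requestedDims({1, 0});
  // No flags: every depth is unconstrained and any contiguous loop can match.
  auto unconstrained = requestedDims({});
  return (pinned[2] == -1 && unconstrained[0] == -1) ? 0 : 1;
}
```
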
@@ -833,12 +793,12 @@ void VectorizationState::registerReplacement(Value *key, Value *value) {
 /// vectorized immediately. The resulting vector_transfer_read is immediately
 /// registered to replace all uses of the LoadOp in this pattern's scope.
 ///
-/// StoreOp are the terminals of the vectorizeNonTerminals call. They need
-/// to be vectorized late once all the use-def chains have been traversed.
-/// Additionally, they may have ssa-values operands which come from outside
-/// the scope of the current pattern.
-/// Such special cases force us to delay the vectorization of the stores
-/// until the last step. Here we merely register the store operation.
+/// StoreOp are the terminals of the vectorizeNonTerminals call. They need to be
+/// vectorized late once all the use-def chains have been traversed.
+/// Additionally, they may have ssa-values operands which come from outside the
+/// scope of the current pattern.
+/// Such special cases force us to delay the vectorization of the stores until
+/// the last step. Here we merely register the store operation.
 template <typename LoadOrStoreOpPointer>
 static LogicalResult vectorizeRootOrTerminal(Value *iv,
                                              LoadOrStoreOpPointer memoryOp,

@@ -860,6 +820,8 @@ static LogicalResult vectorizeRootOrTerminal(Value *iv,
   if (opInst->template isa<LoadOp>()) {
     auto permutationMap =
         makePermutationMap(opInst, state->strategy->loopToVectorDim);
+    if (!permutationMap)
+      return LogicalResult::Failure;
     LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ permutationMap: ");
     LLVM_DEBUG(permutationMap.print(dbgs()));
     FuncBuilder b(opInst);

@@ -907,22 +869,23 @@ static LogicalResult vectorizeAffineForOp(AffineForOp loop, int64_t step,
   return success();
 }
 
-/// Returns a FilterFunctionType that can be used in NestedPattern to
-/// match a loop whose underlying load/store accesses are all varying along the
-/// `fastestVaryingMemRefDimension`.
-/// TODO(ntv): In the future, allow more interesting mixed layout permutation
-/// once we understand better the performance implications and we are confident
-/// we can build a cost model and a search procedure.
+/// Returns a FilterFunctionType that can be used in NestedPattern to match a
+/// loop whose underlying load/store accesses are either invariant or all
+// varying along the `fastestVaryingMemRefDimension`.
 static FilterFunctionType
 isVectorizableLoopPtrFactory(const llvm::DenseSet<Operation *> &parallelLoops,
-                             unsigned fastestVaryingMemRefDimension) {
+                             int fastestVaryingMemRefDimension) {
   return [&parallelLoops, fastestVaryingMemRefDimension](Operation &forOp) {
     auto loop = forOp.cast<AffineForOp>();
     auto parallelIt = parallelLoops.find(loop);
     if (parallelIt == parallelLoops.end())
       return false;
-    return isVectorizableLoopBodyAlongFastestVaryingMemRefDim(
-        loop, fastestVaryingMemRefDimension);
+    int memRefDim = -1;
+    auto vectorizableBody = isVectorizableLoopBody(loop, &memRefDim);
+    if (!vectorizableBody)
+      return false;
+    return memRefDim == -1 || fastestVaryingMemRefDimension == -1 ||
+           memRefDim == fastestVaryingMemRefDimension;
   };
 }

@@ -1047,15 +1010,15 @@ static Value *vectorizeOperand(Value *operand, Operation *op,
   return nullptr;
 };
 
-/// Encodes Operation-specific behavior for vectorization. In general we
-/// assume that all operands of an op must be vectorized but this is not always
-/// true. In the future, it would be nice to have a trait that describes how a
+/// Encodes Operation-specific behavior for vectorization. In general we assume
+/// that all operands of an op must be vectorized but this is not always true.
+/// In the future, it would be nice to have a trait that describes how a
 /// particular operation vectorizes. For now we implement the case distinction
 /// here.
 /// Returns a vectorized form of an operation or nullptr if vectorization fails.
-/// TODO(ntv): consider adding a trait to Op to describe how it gets vectorized.
-/// Maybe some Ops are not vectorizable or require some tricky logic, we cannot
-/// do one-off logic here; ideally it would be TableGen'd.
+// TODO(ntv): consider adding a trait to Op to describe how it gets vectorized.
+// Maybe some Ops are not vectorizable or require some tricky logic, we cannot
+// do one-off logic here; ideally it would be TableGen'd.
 static Operation *vectorizeOneOperation(Operation *opInst,
                                         VectorizationState *state) {
   // Sanity checks.

@@ -1074,6 +1037,8 @@ static Operation *vectorizeOneOperation(Operation *opInst,
     FuncBuilder b(opInst);
     auto permutationMap =
         makePermutationMap(opInst, state->strategy->loopToVectorDim);
+    if (!permutationMap)
+      return nullptr;
     LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ permutationMap: ");
     LLVM_DEBUG(permutationMap.print(dbgs()));
     auto transfer = b.create<VectorTransferWriteOp>(

@@ -1249,10 +1214,18 @@ static LogicalResult vectorizeRootMatch(NestedMatch m,
 /// Applies vectorization to the current Function by searching over a bunch of
 /// predetermined patterns.
 void Vectorize::runOnFunction() {
+  Function &f = getFunction();
+  if (!fastestVaryingPattern.empty() &&
+      fastestVaryingPattern.size() != vectorSizes.size()) {
+    f.emitNote("Fastest varying pattern specified with different size than the "
+               "vector size.");
+    this->signalPassFailure();
+    return;
+  }
+
   // Thread-safe RAII local context, BumpPtrAllocator freed on exit.
   NestedPatternContext mlContext;
 
-  Function &f = getFunction();
   llvm::DenseSet<Operation *> parallelLoops;
   f.walkPostOrder([&parallelLoops](Operation *op) {
     if (auto loop = op->dyn_cast<AffineForOp>()) {

@@ -1262,7 +1235,8 @@ void Vectorize::runOnFunction() {
     }
   });
 
-  for (auto &pat : makePatterns(parallelLoops)) {
+  for (auto &pat :
+       makePatterns(parallelLoops, vectorSizes.size(), fastestVaryingPattern)) {
     LLVM_DEBUG(dbgs() << "\n******************************************");
     LLVM_DEBUG(dbgs() << "\n******************************************");
     LLVM_DEBUG(dbgs() << "\n[early-vect] new pattern on Function\n");

@@ -1,7 +1,15 @@
+// RUN: mlir-opt %s -vectorize -virtual-vector-size 4 -virtual-vector-size 8 | FileCheck %s -check-prefix=VECT
 // RUN: mlir-opt %s -vectorize -virtual-vector-size 32 -virtual-vector-size 256 --test-fastest-varying=1 --test-fastest-varying=0 | FileCheck %s
 
 // Permutation maps used in vectorization.
-// CHECK: #[[map_proj_d0d1_d0d1:map[0-9]+]] = (d0, d1) -> (d0, d1)
+// CHECK-DAG: #[[map_id1:map[0-9]+]] = (d0) -> (d0)
+// CHECK-DAG: #[[map_id2:map[0-9]+]] = (d0, d1) -> (d0, d1)
+// CHECK-DAG: #[[map_proj_d0d1_zerod1:map[0-9]+]] = (d0, d1) -> (0, d1)
+// CHECK-DAG: #[[map_proj_d0d1_d0zero:map[0-9]+]] = (d0, d1) -> (d0, 0)
+// VECT-DAG: #[[map_id1:map[0-9]+]] = (d0) -> (d0)
+// VECT-DAG: #[[map_id2:map[0-9]+]] = (d0, d1) -> (d0, d1)
+// VECT-DAG: #[[map_proj_d0d1_zerod1:map[0-9]+]] = (d0, d1) -> (0, d1)
+// VECT-DAG: #[[map_proj_d0d1_d0zero:map[0-9]+]] = (d0, d1) -> (d0, 0)
 
 func @vec2d(%A : memref<?x?x?xf32>) {
   %M = dim %A, 0 : memref<?x?x?xf32>

@@ -46,7 +54,7 @@ func @vector_add_2d(%M : index, %N : index) -> f32 {
   affine.for %i0 = 0 to %M {
     affine.for %i1 = 0 to %N {
       // CHECK: [[C1:%.*]] = constant splat<vector<32x256xf32>, 1.000000e+00> : vector<32x256xf32>
-      // CHECK: vector_transfer_write [[C1]], {{.*}} {permutation_map: #[[map_proj_d0d1_d0d1]]} : vector<32x256xf32>, memref<?x?xf32>, index, index
+      // CHECK: vector_transfer_write [[C1]], {{.*}} {permutation_map: #[[map_id2]]} : vector<32x256xf32>, memref<?x?xf32>, index, index
       // non-scoped %f1
       store %f1, %A[%i0, %i1] : memref<?x?xf32, 0>
     }

@@ -54,22 +62,22 @@ func @vector_add_2d(%M : index, %N : index) -> f32 {
   affine.for %i2 = 0 to %M {
     affine.for %i3 = 0 to %N {
       // CHECK: [[C3:%.*]] = constant splat<vector<32x256xf32>, 2.000000e+00> : vector<32x256xf32>
-      // CHECK: vector_transfer_write [[C3]], {{.*}} {permutation_map: #[[map_proj_d0d1_d0d1]]} : vector<32x256xf32>, memref<?x?xf32>, index, index
+      // CHECK: vector_transfer_write [[C3]], {{.*}} {permutation_map: #[[map_id2]]} : vector<32x256xf32>, memref<?x?xf32>, index, index
       // non-scoped %f2
      store %f2, %B[%i2, %i3] : memref<?x?xf32, 0>
    }
  }
  affine.for %i4 = 0 to %M {
    affine.for %i5 = 0 to %N {
-      // CHECK: [[A5:%.*]] = vector_transfer_read %0, {{.*}} {permutation_map: #[[map_proj_d0d1_d0d1]]} : (memref<?x?xf32>, index, index) -> vector<32x256xf32>
-      // CHECK: [[B5:%.*]] = vector_transfer_read %1, {{.*}} {permutation_map: #[[map_proj_d0d1_d0d1]]} : (memref<?x?xf32>, index, index) -> vector<32x256xf32>
+      // CHECK: [[A5:%.*]] = vector_transfer_read %0, {{.*}} {permutation_map: #[[map_id2]]} : (memref<?x?xf32>, index, index) -> vector<32x256xf32>
+      // CHECK: [[B5:%.*]] = vector_transfer_read %1, {{.*}} {permutation_map: #[[map_id2]]} : (memref<?x?xf32>, index, index) -> vector<32x256xf32>
       // CHECK: [[S5:%.*]] = addf [[A5]], [[B5]] : vector<32x256xf32>
       // CHECK: [[SPLAT1:%.*]] = constant splat<vector<32x256xf32>, 1.000000e+00> : vector<32x256xf32>
       // CHECK: [[S6:%.*]] = addf [[S5]], [[SPLAT1]] : vector<32x256xf32>
       // CHECK: [[SPLAT2:%.*]] = constant splat<vector<32x256xf32>, 2.000000e+00> : vector<32x256xf32>
       // CHECK: [[S7:%.*]] = addf [[S5]], [[SPLAT2]] : vector<32x256xf32>
       // CHECK: [[S8:%.*]] = addf [[S7]], [[S6]] : vector<32x256xf32>
-      // CHECK: vector_transfer_write [[S8]], {{.*}} {permutation_map: #[[map_proj_d0d1_d0d1]]} : vector<32x256xf32>, memref<?x?xf32>, index, index
+      // CHECK: vector_transfer_write [[S8]], {{.*}} {permutation_map: #[[map_id2]]} : vector<32x256xf32>, memref<?x?xf32>, index, index
       //
       %a5 = load %A[%i4, %i5] : memref<?x?xf32, 0>
       %b5 = load %B[%i4, %i5] : memref<?x?xf32, 0>

@@ -89,3 +97,46 @@ func @vector_add_2d(%M : index, %N : index) -> f32 {
   return %res : f32
 }
 
+// VECT-LABEL: func @vectorize_matmul
+func @vectorize_matmul(%arg0: memref<?x?xf32>, %arg1: memref<?x?xf32>, %arg2: memref<?x?xf32>) {
+  %c0 = constant 0 : index
+  %M = dim %arg0, 0 : memref<?x?xf32>
+  %K = dim %arg0, 1 : memref<?x?xf32>
+  %N = dim %arg2, 1 : memref<?x?xf32>
+  // VECT: %[[C0:.*]] = constant 0 : index
+  // VECT-NEXT: %[[M:.*]] = dim %arg0, 0 : memref<?x?xf32>
+  // VECT-NEXT: %[[K:.*]] = dim %arg0, 1 : memref<?x?xf32>
+  // VECT-NEXT: %[[N:.*]] = dim %arg2, 1 : memref<?x?xf32>
+  // VECT: {{.*}} #[[map_id1]](%[[M]]) step 4 {
+  // VECT-NEXT: {{.*}} #[[map_id1]](%[[N]]) step 8 {
+  // VECT: %[[VC0:.*]] = constant splat<vector<4x8xf32>, 0.000000e+00> : vector<4x8xf32>
+  // VECT-NEXT: vector_transfer_write %[[VC0]], %arg2, %{{.*}}, %{{.*}} {permutation_map: #[[map_id2]]}
+  affine.for %i0 = (d0) -> (d0)(%c0) to (d0) -> (d0)(%M) {
+    affine.for %i1 = (d0) -> (d0)(%c0) to (d0) -> (d0)(%N) {
+      %cst = constant 0.000000e+00 : f32
+      store %cst, %arg2[%i0, %i1] : memref<?x?xf32>
+    }
+  }
+  // VECT: affine.for %[[I2:.*]] = #[[map_id1]](%[[C0]]) to #[[map_id1]](%[[M]]) step 4 {
+  // VECT-NEXT: affine.for %[[I3:.*]] = #[[map_id1]](%[[C0]]) to #[[map_id1]](%[[N]]) step 8 {
+  // VECT-NEXT: affine.for %[[I4:.*]] = #map5(%[[C0]]) to #[[map_id1]](%[[K]]) {
+  // VECT-NEXT: %[[A:.*]] = vector_transfer_read %arg1, %[[I4]], %[[I3]] {permutation_map: #[[map_proj_d0d1_zerod1]]}
+  // VECT-NEXT: %[[B:.*]] = vector_transfer_read %arg0, %[[I2]], %[[I4]] {permutation_map: #[[map_proj_d0d1_d0zero]]}
+  // VECT-NEXT: %[[C:.*]] = mulf %[[B]], %[[A]] : vector<4x8xf32>
+  // VECT-NEXT: %[[D:.*]] = vector_transfer_read %arg2, %[[I2]], %[[I3]] {permutation_map: #[[map_id2]]}
+  // VECT-NEXT: %[[E:.*]] = addf %[[D]], %[[C]] : vector<4x8xf32>
+  // VECT-NEXT: vector_transfer_write %[[E]], %arg2, %[[I2]], %[[I3]] {permutation_map: #[[map_id2]]} : vector<4x8xf32>, memref<?x?xf32>, index, index
+  affine.for %i2 = (d0) -> (d0)(%c0) to (d0) -> (d0)(%M) {
+    affine.for %i3 = (d0) -> (d0)(%c0) to (d0) -> (d0)(%N) {
+      affine.for %i4 = (d0) -> (d0)(%c0) to (d0) -> (d0)(%K) {
+        %6 = load %arg1[%i4, %i3] : memref<?x?xf32>
+        %7 = load %arg0[%i2, %i4] : memref<?x?xf32>
+        %8 = mulf %7, %6 : f32
+        %9 = load %arg2[%i2, %i3] : memref<?x?xf32>
+        %10 = addf %9, %8 : f32
+        store %10, %arg2[%i2, %i3] : memref<?x?xf32>
+      }
+    }
+  }
+  return
+}