[MLIR][MaterializeVectors] Add a MaterializeVector pass via unrolling.
This CL adds an MLIR-to-MLIR pass which materializes super-vectors to
hardware-dependent sized vectors. While the physical vector size is
target-dependent, the pass is written in a target-independent way: the target
vector size is specified as a parameter to the pass. This pass is thus a
partial lowering that opens the "greybox" that is the super-vector
abstraction.

This first CL adds a materialization pass that iterates over
vector_transfer_write operations and:
  1. computes the program slice including the current vector_transfer_write;
  2. computes the multi-dimensional ratio of super-vector shape to hardware
     vector shape;
  3. for each possible multi-dimensional value within the bounds of ratio, a
     new slice is instantiated (i.e. cloned and rewritten) so that all
     operations in this instance operate on the hardware vector type.

As a simple example, given:
```mlir
mlfunc @vector_add_2d(%M : index, %N : index) -> memref<?x?xf32> {
  %A = alloc (%M, %N) : memref<?x?xf32>
  %B = alloc (%M, %N) : memref<?x?xf32>
  %C = alloc (%M, %N) : memref<?x?xf32>
  for %i0 = 0 to %M {
    for %i1 = 0 to %N {
      %a1 = load %A[%i0, %i1] : memref<?x?xf32>
      %b1 = load %B[%i0, %i1] : memref<?x?xf32>
      %s1 = addf %a1, %b1 : f32
      store %s1, %C[%i0, %i1] : memref<?x?xf32>
    }
  }
  return %C : memref<?x?xf32>
}
```
and the following options:
```
-vectorize -virtual-vector-size 32 --test-fastest-varying=0 -materialize-vectors -vector-size=8
```
materialization emits:
```mlir
#map0 = (d0, d1) -> (d0, d1)
#map1 = (d0, d1) -> (d0, d1 + 8)
#map2 = (d0, d1) -> (d0, d1 + 16)
#map3 = (d0, d1) -> (d0, d1 + 24)
mlfunc @vector_add_2d(%arg0 : index, %arg1 : index) -> memref<?x?xf32> {
  %0 = alloc(%arg0, %arg1) : memref<?x?xf32>
  %1 = alloc(%arg0, %arg1) : memref<?x?xf32>
  %2 = alloc(%arg0, %arg1) : memref<?x?xf32>
  for %i0 = 0 to %arg0 {
    for %i1 = 0 to %arg1 step 32 {
      %3 = affine_apply #map0(%i0, %i1)
      %4 = "vector_transfer_read"(%0, %3#0, %3#1) : (memref<?x?xf32>, index, index) -> vector<8xf32>
      %5 = affine_apply #map1(%i0, %i1)
      %6 = "vector_transfer_read"(%0, %5#0, %5#1) : (memref<?x?xf32>, index, index) -> vector<8xf32>
      %7 = affine_apply #map2(%i0, %i1)
      %8 = "vector_transfer_read"(%0, %7#0, %7#1) : (memref<?x?xf32>, index, index) -> vector<8xf32>
      %9 = affine_apply #map3(%i0, %i1)
      %10 = "vector_transfer_read"(%0, %9#0, %9#1) : (memref<?x?xf32>, index, index) -> vector<8xf32>
      %11 = affine_apply #map0(%i0, %i1)
      %12 = "vector_transfer_read"(%1, %11#0, %11#1) : (memref<?x?xf32>, index, index) -> vector<8xf32>
      %13 = affine_apply #map1(%i0, %i1)
      %14 = "vector_transfer_read"(%1, %13#0, %13#1) : (memref<?x?xf32>, index, index) -> vector<8xf32>
      %15 = affine_apply #map2(%i0, %i1)
      %16 = "vector_transfer_read"(%1, %15#0, %15#1) : (memref<?x?xf32>, index, index) -> vector<8xf32>
      %17 = affine_apply #map3(%i0, %i1)
      %18 = "vector_transfer_read"(%1, %17#0, %17#1) : (memref<?x?xf32>, index, index) -> vector<8xf32>
      %19 = addf %4, %12 : vector<8xf32>
      %20 = addf %6, %14 : vector<8xf32>
      %21 = addf %8, %16 : vector<8xf32>
      %22 = addf %10, %18 : vector<8xf32>
      %23 = affine_apply #map0(%i0, %i1)
      "vector_transfer_write"(%19, %2, %23#0, %23#1) : (vector<8xf32>, memref<?x?xf32>, index, index) -> ()
      %24 = affine_apply #map1(%i0, %i1)
      "vector_transfer_write"(%20, %2, %24#0, %24#1) : (vector<8xf32>, memref<?x?xf32>, index, index) -> ()
      %25 = affine_apply #map2(%i0, %i1)
      "vector_transfer_write"(%21, %2, %25#0, %25#1) : (vector<8xf32>, memref<?x?xf32>, index, index) -> ()
      %26 = affine_apply #map3(%i0, %i1)
      "vector_transfer_write"(%22, %2, %26#0, %26#1) : (vector<8xf32>, memref<?x?xf32>, index, index) -> ()
    }
  }
  return %2 : memref<?x?xf32>
}
```

PiperOrigin-RevId: 222455351
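For readers who want to see the index arithmetic of steps 2 and 3 in isolation, here is a minimal standalone C++ sketch (illustrative only, not part of this commit; all names are hypothetical) showing how the four offsets 0, 8, 16 and 24 in `#map0`..`#map3` above arise from the ratio of the super-vector `vector<32xf32>` to the hardware vector `vector<8xf32>`:
```cpp
// Standalone sketch: mimics how one offset per hardware-vector instance is
// derived from the super-vector/hw-vector shape ratio.
#include <cassert>
#include <cstdio>
#include <vector>

// strides[i] = number of elements separating two slices along dimension i,
// e.g. shape {3, 4, 5} -> strides {20, 5, 1}.
static std::vector<unsigned> makeStrides(const std::vector<unsigned> &shape) {
  std::vector<unsigned> strides(shape.size());
  unsigned running = 1;
  for (int i = static_cast<int>(shape.size()) - 1; i >= 0; --i) {
    strides[i] = running;
    running *= shape[i];
  }
  return strides;
}

// Delinearizes `linearIndex` with respect to `shape`.
static std::vector<unsigned> delinearize(unsigned linearIndex,
                                         const std::vector<unsigned> &shape) {
  auto strides = makeStrides(shape);
  std::vector<unsigned> multiIndex(shape.size());
  for (size_t i = 0; i < shape.size(); ++i) {
    multiIndex[i] = linearIndex / strides[i];
    linearIndex %= strides[i];
  }
  assert(linearIndex == 0 && "remainder must be zero");
  return multiIndex;
}

int main() {
  // Super-vector vector<32xf32>, hardware vector vector<8xf32>.
  std::vector<unsigned> ratio = {32 / 8}; // {4}: four instances to emit.
  std::vector<unsigned> hwShape = {8};
  unsigned numInstances = 1;
  for (unsigned r : ratio)
    numInstances *= r;
  for (unsigned idx = 0; idx < numInstances; ++idx) {
    auto instance = delinearize(idx, ratio);
    // Per-instance index offset: instance * hardware vector size, i.e. the
    // "+ 0, + 8, + 16, + 24" seen in #map0..#map3 above.
    std::printf("instance %u -> offset %u\n", idx, instance[0] * hwShape[0]);
  }
  return 0;
}
```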
This commit is contained in:
parent 258dae5d73
commit a5782f0d40
@@ -70,8 +70,8 @@ using TransitiveFilter = std::function<bool(Statement *)>;
 /// 9
 ///
 /// Assuming all local orders match the numbering order:
-/// 1. after getting back to the root getForwardSlice,
-/// `forwardSlice` may contain:
+/// 1. after getting back to the root getForwardSlice, `forwardSlice` may
+/// contain:
 ///    {9, 7, 8, 5, 1, 2, 6, 3, 4}
 /// 2. reversing the result of 1. gives:
 ///    {4, 3, 6, 2, 1, 5, 8, 7, 9}
@@ -44,6 +44,9 @@ FunctionPass *createVectorizePass();
 /// FileCheck.
 FunctionPass *createVectorizerTestPass();
 
+/// Creates a pass to lower super-vectors to target-dependent HW vectors.
+FunctionPass *createMaterializeVectors();
+
 /// Creates a loop unrolling pass. Default option or command-line options take
 /// effect if -1 is passed as parameter.
 FunctionPass *createLoopUnrollPass(int unrollFactor = -1, int unrollFull = -1);
@@ -59,7 +59,7 @@ void mlir::getForwardSlice(Statement *stmt,
       auto *ownerStmt = u.getOwner();
       if (forwardSlice->count(ownerStmt) == 0) {
         getForwardSlice(ownerStmt, forwardSlice, filter,
-                        /* topLevel */ false);
+                        /*topLevel=*/false);
       }
     }
   }
@@ -68,7 +68,7 @@ void mlir::getForwardSlice(Statement *stmt,
       auto *ownerStmt = u.getOwner();
       if (forwardSlice->count(ownerStmt) == 0) {
         getForwardSlice(ownerStmt, forwardSlice, filter,
-                        /* topLevel */ false);
+                        /*topLevel=*/false);
       }
     }
   } else {
@@ -105,7 +105,7 @@ void mlir::getBackwardSlice(Statement *stmt,
     auto *stmt = operand->getDefiningStmt();
     if (backwardSlice->count(stmt) == 0) {
       getBackwardSlice(stmt, backwardSlice, filter,
-                       /* topLevel */ false);
+                       /*topLevel=*/false);
     }
   }
 
@@ -0,0 +1,595 @@
//===- MaterializeVectors.cpp - MaterializeVectors Pass Impl ----*- C++ -*-===//
//
// Copyright 2019 The MLIR Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
//
// This file implements target-dependent materialization of super-vectors to
// vectors of the proper size for the hardware.
//
//===----------------------------------------------------------------------===//

#include "mlir/Analysis/LoopAnalysis.h"
#include "mlir/Analysis/MLFunctionMatcher.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Analysis/Utils.h"
#include "mlir/Analysis/VectorAnalysis.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/Location.h"
#include "mlir/IR/MLValue.h"
#include "mlir/IR/OperationSupport.h"
#include "mlir/IR/SSAValue.h"
#include "mlir/IR/Types.h"
#include "mlir/Pass.h"
#include "mlir/StandardOps/StandardOps.h"
#include "mlir/Support/Functional.h"
#include "mlir/Support/LLVM.h"
#include "mlir/Transforms/Passes.h"

#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

///
/// Implements target-dependent materialization of virtual super-vectors to
/// vectors of the proper size for the hardware.
///
/// While the physical vector size is target-dependent, the pass is written in
/// a target-independent way: the target vector size is specified as a
/// parameter to the pass. This pass is thus a partial lowering that opens the
/// "greybox" that is the super-vector abstraction. In particular, this pass
/// can turn the vector_transfer_read and vector_transfer_write ops into
/// either:
///   1. a loop nest with either scalar or vector load/store instructions; or
///   2. a loop nest with DmaStartOp / DmaWaitOp; or
///   3. a pre-existing blackbox library call that can be written manually or
///      synthesized using search and superoptimization.
/// An important feature that each of these 3 target lowering abstractions must
/// handle is "non-effecting" padding with the proper neutral element, in order
/// to guarantee that all "partial tiles" are actually "full tiles" in
/// practice.
///
/// In particular, this pass is an MLIR-to-MLIR rewriting and does not concern
/// itself with target-specific instruction selection and register allocation.
/// These will happen downstream in LLVM.
///
/// In this sense, despite performing lowering to a target-dependent size, this
/// pass is still target-agnostic.
///
/// Implementation details
/// ======================
/// The current decisions made by the super-vectorization pass guarantee that
/// use-def chains do not escape an enclosing vectorized ForStmt. In other
/// words, this pass operates on a scoped program slice. Furthermore, since we
/// do not vectorize in the presence of conditionals for now, sliced chains are
/// guaranteed not to escape the innermost scope, which has to be either the
/// top MLFunction scope or the innermost loop scope, by construction. As a
/// consequence, the implementation just starts from vector_transfer_write
/// operations and builds the slice scoped to the innermost loop enclosing the
/// current vector_transfer_write. These assumptions and the implementation
/// details are subject to revision in the future.

using llvm::dbgs;
using llvm::DenseSet;
using llvm::SetVector;

using namespace mlir;

using functional::map;

static llvm::cl::list<int>
    clVectorSize("vector-size",
                 llvm::cl::desc("Specify the HW vector size for vectorization"),
                 llvm::cl::ZeroOrMore);

#define DEBUG_TYPE "materialize-vect"

namespace {
struct MaterializationState {
  /// In practice, the determination of the HW-specific vector type to use when
  /// lowering a super-vector type must be based on the elemental type. The
  /// elemental type must be retrieved from the super-vector type. In the
  /// future, information about the hardware vector type for a particular
  /// elemental type will be part of the contract between MLIR and the backend.
  ///
  /// For example, 8xf32 has the same size as 16xf16 but the targeted HW itself
  /// may exhibit the following properties:
  ///   1. have a special unit for a 128xf16 datapath;
  ///   2. no f16 FPU support on the regular 8xf32/16xf16 vector datapath.
  ///
  /// For now, we just assume hwVectorSize has the proper information
  /// regardless of the type and we assert everything is f32.
  /// TODO(ntv): relax the assumptions on admissible element types once a
  /// contract exists.
  MaterializationState() : hwVectorSize(clVectorSize.size(), 0) {
    std::copy(clVectorSize.begin(), clVectorSize.end(), hwVectorSize.begin());
  }
  SmallVector<int, 8> hwVectorSize;
  VectorType superVectorType;
  VectorType hwVectorType;
  SmallVector<unsigned, 8> hwVectorInstance;
  DenseMap<const MLValue *, MLValue *> *substitutionsMap;
};

struct MaterializeVectors : public FunctionPass {
  MaterializeVectors() : FunctionPass(&MaterializeVectors::passID) {}

  PassResult runOnMLFunction(MLFunction *f) override;

  // Thread-safe RAII contexts local to pass, BumpPtrAllocator freed on exit.
  MLFunctionMatcherContext mlContext;

  static char passID;
};

} // end anonymous namespace

char MaterializeVectors::passID = 0;

// Returns the distance, in number of elements, between a slice in a dimension
// and the next slice in the same dimension.
//   e.g. shape[3, 4, 5] -> strides[20, 5, 1]
static SmallVector<unsigned, 8> makeStrides(ArrayRef<unsigned> shape) {
  SmallVector<unsigned, 8> tmp;
  tmp.reserve(shape.size());
  unsigned running = 1;
  for (auto rit = shape.rbegin(), reit = shape.rend(); rit != reit; ++rit) {
    assert(*rit > 0 && "NYI: symbolic or null shape dimension");
    tmp.push_back(running);
    running *= *rit;
  }
  return SmallVector<unsigned, 8>(tmp.rbegin(), tmp.rend());
}

// Given a linear index in [0, product of shape), returns the multi-dimensional
// index obtained by delinearizing with respect to `shape`.
static SmallVector<unsigned, 8> delinearize(unsigned linearIndex,
                                            ArrayRef<unsigned> shape) {
  SmallVector<unsigned, 8> res;
  res.reserve(shape.size());
  auto strides = makeStrides(shape);
  for (unsigned idx = 0; idx < strides.size(); ++idx) {
    assert(strides[idx] > 0);
    auto val = linearIndex / strides[idx];
    res.push_back(val);
    assert((val >= 0 && val < shape[idx]) &&
           "delinearization is out of bounds");
    linearIndex %= strides[idx];
  }
  // Sanity check.
  assert(linearIndex == 0 && "linear index constructed from shape must "
                             "have 0 remainder after delinearization");
  return res;
}

// Since this is used during a traversal of a topologically sorted set, we can
// just return the original SSAValue when no substitution is registered: by
// topological order, any value that requires a substitution has already
// received one.
static MLValue *
substitute(SSAValue *v,
           const DenseMap<const MLValue *, MLValue *> &substitutionsMap) {
  auto it = substitutionsMap.find(cast<MLValue>(v));
  if (it == substitutionsMap.end()) {
    return cast<MLValue>(v);
  }
  return it->second;
}

/// Returns the values obtained by applying an AffineMap that reindexes the
/// memRefIndices by the multi-dimensional hwVectorInstance.
/// This is used by the function that materializes a vector_transfer operation
/// to use hardware vector types instead of super-vector types.
///
/// The general problem this pass solves is as follows:
/// Assume a vector_transfer operation at the super-vector granularity that has
/// `l` enclosing loops (ForStmt). Assume the vector transfer operation operates
/// on a MemRef of rank `r`, a super-vector of rank `s` and a hardware vector of
/// rank `h`.
/// For the purpose of illustration assume l==4, r==3, s==2, h==1 and that the
/// super-vector is vector<3x32xf32> and the hardware vector is vector<8xf32>.
/// Assume the following MLIR snippet after super-vectorization has been
/// applied:
/// ```mlir
/// for %i0 = 0 to %M {
///   for %i1 = 0 to %N step 3 {
///     for %i2 = 0 to %O {
///       for %i3 = 0 to %P step 32 {
///         %r = vector_transfer_read(%A, map(%i..)#0, map(%i..)#1, map(%i..)#2)
///              -> vector<3x32xf32>
///         ...
/// }}}}
/// ```
/// where map denotes an AffineMap operating on enclosing loops with properties
/// compatible for vectorization (i.e. some contiguity left unspecified here).
/// Note that the vectorized loops are %i1 and %i3.
/// This function translates the vector_transfer_read operation to multiple
/// instances of vector_transfer_read that operate on vector<8xf32>.
///
/// Without loss of generality, we assume hwVectorInstance is: {2, 1}.
/// The only constraint on hwVectorInstance is that it belongs to:
///   [0, 2] x [0, 3], which is the span of the ratio of super-vector shape to
/// hardware vector shape in our example.
///
/// This function instantiates the iteration <2, 1> of vector_transfer_read
/// into the set of operations in pseudo-MLIR:
/// ```mlir
/// map2 = (d0, d1, d2, d3) -> (d0, d1 + 2, d2, d3 + 1 * 8)
/// map3 = map o map2 // where o denotes composition
/// %r = vector_transfer_read(%A, map3(%i..)#0, map3(%i..)#1, map3(%i..)#2)
///      -> vector<8xf32>
/// ```
///
/// Practical considerations
/// ========================
/// For now, `map` is assumed to be the identity map and the indices are
/// specified just as vector_transfer_read(%A, %i0, %i1, %i2, %i3). This will
/// be extended in the future once we have a proper Op for vector transfers.
/// Additionally, the example above is specified in pseudo-MLIR form; once we
/// have proper support for generic maps we can generate the code and show
/// actual MLIR.
///
/// TODO(ntv): support a concrete AffineMap and compose with it.
/// TODO(ntv): these implementation details should be captured in a
/// vectorization trait at the op level directly.
static SmallVector<MLValue *, 8>
reindexAffineIndices(MLFuncBuilder *b, Type hwVectorType,
                     ArrayRef<unsigned> hwVectorInstance,
                     ArrayRef<SSAValue *> memrefIndices) {
  auto vectorShape = hwVectorType.cast<VectorType>().getShape();
  assert(hwVectorInstance.size() >= vectorShape.size());

  unsigned numIndices = memrefIndices.size();
  auto numMemRefIndices = numIndices - hwVectorInstance.size();
  auto numSuperVectorIndices = hwVectorInstance.size() - vectorShape.size();

  SmallVector<AffineExpr, 8> affineExprs;
  // TODO(ntv): support a concrete map and composition.
  unsigned i = 0;
  // The first numMemRefIndices correspond to ForStmts that have not been
  // vectorized; the transformation is the identity on those.
  for (i = 0; i < numMemRefIndices; ++i) {
    auto d_i = b->getAffineDimExpr(i);
    affineExprs.push_back(d_i);
  }
  // The next numSuperVectorIndices correspond to super-vector dimensions that
  // do not have a hardware vector dimension counterpart. For those we only
  // need to increment the index by the corresponding hwVectorInstance.
  for (i = numMemRefIndices; i < numMemRefIndices + numSuperVectorIndices;
       ++i) {
    auto d_i = b->getAffineDimExpr(i);
    auto offset = hwVectorInstance[i - numMemRefIndices];
    affineExprs.push_back(d_i + offset);
  }
  // The remaining indices correspond to super-vector dimensions that have a
  // hardware vector dimension counterpart. For those we need to increment the
  // index by "hwVectorInstance" multiples of the corresponding hardware
  // vector size.
  for (; i < numIndices; ++i) {
    auto d_i = b->getAffineDimExpr(i);
    auto offset = hwVectorInstance[i - numMemRefIndices];
    auto stride = vectorShape[i - numMemRefIndices - numSuperVectorIndices];
    affineExprs.push_back(d_i + offset * stride);
  }
  auto affineMap = AffineMap::get(numIndices, 0, affineExprs, {});

  // TODO(ntv): support a concrete map and composition.
  auto app = b->create<AffineApplyOp>(b->getInsertionPoint()->getLoc(),
                                      affineMap, memrefIndices);
  unsigned numResults = app->getNumResults();
  SmallVector<MLValue *, 8> res;
  for (unsigned i = 0; i < numResults; ++i) {
    res.push_back(cast<MLValue>(app->getResult(i)));
  }
  return res;
}

/// Returns the cloned operands of `opStmt` for the instance of
/// `hwVectorInstance` when lowering from a super-vector type to
/// `hwVectorType`. `hwVectorInstance` represents one particular instance of
/// `hwVectorType` in the covering of the super-vector type. For a more
/// detailed description of the problem, see the description of
/// reindexAffineIndices.
static SmallVector<MLValue *, 8>
cloneAndUnrollOperands(OperationStmt *opStmt, Type hwVectorType,
                       ArrayRef<unsigned> hwVectorInstance,
                       DenseMap<const MLValue *, MLValue *> *substitutionsMap) {
  using functional::map;

  // For Ops that are not vector_transfer_read/vector_transfer_write we can
  // just substitute and be done.
  if (!isaVectorTransferRead(*opStmt) && !isaVectorTransferWrite(*opStmt)) {
    return map([substitutionsMap](
                   SSAValue *v) { return substitute(v, *substitutionsMap); },
               opStmt->getOperands());
  }

  // TODO(ntv): this error-prone boilerplate can be removed once we have a
  // proper Op for vector_transfer.
  unsigned offset = 0;
  unsigned numIndices = 0;
  SmallVector<MLValue *, 8> res;
  auto operands = opStmt->getOperands();
  if (isaVectorTransferRead(*opStmt)) {
    offset = 1;
    numIndices = opStmt->getNumOperands() - 1;
  } else if (isaVectorTransferWrite(*opStmt)) {
    offset = 2;
    numIndices = opStmt->getNumOperands() - 2;
  }
  // Copy as-is the [optional valueToStore] and the memref.
  for (unsigned i = 0; i < offset; ++i) {
    res.push_back(substitute(*(operands.begin() + i), *substitutionsMap));
  }

  MLFuncBuilder b(opStmt);
  // TODO(ntv): indices extraction is brittle and unsafe before we have an Op.
  SmallVector<SSAValue *, 8> indices;
  for (auto it = operands.begin() + offset; it != operands.end(); ++it) {
    indices.push_back(*it);
  }
  auto affineValues =
      reindexAffineIndices(&b, hwVectorType, hwVectorInstance, indices);
  res.append(affineValues.begin(), affineValues.end());

  return res;
}

// Returns attributes with the following substitutions applied:
//   - splat of `superVectorType` is replaced by splat of `hwVectorType`.
// TODO(ntv): add more substitutions on a per-need basis.
static SmallVector<NamedAttribute, 2>
materializeAttributes(OperationStmt *opStmt, VectorType superVectorType,
                      VectorType hwVectorType) {
  SmallVector<NamedAttribute, 2> res;
  for (auto a : opStmt->getAttrs()) {
    auto splat = a.second.dyn_cast<SplatElementsAttr>();
    bool splatOfSuperVectorType = splat && (splat.getType() == superVectorType);
    if (splatOfSuperVectorType) {
      auto attr = SplatElementsAttr::get(hwVectorType.cast<VectorType>(),
                                         splat.getValue());
      res.push_back(NamedAttribute(a.first, attr));
    } else {
      res.push_back(a);
    }
  }
  return res;
}

/// Returns `false` if the stmt instance is properly cloned and inserted,
/// `true` on failure.
/// The multi-dimensional `hwVectorInstance` belongs to the shapeRatio of
/// super-vector type to hw vector type.
/// A cloned instance of `stmt` is formed as follows:
///   1. vector_transfer_read: the return `superVectorType` is replaced by
///      `hwVectorType`. Additionally, affine indices are reindexed with
///      `reindexAffineIndices` using `hwVectorInstance` and vector type
///      information;
///   2. vector_transfer_write: the `valueToStore` type is simply substituted.
///      Since we operate on a topologically sorted slice, a substitution must
///      have been registered for non-constant ops. Additionally, affine
///      indices are reindexed in the same way as for vector_transfer_read;
///   3. constant ops are splats of the super-vector type by construction.
///      They are cloned to a splat on the hw vector type with the same value;
///   4. remaining ops are cloned to a version of the op that returns a hw
///      vector type, all operands are substituted according to
///      `substitutions`. Thanks to the topological order of a slice, the
///      substitution is always possible.
static bool cloneAndInsertHardwareVectorInstance(Statement *stmt,
                                                 MaterializationState *state) {
  LLVM_DEBUG(dbgs() << "\nclone" << *stmt);
  if (auto *opStmt = dyn_cast<OperationStmt>(stmt)) {
    // TODO(ntv): Is it worth considering an OperationStmt.clone operation
    // which changes the type so we can promote an OperationStmt with less
    // boilerplate?
    assert(opStmt->getNumResults() <= 1 && "NYI: opStmt has > 1 results");
    auto operands = cloneAndUnrollOperands(opStmt, state->hwVectorType,
                                           state->hwVectorInstance,
                                           state->substitutionsMap);
    MLFuncBuilder b(stmt);
    if (opStmt->getNumResults() == 0) {
      // vector_transfer_write
      b.createOperation(stmt->getLoc(), opStmt->getName(), operands, {},
                        materializeAttributes(opStmt, state->superVectorType,
                                              state->hwVectorType));
    } else {
      // vector_transfer_read
      auto *cloned = b.createOperation(
          stmt->getLoc(), opStmt->getName(), operands, {state->hwVectorType},
          materializeAttributes(opStmt, state->superVectorType,
                                state->hwVectorType));
      state->substitutionsMap->insert(std::make_pair(
          cast<MLValue>(opStmt->getResult(0)),
          cast<MLValue>(cast<OperationStmt>(cloned)->getResult(0))));
    }
    return false;
  }

  if (isa<ForStmt>(stmt)) {
    // Fail hard and wake up when needed.
    stmt->emitError("NYI path ForStmt");
    return true;
  }

  // Fail hard and wake up when needed.
  stmt->emitError("NYI path IfStmt");
  return true;
}

/// Takes a slice and rewrites the operations in it so that occurrences
/// of `superVectorType` are replaced by `hwVectorType`.
///
/// Implementation
/// ==============
///   1. computes the shape ratio of super-vector to HW vector shapes. This
///      gives, for each op in the slice, how many instantiations are required
///      in each dimension;
///   2. performs the concrete materialization. Note that in a first
///      implementation we use full unrolling because it pragmatically removes
///      the need to explicitly materialize an AllocOp. Thanks to the
///      properties of super-vectors, this unrolling is always possible and
///      simple: vectorizing to a super-vector abstraction already achieved the
///      equivalent of loop strip-mining + loop sinking and encoded this in the
///      vector type.
///
/// TODO(ntv): materialized allocs.
/// TODO(ntv): full loops + materialized allocs.
/// TODO(ntv): partial unrolling + materialized allocs.
static void emitSlice(MaterializationState *state,
                      SetVector<Statement *> *slice) {
  auto ratio = shapeRatio(state->superVectorType, state->hwVectorType);
  assert(ratio.hasValue() &&
         "ratio of super-vector to HW-vector shape is not integral");
  // The number of integer points in a hyperrectangular region is:
  //   shape[0] * strides[0].
  auto numValueToUnroll = (*ratio)[0] * makeStrides(*ratio)[0];
  // Full unrolling to hardware vectors in a first approximation.
  for (unsigned idx = 0; idx < numValueToUnroll; ++idx) {
    // Fresh RAII instanceIndices and substitutionsMap.
    MaterializationState scopedState = *state;
    scopedState.hwVectorInstance = delinearize(idx, *ratio);
    DenseMap<const MLValue *, MLValue *> substitutionMap;
    scopedState.substitutionsMap = &substitutionMap;
    // The slice is topologically sorted, so we can just clone the statements
    // in order.
    for (auto *stmt : *slice) {
      auto fail = cloneAndInsertHardwareVectorInstance(stmt, &scopedState);
      assert(!fail && "Unhandled super-vector materialization failure");
    }
  }
  // The slice is topologically sorted, so we can just erase the statements in
  // reverse order. A reverse iterator does not work simply with an operator*
  // dereference here.
  for (int idx = slice->size() - 1; idx >= 0; --idx) {
    (*slice)[idx]->erase();
  }
}

/// Materializes super-vector types into concrete hw vector types as follows:
///   1. start from super-vector terminators (current vector_transfer_write
///      ops);
///   2. collect all the statements that can be reached by transitive use-def
///      chains;
///   3. get the superVectorType for this particular terminator and the
///      corresponding hardware vector type (for now limited to F32);
///      TODO(ntv): be more general than F32.
///   4. emit the transitive useDef set to operate on the finer-grain vector
///      types.
///
/// Notes
/// =====
/// The `slice` is sorted in topological order by construction.
/// Additionally, this set is limited to statements in the same lexical scope
/// because we currently disallow vectorization of defs that come from another
/// scope.
static void materialize(MLFunction *f,
                        const SetVector<OperationStmt *> &terminators,
                        MaterializationState *state) {
  DenseSet<Statement *> seen;
  for (auto terminator : terminators) {
    LLVM_DEBUG(dbgs() << "\nFrom terminator:" << *terminator);

    // Short-circuit test, a given terminator may have been reached by some
    // other previous transitive use-def chains.
    if (seen.count(terminator) > 0) {
      continue;
    }

    // Terminators are vector_transfer_write with 0 results by construction atm.
    assert(isaVectorTransferWrite(*terminator) && "");
    assert(terminator->getNumResults() == 0 &&
           "NYI: terminators must have 0 results");

    // Get the transitive use-defs starting from terminator, limited to the
    // current enclosing scope of the terminator. See the Implementation
    // details note at the top of the file for the justification of this
    // restriction.
    // TODO(ntv): relax scoping constraints.
    auto *enclosingScope = terminator->getParentStmt();
    auto keepIfInSameScope = [enclosingScope](Statement *stmt) {
      assert(stmt && "NULL stmt");
      if (!enclosingScope) {
        // By construction, everyone is always under the top scope (null scope).
        return true;
      }
      return properlyDominates(*enclosingScope, *stmt);
    };
    SetVector<Statement *> slice =
        getSlice(terminator, keepIfInSameScope, keepIfInSameScope);
    assert(!slice.empty());

    // Sanity checks: transitive slice must be completely disjoint from
    // what we have seen so far.
    LLVM_DEBUG(dbgs() << "\nTransitive use-defs:");
    for (auto *ud : slice) {
      LLVM_DEBUG(dbgs() << "\nud:" << *ud);
      assert(seen.count(ud) == 0 &&
             "Transitive use-defs not disjoint from already seen");
      seen.insert(ud);
    }

    // Emit the current slice.
    // Set scoped super-vector and corresponding hw vector types.
    state->superVectorType =
        terminator->getOperand(0)->getType().cast<VectorType>();
    assert((state->superVectorType.getElementType() ==
            Type::getF32(terminator->getContext())) &&
           "Only f32 supported for now");
    state->hwVectorType = VectorType::get(
        state->hwVectorSize, state->superVectorType.getElementType());
    emitSlice(state, &slice);
    LLVM_DEBUG(dbgs() << "\nMLFunction is now\n");
    LLVM_DEBUG(f->print(dbgs()));
  }
}

PassResult MaterializeVectors::runOnMLFunction(MLFunction *f) {
  using matcher::Op;
  LLVM_DEBUG(dbgs() << "\nMaterializeVectors on MLFunction\n");
  LLVM_DEBUG(f->print(dbgs()));

  MaterializationState state;
  // Get the hardware vector type.
  // TODO(ntv): get elemental type from super-vector type rather than force f32.
  auto subVectorType =
      VectorType::get(state.hwVectorSize, Type::getF32(f->getContext()));

  // Capture terminators; i.e. vector_transfer_write ops involving a strict
  // super-vector of subVectorType.
  auto filter = [subVectorType](const Statement &stmt) {
    const auto &opStmt = cast<OperationStmt>(stmt);
    if (!isaVectorTransferWrite(opStmt)) {
      return false;
    }
    return matcher::operatesOnStrictSuperVectors(opStmt, subVectorType);
  };
  auto pat = Op(filter);
  auto matches = pat.match(f);
  SetVector<OperationStmt *> terminators;
  for (auto m : matches) {
    terminators.insert(cast<OperationStmt>(m.first));
  }

  // Call materialization.
  materialize(f, terminators, &state);
  return PassResult::Success;
}

FunctionPass *mlir::createMaterializeVectors() {
  return new MaterializeVectors();
}

static PassRegistration<MaterializeVectors>
    pass("materialize-vectors", "Materializes super-vectors to vectors of the "
                                "proper size for the hardware");

#undef DEBUG_TYPE
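Before the test file below, a quick standalone illustration (not part of this commit; a hedged sketch with illustrative names) of the instance offsets that the VEC2DTO1D checks expect. Lowering the super-vector vector<3x16xf32> to the hardware vector vector<8xf32> gives a shape ratio of {3, 2}, hence six instances whose affine shifts match MAP0..MAP5 in the test:
```cpp
// Sketch: enumerate the six (d0, d1) shifts for vector<3x16xf32> over an
// 8-wide hardware vector: (+0,+0), (+0,+8), (+1,+0), (+1,+8), (+2,+0), (+2,+8).
#include <cstdio>

int main() {
  const unsigned ratio[2] = {3, 2}; // 3x(16/8): instances per dimension.
  const unsigned hwSize = 8;        // hardware vector width.
  for (unsigned i = 0; i < ratio[0]; ++i)
    for (unsigned j = 0; j < ratio[1]; ++j)
      std::printf("(d0 + %u, d1 + %u)\n", i, j * hwSize);
  return 0;
}
```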
@@ -0,0 +1,87 @@
// RUN: mlir-opt %s -vectorize -virtual-vector-size 32 --test-fastest-varying=0 -materialize-vectors -vector-size=8 | FileCheck %s -check-prefix=VEC1DTO1D
// RUN: mlir-opt %s -vectorize -virtual-vector-size 3 -virtual-vector-size 16 --test-fastest-varying=1 --test-fastest-varying=0 -materialize-vectors -vector-size=8 | FileCheck %s -check-prefix=VEC2DTO1D
// RUN_: mlir-opt %s -vectorize -virtual-vector-size 3 -virtual-vector-size 32 --test-fastest-varying=1 --test-fastest-varying=0 -materialize-vectors -vector-size=3 -vector-size=16 | FileCheck %s -check-prefix=VEC2DTO2D

// vector<32xf32> -> vector<8xf32>
// VEC1DTO1D: [[MAP0:#.*]] = (d0, d1) -> (d0, d1)
// VEC1DTO1D: [[MAP1:#.*]] = (d0, d1) -> (d0, d1 + 8)
// VEC1DTO1D: [[MAP2:#.*]] = (d0, d1) -> (d0, d1 + 16)
// VEC1DTO1D: [[MAP3:#.*]] = (d0, d1) -> (d0, d1 + 24)
// vector<3x16xf32> -> vector<8xf32>
// VEC2DTO1D: [[MAP0:#.*]] = (d0, d1) -> (d0, d1)
// VEC2DTO1D: [[MAP1:#.*]] = (d0, d1) -> (d0, d1 + 8)
// VEC2DTO1D: [[MAP2:#.*]] = (d0, d1) -> (d0 + 1, d1)
// VEC2DTO1D: [[MAP3:#.*]] = (d0, d1) -> (d0 + 1, d1 + 8)
// VEC2DTO1D: [[MAP4:#.*]] = (d0, d1) -> (d0 + 2, d1)
// VEC2DTO1D: [[MAP5:#.*]] = (d0, d1) -> (d0 + 2, d1 + 8)
// vector<3x32xf32> -> vector<3x16xf32>
// VEC2DTO2D: [[MAP0:#.*]] = (d0, d1) -> (d0, d1)
// VEC2DTO2D: [[MAP1:#.*]] = (d0, d1) -> (d0, d1 + 16)
mlfunc @vector_add_2d(%M : index, %N : index) -> f32 {
  %A = alloc (%M, %N) : memref<?x?xf32, 0>
  %B = alloc (%M, %N) : memref<?x?xf32, 0>
  %C = alloc (%M, %N) : memref<?x?xf32, 0>
  %f1 = constant 1.0 : f32
  %f2 = constant 2.0 : f32
  for %i0 = 0 to %M {
    for %i1 = 0 to %N {
      // non-scoped %f1
      // VEC1DTO1D does 4x unrolling.
      // VEC1DTO1D: [[CST0:%.*]] = constant splat<vector<8xf32>, 1.000000e+00> : vector<8xf32>
      // VEC1DTO1D: [[CST1:%.*]] = constant splat<vector<8xf32>, 1.000000e+00> : vector<8xf32>
      // VEC1DTO1D: [[CST2:%.*]] = constant splat<vector<8xf32>, 1.000000e+00> : vector<8xf32>
      // VEC1DTO1D: [[CST3:%.*]] = constant splat<vector<8xf32>, 1.000000e+00> : vector<8xf32>
      // VEC1DTO1D: [[VAL0:%.*]] = affine_apply [[MAP0]]{{.*}}
      // VEC1DTO1D: "vector_transfer_write"([[CST0]], {{.*}}, [[VAL0]]#0, [[VAL0]]#1) : (vector<8xf32>
      // VEC1DTO1D: [[VAL1:%.*]] = affine_apply [[MAP1]]{{.*}}
      // VEC1DTO1D: "vector_transfer_write"([[CST1]], {{.*}}, [[VAL1]]#0, [[VAL1]]#1) : (vector<8xf32>
      // VEC1DTO1D: [[VAL2:%.*]] = affine_apply [[MAP2]]{{.*}}
      // VEC1DTO1D: "vector_transfer_write"([[CST2]], {{.*}}, [[VAL2]]#0, [[VAL2]]#1) : (vector<8xf32>
      // VEC1DTO1D: [[VAL3:%.*]] = affine_apply [[MAP3]]{{.*}}
      // VEC1DTO1D: "vector_transfer_write"([[CST3]], {{.*}}, [[VAL3]]#0, [[VAL3]]#1) : (vector<8xf32>
      //
      store %f1, %A[%i0, %i1] : memref<?x?xf32, 0>
    }
  }
  for %i2 = 0 to %M {
    for %i3 = 0 to %N {
      // non-scoped %f2
      // VEC2DTO1D does (3x2)x unrolling.
      // VEC2DTO1D-COUNT-6: {{.*}} = constant splat<vector<8xf32>, 1.000000e+00> : vector<8xf32>
      // VEC2DTO1D: [[VAL0:%.*]] = affine_apply [[MAP0]]{{.*}}
      // VEC2DTO1D: "vector_transfer_write"({{.*}}, [[VAL0]]#0, [[VAL0]]#1) : (vector<8xf32>
      // ... 4 other interleaved affine_apply, vector_transfer_write
      // VEC2DTO1D: [[VAL5:%.*]] = affine_apply [[MAP5]]{{.*}}
      // VEC2DTO1D: "vector_transfer_write"({{.*}}, [[VAL5]]#0, [[VAL5]]#1) : (vector<8xf32>
      //
      store %f2, %B[%i2, %i3] : memref<?x?xf32, 0>
    }
  }
  for %i4 = 0 to %M {
    for %i5 = 0 to %N {
      // VEC2DTO2D: %7 = affine_apply #map0(%i4, %i5)
      // VEC2DTO2D: %8 = "vector_transfer_read"(%0, %7#0, %7#1) : (memref<?x?xf32>, index, index) -> vector<3x16xf32>
      // VEC2DTO2D: %9 = affine_apply #map1(%i4, %i5)
      // VEC2DTO2D: %10 = "vector_transfer_read"(%0, %9#0, %9#1) : (memref<?x?xf32>, index, index) -> vector<3x16xf32>
      // VEC2DTO2D: %11 = affine_apply #map0(%i4, %i5)
      // VEC2DTO2D: %12 = "vector_transfer_read"(%1, %11#0, %11#1) : (memref<?x?xf32>, index, index) -> vector<3x16xf32>
      // VEC2DTO2D: %13 = affine_apply #map1(%i4, %i5)
      // VEC2DTO2D: %14 = "vector_transfer_read"(%1, %13#0, %13#1) : (memref<?x?xf32>, index, index) -> vector<3x16xf32>
      // VEC2DTO2D: %15 = addf %8, %12 : vector<3x16xf32>
      // VEC2DTO2D: %16 = addf %10, %14 : vector<3x16xf32>
      // VEC2DTO2D: %17 = affine_apply #map0(%i4, %i5)
      // VEC2DTO2D: "vector_transfer_write"(%15, %2, %17#0, %17#1) : (vector<3x16xf32>, memref<?x?xf32>, index, index) -> ()
      // VEC2DTO2D: %18 = affine_apply #map1(%i4, %i5)
      // VEC2DTO2D: "vector_transfer_write"(%16, %2, %18#0, %18#1) : (vector<3x16xf32>, memref<?x?xf32>, index, index) -> ()
      //
      %a5 = load %A[%i4, %i5] : memref<?x?xf32, 0>
      %b5 = load %B[%i4, %i5] : memref<?x?xf32, 0>
      %s5 = addf %a5, %b5 : f32
      store %s5, %C[%i4, %i5] : memref<?x?xf32, 0>
    }
  }
  %c7 = constant 7 : index
  %c42 = constant 42 : index
  %res = load %C[%c7, %c42] : memref<?x?xf32, 0>
  return %res : f32
}