forked from OSchip/llvm-project
Updates to transformation/analysis passes/utilities. Update the DMA generation pass
and getMemRefRegion() to work with specified loop depths; add support for outgoing
DMAs and store op's.

- Add support for getMemRefRegion() symbolic in outer loops - hence support for
  DMAs symbolic in outer surrounding loops.
- Add DMA generation support for outgoing DMAs (store op's to lower memory space);
  extend getMemRefRegion() to store op's.
- -memref-bound-check now works with store op's as well.
- Fix dma-generate (references to the old memref in the dma_start op were also
  being replaced with the new buffer); we need replaceAllMemRefUsesWith to work on
  only a subset of the uses - add a new optional 'statement' argument to serve as
  a filter; if provided, only those uses that are dominated by the filter are
  replaced.
- Add missing print for attributes for dma_start and dma_wait op's.
- Update the FlatAffineConstraints API.

PiperOrigin-RevId: 221889223
Commit fff1efbaf5 (parent 6b52ac3aa6).
@@ -34,6 +34,7 @@ class AffineApplyOp;
class AffineBound;
class AffineCondition;
class AffineMap;
class ForStmt;
class IntegerSet;
class MLIRContext;
class MLValue;

@@ -177,7 +178,6 @@ public:
ArrayRef<MLValue *> getOperands() const;
AffineMap getAffineMap() const;

private:
void forwardSubstitute(const AffineApplyOp &inputOp,
ArrayRef<bool> inputResultsToSubstitute);

@@ -244,13 +244,19 @@ public:
FlatAffineConstraints(unsigned numReservedInequalities,
unsigned numReservedEqualities,
unsigned numReservedCols, unsigned numDims = 0,
unsigned numSymbols = 0, unsigned numLocals = 0)
unsigned numSymbols = 0, unsigned numLocals = 0,
ArrayRef<Optional<MLValue *>> idArgs = {})
: numReservedCols(numReservedCols), numDims(numDims),
numSymbols(numSymbols) {
assert(numReservedCols >= numDims + numSymbols + 1);
equalities.reserve(numReservedCols * numReservedEqualities);
inequalities.reserve(numReservedCols * numReservedInequalities);
numIds = numDims + numSymbols + numLocals;
ids.reserve(numReservedCols);
if (idArgs.empty())
ids.resize(numIds, None);
else
ids.insert(ids.end(), idArgs.begin(), idArgs.end());
}

/// Constructs a constraint system with the specified number of

@@ -261,6 +267,7 @@ public:
numSymbols(numSymbols) {
assert(numReservedCols >= numDims + numSymbols + 1);
numIds = numDims + numSymbols + numLocals;
ids.resize(numIds, None);
}

explicit FlatAffineConstraints(const HyperRectangularSet &set);

@@ -290,10 +297,10 @@ public:
// Clears any existing data and reserves memory for the specified constraints.
void reset(unsigned numReservedInequalities, unsigned numReservedEqualities,
unsigned numReservedCols, unsigned numDims, unsigned numSymbols,
unsigned numLocals = 0);
unsigned numLocals = 0, ArrayRef<MLValue *> idArgs = {});

void reset(unsigned numDims = 0, unsigned numSymbols = 0,
unsigned numLocals = 0);
unsigned numLocals = 0, ArrayRef<MLValue *> idArgs = {});

/// Appends constraints from 'other' into this. This is equivalent to an
/// intersection with no simplification of any sort attempted.

@@ -396,6 +403,12 @@ public:
/// Adds a lower bound expression for the specified expression.
void addLowerBound(ArrayRef<int64_t> expr, ArrayRef<int64_t> lb);

/// Adds constraints (lower and upper bounds) from the ForStmt into the
/// FlatAffineConstraints. 'forStmt's' MLValue is used to look up the right
/// identifier, and if it doesn't exist, a new one is added. Returns false for
/// the yet unimplemented/unsupported cases.
bool addBoundsFromForStmt(unsigned pos, ForStmt *forStmt);

/// Adds an upper bound expression for the specified expression.
void addUpperBound(ArrayRef<int64_t> expr, ArrayRef<int64_t> ub);

@@ -407,12 +420,17 @@ public:
/// Sets the identifier at the specified position to a constant.
void setIdToConstant(unsigned pos, int64_t val);

/// Looks up the identifier with the specified MLValue. Returns false if not
/// found.
bool findId(const MLValue &operand, unsigned *pos);

// Add identifiers of the specified kind - specified positions are relative to
// the kind of identifier.
void addDimId(unsigned pos);
// the kind of identifier. 'id' is the MLValue corresponding to the
// identifier that can optionally be provided.
void addDimId(unsigned pos, MLValue *id = nullptr);
void addSymbolId(unsigned pos);
void addLocalId(unsigned pos);
void addId(IdKind kind, unsigned pos);
void addId(IdKind kind, unsigned pos, MLValue *id = nullptr);

/// Composes the affine value map with this FlatAffineConstrains, adding the
/// results of the map as dimensions at the specified position and with the

@@ -435,6 +453,9 @@ public:
// value to mark exactness for example.
void projectOut(unsigned pos, unsigned num);

/// Projects out the identifier that is associate with MLValue *.
void projectOut(MLValue *id);

void removeId(IdKind idKind, unsigned pos);
void removeId(unsigned pos);

@@ -453,19 +474,30 @@ public:
return numIds - numDims - numSymbols;
}

inline ArrayRef<Optional<MLValue *>> getIds() const {
return {ids.data(), ids.size()};
}

/// Clears this list of constraints and copies other into it.
void clearAndCopyFrom(const FlatAffineConstraints &other);

/// Returns the constant lower bound of the specified identifier (through a
/// scan through the constraints); returns None if the bound isn't trivially a
/// constant.
Optional<int64_t> getConstantLowerBound(unsigned pos);
Optional<int64_t> getConstantLowerBound(unsigned pos) const;

/// Returns the constant upper bound of the specified identifier (through a
/// scan through the constraints); returns None if the bound isn't trivially a
/// constant. Note that the upper bound for FlatAffineConstraints is
/// inclusive.
Optional<int64_t> getConstantUpperBound(unsigned pos);
Optional<int64_t> getConstantUpperBound(unsigned pos) const;

/// Returns the extent (upper bound - lower bound) of the specified
/// identifier if it is found to be a constant; returns None if it's not a
/// constant. 'lbPosition' is set to the row position of the corresponding
/// lower bound.
Optional<int64_t> getConstantBoundDifference(unsigned pos,
unsigned *lbPosition) const;

// Returns the lower and upper bounds of the specified dimensions as
// AffineMap's. Returns false for the unimplemented cases for the moment.

@@ -509,6 +541,12 @@ private:
/// Number of identifiers corresponding to symbols (unknown but constant for
/// analysis).
unsigned numSymbols;

/// MLValues corresponding to the (column) identifiers of this constraint
/// system appearing in the order the identifiers correspond to columns.
/// Temporary ones or those that aren't associated to any MLValue are to be
/// set to None.
SmallVector<Optional<MLValue *>, 8> ids;
};

} // end namespace mlir.
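To make the new MLValue-attached identifier API above concrete, here is a minimal usage sketch. It relies only on the declarations in this header (addDimId with an optional MLValue, findId, setIdToConstant, getIds); 'cst' and 'iv' are hypothetical names for an existing constraint system and a surrounding loop's induction-variable MLValue:

  // Minimal sketch; 'cst' is an existing FlatAffineConstraints and 'iv' is the
  // MLValue of a surrounding loop's induction variable.
  cst.addDimId(cst.getNumDimIds(), iv); // append a dimension bound to 'iv'
  unsigned pos;
  if (cst.findId(*iv, &pos))            // look the identifier back up by MLValue
    cst.setIdToConstant(pos, 0);        // e.g., pin it to a constant
  auto ids = cst.getIds();              // entries are None for unattached ids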
@@ -25,9 +25,15 @@
#ifndef MLIR_ANALYSIS_UTILS_H
#define MLIR_ANALYSIS_UTILS_H

#include "mlir/Analysis/AffineStructures.h"
#include "mlir/Support/LLVM.h"
#include "llvm/ADT/SmallVector.h"
#include <memory>

namespace mlir {

class FlatAffineConstraints;
class MLValue;
class OperationStmt;
class Statement;

@@ -37,8 +43,69 @@ bool dominates(const Statement &a, const Statement &b);
/// Returns true if statement 'a' properly dominates statement b.
bool properlyDominates(const Statement &a, const Statement &b);

/// Returns the memory region accessed by this memref.
bool getMemoryRegion(OperationStmt *opStmt, FlatAffineConstraints *region);
/// A region of a memref's data space; this is typically constructed by
/// analyzing load/store op's on this memref and the index space of loops
/// surrounding such op's.
// For example, the memref region for a load operation at loop depth = 1:
//
// for %i = 0 to 32 {
//   for %ii = %i to (d0) -> (d0 + 8) (%i) {
//     load %A[%ii]
//   }
// }
//
// Region: {memref = %A, write = false, {%i <= m0 <= %i + 7} }
// The last field is a 2-d FlatAffineConstraints symbolic in %i.
//
struct MemRefRegion {
FlatAffineConstraints *getConstraints() { return &cst; }
const FlatAffineConstraints *getConstraints() const { return &cst; }
bool isWrite() const { return write; }
void setWrite(bool flag) { write = flag; }

// Computes the shape if the extents are known constants, returns false
// otherwise.
bool getConstantShape(llvm::SmallVectorImpl<int> *shape) const;

// Returns the number of elements in this region if it's a known constant. We
// use int64_t instead of uint64_t since index types can be at most int64_t.
Optional<int64_t> getConstantSize() const;

/// Memref that this region corresponds to.
MLValue *memref;

private:
/// Read or write.
bool write;

/// Region (data space) of the memref accessed. This set will thus have at
/// least as many dimensional identifiers as the shape dimensionality of the
/// memref, and these are the leading dimensions of the set appearing in that
/// order (major to minor / outermost to innermost). There may be additional
/// identifiers since getMemRefRegion() is called with a specific loop depth,
/// and thus the region is symbolic in the outer surrounding loops at that
/// depth.
// TODO(bondhugula): Replace this to exploit HyperRectangularSet.
FlatAffineConstraints cst;
};

/// Computes the memory region accessed by this memref with the region
/// represented as constraints symbolic/parameteric in 'loopDepth' loops
/// surrounding opStmt. Returns false if this fails due to yet unimplemented
/// cases.
// For example, the memref region for this operation at loopDepth = 1 will be:
//
// for %i = 0 to 32 {
//   for %ii = %i to (d0) -> (d0 + 8) (%i) {
//     load %A[%ii]
//   }
// }
//
// {memref = %A, write = false, {%i <= m0 <= %i + 7} }
// The last field is a 2-d FlatAffineConstraints symbolic in %i.
//
bool getMemRefRegion(OperationStmt *opStmt, unsigned loopDepth,
MemRefRegion *region);

} // end namespace mlir
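As a quick illustration of the interface above, a caller typically computes the region for a memory operation and then queries its constant footprint. This is only a sketch; 'opStmt' is assumed to be the load or store OperationStmt of interest:

  // Region symbolic in the outer loop(s) up to loopDepth = 1.
  MemRefRegion region;
  if (getMemRefRegion(opStmt, /*loopDepth=*/1, &region)) {
    llvm::SmallVector<int, 4> shape;
    if (region.getConstantShape(&shape)) {
      // For the running example above, shape is {8} and the size is 8 elements.
      Optional<int64_t> size = region.getConstantSize();
      (void)size;
    }
  }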
@@ -43,15 +43,17 @@ class SSAValue;

/// Replace all uses of oldMemRef with newMemRef while optionally remapping the
/// old memref's indices using the supplied affine map and adding any additional
/// indices. The new memref could be of a different shape or rank. Returns true
/// on success and false if the replacement is not possible (whenever a memref
/// is used as an operand in a non-deferencing scenario).
/// Additional indices are added at the start.
/// indices. The new memref could be of a different shape or rank. An optional
/// argument 'domOpFilter' restricts the replacement to only those operations
/// that are dominated by the former. Returns true on success and false if the
/// replacement is not possible (whenever a memref is used as an operand in a
/// non-deferencing scenario). Additional indices are added at the start.
// TODO(mlir-team): extend this for SSAValue / CFGFunctions. Can also be easily
// extended to add additional indices at any position.
bool replaceAllMemRefUsesWith(const MLValue *oldMemRef, MLValue *newMemRef,
llvm::ArrayRef<MLValue *> extraIndices = {},
AffineMap indexRemap = AffineMap::Null());
AffineMap indexRemap = AffineMap::Null(),
const Statement *domStmtFilter = nullptr);

/// Creates and inserts into 'builder' a new AffineApplyOp, with the number of
/// its results equal to the number of operands, as a composition

@@ -64,7 +66,7 @@ OperationStmt *
createComposedAffineApplyOp(FuncBuilder *builder, Location loc,
ArrayRef<MLValue *> operands,
ArrayRef<OperationStmt *> affineApplyOps,
SmallVectorImpl<SSAValue *> &results);
SmallVectorImpl<SSAValue *> *results);

/// Given an operation statement, inserts a new single affine apply operation,
/// that is exclusively used by this operation statement, and that provides all
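A minimal sketch of the new filtering argument, assuming 'oldMemRef' and 'fastBuf' are MLValue pointers and 'forStmt' is the loop whose body should be rewritten (this mirrors how the DMA generation pass calls it later in this change):

  // No extra indices and no index remap; only uses dominated by the first
  // statement of the loop body are replaced.
  bool ok = replaceAllMemRefUsesWith(oldMemRef, fastBuf,
                                     /*extraIndices=*/{},
                                     /*indexRemap=*/AffineMap::Null(),
                                     /*domStmtFilter=*/&*forStmt->begin());
  (void)ok;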
@@ -897,7 +897,7 @@ static void computeDirectionVector(
dependenceDomain->addDimId(j);
}

// Add equality contraints for each common loop, setting newly instroduced
// Add equality contraints for each common loop, setting newly introduced
// variable at column 'j' to the 'dst' IV minus the 'src IV.
SmallVector<int64_t, 4> eq;
eq.resize(dependenceDomain->getNumCols());
@@ -26,6 +26,7 @@
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/IntegerSet.h"
#include "mlir/IR/MLValue.h"
#include "mlir/IR/Statements.h"
#include "mlir/Support/MathExtras.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/Support/Debug.h"

@@ -480,6 +481,10 @@ FlatAffineConstraints::FlatAffineConstraints(
numSymbols = other.getNumSymbolIds();
numIds = other.getNumIds();

auto otherIds = other.getIds();
ids.reserve(numReservedCols);
ids.insert(ids.end(), otherIds.begin(), otherIds.end());

unsigned numReservedEqualities = other.getNumReservedEqualities();
unsigned numReservedInequalities = other.getNumReservedInequalities();

@@ -506,6 +511,7 @@ FlatAffineConstraints::FlatAffineConstraints(IntegerSet set)
numSymbols(set.getNumSymbols()) {
equalities.reserve(set.getNumEqualities() * numReservedCols);
inequalities.reserve(set.getNumInequalities() * numReservedCols);
ids.resize(numIds, None);

for (unsigned i = 0, e = set.getNumConstraints(); i < e; ++i) {
AffineExpr expr = set.getConstraint(i);

@@ -525,7 +531,8 @@ void FlatAffineConstraints::reset(unsigned numReservedInequalities,
unsigned numReservedEqualities,
unsigned newNumReservedCols,
unsigned newNumDims, unsigned newNumSymbols,
unsigned newNumLocals) {
unsigned newNumLocals,
ArrayRef<MLValue *> idArgs) {
assert(newNumReservedCols >= newNumDims + newNumSymbols + newNumLocals + 1 &&
"minimum 1 column");
numReservedCols = newNumReservedCols;

@@ -538,12 +545,20 @@ void FlatAffineConstraints::reset(unsigned numReservedInequalities,
equalities.reserve(newNumReservedCols * numReservedEqualities);
if (numReservedInequalities >= 1)
inequalities.reserve(newNumReservedCols * numReservedInequalities);
ids.clear();
if (idArgs.empty()) {
ids.resize(numIds, None);
} else {
ids.reserve(idArgs.size());
ids.insert(ids.end(), idArgs.begin(), idArgs.end());
}
}

void FlatAffineConstraints::reset(unsigned newNumDims, unsigned newNumSymbols,
unsigned newNumLocals) {
unsigned newNumLocals,
ArrayRef<MLValue *> idArgs) {
reset(0, 0, newNumDims + newNumSymbols + newNumLocals + 1, newNumDims,
newNumSymbols, newNumLocals);
newNumSymbols, newNumLocals, idArgs);
}

void FlatAffineConstraints::append(const FlatAffineConstraints &other) {

@@ -567,8 +582,8 @@ void FlatAffineConstraints::addLocalId(unsigned pos) {
addId(IdKind::Local, pos);
}

void FlatAffineConstraints::addDimId(unsigned pos) {
addId(IdKind::Dimension, pos);
void FlatAffineConstraints::addDimId(unsigned pos, MLValue *id) {
addId(IdKind::Dimension, pos, id);
}

void FlatAffineConstraints::addSymbolId(unsigned pos) {

@@ -577,7 +592,7 @@ void FlatAffineConstraints::addSymbolId(unsigned pos) {

/// Adds a dimensional identifier. The added column is initialized to
/// zero.
void FlatAffineConstraints::addId(IdKind kind, unsigned pos) {
void FlatAffineConstraints::addId(IdKind kind, unsigned pos, MLValue *id) {
if (kind == IdKind::Dimension) {
assert(pos <= getNumDimIds());
} else if (kind == IdKind::Symbol) {

@@ -595,16 +610,16 @@ void FlatAffineConstraints::addId(IdKind kind, unsigned pos) {
numReservedCols++;
}

unsigned elimPos;
unsigned absolutePos;

if (kind == IdKind::Dimension) {
elimPos = pos;
absolutePos = pos;
numDims++;
} else if (kind == IdKind::Symbol) {
elimPos = pos + getNumDimIds();
absolutePos = pos + getNumDimIds();
numSymbols++;
} else {
elimPos = pos + getNumDimIds() + getNumSymbolIds();
absolutePos = pos + getNumDimIds() + getNumSymbolIds();
}
numIds++;

@@ -615,41 +630,53 @@ void FlatAffineConstraints::addId(IdKind kind, unsigned pos) {
int numCols = static_cast<int>(getNumCols());
for (int r = numInequalities - 1; r >= 0; r--) {
for (int c = numCols - 2; c >= 0; c--) {
if (c < elimPos)
if (c < absolutePos)
atIneq(r, c) = inequalities[r * oldNumReservedCols + c];
else
atIneq(r, c + 1) = inequalities[r * oldNumReservedCols + c];
}
atIneq(r, elimPos) = 0;
atIneq(r, absolutePos) = 0;
}

for (int r = numEqualities - 1; r >= 0; r--) {
for (int c = numCols - 2; c >= 0; c--) {
// All values in column elimPositions < elimPos have the same coordinates
// in the 2-d view of the coefficient buffer.
if (c < elimPos)
// All values in column absolutePositions < absolutePos have the same
// coordinates in the 2-d view of the coefficient buffer.
if (c < absolutePos)
atEq(r, c) = equalities[r * oldNumReservedCols + c];
else
// Those at elimPosition >= elimPos, get a shifted elimPosition.
// Those at absolutePosition >= absolutePos, get a shifted
// absolutePosition.
atEq(r, c + 1) = equalities[r * oldNumReservedCols + c];
}
// Initialize added dimension to zero.
atEq(r, elimPos) = 0;
atEq(r, absolutePos) = 0;
}

// If an 'id' is provided, insert it; otherwise use None.
if (id) {
ids.insert(ids.begin() + absolutePos, id);
} else {
ids.insert(ids.begin() + absolutePos, None);
}
assert(ids.size() == getNumIds());
}

// This routine may add additional local variables if the flattened
// expression corresponding to the map has such variables due to the presence of
// mod's, ceildiv's, and floordiv's.
void FlatAffineConstraints::composeMap(AffineValueMap *vMap, unsigned pos) {
assert(vMap->getNumOperands() == getNumIds() && "inconsistent map");
assert(vMap->getNumDims() == getNumDimIds() && "inconsistent map");
assert(pos <= getNumIds() && "invalid position");
assert(vMap->getNumSymbols() == getNumSymbolIds());

AffineMap map = vMap->getAffineMap();

// We add one equality for each result connecting the result dim of the map to
// the other identifiers.
// For eg: if the expression is 16*i0 + i1, and this is the r^th
// iteration/result of the value map, we are adding the equality:
// d_r - 16*i0 - i1 = 0. Hence, when flattening say (i0 + 1, i0 + 8*i2), we
// add two equalities overall: d_0 - i0 - 1 == 0, d1 - i0 - 8*i2 == 0.
for (unsigned r = 0, e = map.getNumResults(); r < e; r++) {
// Add dimension.
addDimId(pos + r);

@@ -660,44 +687,60 @@ void FlatAffineConstraints::composeMap(AffineValueMap *vMap, unsigned pos) {
map.getNumSymbols(), &eq, &cst);
(void)ret;
assert(ret && "unimplemented for semi-affine maps");
for (unsigned j = 0, e = eq.size(); j < e; j++) {
eq[j] = -eq[j];
}
// Make the value map and the flat affine cst dimensions compatible.
// A lot of this code will be refactored/cleaned up.
for (unsigned l = 0, e = cst.getNumLocalIds(); l < e; l++) {
addLocalId(getNumLocalIds());
addLocalId(0);
}
// TODO(andydavis,bondhugula,ntv): we need common code to merge
// dimensions/symbols.
assert(cst.getNumDimIds() <= getNumIds());
for (unsigned t = 0, e = getNumDimIds() - cst.getNumDimIds(); t < e; t++) {
for (unsigned t = 0, e = r + 1; t < e; t++) {
// TODO: Consider using a batched version to add a range of IDs.
cst.addDimId(0);
eq.insert(eq.begin(), 0);
}
// Set the ceofficient for this result to one.
eq[r] = 1;
// TODO(andydavis,bondhugula,ntv): we need common code to merge
// dimensions/symbols.
assert(cst.getNumSymbolIds() <= getNumSymbolIds());
for (unsigned t = 0, e = getNumSymbolIds() - cst.getNumSymbolIds(); t < e;
t++) {
eq.insert(eq.begin() + cst.getNumSymbolIds(), 0);
cst.addSymbolId(cst.getNumSymbolIds());

assert(cst.getNumDimIds() <= getNumDimIds());
for (unsigned t = 0, e = getNumDimIds() - cst.getNumDimIds(); t < e; t++) {
cst.addDimId(cst.getNumDimIds() - 1);
}
// TODO(andydavis,bondhugula,ntv): we need common code to merge
// identifiers. All of this will be cleaned up. At this point, it's fine as
// long as it stays *inside* the FlatAffineConstraints API methods.
assert(cst.getNumSymbolIds() <= getNumSymbolIds());
assert(cst.getNumLocalIds() <= getNumLocalIds());
for (unsigned t = 0, e = getNumLocalIds() - cst.getNumLocalIds(); t < e;
t++) {
eq.insert(eq.begin() + cst.getNumDimIds() + cst.getNumSymbolIds(), 0);
cst.addLocalId(0);
cst.addLocalId(cst.getNumLocalIds());
}
/// Finally, append cst to this constraint set.
append(cst);

// eqToAdd is the equality corresponding to the flattened affine expression.
SmallVector<int64_t, 8> eqToAdd(getNumCols(), 0);
// Set the coefficient for this result to one.
eqToAdd[r] = 1;

// Dims and symbols.
for (unsigned i = 0, e = vMap->getNumOperands(); i < e; i++) {
unsigned loc;
bool ret = findId(*cast<MLValue>(vMap->getOperand(i)), &loc);
assert(ret && "id expected, but not found");
(void)ret;
// We need to negate 'eq' since the newly added dimension is going to be
// set to this one.
eqToAdd[loc] = -eq[i];
}
// Local vars common to eq and cst are at the beginning.
int j = getNumDimIds() + getNumSymbolIds();
int end = eq.size() - 1;
for (int i = vMap->getNumOperands(); i < end; i++, j++) {
eqToAdd[j] = -eq[i];
}

// Constant term.
eqToAdd[getNumCols() - 1] = -eq[eq.size() - 1];

// Add the equality connecting the result of the map to this constraint set.
addEquality(eq);
addEquality(eqToAdd);
}
}

@@ -858,6 +901,7 @@ void FlatAffineConstraints::removeColumnRange(unsigned colStart,
numDims -= numDimsEliminated;
numSymbols -= numSymbolsEliminated;
numIds = numIds - numColsEliminated;
ids.erase(ids.begin() + colStart, ids.begin() + colLimit);

// No resize necessary. numReservedCols remains the same.
}

@@ -1071,6 +1115,90 @@ void FlatAffineConstraints::addUpperBound(ArrayRef<int64_t> expr,
}
}

bool FlatAffineConstraints::findId(const MLValue &operand, unsigned *pos) {
unsigned i = 0;
for (const auto &mayBeId : ids) {
if (mayBeId.hasValue() && mayBeId.getValue() == &operand) {
*pos = i;
return true;
}
i++;
}
return false;
}

// TODO(andydavis, bondhugula) AFFINE REFACTOR: merge with loop bounds
// code in dependence analysis.
bool FlatAffineConstraints::addBoundsFromForStmt(unsigned pos,
ForStmt *forStmt) {
// Adds a lower or upper bound when the bounds aren't constant.
auto addLowerOrUpperBound = [&](bool lower) -> bool {
const auto &operands = lower ? forStmt->getLowerBoundOperands()
: forStmt->getUpperBoundOperands();
SmallVector<unsigned, 8> positions;

for (const auto &operand : operands) {
unsigned loc;
// TODO(andydavis, bondhugula) AFFINE REFACTOR: merge with loop bounds
// code in dependence analysis.
if (!findId(*operand, &loc)) {
addDimId(getNumDimIds(), operand);
loc = getNumDimIds() - 1;
}
positions.push_back(loc);
}

auto boundMap =
lower ? forStmt->getLowerBoundMap() : forStmt->getUpperBoundMap();

for (auto result : boundMap.getResults()) {
SmallVector<int64_t, 4> flattenedExpr;
SmallVector<int64_t, 4> ineq(getNumCols(), 0);
// TODO(andydavis, bondhugula) AFFINE REFACTOR: merge with loop bounds in
// dependence analysis.
FlatAffineConstraints cst;
if (!getFlattenedAffineExpr(result, boundMap.getNumDims(),
boundMap.getNumSymbols(), &flattenedExpr,
&cst)) {
LLVM_DEBUG(llvm::dbgs()
<< "semi-affine expressions not yet supported\n");
return false;
}
if (cst.getNumLocalIds() > 0) {
LLVM_DEBUG(
llvm::dbgs()
<< "loop bounds with mod/floordiv expr's not yet supported\n");
return false;
}

ineq[pos] = lower ? 1 : -1;
for (unsigned j = 0, e = boundMap.getNumInputs(); j < e; j++) {
ineq[positions[j]] = lower ? -flattenedExpr[j] : flattenedExpr[j];
}
// Constant term.
ineq[getNumCols() - 1] = lower ? -flattenedExpr[flattenedExpr.size() - 1]
: flattenedExpr[flattenedExpr.size() - 1];
addInequality(ineq);
}
return true;
};

if (forStmt->hasConstantLowerBound()) {
addConstantLowerBound(pos, forStmt->getConstantLowerBound());
} else {
// Non-constant lower bound case.
if (!addLowerOrUpperBound(/*lower=*/true))
return false;
}

if (forStmt->hasConstantUpperBound()) {
addConstantUpperBound(pos, forStmt->getConstantUpperBound() - 1);
return true;
}
// Non-constant upper bound case.
return addLowerOrUpperBound(/*lower=*/false);
}
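For the constant-bound path above, the added constraints reduce to a simple pair of inequalities. A minimal sketch for a loop 'for %i = 0 to 32' with the IV at column 0 ('cst' is assumed to already have the IV at that column; this mirrors the addConstantLowerBound/addConstantUpperBound calls):

  // for %i = 0 to 32  =>  i >= 0  and  i <= 31 (the upper bound is inclusive).
  cst.addConstantLowerBound(/*pos=*/0, 0);
  cst.addConstantUpperBound(/*pos=*/0, 32 - 1);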
/// Sets the specified identifer to a constant value.
void FlatAffineConstraints::setIdToConstant(unsigned pos, int64_t val) {
unsigned offset = equalities.size();

@@ -1119,7 +1247,8 @@ bool FlatAffineConstraints::getDimensionBounds(unsigned pos, unsigned num,
return true;
}

Optional<int64_t> FlatAffineConstraints::getConstantLowerBound(unsigned pos) {
Optional<int64_t>
FlatAffineConstraints::getConstantLowerBound(unsigned pos) const {
assert(pos < getNumCols() - 1);
Optional<int64_t> lb = None;
for (unsigned r = 0; r < getNumInequalities(); r++) {

@@ -1143,7 +1272,71 @@ Optional<int64_t> FlatAffineConstraints::getConstantLowerBound(unsigned pos) {
return lb;
}

Optional<int64_t> FlatAffineConstraints::getConstantUpperBound(unsigned pos) {
/// Returns the extent of the specified identifier (upper bound - lower bound)
/// if it found to be a constant; returns None if it's not a constant.
/// 'lbPosition' is set to the row position of the corresponding lower bound.
Optional<int64_t>
FlatAffineConstraints::getConstantBoundDifference(unsigned pos,
unsigned *lbPosition) const {
// Check if the identifier appears at all in any of the inequalities.
unsigned r, e;
for (r = 0, e = getNumInequalities(); r < e; r++) {
if (atIneq(r, pos) != 0)
break;
}
if (r == e) {
// If it doesn't appear, just remove the column and return.
// TODO(andydavis,bondhugula): refactor removeColumns to use it from here.
return None;
}

// Positions of constraints that are lower/upper bounds on the variable.
SmallVector<unsigned, 4> lbIndices, ubIndices;

// Gather all lower bounds and upper bounds of the variable. Since the
// canonical form c_1*x_1 + c_2*x_2 + ... + c_0 >= 0, a constraint is a lower
// bound for x_i if c_i >= 1, and an upper bound if c_i <= -1.
for (unsigned r = 0, e = getNumInequalities(); r < e; r++) {
if (atIneq(r, pos) >= 1)
// Lower bound.
lbIndices.push_back(r);
else if (atIneq(r, pos) <= -1)
// Upper bound.
ubIndices.push_back(r);
}

// TODO(bondhugula): eliminate all variables that aren't part of any of the
// lower/upper bounds - to make this more powerful.

Optional<int64_t> minDiff = None;
for (auto ubPos : ubIndices) {
for (auto lbPos : lbIndices) {
// Look for a lower bound and an upper bound that only differ by a
// constant, i.e., pairs of the form 0 <= c_pos - f(c_i's) <= diffConst.
// For example, if ii is the pos^th variable, we are looking for
// constraints like ii >= i, ii <= ii + 50, 50 being the difference. The
// minimum among all such constant differences is kept since that's the
// constant bounding the extent of the pos^th variable.
unsigned j;
for (j = 0; j < getNumCols() - 1; j++)
if (atIneq(ubPos, j) != -atIneq(lbPos, j)) {
break;
}
if (j < getNumCols() - 1)
continue;
int64_t mayDiff =
atIneq(ubPos, getNumCols() - 1) + atIneq(lbPos, getNumCols() - 1) + 1;
if (minDiff == None || mayDiff < minDiff) {
minDiff = mayDiff;
*lbPosition = lbPos;
}
}
}
return minDiff;
}
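A quick worked instance of the scan above: if the pos^th variable is m0 with lower bound m0 - i >= 0 and upper bound -m0 + i + 7 >= 0, the coefficients on every non-constant column negate each other, so the pair qualifies, and the extent is the sum of the two constant terms plus one: 7 + 0 + 1 = 8. This matches the {%i <= m0 <= %i + 7} region used as the running example elsewhere in this change.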
Optional<int64_t>
FlatAffineConstraints::getConstantUpperBound(unsigned pos) const {
assert(pos < getNumCols() - 1);
Optional<int64_t> ub = None;
for (unsigned r = 0; r < getNumInequalities(); r++) {

@@ -1196,8 +1389,17 @@ bool FlatAffineConstraints::isHyperRectangular(unsigned pos,
void FlatAffineConstraints::print(raw_ostream &os) const {
assert(inequalities.size() == getNumInequalities() * numReservedCols);
assert(equalities.size() == getNumEqualities() * numReservedCols);
assert(ids.size() == getNumIds());
os << "\nConstraints (" << getNumDimIds() << " dims, " << getNumSymbolIds()
<< " symbols, " << getNumLocalIds() << " locals): \n";
os << "(";
for (unsigned i = 0, e = getNumIds(); i < e; i++) {
if (ids[i] == None)
os << "None ";
else
os << "MLValue ";
}
os << ")\n";
for (unsigned i = 0, e = getNumEqualities(); i < e; ++i) {
for (unsigned j = 0; j < getNumCols(); ++j) {
os << atEq(i, j) << " ";

@@ -1223,6 +1425,7 @@ void FlatAffineConstraints::clearAndCopyFrom(
const FlatAffineConstraints &other) {
FlatAffineConstraints copy(other);
std::swap(*this, copy);
assert(copy.getNumIds() == copy.getIds().size());
}

void FlatAffineConstraints::removeId(unsigned pos) {

@@ -1245,6 +1448,7 @@ void FlatAffineConstraints::removeId(unsigned pos) {
atEq(r, c) = atEq(r, c + 1);
}
}
ids.erase(ids.begin() + pos);
}

static std::pair<unsigned, unsigned>

@@ -1375,11 +1579,18 @@ void FlatAffineConstraints::FourierMotzkinEliminate(
unsigned newNumDims = dimsSymbols.first;
unsigned newNumSymbols = dimsSymbols.second;

SmallVector<Optional<MLValue *>, 8> newIds;
newIds.reserve(numIds - 1);
newIds.insert(newIds.end(), ids.begin(), ids.begin() + pos);
newIds.insert(newIds.end(), ids.begin() + pos + 1, ids.end());

/// Create the new system which has one identifier less.
FlatAffineConstraints newFac(
lbIndices.size() * ubIndices.size() + nbIndices.size(),
getNumEqualities(), getNumCols() - 1, newNumDims, newNumSymbols,
/*numLocals=*/getNumIds() - 1 - newNumDims - newNumSymbols);
/*numLocals=*/getNumIds() - 1 - newNumDims - newNumSymbols, newIds);

assert(newFac.getIds().size() == newFac.getNumIds());

// This will be used to check if the elimination was integer exact.
unsigned lcmProducts = 1;

@@ -1462,9 +1673,19 @@ void FlatAffineConstraints::FourierMotzkinEliminate(

void FlatAffineConstraints::projectOut(unsigned pos, unsigned num) {
// 'pos' can be at most getNumCols() - 2.
if (num == 0)
return;
assert(pos <= getNumCols() - 2 && "invalid position");
assert(pos + num < getNumCols() && "invalid range");
for (unsigned i = 0; i < num; i++) {
FourierMotzkinEliminate(pos);
}
}

void FlatAffineConstraints::projectOut(MLValue *id) {
unsigned pos;
bool ret = findId(*id, &pos);
assert(ret);
(void)ret;
FourierMotzkinEliminate(pos);
}
@@ -63,15 +63,15 @@ void MemRefBoundCheck::visitOperationStmt(OperationStmt *opStmt) {
// TODO(bondhugula): extend this to store's and other memref dereferencing
// op's.
if (auto loadOp = opStmt->dyn_cast<LoadOp>()) {
FlatAffineConstraints memoryRegion;
if (!getMemoryRegion(opStmt, &memoryRegion))
MemRefRegion region;
if (!getMemRefRegion(opStmt, /*loopDepth=*/0, &region))
return;
LLVM_DEBUG(llvm::dbgs() << "Memory region");
LLVM_DEBUG(memoryRegion.dump());
LLVM_DEBUG(region.getConstraints()->dump());
unsigned rank = loadOp->getMemRefType().getRank();
// For each dimension, check for out of bounds.
for (unsigned r = 0; r < rank; r++) {
FlatAffineConstraints ucst(memoryRegion);
FlatAffineConstraints ucst(*region.getConstraints());
// Intersect memory region with constraint capturing out of bounds,
// and check if the constraint system is feasible. If it is, there is at
// least one point out of bounds.

@@ -91,7 +91,7 @@ void MemRefBoundCheck::visitOperationStmt(OperationStmt *opStmt) {
Twine(r + 1));
}
// Check for less than negative index.
FlatAffineConstraints lcst(memoryRegion);
FlatAffineConstraints lcst(*region.getConstraints());
std::fill(ineq.begin(), ineq.end(), 0);
// d_i <= -1;
lcst.addConstantUpperBound(r, -1);
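To make the per-dimension feasibility check above concrete, here is a hedged sketch of the lower-bound half, which is the part visible in this hunk ('region' and 'r' come from the surrounding loop; the symmetric upper check intersects with d_r >= dimSize, where dimSize is assumed to come from the memref type):

  // If the region still contains a point with d_r <= -1, the access can go
  // below index 0 along dimension r, and a diagnostic is emitted.
  FlatAffineConstraints lcst(*region.getConstraints());
  lcst.addConstantUpperBound(r, -1);  // d_r <= -1
  // (feasibility of 'lcst' is then tested; that call lies outside this hunk.)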
@@ -27,6 +27,9 @@
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/StandardOps/StandardOps.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "analysis-utils"

using namespace mlir;

@@ -65,62 +68,141 @@ bool mlir::dominates(const Statement &a, const Statement &b) {
return &a == &b || properlyDominates(a, b);
}

/// Returns the memory region accessed by this memref.
// TODO(bondhugula): extend this to store's and other memref dereferencing ops.
bool mlir::getMemoryRegion(OperationStmt *opStmt,
FlatAffineConstraints *region) {
OpPointer<LoadOp> loadOp;
if (!(loadOp = opStmt->dyn_cast<LoadOp>()))
return false;
Optional<int64_t> MemRefRegion::getConstantSize() const {
auto memRefType = memref->getType().cast<MemRefType>();
unsigned rank = memRefType.getRank();

// Compute the extents of the buffer.
int64_t numElements = 1;
for (unsigned d = 0; d < rank; d++) {
unsigned lbPos;
Optional<int64_t> diff = cst.getConstantBoundDifference(d, &lbPos);

if (!diff.hasValue())
return None;
int64_t diffConstant = diff.getValue();

if (diffConstant <= 0)
return 0;
numElements *= diffConstant;
}
return numElements;
}

bool MemRefRegion::getConstantShape(SmallVectorImpl<int> *shape) const {
auto memRefType = memref->getType().cast<MemRefType>();
unsigned rank = memRefType.getRank();
shape->reserve(rank);

// Compute the extents of this memref region.
for (unsigned d = 0; d < rank; d++) {
unsigned lbPos;
Optional<int64_t> diff = cst.getConstantBoundDifference(d, &lbPos);
if (!diff.hasValue())
return false;

int diffConstant = std::max(0L, diff.getValue());
shape->push_back(diffConstant);
}
return true;
}

/// Computes the memory region accessed by this memref with the region
/// represented as constraints symbolic/parameteric in 'loopDepth' loops
/// surrounding opStmt. Returns false if this fails due to yet unimplemented
/// cases.
// For example, the memref region for this load operation at loopDepth = 1 will
// be as below:
//
// for %i = 0 to 32 {
//   for %ii = %i to (d0) -> (d0 + 8) (%i) {
//     load %A[%ii]
//   }
// }
//
// region: {memref = %A, write = false, {%i <= m0 <= %i + 7} }
// The last field is a 2-d FlatAffineConstraints symbolic in %i.
//
// TODO(bondhugula): extend this to any other memref dereferencing ops
// (dma_start, dma_wait).
bool mlir::getMemRefRegion(OperationStmt *opStmt, unsigned loopDepth,
MemRefRegion *region) {
OpPointer<LoadOp> loadOp;
OpPointer<StoreOp> storeOp;
unsigned rank;
SmallVector<MLValue *, 4> indices;

if ((loadOp = opStmt->dyn_cast<LoadOp>())) {
rank = loadOp->getMemRefType().getRank();
for (auto *index : loadOp->getIndices()) {
indices.push_back(cast<MLValue>(index));
}
region->memref = cast<MLValue>(loadOp->getMemRef());
region->setWrite(false);
} else if ((storeOp = opStmt->dyn_cast<StoreOp>())) {
rank = storeOp->getMemRefType().getRank();
for (auto *index : storeOp->getIndices()) {
indices.push_back(cast<MLValue>(index));
}
region->memref = cast<MLValue>(storeOp->getMemRef());
region->setWrite(true);
} else {
return false;
}

// Build the constraints for this region.
FlatAffineConstraints *regionCst = region->getConstraints();

unsigned rank = loadOp->getMemRefType().getRank();
MLFuncBuilder b(opStmt);
auto idMap = b.getMultiDimIdentityMap(rank);

SmallVector<MLValue *, 4> indices;
for (auto *index : loadOp->getIndices()) {
indices.push_back(cast<MLValue>(index));
}
// Initialize 'accessValueMap' and compose with reachable AffineApplyOps.
AffineValueMap accessValueMap(idMap, indices);
forwardSubstituteReachableOps(&accessValueMap);
AffineMap accessMap = accessValueMap.getAffineMap();

// Initialize 'accessMap' and compose with reachable AffineApplyOps.
AffineValueMap accessMap(idMap, indices);
forwardSubstituteReachableOps(&accessMap);
AffineMap srcMap = accessMap.getAffineMap();

region->reset(srcMap.getNumDims(), srcMap.getNumSymbols());
regionCst->reset(accessMap.getNumDims(), accessMap.getNumSymbols(), 0,
accessValueMap.getOperands());

// Add equality constraints.
AffineMap map = accessMap.getAffineMap();
unsigned numDims = map.getNumDims();
unsigned numSymbols = map.getNumSymbols();
// Add inEqualties for loop lower/upper bounds.
unsigned numDims = accessMap.getNumDims();
unsigned numSymbols = accessMap.getNumSymbols();
// Add inequalties for loop lower/upper bounds.
for (unsigned i = 0; i < numDims + numSymbols; ++i) {
if (auto *loop = dyn_cast<ForStmt>(accessMap.getOperand(i))) {
if (!loop->hasConstantBounds())
if (auto *loop = dyn_cast<ForStmt>(accessValueMap.getOperand(i))) {
// Note that regionCst can now have more dimensions than accessMap if the
// bounds expressions involve outer loops or other symbols.
if (!regionCst->addBoundsFromForStmt(i, loop))
return false;
// Add lower bound and upper bounds.
region->addConstantLowerBound(i, loop->getConstantLowerBound());
region->addConstantUpperBound(i, loop->getConstantUpperBound() - 1);
} else {
// Has to be a valid symbol.
auto *symbol = cast<MLValue>(accessMap.getOperand(i));
auto *symbol = cast<MLValue>(accessValueMap.getOperand(i));
assert(symbol->isValidSymbol());
// Check if the symbols is a constant.
if (auto *opStmt = symbol->getDefiningStmt()) {
if (auto constOp = opStmt->dyn_cast<ConstantIndexOp>()) {
region->setIdToConstant(i, constOp->getValue());
regionCst->setIdToConstant(i, constOp->getValue());
}
}
}
}

// Add access function equalities to connect loop IVs to data dimensions.
region->composeMap(&accessMap);
regionCst->composeMap(&accessValueMap);

// Eliminate the loop IVs and any local variables to yield the memory region
// involving just the memref dimensions.
region->projectOut(srcMap.getNumResults(),
accessMap.getNumOperands() + region->getNumLocalIds());
assert(region->getNumDimIds() == rank);
// Eliminate the loop IVs and any local variables to yield the memory
// region involving just the memref dimensions and outer loop IVs up to
// loopDepth.
for (auto *operand : accessValueMap.getOperands()) {
regionCst->projectOut(operand);
}
regionCst->projectOut(regionCst->getNumDimIds() +
regionCst->getNumSymbolIds(),
regionCst->getNumLocalIds());

// Tighten the set.
regionCst->GCDTightenInequalities();

assert(regionCst->getNumDimIds() >= rank);
return true;
}
@@ -717,6 +717,7 @@ void DmaStartOp::print(OpAsmPrinter *p) const {
*p << " : " << getSrcMemRef()->getType();
*p << ", " << getDstMemRef()->getType();
*p << ", " << getTagMemRef()->getType();
p->printOptionalAttrDict(getAttrs());
}

// Parse DmaStartOp.

@@ -811,6 +812,7 @@ void DmaWaitOp::print(OpAsmPrinter *p) const {
*p << "], ";
p->printOperand(getNumElements());
*p << " : " << getTagMemRef()->getType();
p->printOptionalAttrDict(getAttrs());
}

// Parse DmaWaitOp.
@@ -30,193 +30,306 @@
#include "mlir/StandardOps/StandardOps.h"
#include "mlir/Transforms/Passes.h"
#include "mlir/Transforms/Utils.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"

#include <algorithm>

#define DEBUG_TYPE "dma-generate"

using namespace mlir;

static llvm::cl::opt<unsigned> clFastMemorySpace(
"dma-fast-memory-space", llvm::cl::Hidden,
llvm::cl::desc("Set fast memory space id for DMA generation"));

namespace {

// A region of memory in a lower memory space.
struct Region {
// Memref corresponding to the region.
MLValue *memref;
// Read or write.
bool isWrite;
// Region of memory accessed.
// TODO(bondhugula): Replace this to exploit HyperRectangularSet.
std::unique_ptr<FlatAffineConstraints> cst;
};

/// Generates DMAs for memref's living in 'lowMemorySpace' into newly created
/// buffers in 'highMemorySpace', and replaces memory operations to the former
/// Generates DMAs for memref's living in 'slowMemorySpace' into newly created
/// buffers in 'fastMemorySpace', and replaces memory operations to the former
/// by the latter. Only load op's handled for now.
/// TODO(bondhugula): extend this to store op's.
struct DmaGeneration : public FunctionPass, StmtWalker<DmaGeneration> {
explicit DmaGeneration(unsigned lowMemorySpace = 0,
unsigned highMemorySpace = 1,
explicit DmaGeneration(unsigned slowMemorySpace = 0,
unsigned fastMemorySpaceArg = 1,
int minDmaTransferSize = 1024)
: FunctionPass(&DmaGeneration::passID), lowMemorySpace(lowMemorySpace),
highMemorySpace(highMemorySpace),
minDmaTransferSize(minDmaTransferSize) {}
: FunctionPass(&DmaGeneration::passID), slowMemorySpace(slowMemorySpace),
minDmaTransferSize(minDmaTransferSize) {
if (clFastMemorySpace.getNumOccurrences() > 0) {
fastMemorySpace = clFastMemorySpace;
} else {
fastMemorySpace = fastMemorySpaceArg;
}
}

PassResult runOnMLFunction(MLFunction *f) override;
// Not applicable to CFG functions.
PassResult runOnCFGFunction(CFGFunction *f) override { return success(); }
bool runOnForStmt(ForStmt *forStmt);
PassResult runOnMLFunction(MLFunction *f) override;
void runOnForStmt(ForStmt *forStmt);

void visitOperationStmt(OperationStmt *opStmt);
void generateDma(const Region &region, Location loc, MLFuncBuilder *b);
bool generateDma(const MemRefRegion &region, ForStmt *forStmt);

// List of memory regions to promote.
std::vector<Region> regions;
// List of memory regions to DMA for.
std::vector<std::unique_ptr<MemRefRegion>> regions;

// Map from original memref's to the DMA buffers that their accesses are
// replaced with.
DenseMap<SSAValue *, SSAValue *> fastBufferMap;

// Slow memory space associated with DMAs.
const unsigned slowMemorySpace;
// Fast memory space associated with DMAs.
unsigned fastMemorySpace;
// Minimum DMA transfer size supported by the target in bytes.
const int minDmaTransferSize;

// The loop level at which DMAs should be generated. '0' is an outermost loop.
unsigned dmaDepth;

static char passID;
const unsigned lowMemorySpace;
const unsigned highMemorySpace;
const int minDmaTransferSize;
};

} // end anonymous namespace

char DmaGeneration::passID = 0;

/// Generates DMAs for memref's living in 'lowMemorySpace' into newly created
/// buffers in 'highMemorySpace', and replaces memory operations to the former
/// Generates DMAs for memref's living in 'slowMemorySpace' into newly created
/// buffers in 'fastMemorySpace', and replaces memory operations to the former
/// by the latter. Only load op's handled for now.
/// TODO(bondhugula): extend this to store op's.
FunctionPass *mlir::createDmaGenerationPass(unsigned lowMemorySpace,
unsigned highMemorySpace,
FunctionPass *mlir::createDmaGenerationPass(unsigned slowMemorySpace,
unsigned fastMemorySpace,
int minDmaTransferSize) {
return new DmaGeneration(lowMemorySpace, highMemorySpace, minDmaTransferSize);
return new DmaGeneration(slowMemorySpace, fastMemorySpace,
minDmaTransferSize);
}
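A small usage sketch of the factory above (the memory space ids here are illustrative; the constructor defaults are slow space 0, fast space 1, and a 1024-byte minimum transfer size, and -dma-fast-memory-space overrides the fast space from the command line):

  FunctionPass *pass = mlir::createDmaGenerationPass(/*slowMemorySpace=*/0,
                                                     /*fastMemorySpace=*/2,
                                                     /*minDmaTransferSize=*/1024);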
// Gather regions to promote to buffers in higher memory space.
|
||||
// Gather regions to promote to buffers in faster memory space.
|
||||
// TODO(bondhugula): handle store op's; only load's handled for now.
|
||||
void DmaGeneration::visitOperationStmt(OperationStmt *opStmt) {
|
||||
if (auto loadOp = opStmt->dyn_cast<LoadOp>()) {
|
||||
if (loadOp->getMemRefType().getMemorySpace() != lowMemorySpace)
|
||||
if (loadOp->getMemRefType().getMemorySpace() != slowMemorySpace)
|
||||
return;
|
||||
|
||||
// TODO(bondhugula): eventually, we need to be performing a union across all
|
||||
// regions for a given memref instead of creating one region per memory op.
|
||||
// This way we would be allocating O(num of memref's) sets instead of
|
||||
// O(num of load/store op's).
|
||||
auto memoryRegion = std::make_unique<FlatAffineConstraints>();
|
||||
if (!getMemoryRegion(opStmt, memoryRegion.get())) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "Error obtaining memory region");
|
||||
} else if (auto storeOp = opStmt->dyn_cast<StoreOp>()) {
|
||||
if (storeOp->getMemRefType().getMemorySpace() != slowMemorySpace)
|
||||
return;
|
||||
}
|
||||
LLVM_DEBUG(llvm::dbgs() << "Memory region");
|
||||
LLVM_DEBUG(memoryRegion->dump());
|
||||
|
||||
regions.push_back(
|
||||
{cast<MLValue>(loadOp->getMemRef()), false, std::move(memoryRegion)});
|
||||
} else {
|
||||
// Neither load nor a store op.
|
||||
return;
|
||||
}
|
||||
|
||||
// TODO(bondhugula): eventually, we need to be performing a union across all
|
||||
// regions for a given memref instead of creating one region per memory op.
|
||||
// This way we would be allocating O(num of memref's) sets instead of
|
||||
// O(num of load/store op's).
|
||||
auto region = std::make_unique<MemRefRegion>();
|
||||
if (!getMemRefRegion(opStmt, dmaDepth, region.get())) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "Error obtaining memory region\n");
|
||||
return;
|
||||
}
|
||||
LLVM_DEBUG(llvm::dbgs() << "Memory region:\n");
|
||||
LLVM_DEBUG(region->getConstraints()->dump());
|
||||
|
||||
regions.push_back(std::move(region));
|
||||
}
|
||||
|
||||
// Create a buffer in the higher (faster) memory space for the specified region;
|
||||
// generate a DMA from the lower memory space to this one, and replace all loads
|
||||
// to load from the buffer.
|
||||
// TODO: handle write regions by generating outgoing DMAs; only read regions are
|
||||
// handled for now.
|
||||
void DmaGeneration::generateDma(const Region ®ion, Location loc,
|
||||
MLFuncBuilder *b) {
|
||||
// Only memref read regions handled for now.
|
||||
if (region.isWrite)
|
||||
return;
|
||||
// Creates a buffer in the faster memory space for the specified region;
|
||||
// generates a DMA from the lower memory space to this one, and replaces all
|
||||
// loads to load from the buffer. Returns true if DMAs are generated.
|
||||
bool DmaGeneration::generateDma(const MemRefRegion ®ion, ForStmt *forStmt) {
|
||||
// DMAs for read regions are going to be inserted just before the for loop.
|
||||
MLFuncBuilder prologue(forStmt);
|
||||
// DMAs for write regions are going to be inserted just after the for loop.
|
||||
MLFuncBuilder epilogue(forStmt->getBlock(),
|
||||
std::next(StmtBlock::iterator(forStmt)));
|
||||
MLFuncBuilder *b = region.isWrite() ? &epilogue : &prologue;
|
||||
|
||||
// Builder to create constants at the top level.
|
||||
MLFuncBuilder top(forStmt->findFunction());
|
||||
|
||||
FlatAffineConstraints *cst =
|
||||
const_cast<FlatAffineConstraints *>(region.getConstraints());
|
||||
|
||||
auto loc = forStmt->getLoc();
|
||||
auto *memref = region.memref;
|
||||
auto memRefType = memref->getType().cast<MemRefType>();
|
||||
|
||||
// Indices to use for DmaStart op.
|
||||
SmallVector<SSAValue *, 4> srcIndices, destIndices;
|
||||
|
||||
SSAValue *zeroIndex = b->create<ConstantIndexOp>(loc, 0);
|
||||
SSAValue *zeroIndex = top.create<ConstantIndexOp>(loc, 0);
|
||||
|
||||
unsigned rank = memRefType.getRank();
|
||||
SmallVector<int, 4> shape;
|
||||
shape.reserve(rank);
|
||||
|
||||
// Compute the extents of the buffer.
|
||||
Optional<int64_t> numElements = region.getConstantSize();
|
||||
if (!numElements.hasValue()) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "Non-constant region size\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
if (numElements.getValue() == 0) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "Nothing to DMA\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
region.getConstantShape(&shape);
|
||||
|
||||
// Index start offsets for faster memory buffer relative to the original.
|
||||
SmallVector<int, 4> offsets;
|
||||
SmallVector<AffineExpr, 4> offsets;
|
||||
offsets.reserve(rank);
|
||||
|
||||
unsigned numElements = 1;
|
||||
for (unsigned d = 0; d < rank; d++) {
|
||||
auto lb = region.cst->getConstantLowerBound(d);
|
||||
auto ub = region.cst->getConstantUpperBound(d);
|
||||
unsigned lbPos;
|
||||
cst->getConstantBoundDifference(d, &lbPos);
|
||||
|
||||
if (!lb.hasValue() || !ub.hasValue()) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "Non-constant loop bounds");
|
||||
return;
|
||||
// Construct the index expressions for the fast memory buffer. The index
|
||||
// expression for a particular dimension of the fast buffer is obtained by
|
||||
// subtracting out the lower bound on the original memref's data region
|
||||
// along the corresponding dimension.
|
||||
AffineExpr offset = top.getAffineConstantExpr(0);
|
||||
for (unsigned j = rank; j < cst->getNumCols() - 1; j++) {
|
||||
offset = offset - cst->atIneq(lbPos, j) * top.getAffineDimExpr(j - rank);
|
||||
}
|
||||
offset = offset - cst->atIneq(lbPos, cst->getNumCols() - 1);
|
||||
offsets.push_back(offset);
|
||||
|
||||
offsets.push_back(lb.getValue());
|
||||
int dimSize = ub.getValue() - lb.getValue() + 1;
|
||||
if (dimSize <= 0)
|
||||
return;
|
||||
shape.push_back(dimSize);
|
||||
numElements *= dimSize;
|
||||
srcIndices.push_back(b->create<ConstantIndexOp>(loc, lb.getValue()));
|
||||
auto ids = cst->getIds();
|
||||
SmallVector<SSAValue *, 8> operands;
|
||||
for (unsigned i = rank, e = ids.size(); i < e; i++) {
|
||||
auto id = cst->getIds()[i];
|
||||
assert(id.hasValue());
|
||||
operands.push_back(id.getValue());
|
||||
}
|
||||
// Set DMA start location for this dimension in the lower memory space
|
||||
// memref.
|
||||
if (auto caf = offsets[d].dyn_cast<AffineConstantExpr>()) {
|
||||
srcIndices.push_back(cast<MLValue>(
|
||||
top.create<ConstantIndexOp>(loc, caf.getValue())->getResult()));
|
||||
} else {
|
||||
auto map =
|
||||
top.getAffineMap(cst->getNumDimIds() + cst->getNumSymbolIds() - rank,
|
||||
0, offsets[d], {});
|
||||
srcIndices.push_back(cast<MLValue>(
|
||||
b->create<AffineApplyOp>(loc, map, operands)->getResult(0)));
|
||||
}
|
||||
// The fast buffer is DMAed into at location zero; addressing is relative.
|
||||
destIndices.push_back(zeroIndex);
|
||||
}
|
||||
|
||||
// Create the faster memref buffer.
|
||||
auto fastMemRefType =
|
||||
b->getMemRefType(shape, memRefType.getElementType(), {}, highMemorySpace);
|
||||
SSAValue *fastMemRef;
|
||||
|
||||
auto fastMemRef = b->create<AllocOp>(loc, fastMemRefType)->getResult();
|
||||
// Check if a buffer was already created.
|
||||
// TODO(bondhugula): union across all memory op's per buffer. For now assuming
|
||||
// that multiple memory op's on the same memref have the *same* memory
|
||||
// footprint.
|
||||
if (fastBufferMap.find(memref) == fastBufferMap.end()) {
|
||||
auto fastMemRefType = top.getMemRefType(shape, memRefType.getElementType(),
|
||||
{}, fastMemorySpace);
|
||||
|
||||
LLVM_DEBUG(llvm::dbgs() << "Creating a new buffer of type: ");
|
||||
LLVM_DEBUG(fastMemRefType.dump(); llvm::dbgs() << "\n");
|
||||
|
||||
// Create the fast memory space buffer just before the 'for' statement.
|
||||
fastMemRef = prologue.create<AllocOp>(loc, fastMemRefType)->getResult();
|
||||
// Record it.
|
||||
fastBufferMap[memref] = fastMemRef;
|
||||
} else {
|
||||
// Reuse the one already created.
|
||||
fastMemRef = fastBufferMap[memref];
|
||||
}
// Create a tag (single element 1-d memref) for the DMA.
auto tagMemRefType = b->getMemRefType({1}, b->getIntegerType(32));
auto tagMemRef = b->create<AllocOp>(loc, tagMemRefType);
auto numElementsSSA = b->create<ConstantIndexOp>(loc, numElements);
auto tagMemRefType = top.getMemRefType({1}, top.getIntegerType(32));
auto tagMemRef = prologue.create<AllocOp>(loc, tagMemRefType);
auto numElementsSSA =
top.create<ConstantIndexOp>(loc, numElements.getValue());
// TODO(bondhugula): check for transfer sizes not being a multiple of
// minDmaTransferSize and handle them appropriately.
// TODO(bondhugula): Need to use strided DMA for multi-dimensional (>= 2-d)
// case.
b->create<DmaStartOp>(loc, memref, srcIndices, fastMemRef, destIndices,
numElementsSSA, tagMemRef, zeroIndex);
if (!region.isWrite()) {
b->create<DmaStartOp>(loc, memref, srcIndices, fastMemRef, destIndices,
numElementsSSA, tagMemRef, zeroIndex);
} else {
// dest and src is switched for the writes (since DMA is from the faster
// memory space to the slower one).
b->create<DmaStartOp>(loc, fastMemRef, destIndices, memref, srcIndices,
numElementsSSA, tagMemRef, zeroIndex);
}
// Matching DMA wait to block on completion; tag always has a 0 index.
b->create<DmaWaitOp>(loc, tagMemRef, zeroIndex, numElementsSSA);
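A small sketch of the direction choice above with the builder calls abstracted away; the struct and function here are illustrative only, not part of this commit.

#include <utility>
#include <vector>

// For a read region the transfer goes slow -> fast; for a write (outgoing)
// region the source and destination swap, while the element count and the
// completion tag stay the same.
struct TransferEnds {
  std::vector<int> srcIndices, dstIndices; // placeholder index type
};
TransferEnds orientTransfer(bool isWriteRegion, std::vector<int> slowIndices,
                            std::vector<int> fastIndices) {
  if (!isWriteRegion)
    return {std::move(slowIndices), std::move(fastIndices)}; // incoming DMA
  return {std::move(fastIndices), std::move(slowIndices)};   // outgoing DMA
}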
// Replace all uses of the old memref with the promoted one while remapping
// Replace all uses of the old memref with the faster one while remapping
// access indices (subtracting out lower bound offsets for each dimension).
SmallVector<AffineExpr, 4> remapExprs;
remapExprs.reserve(rank);
for (unsigned i = 0; i < rank; i++) {
auto d0 = b->getAffineDimExpr(i);
remapExprs.push_back(d0 - offsets[i]);
auto dim = b->getAffineDimExpr(i);
remapExprs.push_back(dim - offsets[i]);
}
auto indexRemap = b->getAffineMap(rank, 0, remapExprs, {});
replaceAllMemRefUsesWith(memref, cast<MLValue>(fastMemRef), {}, indexRemap);
// *Only* those uses within the body of 'forStmt' are replaced.
replaceAllMemRefUsesWith(memref, cast<MLValue>(fastMemRef), {}, indexRemap,
&*forStmt->begin());
return true;
}
bool DmaGeneration::runOnForStmt(ForStmt *forStmt) {
walk(forStmt);
/// Returns the nesting depth of this statement, i.e., the number of loops
/// surrounding this statement.
// TODO(bondhugula): move this to utilities later.
static unsigned getNestingDepth(const Statement &stmt) {
const Statement *currStmt = &stmt;
unsigned depth = 0;
while ((currStmt = currStmt->getParentStmt())) {
if (isa<ForStmt>(currStmt))
depth++;
}
return depth;
}
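A standalone analogue of the parent walk above, illustrative only and using simplified stand-in types rather than MLIR's Statement classes.

// Count how many enclosing loop nodes surround a node: a statement inside a
// doubly nested loop gets depth 2, a top-level statement gets depth 0.
struct Node {
  Node *parent = nullptr;
  bool isLoop = false;
};
unsigned nestingDepth(const Node &node) {
  unsigned depth = 0;
  for (const Node *cur = node.parent; cur; cur = cur->parent)
    if (cur->isLoop)
      ++depth;
  return depth;
}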
MLFuncBuilder b(forStmt);
for (const auto &region : regions) {
generateDma(region, forStmt->getLoc(), &b);
// TODO(bondhugula): make this run on a StmtBlock instead of a 'for' stmt.
void DmaGeneration::runOnForStmt(ForStmt *forStmt) {
// For now (for testing purposes), we'll run this on the outermost among 'for'
// stmt's with unit stride, i.e., right at the top of the tile if tiling has
// been done. In the future, the DMA generation has to be done at a level
// where the generated data fits in a higher level of the memory hierarchy; so
// the pass has to be instantiated with additional information that we aren't
// provided with at the moment.
if (forStmt->getStep() != 1) {
if (auto *innerFor = dyn_cast<ForStmt>(&*forStmt->begin())) {
runOnForStmt(innerFor);
}
return;
}
// This function never leaves the IR in an invalid state.
return false;
// DMAs will be generated for this depth, i.e., for all data accessed by this
// loop.
dmaDepth = getNestingDepth(*forStmt);
regions.clear();
fastBufferMap.clear();
// Walk this 'for' statement to gather all memory regions.
walk(forStmt);
for (const auto &region : regions) {
generateDma(*region, forStmt);
}
}
PassResult DmaGeneration::runOnMLFunction(MLFunction *f) {
bool ret = false;
for (auto &stmt : *f) {
// Run on all 'for' statements for now.
if (auto *forStmt = dyn_cast<ForStmt>(&stmt)) {
ret = ret | runOnForStmt(forStmt);
runOnForStmt(forStmt);
}
}
return ret ? failure() : success();
// This function never leaves the IR in an invalid state.
return success();
}

static PassRegistration<DmaGeneration>
@@ -42,7 +42,7 @@ namespace {
struct LoopTiling : public FunctionPass {
LoopTiling() : FunctionPass(&LoopTiling::passID) {}
PassResult runOnMLFunction(MLFunction *f) override;
constexpr static unsigned kDefaultTileSize = 32;
constexpr static unsigned kDefaultTileSize = 4;
static char passID;
};
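The default tile size drops from 32 to 4 here; the existing loop-tiling test keeps its old expectations by passing the size explicitly on its RUN line (shown in the test update further below):

// RUN: mlir-opt %s -loop-tile -tile-size=32 | FileCheck %s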
@@ -117,7 +117,7 @@ static bool doubleBuffer(const MLValue *oldMemRef, ForStmt *forStmt) {
return true;
}
/// Returns false if this succeeds on at least one 'for' stmt.
/// Returns success if the IR is in a valid state.
PassResult PipelineDataTransfer::runOnMLFunction(MLFunction *f) {
// Do a post order walk so that inner loop DMAs are processed first. This is
// necessary since 'for' statements nested within would otherwise become
@@ -126,9 +126,9 @@ PassResult PipelineDataTransfer::runOnMLFunction(MLFunction *f) {
// epilogue).
forStmts.clear();
walkPostOrder(f);
bool ret = true;
bool ret = false;
for (auto *forStmt : forStmts) {
ret = ret & runOnForStmt(forStmt);
ret = ret | runOnForStmt(forStmt);
}
return ret ? failure() : success();
}
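A sketch of the accumulation pattern in the updated version, under the reading that a true return from any per-loop run marks the whole pass as failed; the types below are stand-ins, not this pass's own.

#include <vector>

enum class PassResult { Success, Failure };

template <typename Loop, typename RunFn>
PassResult runOnAllLoops(const std::vector<Loop *> &loops, RunFn runOnForStmt) {
  bool ret = false;
  for (Loop *loop : loops)
    ret = ret | runOnForStmt(loop); // any failing loop flips the result
  return ret ? PassResult::Failure : PassResult::Success;
}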
@@ -293,9 +293,16 @@ PassResult PipelineDataTransfer::runOnForStmt(ForStmt *forStmt) {
// Get delays stored in map.
std::vector<uint64_t> delays(forStmt->getStatements().size());
unsigned s = 0;
for (const auto &stmt : *forStmt) {
for (auto &stmt : *forStmt) {
assert(stmtDelayMap.find(&stmt) != stmtDelayMap.end());
delays[s++] = stmtDelayMap[&stmt];
LLVM_DEBUG(
// Tagging statements with delays for debugging purposes.
if (auto *opStmt = dyn_cast<OperationStmt>(&stmt)) {
MLFuncBuilder b(opStmt);
opStmt->setAttr(b.getIdentifier("delay"),
b.getIntegerAttr(delays[s - 1]));
});
}
if (!isStmtwiseShiftValid(*forStmt, delays)) {
@@ -24,6 +24,7 @@
#include "mlir/Analysis/AffineAnalysis.h"
#include "mlir/Analysis/AffineStructures.h"
#include "mlir/Analysis/Utils.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/Module.h"
#include "mlir/IR/StmtVisitor.h"
@@ -47,13 +48,15 @@ static bool isMemRefDereferencingOp(const Operation &op) {
/// old memref's indices to the new memref using the supplied affine map
/// and adding any additional indices. The new memref could be of a different
/// shape or rank, but of the same elemental type. Additional indices are added
/// at the start for now.
/// at the start. An optional argument 'domOpFilter' restricts the
/// replacement to only those operations that are dominated by the former.
// TODO(mlir-team): extend this for SSAValue / CFGFunctions. Can also be easily
// extended to add additional indices at any position.
bool mlir::replaceAllMemRefUsesWith(const MLValue *oldMemRef,
MLValue *newMemRef,
ArrayRef<MLValue *> extraIndices,
AffineMap indexRemap) {
AffineMap indexRemap,
const Statement *domStmtFilter) {
unsigned newMemRefRank = newMemRef->getType().cast<MemRefType>().getRank();
(void)newMemRefRank; // unused in opt mode
unsigned oldMemRefRank = oldMemRef->getType().cast<MemRefType>().getRank();
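The call site added in the DMA generation pass exercises the new parameter; it is repeated here with the operands annotated (no additional code beyond what the commit already contains).

// Only uses dominated by the first statement of 'forStmt' are rewritten;
// other uses of 'memref', e.g. the dma_start operands themselves, are kept.
replaceAllMemRefUsesWith(memref, cast<MLValue>(fastMemRef),
                         /*extraIndices=*/{}, indexRemap,
                         /*domStmtFilter=*/&*forStmt->begin());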
@@ -82,6 +85,11 @@ bool mlir::replaceAllMemRefUsesWith(const MLValue *oldMemRef,
for (auto it = oldMemRef->use_begin(); it != oldMemRef->use_end();) {
StmtOperand &use = *(it++);
auto *opStmt = cast<OperationStmt>(use.getOwner());
// Skip this use if it's not dominated by domStmtFilter.
if (domStmtFilter && !dominates(*domStmtFilter, *opStmt))
continue;
assert(isMemRefDereferencingOp(*opStmt) &&
"memref deferencing op expected");
@@ -172,7 +180,7 @@ OperationStmt *
mlir::createComposedAffineApplyOp(FuncBuilder *builder, Location loc,
ArrayRef<MLValue *> operands,
ArrayRef<OperationStmt *> affineApplyOps,
SmallVectorImpl<SSAValue *> &results) {
SmallVectorImpl<SSAValue *> *results) {
// Create identity map with same number of dimensions as number of operands.
auto map = builder->getMultiDimIdentityMap(operands.size());
// Initialize AffineValueMap with identity map.
@@ -194,9 +202,9 @@ mlir::createComposedAffineApplyOp(FuncBuilder *builder, Location loc,
// Create new AffineApplyOp based on 'valueMap'.
auto affineApplyOp =
builder->create<AffineApplyOp>(loc, valueMap.getAffineMap(), outOperands);
results.resize(operands.size());
results->resize(operands.size());
for (unsigned i = 0, e = operands.size(); i < e; ++i) {
results[i] = affineApplyOp->getResult(i);
(*results)[i] = affineApplyOp->getResult(i);
}
return cast<OperationStmt>(affineApplyOp->getOperation());
}
@@ -247,8 +255,8 @@ OperationStmt *mlir::createAffineComputationSlice(OperationStmt *opStmt) {
if (affineApplyOps.empty())
return nullptr;
// Check if all uses of the affine apply op's lie in this op stmt
// itself, in which case there would be nothing to do.
// Check if all uses of the affine apply op's lie only in this op stmt, in
// which case there would be nothing to do.
bool localized = true;
for (auto *op : affineApplyOps) {
for (auto *result : op->getResults()) {
@@ -266,7 +274,7 @@ OperationStmt *mlir::createAffineComputationSlice(OperationStmt *opStmt) {
FuncBuilder builder(opStmt);
SmallVector<SSAValue *, 4> results;
auto *affineApplyStmt = createComposedAffineApplyOp(
&builder, opStmt->getLoc(), subOperands, affineApplyOps, results);
&builder, opStmt->getLoc(), subOperands, affineApplyOps, &results);
assert(results.size() == subOperands.size() &&
"number of results should be the same as the number of subOperands");
@@ -1,42 +1,155 @@
// RUN: mlir-opt %s -dma-generate | FileCheck %s
// RUN: mlir-opt %s -dma-generate -canonicalize | FileCheck %s

// Index of the buffer for the second DMA is remapped.
// CHECK-DAG: [[MAP:#map[0-9]+]] = (d0) -> (d0 - 256)
// CHECK-DAG: #map{{[0-9]+}} = (d0, d1) -> (d0 * 16 + d1)
// CHECK-DAG: #map{{[0-9]+}} = (d0, d1) -> (d0, d1)

// CHECK-LABEL: mlfunc @loop_tiling() {
mlfunc @loop_tiling() {
// CHECK-LABEL: mlfunc @loop_nest_1d() {
mlfunc @loop_nest_1d() {
%A = alloc() : memref<256 x f32>
%B = alloc() : memref<512 x f32>
%F = alloc() : memref<128 x f32, 1>
%F = alloc() : memref<256 x f32, 1>
// First DMA buffer.
// CHECK: %3 = alloc() : memref<256xf32, 1>
// Tag for first DMA.
// CHECK: %4 = alloc() : memref<1xi32>
// First DMA transfer.
// CHECK: dma_start %3[%5], %3[%c0], %c256, %4[%c0] : memref<256xf32, 1>, memref<256xf32, 1>, memref<1xi32>
// CHECK: dma_start %0[%c0], %3[%c0], %c256, %4[%c0] : memref<256xf32>, memref<256xf32, 1>, memref<1xi32>
// CHECK: dma_wait %4[%c0], %c256 : memref<1xi32>
// Second DMA buffer.
// CHECK: %6 = alloc() : memref<256xf32, 1>
// CHECK: %5 = alloc() : memref<256xf32, 1>
// Tag for second DMA.
// CHECK: %7 = alloc() : memref<1xi32>
// CHECK: %6 = alloc() : memref<1xi32>
// Second DMA transfer.
// CHECK: dma_start %6[%8], %6[%c0_1], %c256_3, %7[%c0_1] : memref<256xf32, 1>, memref<256xf32, 1>, memref<1xi32>
// CHECK-NEXT: dma_wait %7[%c0_1], %c256_3 : memref<1xi32>
// CHECK: dma_start %1[%c256], %5[%c0], %c256, %6[%c0] : memref<512xf32>, memref<256xf32, 1>, memref<1xi32>
// CHECK-NEXT: dma_wait %6[%c0], %c256 : memref<1xi32>
// CHECK: for %i0 = 0 to 256 {
// CHECK: %7 = affine_apply #map{{[0-9]+}}(%i0)
// CHECK-NEXT: %8 = load %3[%7] : memref<256xf32, 1>
// CHECK: %9 = affine_apply #map{{[0-9]+}}(%i0)
// CHECK-NEXT: %10 = load %3[%9] : memref<256xf32, 1>
// CHECK: %11 = affine_apply #map{{[0-9]+}}(%i0)
// CHECK: %12 = affine_apply [[MAP]](%11)
// CHECK-NEXT: %13 = load %6[%12] : memref<256xf32, 1>
// CHECK: %10 = affine_apply [[MAP]](%9)
// CHECK-NEXT: %11 = load %5[%10] : memref<256xf32, 1>
// Already in faster memory space.
// CHECK: %14 = load %2[%i0] : memref<128xf32, 1>
// CHECK: %12 = load %2[%i0] : memref<256xf32, 1>
// CHECK-NEXT: }
// CHECK-NEXT: return
for %i = 0 to 256 {
load %A[%i] : memref<256 x f32>
%idx = affine_apply (d0) -> (d0 + 256)(%i)
load %B[%idx] : memref<512 x f32>
load %F[%i] : memref<128 x f32, 1>
load %F[%i] : memref<256 x f32, 1>
}
return
}
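A quick arithmetic check of the second buffer's size and remap (illustrative, not part of the test):

// The loop reads %B[%i + 256] for %i in [0, 255], so the accessed region of
// %B is [256, 511]: 256 elements (matching %c256), and the buffer index is
// (i + 256) - 256 == i, i.e. the (d0) -> (d0 - 256) map checked as [[MAP]].
constexpr int lb = 256, ub = 511;
static_assert(ub - lb + 1 == 256, "second DMA transfers 256 elements");
static_assert((0 + 256) - 256 == 0, "first access remaps to buffer index 0");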
// CHECK-LABEL: mlfunc @loop_nest_high_d
// CHECK: %c16384 = constant 16384 : index
// CHECK-NEXT: %0 = alloc() : memref<512x32xf32, 1>
// CHECK-NEXT: %1 = alloc() : memref<1xi32>
// INCOMING DMA for B
// CHECK-NEXT: dma_start %arg1[%c0, %c0], %0[%c0, %c0], %c16384, %1[%c0] : memref<512x32xf32>, memref<512x32xf32, 1>, memref<1xi32>
// CHECK-NEXT: dma_wait %1[%c0], %c16384 : memref<1xi32>
// CHECK-NEXT: %2 = alloc() : memref<512x32xf32, 1>
// CHECK-NEXT: %3 = alloc() : memref<1xi32>
// INCOMING DMA for A.
// CHECK-NEXT: dma_start %arg0[%c0, %c0], %2[%c0, %c0], %c16384, %3[%c0] : memref<512x32xf32>, memref<512x32xf32, 1>, memref<1xi32>
// CHECK-NEXT: dma_wait %3[%c0], %c16384 : memref<1xi32>
// CHECK-NEXT: %4 = alloc() : memref<512x32xf32, 1>
// CHECK-NEXT: %5 = alloc() : memref<1xi32>
// INCOMING DMA for C.
// CHECK-NEXT: dma_start %arg2[%c0, %c0], %4[%c0, %c0], %c16384, %5[%c0] : memref<512x32xf32>, memref<512x32xf32, 1>, memref<1xi32>
// CHECK-NEXT: dma_wait %5[%c0], %c16384 : memref<1xi32>
// CHECK-NEXT: %6 = alloc() : memref<1xi32>
// CHECK-NEXT: for %i0 = 0 to 32 {
// CHECK-NEXT: for %i1 = 0 to 32 {
// CHECK-NEXT: for %i2 = 0 to 32 {
// CHECK-NEXT: for %i3 = 0 to 16 {
// CHECK-NEXT: %7 = affine_apply #map{{[0-9]+}}(%i1, %i3)
// CHECK-NEXT: %8 = affine_apply #map{{[0-9]+}}(%7, %i0)
// CHECK-NEXT: %9 = load %0[%8#0, %8#1] : memref<512x32xf32, 1>
// CHECK-NEXT: "foo"(%9) : (f32) -> ()
// CHECK-NEXT: }
// CHECK-NEXT: for %i4 = 0 to 16 {
// CHECK-NEXT: %10 = affine_apply #map{{[0-9]+}}(%i2, %i4)
// CHECK-NEXT: %11 = affine_apply #map{{[0-9]+}}(%10, %i1)
// CHECK-NEXT: %12 = load %2[%11#0, %11#1] : memref<512x32xf32, 1>
// CHECK-NEXT: "bar"(%12) {mxu_id: 0} : (f32) -> ()
// CHECK-NEXT: }
// CHECK-NEXT: for %i5 = 0 to 16 {
// CHECK-NEXT: %13 = "abc_compute"() : () -> f32
// CHECK-NEXT: %14 = affine_apply #map{{[0-9]+}}(%i2, %i5)
// CHECK-NEXT: %15 = affine_apply #map{{[0-9]+}}(%14, %i0)
// CHECK-NEXT: %16 = load %4[%15#0, %15#1] : memref<512x32xf32, 1>
// CHECK-NEXT: %17 = "addf32"(%13, %16) : (f32, f32) -> f32
// CHECK-NEXT: %18 = affine_apply #map{{[0-9]+}}(%14, %i0)
// CHECK-NEXT: store %17, %4[%18#0, %18#1] : memref<512x32xf32, 1>
// CHECK-NEXT: }
// CHECK-NEXT: "foobar"() : () -> ()
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: }
// OUTGOING DMA for C.
// CHECK-NEXT: dma_start %4[%c0, %c0], %arg2[%c0, %c0], %c16384, %6[%c0] : memref<512x32xf32, 1>, memref<512x32xf32>, memref<1xi32>
// CHECK-NEXT: dma_wait %6[%c0], %c16384 : memref<1xi32>
// CHECK-NEXT: return
// CHECK-NEXT:}
mlfunc @loop_nest_high_d(%A: memref<512 x 32 x f32>,
%B: memref<512 x 32 x f32>, %C: memref<512 x 32 x f32>) {
// DMAs will be performed at this level (jT is the first loop without a stride).
// A and B are read, while C is both read and written. A total of three new buffers
// are allocated and existing load's/store's are replaced by accesses to those buffers.
for %jT = 0 to 32 {
for %kT = 0 to 32 {
for %iT = 0 to 32 {
for %kk = 0 to 16 { // k intratile
%k = affine_apply (d0, d1) -> (16*d0 + d1) (%kT, %kk)
%v0 = load %B[%k, %jT] : memref<512 x 32 x f32>
"foo"(%v0) : (f32) -> ()
}
for %ii = 0 to 16 { // i intratile.
%i = affine_apply (d0, d1) -> (16*d0 + d1)(%iT, %ii)
%v1 = load %A[%i, %kT] : memref<512 x 32 x f32>
"bar"(%v1) {mxu_id: 0} : (f32) -> ()
}
for %ii_ = 0 to 16 { // i intratile.
%v2 = "abc_compute"() : () -> f32
%i_ = affine_apply (d0, d1) -> (16*d0 + d1)(%iT, %ii_)
%v3 = load %C[%i_, %jT] : memref<512 x 32 x f32>
%v4 = "addf32"(%v2, %v3) : (f32, f32) -> (f32)
store %v4, %C[%i_, %jT] : memref<512 x 32 x f32>
}
"foobar"() : () -> ()
}
}
}
return
}
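A quick element-count check for the DMAs above (illustrative, not part of the test):

// Each dma_start copies an entire 512x32 memref, hence the %c16384 count.
static_assert(512 * 32 == 16384, "whole-memref DMA element count");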
// A loop nest with a modulo 2 access.
//
// CHECK-LABEL: mlfunc @loop_nest_modulo() {
// CHECK: %0 = alloc() : memref<256x8xf32>
// CHECK-NEXT: for %i0 = 0 to 32 step 4 {
// CHECK-NEXT: %1 = alloc() : memref<32x2xf32, 1>
// CHECK-NEXT: %2 = alloc() : memref<1xi32>
// CHECK-NEXT: dma_start %0[%c0, %c0], %1[%c0, %c0], %c64, %2[%c0] : memref<256x8xf32>, memref<32x2xf32, 1>, memref<1xi32>
// CHECK-NEXT: dma_wait %2[%c0], %c64 : memref<1xi32>
// CHECK-NEXT: for %i1 = 0 to 8 {
// ...
// ...
// CHECK: }
// CHECK-NEXT: }
// CHECK-NEXT: return
mlfunc @loop_nest_modulo() {
%A = alloc() : memref<256 x 8 x f32>
for %i = 0 to 32 step 4 {
// DMAs will be performed at this level (%j is the first unit stride loop)
for %j = 0 to 8 {
%idx = affine_apply (d0) -> (d0 mod 2) (%j)
// A buffer of size 32 x 2 will be allocated (original buffer was 256 x 8).
%v = load %A[%i, %idx] : memref<256 x 8 x f32>
}
}
return
}
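A quick check on the transfer size above (illustrative, not part of the test):

// The %j mod 2 access touches only two columns, and the 32x2 fast buffer is
// transferred with 32 * 2 == 64 elements, matching %c64 in the dma_start.
static_assert(32 * 2 == 64, "element count of the 32x2 fast buffer");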
@@ -1,4 +1,4 @@
// RUN: mlir-opt %s -loop-tile | FileCheck %s
// RUN: mlir-opt %s -loop-tile -tile-size=32 | FileCheck %s

// CHECK: #map0 = (d0) -> (d0 + 32)
// CHECK: #map1 = (d0) -> (d0 + 32, 50)