Updates to transformation/analysis passes/utilities. Update DMA generation pass
and getMemRefRegion() to work with specified loop depths; add support for
outgoing DMAs, store op's.

- add support for getMemRefRegion() to compute regions symbolic in the outer
  surrounding loops - hence support for DMAs symbolic in those outer loops.

- add DMA generation support for outgoing DMAs (store op's to the lower memory
  space); extend getMemRefRegion() to store op's. -memref-bound-check now works
  with store op's as well.
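
  A rough sketch of how the extended region computation can be driven for a
  store op (the 'opStmt' and the loop depth below are hypothetical; only the
  MemRefRegion/getMemRefRegion API names come from this change):

    // Region accessed by a store op, symbolic in the outermost surrounding
    // loop (loopDepth = 1); fails for yet unimplemented cases.
    MemRefRegion region;
    if (!getMemRefRegion(opStmt, /*loopDepth=*/1, &region))
      return;
    assert(region.isWrite());   // store op's yield write regions
    Optional<int64_t> size = region.getConstantSize();
    if (size.hasValue())
      llvm::dbgs() << "region size: " << size.getValue() << " elements\n";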

- fix dma-generate (references to the old memref in the dma_start op were also
  being replaced with the new buffer); we need replaceAllMemRefUsesWith to work
  on only a subset of the uses - so add a new optional argument to
  replaceAllMemRefUsesWith that serves as a filter: if provided, only those
  uses that are dominated by the filter statement are replaced.
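
  A minimal illustration of the filter argument, mirroring how the DMA
  generation pass in this change uses it ('oldMemRef', 'fastMemRef',
  'indexRemap', and 'forStmt' are placeholder names):

    // Rewrite only those uses of 'oldMemRef' dominated by the first statement
    // of the loop body, i.e., uses inside 'forStmt'; the dma_start op that
    // still refers to the original memref is left untouched.
    replaceAllMemRefUsesWith(oldMemRef, fastMemRef, /*extraIndices=*/{},
                             indexRemap, /*domStmtFilter=*/&*forStmt->begin());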

- Add missing printing of attributes for dma_start and dma_wait op's.

- update the FlatAffineConstraints API
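
  A rough, hypothetical sketch combining the new id-aware methods ('forStmt'
  stands for an arbitrary ForStmt; only the member functions named here are
  from this change):

    // Tie dimension 0 to a loop IV, add its bounds, then query and eliminate
    // it by its MLValue rather than by column position.
    FlatAffineConstraints cst;
    cst.addDimId(0, forStmt);            // a ForStmt is also an MLValue
    cst.addBoundsFromForStmt(/*pos=*/0, forStmt);
    unsigned lbPos;
    Optional<int64_t> extent = cst.getConstantBoundDifference(0, &lbPos);
    if (extent.hasValue())
      llvm::dbgs() << "constant extent: " << extent.getValue() << "\n";
    cst.projectOut(forStmt);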

PiperOrigin-RevId: 221889223
Uday Bondhugula 2018-11-16 20:12:06 -08:00 committed by jpienaar
parent 6b52ac3aa6
commit fff1efbaf5
14 changed files with 882 additions and 229 deletions


@ -34,6 +34,7 @@ class AffineApplyOp;
class AffineBound;
class AffineCondition;
class AffineMap;
class ForStmt;
class IntegerSet;
class MLIRContext;
class MLValue;
@ -177,7 +178,6 @@ public:
ArrayRef<MLValue *> getOperands() const;
AffineMap getAffineMap() const;
private:
void forwardSubstitute(const AffineApplyOp &inputOp,
ArrayRef<bool> inputResultsToSubstitute);
@ -244,13 +244,19 @@ public:
FlatAffineConstraints(unsigned numReservedInequalities,
unsigned numReservedEqualities,
unsigned numReservedCols, unsigned numDims = 0,
unsigned numSymbols = 0, unsigned numLocals = 0)
unsigned numSymbols = 0, unsigned numLocals = 0,
ArrayRef<Optional<MLValue *>> idArgs = {})
: numReservedCols(numReservedCols), numDims(numDims),
numSymbols(numSymbols) {
assert(numReservedCols >= numDims + numSymbols + 1);
equalities.reserve(numReservedCols * numReservedEqualities);
inequalities.reserve(numReservedCols * numReservedInequalities);
numIds = numDims + numSymbols + numLocals;
ids.reserve(numReservedCols);
if (idArgs.empty())
ids.resize(numIds, None);
else
ids.insert(ids.end(), idArgs.begin(), idArgs.end());
}
/// Constructs a constraint system with the specified number of
@ -261,6 +267,7 @@ public:
numSymbols(numSymbols) {
assert(numReservedCols >= numDims + numSymbols + 1);
numIds = numDims + numSymbols + numLocals;
ids.resize(numIds, None);
}
explicit FlatAffineConstraints(const HyperRectangularSet &set);
@ -290,10 +297,10 @@ public:
// Clears any existing data and reserves memory for the specified constraints.
void reset(unsigned numReservedInequalities, unsigned numReservedEqualities,
unsigned numReservedCols, unsigned numDims, unsigned numSymbols,
unsigned numLocals = 0);
unsigned numLocals = 0, ArrayRef<MLValue *> idArgs = {});
void reset(unsigned numDims = 0, unsigned numSymbols = 0,
unsigned numLocals = 0);
unsigned numLocals = 0, ArrayRef<MLValue *> idArgs = {});
/// Appends constraints from 'other' into this. This is equivalent to an
/// intersection with no simplification of any sort attempted.
@ -396,6 +403,12 @@ public:
/// Adds a lower bound expression for the specified expression.
void addLowerBound(ArrayRef<int64_t> expr, ArrayRef<int64_t> lb);
/// Adds constraints (lower and upper bounds) from the ForStmt into the
/// FlatAffineConstraints. 'forStmt's' MLValue is used to look up the right
/// identifier, and if it doesn't exist, a new one is added. Returns false for
/// the yet unimplemented/unsupported cases.
bool addBoundsFromForStmt(unsigned pos, ForStmt *forStmt);
/// Adds an upper bound expression for the specified expression.
void addUpperBound(ArrayRef<int64_t> expr, ArrayRef<int64_t> ub);
@ -407,12 +420,17 @@ public:
/// Sets the identifier at the specified position to a constant.
void setIdToConstant(unsigned pos, int64_t val);
/// Looks up the identifier with the specified MLValue. Returns false if not
/// found.
bool findId(const MLValue &operand, unsigned *pos);
// Add identifiers of the specified kind - specified positions are relative to
// the kind of identifier.
void addDimId(unsigned pos);
// the kind of identifier. 'id' is the MLValue corresponding to the
// identifier that can optionally be provided.
void addDimId(unsigned pos, MLValue *id = nullptr);
void addSymbolId(unsigned pos);
void addLocalId(unsigned pos);
void addId(IdKind kind, unsigned pos);
void addId(IdKind kind, unsigned pos, MLValue *id = nullptr);
/// Composes the affine value map with this FlatAffineConstrains, adding the
/// results of the map as dimensions at the specified position and with the
@ -435,6 +453,9 @@ public:
// value to mark exactness for example.
void projectOut(unsigned pos, unsigned num);
/// Projects out the identifier that is associated with the specified MLValue.
void projectOut(MLValue *id);
void removeId(IdKind idKind, unsigned pos);
void removeId(unsigned pos);
@ -453,19 +474,30 @@ public:
return numIds - numDims - numSymbols;
}
inline ArrayRef<Optional<MLValue *>> getIds() const {
return {ids.data(), ids.size()};
}
/// Clears this list of constraints and copies other into it.
void clearAndCopyFrom(const FlatAffineConstraints &other);
/// Returns the constant lower bound of the specified identifier (through a
/// scan through the constraints); returns None if the bound isn't trivially a
/// constant.
Optional<int64_t> getConstantLowerBound(unsigned pos);
Optional<int64_t> getConstantLowerBound(unsigned pos) const;
/// Returns the constant upper bound of the specified identifier (through a
/// scan through the constraints); returns None if the bound isn't trivially a
/// constant. Note that the upper bound for FlatAffineConstraints is
/// inclusive.
Optional<int64_t> getConstantUpperBound(unsigned pos);
Optional<int64_t> getConstantUpperBound(unsigned pos) const;
/// Returns the extent (upper bound - lower bound) of the specified
/// identifier if it is found to be a constant; returns None if it's not a
/// constant. 'lbPosition' is set to the row position of the corresponding
/// lower bound.
Optional<int64_t> getConstantBoundDifference(unsigned pos,
unsigned *lbPosition) const;
// Returns the lower and upper bounds of the specified dimensions as
// AffineMap's. Returns false for the unimplemented cases for the moment.
@ -509,6 +541,12 @@ private:
/// Number of identifiers corresponding to symbols (unknown but constant for
/// analysis).
unsigned numSymbols;
/// MLValues corresponding to the (column) identifiers of this constraint
/// system appearing in the order the identifiers correspond to columns.
/// Temporary ones or those that aren't associated to any MLValue are to be
/// set to None.
SmallVector<Optional<MLValue *>, 8> ids;
};
} // end namespace mlir.


@ -25,9 +25,15 @@
#ifndef MLIR_ANALYSIS_UTILS_H
#define MLIR_ANALYSIS_UTILS_H
#include "mlir/Analysis/AffineStructures.h"
#include "mlir/Support/LLVM.h"
#include "llvm/ADT/SmallVector.h"
#include <memory>
namespace mlir {
class FlatAffineConstraints;
class MLValue;
class OperationStmt;
class Statement;
@ -37,8 +43,69 @@ bool dominates(const Statement &a, const Statement &b);
/// Returns true if statement 'a' properly dominates statement b.
bool properlyDominates(const Statement &a, const Statement &b);
/// Returns the memory region accessed by this memref.
bool getMemoryRegion(OperationStmt *opStmt, FlatAffineConstraints *region);
/// A region of a memref's data space; this is typically constructed by
/// analyzing load/store op's on this memref and the index space of loops
/// surrounding such op's.
// For example, the memref region for a load operation at loop depth = 1:
//
// for %i = 0 to 32 {
// for %ii = %i to (d0) -> (d0 + 8) (%i) {
// load %A[%ii]
// }
// }
//
// Region: {memref = %A, write = false, {%i <= m0 <= %i + 7} }
// The last field is a 2-d FlatAffineConstraints symbolic in %i.
//
struct MemRefRegion {
FlatAffineConstraints *getConstraints() { return &cst; }
const FlatAffineConstraints *getConstraints() const { return &cst; }
bool isWrite() const { return write; }
void setWrite(bool flag) { write = flag; }
// Computes the shape if the extents are known constants, returns false
// otherwise.
bool getConstantShape(llvm::SmallVectorImpl<int> *shape) const;
// Returns the number of elements in this region if it's a known constant. We
// use int64_t instead of uint64_t since index types can be at most int64_t.
Optional<int64_t> getConstantSize() const;
/// Memref that this region corresponds to.
MLValue *memref;
private:
/// Read or write.
bool write;
/// Region (data space) of the memref accessed. This set will thus have at
/// least as many dimensional identifiers as the shape dimensionality of the
/// memref, and these are the leading dimensions of the set appearing in that
/// order (major to minor / outermost to innermost). There may be additional
/// identifiers since getMemRefRegion() is called with a specific loop depth,
/// and thus the region is symbolic in the outer surrounding loops at that
/// depth.
// TODO(bondhugula): Replace this to exploit HyperRectangularSet.
FlatAffineConstraints cst;
};
/// Computes the memory region accessed by this memref with the region
/// represented as constraints symbolic/parametric in 'loopDepth' loops
/// surrounding opStmt. Returns false if this fails due to yet unimplemented
/// cases.
// For example, the memref region for this operation at loopDepth = 1 will be:
//
// for %i = 0 to 32 {
// for %ii = %i to (d0) -> (d0 + 8) (%i) {
// load %A[%ii]
// }
// }
//
// {memref = %A, write = false, {%i <= m0 <= %i + 7} }
// The last field is a 2-d FlatAffineConstraints symbolic in %i.
//
bool getMemRefRegion(OperationStmt *opStmt, unsigned loopDepth,
MemRefRegion *region);
} // end namespace mlir


@ -43,15 +43,17 @@ class SSAValue;
/// Replace all uses of oldMemRef with newMemRef while optionally remapping the
/// old memref's indices using the supplied affine map and adding any additional
/// indices. The new memref could be of a different shape or rank. Returns true
/// on success and false if the replacement is not possible (whenever a memref
/// is used as an operand in a non-deferencing scenario).
/// Additional indices are added at the start.
/// indices. The new memref could be of a different shape or rank. An optional
/// argument 'domOpFilter' restricts the replacement to only those operations
/// that are dominated by the former. Returns true on success and false if the
/// replacement is not possible (whenever a memref is used as an operand in a
/// non-dereferencing scenario). Additional indices are added at the start.
// TODO(mlir-team): extend this for SSAValue / CFGFunctions. Can also be easily
// extended to add additional indices at any position.
bool replaceAllMemRefUsesWith(const MLValue *oldMemRef, MLValue *newMemRef,
llvm::ArrayRef<MLValue *> extraIndices = {},
AffineMap indexRemap = AffineMap::Null());
AffineMap indexRemap = AffineMap::Null(),
const Statement *domStmtFilter = nullptr);
/// Creates and inserts into 'builder' a new AffineApplyOp, with the number of
/// its results equal to the number of operands, as a composition
@ -64,7 +66,7 @@ OperationStmt *
createComposedAffineApplyOp(FuncBuilder *builder, Location loc,
ArrayRef<MLValue *> operands,
ArrayRef<OperationStmt *> affineApplyOps,
SmallVectorImpl<SSAValue *> &results);
SmallVectorImpl<SSAValue *> *results);
/// Given an operation statement, inserts a new single affine apply operation,
/// that is exclusively used by this operation statement, and that provides all


@ -897,7 +897,7 @@ static void computeDirectionVector(
dependenceDomain->addDimId(j);
}
// Add equality contraints for each common loop, setting newly instroduced
// Add equality constraints for each common loop, setting the newly introduced
// variable at column 'j' to the 'dst' IV minus the 'src' IV.
SmallVector<int64_t, 4> eq;
eq.resize(dependenceDomain->getNumCols());


@ -26,6 +26,7 @@
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/IntegerSet.h"
#include "mlir/IR/MLValue.h"
#include "mlir/IR/Statements.h"
#include "mlir/Support/MathExtras.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/Support/Debug.h"
@ -480,6 +481,10 @@ FlatAffineConstraints::FlatAffineConstraints(
numSymbols = other.getNumSymbolIds();
numIds = other.getNumIds();
auto otherIds = other.getIds();
ids.reserve(numReservedCols);
ids.insert(ids.end(), otherIds.begin(), otherIds.end());
unsigned numReservedEqualities = other.getNumReservedEqualities();
unsigned numReservedInequalities = other.getNumReservedInequalities();
@ -506,6 +511,7 @@ FlatAffineConstraints::FlatAffineConstraints(IntegerSet set)
numSymbols(set.getNumSymbols()) {
equalities.reserve(set.getNumEqualities() * numReservedCols);
inequalities.reserve(set.getNumInequalities() * numReservedCols);
ids.resize(numIds, None);
for (unsigned i = 0, e = set.getNumConstraints(); i < e; ++i) {
AffineExpr expr = set.getConstraint(i);
@ -525,7 +531,8 @@ void FlatAffineConstraints::reset(unsigned numReservedInequalities,
unsigned numReservedEqualities,
unsigned newNumReservedCols,
unsigned newNumDims, unsigned newNumSymbols,
unsigned newNumLocals) {
unsigned newNumLocals,
ArrayRef<MLValue *> idArgs) {
assert(newNumReservedCols >= newNumDims + newNumSymbols + newNumLocals + 1 &&
"minimum 1 column");
numReservedCols = newNumReservedCols;
@ -538,12 +545,20 @@ void FlatAffineConstraints::reset(unsigned numReservedInequalities,
equalities.reserve(newNumReservedCols * numReservedEqualities);
if (numReservedInequalities >= 1)
inequalities.reserve(newNumReservedCols * numReservedInequalities);
ids.clear();
if (idArgs.empty()) {
ids.resize(numIds, None);
} else {
ids.reserve(idArgs.size());
ids.insert(ids.end(), idArgs.begin(), idArgs.end());
}
}
void FlatAffineConstraints::reset(unsigned newNumDims, unsigned newNumSymbols,
unsigned newNumLocals) {
unsigned newNumLocals,
ArrayRef<MLValue *> idArgs) {
reset(0, 0, newNumDims + newNumSymbols + newNumLocals + 1, newNumDims,
newNumSymbols, newNumLocals);
newNumSymbols, newNumLocals, idArgs);
}
void FlatAffineConstraints::append(const FlatAffineConstraints &other) {
@ -567,8 +582,8 @@ void FlatAffineConstraints::addLocalId(unsigned pos) {
addId(IdKind::Local, pos);
}
void FlatAffineConstraints::addDimId(unsigned pos) {
addId(IdKind::Dimension, pos);
void FlatAffineConstraints::addDimId(unsigned pos, MLValue *id) {
addId(IdKind::Dimension, pos, id);
}
void FlatAffineConstraints::addSymbolId(unsigned pos) {
@ -577,7 +592,7 @@ void FlatAffineConstraints::addSymbolId(unsigned pos) {
/// Adds a dimensional identifier. The added column is initialized to
/// zero.
void FlatAffineConstraints::addId(IdKind kind, unsigned pos) {
void FlatAffineConstraints::addId(IdKind kind, unsigned pos, MLValue *id) {
if (kind == IdKind::Dimension) {
assert(pos <= getNumDimIds());
} else if (kind == IdKind::Symbol) {
@ -595,16 +610,16 @@ void FlatAffineConstraints::addId(IdKind kind, unsigned pos) {
numReservedCols++;
}
unsigned elimPos;
unsigned absolutePos;
if (kind == IdKind::Dimension) {
elimPos = pos;
absolutePos = pos;
numDims++;
} else if (kind == IdKind::Symbol) {
elimPos = pos + getNumDimIds();
absolutePos = pos + getNumDimIds();
numSymbols++;
} else {
elimPos = pos + getNumDimIds() + getNumSymbolIds();
absolutePos = pos + getNumDimIds() + getNumSymbolIds();
}
numIds++;
@ -615,41 +630,53 @@ void FlatAffineConstraints::addId(IdKind kind, unsigned pos) {
int numCols = static_cast<int>(getNumCols());
for (int r = numInequalities - 1; r >= 0; r--) {
for (int c = numCols - 2; c >= 0; c--) {
if (c < elimPos)
if (c < absolutePos)
atIneq(r, c) = inequalities[r * oldNumReservedCols + c];
else
atIneq(r, c + 1) = inequalities[r * oldNumReservedCols + c];
}
atIneq(r, elimPos) = 0;
atIneq(r, absolutePos) = 0;
}
for (int r = numEqualities - 1; r >= 0; r--) {
for (int c = numCols - 2; c >= 0; c--) {
// All values in column elimPositions < elimPos have the same coordinates
// in the 2-d view of the coefficient buffer.
if (c < elimPos)
// All values in column absolutePositions < absolutePos have the same
// coordinates in the 2-d view of the coefficient buffer.
if (c < absolutePos)
atEq(r, c) = equalities[r * oldNumReservedCols + c];
else
// Those at elimPosition >= elimPos, get a shifted elimPosition.
// Those at absolutePosition >= absolutePos, get a shifted
// absolutePosition.
atEq(r, c + 1) = equalities[r * oldNumReservedCols + c];
}
// Initialize added dimension to zero.
atEq(r, elimPos) = 0;
atEq(r, absolutePos) = 0;
}
// If an 'id' is provided, insert it; otherwise use None.
if (id) {
ids.insert(ids.begin() + absolutePos, id);
} else {
ids.insert(ids.begin() + absolutePos, None);
}
assert(ids.size() == getNumIds());
}
// This routine may add additional local variables if the flattened
// expression corresponding to the map has such variables due to the presence of
// mod's, ceildiv's, and floordiv's.
void FlatAffineConstraints::composeMap(AffineValueMap *vMap, unsigned pos) {
assert(vMap->getNumOperands() == getNumIds() && "inconsistent map");
assert(vMap->getNumDims() == getNumDimIds() && "inconsistent map");
assert(pos <= getNumIds() && "invalid position");
assert(vMap->getNumSymbols() == getNumSymbolIds());
AffineMap map = vMap->getAffineMap();
// We add one equality for each result connecting the result dim of the map to
// the other identifiers.
// For eg: if the expression is 16*i0 + i1, and this is the r^th
// iteration/result of the value map, we are adding the equality:
// d_r - 16*i0 - i1 = 0. Hence, when flattening say (i0 + 1, i0 + 8*i2), we
// add two equalities overall: d_0 - i0 - 1 == 0, d1 - i0 - 8*i2 == 0.
for (unsigned r = 0, e = map.getNumResults(); r < e; r++) {
// Add dimension.
addDimId(pos + r);
@ -660,44 +687,60 @@ void FlatAffineConstraints::composeMap(AffineValueMap *vMap, unsigned pos) {
map.getNumSymbols(), &eq, &cst);
(void)ret;
assert(ret && "unimplemented for semi-affine maps");
for (unsigned j = 0, e = eq.size(); j < e; j++) {
eq[j] = -eq[j];
}
// Make the value map and the flat affine cst dimensions compatible.
// A lot of this code will be refactored/cleaned up.
for (unsigned l = 0, e = cst.getNumLocalIds(); l < e; l++) {
addLocalId(getNumLocalIds());
addLocalId(0);
}
// TODO(andydavis,bondhugula,ntv): we need common code to merge
// dimensions/symbols.
assert(cst.getNumDimIds() <= getNumIds());
for (unsigned t = 0, e = getNumDimIds() - cst.getNumDimIds(); t < e; t++) {
for (unsigned t = 0, e = r + 1; t < e; t++) {
// TODO: Consider using a batched version to add a range of IDs.
cst.addDimId(0);
eq.insert(eq.begin(), 0);
}
// Set the ceofficient for this result to one.
eq[r] = 1;
// TODO(andydavis,bondhugula,ntv): we need common code to merge
// dimensions/symbols.
assert(cst.getNumSymbolIds() <= getNumSymbolIds());
for (unsigned t = 0, e = getNumSymbolIds() - cst.getNumSymbolIds(); t < e;
t++) {
eq.insert(eq.begin() + cst.getNumSymbolIds(), 0);
cst.addSymbolId(cst.getNumSymbolIds());
assert(cst.getNumDimIds() <= getNumDimIds());
for (unsigned t = 0, e = getNumDimIds() - cst.getNumDimIds(); t < e; t++) {
cst.addDimId(cst.getNumDimIds() - 1);
}
// TODO(andydavis,bondhugula,ntv): we need common code to merge
// identifiers. All of this will be cleaned up. At this point, it's fine as
// long as it stays *inside* the FlatAffineConstraints API methods.
assert(cst.getNumSymbolIds() <= getNumSymbolIds());
assert(cst.getNumLocalIds() <= getNumLocalIds());
for (unsigned t = 0, e = getNumLocalIds() - cst.getNumLocalIds(); t < e;
t++) {
eq.insert(eq.begin() + cst.getNumDimIds() + cst.getNumSymbolIds(), 0);
cst.addLocalId(0);
cst.addLocalId(cst.getNumLocalIds());
}
/// Finally, append cst to this constraint set.
append(cst);
// eqToAdd is the equality corresponding to the flattened affine expression.
SmallVector<int64_t, 8> eqToAdd(getNumCols(), 0);
// Set the coefficient for this result to one.
eqToAdd[r] = 1;
// Dims and symbols.
for (unsigned i = 0, e = vMap->getNumOperands(); i < e; i++) {
unsigned loc;
bool ret = findId(*cast<MLValue>(vMap->getOperand(i)), &loc);
assert(ret && "id expected, but not found");
(void)ret;
// We need to negate 'eq' since the newly added dimension is going to be
// set to this one.
eqToAdd[loc] = -eq[i];
}
// Local vars common to eq and cst are at the beginning.
int j = getNumDimIds() + getNumSymbolIds();
int end = eq.size() - 1;
for (int i = vMap->getNumOperands(); i < end; i++, j++) {
eqToAdd[j] = -eq[i];
}
// Constant term.
eqToAdd[getNumCols() - 1] = -eq[eq.size() - 1];
// Add the equality connecting the result of the map to this constraint set.
addEquality(eq);
addEquality(eqToAdd);
}
}
@ -858,6 +901,7 @@ void FlatAffineConstraints::removeColumnRange(unsigned colStart,
numDims -= numDimsEliminated;
numSymbols -= numSymbolsEliminated;
numIds = numIds - numColsEliminated;
ids.erase(ids.begin() + colStart, ids.begin() + colLimit);
// No resize necessary. numReservedCols remains the same.
}
@ -1071,6 +1115,90 @@ void FlatAffineConstraints::addUpperBound(ArrayRef<int64_t> expr,
}
}
bool FlatAffineConstraints::findId(const MLValue &operand, unsigned *pos) {
unsigned i = 0;
for (const auto &mayBeId : ids) {
if (mayBeId.hasValue() && mayBeId.getValue() == &operand) {
*pos = i;
return true;
}
i++;
}
return false;
}
// TODO(andydavis, bondhugula) AFFINE REFACTOR: merge with loop bounds
// code in dependence analysis.
bool FlatAffineConstraints::addBoundsFromForStmt(unsigned pos,
ForStmt *forStmt) {
// Adds a lower or upper bound when the bounds aren't constant.
auto addLowerOrUpperBound = [&](bool lower) -> bool {
const auto &operands = lower ? forStmt->getLowerBoundOperands()
: forStmt->getUpperBoundOperands();
SmallVector<unsigned, 8> positions;
for (const auto &operand : operands) {
unsigned loc;
// TODO(andydavis, bondhugula) AFFINE REFACTOR: merge with loop bounds
// code in dependence analysis.
if (!findId(*operand, &loc)) {
addDimId(getNumDimIds(), operand);
loc = getNumDimIds() - 1;
}
positions.push_back(loc);
}
auto boundMap =
lower ? forStmt->getLowerBoundMap() : forStmt->getUpperBoundMap();
for (auto result : boundMap.getResults()) {
SmallVector<int64_t, 4> flattenedExpr;
SmallVector<int64_t, 4> ineq(getNumCols(), 0);
// TODO(andydavis, bondhugula) AFFINE REFACTOR: merge with loop bounds in
// dependence analysis.
FlatAffineConstraints cst;
if (!getFlattenedAffineExpr(result, boundMap.getNumDims(),
boundMap.getNumSymbols(), &flattenedExpr,
&cst)) {
LLVM_DEBUG(llvm::dbgs()
<< "semi-affine expressions not yet supported\n");
return false;
}
if (cst.getNumLocalIds() > 0) {
LLVM_DEBUG(
llvm::dbgs()
<< "loop bounds with mod/floordiv expr's not yet supported\n");
return false;
}
ineq[pos] = lower ? 1 : -1;
for (unsigned j = 0, e = boundMap.getNumInputs(); j < e; j++) {
ineq[positions[j]] = lower ? -flattenedExpr[j] : flattenedExpr[j];
}
// Constant term.
ineq[getNumCols() - 1] = lower ? -flattenedExpr[flattenedExpr.size() - 1]
: flattenedExpr[flattenedExpr.size() - 1];
addInequality(ineq);
}
return true;
};
if (forStmt->hasConstantLowerBound()) {
addConstantLowerBound(pos, forStmt->getConstantLowerBound());
} else {
// Non-constant lower bound case.
if (!addLowerOrUpperBound(/*lower=*/true))
return false;
}
if (forStmt->hasConstantUpperBound()) {
addConstantUpperBound(pos, forStmt->getConstantUpperBound() - 1);
return true;
}
// Non-constant upper bound case.
return addLowerOrUpperBound(/*lower=*/false);
}
/// Sets the specified identifier to a constant value.
void FlatAffineConstraints::setIdToConstant(unsigned pos, int64_t val) {
unsigned offset = equalities.size();
@ -1119,7 +1247,8 @@ bool FlatAffineConstraints::getDimensionBounds(unsigned pos, unsigned num,
return true;
}
Optional<int64_t> FlatAffineConstraints::getConstantLowerBound(unsigned pos) {
Optional<int64_t>
FlatAffineConstraints::getConstantLowerBound(unsigned pos) const {
assert(pos < getNumCols() - 1);
Optional<int64_t> lb = None;
for (unsigned r = 0; r < getNumInequalities(); r++) {
@ -1143,7 +1272,71 @@ Optional<int64_t> FlatAffineConstraints::getConstantLowerBound(unsigned pos) {
return lb;
}
Optional<int64_t> FlatAffineConstraints::getConstantUpperBound(unsigned pos) {
/// Returns the extent of the specified identifier (upper bound - lower bound)
/// if it is found to be a constant; returns None if it's not a constant.
/// 'lbPosition' is set to the row position of the corresponding lower bound.
Optional<int64_t>
FlatAffineConstraints::getConstantBoundDifference(unsigned pos,
unsigned *lbPosition) const {
// Check if the identifier appears at all in any of the inequalities.
unsigned r, e;
for (r = 0, e = getNumInequalities(); r < e; r++) {
if (atIneq(r, pos) != 0)
break;
}
if (r == e) {
// If it doesn't appear, just remove the column and return.
// TODO(andydavis,bondhugula): refactor removeColumns to use it from here.
return None;
}
// Positions of constraints that are lower/upper bounds on the variable.
SmallVector<unsigned, 4> lbIndices, ubIndices;
// Gather all lower bounds and upper bounds of the variable. Since the
// canonical form c_1*x_1 + c_2*x_2 + ... + c_0 >= 0, a constraint is a lower
// bound for x_i if c_i >= 1, and an upper bound if c_i <= -1.
for (unsigned r = 0, e = getNumInequalities(); r < e; r++) {
if (atIneq(r, pos) >= 1)
// Lower bound.
lbIndices.push_back(r);
else if (atIneq(r, pos) <= -1)
// Upper bound.
ubIndices.push_back(r);
}
// TODO(bondhugula): eliminate all variables that aren't part of any of the
// lower/upper bounds - to make this more powerful.
Optional<int64_t> minDiff = None;
for (auto ubPos : ubIndices) {
for (auto lbPos : lbIndices) {
// Look for a lower bound and an upper bound that only differ by a
// constant, i.e., pairs of the form 0 <= c_pos - f(c_i's) <= diffConst.
// For example, if ii is the pos^th variable, we are looking for
// constraints like ii >= i, ii <= ii + 50, 50 being the difference. The
// minimum among all such constant differences is kept since that's the
// constant bounding the extent of the pos^th variable.
unsigned j;
for (j = 0; j < getNumCols() - 1; j++)
if (atIneq(ubPos, j) != -atIneq(lbPos, j)) {
break;
}
if (j < getNumCols() - 1)
continue;
int64_t mayDiff =
atIneq(ubPos, getNumCols() - 1) + atIneq(lbPos, getNumCols() - 1) + 1;
if (minDiff == None || mayDiff < minDiff) {
minDiff = mayDiff;
*lbPosition = lbPos;
}
}
}
return minDiff;
}
Optional<int64_t>
FlatAffineConstraints::getConstantUpperBound(unsigned pos) const {
assert(pos < getNumCols() - 1);
Optional<int64_t> ub = None;
for (unsigned r = 0; r < getNumInequalities(); r++) {
@ -1196,8 +1389,17 @@ bool FlatAffineConstraints::isHyperRectangular(unsigned pos,
void FlatAffineConstraints::print(raw_ostream &os) const {
assert(inequalities.size() == getNumInequalities() * numReservedCols);
assert(equalities.size() == getNumEqualities() * numReservedCols);
assert(ids.size() == getNumIds());
os << "\nConstraints (" << getNumDimIds() << " dims, " << getNumSymbolIds()
<< " symbols, " << getNumLocalIds() << " locals): \n";
os << "(";
for (unsigned i = 0, e = getNumIds(); i < e; i++) {
if (ids[i] == None)
os << "None ";
else
os << "MLValue ";
}
os << ")\n";
for (unsigned i = 0, e = getNumEqualities(); i < e; ++i) {
for (unsigned j = 0; j < getNumCols(); ++j) {
os << atEq(i, j) << " ";
@ -1223,6 +1425,7 @@ void FlatAffineConstraints::clearAndCopyFrom(
const FlatAffineConstraints &other) {
FlatAffineConstraints copy(other);
std::swap(*this, copy);
assert(copy.getNumIds() == copy.getIds().size());
}
void FlatAffineConstraints::removeId(unsigned pos) {
@ -1245,6 +1448,7 @@ void FlatAffineConstraints::removeId(unsigned pos) {
atEq(r, c) = atEq(r, c + 1);
}
}
ids.erase(ids.begin() + pos);
}
static std::pair<unsigned, unsigned>
@ -1375,11 +1579,18 @@ void FlatAffineConstraints::FourierMotzkinEliminate(
unsigned newNumDims = dimsSymbols.first;
unsigned newNumSymbols = dimsSymbols.second;
SmallVector<Optional<MLValue *>, 8> newIds;
newIds.reserve(numIds - 1);
newIds.insert(newIds.end(), ids.begin(), ids.begin() + pos);
newIds.insert(newIds.end(), ids.begin() + pos + 1, ids.end());
/// Create the new system which has one identifier less.
FlatAffineConstraints newFac(
lbIndices.size() * ubIndices.size() + nbIndices.size(),
getNumEqualities(), getNumCols() - 1, newNumDims, newNumSymbols,
/*numLocals=*/getNumIds() - 1 - newNumDims - newNumSymbols);
/*numLocals=*/getNumIds() - 1 - newNumDims - newNumSymbols, newIds);
assert(newFac.getIds().size() == newFac.getNumIds());
// This will be used to check if the elimination was integer exact.
unsigned lcmProducts = 1;
@ -1462,9 +1673,19 @@ void FlatAffineConstraints::FourierMotzkinEliminate(
void FlatAffineConstraints::projectOut(unsigned pos, unsigned num) {
// 'pos' can be at most getNumCols() - 2.
if (num == 0)
return;
assert(pos <= getNumCols() - 2 && "invalid position");
assert(pos + num < getNumCols() && "invalid range");
for (unsigned i = 0; i < num; i++) {
FourierMotzkinEliminate(pos);
}
}
void FlatAffineConstraints::projectOut(MLValue *id) {
unsigned pos;
bool ret = findId(*id, &pos);
assert(ret);
(void)ret;
FourierMotzkinEliminate(pos);
}


@ -63,15 +63,15 @@ void MemRefBoundCheck::visitOperationStmt(OperationStmt *opStmt) {
// TODO(bondhugula): extend this to store's and other memref dereferencing
// op's.
if (auto loadOp = opStmt->dyn_cast<LoadOp>()) {
FlatAffineConstraints memoryRegion;
if (!getMemoryRegion(opStmt, &memoryRegion))
MemRefRegion region;
if (!getMemRefRegion(opStmt, /*loopDepth=*/0, &region))
return;
LLVM_DEBUG(llvm::dbgs() << "Memory region");
LLVM_DEBUG(memoryRegion.dump());
LLVM_DEBUG(region.getConstraints()->dump());
unsigned rank = loadOp->getMemRefType().getRank();
// For each dimension, check for out of bounds.
for (unsigned r = 0; r < rank; r++) {
FlatAffineConstraints ucst(memoryRegion);
FlatAffineConstraints ucst(*region.getConstraints());
// Intersect memory region with constraint capturing out of bounds,
// and check if the constraint system is feasible. If it is, there is at
// least one point out of bounds.
@ -91,7 +91,7 @@ void MemRefBoundCheck::visitOperationStmt(OperationStmt *opStmt) {
Twine(r + 1));
}
// Check for less than negative index.
FlatAffineConstraints lcst(memoryRegion);
FlatAffineConstraints lcst(*region.getConstraints());
std::fill(ineq.begin(), ineq.end(), 0);
// d_i <= -1;
lcst.addConstantUpperBound(r, -1);


@ -27,6 +27,9 @@
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/StandardOps/StandardOps.h"
#include "llvm/Support/Debug.h"
#define DEBUG_TYPE "analysis-utils"
using namespace mlir;
@ -65,62 +68,141 @@ bool mlir::dominates(const Statement &a, const Statement &b) {
return &a == &b || properlyDominates(a, b);
}
/// Returns the memory region accessed by this memref.
// TODO(bondhugula): extend this to store's and other memref dereferencing ops.
bool mlir::getMemoryRegion(OperationStmt *opStmt,
FlatAffineConstraints *region) {
OpPointer<LoadOp> loadOp;
if (!(loadOp = opStmt->dyn_cast<LoadOp>()))
return false;
Optional<int64_t> MemRefRegion::getConstantSize() const {
auto memRefType = memref->getType().cast<MemRefType>();
unsigned rank = memRefType.getRank();
// Compute the extents of the buffer.
int64_t numElements = 1;
for (unsigned d = 0; d < rank; d++) {
unsigned lbPos;
Optional<int64_t> diff = cst.getConstantBoundDifference(d, &lbPos);
if (!diff.hasValue())
return None;
int64_t diffConstant = diff.getValue();
if (diffConstant <= 0)
return 0;
numElements *= diffConstant;
}
return numElements;
}
bool MemRefRegion::getConstantShape(SmallVectorImpl<int> *shape) const {
auto memRefType = memref->getType().cast<MemRefType>();
unsigned rank = memRefType.getRank();
shape->reserve(rank);
// Compute the extents of this memref region.
for (unsigned d = 0; d < rank; d++) {
unsigned lbPos;
Optional<int64_t> diff = cst.getConstantBoundDifference(d, &lbPos);
if (!diff.hasValue())
return false;
int diffConstant = std::max(0L, diff.getValue());
shape->push_back(diffConstant);
}
return true;
}
/// Computes the memory region accessed by this memref with the region
/// represented as constraints symbolic/parametric in 'loopDepth' loops
/// surrounding opStmt. Returns false if this fails due to yet unimplemented
/// cases.
// For example, the memref region for this load operation at loopDepth = 1 will
// be as below:
//
// for %i = 0 to 32 {
// for %ii = %i to (d0) -> (d0 + 8) (%i) {
// load %A[%ii]
// }
// }
//
// region: {memref = %A, write = false, {%i <= m0 <= %i + 7} }
// The last field is a 2-d FlatAffineConstraints symbolic in %i.
//
// TODO(bondhugula): extend this to any other memref dereferencing ops
// (dma_start, dma_wait).
bool mlir::getMemRefRegion(OperationStmt *opStmt, unsigned loopDepth,
MemRefRegion *region) {
OpPointer<LoadOp> loadOp;
OpPointer<StoreOp> storeOp;
unsigned rank;
SmallVector<MLValue *, 4> indices;
if ((loadOp = opStmt->dyn_cast<LoadOp>())) {
rank = loadOp->getMemRefType().getRank();
for (auto *index : loadOp->getIndices()) {
indices.push_back(cast<MLValue>(index));
}
region->memref = cast<MLValue>(loadOp->getMemRef());
region->setWrite(false);
} else if ((storeOp = opStmt->dyn_cast<StoreOp>())) {
rank = storeOp->getMemRefType().getRank();
for (auto *index : storeOp->getIndices()) {
indices.push_back(cast<MLValue>(index));
}
region->memref = cast<MLValue>(storeOp->getMemRef());
region->setWrite(true);
} else {
return false;
}
// Build the constraints for this region.
FlatAffineConstraints *regionCst = region->getConstraints();
unsigned rank = loadOp->getMemRefType().getRank();
MLFuncBuilder b(opStmt);
auto idMap = b.getMultiDimIdentityMap(rank);
SmallVector<MLValue *, 4> indices;
for (auto *index : loadOp->getIndices()) {
indices.push_back(cast<MLValue>(index));
}
// Initialize 'accessValueMap' and compose with reachable AffineApplyOps.
AffineValueMap accessValueMap(idMap, indices);
forwardSubstituteReachableOps(&accessValueMap);
AffineMap accessMap = accessValueMap.getAffineMap();
// Initialize 'accessMap' and compose with reachable AffineApplyOps.
AffineValueMap accessMap(idMap, indices);
forwardSubstituteReachableOps(&accessMap);
AffineMap srcMap = accessMap.getAffineMap();
region->reset(srcMap.getNumDims(), srcMap.getNumSymbols());
regionCst->reset(accessMap.getNumDims(), accessMap.getNumSymbols(), 0,
accessValueMap.getOperands());
// Add equality constraints.
AffineMap map = accessMap.getAffineMap();
unsigned numDims = map.getNumDims();
unsigned numSymbols = map.getNumSymbols();
// Add inEqualties for loop lower/upper bounds.
unsigned numDims = accessMap.getNumDims();
unsigned numSymbols = accessMap.getNumSymbols();
// Add inequalities for loop lower/upper bounds.
for (unsigned i = 0; i < numDims + numSymbols; ++i) {
if (auto *loop = dyn_cast<ForStmt>(accessMap.getOperand(i))) {
if (!loop->hasConstantBounds())
if (auto *loop = dyn_cast<ForStmt>(accessValueMap.getOperand(i))) {
// Note that regionCst can now have more dimensions than accessMap if the
// bounds expressions involve outer loops or other symbols.
if (!regionCst->addBoundsFromForStmt(i, loop))
return false;
// Add lower bound and upper bounds.
region->addConstantLowerBound(i, loop->getConstantLowerBound());
region->addConstantUpperBound(i, loop->getConstantUpperBound() - 1);
} else {
// Has to be a valid symbol.
auto *symbol = cast<MLValue>(accessMap.getOperand(i));
auto *symbol = cast<MLValue>(accessValueMap.getOperand(i));
assert(symbol->isValidSymbol());
// Check if the symbols is a constant.
if (auto *opStmt = symbol->getDefiningStmt()) {
if (auto constOp = opStmt->dyn_cast<ConstantIndexOp>()) {
region->setIdToConstant(i, constOp->getValue());
regionCst->setIdToConstant(i, constOp->getValue());
}
}
}
}
// Add access function equalities to connect loop IVs to data dimensions.
region->composeMap(&accessMap);
regionCst->composeMap(&accessValueMap);
// Eliminate the loop IVs and any local variables to yield the memory region
// involving just the memref dimensions.
region->projectOut(srcMap.getNumResults(),
accessMap.getNumOperands() + region->getNumLocalIds());
assert(region->getNumDimIds() == rank);
// Eliminate the loop IVs and any local variables to yield the memory
// region involving just the memref dimensions and outer loop IVs up to
// loopDepth.
for (auto *operand : accessValueMap.getOperands()) {
regionCst->projectOut(operand);
}
regionCst->projectOut(regionCst->getNumDimIds() +
regionCst->getNumSymbolIds(),
regionCst->getNumLocalIds());
// Tighten the set.
regionCst->GCDTightenInequalities();
assert(regionCst->getNumDimIds() >= rank);
return true;
}


@ -717,6 +717,7 @@ void DmaStartOp::print(OpAsmPrinter *p) const {
*p << " : " << getSrcMemRef()->getType();
*p << ", " << getDstMemRef()->getType();
*p << ", " << getTagMemRef()->getType();
p->printOptionalAttrDict(getAttrs());
}
// Parse DmaStartOp.
@ -811,6 +812,7 @@ void DmaWaitOp::print(OpAsmPrinter *p) const {
*p << "], ";
p->printOperand(getNumElements());
*p << " : " << getTagMemRef()->getType();
p->printOptionalAttrDict(getAttrs());
}
// Parse DmaWaitOp.


@ -30,193 +30,306 @@
#include "mlir/StandardOps/StandardOps.h"
#include "mlir/Transforms/Passes.h"
#include "mlir/Transforms/Utils.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include <algorithm>
#define DEBUG_TYPE "dma-generate"
using namespace mlir;
static llvm::cl::opt<unsigned> clFastMemorySpace(
"dma-fast-memory-space", llvm::cl::Hidden,
llvm::cl::desc("Set fast memory space id for DMA generation"));
namespace {
// A region of memory in a lower memory space.
struct Region {
// Memref corresponding to the region.
MLValue *memref;
// Read or write.
bool isWrite;
// Region of memory accessed.
// TODO(bondhugula): Replace this to exploit HyperRectangularSet.
std::unique_ptr<FlatAffineConstraints> cst;
};
/// Generates DMAs for memref's living in 'lowMemorySpace' into newly created
/// buffers in 'highMemorySpace', and replaces memory operations to the former
/// Generates DMAs for memref's living in 'slowMemorySpace' into newly created
/// buffers in 'fastMemorySpace', and replaces memory operations to the former
/// by the latter. Only load op's handled for now.
/// TODO(bondhugula): extend this to store op's.
struct DmaGeneration : public FunctionPass, StmtWalker<DmaGeneration> {
explicit DmaGeneration(unsigned lowMemorySpace = 0,
unsigned highMemorySpace = 1,
explicit DmaGeneration(unsigned slowMemorySpace = 0,
unsigned fastMemorySpaceArg = 1,
int minDmaTransferSize = 1024)
: FunctionPass(&DmaGeneration::passID), lowMemorySpace(lowMemorySpace),
highMemorySpace(highMemorySpace),
minDmaTransferSize(minDmaTransferSize) {}
: FunctionPass(&DmaGeneration::passID), slowMemorySpace(slowMemorySpace),
minDmaTransferSize(minDmaTransferSize) {
if (clFastMemorySpace.getNumOccurrences() > 0) {
fastMemorySpace = clFastMemorySpace;
} else {
fastMemorySpace = fastMemorySpaceArg;
}
}
PassResult runOnMLFunction(MLFunction *f) override;
// Not applicable to CFG functions.
PassResult runOnCFGFunction(CFGFunction *f) override { return success(); }
bool runOnForStmt(ForStmt *forStmt);
PassResult runOnMLFunction(MLFunction *f) override;
void runOnForStmt(ForStmt *forStmt);
void visitOperationStmt(OperationStmt *opStmt);
void generateDma(const Region &region, Location loc, MLFuncBuilder *b);
bool generateDma(const MemRefRegion &region, ForStmt *forStmt);
// List of memory regions to promote.
std::vector<Region> regions;
// List of memory regions to DMA for.
std::vector<std::unique_ptr<MemRefRegion>> regions;
// Map from original memref's to the DMA buffers that their accesses are
// replaced with.
DenseMap<SSAValue *, SSAValue *> fastBufferMap;
// Slow memory space associated with DMAs.
const unsigned slowMemorySpace;
// Fast memory space associated with DMAs.
unsigned fastMemorySpace;
// Minimum DMA transfer size supported by the target in bytes.
const int minDmaTransferSize;
// The loop level at which DMAs should be generated. '0' is an outermost loop.
unsigned dmaDepth;
static char passID;
const unsigned lowMemorySpace;
const unsigned highMemorySpace;
const int minDmaTransferSize;
};
} // end anonymous namespace
char DmaGeneration::passID = 0;
/// Generates DMAs for memref's living in 'lowMemorySpace' into newly created
/// buffers in 'highMemorySpace', and replaces memory operations to the former
/// Generates DMAs for memref's living in 'slowMemorySpace' into newly created
/// buffers in 'fastMemorySpace', and replaces memory operations to the former
/// by the latter. Only load op's handled for now.
/// TODO(bondhugula): extend this to store op's.
FunctionPass *mlir::createDmaGenerationPass(unsigned lowMemorySpace,
unsigned highMemorySpace,
FunctionPass *mlir::createDmaGenerationPass(unsigned slowMemorySpace,
unsigned fastMemorySpace,
int minDmaTransferSize) {
return new DmaGeneration(lowMemorySpace, highMemorySpace, minDmaTransferSize);
return new DmaGeneration(slowMemorySpace, fastMemorySpace,
minDmaTransferSize);
}
// Gather regions to promote to buffers in higher memory space.
// Gather regions to promote to buffers in faster memory space.
// TODO(bondhugula): handle store op's; only load's handled for now.
void DmaGeneration::visitOperationStmt(OperationStmt *opStmt) {
if (auto loadOp = opStmt->dyn_cast<LoadOp>()) {
if (loadOp->getMemRefType().getMemorySpace() != lowMemorySpace)
if (loadOp->getMemRefType().getMemorySpace() != slowMemorySpace)
return;
// TODO(bondhugula): eventually, we need to be performing a union across all
// regions for a given memref instead of creating one region per memory op.
// This way we would be allocating O(num of memref's) sets instead of
// O(num of load/store op's).
auto memoryRegion = std::make_unique<FlatAffineConstraints>();
if (!getMemoryRegion(opStmt, memoryRegion.get())) {
LLVM_DEBUG(llvm::dbgs() << "Error obtaining memory region");
} else if (auto storeOp = opStmt->dyn_cast<StoreOp>()) {
if (storeOp->getMemRefType().getMemorySpace() != slowMemorySpace)
return;
}
LLVM_DEBUG(llvm::dbgs() << "Memory region");
LLVM_DEBUG(memoryRegion->dump());
regions.push_back(
{cast<MLValue>(loadOp->getMemRef()), false, std::move(memoryRegion)});
} else {
// Neither load nor a store op.
return;
}
// TODO(bondhugula): eventually, we need to be performing a union across all
// regions for a given memref instead of creating one region per memory op.
// This way we would be allocating O(num of memref's) sets instead of
// O(num of load/store op's).
auto region = std::make_unique<MemRefRegion>();
if (!getMemRefRegion(opStmt, dmaDepth, region.get())) {
LLVM_DEBUG(llvm::dbgs() << "Error obtaining memory region\n");
return;
}
LLVM_DEBUG(llvm::dbgs() << "Memory region:\n");
LLVM_DEBUG(region->getConstraints()->dump());
regions.push_back(std::move(region));
}
// Create a buffer in the higher (faster) memory space for the specified region;
// generate a DMA from the lower memory space to this one, and replace all loads
// to load from the buffer.
// TODO: handle write regions by generating outgoing DMAs; only read regions are
// handled for now.
void DmaGeneration::generateDma(const Region &region, Location loc,
MLFuncBuilder *b) {
// Only memref read regions handled for now.
if (region.isWrite)
return;
// Creates a buffer in the faster memory space for the specified region;
// generates a DMA from the lower memory space to this one, and replaces all
// loads to load from the buffer. Returns true if DMAs are generated.
bool DmaGeneration::generateDma(const MemRefRegion &region, ForStmt *forStmt) {
// DMAs for read regions are going to be inserted just before the for loop.
MLFuncBuilder prologue(forStmt);
// DMAs for write regions are going to be inserted just after the for loop.
MLFuncBuilder epilogue(forStmt->getBlock(),
std::next(StmtBlock::iterator(forStmt)));
MLFuncBuilder *b = region.isWrite() ? &epilogue : &prologue;
// Builder to create constants at the top level.
MLFuncBuilder top(forStmt->findFunction());
FlatAffineConstraints *cst =
const_cast<FlatAffineConstraints *>(region.getConstraints());
auto loc = forStmt->getLoc();
auto *memref = region.memref;
auto memRefType = memref->getType().cast<MemRefType>();
// Indices to use for DmaStart op.
SmallVector<SSAValue *, 4> srcIndices, destIndices;
SSAValue *zeroIndex = b->create<ConstantIndexOp>(loc, 0);
SSAValue *zeroIndex = top.create<ConstantIndexOp>(loc, 0);
unsigned rank = memRefType.getRank();
SmallVector<int, 4> shape;
shape.reserve(rank);
// Compute the extents of the buffer.
Optional<int64_t> numElements = region.getConstantSize();
if (!numElements.hasValue()) {
LLVM_DEBUG(llvm::dbgs() << "Non-constant region size\n");
return false;
}
if (numElements.getValue() == 0) {
LLVM_DEBUG(llvm::dbgs() << "Nothing to DMA\n");
return false;
}
region.getConstantShape(&shape);
// Index start offsets for faster memory buffer relative to the original.
SmallVector<int, 4> offsets;
SmallVector<AffineExpr, 4> offsets;
offsets.reserve(rank);
unsigned numElements = 1;
for (unsigned d = 0; d < rank; d++) {
auto lb = region.cst->getConstantLowerBound(d);
auto ub = region.cst->getConstantUpperBound(d);
unsigned lbPos;
cst->getConstantBoundDifference(d, &lbPos);
if (!lb.hasValue() || !ub.hasValue()) {
LLVM_DEBUG(llvm::dbgs() << "Non-constant loop bounds");
return;
// Construct the index expressions for the fast memory buffer. The index
// expression for a particular dimension of the fast buffer is obtained by
// subtracting out the lower bound on the original memref's data region
// along the corresponding dimension.
AffineExpr offset = top.getAffineConstantExpr(0);
for (unsigned j = rank; j < cst->getNumCols() - 1; j++) {
offset = offset - cst->atIneq(lbPos, j) * top.getAffineDimExpr(j - rank);
}
offset = offset - cst->atIneq(lbPos, cst->getNumCols() - 1);
offsets.push_back(offset);
offsets.push_back(lb.getValue());
int dimSize = ub.getValue() - lb.getValue() + 1;
if (dimSize <= 0)
return;
shape.push_back(dimSize);
numElements *= dimSize;
srcIndices.push_back(b->create<ConstantIndexOp>(loc, lb.getValue()));
auto ids = cst->getIds();
SmallVector<SSAValue *, 8> operands;
for (unsigned i = rank, e = ids.size(); i < e; i++) {
auto id = cst->getIds()[i];
assert(id.hasValue());
operands.push_back(id.getValue());
}
// Set DMA start location for this dimension in the lower memory space
// memref.
if (auto caf = offsets[d].dyn_cast<AffineConstantExpr>()) {
srcIndices.push_back(cast<MLValue>(
top.create<ConstantIndexOp>(loc, caf.getValue())->getResult()));
} else {
auto map =
top.getAffineMap(cst->getNumDimIds() + cst->getNumSymbolIds() - rank,
0, offsets[d], {});
srcIndices.push_back(cast<MLValue>(
b->create<AffineApplyOp>(loc, map, operands)->getResult(0)));
}
// The fast buffer is DMAed into at location zero; addressing is relative.
destIndices.push_back(zeroIndex);
}
// Create the faster memref buffer.
auto fastMemRefType =
b->getMemRefType(shape, memRefType.getElementType(), {}, highMemorySpace);
SSAValue *fastMemRef;
auto fastMemRef = b->create<AllocOp>(loc, fastMemRefType)->getResult();
// Check if a buffer was already created.
// TODO(bondhugula): union across all memory op's per buffer. For now assuming
// that multiple memory op's on the same memref have the *same* memory
// footprint.
if (fastBufferMap.find(memref) == fastBufferMap.end()) {
auto fastMemRefType = top.getMemRefType(shape, memRefType.getElementType(),
{}, fastMemorySpace);
LLVM_DEBUG(llvm::dbgs() << "Creating a new buffer of type: ");
LLVM_DEBUG(fastMemRefType.dump(); llvm::dbgs() << "\n");
// Create the fast memory space buffer just before the 'for' statement.
fastMemRef = prologue.create<AllocOp>(loc, fastMemRefType)->getResult();
// Record it.
fastBufferMap[memref] = fastMemRef;
} else {
// Reuse the one already created.
fastMemRef = fastBufferMap[memref];
}
// Create a tag (single element 1-d memref) for the DMA.
auto tagMemRefType = b->getMemRefType({1}, b->getIntegerType(32));
auto tagMemRef = b->create<AllocOp>(loc, tagMemRefType);
auto numElementsSSA = b->create<ConstantIndexOp>(loc, numElements);
auto tagMemRefType = top.getMemRefType({1}, top.getIntegerType(32));
auto tagMemRef = prologue.create<AllocOp>(loc, tagMemRefType);
auto numElementsSSA =
top.create<ConstantIndexOp>(loc, numElements.getValue());
// TODO(bondhugula): check for transfer sizes not being a multiple of
// minDmaTransferSize and handle them appropriately.
// TODO(bondhugula): Need to use strided DMA for multi-dimensional (>= 2-d)
// case.
b->create<DmaStartOp>(loc, memref, srcIndices, fastMemRef, destIndices,
numElementsSSA, tagMemRef, zeroIndex);
if (!region.isWrite()) {
b->create<DmaStartOp>(loc, memref, srcIndices, fastMemRef, destIndices,
numElementsSSA, tagMemRef, zeroIndex);
} else {
// dest and src are switched for the writes (since the DMA is from the faster
// memory space to the slower one).
b->create<DmaStartOp>(loc, fastMemRef, destIndices, memref, srcIndices,
numElementsSSA, tagMemRef, zeroIndex);
}
// Matching DMA wait to block on completion; tag always has a 0 index.
b->create<DmaWaitOp>(loc, tagMemRef, zeroIndex, numElementsSSA);
// Replace all uses of the old memref with the promoted one while remapping
// Replace all uses of the old memref with the faster one while remapping
// access indices (subtracting out lower bound offsets for each dimension).
SmallVector<AffineExpr, 4> remapExprs;
remapExprs.reserve(rank);
for (unsigned i = 0; i < rank; i++) {
auto d0 = b->getAffineDimExpr(i);
remapExprs.push_back(d0 - offsets[i]);
auto dim = b->getAffineDimExpr(i);
remapExprs.push_back(dim - offsets[i]);
}
auto indexRemap = b->getAffineMap(rank, 0, remapExprs, {});
replaceAllMemRefUsesWith(memref, cast<MLValue>(fastMemRef), {}, indexRemap);
// *Only* those uses within the body of 'forStmt' are replaced.
replaceAllMemRefUsesWith(memref, cast<MLValue>(fastMemRef), {}, indexRemap,
&*forStmt->begin());
return true;
}
bool DmaGeneration::runOnForStmt(ForStmt *forStmt) {
walk(forStmt);
/// Returns the nesting depth of this statement, i.e., the number of loops
/// surrounding this statement.
// TODO(bondhugula): move this to utilities later.
static unsigned getNestingDepth(const Statement &stmt) {
const Statement *currStmt = &stmt;
unsigned depth = 0;
while ((currStmt = currStmt->getParentStmt())) {
if (isa<ForStmt>(currStmt))
depth++;
}
return depth;
}
MLFuncBuilder b(forStmt);
for (const auto &region : regions) {
generateDma(region, forStmt->getLoc(), &b);
// TODO(bondhugula): make this run on a StmtBlock instead of a 'for' stmt.
void DmaGeneration::runOnForStmt(ForStmt *forStmt) {
// For now (for testing purposes), we'll run this on the outermost among 'for'
// stmt's with unit stride, i.e., right at the top of the tile if tiling has
// been done. In the future, the DMA generation has to be done at a level
// where the generated data fits in a higher level of the memory hierarchy; so
// the pass has to be instantiated with additional information that we aren't
// provided with at the moment.
if (forStmt->getStep() != 1) {
if (auto *innerFor = dyn_cast<ForStmt>(&*forStmt->begin())) {
runOnForStmt(innerFor);
}
return;
}
// This function never leaves the IR in an invalid state.
return false;
// DMAs will be generated for this depth, i.e., for all data accessed by this
// loop.
dmaDepth = getNestingDepth(*forStmt);
regions.clear();
fastBufferMap.clear();
// Walk this 'for' statement to gather all memory regions.
walk(forStmt);
for (const auto &region : regions) {
generateDma(*region, forStmt);
}
}
PassResult DmaGeneration::runOnMLFunction(MLFunction *f) {
bool ret = false;
for (auto &stmt : *f) {
// Run on all 'for' statements for now.
if (auto *forStmt = dyn_cast<ForStmt>(&stmt)) {
ret = ret | runOnForStmt(forStmt);
runOnForStmt(forStmt);
}
}
return ret ? failure() : success();
// This function never leaves the IR in an invalid state.
return success();
}
static PassRegistration<DmaGeneration>


@ -42,7 +42,7 @@ namespace {
struct LoopTiling : public FunctionPass {
LoopTiling() : FunctionPass(&LoopTiling::passID) {}
PassResult runOnMLFunction(MLFunction *f) override;
constexpr static unsigned kDefaultTileSize = 32;
constexpr static unsigned kDefaultTileSize = 4;
static char passID;
};


@ -117,7 +117,7 @@ static bool doubleBuffer(const MLValue *oldMemRef, ForStmt *forStmt) {
return true;
}
/// Returns false if this succeeds on at least one 'for' stmt.
/// Returns success if the IR is in a valid state.
PassResult PipelineDataTransfer::runOnMLFunction(MLFunction *f) {
// Do a post order walk so that inner loop DMAs are processed first. This is
// necessary since 'for' statements nested within would otherwise become
@ -126,9 +126,9 @@ PassResult PipelineDataTransfer::runOnMLFunction(MLFunction *f) {
// epilogue).
forStmts.clear();
walkPostOrder(f);
bool ret = true;
bool ret = false;
for (auto *forStmt : forStmts) {
ret = ret & runOnForStmt(forStmt);
ret = ret | runOnForStmt(forStmt);
}
return ret ? failure() : success();
}
@ -293,9 +293,16 @@ PassResult PipelineDataTransfer::runOnForStmt(ForStmt *forStmt) {
// Get delays stored in map.
std::vector<uint64_t> delays(forStmt->getStatements().size());
unsigned s = 0;
for (const auto &stmt : *forStmt) {
for (auto &stmt : *forStmt) {
assert(stmtDelayMap.find(&stmt) != stmtDelayMap.end());
delays[s++] = stmtDelayMap[&stmt];
LLVM_DEBUG(
// Tagging statements with delays for debugging purposes.
if (auto *opStmt = dyn_cast<OperationStmt>(&stmt)) {
MLFuncBuilder b(opStmt);
opStmt->setAttr(b.getIdentifier("delay"),
b.getIntegerAttr(delays[s - 1]));
});
}
if (!isStmtwiseShiftValid(*forStmt, delays)) {


@ -24,6 +24,7 @@
#include "mlir/Analysis/AffineAnalysis.h"
#include "mlir/Analysis/AffineStructures.h"
#include "mlir/Analysis/Utils.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/Module.h"
#include "mlir/IR/StmtVisitor.h"
@ -47,13 +48,15 @@ static bool isMemRefDereferencingOp(const Operation &op) {
/// old memref's indices to the new memref using the supplied affine map
/// and adding any additional indices. The new memref could be of a different
/// shape or rank, but of the same elemental type. Additional indices are added
/// at the start for now.
/// at the start. An optional argument 'domOpFilter' restricts the
/// replacement to only those operations that are dominated by the former.
// TODO(mlir-team): extend this for SSAValue / CFGFunctions. Can also be easily
// extended to add additional indices at any position.
bool mlir::replaceAllMemRefUsesWith(const MLValue *oldMemRef,
MLValue *newMemRef,
ArrayRef<MLValue *> extraIndices,
AffineMap indexRemap) {
AffineMap indexRemap,
const Statement *domStmtFilter) {
unsigned newMemRefRank = newMemRef->getType().cast<MemRefType>().getRank();
(void)newMemRefRank; // unused in opt mode
unsigned oldMemRefRank = oldMemRef->getType().cast<MemRefType>().getRank();
@ -82,6 +85,11 @@ bool mlir::replaceAllMemRefUsesWith(const MLValue *oldMemRef,
for (auto it = oldMemRef->use_begin(); it != oldMemRef->use_end();) {
StmtOperand &use = *(it++);
auto *opStmt = cast<OperationStmt>(use.getOwner());
// Skip this use if it's not dominated by domStmtFilter.
if (domStmtFilter && !dominates(*domStmtFilter, *opStmt))
continue;
assert(isMemRefDereferencingOp(*opStmt) &&
"memref deferencing op expected");
@ -172,7 +180,7 @@ OperationStmt *
mlir::createComposedAffineApplyOp(FuncBuilder *builder, Location loc,
ArrayRef<MLValue *> operands,
ArrayRef<OperationStmt *> affineApplyOps,
SmallVectorImpl<SSAValue *> &results) {
SmallVectorImpl<SSAValue *> *results) {
// Create identity map with same number of dimensions as number of operands.
auto map = builder->getMultiDimIdentityMap(operands.size());
// Initialize AffineValueMap with identity map.
@ -194,9 +202,9 @@ mlir::createComposedAffineApplyOp(FuncBuilder *builder, Location loc,
// Create new AffineApplyOp based on 'valueMap'.
auto affineApplyOp =
builder->create<AffineApplyOp>(loc, valueMap.getAffineMap(), outOperands);
results.resize(operands.size());
results->resize(operands.size());
for (unsigned i = 0, e = operands.size(); i < e; ++i) {
results[i] = affineApplyOp->getResult(i);
(*results)[i] = affineApplyOp->getResult(i);
}
return cast<OperationStmt>(affineApplyOp->getOperation());
}
@ -247,8 +255,8 @@ OperationStmt *mlir::createAffineComputationSlice(OperationStmt *opStmt) {
if (affineApplyOps.empty())
return nullptr;
// Check if all uses of the affine apply op's lie in this op stmt
// itself, in which case there would be nothing to do.
// Check if all uses of the affine apply op's lie only in this op stmt, in
// which case there would be nothing to do.
bool localized = true;
for (auto *op : affineApplyOps) {
for (auto *result : op->getResults()) {
@ -266,7 +274,7 @@ OperationStmt *mlir::createAffineComputationSlice(OperationStmt *opStmt) {
FuncBuilder builder(opStmt);
SmallVector<SSAValue *, 4> results;
auto *affineApplyStmt = createComposedAffineApplyOp(
&builder, opStmt->getLoc(), subOperands, affineApplyOps, results);
&builder, opStmt->getLoc(), subOperands, affineApplyOps, &results);
assert(results.size() == subOperands.size() &&
"number of results should be the same as the number of subOperands");


@ -1,42 +1,155 @@
// RUN: mlir-opt %s -dma-generate | FileCheck %s
// RUN: mlir-opt %s -dma-generate -canonicalize | FileCheck %s
// Index of the buffer for the second DMA is remapped.
// CHECK-DAG: [[MAP:#map[0-9]+]] = (d0) -> (d0 - 256)
// CHECK-DAG: #map{{[0-9]+}} = (d0, d1) -> (d0 * 16 + d1)
// CHECK-DAG: #map{{[0-9]+}} = (d0, d1) -> (d0, d1)
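// A quick check on [[MAP]]: the second load below uses
// %idx = affine_apply (d0) -> (d0 + 256)(%i) with 0 <= %i <= 255, so the region
// of %B that is read is [256, 511]. A 256-element fast buffer therefore
// suffices, and its index is the original index minus the region's lower
// bound: d0 - 256, which is exactly [[MAP]].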
// CHECK-LABEL: mlfunc @loop_tiling() {
mlfunc @loop_tiling() {
// CHECK-LABEL: mlfunc @loop_nest_1d() {
mlfunc @loop_nest_1d() {
%A = alloc() : memref<256 x f32>
%B = alloc() : memref<512 x f32>
%F = alloc() : memref<128 x f32, 1>
%F = alloc() : memref<256 x f32, 1>
// First DMA buffer.
// CHECK: %3 = alloc() : memref<256xf32, 1>
// Tag for first DMA.
// CHECK: %4 = alloc() : memref<1xi32>
// First DMA transfer.
// CHECK: dma_start %3[%5], %3[%c0], %c256, %4[%c0] : memref<256xf32, 1>, memref<256xf32, 1>, memref<1xi32>
// CHECK: dma_start %0[%c0], %3[%c0], %c256, %4[%c0] : memref<256xf32>, memref<256xf32, 1>, memref<1xi32>
// CHECK: dma_wait %4[%c0], %c256 : memref<1xi32>
// Second DMA buffer.
// CHECK: %6 = alloc() : memref<256xf32, 1>
// CHECK: %5 = alloc() : memref<256xf32, 1>
// Tag for second DMA.
// CHECK: %7 = alloc() : memref<1xi32>
// CHECK: %6 = alloc() : memref<1xi32>
// Second DMA transfer.
// CHECK: dma_start %6[%8], %6[%c0_1], %c256_3, %7[%c0_1] : memref<256xf32, 1>, memref<256xf32, 1>, memref<1xi32>
// CHECK-NEXT: dma_wait %7[%c0_1], %c256_3 : memref<1xi32>
// CHECK: dma_start %1[%c256], %5[%c0], %c256, %6[%c0] : memref<512xf32>, memref<256xf32, 1>, memref<1xi32>
// CHECK-NEXT: dma_wait %6[%c0], %c256 : memref<1xi32>
// CHECK: for %i0 = 0 to 256 {
// CHECK: %7 = affine_apply #map{{[0-9]+}}(%i0)
// CHECK-NEXT: %8 = load %3[%7] : memref<256xf32, 1>
// CHECK: %9 = affine_apply #map{{[0-9]+}}(%i0)
// CHECK-NEXT: %10 = load %3[%9] : memref<256xf32, 1>
// CHECK: %11 = affine_apply #map{{[0-9]+}}(%i0)
// CHECK: %12 = affine_apply [[MAP]](%11)
// CHECK-NEXT: %13 = load %6[%12] : memref<256xf32, 1>
// CHECK: %10 = affine_apply [[MAP]](%9)
// CHECK-NEXT: %11 = load %5[%10] : memref<256xf32, 1>
// Already in faster memory space.
// CHECK: %14 = load %2[%i0] : memref<128xf32, 1>
// CHECK: %12 = load %2[%i0] : memref<256xf32, 1>
// CHECK-NEXT: }
// CHECK-NEXT: return
for %i = 0 to 256 {
load %A[%i] : memref<256 x f32>
%idx = affine_apply (d0) -> (d0 + 256)(%i)
load %B[%idx] : memref<512 x f32>
load %F[%i] : memref<128 x f32, 1>
load %F[%i] : memref<256 x f32, 1>
}
return
}
// CHECK-LABEL: mlfunc @loop_nest_high_d
// CHECK: %c16384 = constant 16384 : index
// CHECK-NEXT: %0 = alloc() : memref<512x32xf32, 1>
// CHECK-NEXT: %1 = alloc() : memref<1xi32>
// INCOMING DMA for B
// CHECK-NEXT: dma_start %arg1[%c0, %c0], %0[%c0, %c0], %c16384, %1[%c0] : memref<512x32xf32>, memref<512x32xf32, 1>, memref<1xi32>
// CHECK-NEXT: dma_wait %1[%c0], %c16384 : memref<1xi32>
// CHECK-NEXT: %2 = alloc() : memref<512x32xf32, 1>
// CHECK-NEXT: %3 = alloc() : memref<1xi32>
// INCOMING DMA for A.
// CHECK-NEXT: dma_start %arg0[%c0, %c0], %2[%c0, %c0], %c16384, %3[%c0] : memref<512x32xf32>, memref<512x32xf32, 1>, memref<1xi32>
// CHECK-NEXT: dma_wait %3[%c0], %c16384 : memref<1xi32>
// CHECK-NEXT: %4 = alloc() : memref<512x32xf32, 1>
// CHECK-NEXT: %5 = alloc() : memref<1xi32>
// INCOMING DMA for C.
// CHECK-NEXT: dma_start %arg2[%c0, %c0], %4[%c0, %c0], %c16384, %5[%c0] : memref<512x32xf32>, memref<512x32xf32, 1>, memref<1xi32>
// CHECK-NEXT: dma_wait %5[%c0], %c16384 : memref<1xi32>
// CHECK-NEXT: %6 = alloc() : memref<1xi32>
// CHECK-NEXT: for %i0 = 0 to 32 {
// CHECK-NEXT: for %i1 = 0 to 32 {
// CHECK-NEXT: for %i2 = 0 to 32 {
// CHECK-NEXT: for %i3 = 0 to 16 {
// CHECK-NEXT: %7 = affine_apply #map{{[0-9]+}}(%i1, %i3)
// CHECK-NEXT: %8 = affine_apply #map{{[0-9]+}}(%7, %i0)
// CHECK-NEXT: %9 = load %0[%8#0, %8#1] : memref<512x32xf32, 1>
// CHECK-NEXT: "foo"(%9) : (f32) -> ()
// CHECK-NEXT: }
// CHECK-NEXT: for %i4 = 0 to 16 {
// CHECK-NEXT: %10 = affine_apply #map{{[0-9]+}}(%i2, %i4)
// CHECK-NEXT: %11 = affine_apply #map{{[0-9]+}}(%10, %i1)
// CHECK-NEXT: %12 = load %2[%11#0, %11#1] : memref<512x32xf32, 1>
// CHECK-NEXT: "bar"(%12) {mxu_id: 0} : (f32) -> ()
// CHECK-NEXT: }
// CHECK-NEXT: for %i5 = 0 to 16 {
// CHECK-NEXT: %13 = "abc_compute"() : () -> f32
// CHECK-NEXT: %14 = affine_apply #map{{[0-9]+}}(%i2, %i5)
// CHECK-NEXT: %15 = affine_apply #map{{[0-9]+}}(%14, %i0)
// CHECK-NEXT: %16 = load %4[%15#0, %15#1] : memref<512x32xf32, 1>
// CHECK-NEXT: %17 = "addf32"(%13, %16) : (f32, f32) -> f32
// CHECK-NEXT: %18 = affine_apply #map{{[0-9]+}}(%14, %i0)
// CHECK-NEXT: store %17, %4[%18#0, %18#1] : memref<512x32xf32, 1>
// CHECK-NEXT: }
// CHECK-NEXT: "foobar"() : () -> ()
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: }
// OUTGOING DMA for C.
// CHECK-NEXT: dma_start %4[%c0, %c0], %arg2[%c0, %c0], %c16384, %6[%c0] : memref<512x32xf32, 1>, memref<512x32xf32>, memref<1xi32>
// CHECK-NEXT: dma_wait %6[%c0], %c16384 : memref<1xi32>
// CHECK-NEXT: return
// CHECK-NEXT:}
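// A note on the constants and the outgoing transfer checked above: each memref
// is 512 x 32 x f32, so every dma_start moves 512 * 32 = 16384 elements
// (%c16384). A and B are only read and get incoming DMAs into %2 and %0; C is
// also written inside the nest (the store into %4), so an extra tag buffer (%6)
// and an outgoing dma_start/dma_wait pair copy the fast buffer back to %arg2
// once the loop nest finishes.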
mlfunc @loop_nest_high_d(%A: memref<512 x 32 x f32>,
%B: memref<512 x 32 x f32>, %C: memref<512 x 32 x f32>) {
// DMAs will be performed at this level (jT is the first loop without a stride).
// A and B are read, while C is both read and written. A total of three new buffers
// are allocated, and existing loads and stores are replaced by accesses to those buffers.
for %jT = 0 to 32 {
for %kT = 0 to 32 {
for %iT = 0 to 32 {
for %kk = 0 to 16 { // k intratile
%k = affine_apply (d0, d1) -> (16*d0 + d1) (%kT, %kk)
%v0 = load %B[%k, %jT] : memref<512 x 32 x f32>
"foo"(%v0) : (f32) -> ()
}
for %ii = 0 to 16 { // i intratile.
%i = affine_apply (d0, d1) -> (16*d0 + d1)(%iT, %ii)
%v1 = load %A[%i, %kT] : memref<512 x 32 x f32>
"bar"(%v1) {mxu_id: 0} : (f32) -> ()
}
for %ii_ = 0 to 16 { // i intratile.
%v2 = "abc_compute"() : () -> f32
%i_ = affine_apply (d0, d1) -> (16*d0 + d1)(%iT, %ii_)
%v3 = load %C[%i_, %jT] : memref<512 x 32 x f32>
%v4 = "addf32"(%v2, %v3) : (f32, f32) -> (f32)
store %v4, %C[%i_, %jT] : memref<512 x 32 x f32>
}
"foobar"() : () -> ()
}
}
}
return
}
// A loop nest with a modulo 2 access.
//
// CHECK-LABEL: mlfunc @loop_nest_modulo() {
// CHECK: %0 = alloc() : memref<256x8xf32>
// CHECK-NEXT: for %i0 = 0 to 32 step 4 {
// CHECK-NEXT: %1 = alloc() : memref<32x2xf32, 1>
// CHECK-NEXT: %2 = alloc() : memref<1xi32>
// CHECK-NEXT: dma_start %0[%c0, %c0], %1[%c0, %c0], %c64, %2[%c0] : memref<256x8xf32>, memref<32x2xf32, 1>, memref<1xi32>
// CHECK-NEXT: dma_wait %2[%c0], %c64 : memref<1xi32>
// CHECK-NEXT: for %i1 = 0 to 8 {
// ...
// ...
// CHECK: }
// CHECK-NEXT: }
// CHECK-NEXT: return
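// The buffer shape and DMA size checked above follow from the access pattern in
// the function below: the inner loop only ever reads %A[%i, %j mod 2], i.e. two
// distinct values in the second dimension, so the original 256 x 8 memref
// shrinks to a 32 x 2 fast buffer and each dma_start moves 32 * 2 = 64 elements
// (%c64).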
mlfunc @loop_nest_modulo() {
%A = alloc() : memref<256 x 8 x f32>
for %i = 0 to 32 step 4 {
// DMAs will be performed at this level (%j is the first unit stride loop)
for %j = 0 to 8 {
%idx = affine_apply (d0) -> (d0 mod 2) (%j)
// A buffer of size 32 x 2 will be allocated (original buffer was 256 x 8).
%v = load %A[%i, %idx] : memref<256 x 8 x f32>
}
}
return
}


@ -1,4 +1,4 @@
// RUN: mlir-opt %s -loop-tile | FileCheck %s
// RUN: mlir-opt %s -loop-tile -tile-size=32 | FileCheck %s
// CHECK: #map0 = (d0) -> (d0 + 32)
// CHECK: #map1 = (d0) -> (d0 + 32, 50)