forked from OSchip/llvm-project
LoopFusion: adds support for computing forward computation slices, which will enable fusion of consumer loop nests into their producers in subsequent CLs.
PiperOrigin-RevId: 253601994
parent a14eeacf2c
commit 898cf0e968
@@ -393,12 +393,12 @@ public:
                              bool lower = true);

   /// Computes the lower and upper bounds of the first 'num' dimensional
-  /// identifiers as an affine map of the remaining identifiers (dimensional and
-  /// symbolic). This method is able to detect identifiers as floordiv's
-  /// and mod's of affine expressions of other identifiers with respect to
-  /// (positive) constants. Sets bound map to a null AffineMap if such a bound
-  /// can't be found (or yet unimplemented).
-  void getSliceBounds(unsigned num, MLIRContext *context,
+  /// identifiers (starting at 'offset') as an affine map of the remaining
+  /// identifiers (dimensional and symbolic). This method is able to detect
+  /// identifiers as floordiv's and mod's of affine expressions of other
+  /// identifiers with respect to (positive) constants. Sets bound map to a
+  /// null AffineMap if such a bound can't be found (or yet unimplemented).
+  void getSliceBounds(unsigned offset, unsigned num, MLIRContext *context,
                       SmallVectorImpl<AffineMap> *lbMaps,
                       SmallVectorImpl<AffineMap> *ubMaps);

@@ -648,13 +648,14 @@ public:
   Optional<int64_t> getConstantUpperBound(unsigned pos) const;

   /// Gets the lower and upper bound of the pos^th identifier treating
-  /// [dimStartPos, symbStartPos) as dimensions and [symStartPos,
-  /// getNumDimAndSymbolIds) as symbols. The returned multi-dimensional maps
-  /// in the pair represent the max and min of potentially multiple affine
-  /// expressions. The upper bound is exclusive. 'localExprs' holds pre-computed
-  /// AffineExpr's for all local identifiers in the system.
+  /// [0, offset) U [offset + num, symbStartPos) as dimensions and
+  /// [symStartPos, getNumDimAndSymbolIds) as symbols. The returned
+  /// multi-dimensional maps in the pair represent the max and min of
+  /// potentially multiple affine expressions. The upper bound is exclusive.
+  /// 'localExprs' holds pre-computed AffineExpr's for all local identifiers in
+  /// the system.
   std::pair<AffineMap, AffineMap>
-  getLowerAndUpperBound(unsigned pos, unsigned dimStartPos,
+  getLowerAndUpperBound(unsigned pos, unsigned offset, unsigned num,
                         unsigned symStartPos, ArrayRef<AffineExpr> localExprs,
                         MLIRContext *context);

@@ -73,6 +73,8 @@ struct ComputationSliceState {
   std::vector<SmallVector<Value *, 4>> lbOperands;
   // List of upper bound operands (ubOperands[i] are used by 'ubs[i]').
   std::vector<SmallVector<Value *, 4>> ubOperands;
+  // Slice loop nest insertion point in target loop nest.
+  Block::iterator insertPoint;
   // Adds to 'cst' with constraints which represent the slice bounds on 'ivs'
   // in 'this'. Specifically, the values in 'ivs' are added to 'cst' as dim
   // identifiers and the values in 'lb/ubOperands' are added as symbols.
@@ -85,19 +87,67 @@ struct ComputationSliceState {
   void clearBounds();
 };

-/// Computes computation slice loop bounds for the loop nest surrounding
-/// 'srcAccess', where the returned loop bound AffineMaps are functions of
-/// loop IVs from the loop nest surrounding 'dstAccess'.
-LogicalResult getBackwardComputationSliceState(
-    const MemRefAccess &srcAccess, const MemRefAccess &dstAccess,
-    unsigned dstLoopDepth, ComputationSliceState *sliceState);
+/// Computes the computation slice loop bounds for one loop nest as affine maps
+/// of the other loop nest's IVs and symbols, using 'dependenceConstraints'
+/// computed between 'depSourceAccess' and 'depSinkAccess'.
+/// If 'isBackwardSlice' is true, a backwards slice is computed in which the
+/// slice bounds of loop nest surrounding 'depSourceAccess' are computed in
+/// terms of loop IVs and symbols of the loop nest surrounding 'depSinkAccess'
+/// at 'loopDepth'.
+/// If 'isBackwardSlice' is false, a forward slice is computed in which the
+/// slice bounds of loop nest surrounding 'depSinkAccess' are computed in terms
+/// of loop IVs and symbols of the loop nest surrounding 'depSourceAccess' at
+/// 'loopDepth'.
+/// The slice loop bounds and associated operands are returned in 'sliceState'.
+//
+// Backward slice example:
+//
+//    affine.for %i0 = 0 to 10 {
+//      store %cst, %0[%i0] : memref<100xf32>  // 'depSourceAccess'
+//    }
+//    affine.for %i1 = 0 to 10 {
+//      %v = load %0[%i1] : memref<100xf32>    // 'depSinkAccess'
+//    }
+//
+//    // Backward computation slice of loop nest '%i0'.
+//    affine.for %i0 = (d0) -> (d0)(%i1) to (d0) -> (d0 + 1)(%i1) {
+//      store %cst, %0[%i0] : memref<100xf32>  // 'depSourceAccess'
+//    }
+//
+// Forward slice example:
+//
+//    affine.for %i0 = 0 to 10 {
+//      store %cst, %0[%i0] : memref<100xf32>  // 'depSourceAccess'
+//    }
+//    affine.for %i1 = 0 to 10 {
+//      %v = load %0[%i1] : memref<100xf32>    // 'depSinkAccess'
+//    }
+//
+//    // Forward computation slice of loop nest '%i1'.
+//    affine.for %i1 = (d0) -> (d0)(%i0) to (d0) -> (d0 + 1)(%i0) {
+//      %v = load %0[%i1] : memref<100xf32>    // 'depSinkAccess'
+//    }
+//
+void getComputationSliceState(Operation *depSourceOp, Operation *depSinkOp,
+                              FlatAffineConstraints *dependenceConstraints,
+                              unsigned loopDepth, bool isBackwardSlice,
+                              ComputationSliceState *sliceState);

 /// Computes in 'sliceUnion' the union of all slice bounds computed at
-/// 'dstLoopDepth' between all pairs in 'srcOps' and 'dstOp' which access the
-/// same memref. Returns 'success' if union was computed, 'failure' otherwise.
-LogicalResult computeSliceUnion(ArrayRef<Operation *> srcOps,
-                                ArrayRef<Operation *> dstOps,
-                                unsigned dstLoopDepth,
+/// 'loopDepth' between all dependent pairs of ops in 'opsA' and 'opsB'.
+/// The parameter 'numCommonLoops' is the number of loops common to the
+/// operations in 'opsA' and 'opsB'.
+/// If 'isBackwardSlice' is true, computes slice bounds for loop nest
+/// surrounding ops in 'opsA', as a function of IVs and symbols of loop nest
+/// surrounding ops in 'opsB' at 'loopDepth'.
+/// If 'isBackwardSlice' is false, computes slice bounds for loop nest
+/// surrounding ops in 'opsB', as a function of IVs and symbols of loop nest
+/// surrounding ops in 'opsA' at 'loopDepth'.
+/// Returns 'success' if union was computed, 'failure' otherwise.
+// TODO(andydavis) Change this API to take 'forOpA'/'forOpB'.
+LogicalResult computeSliceUnion(ArrayRef<Operation *> opsA,
+                                ArrayRef<Operation *> opsB, unsigned loopDepth,
+                                unsigned numCommonLoops, bool isBackwardSlice,
                                 ComputationSliceState *sliceUnion);

 /// Creates a clone of the computation contained in the loop nest surrounding
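The half-open bound convention used in the examples above is easy to sanity-check in isolation. The sketch below is illustrative only (plain C++, not part of the patch; 'identitySliceBounds' is a made-up name): for the identity dependence %i0 == %i1, each iteration of one nest depends on exactly one iteration of the other, so the slice is [iv, iv + 1), matching the maps (d0) -> (d0) and (d0) -> (d0 + 1).

// Toy model of the slice-bound convention above: for the identity dependence,
// a slice at a fixed IV value 'iv' covers the half-open range [iv, iv + 1).
#include <cassert>
#include <cstdint>
#include <utility>

// Hypothetical helper, not an MLIR API: returns {lb, ub} with ub exclusive.
static std::pair<int64_t, int64_t> identitySliceBounds(int64_t iv) {
  return {iv, iv + 1};
}

int main() {
  for (int64_t iv = 0; iv < 10; ++iv) {
    auto [lb, ub] = identitySliceBounds(iv);
    assert(lb == iv && ub == iv + 1);
    assert(ub - lb == 1); // exactly one sliced iteration per IV value
  }
  return 0;
}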
@@ -1423,19 +1423,28 @@ void FlatAffineConstraints::removeRedundantInequalities() {
 }

 std::pair<AffineMap, AffineMap> FlatAffineConstraints::getLowerAndUpperBound(
-    unsigned pos, unsigned dimStartPos, unsigned symStartPos,
+    unsigned pos, unsigned offset, unsigned num, unsigned symStartPos,
     ArrayRef<AffineExpr> localExprs, MLIRContext *context) {
-  assert(pos < dimStartPos && "invalid dim start pos");
-  assert(symStartPos >= dimStartPos && "invalid sym start pos");
+  assert(pos + offset < getNumDimIds() && "invalid dim start pos");
+  assert(symStartPos >= (pos + offset) && "invalid sym start pos");
   assert(getNumLocalIds() == localExprs.size() &&
          "incorrect local exprs count");

   SmallVector<unsigned, 4> lbIndices, ubIndices;
-  getLowerAndUpperBoundIndices(*this, pos, &lbIndices, &ubIndices);
+  getLowerAndUpperBoundIndices(*this, pos + offset, &lbIndices, &ubIndices);
+
+  /// Add to 'b' from 'a' in set [0, offset) U [offset + num, symbStartPos).
+  auto addCoeffs = [&](ArrayRef<int64_t> a, SmallVectorImpl<int64_t> &b) {
+    b.clear();
+    for (unsigned i = 0, e = a.size(); i < e; ++i) {
+      if (i < offset || i >= offset + num)
+        b.push_back(a[i]);
+    }
+  };

   SmallVector<int64_t, 8> lb, ub;
   SmallVector<AffineExpr, 4> exprs;
-  unsigned dimCount = symStartPos - dimStartPos;
+  unsigned dimCount = symStartPos - num;
   unsigned symCount = getNumDimAndSymbolIds() - symStartPos;
   exprs.reserve(lbIndices.size());
   // Lower bound expressions.
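The 'addCoeffs' lambda above does the offset-aware part of the bound extraction: it keeps every coefficient of an inequality except those of the 'num' identifiers starting at 'offset', so the resulting bound is expressed purely in the remaining identifiers. A minimal standalone sketch of that filtering (plain C++, illustrative only; 'filterCoeffs' is not an MLIR API):

#include <cassert>
#include <cstdint>
#include <vector>

// Copy coefficients of 'a' into 'b', skipping positions [offset, offset + num),
// mirroring the addCoeffs lambda in getLowerAndUpperBound.
static void filterCoeffs(const std::vector<int64_t> &a, unsigned offset,
                         unsigned num, std::vector<int64_t> &b) {
  b.clear();
  for (unsigned i = 0, e = a.size(); i < e; ++i)
    if (i < offset || i >= offset + num)
      b.push_back(a[i]);
}

int main() {
  // One constraint row over 5 identifiers plus a constant term.
  std::vector<int64_t> row = {1, -2, 3, 0, 7, 42};
  std::vector<int64_t> kept;
  filterCoeffs(row, /*offset=*/1, /*num=*/2, kept);
  // Coefficients at positions 1 and 2 are dropped; everything else is kept.
  assert((kept == std::vector<int64_t>{1, 0, 7, 42}));
  return 0;
}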
@@ -1444,7 +1453,7 @@ std::pair<AffineMap, AffineMap> FlatAffineConstraints::getLowerAndUpperBound(
     // Extract the lower bound (in terms of other coeff's + const), i.e., if
     // i - j + 1 >= 0 is the constraint, 'pos' is for i the lower bound is j
     // - 1.
-    lb.assign(ineq.begin() + dimStartPos, ineq.end());
+    addCoeffs(ineq, lb);
     std::transform(lb.begin(), lb.end(), lb.begin(), std::negate<int64_t>());
     auto expr = mlir::toAffineExpr(lb, dimCount, symCount, localExprs, context);
     exprs.push_back(expr);
@@ -1458,7 +1467,7 @@ std::pair<AffineMap, AffineMap> FlatAffineConstraints::getLowerAndUpperBound(
   for (auto idx : ubIndices) {
     auto ineq = getInequality(idx);
     // Extract the upper bound (in terms of other coeff's + const).
-    ub.assign(ineq.begin() + dimStartPos, ineq.end());
+    addCoeffs(ineq, ub);
     auto expr = mlir::toAffineExpr(ub, dimCount, symCount, localExprs, context);
     // Upper bound is exclusive.
     exprs.push_back(expr + 1);
@@ -1470,10 +1479,12 @@ std::pair<AffineMap, AffineMap> FlatAffineConstraints::getLowerAndUpperBound(
 }

 /// Computes the lower and upper bounds of the first 'num' dimensional
-/// identifiers as affine maps of the remaining identifiers (dimensional and
-/// symbolic identifiers). Local identifiers are themselves explicitly computed
-/// as affine functions of other identifiers in this process if needed.
-void FlatAffineConstraints::getSliceBounds(unsigned num, MLIRContext *context,
+/// identifiers (starting at 'offset') as affine maps of the remaining
+/// identifiers (dimensional and symbolic identifiers). Local identifiers are
+/// themselves explicitly computed as affine functions of other identifiers in
+/// this process if needed.
+void FlatAffineConstraints::getSliceBounds(unsigned offset, unsigned num,
+                                           MLIRContext *context,
                                            SmallVectorImpl<AffineMap> *lbMaps,
                                            SmallVectorImpl<AffineMap> *ubMaps) {
   assert(num < getNumDimIds() && "invalid range");
@@ -1488,8 +1499,12 @@ void FlatAffineConstraints::getSliceBounds(unsigned num, MLIRContext *context,
   // Record computed/detected identifiers.
   SmallVector<AffineExpr, 8> memo(getNumIds());
   // Initialize dimensional and symbolic identifiers.
-  for (unsigned i = num, e = getNumDimIds(); i < e; i++)
-    memo[i] = getAffineDimExpr(i - num, context);
+  for (unsigned i = 0, e = getNumDimIds(); i < e; i++) {
+    if (i < offset)
+      memo[i] = getAffineDimExpr(i, context);
+    else if (i >= offset + num)
+      memo[i] = getAffineDimExpr(i - num, context);
+  }
   for (unsigned i = getNumDimIds(), e = getNumDimAndSymbolIds(); i < e; i++)
     memo[i] = getAffineSymbolExpr(i - getNumDimIds(), context);

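The initialization above fixes the dimension numbering of the returned bound maps: identifiers before 'offset' keep their position, identifiers at or beyond 'offset' + 'num' shift down by 'num', and the 'num' identifiers being bounded get no dimension of their own. A small standalone sketch of that renumbering (illustrative only, not MLIR code):

#include <cassert>

// Returns the map-dimension position of identifier 'i' when the 'num'
// identifiers starting at 'offset' are the ones being bounded, or -1 if 'i'
// lies inside that window and therefore has no dimension in the bound maps.
static int mapDimPosition(unsigned i, unsigned offset, unsigned num) {
  if (i < offset)
    return static_cast<int>(i);
  if (i >= offset + num)
    return static_cast<int>(i - num);
  return -1;
}

int main() {
  // Five dimensional identifiers, slicing the two starting at offset 1.
  assert(mapDimPosition(0, 1, 2) == 0);  // unchanged
  assert(mapDimPosition(1, 1, 2) == -1); // being bounded
  assert(mapDimPosition(2, 1, 2) == -1); // being bounded
  assert(mapDimPosition(3, 1, 2) == 1);  // shifted down by num
  assert(mapDimPosition(4, 1, 2) == 2);  // shifted down by num
  return 0;
}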
@@ -1578,7 +1593,7 @@ void FlatAffineConstraints::getSliceBounds(unsigned num, MLIRContext *context,
   for (unsigned pos = 0; pos < num; pos++) {
     unsigned numMapDims = getNumDimIds() - num;
     unsigned numMapSymbols = getNumSymbolIds();
-    AffineExpr expr = memo[pos];
+    AffineExpr expr = memo[pos + offset];
     if (expr)
       expr = simplifyAffineExpr(expr, numMapDims, numMapSymbols);

@@ -1601,7 +1616,7 @@ void FlatAffineConstraints::getSliceBounds(unsigned num, MLIRContext *context,
         tmpClone->removeRedundantInequalities();
       }
       std::tie(lbMap, ubMap) = tmpClone->getLowerAndUpperBound(
-          pos, num, getNumDimIds(), {}, context);
+          pos, offset, num, getNumDimIds(), {}, context);
     }

     // If the above fails, we'll just use the constant lower bound and the
@@ -1612,7 +1627,7 @@ void FlatAffineConstraints::getSliceBounds(unsigned num, MLIRContext *context,
     if (!lbMap || lbMap.getNumResults() > 1) {
       LLVM_DEBUG(llvm::dbgs()
                  << "WARNING: Potentially over-approximating slice lb\n");
-      auto lbConst = getConstantLowerBound(pos);
+      auto lbConst = getConstantLowerBound(pos + offset);
       if (lbConst.hasValue()) {
         lbMap = AffineMap::get(
             numMapDims, numMapSymbols,
@@ -1622,7 +1637,7 @@ void FlatAffineConstraints::getSliceBounds(unsigned num, MLIRContext *context,
     if (!ubMap || ubMap.getNumResults() > 1) {
       LLVM_DEBUG(llvm::dbgs()
                  << "WARNING: Potentially over-approximating slice ub\n");
-      auto ubConst = getConstantUpperBound(pos);
+      auto ubConst = getConstantUpperBound(pos + offset);
       if (ubConst.hasValue()) {
         (ubMap) = AffineMap::get(
             numMapDims, numMapSymbols,
@@ -1630,9 +1645,11 @@ void FlatAffineConstraints::getSliceBounds(unsigned num, MLIRContext *context,
       }
     }
   }
-  LLVM_DEBUG(llvm::dbgs() << "lb map for pos = " << Twine(pos) << ", expr: ");
+  LLVM_DEBUG(llvm::dbgs()
+             << "lb map for pos = " << Twine(pos + offset) << ", expr: ");
   LLVM_DEBUG(lbMap.dump(););
-  LLVM_DEBUG(llvm::dbgs() << "ub map for pos = " << Twine(pos) << ", expr: ");
+  LLVM_DEBUG(llvm::dbgs()
+             << "ub map for pos = " << Twine(pos + offset) << ", expr: ");
   LLVM_DEBUG(ubMap.dump(););
   }
 }

@@ -504,48 +504,84 @@ LogicalResult addMissingLoopIVBounds(SmallPtrSet<Value *, 8> &ivs,
   return success();
 }

-/// Computes in 'sliceUnion' the union of all slice bounds computed at
-/// 'dstLoopDepth' between all pairs in 'srcOps' and 'dstOp' which access the
-/// same memref. Returns 'Success' if union was computed, 'failure' otherwise.
-LogicalResult mlir::computeSliceUnion(ArrayRef<Operation *> srcOps,
-                                      ArrayRef<Operation *> dstOps,
-                                      unsigned dstLoopDepth,
-                                      ComputationSliceState *sliceUnion) {
-  unsigned numSrcOps = srcOps.size();
-  unsigned numDstOps = dstOps.size();
-  assert(numSrcOps > 0 && numDstOps > 0);
+// Returns the innermost common loop depth for the set of operations in 'ops'.
+// TODO(andydavis) Move this to LoopUtils.
+static unsigned
+getInnermostCommonLoopDepth(ArrayRef<Operation *> ops,
+                            SmallVectorImpl<AffineForOp> &surroundingLoops) {
+  unsigned numOps = ops.size();
+  assert(numOps > 0);

-  // Compute the intersection of 'srcMemrefToOps' and 'dstMemrefToOps'.
-  llvm::SmallDenseSet<Value *> memrefIntersection;
-  for (auto *srcOp : srcOps) {
-    auto *srcMemRef = getLoadOrStoreMemRef(srcOp);
-    for (auto *dstOp : dstOps) {
-      if (srcMemRef == getLoadOrStoreMemRef(dstOp))
-        memrefIntersection.insert(srcMemRef);
-    }
-  }
-  // Return failure if 'memrefIntersection' is empty.
-  if (memrefIntersection.empty())
-    return failure();
+  std::vector<SmallVector<AffineForOp, 4>> loops(numOps);
+  unsigned loopDepthLimit = std::numeric_limits<unsigned>::max();
+  for (unsigned i = 0; i < numOps; ++i) {
+    getLoopIVs(*ops[i], &loops[i]);
+    loopDepthLimit =
+        std::min(loopDepthLimit, static_cast<unsigned>(loops[i].size()));
+  }

-  // Compute the union of slice bounds between all pairs in 'srcOps' and
-  // 'dstOps' in 'sliceUnionCst'.
+  unsigned loopDepth = 0;
+  for (unsigned d = 0; d < loopDepthLimit; ++d) {
+    unsigned i;
+    for (i = 1; i < numOps; ++i) {
+      if (loops[i - 1][d] != loops[i][d])
+        return loopDepth;
+    }
+    surroundingLoops.push_back(loops[i - 1][d]);
+    ++loopDepth;
+  }
+  return loopDepth;
+}
+
+/// Computes in 'sliceUnion' the union of all slice bounds computed at
+/// 'loopDepth' between all dependent pairs of ops in 'opsA' and 'opsB'.
+/// Returns 'Success' if union was computed, 'failure' otherwise.
+LogicalResult mlir::computeSliceUnion(ArrayRef<Operation *> opsA,
+                                      ArrayRef<Operation *> opsB,
+                                      unsigned loopDepth,
+                                      unsigned numCommonLoops,
+                                      bool isBackwardSlice,
+                                      ComputationSliceState *sliceUnion) {
+  // Compute the union of slice bounds between all pairs in 'opsA' and
+  // 'opsB' in 'sliceUnionCst'.
   FlatAffineConstraints sliceUnionCst;
   assert(sliceUnionCst.getNumDimAndSymbolIds() == 0);
-  for (unsigned i = 0; i < numSrcOps; ++i) {
-    MemRefAccess srcAccess(srcOps[i]);
-    for (unsigned j = 0; j < numDstOps; ++j) {
-      MemRefAccess dstAccess(dstOps[j]);
+  std::vector<std::pair<Operation *, Operation *>> dependentOpPairs;
+  for (unsigned i = 0, numOpsA = opsA.size(); i < numOpsA; ++i) {
+    MemRefAccess srcAccess(opsA[i]);
+    for (unsigned j = 0, numOpsB = opsB.size(); j < numOpsB; ++j) {
+      MemRefAccess dstAccess(opsB[j]);
       if (srcAccess.memref != dstAccess.memref)
         continue;
-      // Compute slice bounds for 'srcAccess' and 'dstAccess'.
-      ComputationSliceState tmpSliceState;
-      if (failed(mlir::getBackwardComputationSliceState(
-              srcAccess, dstAccess, dstLoopDepth, &tmpSliceState))) {
-        LLVM_DEBUG(llvm::dbgs() << "Unable to compute slice bounds\n.");
+      // Check if 'loopDepth' exceeds nesting depth of src/dst ops.
+      if ((!isBackwardSlice && loopDepth > getNestingDepth(*opsA[i])) ||
+          (isBackwardSlice && loopDepth > getNestingDepth(*opsB[j]))) {
+        LLVM_DEBUG(llvm::dbgs() << "Invalid loop depth\n.");
         return failure();
       }

+      bool readReadAccesses =
+          isa<LoadOp>(srcAccess.opInst) && isa<LoadOp>(dstAccess.opInst);
+      FlatAffineConstraints dependenceConstraints;
+      // Check dependence between 'srcAccess' and 'dstAccess'.
+      DependenceResult result = checkMemrefAccessDependence(
+          srcAccess, dstAccess, /*loopDepth=*/numCommonLoops + 1,
+          &dependenceConstraints, /*dependenceComponents=*/nullptr,
+          /*allowRAR=*/readReadAccesses);
+      if (result.value == DependenceResult::Failure) {
+        LLVM_DEBUG(llvm::dbgs() << "Dependence check failed\n.");
+        return failure();
+      }
+      if (result.value == DependenceResult::NoDependence)
+        continue;
+      dependentOpPairs.push_back({opsA[i], opsB[j]});
+
+      // Compute slice bounds for 'srcAccess' and 'dstAccess'.
+      ComputationSliceState tmpSliceState;
+      mlir::getComputationSliceState(opsA[i], opsB[j], &dependenceConstraints,
+                                     loopDepth, isBackwardSlice,
+                                     &tmpSliceState);
+
       if (sliceUnionCst.getNumDimAndSymbolIds() == 0) {
         // Initialize 'sliceUnionCst' with the bounds computed in previous step.
         if (failed(tmpSliceState.getAsConstraints(&sliceUnionCst))) {
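The helper introduced above, 'getInnermostCommonLoopDepth', scans outermost-in and stops at the first depth where the loop nests surrounding the given ops diverge. The standalone sketch below models the same scan with plain integers standing in for AffineForOp values (illustrative names only, not part of the patch):

#include <algorithm>
#include <cassert>
#include <limits>
#include <vector>

// Same scan as getInnermostCommonLoopDepth, with an int per loop instead of an
// AffineForOp: 'loops[i]' lists the loops surrounding op i, outermost first.
static unsigned
innermostCommonLoopDepth(const std::vector<std::vector<int>> &loops,
                         std::vector<int> &surroundingLoops) {
  assert(!loops.empty());
  unsigned loopDepthLimit = std::numeric_limits<unsigned>::max();
  for (const auto &nest : loops)
    loopDepthLimit =
        std::min(loopDepthLimit, static_cast<unsigned>(nest.size()));

  unsigned loopDepth = 0;
  for (unsigned d = 0; d < loopDepthLimit; ++d) {
    for (unsigned i = 1, e = loops.size(); i < e; ++i)
      if (loops[i - 1][d] != loops[i][d])
        return loopDepth;
    surroundingLoops.push_back(loops[0][d]);
    ++loopDepth;
  }
  return loopDepth;
}

int main() {
  // Two ops share the outermost loop (id 1) but sit in different inner loops.
  std::vector<std::vector<int>> loops = {{1, 2, 3}, {1, 4}};
  std::vector<int> common;
  assert(innermostCommonLoopDepth(loops, common) == 1);
  assert(common.size() == 1 && common[0] == 1);
  return 0;
}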
@@ -599,116 +635,147 @@ LogicalResult mlir::computeSliceUnion(ArrayRef<Operation *> srcOps,
       }
     }

-  // Store 'numSrcLoopIvs' before converting dst loop IVs to dims.
-  unsigned numSrcLoopIVs = sliceUnionCst.getNumDimIds();
+  // Empty union.
+  if (sliceUnionCst.getNumDimAndSymbolIds() == 0)
+    return failure();
+
+  // Gather loops surrounding ops from loop nest where slice will be inserted.
+  SmallVector<Operation *, 4> ops;
+  for (auto &dep : dependentOpPairs) {
+    ops.push_back(isBackwardSlice ? dep.second : dep.first);
+  }
+  SmallVector<AffineForOp, 4> surroundingLoops;
+  unsigned innermostCommonLoopDepth =
+      getInnermostCommonLoopDepth(ops, surroundingLoops);
+  if (loopDepth > innermostCommonLoopDepth) {
+    LLVM_DEBUG(llvm::dbgs() << "Exceeds max loop depth\n.");
+    return failure();
+  }
+
+  // Store 'numSliceLoopIVs' before converting dst loop IVs to dims.
+  unsigned numSliceLoopIVs = sliceUnionCst.getNumDimIds();

   // Convert any dst loop IVs which are symbol identifiers to dim identifiers.
   sliceUnionCst.convertLoopIVSymbolsToDims();
   sliceUnion->clearBounds();
-  sliceUnion->lbs.resize(numSrcLoopIVs, AffineMap());
-  sliceUnion->ubs.resize(numSrcLoopIVs, AffineMap());
+  sliceUnion->lbs.resize(numSliceLoopIVs, AffineMap());
+  sliceUnion->ubs.resize(numSliceLoopIVs, AffineMap());

   // Get slice bounds from slice union constraints 'sliceUnionCst'.
-  sliceUnionCst.getSliceBounds(numSrcLoopIVs, srcOps[0]->getContext(),
-                               &sliceUnion->lbs, &sliceUnion->ubs);
+  sliceUnionCst.getSliceBounds(/*offset=*/0, numSliceLoopIVs,
+                               opsA[0]->getContext(), &sliceUnion->lbs,
+                               &sliceUnion->ubs);
+
   // Add slice bound operands of union.
   SmallVector<Value *, 4> sliceBoundOperands;
-  sliceUnionCst.getIdValues(numSrcLoopIVs,
+  sliceUnionCst.getIdValues(numSliceLoopIVs,
                             sliceUnionCst.getNumDimAndSymbolIds(),
                             &sliceBoundOperands);

   // Copy src loop IVs from 'sliceUnionCst' to 'sliceUnion'.
   sliceUnion->ivs.clear();
-  sliceUnionCst.getIdValues(0, numSrcLoopIVs, &sliceUnion->ivs);
+  sliceUnionCst.getIdValues(0, numSliceLoopIVs, &sliceUnion->ivs);
+
+  // Set loop nest insertion point to block start at 'loopDepth'.
+  sliceUnion->insertPoint =
+      isBackwardSlice
+          ? surroundingLoops[loopDepth - 1].getBody()->begin()
+          : std::prev(surroundingLoops[loopDepth - 1].getBody()->end());
+
   // Give each bound its own copy of 'sliceBoundOperands' for subsequent
   // canonicalization.
-  sliceUnion->lbOperands.resize(numSrcLoopIVs, sliceBoundOperands);
-  sliceUnion->ubOperands.resize(numSrcLoopIVs, sliceBoundOperands);
+  sliceUnion->lbOperands.resize(numSliceLoopIVs, sliceBoundOperands);
+  sliceUnion->ubOperands.resize(numSliceLoopIVs, sliceBoundOperands);
   return success();
 }

 const char *const kSliceFusionBarrierAttrName = "slice_fusion_barrier";
-// Computes memref dependence between 'srcAccess' and 'dstAccess', projects
-// out any dst loop IVs at depth greater than 'dstLoopDepth', and computes slice
-// bounds in 'sliceState' which represent the src IVs in terms of the dst IVs,
-// symbols and constants.
-LogicalResult mlir::getBackwardComputationSliceState(
-    const MemRefAccess &srcAccess, const MemRefAccess &dstAccess,
-    unsigned dstLoopDepth, ComputationSliceState *sliceState) {
-  bool readReadAccesses =
-      isa<LoadOp>(srcAccess.opInst) && isa<LoadOp>(dstAccess.opInst);
-  FlatAffineConstraints dependenceConstraints;
-  DependenceResult result = checkMemrefAccessDependence(
-      srcAccess, dstAccess, /*loopDepth=*/1, &dependenceConstraints,
-      /*dependenceComponents=*/nullptr, /*allowRAR=*/readReadAccesses);
-  if (!hasDependence(result)) {
-    return failure();
-  }
+// Computes slice bounds by projecting out any loop IVs from
+// 'dependenceConstraints' at depth greater than 'loopDepth', and computes slice
+// bounds in 'sliceState' which represent the one loop nest's IVs in terms of
+// the other loop nest's IVs, symbols and constants (using 'isBackwardsSlice').
+void mlir::getComputationSliceState(
+    Operation *depSourceOp, Operation *depSinkOp,
+    FlatAffineConstraints *dependenceConstraints, unsigned loopDepth,
+    bool isBackwardSlice, ComputationSliceState *sliceState) {
   // Get loop nest surrounding src operation.
   SmallVector<AffineForOp, 4> srcLoopIVs;
-  getLoopIVs(*srcAccess.opInst, &srcLoopIVs);
+  getLoopIVs(*depSourceOp, &srcLoopIVs);
   unsigned numSrcLoopIVs = srcLoopIVs.size();

   // Get loop nest surrounding dst operation.
   SmallVector<AffineForOp, 4> dstLoopIVs;
-  getLoopIVs(*dstAccess.opInst, &dstLoopIVs);
+  getLoopIVs(*depSinkOp, &dstLoopIVs);
   unsigned numDstLoopIVs = dstLoopIVs.size();
-  if (dstLoopDepth > numDstLoopIVs) {
-    dstAccess.opInst->emitError("invalid destination loop depth");
-    return failure();
-  }

-  // Project out dimensions other than those up to 'dstLoopDepth'.
-  dependenceConstraints.projectOut(numSrcLoopIVs + dstLoopDepth,
-                                   numDstLoopIVs - dstLoopDepth);
+  assert((!isBackwardSlice && loopDepth <= numSrcLoopIVs) ||
+         (isBackwardSlice && loopDepth <= numDstLoopIVs));

-  // Add src loop IV values to 'sliceState'.
-  dependenceConstraints.getIdValues(0, numSrcLoopIVs, &sliceState->ivs);
+  // Project out dimensions other than those up to 'loopDepth'.
+  unsigned pos = isBackwardSlice ? numSrcLoopIVs + loopDepth : loopDepth;
+  unsigned num =
+      isBackwardSlice ? numDstLoopIVs - loopDepth : numSrcLoopIVs - loopDepth;
+  dependenceConstraints->projectOut(pos, num);
+
+  // Add slice loop IV values to 'sliceState'.
+  unsigned offset = isBackwardSlice ? 0 : loopDepth;
+  unsigned numSliceLoopIVs = isBackwardSlice ? numSrcLoopIVs : numDstLoopIVs;
+  dependenceConstraints->getIdValues(offset, offset + numSliceLoopIVs,
+                                     &sliceState->ivs);

   // Set up lower/upper bound affine maps for the slice.
-  sliceState->lbs.resize(numSrcLoopIVs, AffineMap());
-  sliceState->ubs.resize(numSrcLoopIVs, AffineMap());
+  sliceState->lbs.resize(numSliceLoopIVs, AffineMap());
+  sliceState->ubs.resize(numSliceLoopIVs, AffineMap());

-  // Get bounds for src IVs in terms of dst IVs, symbols, and constants.
-  dependenceConstraints.getSliceBounds(numSrcLoopIVs,
-                                       srcAccess.opInst->getContext(),
-                                       &sliceState->lbs, &sliceState->ubs);
+  // Get bounds for slice IVs in terms of other IVs, symbols, and constants.
+  dependenceConstraints->getSliceBounds(offset, numSliceLoopIVs,
+                                        depSourceOp->getContext(),
+                                        &sliceState->lbs, &sliceState->ubs);

   // Set up bound operands for the slice's lower and upper bounds.
   SmallVector<Value *, 4> sliceBoundOperands;
-  dependenceConstraints.getIdValues(
-      numSrcLoopIVs, dependenceConstraints.getNumDimAndSymbolIds(),
-      &sliceBoundOperands);
+  unsigned numDimsAndSymbols = dependenceConstraints->getNumDimAndSymbolIds();
+  for (unsigned i = 0; i < numDimsAndSymbols; ++i) {
+    if (i < offset || i >= offset + numSliceLoopIVs) {
+      sliceBoundOperands.push_back(dependenceConstraints->getIdValue(i));
+    }
+  }

   // Give each bound its own copy of 'sliceBoundOperands' for subsequent
   // canonicalization.
-  sliceState->lbOperands.resize(numSrcLoopIVs, sliceBoundOperands);
-  sliceState->ubOperands.resize(numSrcLoopIVs, sliceBoundOperands);
+  sliceState->lbOperands.resize(numSliceLoopIVs, sliceBoundOperands);
+  sliceState->ubOperands.resize(numSliceLoopIVs, sliceBoundOperands);
+
+  // Set destination loop nest insertion point to block start at 'dstLoopDepth'.
+  sliceState->insertPoint =
+      isBackwardSlice ? dstLoopIVs[loopDepth - 1].getBody()->begin()
+                      : std::prev(srcLoopIVs[loopDepth - 1].getBody()->end());

   llvm::SmallDenseSet<Value *, 8> sequentialLoops;
-  if (readReadAccesses) {
+  if (isa<LoadOp>(depSourceOp) && isa<LoadOp>(depSinkOp)) {
     // For read-read access pairs, clear any slice bounds on sequential loops.
     // Get sequential loops in loop nest rooted at 'srcLoopIVs[0]'.
-    getSequentialLoops(srcLoopIVs[0], &sequentialLoops);
+    getSequentialLoops(isBackwardSlice ? srcLoopIVs[0] : dstLoopIVs[0],
+                       &sequentialLoops);
   }
   // Clear all sliced loop bounds beginning at the first sequential loop, or
   // first loop with a slice fusion barrier attribute..
   // TODO(andydavis, bondhugula) Use MemRef read/write regions instead of
   // using 'kSliceFusionBarrierAttrName'.
-  for (unsigned i = 0; i < numSrcLoopIVs; ++i) {
-    Value *iv = srcLoopIVs[i].getInductionVar();
+  auto getSliceLoop = [&](unsigned i) {
+    return isBackwardSlice ? srcLoopIVs[i] : dstLoopIVs[i];
+  };
+  for (unsigned i = 0; i < numSliceLoopIVs; ++i) {
+    Value *iv = getSliceLoop(i).getInductionVar();
     if (sequentialLoops.count(iv) == 0 &&
-        srcLoopIVs[i].getAttr(kSliceFusionBarrierAttrName) == nullptr)
+        getSliceLoop(i).getAttr(kSliceFusionBarrierAttrName) == nullptr)
       continue;
-    for (unsigned j = i; j < numSrcLoopIVs; ++j) {
+    for (unsigned j = i; j < numSliceLoopIVs; ++j) {
       sliceState->lbs[j] = AffineMap();
       sliceState->ubs[j] = AffineMap();
     }
     break;
   }
-
-  return success();
 }

 /// Creates a computation slice of the loop nest surrounding 'srcOpInst',
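The backward and forward cases above differ only in which block of IVs is projected out of the dependence system and which block is kept as the slice IVs; the dependence system lays out the source IVs first, then the sink IVs. The standalone sketch below recomputes the same (pos, num, offset, numSliceLoopIVs) bookkeeping for both directions (illustrative helper, not an MLIR API):

#include <cassert>

struct SliceProjection {
  unsigned pos;             // first column to project out
  unsigned num;             // number of columns to project out
  unsigned offset;          // first column holding the slice IVs
  unsigned numSliceLoopIVs; // number of slice IVs
};

// Mirrors the index bookkeeping in getComputationSliceState.
static SliceProjection sliceProjection(unsigned numSrcLoopIVs,
                                       unsigned numDstLoopIVs,
                                       unsigned loopDepth,
                                       bool isBackwardSlice) {
  SliceProjection p;
  p.pos = isBackwardSlice ? numSrcLoopIVs + loopDepth : loopDepth;
  p.num = isBackwardSlice ? numDstLoopIVs - loopDepth
                          : numSrcLoopIVs - loopDepth;
  p.offset = isBackwardSlice ? 0 : loopDepth;
  p.numSliceLoopIVs = isBackwardSlice ? numSrcLoopIVs : numDstLoopIVs;
  return p;
}

int main() {
  // Backward slice at depth 1 with one src loop and one dst loop: keep the
  // src IV; nothing beyond dst depth 1 remains to project out.
  SliceProjection b = sliceProjection(1, 1, 1, /*isBackwardSlice=*/true);
  assert(b.pos == 2 && b.num == 0 && b.offset == 0 && b.numSliceLoopIVs == 1);

  // Forward slice at depth 1: keep the dst IV instead.
  SliceProjection f = sliceProjection(1, 1, 1, /*isBackwardSlice=*/false);
  assert(f.pos == 1 && f.num == 0 && f.offset == 1 && f.numSliceLoopIVs == 1);
  return 0;
}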
@@ -1329,7 +1329,9 @@ static bool isFusionProfitable(Operation *srcOpInst, Operation *srcStoreOpInst,
   for (unsigned i = maxDstLoopDepth; i >= 1; --i) {
     // Compute the union of slice bounds of all ops in 'dstLoadOpInsts'.
     if (failed(mlir::computeSliceUnion({srcOpInst}, dstLoadOpInsts,
-                                       /*dstLoopDepth=*/i,
+                                       /*loopDepth=*/i,
+                                       /*numCommonLoops=*/0,
+                                       /*isBackwardSlice=*/true,
                                        &sliceStates[i - 1]))) {
       LLVM_DEBUG(llvm::dbgs()
                  << "computeSliceUnion failed for loopDepth: " << i << "\n");
@@ -1736,15 +1738,16 @@ public:
                                dstLoadOpInsts, dstStoreOpInsts, &sliceState,
                                &bestDstLoopDepth, maximalFusion))
         continue;
-      // TODO(andydavis) Remove assert and surrounding code when
-      // canFuseLoops is fully functional.
+      // TODO(andydavis) Remove the following test code when canFuseLoops
+      // is fully functional.
       mlir::ComputationSliceState sliceUnion;
-      FusionResult result = mlir::canFuseLoops(
-          cast<AffineForOp>(srcNode->op), cast<AffineForOp>(dstNode->op),
-          bestDstLoopDepth, &sliceUnion);
-      assert(result.value == FusionResult::Success);
-      (void)result;
+      if (!maximalFusion) {
+        FusionResult result = mlir::canFuseLoops(
+            cast<AffineForOp>(srcNode->op), cast<AffineForOp>(dstNode->op),
+            bestDstLoopDepth, &sliceUnion);
+        assert(result.value == FusionResult::Success);
+        (void)result;
+      }
       // Fuse computation slice of 'srcLoopNest' into 'dstLoopNest'.
       auto sliceLoopNest = mlir::insertBackwardComputationSlice(
           srcStoreOpInst, dstLoadOpInsts[0], bestDstLoopDepth, &sliceState);
@@ -45,6 +45,11 @@ static llvm::cl::opt<bool> clTestDependenceCheck(
     llvm::cl::desc("Enable testing of loop fusion dependence check"),
     llvm::cl::cat(clOptionsCategory));

+static llvm::cl::opt<bool> clTestSliceComputation(
+    "test-loop-fusion-slice-computation",
+    llvm::cl::desc("Enable testing of loop fusion slice computation"),
+    llvm::cl::cat(clOptionsCategory));
+
 namespace {

 struct TestLoopFusion : public FunctionPass<TestLoopFusion> {
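For reference, the new flag is exercised the same way as the existing dependence-check flag; the test file added at the end of this change drives it via its RUN line:

  mlir-opt %s -test-loop-fusion -test-loop-fusion-slice-computation -split-input-file -verify | FileCheck %s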
@@ -70,20 +75,74 @@ gatherLoops(Block *block, unsigned currLoopDepth,
   }
 }

-// Run fusion dependence check on 'loops[i]' and 'loops[j]' at 'loopDepth'.
+// Run fusion dependence check on 'loops[i]' and 'loops[j]' at loop depths
+// in range ['loopDepth' + 1, 'maxLoopDepth'].
 // Emits a remark on 'loops[i]' if a fusion-preventing dependence exists.
 static void testDependenceCheck(SmallVector<AffineForOp, 2> &loops, unsigned i,
-                                unsigned j, unsigned loopDepth) {
+                                unsigned j, unsigned loopDepth,
+                                unsigned maxLoopDepth) {
   AffineForOp srcForOp = loops[i];
   AffineForOp dstForOp = loops[j];
   mlir::ComputationSliceState sliceUnion;
-  // TODO(andydavis) Test at deeper loop depths current loop depth + 1.
-  FusionResult result =
-      mlir::canFuseLoops(srcForOp, dstForOp, loopDepth + 1, &sliceUnion);
-  if (result.value == FusionResult::FailBlockDependence) {
-    srcForOp.getOperation()->emitRemark("block-level dependence preventing"
-                                        " fusion of loop nest ")
-        << i << " into loop nest " << j << " at depth " << loopDepth;
+  for (unsigned d = loopDepth + 1; d <= maxLoopDepth; ++d) {
+    FusionResult result =
+        mlir::canFuseLoops(srcForOp, dstForOp, d, &sliceUnion);
+    if (result.value == FusionResult::FailBlockDependence) {
+      srcForOp.getOperation()->emitRemark("block-level dependence preventing"
+                                          " fusion of loop nest ")
+          << i << " into loop nest " << j << " at depth " << loopDepth;
+    }
   }
 }

+// Returns the index of 'op' in its block.
+static unsigned getBlockIndex(Operation &op) {
+  unsigned index = 0;
+  for (auto &opX : *op.getBlock()) {
+    if (&op == &opX)
+      break;
+    ++index;
+  }
+  return index;
+}
+
+// Returns a string representation of 'sliceUnion'.
+static std::string getSliceStr(const mlir::ComputationSliceState &sliceUnion) {
+  std::string result;
+  llvm::raw_string_ostream os(result);
+  // Slice insertion point format [loop-depth, operation-block-index]
+  unsigned ipd = getNestingDepth(*sliceUnion.insertPoint);
+  unsigned ipb = getBlockIndex(*sliceUnion.insertPoint);
+  os << "insert point: (" << std::to_string(ipd) << ", " << std::to_string(ipb)
+     << ")";
+  assert(sliceUnion.lbs.size() == sliceUnion.ubs.size());
+  os << " loop bounds: ";
+  for (unsigned k = 0, e = sliceUnion.lbs.size(); k < e; ++k) {
+    os << '[';
+    sliceUnion.lbs[k].print(os);
+    os << ", ";
+    sliceUnion.ubs[k].print(os);
+    os << "] ";
+  }
+  return os.str();
+}
+
+// Computes fusion slice union on 'loops[i]' and 'loops[j]' at loop depths
+// in range ['loopDepth' + 1, 'maxLoopDepth'].
+// Emits a string represention of the slice union as a remark on 'loops[j]'.
+static void testSliceComputation(SmallVector<AffineForOp, 2> &loops, unsigned i,
+                                 unsigned j, unsigned loopDepth,
+                                 unsigned maxLoopDepth) {
+  AffineForOp forOpA = loops[i];
+  AffineForOp forOpB = loops[j];
+  for (unsigned d = loopDepth + 1; d <= maxLoopDepth; ++d) {
+    mlir::ComputationSliceState sliceUnion;
+    FusionResult result = mlir::canFuseLoops(forOpA, forOpB, d, &sliceUnion);
+    if (result.value == FusionResult::Success) {
+      forOpB.getOperation()->emitRemark("slice (")
+          << " src loop: " << i << ", dst loop: " << j << ", depth: " << d
+          << " : " << getSliceStr(sliceUnion) << ")";
+    }
+  }
+}
+
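The remark built by 'getSliceStr' and 'testSliceComputation' therefore has the shape checked in the new test cases further down, for example:

  slice ( src loop: 1, dst loop: 0, depth: 1 : insert point: (1, 1) loop bounds: [(d0) -> (d0), (d0) -> (d0 + 1)] )

where the insert point is printed as (loop depth, block index) and each [lb, ub) pair of maps describes one sliced loop.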
@@ -104,7 +163,9 @@ void TestLoopFusion::runOnFunction() {
         if (j == k)
           continue;
         if (clTestDependenceCheck)
-          testDependenceCheck(loops, j, k, loopDepth);
+          testDependenceCheck(loops, j, k, loopDepth, depthToLoops.size());
+        if (clTestSliceComputation)
+          testSliceComputation(loops, j, k, loopDepth, depthToLoops.size());
       }
     }
   }
@@ -192,11 +192,7 @@ gatherLoadsAndStores(AffineForOp forOp,
   return !hasIfOp;
 }

-// TODO(andydavis) Add support for the following features in subsequent CLs:
-// *) Compute dependences of unfused src/dst loops.
-// *) Compute dependences of src/dst loop as if they were fused.
-// *) Check for fusion preventing dependences (e.g. a dependence which changes
-//    from loop-independent to backward loop-carried after fusion).
+// TODO(andydavis) Prevent fusion of loop nests with side-effecting operations.
 FusionResult mlir::canFuseLoops(AffineForOp srcForOp, AffineForOp dstForOp,
                                 unsigned dstLoopDepth,
                                 ComputationSliceState *srcSlice) {
@@ -219,24 +215,35 @@ FusionResult mlir::canFuseLoops(AffineForOp srcForOp, AffineForOp dstForOp,
     return FusionResult::FailBlockDependence;
   }

-  // Gather all load and store ops in 'srcForOp'.
-  SmallVector<Operation *, 4> srcLoadAndStoreOps;
-  if (!gatherLoadsAndStores(srcForOp, srcLoadAndStoreOps)) {
+  // Check if 'srcForOp' precedeces 'dstForOp' in 'block'.
+  bool isSrcForOpBeforeDstForOp =
+      srcForOp.getOperation()->isBeforeInBlock(dstForOp.getOperation());
+  // 'forOpA' executes before 'forOpB' in 'block'.
+  auto forOpA = isSrcForOpBeforeDstForOp ? srcForOp : dstForOp;
+  auto forOpB = isSrcForOpBeforeDstForOp ? dstForOp : srcForOp;
+
+  // Gather all load and store from 'forOpA' which precedes 'forOpB' in 'block'.
+  SmallVector<Operation *, 4> opsA;
+  if (!gatherLoadsAndStores(forOpA, opsA)) {
     LLVM_DEBUG(llvm::dbgs() << "Fusing loops with affine.if unsupported.\n.");
     return FusionResult::FailPrecondition;
   }

-  // Gather all load and store ops in 'dstForOp'.
-  SmallVector<Operation *, 4> dstLoadAndStoreOps;
-  if (!gatherLoadsAndStores(dstForOp, dstLoadAndStoreOps)) {
+  // Gather all load and store from 'forOpB' which succeeds 'forOpA' in 'block'.
+  SmallVector<Operation *, 4> opsB;
+  if (!gatherLoadsAndStores(forOpB, opsB)) {
     LLVM_DEBUG(llvm::dbgs() << "Fusing loops with affine.if unsupported.\n.");
     return FusionResult::FailPrecondition;
   }

-  // Compute union of computation slices computed from all pairs in
-  // {'srcLoadAndStoreOps', 'dstLoadAndStoreOps'}.
-  if (failed(mlir::computeSliceUnion(srcLoadAndStoreOps, dstLoadAndStoreOps,
-                                     dstLoopDepth, srcSlice))) {
+  // Calculate the number of common loops surrounding 'srcForOp' and 'dstForOp'.
+  unsigned numCommonLoops = mlir::getNumCommonSurroundingLoops(
+      *srcForOp.getOperation(), *dstForOp.getOperation());
+
+  // Compute union of computation slices computed between all pairs of ops
+  // from 'forOpA' and 'forOpB'.
+  if (failed(mlir::computeSliceUnion(opsA, opsB, dstLoopDepth, numCommonLoops,
+                                     isSrcForOpBeforeDstForOp, srcSlice))) {
     LLVM_DEBUG(llvm::dbgs() << "computeSliceUnion failed\n");
     return FusionResult::FailPrecondition;
   }

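canFuseLoops now derives the slice direction from block order: whichever nest appears first in the block supplies 'opsA', and a backward slice is requested exactly when the source nest is the earlier one. A toy sketch of that selection (standalone C++, illustrative names; 'pickSliceDirection' is not part of the patch):

#include <cassert>
#include <string>
#include <utility>

// Models the ordering logic in canFuseLoops: the earlier nest in the block is
// 'A', and the slice is backward iff the source nest is the earlier one
// (i.e. producer-before-consumer fusion).
static std::pair<std::string, bool>
pickSliceDirection(const std::string &srcNest, const std::string &dstNest,
                   bool srcIsBeforeDst) {
  std::string forOpA = srcIsBeforeDst ? srcNest : dstNest;
  bool isBackwardSlice = srcIsBeforeDst;
  return {forOpA, isBackwardSlice};
}

int main() {
  // Producer loop textually before the consumer: backward slice of the producer.
  auto [a1, backward1] = pickSliceDirection("producer", "consumer", true);
  assert(a1 == "producer" && backward1);

  // Consumer before producer: forward slice, driven from the consumer side.
  auto [a2, backward2] = pickSliceDirection("producer", "consumer", false);
  assert(a2 == "consumer" && !backward2);
  return 0;
}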
@@ -0,0 +1,145 @@
+// RUN: mlir-opt %s -test-loop-fusion -test-loop-fusion-slice-computation -split-input-file -verify | FileCheck %s
+
+// -----
+
+// CHECK-LABEL: func @slice_depth1_loop_nest() {
+func @slice_depth1_loop_nest() {
+  %0 = alloc() : memref<100xf32>
+  %cst = constant 7.000000e+00 : f32
+  affine.for %i0 = 0 to 16 {
+    // expected-remark@-1 {{slice ( src loop: 1, dst loop: 0, depth: 1 : insert point: (1, 1) loop bounds: [(d0) -> (d0), (d0) -> (d0 + 1)] )}}
+    store %cst, %0[%i0] : memref<100xf32>
+  }
+  affine.for %i1 = 0 to 5 {
+    // expected-remark@-1 {{slice ( src loop: 0, dst loop: 1, depth: 1 : insert point: (1, 0) loop bounds: [(d0) -> (d0), (d0) -> (d0 + 1)] )}}
+    %1 = load %0[%i1] : memref<100xf32>
+  }
+  return
+}
+
+// -----
+
+// Loop %i0 writes to locations [2, 17] and loop %i0 reads from locations [3, 6]
+// Slice loop bounds should be adjusted such that the load/store are for the
+// same location.
+// CHECK-LABEL: func @slice_depth1_loop_nest_with_offsets() {
+func @slice_depth1_loop_nest_with_offsets() {
+  %0 = alloc() : memref<100xf32>
+  %cst = constant 7.000000e+00 : f32
+  affine.for %i0 = 0 to 16 {
+    // expected-remark@-1 {{slice ( src loop: 1, dst loop: 0, depth: 1 : insert point: (1, 2) loop bounds: [(d0) -> (d0 + 3), (d0) -> (d0 + 4)] )}}
+    %a0 = affine.apply (d0) -> (d0 + 2)(%i0)
+    store %cst, %0[%a0] : memref<100xf32>
+  }
+  affine.for %i1 = 4 to 8 {
+    // expected-remark@-1 {{slice ( src loop: 0, dst loop: 1, depth: 1 : insert point: (1, 0) loop bounds: [(d0) -> (d0 - 3), (d0) -> (d0 - 2)] )}}
+    %a1 = affine.apply (d0) -> (d0 - 1)(%i1)
+    %1 = load %0[%a1] : memref<100xf32>
+  }
+  return
+}
+
+// -----
+
+// Slices at loop depth 1 should only slice the loop bounds of the first loop.
+// Slices at loop detph 2 should slice loop bounds of both loops.
+// CHECK-LABEL: func @slice_depth2_loop_nest() {
+func @slice_depth2_loop_nest() {
+  %0 = alloc() : memref<100x100xf32>
+  %cst = constant 7.000000e+00 : f32
+  affine.for %i0 = 0 to 16 {
+    // expected-remark@-1 {{slice ( src loop: 1, dst loop: 0, depth: 1 : insert point: (1, 1) loop bounds: [(d0) -> (d0), (d0) -> (d0 + 1)] [(d0) -> (0), (d0) -> (8)] )}}
+    // expected-remark@-2 {{slice ( src loop: 1, dst loop: 0, depth: 2 : insert point: (2, 1) loop bounds: [(d0, d1) -> (d0), (d0, d1) -> (d0 + 1)] [(d0, d1) -> (d1), (d0, d1) -> (d1 + 1)] )}}
+    affine.for %i1 = 0 to 16 {
+      store %cst, %0[%i0, %i1] : memref<100x100xf32>
+    }
+  }
+  affine.for %i2 = 0 to 10 {
+    // expected-remark@-1 {{slice ( src loop: 0, dst loop: 1, depth: 1 : insert point: (1, 0) loop bounds: [(d0) -> (d0), (d0) -> (d0 + 1)] [(d0) -> (0), (d0) -> (8)] )}}
+    // expected-remark@-2 {{slice ( src loop: 0, dst loop: 1, depth: 2 : insert point: (2, 0) loop bounds: [(d0, d1) -> (d0), (d0, d1) -> (d0 + 1)] [(d0, d1) -> (d1), (d0, d1) -> (d1 + 1)] )}}
+    affine.for %i3 = 0 to 8 {
+      %1 = load %0[%i2, %i3] : memref<100x100xf32>
+    }
+  }
+  return
+}
+
+// -----
+
+// The load at depth 1 in loop nest %i2 prevents slicing loop nest %i0 at depths
+// greater than 1. However, loop nest %i2 can be sliced into loop nest %i0 at
+// depths 1 and 2 because the dependent store in loop nest %i0 is at depth 2.
+// CHECK-LABEL: func @slice_depth2_loop_nest_two_loads() {
+func @slice_depth2_loop_nest_two_loads() {
+  %0 = alloc() : memref<100x100xf32>
+  %c0 = constant 0 : index
+  %cst = constant 7.000000e+00 : f32
+  affine.for %i0 = 0 to 16 {
+    // expected-remark@-1 {{slice ( src loop: 1, dst loop: 0, depth: 1 : insert point: (1, 1) loop bounds: [(d0)[s0] -> (d0), (d0)[s0] -> (d0 + 1)] [(d0)[s0] -> (0), (d0)[s0] -> (8)] )}}
+    // expected-remark@-2 {{slice ( src loop: 1, dst loop: 0, depth: 2 : insert point: (2, 1) loop bounds: [(d0, d1)[s0] -> (d0), (d0, d1)[s0] -> (d0 + 1)] [(d0, d1)[s0] -> (0), (d0, d1)[s0] -> (8)] )}}
+    affine.for %i1 = 0 to 16 {
+      store %cst, %0[%i0, %i1] : memref<100x100xf32>
+    }
+  }
+  affine.for %i2 = 0 to 10 {
+    // expected-remark@-1 {{slice ( src loop: 0, dst loop: 1, depth: 1 : insert point: (1, 0) loop bounds: [(d0)[s0] -> (d0), (d0)[s0] -> (d0 + 1)] [(d0)[s0] -> (0), (d0)[s0] -> (8)] )}}
+    affine.for %i3 = 0 to 8 {
+      %1 = load %0[%i2, %i3] : memref<100x100xf32>
+    }
+    %2 = load %0[%i2, %c0] : memref<100x100xf32>
+  }
+  return
+}
+
+// -----
+
+// The store at depth 1 in loop nest %i0 prevents slicing loop nest %i2 at
+// depths greater than 1 into loop nest %i0. However, loop nest %i0 can be
+// sliced into loop nest %i2 at depths 1 and 2 because the dependent load in
+// loop nest %i2 is at depth 2.
+// CHECK-LABEL: func @slice_depth2_loop_nest_two_stores() {
+func @slice_depth2_loop_nest_two_stores() {
+  %0 = alloc() : memref<100x100xf32>
+  %c0 = constant 0 : index
+  %cst = constant 7.000000e+00 : f32
+  affine.for %i0 = 0 to 16 {
+    // expected-remark@-1 {{slice ( src loop: 1, dst loop: 0, depth: 1 : insert point: (1, 2) loop bounds: [(d0)[s0] -> (d0), (d0)[s0] -> (d0 + 1)] [(d0)[s0] -> (0), (d0)[s0] -> (8)] )}}
+    affine.for %i1 = 0 to 16 {
+      store %cst, %0[%i0, %i1] : memref<100x100xf32>
+    }
+    store %cst, %0[%i0, %c0] : memref<100x100xf32>
+  }
+  affine.for %i2 = 0 to 10 {
+    // expected-remark@-1 {{slice ( src loop: 0, dst loop: 1, depth: 1 : insert point: (1, 0) loop bounds: [(d0)[s0] -> (d0), (d0)[s0] -> (d0 + 1)] [(d0)[s0] -> (0), (d0)[s0] -> (16)] )}}
+    // expected-remark@-2 {{slice ( src loop: 0, dst loop: 1, depth: 2 : insert point: (2, 0) loop bounds: [(d0, d1)[s0] -> (d0), (d0, d1)[s0] -> (d0 + 1)] [(d0, d1)[s0] -> (0), (d0, d1)[s0] -> (16)] )}}
+    affine.for %i3 = 0 to 8 {
+      %1 = load %0[%i2, %i3] : memref<100x100xf32>
+    }
+  }
+  return
+}
+
+// -----
+
+// Test loop nest which has a smaller outer trip count than its inner loop.
+// CHECK-LABEL: func @slice_loop_nest_with_smaller_outer_trip_count() {
+func @slice_loop_nest_with_smaller_outer_trip_count() {
+  %0 = alloc() : memref<100x100xf32>
+  %c0 = constant 0 : index
+  %cst = constant 7.000000e+00 : f32
+  affine.for %i0 = 0 to 16 {
+    // expected-remark@-1 {{slice ( src loop: 1, dst loop: 0, depth: 1 : insert point: (1, 1) loop bounds: [(d0) -> (d0), (d0) -> (d0 + 1)] [(d0) -> (0), (d0) -> (10)] )}}
+    // expected-remark@-2 {{slice ( src loop: 1, dst loop: 0, depth: 2 : insert point: (2, 1) loop bounds: [(d0, d1) -> (d0), (d0, d1) -> (d0 + 1)] [(d0, d1) -> (d1), (d0, d1) -> (d1 + 1)] )}}
+    affine.for %i1 = 0 to 16 {
+      store %cst, %0[%i0, %i1] : memref<100x100xf32>
+    }
+  }
+  affine.for %i2 = 0 to 8 {
+    // expected-remark@-1 {{slice ( src loop: 0, dst loop: 1, depth: 1 : insert point: (1, 0) loop bounds: [(d0) -> (d0), (d0) -> (d0 + 1)] [(d0) -> (0), (d0) -> (10)] )}}
+    // expected-remark@-2 {{slice ( src loop: 0, dst loop: 1, depth: 2 : insert point: (2, 0) loop bounds: [(d0, d1) -> (d0), (d0, d1) -> (d0 + 1)] [(d0, d1) -> (d1), (d0, d1) -> (d1 + 1)] )}}
+    affine.for %i3 = 0 to 10 {
+      %1 = load %0[%i2, %i3] : memref<100x100xf32>
+    }
+  }
+  return
+}