LoopFusion: adds support for computing forward computation slices, which will enable fusion of consumer loop nests into their producers in subsequent CLs.

PiperOrigin-RevId: 253601994
Andy Davis 2019-06-17 09:59:35 -07:00 committed by Mehdi Amini
parent a14eeacf2c
commit 898cf0e968
8 changed files with 515 additions and 164 deletions


@@ -393,12 +393,12 @@ public:
bool lower = true);
/// Computes the lower and upper bounds of the first 'num' dimensional
/// identifiers as an affine map of the remaining identifiers (dimensional and
/// symbolic). This method is able to detect identifiers as floordiv's
/// and mod's of affine expressions of other identifiers with respect to
/// (positive) constants. Sets bound map to a null AffineMap if such a bound
/// can't be found (or yet unimplemented).
void getSliceBounds(unsigned num, MLIRContext *context,
/// identifiers (starting at 'offset') as an affine map of the remaining
/// identifiers (dimensional and symbolic). This method is able to detect
/// identifiers as floordiv's and mod's of affine expressions of other
/// identifiers with respect to (positive) constants. Sets bound map to a
/// null AffineMap if such a bound can't be found (or yet unimplemented).
void getSliceBounds(unsigned offset, unsigned num, MLIRContext *context,
SmallVectorImpl<AffineMap> *lbMaps,
SmallVectorImpl<AffineMap> *ubMaps);
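For illustration, a minimal caller sketch of the updated signature; the wrapper function, variable names, and include assumptions below are hypothetical and not part of this CL:

#include "mlir/Analysis/AffineStructures.h" // FlatAffineConstraints, AffineMap
using namespace mlir;

// Computes bounds for the dimensional identifiers [offset, offset + num) of
// 'cst' as affine maps of the remaining dimensional and symbolic identifiers.
static void getSliceBoundsSketch(FlatAffineConstraints &cst, unsigned offset,
                                 unsigned num, MLIRContext *ctx) {
  llvm::SmallVector<AffineMap, 4> lbMaps, ubMaps;
  cst.getSliceBounds(offset, num, ctx, &lbMaps, &ubMaps);
  // lbMaps[i] / ubMaps[i] bound identifier 'offset + i'; a null AffineMap
  // means no bound could be detected for that identifier.
}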
@@ -648,13 +648,14 @@ public:
Optional<int64_t> getConstantUpperBound(unsigned pos) const;
/// Gets the lower and upper bound of the pos^th identifier treating
/// [dimStartPos, symbStartPos) as dimensions and [symStartPos,
/// getNumDimAndSymbolIds) as symbols. The returned multi-dimensional maps
/// in the pair represent the max and min of potentially multiple affine
/// expressions. The upper bound is exclusive. 'localExprs' holds pre-computed
/// AffineExpr's for all local identifiers in the system.
/// [0, offset) U [offset + num, symStartPos) as dimensions and
/// [symStartPos, getNumDimAndSymbolIds) as symbols. The returned
/// multi-dimensional maps in the pair represent the max and min of
/// potentially multiple affine expressions. The upper bound is exclusive.
/// 'localExprs' holds pre-computed AffineExpr's for all local identifiers in
/// the system.
std::pair<AffineMap, AffineMap>
getLowerAndUpperBound(unsigned pos, unsigned dimStartPos,
getLowerAndUpperBound(unsigned pos, unsigned offset, unsigned num,
unsigned symStartPos, ArrayRef<AffineExpr> localExprs,
MLIRContext *context);
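A worked example of the identifier bookkeeping above (values are illustrative): with offset = 2, num = 3, and symStartPos = 8, the returned maps use identifiers {0, 1} and {5, 6, 7} as map dimensions, treat identifiers [8, getNumDimAndSymbolIds()) as map symbols, and exclude identifiers [2, 5), which are the ones being bounded.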


@@ -73,6 +73,8 @@ struct ComputationSliceState {
std::vector<SmallVector<Value *, 4>> lbOperands;
// List of upper bound operands (ubOperands[i] are used by 'ubs[i]').
std::vector<SmallVector<Value *, 4>> ubOperands;
// Slice loop nest insertion point in target loop nest.
Block::iterator insertPoint;
// Adds to 'cst' with constraints which represent the slice bounds on 'ivs'
// in 'this'. Specifically, the values in 'ivs' are added to 'cst' as dim
// identifiers and the values in 'lb/ubOperands' are added as symbols.
@@ -85,19 +87,67 @@ struct ComputationSliceState {
void clearBounds();
};
/// Computes computation slice loop bounds for the loop nest surrounding
/// 'srcAccess', where the returned loop bound AffineMaps are functions of
/// loop IVs from the loop nest surrounding 'dstAccess'.
LogicalResult getBackwardComputationSliceState(
const MemRefAccess &srcAccess, const MemRefAccess &dstAccess,
unsigned dstLoopDepth, ComputationSliceState *sliceState);
/// Computes the computation slice loop bounds for one loop nest as affine maps
/// of the other loop nest's IVs and symbols, using 'dependenceConstraints'
/// computed between 'depSourceAccess' and 'depSinkAccess'.
/// If 'isBackwardSlice' is true, a backwards slice is computed in which the
/// slice bounds of loop nest surrounding 'depSourceAccess' are computed in
/// terms of loop IVs and symbols of the loop nest surrounding 'depSinkAccess'
/// at 'loopDepth'.
/// If 'isBackwardSlice' is false, a forward slice is computed in which the
/// slice bounds of loop nest surrounding 'depSinkAccess' are computed in terms
/// of loop IVs and symbols of the loop nest surrounding 'depSourceAccess' at
/// 'loopDepth'.
/// The slice loop bounds and associated operands are returned in 'sliceState'.
//
// Backward slice example:
//
// affine.for %i0 = 0 to 10 {
// store %cst, %0[%i0] : memref<100xf32> // 'depSourceAccess'
// }
// affine.for %i1 = 0 to 10 {
// %v = load %0[%i1] : memref<100xf32> // 'depSinkAccess'
// }
//
// // Backward computation slice of loop nest '%i0'.
// affine.for %i0 = (d0) -> (d0)(%i1) to (d0) -> (d0 + 1)(%i1) {
// store %cst, %0[%i0] : memref<100xf32> // 'depSourceAccess'
// }
//
// Forward slice example:
//
// affine.for %i0 = 0 to 10 {
// store %cst, %0[%i0] : memref<100xf32> // 'depSourceAccess'
// }
// affine.for %i1 = 0 to 10 {
// %v = load %0[%i1] : memref<100xf32> // 'depSinkAccess'
// }
//
// // Forward computation slice of loop nest '%i1'.
// affine.for %i1 = (d0) -> (d0)(%i0) to (d0) -> (d0 + 1)(%i0) {
// %v = load %0[%i1] : memref<100xf32> // 'depSinkAccess'
// }
//
void getComputationSliceState(Operation *depSourceOp, Operation *depSinkOp,
FlatAffineConstraints *dependenceConstraints,
unsigned loopDepth, bool isBackwardSlice,
ComputationSliceState *sliceState);
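A minimal call-sequence sketch, mirroring how computeSliceUnion (declared below) drives this API; the helper name is hypothetical, and 'srcOp'/'dstOp' are assumed to be dependent affine load/store ops:

// Computes a backward slice of the loop nest around 'srcOp' in terms of the
// loop nest around 'dstOp' at 'loopDepth' (assumes the Analysis headers and
// 'using namespace mlir;').
static void backwardSliceSketch(Operation *srcOp, Operation *dstOp,
                                unsigned loopDepth, unsigned numCommonLoops,
                                ComputationSliceState *slice) {
  MemRefAccess srcAccess(srcOp), dstAccess(dstOp);
  FlatAffineConstraints depConstraints;
  DependenceResult result = checkMemrefAccessDependence(
      srcAccess, dstAccess, /*loopDepth=*/numCommonLoops + 1, &depConstraints,
      /*dependenceComponents=*/nullptr, /*allowRAR=*/false);
  if (!hasDependence(result))
    return;
  // Slice bounds of the 'srcOp' nest, as functions of the 'dstOp' nest's IVs
  // and symbols; also records the insertion point in the 'dstOp' nest.
  getComputationSliceState(srcOp, dstOp, &depConstraints, loopDepth,
                           /*isBackwardSlice=*/true, slice);
}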
/// Computes in 'sliceUnion' the union of all slice bounds computed at
/// 'dstLoopDepth' between all pairs in 'srcOps' and 'dstOp' which access the
/// same memref. Returns 'success' if union was computed, 'failure' otherwise.
LogicalResult computeSliceUnion(ArrayRef<Operation *> srcOps,
ArrayRef<Operation *> dstOps,
unsigned dstLoopDepth,
/// 'loopDepth' between all dependent pairs of ops in 'opsA' and 'opsB'.
/// The parameter 'numCommonLoops' is the number of loops common to the
/// operations in 'opsA' and 'opsB'.
/// If 'isBackwardSlice' is true, computes slice bounds for loop nest
/// surrounding ops in 'opsA', as a function of IVs and symbols of loop nest
/// surrounding ops in 'opsB' at 'loopDepth'.
/// If 'isBackwardSlice' is false, computes slice bounds for loop nest
/// surrounding ops in 'opsB', as a function of IVs and symbols of loop nest
/// surrounding ops in 'opsA' at 'loopDepth'.
/// Returns 'success' if union was computed, 'failure' otherwise.
// TODO(andydavis) Change this API to take 'forOpA'/'forOpB'.
LogicalResult computeSliceUnion(ArrayRef<Operation *> opsA,
ArrayRef<Operation *> opsB, unsigned loopDepth,
unsigned numCommonLoops, bool isBackwardSlice,
ComputationSliceState *sliceUnion);
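A sketch of a typical call site, under the assumption that 'opsA' and 'opsB' are the load/store ops gathered from two loop nests in the same block (this mirrors the call in canFuseLoops further below; the wrapper itself is illustrative):

static LogicalResult sliceUnionSketch(ArrayRef<Operation *> opsA,
                                      ArrayRef<Operation *> opsB,
                                      unsigned loopDepth,
                                      ComputationSliceState *sliceUnion) {
  // Loops common to both nests bound the dependence-check depth.
  unsigned numCommonLoops =
      getNumCommonSurroundingLoops(*opsA.front(), *opsB.front());
  // Backward slice: bounds for the nest around 'opsA' in terms of the nest
  // around 'opsB' at 'loopDepth'.
  return computeSliceUnion(opsA, opsB, loopDepth, numCommonLoops,
                           /*isBackwardSlice=*/true, sliceUnion);
}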
/// Creates a clone of the computation contained in the loop nest surrounding


@@ -1423,19 +1423,28 @@ void FlatAffineConstraints::removeRedundantInequalities() {
}
std::pair<AffineMap, AffineMap> FlatAffineConstraints::getLowerAndUpperBound(
unsigned pos, unsigned dimStartPos, unsigned symStartPos,
unsigned pos, unsigned offset, unsigned num, unsigned symStartPos,
ArrayRef<AffineExpr> localExprs, MLIRContext *context) {
assert(pos < dimStartPos && "invalid dim start pos");
assert(symStartPos >= dimStartPos && "invalid sym start pos");
assert(pos + offset < getNumDimIds() && "invalid dim start pos");
assert(symStartPos >= (pos + offset) && "invalid sym start pos");
assert(getNumLocalIds() == localExprs.size() &&
"incorrect local exprs count");
SmallVector<unsigned, 4> lbIndices, ubIndices;
getLowerAndUpperBoundIndices(*this, pos, &lbIndices, &ubIndices);
getLowerAndUpperBoundIndices(*this, pos + offset, &lbIndices, &ubIndices);
/// Adds the coefficients of 'a' to 'b', skipping positions [offset, offset + num).
auto addCoeffs = [&](ArrayRef<int64_t> a, SmallVectorImpl<int64_t> &b) {
b.clear();
for (unsigned i = 0, e = a.size(); i < e; ++i) {
if (i < offset || i >= offset + num)
b.push_back(a[i]);
}
};
SmallVector<int64_t, 8> lb, ub;
SmallVector<AffineExpr, 4> exprs;
unsigned dimCount = symStartPos - dimStartPos;
unsigned dimCount = symStartPos - num;
unsigned symCount = getNumDimAndSymbolIds() - symStartPos;
exprs.reserve(lbIndices.size());
// Lower bound expressions.
@@ -1444,7 +1453,7 @@ std::pair<AffineMap, AffineMap> FlatAffineConstraints::getLowerAndUpperBound(
// Extract the lower bound (in terms of other coeff's + const), i.e., if
// i - j + 1 >= 0 is the constraint and 'pos' is for i, then the lower
// bound is j - 1.
lb.assign(ineq.begin() + dimStartPos, ineq.end());
addCoeffs(ineq, lb);
std::transform(lb.begin(), lb.end(), lb.begin(), std::negate<int64_t>());
auto expr = mlir::toAffineExpr(lb, dimCount, symCount, localExprs, context);
exprs.push_back(expr);
@@ -1458,7 +1467,7 @@ std::pair<AffineMap, AffineMap> FlatAffineConstraints::getLowerAndUpperBound(
for (auto idx : ubIndices) {
auto ineq = getInequality(idx);
// Extract the upper bound (in terms of other coeff's + const).
ub.assign(ineq.begin() + dimStartPos, ineq.end());
addCoeffs(ineq, ub);
auto expr = mlir::toAffineExpr(ub, dimCount, symCount, localExprs, context);
// Upper bound is exclusive.
exprs.push_back(expr + 1);
@@ -1470,10 +1479,12 @@ std::pair<AffineMap, AffineMap> FlatAffineConstraints::getLowerAndUpperBound(
}
/// Computes the lower and upper bounds of the first 'num' dimensional
/// identifiers as affine maps of the remaining identifiers (dimensional and
/// symbolic identifiers). Local identifiers are themselves explicitly computed
/// as affine functions of other identifiers in this process if needed.
void FlatAffineConstraints::getSliceBounds(unsigned num, MLIRContext *context,
/// identifiers (starting at 'offset') as affine maps of the remaining
/// identifiers (dimensional and symbolic identifiers). Local identifiers are
/// themselves explicitly computed as affine functions of other identifiers in
/// this process if needed.
void FlatAffineConstraints::getSliceBounds(unsigned offset, unsigned num,
MLIRContext *context,
SmallVectorImpl<AffineMap> *lbMaps,
SmallVectorImpl<AffineMap> *ubMaps) {
assert(num < getNumDimIds() && "invalid range");
@@ -1488,8 +1499,12 @@ void FlatAffineConstraints::getSliceBounds(unsigned num, MLIRContext *context,
// Record computed/detected identifiers.
SmallVector<AffineExpr, 8> memo(getNumIds());
// Initialize dimensional and symbolic identifiers.
for (unsigned i = num, e = getNumDimIds(); i < e; i++)
for (unsigned i = 0, e = getNumDimIds(); i < e; i++) {
if (i < offset)
memo[i] = getAffineDimExpr(i, context);
else if (i >= offset + num)
memo[i] = getAffineDimExpr(i - num, context);
}
for (unsigned i = getNumDimIds(), e = getNumDimAndSymbolIds(); i < e; i++)
memo[i] = getAffineSymbolExpr(i - getNumDimIds(), context);
@@ -1578,7 +1593,7 @@ void FlatAffineConstraints::getSliceBounds(unsigned num, MLIRContext *context,
for (unsigned pos = 0; pos < num; pos++) {
unsigned numMapDims = getNumDimIds() - num;
unsigned numMapSymbols = getNumSymbolIds();
AffineExpr expr = memo[pos];
AffineExpr expr = memo[pos + offset];
if (expr)
expr = simplifyAffineExpr(expr, numMapDims, numMapSymbols);
@@ -1601,7 +1616,7 @@ void FlatAffineConstraints::getSliceBounds(unsigned num, MLIRContext *context,
tmpClone->removeRedundantInequalities();
}
std::tie(lbMap, ubMap) = tmpClone->getLowerAndUpperBound(
pos, num, getNumDimIds(), {}, context);
pos, offset, num, getNumDimIds(), {}, context);
}
// If the above fails, we'll just use the constant lower bound and the
@@ -1612,7 +1627,7 @@ void FlatAffineConstraints::getSliceBounds(unsigned num, MLIRContext *context,
if (!lbMap || lbMap.getNumResults() > 1) {
LLVM_DEBUG(llvm::dbgs()
<< "WARNING: Potentially over-approximating slice lb\n");
auto lbConst = getConstantLowerBound(pos);
auto lbConst = getConstantLowerBound(pos + offset);
if (lbConst.hasValue()) {
lbMap = AffineMap::get(
numMapDims, numMapSymbols,
@@ -1622,7 +1637,7 @@ void FlatAffineConstraints::getSliceBounds(unsigned num, MLIRContext *context,
if (!ubMap || ubMap.getNumResults() > 1) {
LLVM_DEBUG(llvm::dbgs()
<< "WARNING: Potentially over-approximating slice ub\n");
auto ubConst = getConstantUpperBound(pos);
auto ubConst = getConstantUpperBound(pos + offset);
if (ubConst.hasValue()) {
(ubMap) = AffineMap::get(
numMapDims, numMapSymbols,
@@ -1630,9 +1645,11 @@ void FlatAffineConstraints::getSliceBounds(unsigned num, MLIRContext *context,
}
}
}
LLVM_DEBUG(llvm::dbgs() << "lb map for pos = " << Twine(pos) << ", expr: ");
LLVM_DEBUG(llvm::dbgs()
<< "lb map for pos = " << Twine(pos + offset) << ", expr: ");
LLVM_DEBUG(lbMap.dump(););
LLVM_DEBUG(llvm::dbgs() << "ub map for pos = " << Twine(pos) << ", expr: ");
LLVM_DEBUG(llvm::dbgs()
<< "ub map for pos = " << Twine(pos + offset) << ", expr: ");
LLVM_DEBUG(ubMap.dump(););
}
}


@@ -504,48 +504,84 @@ LogicalResult addMissingLoopIVBounds(SmallPtrSet<Value *, 8> &ivs,
return success();
}
// Returns the innermost common loop depth for the set of operations in 'ops'.
// TODO(andydavis) Move this to LoopUtils.
static unsigned
getInnermostCommonLoopDepth(ArrayRef<Operation *> ops,
SmallVectorImpl<AffineForOp> &surroundingLoops) {
unsigned numOps = ops.size();
assert(numOps > 0);
std::vector<SmallVector<AffineForOp, 4>> loops(numOps);
unsigned loopDepthLimit = std::numeric_limits<unsigned>::max();
for (unsigned i = 0; i < numOps; ++i) {
getLoopIVs(*ops[i], &loops[i]);
loopDepthLimit =
std::min(loopDepthLimit, static_cast<unsigned>(loops[i].size()));
}
unsigned loopDepth = 0;
for (unsigned d = 0; d < loopDepthLimit; ++d) {
unsigned i;
for (i = 1; i < numOps; ++i) {
if (loops[i - 1][d] != loops[i][d])
return loopDepth;
}
surroundingLoops.push_back(loops[i - 1][d]);
++loopDepth;
}
return loopDepth;
}
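For example, if one op is nested under loops (%i0, %i1) and another under (%i0, %i2), the innermost common loop depth is 1 and 'surroundingLoops' holds only the loop defining %i0.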
/// Computes in 'sliceUnion' the union of all slice bounds computed at
/// 'dstLoopDepth' between all pairs in 'srcOps' and 'dstOp' which access the
/// same memref. Returns 'Success' if union was computed, 'failure' otherwise.
LogicalResult mlir::computeSliceUnion(ArrayRef<Operation *> srcOps,
ArrayRef<Operation *> dstOps,
unsigned dstLoopDepth,
/// 'loopDepth' between all dependent pairs of ops in 'opsA' and 'opsB'.
/// Returns 'Success' if union was computed, 'failure' otherwise.
LogicalResult mlir::computeSliceUnion(ArrayRef<Operation *> opsA,
ArrayRef<Operation *> opsB,
unsigned loopDepth,
unsigned numCommonLoops,
bool isBackwardSlice,
ComputationSliceState *sliceUnion) {
unsigned numSrcOps = srcOps.size();
unsigned numDstOps = dstOps.size();
assert(numSrcOps > 0 && numDstOps > 0);
// Compute the intersection of 'srcMemrefToOps' and 'dstMemrefToOps'.
llvm::SmallDenseSet<Value *> memrefIntersection;
for (auto *srcOp : srcOps) {
auto *srcMemRef = getLoadOrStoreMemRef(srcOp);
for (auto *dstOp : dstOps) {
if (srcMemRef == getLoadOrStoreMemRef(dstOp))
memrefIntersection.insert(srcMemRef);
}
}
// Return failure if 'memrefIntersection' is empty.
if (memrefIntersection.empty())
return failure();
// Compute the union of slice bounds between all pairs in 'srcOps' and
// 'dstOps' in 'sliceUnionCst'.
// Compute the union of slice bounds between all pairs in 'opsA' and
// 'opsB' in 'sliceUnionCst'.
FlatAffineConstraints sliceUnionCst;
assert(sliceUnionCst.getNumDimAndSymbolIds() == 0);
for (unsigned i = 0; i < numSrcOps; ++i) {
MemRefAccess srcAccess(srcOps[i]);
for (unsigned j = 0; j < numDstOps; ++j) {
MemRefAccess dstAccess(dstOps[j]);
std::vector<std::pair<Operation *, Operation *>> dependentOpPairs;
for (unsigned i = 0, numOpsA = opsA.size(); i < numOpsA; ++i) {
MemRefAccess srcAccess(opsA[i]);
for (unsigned j = 0, numOpsB = opsB.size(); j < numOpsB; ++j) {
MemRefAccess dstAccess(opsB[j]);
if (srcAccess.memref != dstAccess.memref)
continue;
// Compute slice bounds for 'srcAccess' and 'dstAccess'.
ComputationSliceState tmpSliceState;
if (failed(mlir::getBackwardComputationSliceState(
srcAccess, dstAccess, dstLoopDepth, &tmpSliceState))) {
LLVM_DEBUG(llvm::dbgs() << "Unable to compute slice bounds\n.");
// Check if 'loopDepth' exceeds nesting depth of src/dst ops.
if ((!isBackwardSlice && loopDepth > getNestingDepth(*opsA[i])) ||
(isBackwardSlice && loopDepth > getNestingDepth(*opsB[j]))) {
LLVM_DEBUG(llvm::dbgs() << "Invalid loop depth\n.");
return failure();
}
bool readReadAccesses =
isa<LoadOp>(srcAccess.opInst) && isa<LoadOp>(dstAccess.opInst);
FlatAffineConstraints dependenceConstraints;
// Check dependence between 'srcAccess' and 'dstAccess'.
DependenceResult result = checkMemrefAccessDependence(
srcAccess, dstAccess, /*loopDepth=*/numCommonLoops + 1,
&dependenceConstraints, /*dependenceComponents=*/nullptr,
/*allowRAR=*/readReadAccesses);
if (result.value == DependenceResult::Failure) {
LLVM_DEBUG(llvm::dbgs() << "Dependence check failed\n.");
return failure();
}
if (result.value == DependenceResult::NoDependence)
continue;
dependentOpPairs.push_back({opsA[i], opsB[j]});
// Compute slice bounds for 'srcAccess' and 'dstAccess'.
ComputationSliceState tmpSliceState;
mlir::getComputationSliceState(opsA[i], opsB[j], &dependenceConstraints,
loopDepth, isBackwardSlice,
&tmpSliceState);
if (sliceUnionCst.getNumDimAndSymbolIds() == 0) {
// Initialize 'sliceUnionCst' with the bounds computed in previous step.
if (failed(tmpSliceState.getAsConstraints(&sliceUnionCst))) {
@@ -599,116 +635,147 @@ LogicalResult mlir::computeSliceUnion(ArrayRef<Operation *> srcOps,
}
}
// Store 'numSrcLoopIvs' before converting dst loop IVs to dims.
unsigned numSrcLoopIVs = sliceUnionCst.getNumDimIds();
// Empty union.
if (sliceUnionCst.getNumDimAndSymbolIds() == 0)
return failure();
// Gather loops surrounding ops from loop nest where slice will be inserted.
SmallVector<Operation *, 4> ops;
for (auto &dep : dependentOpPairs) {
ops.push_back(isBackwardSlice ? dep.second : dep.first);
}
SmallVector<AffineForOp, 4> surroundingLoops;
unsigned innermostCommonLoopDepth =
getInnermostCommonLoopDepth(ops, surroundingLoops);
if (loopDepth > innermostCommonLoopDepth) {
LLVM_DEBUG(llvm::dbgs() << "Exceeds max loop depth\n.");
return failure();
}
// Store 'numSliceLoopIVs' before converting dst loop IVs to dims.
unsigned numSliceLoopIVs = sliceUnionCst.getNumDimIds();
// Convert any dst loop IVs which are symbol identifiers to dim identifiers.
sliceUnionCst.convertLoopIVSymbolsToDims();
sliceUnion->clearBounds();
sliceUnion->lbs.resize(numSrcLoopIVs, AffineMap());
sliceUnion->ubs.resize(numSrcLoopIVs, AffineMap());
sliceUnion->lbs.resize(numSliceLoopIVs, AffineMap());
sliceUnion->ubs.resize(numSliceLoopIVs, AffineMap());
// Get slice bounds from slice union constraints 'sliceUnionCst'.
sliceUnionCst.getSliceBounds(numSrcLoopIVs, srcOps[0]->getContext(),
&sliceUnion->lbs, &sliceUnion->ubs);
sliceUnionCst.getSliceBounds(/*offset=*/0, numSliceLoopIVs,
opsA[0]->getContext(), &sliceUnion->lbs,
&sliceUnion->ubs);
// Add slice bound operands of union.
SmallVector<Value *, 4> sliceBoundOperands;
sliceUnionCst.getIdValues(numSrcLoopIVs,
sliceUnionCst.getIdValues(numSliceLoopIVs,
sliceUnionCst.getNumDimAndSymbolIds(),
&sliceBoundOperands);
// Copy src loop IVs from 'sliceUnionCst' to 'sliceUnion'.
sliceUnion->ivs.clear();
sliceUnionCst.getIdValues(0, numSrcLoopIVs, &sliceUnion->ivs);
sliceUnionCst.getIdValues(0, numSliceLoopIVs, &sliceUnion->ivs);
// Set loop nest insertion point to block start at 'loopDepth'.
sliceUnion->insertPoint =
isBackwardSlice
? surroundingLoops[loopDepth - 1].getBody()->begin()
: std::prev(surroundingLoops[loopDepth - 1].getBody()->end());
// Give each bound its own copy of 'sliceBoundOperands' for subsequent
// canonicalization.
sliceUnion->lbOperands.resize(numSrcLoopIVs, sliceBoundOperands);
sliceUnion->ubOperands.resize(numSrcLoopIVs, sliceBoundOperands);
sliceUnion->lbOperands.resize(numSliceLoopIVs, sliceBoundOperands);
sliceUnion->ubOperands.resize(numSliceLoopIVs, sliceBoundOperands);
return success();
}
const char *const kSliceFusionBarrierAttrName = "slice_fusion_barrier";
// Computes memref dependence between 'srcAccess' and 'dstAccess', projects
// out any dst loop IVs at depth greater than 'dstLoopDepth', and computes slice
// bounds in 'sliceState' which represent the src IVs in terms of the dst IVs,
// symbols and constants.
LogicalResult mlir::getBackwardComputationSliceState(
const MemRefAccess &srcAccess, const MemRefAccess &dstAccess,
unsigned dstLoopDepth, ComputationSliceState *sliceState) {
bool readReadAccesses =
isa<LoadOp>(srcAccess.opInst) && isa<LoadOp>(dstAccess.opInst);
FlatAffineConstraints dependenceConstraints;
DependenceResult result = checkMemrefAccessDependence(
srcAccess, dstAccess, /*loopDepth=*/1, &dependenceConstraints,
/*dependenceComponents=*/nullptr, /*allowRAR=*/readReadAccesses);
if (!hasDependence(result)) {
return failure();
}
// Computes slice bounds by projecting out any loop IVs from
// 'dependenceConstraints' at depth greater than 'loopDepth', and computes
// slice bounds in 'sliceState' which represent one loop nest's IVs in terms of
// the other loop nest's IVs, symbols, and constants (using 'isBackwardSlice').
void mlir::getComputationSliceState(
Operation *depSourceOp, Operation *depSinkOp,
FlatAffineConstraints *dependenceConstraints, unsigned loopDepth,
bool isBackwardSlice, ComputationSliceState *sliceState) {
// Get loop nest surrounding src operation.
SmallVector<AffineForOp, 4> srcLoopIVs;
getLoopIVs(*srcAccess.opInst, &srcLoopIVs);
getLoopIVs(*depSourceOp, &srcLoopIVs);
unsigned numSrcLoopIVs = srcLoopIVs.size();
// Get loop nest surrounding dst operation.
SmallVector<AffineForOp, 4> dstLoopIVs;
getLoopIVs(*dstAccess.opInst, &dstLoopIVs);
getLoopIVs(*depSinkOp, &dstLoopIVs);
unsigned numDstLoopIVs = dstLoopIVs.size();
if (dstLoopDepth > numDstLoopIVs) {
dstAccess.opInst->emitError("invalid destination loop depth");
return failure();
}
// Project out dimensions other than those up to 'dstLoopDepth'.
dependenceConstraints.projectOut(numSrcLoopIVs + dstLoopDepth,
numDstLoopIVs - dstLoopDepth);
assert((!isBackwardSlice && loopDepth <= numSrcLoopIVs) ||
(isBackwardSlice && loopDepth <= numDstLoopIVs));
// Add src loop IV values to 'sliceState'.
dependenceConstraints.getIdValues(0, numSrcLoopIVs, &sliceState->ivs);
// Project out dimensions other than those up to 'loopDepth'.
unsigned pos = isBackwardSlice ? numSrcLoopIVs + loopDepth : loopDepth;
unsigned num =
isBackwardSlice ? numDstLoopIVs - loopDepth : numSrcLoopIVs - loopDepth;
dependenceConstraints->projectOut(pos, num);
// Add slice loop IV values to 'sliceState'.
unsigned offset = isBackwardSlice ? 0 : loopDepth;
unsigned numSliceLoopIVs = isBackwardSlice ? numSrcLoopIVs : numDstLoopIVs;
dependenceConstraints->getIdValues(offset, offset + numSliceLoopIVs,
&sliceState->ivs);
// Set up lower/upper bound affine maps for the slice.
sliceState->lbs.resize(numSrcLoopIVs, AffineMap());
sliceState->ubs.resize(numSrcLoopIVs, AffineMap());
sliceState->lbs.resize(numSliceLoopIVs, AffineMap());
sliceState->ubs.resize(numSliceLoopIVs, AffineMap());
// Get bounds for src IVs in terms of dst IVs, symbols, and constants.
dependenceConstraints.getSliceBounds(numSrcLoopIVs,
srcAccess.opInst->getContext(),
// Get bounds for slice IVs in terms of other IVs, symbols, and constants.
dependenceConstraints->getSliceBounds(offset, numSliceLoopIVs,
depSourceOp->getContext(),
&sliceState->lbs, &sliceState->ubs);
// Set up bound operands for the slice's lower and upper bounds.
SmallVector<Value *, 4> sliceBoundOperands;
dependenceConstraints.getIdValues(
numSrcLoopIVs, dependenceConstraints.getNumDimAndSymbolIds(),
&sliceBoundOperands);
unsigned numDimsAndSymbols = dependenceConstraints->getNumDimAndSymbolIds();
for (unsigned i = 0; i < numDimsAndSymbols; ++i) {
if (i < offset || i >= offset + numSliceLoopIVs) {
sliceBoundOperands.push_back(dependenceConstraints->getIdValue(i));
}
}
// Give each bound its own copy of 'sliceBoundOperands' for subsequent
// canonicalization.
sliceState->lbOperands.resize(numSrcLoopIVs, sliceBoundOperands);
sliceState->ubOperands.resize(numSrcLoopIVs, sliceBoundOperands);
sliceState->lbOperands.resize(numSliceLoopIVs, sliceBoundOperands);
sliceState->ubOperands.resize(numSliceLoopIVs, sliceBoundOperands);
// Set the slice insertion point to the block start at 'loopDepth' in the
// target loop nest.
sliceState->insertPoint =
isBackwardSlice ? dstLoopIVs[loopDepth - 1].getBody()->begin()
: std::prev(srcLoopIVs[loopDepth - 1].getBody()->end());
llvm::SmallDenseSet<Value *, 8> sequentialLoops;
if (readReadAccesses) {
if (isa<LoadOp>(depSourceOp) && isa<LoadOp>(depSinkOp)) {
// For read-read access pairs, clear any slice bounds on sequential loops.
// Get sequential loops in loop nest rooted at 'srcLoopIVs[0]'.
getSequentialLoops(srcLoopIVs[0], &sequentialLoops);
getSequentialLoops(isBackwardSlice ? srcLoopIVs[0] : dstLoopIVs[0],
&sequentialLoops);
}
// Clear all sliced loop bounds beginning at the first sequential loop or the
// first loop with a slice fusion barrier attribute.
// TODO(andydavis, bondhugula) Use MemRef read/write regions instead of
// using 'kSliceFusionBarrierAttrName'.
for (unsigned i = 0; i < numSrcLoopIVs; ++i) {
Value *iv = srcLoopIVs[i].getInductionVar();
auto getSliceLoop = [&](unsigned i) {
return isBackwardSlice ? srcLoopIVs[i] : dstLoopIVs[i];
};
for (unsigned i = 0; i < numSliceLoopIVs; ++i) {
Value *iv = getSliceLoop(i).getInductionVar();
if (sequentialLoops.count(iv) == 0 &&
srcLoopIVs[i].getAttr(kSliceFusionBarrierAttrName) == nullptr)
getSliceLoop(i).getAttr(kSliceFusionBarrierAttrName) == nullptr)
continue;
for (unsigned j = i; j < numSrcLoopIVs; ++j) {
for (unsigned j = i; j < numSliceLoopIVs; ++j) {
sliceState->lbs[j] = AffineMap();
sliceState->ubs[j] = AffineMap();
}
break;
}
return success();
}
/// Creates a computation slice of the loop nest surrounding 'srcOpInst',


@@ -1329,7 +1329,9 @@ static bool isFusionProfitable(Operation *srcOpInst, Operation *srcStoreOpInst,
for (unsigned i = maxDstLoopDepth; i >= 1; --i) {
// Compute the union of slice bounds of all ops in 'dstLoadOpInsts'.
if (failed(mlir::computeSliceUnion({srcOpInst}, dstLoadOpInsts,
/*dstLoopDepth=*/i,
/*loopDepth=*/i,
/*numCommonLoops=*/0,
/*isBackwardSlice=*/true,
&sliceStates[i - 1]))) {
LLVM_DEBUG(llvm::dbgs()
<< "computeSliceUnion failed for loopDepth: " << i << "\n");
@@ -1736,15 +1738,16 @@ public:
dstLoadOpInsts, dstStoreOpInsts, &sliceState,
&bestDstLoopDepth, maximalFusion))
continue;
// TODO(andydavis) Remove assert and surrounding code when
// canFuseLoops is fully functional.
// TODO(andydavis) Remove the following test code when canFuseLoops
// is fully functional.
mlir::ComputationSliceState sliceUnion;
if (!maximalFusion) {
FusionResult result = mlir::canFuseLoops(
cast<AffineForOp>(srcNode->op), cast<AffineForOp>(dstNode->op),
bestDstLoopDepth, &sliceUnion);
assert(result.value == FusionResult::Success);
(void)result;
}
// Fuse computation slice of 'srcLoopNest' into 'dstLoopNest'.
auto sliceLoopNest = mlir::insertBackwardComputationSlice(
srcStoreOpInst, dstLoadOpInsts[0], bestDstLoopDepth, &sliceState);


@@ -45,6 +45,11 @@ static llvm::cl::opt<bool> clTestDependenceCheck(
llvm::cl::desc("Enable testing of loop fusion dependence check"),
llvm::cl::cat(clOptionsCategory));
static llvm::cl::opt<bool> clTestSliceComputation(
"test-loop-fusion-slice-computation",
llvm::cl::desc("Enable testing of loop fusion slice computation"),
llvm::cl::cat(clOptionsCategory));
namespace {
struct TestLoopFusion : public FunctionPass<TestLoopFusion> {
@@ -70,21 +75,75 @@ gatherLoops(Block *block, unsigned currLoopDepth,
}
}
// Run fusion dependence check on 'loops[i]' and 'loops[j]' at 'loopDepth'.
// Run fusion dependence check on 'loops[i]' and 'loops[j]' at loop depths
// in range ['loopDepth' + 1, 'maxLoopDepth'].
// Emits a remark on 'loops[i]' if a fusion-preventing dependence exists.
static void testDependenceCheck(SmallVector<AffineForOp, 2> &loops, unsigned i,
unsigned j, unsigned loopDepth) {
unsigned j, unsigned loopDepth,
unsigned maxLoopDepth) {
AffineForOp srcForOp = loops[i];
AffineForOp dstForOp = loops[j];
mlir::ComputationSliceState sliceUnion;
// TODO(andydavis) Test at deeper loop depths current loop depth + 1.
for (unsigned d = loopDepth + 1; d <= maxLoopDepth; ++d) {
FusionResult result =
mlir::canFuseLoops(srcForOp, dstForOp, loopDepth + 1, &sliceUnion);
mlir::canFuseLoops(srcForOp, dstForOp, d, &sliceUnion);
if (result.value == FusionResult::FailBlockDependence) {
srcForOp.getOperation()->emitRemark("block-level dependence preventing"
" fusion of loop nest ")
<< i << " into loop nest " << j << " at depth " << loopDepth;
}
}
}
// Returns the index of 'op' in its block.
static unsigned getBlockIndex(Operation &op) {
unsigned index = 0;
for (auto &opX : *op.getBlock()) {
if (&op == &opX)
break;
++index;
}
return index;
}
// Returns a string representation of 'sliceUnion'.
static std::string getSliceStr(const mlir::ComputationSliceState &sliceUnion) {
std::string result;
llvm::raw_string_ostream os(result);
// Slice insertion point format [loop-depth, operation-block-index]
unsigned ipd = getNestingDepth(*sliceUnion.insertPoint);
unsigned ipb = getBlockIndex(*sliceUnion.insertPoint);
os << "insert point: (" << std::to_string(ipd) << ", " << std::to_string(ipb)
<< ")";
assert(sliceUnion.lbs.size() == sliceUnion.ubs.size());
os << " loop bounds: ";
for (unsigned k = 0, e = sliceUnion.lbs.size(); k < e; ++k) {
os << '[';
sliceUnion.lbs[k].print(os);
os << ", ";
sliceUnion.ubs[k].print(os);
os << "] ";
}
return os.str();
}
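For reference, a single sliced IV whose bounds are (d0) -> (d0) and (d0) -> (d0 + 1), with an insertion point at loop depth 1 and block index 0, prints as (see the test cases added below):

  insert point: (1, 0) loop bounds: [(d0) -> (d0), (d0) -> (d0 + 1)]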
// Computes fusion slice union on 'loops[i]' and 'loops[j]' at loop depths
// in range ['loopDepth' + 1, 'maxLoopDepth'].
// Emits a string representation of the slice union as a remark on 'loops[j]'.
static void testSliceComputation(SmallVector<AffineForOp, 2> &loops, unsigned i,
unsigned j, unsigned loopDepth,
unsigned maxLoopDepth) {
AffineForOp forOpA = loops[i];
AffineForOp forOpB = loops[j];
for (unsigned d = loopDepth + 1; d <= maxLoopDepth; ++d) {
mlir::ComputationSliceState sliceUnion;
FusionResult result = mlir::canFuseLoops(forOpA, forOpB, d, &sliceUnion);
if (result.value == FusionResult::Success) {
forOpB.getOperation()->emitRemark("slice (")
<< " src loop: " << i << ", dst loop: " << j << ", depth: " << d
<< " : " << getSliceStr(sliceUnion) << ")";
}
}
}
void TestLoopFusion::runOnFunction() {
@@ -104,7 +163,9 @@ void TestLoopFusion::runOnFunction() {
if (j == k)
continue;
if (clTestDependenceCheck)
testDependenceCheck(loops, j, k, loopDepth);
testDependenceCheck(loops, j, k, loopDepth, depthToLoops.size());
if (clTestSliceComputation)
testSliceComputation(loops, j, k, loopDepth, depthToLoops.size());
}
}
}


@@ -192,11 +192,7 @@ gatherLoadsAndStores(AffineForOp forOp,
return !hasIfOp;
}
// TODO(andydavis) Add support for the following features in subsequent CLs:
// *) Compute dependences of unfused src/dst loops.
// *) Compute dependences of src/dst loop as if they were fused.
// *) Check for fusion preventing dependences (e.g. a dependence which changes
// from loop-independent to backward loop-carried after fusion).
// TODO(andydavis) Prevent fusion of loop nests with side-effecting operations.
FusionResult mlir::canFuseLoops(AffineForOp srcForOp, AffineForOp dstForOp,
unsigned dstLoopDepth,
ComputationSliceState *srcSlice) {
@@ -219,24 +215,35 @@ FusionResult mlir::canFuseLoops(AffineForOp srcForOp, AffineForOp dstForOp,
return FusionResult::FailBlockDependence;
}
// Gather all load and store ops in 'srcForOp'.
SmallVector<Operation *, 4> srcLoadAndStoreOps;
if (!gatherLoadsAndStores(srcForOp, srcLoadAndStoreOps)) {
// Check if 'srcForOp' precedes 'dstForOp' in 'block'.
bool isSrcForOpBeforeDstForOp =
srcForOp.getOperation()->isBeforeInBlock(dstForOp.getOperation());
// 'forOpA' executes before 'forOpB' in 'block'.
auto forOpA = isSrcForOpBeforeDstForOp ? srcForOp : dstForOp;
auto forOpB = isSrcForOpBeforeDstForOp ? dstForOp : srcForOp;
// Gather all load and store ops from 'forOpA', which precedes 'forOpB' in 'block'.
SmallVector<Operation *, 4> opsA;
if (!gatherLoadsAndStores(forOpA, opsA)) {
LLVM_DEBUG(llvm::dbgs() << "Fusing loops with affine.if unsupported.\n.");
return FusionResult::FailPrecondition;
}
// Gather all load and store ops in 'dstForOp'.
SmallVector<Operation *, 4> dstLoadAndStoreOps;
if (!gatherLoadsAndStores(dstForOp, dstLoadAndStoreOps)) {
// Gather all load and store ops from 'forOpB', which succeeds 'forOpA' in 'block'.
SmallVector<Operation *, 4> opsB;
if (!gatherLoadsAndStores(forOpB, opsB)) {
LLVM_DEBUG(llvm::dbgs() << "Fusing loops with affine.if unsupported.\n.");
return FusionResult::FailPrecondition;
}
// Compute union of computation slices computed from all pairs in
// {'srcLoadAndStoreOps', 'dstLoadAndStoreOps'}.
if (failed(mlir::computeSliceUnion(srcLoadAndStoreOps, dstLoadAndStoreOps,
dstLoopDepth, srcSlice))) {
// Calculate the number of common loops surrounding 'srcForOp' and 'dstForOp'.
unsigned numCommonLoops = mlir::getNumCommonSurroundingLoops(
*srcForOp.getOperation(), *dstForOp.getOperation());
// Compute union of computation slices computed between all pairs of ops
// from 'forOpA' and 'forOpB'.
if (failed(mlir::computeSliceUnion(opsA, opsB, dstLoopDepth, numCommonLoops,
isSrcForOpBeforeDstForOp, srcSlice))) {
LLVM_DEBUG(llvm::dbgs() << "computeSliceUnion failed\n");
return FusionResult::FailPrecondition;
}
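A usage sketch of canFuseLoops, mirroring testSliceComputation in the test pass above; the wrapper and the assumption that 'srcForOp', 'dstForOp', and the depth 'd' are provided are illustrative:

static void canFuseSketch(AffineForOp srcForOp, AffineForOp dstForOp,
                          unsigned d) {
  mlir::ComputationSliceState sliceUnion;
  FusionResult result = mlir::canFuseLoops(srcForOp, dstForOp,
                                           /*dstLoopDepth=*/d, &sliceUnion);
  if (result.value == FusionResult::Success) {
    // 'sliceUnion' holds the union of slice bounds for fusing 'srcForOp'
    // into 'dstForOp' at loop depth 'd'.
  }
}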


@@ -0,0 +1,145 @@
// RUN: mlir-opt %s -test-loop-fusion -test-loop-fusion-slice-computation -split-input-file -verify | FileCheck %s
// -----
// CHECK-LABEL: func @slice_depth1_loop_nest() {
func @slice_depth1_loop_nest() {
%0 = alloc() : memref<100xf32>
%cst = constant 7.000000e+00 : f32
affine.for %i0 = 0 to 16 {
// expected-remark@-1 {{slice ( src loop: 1, dst loop: 0, depth: 1 : insert point: (1, 1) loop bounds: [(d0) -> (d0), (d0) -> (d0 + 1)] )}}
store %cst, %0[%i0] : memref<100xf32>
}
affine.for %i1 = 0 to 5 {
// expected-remark@-1 {{slice ( src loop: 0, dst loop: 1, depth: 1 : insert point: (1, 0) loop bounds: [(d0) -> (d0), (d0) -> (d0 + 1)] )}}
%1 = load %0[%i1] : memref<100xf32>
}
return
}
// -----
// Loop %i0 writes to locations [2, 17] and loop %i1 reads from locations [3, 6].
// Slice loop bounds should be adjusted such that the load/store are for the
// same location.
// CHECK-LABEL: func @slice_depth1_loop_nest_with_offsets() {
func @slice_depth1_loop_nest_with_offsets() {
%0 = alloc() : memref<100xf32>
%cst = constant 7.000000e+00 : f32
affine.for %i0 = 0 to 16 {
// expected-remark@-1 {{slice ( src loop: 1, dst loop: 0, depth: 1 : insert point: (1, 2) loop bounds: [(d0) -> (d0 + 3), (d0) -> (d0 + 4)] )}}
%a0 = affine.apply (d0) -> (d0 + 2)(%i0)
store %cst, %0[%a0] : memref<100xf32>
}
affine.for %i1 = 4 to 8 {
// expected-remark@-1 {{slice ( src loop: 0, dst loop: 1, depth: 1 : insert point: (1, 0) loop bounds: [(d0) -> (d0 - 3), (d0) -> (d0 - 2)] )}}
%a1 = affine.apply (d0) -> (d0 - 1)(%i1)
%1 = load %0[%a1] : memref<100xf32>
}
return
}
// -----
// Slices at loop depth 1 should only slice the loop bounds of the first loop.
// Slices at loop depth 2 should slice loop bounds of both loops.
// CHECK-LABEL: func @slice_depth2_loop_nest() {
func @slice_depth2_loop_nest() {
%0 = alloc() : memref<100x100xf32>
%cst = constant 7.000000e+00 : f32
affine.for %i0 = 0 to 16 {
// expected-remark@-1 {{slice ( src loop: 1, dst loop: 0, depth: 1 : insert point: (1, 1) loop bounds: [(d0) -> (d0), (d0) -> (d0 + 1)] [(d0) -> (0), (d0) -> (8)] )}}
// expected-remark@-2 {{slice ( src loop: 1, dst loop: 0, depth: 2 : insert point: (2, 1) loop bounds: [(d0, d1) -> (d0), (d0, d1) -> (d0 + 1)] [(d0, d1) -> (d1), (d0, d1) -> (d1 + 1)] )}}
affine.for %i1 = 0 to 16 {
store %cst, %0[%i0, %i1] : memref<100x100xf32>
}
}
affine.for %i2 = 0 to 10 {
// expected-remark@-1 {{slice ( src loop: 0, dst loop: 1, depth: 1 : insert point: (1, 0) loop bounds: [(d0) -> (d0), (d0) -> (d0 + 1)] [(d0) -> (0), (d0) -> (8)] )}}
// expected-remark@-2 {{slice ( src loop: 0, dst loop: 1, depth: 2 : insert point: (2, 0) loop bounds: [(d0, d1) -> (d0), (d0, d1) -> (d0 + 1)] [(d0, d1) -> (d1), (d0, d1) -> (d1 + 1)] )}}
affine.for %i3 = 0 to 8 {
%1 = load %0[%i2, %i3] : memref<100x100xf32>
}
}
return
}
// -----
// The load at depth 1 in loop nest %i2 prevents slicing loop nest %i0 at depths
// greater than 1. However, loop nest %i2 can be sliced into loop nest %i0 at
// depths 1 and 2 because the dependent store in loop nest %i0 is at depth 2.
// CHECK-LABEL: func @slice_depth2_loop_nest_two_loads() {
func @slice_depth2_loop_nest_two_loads() {
%0 = alloc() : memref<100x100xf32>
%c0 = constant 0 : index
%cst = constant 7.000000e+00 : f32
affine.for %i0 = 0 to 16 {
// expected-remark@-1 {{slice ( src loop: 1, dst loop: 0, depth: 1 : insert point: (1, 1) loop bounds: [(d0)[s0] -> (d0), (d0)[s0] -> (d0 + 1)] [(d0)[s0] -> (0), (d0)[s0] -> (8)] )}}
// expected-remark@-2 {{slice ( src loop: 1, dst loop: 0, depth: 2 : insert point: (2, 1) loop bounds: [(d0, d1)[s0] -> (d0), (d0, d1)[s0] -> (d0 + 1)] [(d0, d1)[s0] -> (0), (d0, d1)[s0] -> (8)] )}}
affine.for %i1 = 0 to 16 {
store %cst, %0[%i0, %i1] : memref<100x100xf32>
}
}
affine.for %i2 = 0 to 10 {
// expected-remark@-1 {{slice ( src loop: 0, dst loop: 1, depth: 1 : insert point: (1, 0) loop bounds: [(d0)[s0] -> (d0), (d0)[s0] -> (d0 + 1)] [(d0)[s0] -> (0), (d0)[s0] -> (8)] )}}
affine.for %i3 = 0 to 8 {
%1 = load %0[%i2, %i3] : memref<100x100xf32>
}
%2 = load %0[%i2, %c0] : memref<100x100xf32>
}
return
}
// -----
// The store at depth 1 in loop nest %i0 prevents slicing loop nest %i2 at
// depths greater than 1 into loop nest %i0. However, loop nest %i0 can be
// sliced into loop nest %i2 at depths 1 and 2 because the dependent load in
// loop nest %i2 is at depth 2.
// CHECK-LABEL: func @slice_depth2_loop_nest_two_stores() {
func @slice_depth2_loop_nest_two_stores() {
%0 = alloc() : memref<100x100xf32>
%c0 = constant 0 : index
%cst = constant 7.000000e+00 : f32
affine.for %i0 = 0 to 16 {
// expected-remark@-1 {{slice ( src loop: 1, dst loop: 0, depth: 1 : insert point: (1, 2) loop bounds: [(d0)[s0] -> (d0), (d0)[s0] -> (d0 + 1)] [(d0)[s0] -> (0), (d0)[s0] -> (8)] )}}
affine.for %i1 = 0 to 16 {
store %cst, %0[%i0, %i1] : memref<100x100xf32>
}
store %cst, %0[%i0, %c0] : memref<100x100xf32>
}
affine.for %i2 = 0 to 10 {
// expected-remark@-1 {{slice ( src loop: 0, dst loop: 1, depth: 1 : insert point: (1, 0) loop bounds: [(d0)[s0] -> (d0), (d0)[s0] -> (d0 + 1)] [(d0)[s0] -> (0), (d0)[s0] -> (16)] )}}
// expected-remark@-2 {{slice ( src loop: 0, dst loop: 1, depth: 2 : insert point: (2, 0) loop bounds: [(d0, d1)[s0] -> (d0), (d0, d1)[s0] -> (d0 + 1)] [(d0, d1)[s0] -> (0), (d0, d1)[s0] -> (16)] )}}
affine.for %i3 = 0 to 8 {
%1 = load %0[%i2, %i3] : memref<100x100xf32>
}
}
return
}
// -----
// Test loop nest which has a smaller outer trip count than its inner loop.
// CHECK-LABEL: func @slice_loop_nest_with_smaller_outer_trip_count() {
func @slice_loop_nest_with_smaller_outer_trip_count() {
%0 = alloc() : memref<100x100xf32>
%c0 = constant 0 : index
%cst = constant 7.000000e+00 : f32
affine.for %i0 = 0 to 16 {
// expected-remark@-1 {{slice ( src loop: 1, dst loop: 0, depth: 1 : insert point: (1, 1) loop bounds: [(d0) -> (d0), (d0) -> (d0 + 1)] [(d0) -> (0), (d0) -> (10)] )}}
// expected-remark@-2 {{slice ( src loop: 1, dst loop: 0, depth: 2 : insert point: (2, 1) loop bounds: [(d0, d1) -> (d0), (d0, d1) -> (d0 + 1)] [(d0, d1) -> (d1), (d0, d1) -> (d1 + 1)] )}}
affine.for %i1 = 0 to 16 {
store %cst, %0[%i0, %i1] : memref<100x100xf32>
}
}
affine.for %i2 = 0 to 8 {
// expected-remark@-1 {{slice ( src loop: 0, dst loop: 1, depth: 1 : insert point: (1, 0) loop bounds: [(d0) -> (d0), (d0) -> (d0 + 1)] [(d0) -> (0), (d0) -> (10)] )}}
// expected-remark@-2 {{slice ( src loop: 0, dst loop: 1, depth: 2 : insert point: (2, 0) loop bounds: [(d0, d1) -> (d0), (d0, d1) -> (d0 + 1)] [(d0, d1) -> (d1), (d0, d1) -> (d1 + 1)] )}}
affine.for %i3 = 0 to 10 {
%1 = load %0[%i2, %i3] : memref<100x100xf32>
}
}
return
}