From 898cf0e96878530e76a98ebe00c8f9d1492bea7e Mon Sep 17 00:00:00 2001 From: Andy Davis Date: Mon, 17 Jun 2019 09:59:35 -0700 Subject: [PATCH] LoopFusion: adds support for computing forward computation slices, which will enable fusion of consumer loop nests into their producers in subsequent CLs. PiperOrigin-RevId: 253601994 --- mlir/include/mlir/Analysis/AffineStructures.h | 25 +- mlir/include/mlir/Analysis/Utils.h | 72 +++++- mlir/lib/Analysis/AffineStructures.cpp | 55 ++-- mlir/lib/Analysis/Utils.cpp | 243 +++++++++++------- mlir/lib/Transforms/LoopFusion.cpp | 21 +- mlir/lib/Transforms/TestLoopFusion.cpp | 81 +++++- mlir/lib/Transforms/Utils/LoopFusionUtils.cpp | 37 +-- .../loop-fusion-slice-computation.mlir | 145 +++++++++++ 8 files changed, 515 insertions(+), 164 deletions(-) create mode 100644 mlir/test/Transforms/loop-fusion-slice-computation.mlir diff --git a/mlir/include/mlir/Analysis/AffineStructures.h b/mlir/include/mlir/Analysis/AffineStructures.h index d3feb3436ff3..3e2b90d65573 100644 --- a/mlir/include/mlir/Analysis/AffineStructures.h +++ b/mlir/include/mlir/Analysis/AffineStructures.h @@ -393,12 +393,12 @@ public: bool lower = true); /// Computes the lower and upper bounds of the first 'num' dimensional - /// identifiers as an affine map of the remaining identifiers (dimensional and - /// symbolic). This method is able to detect identifiers as floordiv's - /// and mod's of affine expressions of other identifiers with respect to - /// (positive) constants. Sets bound map to a null AffineMap if such a bound - /// can't be found (or yet unimplemented). - void getSliceBounds(unsigned num, MLIRContext *context, + /// identifiers (starting at 'offset') as an affine map of the remaining + /// identifiers (dimensional and symbolic). This method is able to detect + /// identifiers as floordiv's and mod's of affine expressions of other + /// identifiers with respect to (positive) constants. Sets bound map to a + /// null AffineMap if such a bound can't be found (or yet unimplemented). + void getSliceBounds(unsigned offset, unsigned num, MLIRContext *context, SmallVectorImpl *lbMaps, SmallVectorImpl *ubMaps); @@ -648,13 +648,14 @@ public: Optional getConstantUpperBound(unsigned pos) const; /// Gets the lower and upper bound of the pos^th identifier treating - /// [dimStartPos, symbStartPos) as dimensions and [symStartPos, - /// getNumDimAndSymbolIds) as symbols. The returned multi-dimensional maps - /// in the pair represent the max and min of potentially multiple affine - /// expressions. The upper bound is exclusive. 'localExprs' holds pre-computed - /// AffineExpr's for all local identifiers in the system. + /// [0, offset) U [offset + num, symbStartPos) as dimensions and + /// [symStartPos, getNumDimAndSymbolIds) as symbols. The returned + /// multi-dimensional maps in the pair represent the max and min of + /// potentially multiple affine expressions. The upper bound is exclusive. + /// 'localExprs' holds pre-computed AffineExpr's for all local identifiers in + /// the system. 
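+  /// For example, in a system with four dimensional identifiers and one
+  /// symbolic identifier, calling this with 'offset' = 1, 'num' = 2 and
+  /// 'symStartPos' = 4 yields maps in which identifiers 0 and 3 appear as
+  /// dims (d0, d1) and identifier 4 appears as the symbol s0; identifiers 1
+  /// and 2 (the range [offset, offset + num)) do not appear in the results.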
std::pair - getLowerAndUpperBound(unsigned pos, unsigned dimStartPos, + getLowerAndUpperBound(unsigned pos, unsigned offset, unsigned num, unsigned symStartPos, ArrayRef localExprs, MLIRContext *context); diff --git a/mlir/include/mlir/Analysis/Utils.h b/mlir/include/mlir/Analysis/Utils.h index d6bf0c617ae4..5c1f47a13489 100644 --- a/mlir/include/mlir/Analysis/Utils.h +++ b/mlir/include/mlir/Analysis/Utils.h @@ -73,6 +73,8 @@ struct ComputationSliceState { std::vector> lbOperands; // List of upper bound operands (ubOperands[i] are used by 'ubs[i]'). std::vector> ubOperands; + // Slice loop nest insertion point in target loop nest. + Block::iterator insertPoint; // Adds to 'cst' with constraints which represent the slice bounds on 'ivs' // in 'this'. Specifically, the values in 'ivs' are added to 'cst' as dim // identifiers and the values in 'lb/ubOperands' are added as symbols. @@ -85,19 +87,67 @@ struct ComputationSliceState { void clearBounds(); }; -/// Computes computation slice loop bounds for the loop nest surrounding -/// 'srcAccess', where the returned loop bound AffineMaps are functions of -/// loop IVs from the loop nest surrounding 'dstAccess'. -LogicalResult getBackwardComputationSliceState( - const MemRefAccess &srcAccess, const MemRefAccess &dstAccess, - unsigned dstLoopDepth, ComputationSliceState *sliceState); +/// Computes the computation slice loop bounds for one loop nest as affine maps +/// of the other loop nest's IVs and symbols, using 'dependenceConstraints' +/// computed between 'depSourceAccess' and 'depSinkAccess'. +/// If 'isBackwardSlice' is true, a backwards slice is computed in which the +/// slice bounds of loop nest surrounding 'depSourceAccess' are computed in +/// terms of loop IVs and symbols of the loop nest surrounding 'depSinkAccess' +/// at 'loopDepth'. +/// If 'isBackwardSlice' is false, a forward slice is computed in which the +/// slice bounds of loop nest surrounding 'depSinkAccess' are computed in terms +/// of loop IVs and symbols of the loop nest surrounding 'depSourceAccess' at +/// 'loopDepth'. +/// The slice loop bounds and associated operands are returned in 'sliceState'. +// +// Backward slice example: +// +// affine.for %i0 = 0 to 10 { +// store %cst, %0[%i0] : memref<100xf32> // 'depSourceAccess' +// } +// affine.for %i1 = 0 to 10 { +// %v = load %0[%i1] : memref<100xf32> // 'depSinkAccess' +// } +// +// // Backward computation slice of loop nest '%i0'. +// affine.for %i0 = (d0) -> (d0)(%i1) to (d0) -> (d0 + 1)(%i1) { +// store %cst, %0[%i0] : memref<100xf32> // 'depSourceAccess' +// } +// +// Forward slice example: +// +// affine.for %i0 = 0 to 10 { +// store %cst, %0[%i0] : memref<100xf32> // 'depSourceAccess' +// } +// affine.for %i1 = 0 to 10 { +// %v = load %0[%i1] : memref<100xf32> // 'depSinkAccess' +// } +// +// // Forward computation slice of loop nest '%i1'. +// affine.for %i1 = (d0) -> (d0)(%i0) to (d0) -> (d0 + 1)(%i0) { +// %v = load %0[%i1] : memref<100xf32> // 'depSinkAccess' +// } +// +void getComputationSliceState(Operation *depSourceOp, Operation *depSinkOp, + FlatAffineConstraints *dependenceConstraints, + unsigned loopDepth, bool isBackwardSlice, + ComputationSliceState *sliceState); /// Computes in 'sliceUnion' the union of all slice bounds computed at -/// 'dstLoopDepth' between all pairs in 'srcOps' and 'dstOp' which access the -/// same memref. Returns 'success' if union was computed, 'failure' otherwise. 
-LogicalResult computeSliceUnion(ArrayRef srcOps, - ArrayRef dstOps, - unsigned dstLoopDepth, +/// 'loopDepth' between all dependent pairs of ops in 'opsA' and 'opsB'. +/// The parameter 'numCommonLoops' is the number of loops common to the +/// operations in 'opsA' and 'opsB'. +/// If 'isBackwardSlice' is true, computes slice bounds for loop nest +/// surrounding ops in 'opsA', as a function of IVs and symbols of loop nest +/// surrounding ops in 'opsB' at 'loopDepth'. +/// If 'isBackwardSlice' is false, computes slice bounds for loop nest +/// surrounding ops in 'opsB', as a function of IVs and symbols of loop nest +/// surrounding ops in 'opsA' at 'loopDepth'. +/// Returns 'success' if union was computed, 'failure' otherwise. +// TODO(andydavis) Change this API to take 'forOpA'/'forOpB'. +LogicalResult computeSliceUnion(ArrayRef opsA, + ArrayRef opsB, unsigned loopDepth, + unsigned numCommonLoops, bool isBackwardSlice, ComputationSliceState *sliceUnion); /// Creates a clone of the computation contained in the loop nest surrounding diff --git a/mlir/lib/Analysis/AffineStructures.cpp b/mlir/lib/Analysis/AffineStructures.cpp index 41f8e075813f..46e45351d54e 100644 --- a/mlir/lib/Analysis/AffineStructures.cpp +++ b/mlir/lib/Analysis/AffineStructures.cpp @@ -1423,19 +1423,28 @@ void FlatAffineConstraints::removeRedundantInequalities() { } std::pair FlatAffineConstraints::getLowerAndUpperBound( - unsigned pos, unsigned dimStartPos, unsigned symStartPos, + unsigned pos, unsigned offset, unsigned num, unsigned symStartPos, ArrayRef localExprs, MLIRContext *context) { - assert(pos < dimStartPos && "invalid dim start pos"); - assert(symStartPos >= dimStartPos && "invalid sym start pos"); + assert(pos + offset < getNumDimIds() && "invalid dim start pos"); + assert(symStartPos >= (pos + offset) && "invalid sym start pos"); assert(getNumLocalIds() == localExprs.size() && "incorrect local exprs count"); SmallVector lbIndices, ubIndices; - getLowerAndUpperBoundIndices(*this, pos, &lbIndices, &ubIndices); + getLowerAndUpperBoundIndices(*this, pos + offset, &lbIndices, &ubIndices); + + /// Add to 'b' from 'a' in set [0, offset) U [offset + num, symbStartPos). + auto addCoeffs = [&](ArrayRef a, SmallVectorImpl &b) { + b.clear(); + for (unsigned i = 0, e = a.size(); i < e; ++i) { + if (i < offset || i >= offset + num) + b.push_back(a[i]); + } + }; SmallVector lb, ub; SmallVector exprs; - unsigned dimCount = symStartPos - dimStartPos; + unsigned dimCount = symStartPos - num; unsigned symCount = getNumDimAndSymbolIds() - symStartPos; exprs.reserve(lbIndices.size()); // Lower bound expressions. @@ -1444,7 +1453,7 @@ std::pair FlatAffineConstraints::getLowerAndUpperBound( // Extract the lower bound (in terms of other coeff's + const), i.e., if // i - j + 1 >= 0 is the constraint, 'pos' is for i the lower bound is j // - 1. - lb.assign(ineq.begin() + dimStartPos, ineq.end()); + addCoeffs(ineq, lb); std::transform(lb.begin(), lb.end(), lb.begin(), std::negate()); auto expr = mlir::toAffineExpr(lb, dimCount, symCount, localExprs, context); exprs.push_back(expr); @@ -1458,7 +1467,7 @@ std::pair FlatAffineConstraints::getLowerAndUpperBound( for (auto idx : ubIndices) { auto ineq = getInequality(idx); // Extract the upper bound (in terms of other coeff's + const). - ub.assign(ineq.begin() + dimStartPos, ineq.end()); + addCoeffs(ineq, ub); auto expr = mlir::toAffineExpr(ub, dimCount, symCount, localExprs, context); // Upper bound is exclusive. 
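+    // For example, with 'offset' = 1 and 'num' = 2, 'addCoeffs' reduces a
+    // coefficient row [a0, a1, a2, a3, a4, c] to [a0, a3, a4, c], dropping
+    // the columns in [offset, offset + num). The '+ 1' below converts the
+    // inclusive bound extracted from the inequality into the exclusive
+    // (half-open) upper bound form.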
exprs.push_back(expr + 1); @@ -1470,10 +1479,12 @@ std::pair FlatAffineConstraints::getLowerAndUpperBound( } /// Computes the lower and upper bounds of the first 'num' dimensional -/// identifiers as affine maps of the remaining identifiers (dimensional and -/// symbolic identifiers). Local identifiers are themselves explicitly computed -/// as affine functions of other identifiers in this process if needed. -void FlatAffineConstraints::getSliceBounds(unsigned num, MLIRContext *context, +/// identifiers (starting at 'offset') as affine maps of the remaining +/// identifiers (dimensional and symbolic identifiers). Local identifiers are +/// themselves explicitly computed as affine functions of other identifiers in +/// this process if needed. +void FlatAffineConstraints::getSliceBounds(unsigned offset, unsigned num, + MLIRContext *context, SmallVectorImpl *lbMaps, SmallVectorImpl *ubMaps) { assert(num < getNumDimIds() && "invalid range"); @@ -1488,8 +1499,12 @@ void FlatAffineConstraints::getSliceBounds(unsigned num, MLIRContext *context, // Record computed/detected identifiers. SmallVector memo(getNumIds()); // Initialize dimensional and symbolic identifiers. - for (unsigned i = num, e = getNumDimIds(); i < e; i++) - memo[i] = getAffineDimExpr(i - num, context); + for (unsigned i = 0, e = getNumDimIds(); i < e; i++) { + if (i < offset) + memo[i] = getAffineDimExpr(i, context); + else if (i >= offset + num) + memo[i] = getAffineDimExpr(i - num, context); + } for (unsigned i = getNumDimIds(), e = getNumDimAndSymbolIds(); i < e; i++) memo[i] = getAffineSymbolExpr(i - getNumDimIds(), context); @@ -1578,7 +1593,7 @@ void FlatAffineConstraints::getSliceBounds(unsigned num, MLIRContext *context, for (unsigned pos = 0; pos < num; pos++) { unsigned numMapDims = getNumDimIds() - num; unsigned numMapSymbols = getNumSymbolIds(); - AffineExpr expr = memo[pos]; + AffineExpr expr = memo[pos + offset]; if (expr) expr = simplifyAffineExpr(expr, numMapDims, numMapSymbols); @@ -1601,7 +1616,7 @@ void FlatAffineConstraints::getSliceBounds(unsigned num, MLIRContext *context, tmpClone->removeRedundantInequalities(); } std::tie(lbMap, ubMap) = tmpClone->getLowerAndUpperBound( - pos, num, getNumDimIds(), {}, context); + pos, offset, num, getNumDimIds(), {}, context); } // If the above fails, we'll just use the constant lower bound and the @@ -1612,7 +1627,7 @@ void FlatAffineConstraints::getSliceBounds(unsigned num, MLIRContext *context, if (!lbMap || lbMap.getNumResults() > 1) { LLVM_DEBUG(llvm::dbgs() << "WARNING: Potentially over-approximating slice lb\n"); - auto lbConst = getConstantLowerBound(pos); + auto lbConst = getConstantLowerBound(pos + offset); if (lbConst.hasValue()) { lbMap = AffineMap::get( numMapDims, numMapSymbols, @@ -1622,7 +1637,7 @@ void FlatAffineConstraints::getSliceBounds(unsigned num, MLIRContext *context, if (!ubMap || ubMap.getNumResults() > 1) { LLVM_DEBUG(llvm::dbgs() << "WARNING: Potentially over-approximating slice ub\n"); - auto ubConst = getConstantUpperBound(pos); + auto ubConst = getConstantUpperBound(pos + offset); if (ubConst.hasValue()) { (ubMap) = AffineMap::get( numMapDims, numMapSymbols, @@ -1630,9 +1645,11 @@ void FlatAffineConstraints::getSliceBounds(unsigned num, MLIRContext *context, } } } - LLVM_DEBUG(llvm::dbgs() << "lb map for pos = " << Twine(pos) << ", expr: "); + LLVM_DEBUG(llvm::dbgs() + << "lb map for pos = " << Twine(pos + offset) << ", expr: "); LLVM_DEBUG(lbMap.dump();); - LLVM_DEBUG(llvm::dbgs() << "ub map for pos = " << Twine(pos) << ", expr: "); + 
LLVM_DEBUG(llvm::dbgs() + << "ub map for pos = " << Twine(pos + offset) << ", expr: "); LLVM_DEBUG(ubMap.dump();); } } diff --git a/mlir/lib/Analysis/Utils.cpp b/mlir/lib/Analysis/Utils.cpp index e5418fc17a27..ae991f796e03 100644 --- a/mlir/lib/Analysis/Utils.cpp +++ b/mlir/lib/Analysis/Utils.cpp @@ -504,48 +504,84 @@ LogicalResult addMissingLoopIVBounds(SmallPtrSet &ivs, return success(); } -/// Computes in 'sliceUnion' the union of all slice bounds computed at -/// 'dstLoopDepth' between all pairs in 'srcOps' and 'dstOp' which access the -/// same memref. Returns 'Success' if union was computed, 'failure' otherwise. -LogicalResult mlir::computeSliceUnion(ArrayRef srcOps, - ArrayRef dstOps, - unsigned dstLoopDepth, - ComputationSliceState *sliceUnion) { - unsigned numSrcOps = srcOps.size(); - unsigned numDstOps = dstOps.size(); - assert(numSrcOps > 0 && numDstOps > 0); +// Returns the innermost common loop depth for the set of operations in 'ops'. +// TODO(andydavis) Move this to LoopUtils. +static unsigned +getInnermostCommonLoopDepth(ArrayRef ops, + SmallVectorImpl &surroundingLoops) { + unsigned numOps = ops.size(); + assert(numOps > 0); - // Compute the intersection of 'srcMemrefToOps' and 'dstMemrefToOps'. - llvm::SmallDenseSet memrefIntersection; - for (auto *srcOp : srcOps) { - auto *srcMemRef = getLoadOrStoreMemRef(srcOp); - for (auto *dstOp : dstOps) { - if (srcMemRef == getLoadOrStoreMemRef(dstOp)) - memrefIntersection.insert(srcMemRef); - } + std::vector> loops(numOps); + unsigned loopDepthLimit = std::numeric_limits::max(); + for (unsigned i = 0; i < numOps; ++i) { + getLoopIVs(*ops[i], &loops[i]); + loopDepthLimit = + std::min(loopDepthLimit, static_cast(loops[i].size())); } - // Return failure if 'memrefIntersection' is empty. - if (memrefIntersection.empty()) - return failure(); - // Compute the union of slice bounds between all pairs in 'srcOps' and - // 'dstOps' in 'sliceUnionCst'. + unsigned loopDepth = 0; + for (unsigned d = 0; d < loopDepthLimit; ++d) { + unsigned i; + for (i = 1; i < numOps; ++i) { + if (loops[i - 1][d] != loops[i][d]) + return loopDepth; + } + surroundingLoops.push_back(loops[i - 1][d]); + ++loopDepth; + } + return loopDepth; +} + +/// Computes in 'sliceUnion' the union of all slice bounds computed at +/// 'loopDepth' between all dependent pairs of ops in 'opsA' and 'opsB'. +/// Returns 'Success' if union was computed, 'failure' otherwise. +LogicalResult mlir::computeSliceUnion(ArrayRef opsA, + ArrayRef opsB, + unsigned loopDepth, + unsigned numCommonLoops, + bool isBackwardSlice, + ComputationSliceState *sliceUnion) { + // Compute the union of slice bounds between all pairs in 'opsA' and + // 'opsB' in 'sliceUnionCst'. FlatAffineConstraints sliceUnionCst; assert(sliceUnionCst.getNumDimAndSymbolIds() == 0); - for (unsigned i = 0; i < numSrcOps; ++i) { - MemRefAccess srcAccess(srcOps[i]); - for (unsigned j = 0; j < numDstOps; ++j) { - MemRefAccess dstAccess(dstOps[j]); + std::vector> dependentOpPairs; + for (unsigned i = 0, numOpsA = opsA.size(); i < numOpsA; ++i) { + MemRefAccess srcAccess(opsA[i]); + for (unsigned j = 0, numOpsB = opsB.size(); j < numOpsB; ++j) { + MemRefAccess dstAccess(opsB[j]); if (srcAccess.memref != dstAccess.memref) continue; - // Compute slice bounds for 'srcAccess' and 'dstAccess'. 
- ComputationSliceState tmpSliceState; - if (failed(mlir::getBackwardComputationSliceState( - srcAccess, dstAccess, dstLoopDepth, &tmpSliceState))) { - LLVM_DEBUG(llvm::dbgs() << "Unable to compute slice bounds\n."); + // Check if 'loopDepth' exceeds nesting depth of src/dst ops. + if ((!isBackwardSlice && loopDepth > getNestingDepth(*opsA[i])) || + (isBackwardSlice && loopDepth > getNestingDepth(*opsB[j]))) { + LLVM_DEBUG(llvm::dbgs() << "Invalid loop depth\n."); return failure(); } + bool readReadAccesses = + isa(srcAccess.opInst) && isa(dstAccess.opInst); + FlatAffineConstraints dependenceConstraints; + // Check dependence between 'srcAccess' and 'dstAccess'. + DependenceResult result = checkMemrefAccessDependence( + srcAccess, dstAccess, /*loopDepth=*/numCommonLoops + 1, + &dependenceConstraints, /*dependenceComponents=*/nullptr, + /*allowRAR=*/readReadAccesses); + if (result.value == DependenceResult::Failure) { + LLVM_DEBUG(llvm::dbgs() << "Dependence check failed\n."); + return failure(); + } + if (result.value == DependenceResult::NoDependence) + continue; + dependentOpPairs.push_back({opsA[i], opsB[j]}); + + // Compute slice bounds for 'srcAccess' and 'dstAccess'. + ComputationSliceState tmpSliceState; + mlir::getComputationSliceState(opsA[i], opsB[j], &dependenceConstraints, + loopDepth, isBackwardSlice, + &tmpSliceState); + if (sliceUnionCst.getNumDimAndSymbolIds() == 0) { // Initialize 'sliceUnionCst' with the bounds computed in previous step. if (failed(tmpSliceState.getAsConstraints(&sliceUnionCst))) { @@ -599,116 +635,147 @@ LogicalResult mlir::computeSliceUnion(ArrayRef srcOps, } } - // Store 'numSrcLoopIvs' before converting dst loop IVs to dims. - unsigned numSrcLoopIVs = sliceUnionCst.getNumDimIds(); + // Empty union. + if (sliceUnionCst.getNumDimAndSymbolIds() == 0) + return failure(); + + // Gather loops surrounding ops from loop nest where slice will be inserted. + SmallVector ops; + for (auto &dep : dependentOpPairs) { + ops.push_back(isBackwardSlice ? dep.second : dep.first); + } + SmallVector surroundingLoops; + unsigned innermostCommonLoopDepth = + getInnermostCommonLoopDepth(ops, surroundingLoops); + if (loopDepth > innermostCommonLoopDepth) { + LLVM_DEBUG(llvm::dbgs() << "Exceeds max loop depth\n."); + return failure(); + } + + // Store 'numSliceLoopIVs' before converting dst loop IVs to dims. + unsigned numSliceLoopIVs = sliceUnionCst.getNumDimIds(); // Convert any dst loop IVs which are symbol identifiers to dim identifiers. sliceUnionCst.convertLoopIVSymbolsToDims(); sliceUnion->clearBounds(); - sliceUnion->lbs.resize(numSrcLoopIVs, AffineMap()); - sliceUnion->ubs.resize(numSrcLoopIVs, AffineMap()); + sliceUnion->lbs.resize(numSliceLoopIVs, AffineMap()); + sliceUnion->ubs.resize(numSliceLoopIVs, AffineMap()); // Get slice bounds from slice union constraints 'sliceUnionCst'. - sliceUnionCst.getSliceBounds(numSrcLoopIVs, srcOps[0]->getContext(), - &sliceUnion->lbs, &sliceUnion->ubs); + sliceUnionCst.getSliceBounds(/*offset=*/0, numSliceLoopIVs, + opsA[0]->getContext(), &sliceUnion->lbs, + &sliceUnion->ubs); // Add slice bound operands of union. SmallVector sliceBoundOperands; - sliceUnionCst.getIdValues(numSrcLoopIVs, + sliceUnionCst.getIdValues(numSliceLoopIVs, sliceUnionCst.getNumDimAndSymbolIds(), &sliceBoundOperands); // Copy src loop IVs from 'sliceUnionCst' to 'sliceUnion'. 
sliceUnion->ivs.clear(); - sliceUnionCst.getIdValues(0, numSrcLoopIVs, &sliceUnion->ivs); + sliceUnionCst.getIdValues(0, numSliceLoopIVs, &sliceUnion->ivs); + + // Set loop nest insertion point to block start at 'loopDepth'. + sliceUnion->insertPoint = + isBackwardSlice + ? surroundingLoops[loopDepth - 1].getBody()->begin() + : std::prev(surroundingLoops[loopDepth - 1].getBody()->end()); // Give each bound its own copy of 'sliceBoundOperands' for subsequent // canonicalization. - sliceUnion->lbOperands.resize(numSrcLoopIVs, sliceBoundOperands); - sliceUnion->ubOperands.resize(numSrcLoopIVs, sliceBoundOperands); + sliceUnion->lbOperands.resize(numSliceLoopIVs, sliceBoundOperands); + sliceUnion->ubOperands.resize(numSliceLoopIVs, sliceBoundOperands); return success(); } const char *const kSliceFusionBarrierAttrName = "slice_fusion_barrier"; -// Computes memref dependence between 'srcAccess' and 'dstAccess', projects -// out any dst loop IVs at depth greater than 'dstLoopDepth', and computes slice -// bounds in 'sliceState' which represent the src IVs in terms of the dst IVs, -// symbols and constants. -LogicalResult mlir::getBackwardComputationSliceState( - const MemRefAccess &srcAccess, const MemRefAccess &dstAccess, - unsigned dstLoopDepth, ComputationSliceState *sliceState) { - bool readReadAccesses = - isa(srcAccess.opInst) && isa(dstAccess.opInst); - FlatAffineConstraints dependenceConstraints; - DependenceResult result = checkMemrefAccessDependence( - srcAccess, dstAccess, /*loopDepth=*/1, &dependenceConstraints, - /*dependenceComponents=*/nullptr, /*allowRAR=*/readReadAccesses); - if (!hasDependence(result)) { - return failure(); - } +// Computes slice bounds by projecting out any loop IVs from +// 'dependenceConstraints' at depth greater than 'loopDepth', and computes slice +// bounds in 'sliceState' which represent the one loop nest's IVs in terms of +// the other loop nest's IVs, symbols and constants (using 'isBackwardsSlice'). +void mlir::getComputationSliceState( + Operation *depSourceOp, Operation *depSinkOp, + FlatAffineConstraints *dependenceConstraints, unsigned loopDepth, + bool isBackwardSlice, ComputationSliceState *sliceState) { // Get loop nest surrounding src operation. SmallVector srcLoopIVs; - getLoopIVs(*srcAccess.opInst, &srcLoopIVs); + getLoopIVs(*depSourceOp, &srcLoopIVs); unsigned numSrcLoopIVs = srcLoopIVs.size(); // Get loop nest surrounding dst operation. SmallVector dstLoopIVs; - getLoopIVs(*dstAccess.opInst, &dstLoopIVs); + getLoopIVs(*depSinkOp, &dstLoopIVs); unsigned numDstLoopIVs = dstLoopIVs.size(); - if (dstLoopDepth > numDstLoopIVs) { - dstAccess.opInst->emitError("invalid destination loop depth"); - return failure(); - } - // Project out dimensions other than those up to 'dstLoopDepth'. - dependenceConstraints.projectOut(numSrcLoopIVs + dstLoopDepth, - numDstLoopIVs - dstLoopDepth); + assert((!isBackwardSlice && loopDepth <= numSrcLoopIVs) || + (isBackwardSlice && loopDepth <= numDstLoopIVs)); - // Add src loop IV values to 'sliceState'. - dependenceConstraints.getIdValues(0, numSrcLoopIVs, &sliceState->ivs); + // Project out dimensions other than those up to 'loopDepth'. + unsigned pos = isBackwardSlice ? numSrcLoopIVs + loopDepth : loopDepth; + unsigned num = + isBackwardSlice ? numDstLoopIVs - loopDepth : numSrcLoopIVs - loopDepth; + dependenceConstraints->projectOut(pos, num); + + // Add slice loop IV values to 'sliceState'. + unsigned offset = isBackwardSlice ? 0 : loopDepth; + unsigned numSliceLoopIVs = isBackwardSlice ? 
numSrcLoopIVs : numDstLoopIVs; + dependenceConstraints->getIdValues(offset, offset + numSliceLoopIVs, + &sliceState->ivs); // Set up lower/upper bound affine maps for the slice. - sliceState->lbs.resize(numSrcLoopIVs, AffineMap()); - sliceState->ubs.resize(numSrcLoopIVs, AffineMap()); + sliceState->lbs.resize(numSliceLoopIVs, AffineMap()); + sliceState->ubs.resize(numSliceLoopIVs, AffineMap()); - // Get bounds for src IVs in terms of dst IVs, symbols, and constants. - dependenceConstraints.getSliceBounds(numSrcLoopIVs, - srcAccess.opInst->getContext(), - &sliceState->lbs, &sliceState->ubs); + // Get bounds for slice IVs in terms of other IVs, symbols, and constants. + dependenceConstraints->getSliceBounds(offset, numSliceLoopIVs, + depSourceOp->getContext(), + &sliceState->lbs, &sliceState->ubs); // Set up bound operands for the slice's lower and upper bounds. SmallVector sliceBoundOperands; - dependenceConstraints.getIdValues( - numSrcLoopIVs, dependenceConstraints.getNumDimAndSymbolIds(), - &sliceBoundOperands); + unsigned numDimsAndSymbols = dependenceConstraints->getNumDimAndSymbolIds(); + for (unsigned i = 0; i < numDimsAndSymbols; ++i) { + if (i < offset || i >= offset + numSliceLoopIVs) { + sliceBoundOperands.push_back(dependenceConstraints->getIdValue(i)); + } + } + // Give each bound its own copy of 'sliceBoundOperands' for subsequent // canonicalization. - sliceState->lbOperands.resize(numSrcLoopIVs, sliceBoundOperands); - sliceState->ubOperands.resize(numSrcLoopIVs, sliceBoundOperands); + sliceState->lbOperands.resize(numSliceLoopIVs, sliceBoundOperands); + sliceState->ubOperands.resize(numSliceLoopIVs, sliceBoundOperands); + + // Set destination loop nest insertion point to block start at 'dstLoopDepth'. + sliceState->insertPoint = + isBackwardSlice ? dstLoopIVs[loopDepth - 1].getBody()->begin() + : std::prev(srcLoopIVs[loopDepth - 1].getBody()->end()); llvm::SmallDenseSet sequentialLoops; - if (readReadAccesses) { + if (isa(depSourceOp) && isa(depSinkOp)) { // For read-read access pairs, clear any slice bounds on sequential loops. // Get sequential loops in loop nest rooted at 'srcLoopIVs[0]'. - getSequentialLoops(srcLoopIVs[0], &sequentialLoops); + getSequentialLoops(isBackwardSlice ? srcLoopIVs[0] : dstLoopIVs[0], + &sequentialLoops); } // Clear all sliced loop bounds beginning at the first sequential loop, or // first loop with a slice fusion barrier attribute.. // TODO(andydavis, bondhugula) Use MemRef read/write regions instead of // using 'kSliceFusionBarrierAttrName'. - for (unsigned i = 0; i < numSrcLoopIVs; ++i) { - Value *iv = srcLoopIVs[i].getInductionVar(); + auto getSliceLoop = [&](unsigned i) { + return isBackwardSlice ? 
srcLoopIVs[i] : dstLoopIVs[i]; + }; + for (unsigned i = 0; i < numSliceLoopIVs; ++i) { + Value *iv = getSliceLoop(i).getInductionVar(); if (sequentialLoops.count(iv) == 0 && - srcLoopIVs[i].getAttr(kSliceFusionBarrierAttrName) == nullptr) + getSliceLoop(i).getAttr(kSliceFusionBarrierAttrName) == nullptr) continue; - for (unsigned j = i; j < numSrcLoopIVs; ++j) { + for (unsigned j = i; j < numSliceLoopIVs; ++j) { sliceState->lbs[j] = AffineMap(); sliceState->ubs[j] = AffineMap(); } break; } - - return success(); } /// Creates a computation slice of the loop nest surrounding 'srcOpInst', diff --git a/mlir/lib/Transforms/LoopFusion.cpp b/mlir/lib/Transforms/LoopFusion.cpp index 829b1b221efe..95890a681266 100644 --- a/mlir/lib/Transforms/LoopFusion.cpp +++ b/mlir/lib/Transforms/LoopFusion.cpp @@ -1329,7 +1329,9 @@ static bool isFusionProfitable(Operation *srcOpInst, Operation *srcStoreOpInst, for (unsigned i = maxDstLoopDepth; i >= 1; --i) { // Compute the union of slice bounds of all ops in 'dstLoadOpInsts'. if (failed(mlir::computeSliceUnion({srcOpInst}, dstLoadOpInsts, - /*dstLoopDepth=*/i, + /*loopDepth=*/i, + /*numCommonLoops=*/0, + /*isBackwardSlice=*/true, &sliceStates[i - 1]))) { LLVM_DEBUG(llvm::dbgs() << "computeSliceUnion failed for loopDepth: " << i << "\n"); @@ -1736,15 +1738,16 @@ public: dstLoadOpInsts, dstStoreOpInsts, &sliceState, &bestDstLoopDepth, maximalFusion)) continue; - // TODO(andydavis) Remove assert and surrounding code when - // canFuseLoops is fully functional. + // TODO(andydavis) Remove the following test code when canFuseLoops + // is fully functional. mlir::ComputationSliceState sliceUnion; - FusionResult result = mlir::canFuseLoops( - cast(srcNode->op), cast(dstNode->op), - bestDstLoopDepth, &sliceUnion); - assert(result.value == FusionResult::Success); - (void)result; - + if (!maximalFusion) { + FusionResult result = mlir::canFuseLoops( + cast(srcNode->op), cast(dstNode->op), + bestDstLoopDepth, &sliceUnion); + assert(result.value == FusionResult::Success); + (void)result; + } // Fuse computation slice of 'srcLoopNest' into 'dstLoopNest'. auto sliceLoopNest = mlir::insertBackwardComputationSlice( srcStoreOpInst, dstLoadOpInsts[0], bestDstLoopDepth, &sliceState); diff --git a/mlir/lib/Transforms/TestLoopFusion.cpp b/mlir/lib/Transforms/TestLoopFusion.cpp index 638cf915b6a8..39990968a349 100644 --- a/mlir/lib/Transforms/TestLoopFusion.cpp +++ b/mlir/lib/Transforms/TestLoopFusion.cpp @@ -45,6 +45,11 @@ static llvm::cl::opt clTestDependenceCheck( llvm::cl::desc("Enable testing of loop fusion dependence check"), llvm::cl::cat(clOptionsCategory)); +static llvm::cl::opt clTestSliceComputation( + "test-loop-fusion-slice-computation", + llvm::cl::desc("Enable testing of loop fusion slice computation"), + llvm::cl::cat(clOptionsCategory)); + namespace { struct TestLoopFusion : public FunctionPass { @@ -70,20 +75,74 @@ gatherLoops(Block *block, unsigned currLoopDepth, } } -// Run fusion dependence check on 'loops[i]' and 'loops[j]' at 'loopDepth'. +// Run fusion dependence check on 'loops[i]' and 'loops[j]' at loop depths +// in range ['loopDepth' + 1, 'maxLoopDepth']. // Emits a remark on 'loops[i]' if a fusion-preventing dependence exists. 
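+// For example, with i = 0, j = 1 and loopDepth = 0 the emitted remark reads:
+//   "block-level dependence preventing fusion of loop nest 0 into loop nest 1
+//    at depth 0".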
 static void testDependenceCheck(SmallVector &loops, unsigned i,
-                                unsigned j, unsigned loopDepth) {
+                                unsigned j, unsigned loopDepth,
+                                unsigned maxLoopDepth) {
   AffineForOp srcForOp = loops[i];
   AffineForOp dstForOp = loops[j];
   mlir::ComputationSliceState sliceUnion;
-  // TODO(andydavis) Test at deeper loop depths current loop depth + 1.
-  FusionResult result =
-      mlir::canFuseLoops(srcForOp, dstForOp, loopDepth + 1, &sliceUnion);
-  if (result.value == FusionResult::FailBlockDependence) {
-    srcForOp.getOperation()->emitRemark("block-level dependence preventing"
-                                        " fusion of loop nest ")
-        << i << " into loop nest " << j << " at depth " << loopDepth;
+  for (unsigned d = loopDepth + 1; d <= maxLoopDepth; ++d) {
+    FusionResult result =
+        mlir::canFuseLoops(srcForOp, dstForOp, d, &sliceUnion);
+    if (result.value == FusionResult::FailBlockDependence) {
+      srcForOp.getOperation()->emitRemark("block-level dependence preventing"
+                                          " fusion of loop nest ")
+          << i << " into loop nest " << j << " at depth " << loopDepth;
+    }
   }
 }
+
+// Returns the index of 'op' in its block.
+static unsigned getBlockIndex(Operation &op) {
+  unsigned index = 0;
+  for (auto &opX : *op.getBlock()) {
+    if (&op == &opX)
+      break;
+    ++index;
+  }
+  return index;
+}
+
+// Returns a string representation of 'sliceUnion'.
+static std::string getSliceStr(const mlir::ComputationSliceState &sliceUnion) {
+  std::string result;
+  llvm::raw_string_ostream os(result);
+  // Slice insertion point format: (loop-depth, operation-block-index).
+  unsigned ipd = getNestingDepth(*sliceUnion.insertPoint);
+  unsigned ipb = getBlockIndex(*sliceUnion.insertPoint);
+  os << "insert point: (" << std::to_string(ipd) << ", " << std::to_string(ipb)
+     << ")";
+  assert(sliceUnion.lbs.size() == sliceUnion.ubs.size());
+  os << " loop bounds: ";
+  for (unsigned k = 0, e = sliceUnion.lbs.size(); k < e; ++k) {
+    os << '[';
+    sliceUnion.lbs[k].print(os);
+    os << ", ";
+    sliceUnion.ubs[k].print(os);
+    os << "] ";
+  }
+  return os.str();
+}
+
+// Computes the fusion slice union on 'loops[i]' and 'loops[j]' at loop depths
+// in range ['loopDepth' + 1, 'maxLoopDepth'].
+// Emits a string representation of the slice union as a remark on 'loops[j]'.
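+// For example (matching the test cases below):
+//   "slice ( src loop: 1, dst loop: 0, depth: 1 : insert point: (1, 1)
+//    loop bounds: [(d0) -> (d0), (d0) -> (d0 + 1)] )"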
+static void testSliceComputation(SmallVector &loops, unsigned i,
+                                 unsigned j, unsigned loopDepth,
+                                 unsigned maxLoopDepth) {
+  AffineForOp forOpA = loops[i];
+  AffineForOp forOpB = loops[j];
+  for (unsigned d = loopDepth + 1; d <= maxLoopDepth; ++d) {
+    mlir::ComputationSliceState sliceUnion;
+    FusionResult result = mlir::canFuseLoops(forOpA, forOpB, d, &sliceUnion);
+    if (result.value == FusionResult::Success) {
+      forOpB.getOperation()->emitRemark("slice (")
+          << " src loop: " << i << ", dst loop: " << j << ", depth: " << d
+          << " : " << getSliceStr(sliceUnion) << ")";
+    }
+  }
+}
+
@@ -104,7 +163,9 @@ void TestLoopFusion::runOnFunction() {
         if (j == k)
           continue;
         if (clTestDependenceCheck)
-          testDependenceCheck(loops, j, k, loopDepth);
+          testDependenceCheck(loops, j, k, loopDepth, depthToLoops.size());
+        if (clTestSliceComputation)
+          testSliceComputation(loops, j, k, loopDepth, depthToLoops.size());
       }
     }
   }
diff --git a/mlir/lib/Transforms/Utils/LoopFusionUtils.cpp b/mlir/lib/Transforms/Utils/LoopFusionUtils.cpp
index cb1d9d17ed05..1fb41a2a5e20 100644
--- a/mlir/lib/Transforms/Utils/LoopFusionUtils.cpp
+++ b/mlir/lib/Transforms/Utils/LoopFusionUtils.cpp
@@ -192,11 +192,7 @@ gatherLoadsAndStores(AffineForOp forOp,
   return !hasIfOp;
 }

-// TODO(andydavis) Add support for the following features in subsequent CLs:
-// *) Compute dependences of unfused src/dst loops.
-// *) Compute dependences of src/dst loop as if they were fused.
-// *) Check for fusion preventing dependences (e.g. a dependence which changes
-//    from loop-independent to backward loop-carried after fusion).
+// TODO(andydavis) Prevent fusion of loop nests with side-effecting operations.
 FusionResult mlir::canFuseLoops(AffineForOp srcForOp, AffineForOp dstForOp,
                                 unsigned dstLoopDepth,
                                 ComputationSliceState *srcSlice) {
@@ -219,24 +215,35 @@ FusionResult mlir::canFuseLoops(AffineForOp srcForOp, AffineForOp dstForOp,
     return FusionResult::FailBlockDependence;
   }

-  // Gather all load and store ops in 'srcForOp'.
-  SmallVector srcLoadAndStoreOps;
-  if (!gatherLoadsAndStores(srcForOp, srcLoadAndStoreOps)) {
+  // Check if 'srcForOp' precedes 'dstForOp' in 'block'.
+  bool isSrcForOpBeforeDstForOp =
+      srcForOp.getOperation()->isBeforeInBlock(dstForOp.getOperation());
+  // 'forOpA' executes before 'forOpB' in 'block'.
+  auto forOpA = isSrcForOpBeforeDstForOp ? srcForOp : dstForOp;
+  auto forOpB = isSrcForOpBeforeDstForOp ? dstForOp : srcForOp;
+
+  // Gather all load and store ops from 'forOpA', which precedes 'forOpB' in
+  // 'block'.
+  SmallVector opsA;
+  if (!gatherLoadsAndStores(forOpA, opsA)) {
     LLVM_DEBUG(llvm::dbgs() << "Fusing loops with affine.if unsupported.\n.");
     return FusionResult::FailPrecondition;
   }

-  // Gather all load and store ops in 'dstForOp'.
-  SmallVector dstLoadAndStoreOps;
-  if (!gatherLoadsAndStores(dstForOp, dstLoadAndStoreOps)) {
+  // Gather all load and store ops from 'forOpB', which follows 'forOpA' in
+  // 'block'.
+  SmallVector opsB;
+  if (!gatherLoadsAndStores(forOpB, opsB)) {
     LLVM_DEBUG(llvm::dbgs() << "Fusing loops with affine.if unsupported.\n.");
     return FusionResult::FailPrecondition;
   }

-  // Compute union of computation slices computed from all pairs in
-  // {'srcLoadAndStoreOps', 'dstLoadAndStoreOps'}.
-  if (failed(mlir::computeSliceUnion(srcLoadAndStoreOps, dstLoadAndStoreOps,
-                                     dstLoopDepth, srcSlice))) {
+  // Calculate the number of common loops surrounding 'srcForOp' and
+  // 'dstForOp'.
+  unsigned numCommonLoops = mlir::getNumCommonSurroundingLoops(
+      *srcForOp.getOperation(), *dstForOp.getOperation());
+
+  // Compute the union of the computation slices computed between all pairs of
+  // ops from 'forOpA' and 'forOpB'.
+  if (failed(mlir::computeSliceUnion(opsA, opsB, dstLoopDepth, numCommonLoops,
+                                     isSrcForOpBeforeDstForOp, srcSlice))) {
     LLVM_DEBUG(llvm::dbgs() << "computeSliceUnion failed\n");
     return FusionResult::FailPrecondition;
   }
diff --git a/mlir/test/Transforms/loop-fusion-slice-computation.mlir b/mlir/test/Transforms/loop-fusion-slice-computation.mlir
new file mode 100644
index 000000000000..9550cb725064
--- /dev/null
+++ b/mlir/test/Transforms/loop-fusion-slice-computation.mlir
@@ -0,0 +1,145 @@
+// RUN: mlir-opt %s -test-loop-fusion -test-loop-fusion-slice-computation -split-input-file -verify | FileCheck %s
+
+// -----
+
+// CHECK-LABEL: func @slice_depth1_loop_nest() {
+func @slice_depth1_loop_nest() {
+  %0 = alloc() : memref<100xf32>
+  %cst = constant 7.000000e+00 : f32
+  affine.for %i0 = 0 to 16 {
+    // expected-remark@-1 {{slice ( src loop: 1, dst loop: 0, depth: 1 : insert point: (1, 1) loop bounds: [(d0) -> (d0), (d0) -> (d0 + 1)] )}}
+    store %cst, %0[%i0] : memref<100xf32>
+  }
+  affine.for %i1 = 0 to 5 {
+    // expected-remark@-1 {{slice ( src loop: 0, dst loop: 1, depth: 1 : insert point: (1, 0) loop bounds: [(d0) -> (d0), (d0) -> (d0 + 1)] )}}
+    %1 = load %0[%i1] : memref<100xf32>
+  }
+  return
+}
+
+// -----
+
+// Loop %i0 writes to locations [2, 17] and loop %i1 reads from locations
+// [3, 6]. Slice loop bounds should be adjusted so that the load and store
+// access the same locations.
+// CHECK-LABEL: func @slice_depth1_loop_nest_with_offsets() {
+func @slice_depth1_loop_nest_with_offsets() {
+  %0 = alloc() : memref<100xf32>
+  %cst = constant 7.000000e+00 : f32
+  affine.for %i0 = 0 to 16 {
+    // expected-remark@-1 {{slice ( src loop: 1, dst loop: 0, depth: 1 : insert point: (1, 2) loop bounds: [(d0) -> (d0 + 3), (d0) -> (d0 + 4)] )}}
+    %a0 = affine.apply (d0) -> (d0 + 2)(%i0)
+    store %cst, %0[%a0] : memref<100xf32>
+  }
+  affine.for %i1 = 4 to 8 {
+    // expected-remark@-1 {{slice ( src loop: 0, dst loop: 1, depth: 1 : insert point: (1, 0) loop bounds: [(d0) -> (d0 - 3), (d0) -> (d0 - 2)] )}}
+    %a1 = affine.apply (d0) -> (d0 - 1)(%i1)
+    %1 = load %0[%a1] : memref<100xf32>
+  }
+  return
+}
+
+// -----
+
+// Slices at loop depth 1 should only slice the loop bounds of the first loop.
+// Slices at loop depth 2 should slice the loop bounds of both loops.
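+// For example, the depth-1 slice of loop nest %i2 into loop nest %i0 below
+// pins %i2 to a single iteration of %i0 but keeps the full inner range
+// [0, 8), while the depth-2 slice also pins %i3 to a single iteration of %i1.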
+// CHECK-LABEL: func @slice_depth2_loop_nest() { +func @slice_depth2_loop_nest() { + %0 = alloc() : memref<100x100xf32> + %cst = constant 7.000000e+00 : f32 + affine.for %i0 = 0 to 16 { + // expected-remark@-1 {{slice ( src loop: 1, dst loop: 0, depth: 1 : insert point: (1, 1) loop bounds: [(d0) -> (d0), (d0) -> (d0 + 1)] [(d0) -> (0), (d0) -> (8)] )}} + // expected-remark@-2 {{slice ( src loop: 1, dst loop: 0, depth: 2 : insert point: (2, 1) loop bounds: [(d0, d1) -> (d0), (d0, d1) -> (d0 + 1)] [(d0, d1) -> (d1), (d0, d1) -> (d1 + 1)] )}} + affine.for %i1 = 0 to 16 { + store %cst, %0[%i0, %i1] : memref<100x100xf32> + } + } + affine.for %i2 = 0 to 10 { + // expected-remark@-1 {{slice ( src loop: 0, dst loop: 1, depth: 1 : insert point: (1, 0) loop bounds: [(d0) -> (d0), (d0) -> (d0 + 1)] [(d0) -> (0), (d0) -> (8)] )}} + // expected-remark@-2 {{slice ( src loop: 0, dst loop: 1, depth: 2 : insert point: (2, 0) loop bounds: [(d0, d1) -> (d0), (d0, d1) -> (d0 + 1)] [(d0, d1) -> (d1), (d0, d1) -> (d1 + 1)] )}} + affine.for %i3 = 0 to 8 { + %1 = load %0[%i2, %i3] : memref<100x100xf32> + } + } + return +} + +// ----- + +// The load at depth 1 in loop nest %i2 prevents slicing loop nest %i0 at depths +// greater than 1. However, loop nest %i2 can be sliced into loop nest %i0 at +// depths 1 and 2 because the dependent store in loop nest %i0 is at depth 2. +// CHECK-LABEL: func @slice_depth2_loop_nest_two_loads() { +func @slice_depth2_loop_nest_two_loads() { + %0 = alloc() : memref<100x100xf32> + %c0 = constant 0 : index + %cst = constant 7.000000e+00 : f32 + affine.for %i0 = 0 to 16 { + // expected-remark@-1 {{slice ( src loop: 1, dst loop: 0, depth: 1 : insert point: (1, 1) loop bounds: [(d0)[s0] -> (d0), (d0)[s0] -> (d0 + 1)] [(d0)[s0] -> (0), (d0)[s0] -> (8)] )}} + // expected-remark@-2 {{slice ( src loop: 1, dst loop: 0, depth: 2 : insert point: (2, 1) loop bounds: [(d0, d1)[s0] -> (d0), (d0, d1)[s0] -> (d0 + 1)] [(d0, d1)[s0] -> (0), (d0, d1)[s0] -> (8)] )}} + affine.for %i1 = 0 to 16 { + store %cst, %0[%i0, %i1] : memref<100x100xf32> + } + } + affine.for %i2 = 0 to 10 { + // expected-remark@-1 {{slice ( src loop: 0, dst loop: 1, depth: 1 : insert point: (1, 0) loop bounds: [(d0)[s0] -> (d0), (d0)[s0] -> (d0 + 1)] [(d0)[s0] -> (0), (d0)[s0] -> (8)] )}} + affine.for %i3 = 0 to 8 { + %1 = load %0[%i2, %i3] : memref<100x100xf32> + } + %2 = load %0[%i2, %c0] : memref<100x100xf32> + } + return +} + +// ----- + +// The store at depth 1 in loop nest %i0 prevents slicing loop nest %i2 at +// depths greater than 1 into loop nest %i0. However, loop nest %i0 can be +// sliced into loop nest %i2 at depths 1 and 2 because the dependent load in +// loop nest %i2 is at depth 2. 
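+// Consequently, only the depth-1 slice is reported as a remark on loop nest
+// %i0 below, while loop nest %i2 receives both depth-1 and depth-2 remarks.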
+// CHECK-LABEL: func @slice_depth2_loop_nest_two_stores() { +func @slice_depth2_loop_nest_two_stores() { + %0 = alloc() : memref<100x100xf32> + %c0 = constant 0 : index + %cst = constant 7.000000e+00 : f32 + affine.for %i0 = 0 to 16 { + // expected-remark@-1 {{slice ( src loop: 1, dst loop: 0, depth: 1 : insert point: (1, 2) loop bounds: [(d0)[s0] -> (d0), (d0)[s0] -> (d0 + 1)] [(d0)[s0] -> (0), (d0)[s0] -> (8)] )}} + affine.for %i1 = 0 to 16 { + store %cst, %0[%i0, %i1] : memref<100x100xf32> + } + store %cst, %0[%i0, %c0] : memref<100x100xf32> + } + affine.for %i2 = 0 to 10 { + // expected-remark@-1 {{slice ( src loop: 0, dst loop: 1, depth: 1 : insert point: (1, 0) loop bounds: [(d0)[s0] -> (d0), (d0)[s0] -> (d0 + 1)] [(d0)[s0] -> (0), (d0)[s0] -> (16)] )}} + // expected-remark@-2 {{slice ( src loop: 0, dst loop: 1, depth: 2 : insert point: (2, 0) loop bounds: [(d0, d1)[s0] -> (d0), (d0, d1)[s0] -> (d0 + 1)] [(d0, d1)[s0] -> (0), (d0, d1)[s0] -> (16)] )}} + affine.for %i3 = 0 to 8 { + %1 = load %0[%i2, %i3] : memref<100x100xf32> + } + } + return +} + +// ----- + +// Test loop nest which has a smaller outer trip count than its inner loop. +// CHECK-LABEL: func @slice_loop_nest_with_smaller_outer_trip_count() { +func @slice_loop_nest_with_smaller_outer_trip_count() { + %0 = alloc() : memref<100x100xf32> + %c0 = constant 0 : index + %cst = constant 7.000000e+00 : f32 + affine.for %i0 = 0 to 16 { + // expected-remark@-1 {{slice ( src loop: 1, dst loop: 0, depth: 1 : insert point: (1, 1) loop bounds: [(d0) -> (d0), (d0) -> (d0 + 1)] [(d0) -> (0), (d0) -> (10)] )}} + // expected-remark@-2 {{slice ( src loop: 1, dst loop: 0, depth: 2 : insert point: (2, 1) loop bounds: [(d0, d1) -> (d0), (d0, d1) -> (d0 + 1)] [(d0, d1) -> (d1), (d0, d1) -> (d1 + 1)] )}} + affine.for %i1 = 0 to 16 { + store %cst, %0[%i0, %i1] : memref<100x100xf32> + } + } + affine.for %i2 = 0 to 8 { + // expected-remark@-1 {{slice ( src loop: 0, dst loop: 1, depth: 1 : insert point: (1, 0) loop bounds: [(d0) -> (d0), (d0) -> (d0 + 1)] [(d0) -> (0), (d0) -> (10)] )}} + // expected-remark@-2 {{slice ( src loop: 0, dst loop: 1, depth: 2 : insert point: (2, 0) loop bounds: [(d0, d1) -> (d0), (d0, d1) -> (d0 + 1)] [(d0, d1) -> (d1), (d0, d1) -> (d1 + 1)] )}} + affine.for %i3 = 0 to 10 { + %1 = load %0[%i2, %i3] : memref<100x100xf32> + } + } + return +} \ No newline at end of file
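
Usage note: the following is a minimal sketch of how the canFuseLoops /
ComputationSliceState API exercised above might be driven from client code; it
is a sketch under assumptions, not part of this change. The helper name
'findDeepestFusableDepth' is hypothetical, and the include paths assume the
header layout at this revision (ComputationSliceState in mlir/Analysis/Utils.h,
canFuseLoops and FusionResult in mlir/Transforms/LoopFusionUtils.h, AffineForOp
in mlir/AffineOps/AffineOps.h).

#include "mlir/AffineOps/AffineOps.h"        // AffineForOp (path assumed).
#include "mlir/Analysis/Utils.h"             // ComputationSliceState.
#include "mlir/Transforms/LoopFusionUtils.h" // canFuseLoops (path assumed).

using namespace mlir;

// Returns the deepest depth in [1, maxLoopDepth] at which 'srcForOp' can be
// fused into 'dstForOp', or 0 if no depth is feasible. On success,
// 'sliceUnion' holds the union of slice bounds (lbs/ubs and their operands)
// and the insertion point computed by computeSliceUnion for that depth.
static unsigned findDeepestFusableDepth(AffineForOp srcForOp,
                                        AffineForOp dstForOp,
                                        unsigned maxLoopDepth,
                                        ComputationSliceState *sliceUnion) {
  unsigned fusableDepth = 0;
  for (unsigned d = 1; d <= maxLoopDepth; ++d) {
    ComputationSliceState tmpSlice;
    FusionResult result = canFuseLoops(srcForOp, dstForOp, d, &tmpSlice);
    if (result.value == FusionResult::Success) {
      fusableDepth = d;
      *sliceUnion = tmpSlice;
    }
  }
  return fusableDepth;
}

The -test-loop-fusion-slice-computation option added above exercises the same
entry point and prints the resulting ComputationSliceState via getSliceStr().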