From 4eef795a1dbd7eafa9a45303f01c51921729f1f4 Mon Sep 17 00:00:00 2001 From: MLIR Team Date: Fri, 21 Dec 2018 11:06:23 -0800 Subject: [PATCH] Computation slice update: adds parameters to insertBackwardComputationSlice which specify the source loop nest depth at which to perform iteration space slicing, and the destination loop nest depth at which to insert the computation slice. Updates LoopFusion pass to take these parameters as command line flags for experimentation. PiperOrigin-RevId: 226514297 --- mlir/include/mlir/Analysis/Utils.h | 17 +++++----- mlir/lib/Analysis/Utils.cpp | 43 +++++++++++++++++++------ mlir/lib/Transforms/LoopFusion.cpp | 36 ++++++++++++++++++++- mlir/test/Transforms/loop-fusion.mlir | 46 +++++++++++++++++++++++++++ 4 files changed, 123 insertions(+), 19 deletions(-) diff --git a/mlir/include/mlir/Analysis/Utils.h b/mlir/include/mlir/Analysis/Utils.h index 1743d49aafd2..365fc74a7788 100644 --- a/mlir/include/mlir/Analysis/Utils.h +++ b/mlir/include/mlir/Analysis/Utils.h @@ -143,20 +143,19 @@ bool boundCheckLoadOrStoreOp(LoadOrStoreOpPointer loadOrStoreOp, bool emitError = true); /// Creates a clone of the computation contained in the loop nest surrounding -/// 'srcAccess', and inserts it at the beginning of the statement block of the -/// loop containing 'dstAccess'. Returns the top-level loop of the computation -/// slice on success, returns nullptr otherwise. -// Computes memref dependence between 'srcAccess' and 'dstAccess' and uses the -// dependence constraint system to create AffineMaps with which to adjust the -// loop bounds of the inserted compution slice so that they are functions of the -// loop IVs and symbols of the loops surrounding 'dstAccess'. -// TODO(andydavis) Add 'dstLoopDepth' argument for computation slice insertion. 
+/// 'srcAccess', slices the iteration space of the first 'srcLoopDepth' src loop +/// IVs, and inserts the computation slice at the beginning of the statement +/// block of the loop at 'dstLoopDepth' in the loop nest surrounding +/// 'dstAccess'. Returns the top-level loop of the computation slice on +/// success, returns nullptr otherwise. // Loop depth is a crucial optimization choice that determines where to // materialize the results of the backward slice - presenting a trade-off b/w // storage and redundant computation in several cases // TODO(andydavis) Support computation slices with common surrounding loops. ForStmt *insertBackwardComputationSlice(MemRefAccess *srcAccess, - MemRefAccess *dstAccess); + MemRefAccess *dstAccess, + unsigned srcLoopDepth, + unsigned dstLoopDepth); } // end namespace mlir #endif // MLIR_ANALYSIS_UTILS_H diff --git a/mlir/lib/Analysis/Utils.cpp b/mlir/lib/Analysis/Utils.cpp index 86f5fbf8ea43..cc30cfffb063 100644 --- a/mlir/lib/Analysis/Utils.cpp +++ b/mlir/lib/Analysis/Utils.cpp @@ -28,6 +28,7 @@ #include "mlir/IR/BuiltinOps.h" #include "mlir/StandardOps/StandardOps.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #define DEBUG_TYPE "analysis-utils" @@ -374,11 +375,14 @@ static Statement *getStmtAtPosition(ArrayRef positions, return nullptr; } -// TODO(andydavis) Support a 'dstLoopDepth' argument for computation slice -// insertion (currently the computation slice is inserted at the same -// loop depth as 'dstAccess.opStmt'. +// Computes memref dependence between 'srcAccess' and 'dstAccess' and uses the +// dependence constraint system to create AffineMaps with which to adjust the +// loop bounds of the inserted computation slice so that they are functions of the +// loop IVs and symbols of the loops surrounding 'dstAccess'. 
ForStmt *mlir::insertBackwardComputationSlice(MemRefAccess *srcAccess, - MemRefAccess *dstAccess) { + MemRefAccess *dstAccess, + unsigned srcLoopDepth, + unsigned dstLoopDepth) { FlatAffineConstraints dependenceConstraints; if (!checkMemrefAccessDependence(*srcAccess, *dstAccess, /*loopDepth=*/1, &dependenceConstraints, @@ -389,21 +393,32 @@ ForStmt *mlir::insertBackwardComputationSlice(MemRefAccess *srcAccess, SmallVector srcLoopNest; getLoopIVs(*srcAccess->opStmt, &srcLoopNest); unsigned srcLoopNestSize = srcLoopNest.size(); + assert(srcLoopDepth <= srcLoopNestSize); // Get loop nest surrounding dst operation. SmallVector dstLoopNest; getLoopIVs(*dstAccess->opStmt, &dstLoopNest); unsigned dstLoopNestSize = dstLoopNest.size(); + (void)dstLoopNestSize; + assert(dstLoopDepth > 0); + assert(dstLoopDepth <= dstLoopNestSize); // Solve for src IVs in terms of dst IVs, symbols and constants. SmallVector srcIvMaps(srcLoopNestSize, AffineMap::Null()); std::vector> srcIvOperands(srcLoopNestSize); for (unsigned i = 0; i < srcLoopNestSize; ++i) { + // Skip IVs which are greater than requested loop depth. + if (i >= srcLoopDepth) { + srcIvMaps[i] = AffineMap::Null(); + continue; + } auto cst = dependenceConstraints.clone(); for (int j = srcLoopNestSize - 1; j >= 0; --j) { if (i != j) cst->projectOut(j); } + // TODO(andydavis) Check for case with two equalities where we have + // set one IV to a constant. Set a constant IV map for these cases. if (cst->getNumEqualities() != 1) { srcIvMaps[i] = AffineMap::Null(); continue; @@ -412,11 +427,18 @@ ForStmt *mlir::insertBackwardComputationSlice(MemRefAccess *srcAccess, SmallVector nonZeroSymbolIds; srcIvMaps[i] = cst->toAffineMapFromEq(0, 0, srcAccess->opStmt->getContext(), &nonZeroDimIds, &nonZeroSymbolIds); - if (srcIvMaps[i] == AffineMap::Null()) + if (srcIvMaps[i] == AffineMap::Null()) { continue; + } // Add operands for all non-zero dst dims and symbols. // TODO(andydavis) Add local variable support. 
for (auto dimId : nonZeroDimIds) { + if (dimId - 1 >= dstLoopDepth) { + // This src IV has a dependence on dst IV dstLoopDepth where it will + // be inserted. So we cannot slice the iteration space at srcLoopDepth, + // and also insert it into the dst loop nest at 'dstLoopDepth'. + return nullptr; + } srcIvOperands[i].push_back(dstLoopNest[dimId - 1]); } // TODO(andydavis) Add symbols from the access function. Ideally, we @@ -429,8 +451,8 @@ ForStmt *mlir::insertBackwardComputationSlice(MemRefAccess *srcAccess, findStmtPosition(srcAccess->opStmt, srcLoopNest[0]->getBlock(), &positions); // Clone src loop nest and insert it a the beginning of the statement block - // of the same loop in which containts 'dstAccess->opStmt'. - auto *dstForStmt = dstLoopNest[dstLoopNestSize - 1]; + // of the loop at 'dstLoopDepth' in 'dstLoopNest'. + auto *dstForStmt = dstLoopNest[dstLoopDepth - 1]; MLFuncBuilder b(dstForStmt, dstForStmt->begin()); DenseMap operandMap; auto *sliceLoopNest = cast(b.clone(*srcLoopNest[0], operandMap)); @@ -442,11 +464,14 @@ ForStmt *mlir::insertBackwardComputationSlice(MemRefAccess *srcAccess, SmallVector sliceSurroundingLoops; getLoopIVs(*sliceStmt, &sliceSurroundingLoops); unsigned sliceSurroundingLoopsSize = sliceSurroundingLoops.size(); + (void)sliceSurroundingLoopsSize; // Update loop bounds for loops in 'sliceLoopNest'. 
- for (unsigned i = dstLoopNestSize; i < sliceSurroundingLoopsSize; ++i) { + unsigned sliceLoopLimit = dstLoopDepth + srcLoopNestSize; + assert(sliceLoopLimit <= sliceSurroundingLoopsSize); + for (unsigned i = dstLoopDepth; i < sliceLoopLimit; ++i) { auto *forStmt = sliceSurroundingLoops[i]; - unsigned index = i - dstLoopNestSize; + unsigned index = i - dstLoopDepth; AffineMap lbMap = srcIvMaps[index]; if (lbMap == AffineMap::Null()) continue; diff --git a/mlir/lib/Transforms/LoopFusion.cpp b/mlir/lib/Transforms/LoopFusion.cpp index 6393fa6069d8..df68765aeb7b 100644 --- a/mlir/lib/Transforms/LoopFusion.cpp +++ b/mlir/lib/Transforms/LoopFusion.cpp @@ -35,12 +35,27 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/SetVector.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/raw_ostream.h" using llvm::SetVector; using namespace mlir; +// TODO(andydavis) These flags are global for the pass to be used for +// experimentation. Find a way to provide more fine grained control (i.e. +// depth per-loop nest, or depth per load/store op) for this pass utilizing a +// cost model. +static llvm::cl::opt clSrcLoopDepth( + "src-loop-depth", llvm::cl::Hidden, + llvm::cl::desc("Controls the depth of the source loop nest at which " + "to apply loop iteration slicing before fusion.")); + +static llvm::cl::opt clDstLoopDepth( + "dst-loop-depth", llvm::cl::Hidden, + llvm::cl::desc("Controls the depth of the destination loop nest at which " + "to fuse the source loop nest slice.")); + namespace { /// Loop fusion pass. This pass currently supports a greedy fusion policy, @@ -107,6 +122,18 @@ static FusionCandidate buildFusionCandidate(OperationStmt *srcStoreOpStmt, return candidate; } +// Returns the loop depth of the loop nest surrounding 'opStmt'. 
+static unsigned getLoopDepth(OperationStmt *opStmt) { + unsigned loopDepth = 0; + auto *currStmt = opStmt->getParentStmt(); + ForStmt *currForStmt; + while (currStmt && (currForStmt = dyn_cast(currStmt))) { + ++loopDepth; + currStmt = currStmt->getParentStmt(); + } + return loopDepth; +} + namespace { // LoopNestStateCollector walks loop nests and collects load and store @@ -487,8 +514,15 @@ public: FusionCandidate candidate = buildFusionCandidate(srcStoreOpStmt, dstLoadOpStmt); // Fuse computation slice of 'srcLoopNest' into 'dstLoopNest'. + unsigned srcLoopDepth = clSrcLoopDepth.getNumOccurrences() > 0 + ? clSrcLoopDepth + : getLoopDepth(srcStoreOpStmt); + unsigned dstLoopDepth = clDstLoopDepth.getNumOccurrences() > 0 + ? clDstLoopDepth + : getLoopDepth(dstLoadOpStmt); auto *sliceLoopNest = mlir::insertBackwardComputationSlice( - &candidate.srcAccess, &candidate.dstAccess); + &candidate.srcAccess, &candidate.dstAccess, srcLoopDepth, + dstLoopDepth); if (sliceLoopNest != nullptr) { // Remove edges between 'srcNode' and 'dstNode' and remove 'srcNode' mdg->updateEdgesAndRemoveSrcNode(srcNode->id, dstNode->id); diff --git a/mlir/test/Transforms/loop-fusion.mlir b/mlir/test/Transforms/loop-fusion.mlir index a668e181cc19..f26041ed1693 100644 --- a/mlir/test/Transforms/loop-fusion.mlir +++ b/mlir/test/Transforms/loop-fusion.mlir @@ -1,4 +1,5 @@ // RUN: mlir-opt %s -loop-fusion -split-input-file -verify | FileCheck %s +// RUN: mlir-opt %s -loop-fusion -src-loop-depth=1 -dst-loop-depth=1 -split-input-file -verify | FileCheck %s --check-prefix DEPTH1 // TODO(andydavis) Add more tests: // *) Add nested fusion test cases when non-constant loop bound support is @@ -550,3 +551,48 @@ mlfunc @remap_ivs() { return } + +// ----- + +// DEPTH1: #map0 = (d0) -> (d0) +// DEPTH1: #map1 = (d0, d1, d2) -> (d0, d1, d2) + +// DEPTH1-LABEL: mlfunc @fuse_slice_at_depth1() { +mlfunc @fuse_slice_at_depth1() { + %m = alloc() : memref<100x16x100xf32> + + %cf7 = constant 7.0 : f32 + for %i0 = 0 
to 100 { + for %i1 = 0 to 16 { + for %i2 = 0 to 100 { + %a0 = affine_apply (d0, d1, d2) -> (d0, d1, d2) (%i0, %i1, %i2) + store %cf7, %m[%a0#0, %a0#1, %a0#2] : memref<100x16x100xf32> + } + } + } + for %i3 = 0 to 100 { + for %i4 = 0 to 16 { + for %i5 = 0 to 100 { + %a1 = affine_apply (d0, d1, d2) -> (d0, d1, d2) (%i3, %i4, %i5) + %v0 = load %m[%a1#0, %a1#1, %a1#2] : memref<100x16x100xf32> + } + } + } +// DEPTH1: for %i0 = 0 to 100 { +// DEPTH1-NEXT: %1 = affine_apply #map0(%i0) +// DEPTH1-NEXT: for %i1 = 0 to 16 { +// DEPTH1-NEXT: for %i2 = 0 to 100 { +// DEPTH1-NEXT: %2 = affine_apply #map1(%1, %i1, %i2) +// DEPTH1-NEXT: store %cst, %0[%2#0, %2#1, %2#2] : memref<100x16x100xf32> +// DEPTH1-NEXT: } +// DEPTH1-NEXT: } +// DEPTH1-NEXT: for %i3 = 0 to 16 { +// DEPTH1-NEXT: for %i4 = 0 to 100 { +// DEPTH1-NEXT: %3 = affine_apply #map1(%i0, %i3, %i4) +// DEPTH1-NEXT: %4 = load %0[%3#0, %3#1, %3#2] : memref<100x16x100xf32> +// DEPTH1-NEXT: } +// DEPTH1-NEXT: } +// DEPTH1-NEXT: } +// DEPTH1-NEXT: return + return +}