Computation slice update: adds parameters to insertBackwardComputationSlice that specify the source loop nest depth at which to perform iteration space slicing, and the destination loop nest depth at which to insert the computation slice.

Updates the LoopFusion pass to take these parameters as command-line flags for experimentation.
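A minimal example invocation (input file name hypothetical; both flags are registered as hidden cl::opts by the pass, and the new RUN line in the test file exercises them the same way):

  mlir-opt input.mlir -loop-fusion -src-loop-depth=1 -dst-loop-depth=1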

PiperOrigin-RevId: 226514297
Author: MLIR Team, 2018-12-21 11:06:23 -08:00, committed by jpienaar
parent 1e0ebabf66
commit 4eef795a1d
4 changed files with 123 additions and 19 deletions

include/mlir/Analysis/Utils.h

@@ -143,20 +143,19 @@ bool boundCheckLoadOrStoreOp(LoadOrStoreOpPointer loadOrStoreOp,
                              bool emitError = true);
 
-/// Creates a clone of the computation contained in the loop nest surrounding
-/// 'srcAccess', and inserts it at the beginning of the statement block of the
-/// loop containing 'dstAccess'. Returns the top-level loop of the computation
-/// slice on success, returns nullptr otherwise.
-// Computes memref dependence between 'srcAccess' and 'dstAccess' and uses the
-// dependence constraint system to create AffineMaps with which to adjust the
-// loop bounds of the inserted compution slice so that they are functions of the
-// loop IVs and symbols of the loops surrounding 'dstAccess'.
-// TODO(andydavis) Add 'dstLoopDepth' argument for computation slice insertion.
+/// Creates a clone of the computation contained in the loop nest surrounding
+/// 'srcAccess', slices the iteration space of the first 'srcLoopDepth' src loop
+/// IVs, and inserts the computation slice at the beginning of the statement
+/// block of the loop at 'dstLoopDepth' in the loop nest surrounding
+/// 'dstAccess'. Returns the top-level loop of the computation slice on
+/// success, returns nullptr otherwise.
 // Loop depth is a crucial optimization choice that determines where to
 // materialize the results of the backward slice - presenting a trade-off b/w
 // storage and redundant computation in several cases
 // TODO(andydavis) Support computation slices with common surrounding loops.
 ForStmt *insertBackwardComputationSlice(MemRefAccess *srcAccess,
-                                        MemRefAccess *dstAccess);
+                                        MemRefAccess *dstAccess,
+                                        unsigned srcLoopDepth,
+                                        unsigned dstLoopDepth);
 
 } // end namespace mlir
 
 #endif // MLIR_ANALYSIS_UTILS_H
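A minimal sketch of a call site for the updated signature ('srcAccess' and 'dstAccess' stand for MemRefAccess objects built elsewhere; the depth values are illustrative):

  // Slice the iteration space of the first src IV and insert the computation
  // slice at depth 1 of the dst loop nest.
  ForStmt *slice = mlir::insertBackwardComputationSlice(
      &srcAccess, &dstAccess, /*srcLoopDepth=*/1, /*dstLoopDepth=*/1);
  if (slice == nullptr) {
    // Slicing at the requested depths was infeasible, e.g. a sliced src IV
    // depends on a dst IV inner to the insertion point at 'dstLoopDepth'.
  }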

lib/Analysis/Utils.cpp

@@ -28,6 +28,7 @@
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/StandardOps/StandardOps.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
 
 #define DEBUG_TYPE "analysis-utils"
@@ -374,11 +375,14 @@ static Statement *getStmtAtPosition(ArrayRef<unsigned> positions,
   return nullptr;
 }
 
-// TODO(andydavis) Support a 'dstLoopDepth' argument for computation slice
-// insertion (currently the computation slice is inserted at the same
-// loop depth as 'dstAccess.opStmt').
+// Computes memref dependence between 'srcAccess' and 'dstAccess' and uses the
+// dependence constraint system to create AffineMaps with which to adjust the
+// loop bounds of the inserted computation slice so that they are functions of
+// the loop IVs and symbols of the loops surrounding 'dstAccess'.
 ForStmt *mlir::insertBackwardComputationSlice(MemRefAccess *srcAccess,
-                                              MemRefAccess *dstAccess) {
+                                              MemRefAccess *dstAccess,
+                                              unsigned srcLoopDepth,
+                                              unsigned dstLoopDepth) {
   FlatAffineConstraints dependenceConstraints;
   if (!checkMemrefAccessDependence(*srcAccess, *dstAccess, /*loopDepth=*/1,
                                    &dependenceConstraints,
@@ -389,21 +393,32 @@ ForStmt *mlir::insertBackwardComputationSlice(MemRefAccess *srcAccess,
   SmallVector<ForStmt *, 4> srcLoopNest;
   getLoopIVs(*srcAccess->opStmt, &srcLoopNest);
   unsigned srcLoopNestSize = srcLoopNest.size();
+  assert(srcLoopDepth <= srcLoopNestSize);
 
   // Get loop nest surrounding dst operation.
   SmallVector<ForStmt *, 4> dstLoopNest;
   getLoopIVs(*dstAccess->opStmt, &dstLoopNest);
   unsigned dstLoopNestSize = dstLoopNest.size();
+  (void)dstLoopNestSize;
+  assert(dstLoopDepth > 0);
+  assert(dstLoopDepth <= dstLoopNestSize);
 
   // Solve for src IVs in terms of dst IVs, symbols and constants.
   SmallVector<AffineMap, 4> srcIvMaps(srcLoopNestSize, AffineMap::Null());
   std::vector<SmallVector<MLValue *, 2>> srcIvOperands(srcLoopNestSize);
   for (unsigned i = 0; i < srcLoopNestSize; ++i) {
+    // Skip src IVs at or beyond the requested 'srcLoopDepth'.
+    if (i >= srcLoopDepth) {
+      srcIvMaps[i] = AffineMap::Null();
+      continue;
+    }
     auto cst = dependenceConstraints.clone();
     for (int j = srcLoopNestSize - 1; j >= 0; --j) {
       if (i != j)
         cst->projectOut(j);
     }
+    // TODO(andydavis) Check for the case with two equalities where we have
+    // set one IV to a constant. Set a constant IV map for these cases.
     if (cst->getNumEqualities() != 1) {
       srcIvMaps[i] = AffineMap::Null();
       continue;
@@ -412,11 +427,18 @@ ForStmt *mlir::insertBackwardComputationSlice(MemRefAccess *srcAccess,
     SmallVector<unsigned, 2> nonZeroSymbolIds;
     srcIvMaps[i] = cst->toAffineMapFromEq(0, 0, srcAccess->opStmt->getContext(),
                                           &nonZeroDimIds, &nonZeroSymbolIds);
-    if (srcIvMaps[i] == AffineMap::Null())
+    if (srcIvMaps[i] == AffineMap::Null()) {
       continue;
+    }
     // Add operands for all non-zero dst dims and symbols.
     // TODO(andydavis) Add local variable support.
     for (auto dimId : nonZeroDimIds) {
+      if (dimId - 1 >= dstLoopDepth) {
+        // This src IV depends on a dst IV which is inner to the insertion
+        // point at 'dstLoopDepth', so we cannot both slice the iteration
+        // space at 'srcLoopDepth' and insert the slice at 'dstLoopDepth'.
+        return nullptr;
+      }
       srcIvOperands[i].push_back(dstLoopNest[dimId - 1]);
     }
     // TODO(andydavis) Add symbols from the access function. Ideally, we
@@ -429,8 +451,8 @@ ForStmt *mlir::insertBackwardComputationSlice(MemRefAccess *srcAccess,
   findStmtPosition(srcAccess->opStmt, srcLoopNest[0]->getBlock(), &positions);
 
   // Clone src loop nest and insert it at the beginning of the statement block
-  // of the same loop in which containts 'dstAccess->opStmt'.
-  auto *dstForStmt = dstLoopNest[dstLoopNestSize - 1];
+  // of the loop at 'dstLoopDepth' in 'dstLoopNest'.
+  auto *dstForStmt = dstLoopNest[dstLoopDepth - 1];
   MLFuncBuilder b(dstForStmt, dstForStmt->begin());
   DenseMap<const MLValue *, MLValue *> operandMap;
   auto *sliceLoopNest = cast<ForStmt>(b.clone(*srcLoopNest[0], operandMap));
@@ -442,11 +464,14 @@ ForStmt *mlir::insertBackwardComputationSlice(MemRefAccess *srcAccess,
   SmallVector<ForStmt *, 4> sliceSurroundingLoops;
   getLoopIVs(*sliceStmt, &sliceSurroundingLoops);
   unsigned sliceSurroundingLoopsSize = sliceSurroundingLoops.size();
+  (void)sliceSurroundingLoopsSize;
 
   // Update loop bounds for loops in 'sliceLoopNest'.
-  for (unsigned i = dstLoopNestSize; i < sliceSurroundingLoopsSize; ++i) {
+  unsigned sliceLoopLimit = dstLoopDepth + srcLoopNestSize;
+  assert(sliceLoopLimit <= sliceSurroundingLoopsSize);
+  for (unsigned i = dstLoopDepth; i < sliceLoopLimit; ++i) {
     auto *forStmt = sliceSurroundingLoops[i];
-    unsigned index = i - dstLoopNestSize;
+    unsigned index = i - dstLoopDepth;
     AffineMap lbMap = srcIvMaps[index];
     if (lbMap == AffineMap::Null())
       continue;
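Concretely, in the depth-1 test added below (run with -src-loop-depth=1 -dst-loop-depth=1): only the outermost src IV is sliced, and its dependence equality yields the single-result map #map0 = (d0) -> (d0), so in the DEPTH1 output the src IV is recovered from the dst IV as '%1 = affine_apply #map0(%i0)' rather than by a loop over 0..100, while the unsliced src loops, whose 'srcIvMaps' entries were left at AffineMap::Null(), keep their original bounds ('for %i1 = 0 to 16' and 'for %i2 = 0 to 100').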

lib/Transforms/LoopFusion.cpp

@@ -35,12 +35,27 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/SetVector.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/raw_ostream.h"
 
 using llvm::SetVector;
 
 using namespace mlir;
 
+// TODO(andydavis) These flags are global for the pass to be used for
+// experimentation. Find a way to provide more fine-grained control (e.g.
+// depth per loop nest, or depth per load/store op) for this pass utilizing a
+// cost model.
+static llvm::cl::opt<unsigned> clSrcLoopDepth(
+    "src-loop-depth", llvm::cl::Hidden,
+    llvm::cl::desc("Controls the depth of the source loop nest at which "
+                   "to apply loop iteration slicing before fusion."));
+
+static llvm::cl::opt<unsigned> clDstLoopDepth(
+    "dst-loop-depth", llvm::cl::Hidden,
+    llvm::cl::desc("Controls the depth of the destination loop nest at which "
+                   "to fuse the source loop nest slice."));
+
 namespace {
 
 /// Loop fusion pass. This pass currently supports a greedy fusion policy,
@@ -107,6 +122,18 @@ static FusionCandidate buildFusionCandidate(OperationStmt *srcStoreOpStmt,
   return candidate;
 }
 
+// Returns the loop depth of the loop nest surrounding 'opStmt'.
+static unsigned getLoopDepth(OperationStmt *opStmt) {
+  unsigned loopDepth = 0;
+  auto *currStmt = opStmt->getParentStmt();
+  ForStmt *currForStmt;
+  while (currStmt && (currForStmt = dyn_cast<ForStmt>(currStmt))) {
+    ++loopDepth;
+    currStmt = currStmt->getParentStmt();
+  }
+  return loopDepth;
+}
+
 namespace {
 
 // LoopNestStateCollector walks loop nests and collects load and store
@@ -487,8 +514,15 @@ public:
       FusionCandidate candidate =
           buildFusionCandidate(srcStoreOpStmt, dstLoadOpStmt);
       // Fuse computation slice of 'srcLoopNest' into 'dstLoopNest'.
+      unsigned srcLoopDepth = clSrcLoopDepth.getNumOccurrences() > 0
+                                  ? clSrcLoopDepth
+                                  : getLoopDepth(srcStoreOpStmt);
+      unsigned dstLoopDepth = clDstLoopDepth.getNumOccurrences() > 0
+                                  ? clDstLoopDepth
+                                  : getLoopDepth(dstLoadOpStmt);
       auto *sliceLoopNest = mlir::insertBackwardComputationSlice(
-          &candidate.srcAccess, &candidate.dstAccess);
+          &candidate.srcAccess, &candidate.dstAccess, srcLoopDepth,
+          dstLoopDepth);
       if (sliceLoopNest != nullptr) {
         // Remove edges between 'srcNode' and 'dstNode' and remove 'srcNode'
         mdg->updateEdgesAndRemoveSrcNode(srcNode->id, dstNode->id);

test/Transforms/loop-fusion.mlir

@@ -1,4 +1,5 @@
 // RUN: mlir-opt %s -loop-fusion -split-input-file -verify | FileCheck %s
+// RUN: mlir-opt %s -loop-fusion -src-loop-depth=1 -dst-loop-depth=1 -split-input-file -verify | FileCheck %s --check-prefix DEPTH1
 
 // TODO(andydavis) Add more tests:
 // *) Add nested fusion test cases when non-constant loop bound support is
@@ -550,3 +551,48 @@ mlfunc @remap_ivs() {
   return
 }
+
+// -----
+
+// DEPTH1: #map0 = (d0) -> (d0)
+// DEPTH1: #map1 = (d0, d1, d2) -> (d0, d1, d2)
+
+// DEPTH1-LABEL: mlfunc @fuse_slice_at_depth1() {
+mlfunc @fuse_slice_at_depth1() {
+  %m = alloc() : memref<100x16x100xf32>
+  %cf7 = constant 7.0 : f32
+
+  for %i0 = 0 to 100 {
+    for %i1 = 0 to 16 {
+      for %i2 = 0 to 100 {
+        %a0 = affine_apply (d0, d1, d2) -> (d0, d1, d2) (%i0, %i1, %i2)
+        store %cf7, %m[%a0#0, %a0#1, %a0#2] : memref<100x16x100xf32>
+      }
+    }
+  }
+  for %i3 = 0 to 100 {
+    for %i4 = 0 to 16 {
+      for %i5 = 0 to 100 {
+        %a1 = affine_apply (d0, d1, d2) -> (d0, d1, d2) (%i3, %i4, %i5)
+        %v0 = load %m[%a1#0, %a1#1, %a1#2] : memref<100x16x100xf32>
+      }
+    }
+  }
+// DEPTH1:      for %i0 = 0 to 100 {
+// DEPTH1-NEXT:   %1 = affine_apply #map0(%i0)
+// DEPTH1-NEXT:   for %i1 = 0 to 16 {
+// DEPTH1-NEXT:     for %i2 = 0 to 100 {
+// DEPTH1-NEXT:       %2 = affine_apply #map1(%1, %i1, %i2)
+// DEPTH1-NEXT:       store %cst, %0[%2#0, %2#1, %2#2] : memref<100x16x100xf32>
+// DEPTH1-NEXT:     }
+// DEPTH1-NEXT:   }
+// DEPTH1-NEXT:   for %i3 = 0 to 16 {
+// DEPTH1-NEXT:     for %i4 = 0 to 100 {
+// DEPTH1-NEXT:       %3 = affine_apply #map1(%i0, %i3, %i4)
+// DEPTH1-NEXT:       %4 = load %0[%3#0, %3#1, %3#2] : memref<100x16x100xf32>
+// DEPTH1-NEXT:     }
+// DEPTH1-NEXT:   }
+// DEPTH1-NEXT: }
+// DEPTH1-NEXT: return
+  return
+}