Computation slice update: adds parameters to insertBackwardComputationSlice which specify the source loop nest depth at which to perform iteration space slicing, and the destination loop nest depth at which to insert the computation slice.

Updates LoopFusion pass to take these parameters as command line flags for experimentation.

PiperOrigin-RevId: 226514297
MLIR Team 2018-12-21 11:06:23 -08:00 committed by jpienaar
parent 1e0ebabf66
commit 4eef795a1d
4 changed files with 123 additions and 19 deletions

View File

@@ -143,20 +143,19 @@ bool boundCheckLoadOrStoreOp(LoadOrStoreOpPointer loadOrStoreOp,
bool emitError = true);
/// Creates a clone of the computation contained in the loop nest surrounding
/// 'srcAccess', and inserts it at the beginning of the statement block of the
/// loop containing 'dstAccess'. Returns the top-level loop of the computation
/// slice on success, returns nullptr otherwise.
// Computes memref dependence between 'srcAccess' and 'dstAccess' and uses the
// dependence constraint system to create AffineMaps with which to adjust the
// loop bounds of the inserted computation slice so that they are functions of the
// loop IVs and symbols of the loops surrounding 'dstAccess'.
// TODO(andydavis) Add 'dstLoopDepth' argument for computation slice insertion.
/// 'srcAccess', slices the iteration space of the first 'srcLoopDepth' src loop
/// IVs, and inserts the computation slice at the beginning of the statement
/// block of the loop at 'dstLoopDepth' in the loop nest surrounding
/// 'dstAccess'. Returns the top-level loop of the computation slice on
/// success, returns nullptr otherwise.
// Loop depth is a crucial optimization choice that determines where to
// materialize the results of the backward slice, presenting a trade-off
// between storage and redundant computation in several cases.
// TODO(andydavis) Support computation slices with common surrounding loops.
ForStmt *insertBackwardComputationSlice(MemRefAccess *srcAccess,
MemRefAccess *dstAccess);
MemRefAccess *dstAccess,
unsigned srcLoopDepth,
unsigned dstLoopDepth);
} // end namespace mlir
#endif // MLIR_ANALYSIS_UTILS_H
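
Example usage of the updated entry point (a minimal sketch, not code from this commit; it assumes 'srcAccess' and 'dstAccess' have already been built from a store and a load on the same memref, mirroring the LoopFusion call site later in this diff):

  #include "mlir/Analysis/Utils.h"

  // Slice only the outermost src IV (depth 1) and insert the slice at the
  // beginning of the outermost dst loop's body (depth 1). Returns nullptr
  // if no dependence exists or the requested depths are infeasible.
  ForStmt *sliceLoopNest = mlir::insertBackwardComputationSlice(
      &srcAccess, &dstAccess, /*srcLoopDepth=*/1, /*dstLoopDepth=*/1);
  if (sliceLoopNest == nullptr) {
    // Skip fusion for this candidate.
  }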

View File

@@ -28,6 +28,7 @@
#include "mlir/IR/BuiltinOps.h"
#include "mlir/StandardOps/StandardOps.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#define DEBUG_TYPE "analysis-utils"
@@ -374,11 +375,14 @@ static Statement *getStmtAtPosition(ArrayRef<unsigned> positions,
return nullptr;
}
// TODO(andydavis) Support a 'dstLoopDepth' argument for computation slice
// insertion (currently the computation slice is inserted at the same
// loop depth as 'dstAccess.opStmt').
// Computes memref dependence between 'srcAccess' and 'dstAccess' and uses the
// dependence constraint system to create AffineMaps with which to adjust the
// loop bounds of the inserted computation slice so that they are functions of the
// loop IVs and symbols of the loops surrounding 'dstAccess'.
ForStmt *mlir::insertBackwardComputationSlice(MemRefAccess *srcAccess,
MemRefAccess *dstAccess) {
MemRefAccess *dstAccess,
unsigned srcLoopDepth,
unsigned dstLoopDepth) {
FlatAffineConstraints dependenceConstraints;
if (!checkMemrefAccessDependence(*srcAccess, *dstAccess, /*loopDepth=*/1,
&dependenceConstraints,
@@ -389,21 +393,32 @@ ForStmt *mlir::insertBackwardComputationSlice(MemRefAccess *srcAccess,
SmallVector<ForStmt *, 4> srcLoopNest;
getLoopIVs(*srcAccess->opStmt, &srcLoopNest);
unsigned srcLoopNestSize = srcLoopNest.size();
assert(srcLoopDepth <= srcLoopNestSize);
// Get loop nest surrounding dst operation.
SmallVector<ForStmt *, 4> dstLoopNest;
getLoopIVs(*dstAccess->opStmt, &dstLoopNest);
unsigned dstLoopNestSize = dstLoopNest.size();
(void)dstLoopNestSize;
assert(dstLoopDepth > 0);
assert(dstLoopDepth <= dstLoopNestSize);
// Solve for src IVs in terms of dst IVs, symbols and constants.
SmallVector<AffineMap, 4> srcIvMaps(srcLoopNestSize, AffineMap::Null());
std::vector<SmallVector<MLValue *, 2>> srcIvOperands(srcLoopNestSize);
for (unsigned i = 0; i < srcLoopNestSize; ++i) {
// Skip IVs at depths beyond the requested 'srcLoopDepth'.
if (i >= srcLoopDepth) {
srcIvMaps[i] = AffineMap::Null();
continue;
}
auto cst = dependenceConstraints.clone();
for (int j = srcLoopNestSize - 1; j >= 0; --j) {
if (i != j)
cst->projectOut(j);
}
// TODO(andydavis) Check for the case with two equalities where we have
// set one IV to a constant. Set a constant IV map for these cases.
if (cst->getNumEqualities() != 1) {
srcIvMaps[i] = AffineMap::Null();
continue;
@@ -412,11 +427,18 @@ ForStmt *mlir::insertBackwardComputationSlice(MemRefAccess *srcAccess,
SmallVector<unsigned, 2> nonZeroSymbolIds;
srcIvMaps[i] = cst->toAffineMapFromEq(0, 0, srcAccess->opStmt->getContext(),
&nonZeroDimIds, &nonZeroSymbolIds);
if (srcIvMaps[i] == AffineMap::Null())
if (srcIvMaps[i] == AffineMap::Null()) {
continue;
}
// Add operands for all non-zero dst dims and symbols.
// TODO(andydavis) Add local variable support.
for (auto dimId : nonZeroDimIds) {
if (dimId - 1 >= dstLoopDepth) {
// This src IV depends on a dst IV deeper than 'dstLoopDepth', the depth
// at which the slice will be inserted, so we cannot both slice the
// iteration space at 'srcLoopDepth' and insert the slice into the dst
// loop nest at 'dstLoopDepth'.
return nullptr;
}
srcIvOperands[i].push_back(dstLoopNest[dimId - 1]);
}
// TODO(andydavis) Add symbols from the access function. Ideally, we
@@ -429,8 +451,8 @@ ForStmt *mlir::insertBackwardComputationSlice(MemRefAccess *srcAccess,
findStmtPosition(srcAccess->opStmt, srcLoopNest[0]->getBlock(), &positions);
// Clone src loop nest and insert it at the beginning of the statement block
// of the same loop which contains 'dstAccess->opStmt'.
auto *dstForStmt = dstLoopNest[dstLoopNestSize - 1];
// of the loop at 'dstLoopDepth' in 'dstLoopNest'.
auto *dstForStmt = dstLoopNest[dstLoopDepth - 1];
MLFuncBuilder b(dstForStmt, dstForStmt->begin());
DenseMap<const MLValue *, MLValue *> operandMap;
auto *sliceLoopNest = cast<ForStmt>(b.clone(*srcLoopNest[0], operandMap));
@@ -442,11 +464,14 @@ ForStmt *mlir::insertBackwardComputationSlice(MemRefAccess *srcAccess,
SmallVector<ForStmt *, 4> sliceSurroundingLoops;
getLoopIVs(*sliceStmt, &sliceSurroundingLoops);
unsigned sliceSurroundingLoopsSize = sliceSurroundingLoops.size();
(void)sliceSurroundingLoopsSize;
// Update loop bounds for loops in 'sliceLoopNest'.
for (unsigned i = dstLoopNestSize; i < sliceSurroundingLoopsSize; ++i) {
unsigned sliceLoopLimit = dstLoopDepth + srcLoopNestSize;
assert(sliceLoopLimit <= sliceSurroundingLoopsSize);
for (unsigned i = dstLoopDepth; i < sliceLoopLimit; ++i) {
auto *forStmt = sliceSurroundingLoops[i];
unsigned index = i - dstLoopNestSize;
unsigned index = i - dstLoopDepth;
AffineMap lbMap = srcIvMaps[index];
if (lbMap == AffineMap::Null())
continue;
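
To make the bound computation concrete, here is a worked sketch based on the new test case at the end of this commit (illustration only, not code from this change): the store at (%i0, %i1, %i2) and the load at (%i3, %i4, %i5) access memref %m through identity maps, so the dependence constraint system contains the equalities i0 = d0, i1 = d1, i2 = d2, where d0..d2 are the dst loop IVs. For src IV i0, projecting out i1 and i2 leaves the single equality i0 - d0 = 0, which toAffineMapFromEq turns into the one-result map (d0) -> (d0) (the '#map0' in the DEPTH1 checks below). With srcLoopDepth = 1 only i0 is sliced this way, and with dstLoopDepth = 1 the cloned nest is inserted at the top of the outermost dst loop %i0's body.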

View File

@@ -35,12 +35,27 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/raw_ostream.h"
using llvm::SetVector;
using namespace mlir;
// TODO(andydavis) These flags are global for the pass to be used for
// experimentation. Find a way to provide more fine-grained control (e.g.
// depth per loop nest, or depth per load/store op) for this pass utilizing a
// cost model.
static llvm::cl::opt<unsigned> clSrcLoopDepth(
"src-loop-depth", llvm::cl::Hidden,
llvm::cl::desc("Controls the depth of the source loop nest at which "
"to apply loop iteration slicing before fusion."));
static llvm::cl::opt<unsigned> clDstLoopDepth(
"dst-loop-depth", llvm::cl::Hidden,
llvm::cl::desc("Controls the depth of the destination loop nest at which "
"to fuse the source loop nest slice."));
namespace {
/// Loop fusion pass. This pass currently supports a greedy fusion policy,
@@ -107,6 +122,18 @@ static FusionCandidate buildFusionCandidate(OperationStmt *srcStoreOpStmt,
return candidate;
}
// Returns the loop depth of the loop nest surrounding 'opStmt'.
static unsigned getLoopDepth(OperationStmt *opStmt) {
unsigned loopDepth = 0;
auto *currStmt = opStmt->getParentStmt();
ForStmt *currForStmt;
while (currStmt && (currForStmt = dyn_cast<ForStmt>(currStmt))) {
++loopDepth;
currStmt = currStmt->getParentStmt();
}
return loopDepth;
}
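For example (matching the test case at the end of this commit): the store in @fuse_slice_at_depth1 is nested under %i0, %i1, and %i2, so getLoopDepth returns 3 for it. When neither flag is set, the pass therefore defaults to slicing all src IVs and inserting at the full dst depth, preserving the previous behavior.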
namespace {
// LoopNestStateCollector walks loop nests and collects load and store
@@ -487,8 +514,15 @@ public:
FusionCandidate candidate =
buildFusionCandidate(srcStoreOpStmt, dstLoadOpStmt);
// Fuse computation slice of 'srcLoopNest' into 'dstLoopNest'.
unsigned srcLoopDepth = clSrcLoopDepth.getNumOccurrences() > 0
? clSrcLoopDepth
: getLoopDepth(srcStoreOpStmt);
unsigned dstLoopDepth = clDstLoopDepth.getNumOccurrences() > 0
? clDstLoopDepth
: getLoopDepth(dstLoadOpStmt);
auto *sliceLoopNest = mlir::insertBackwardComputationSlice(
&candidate.srcAccess, &candidate.dstAccess);
&candidate.srcAccess, &candidate.dstAccess, srcLoopDepth,
dstLoopDepth);
if (sliceLoopNest != nullptr) {
// Remove edges between 'srcNode' and 'dstNode' and remove 'srcNode'
mdg->updateEdgesAndRemoveSrcNode(srcNode->id, dstNode->id);

View File

@@ -1,4 +1,5 @@
// RUN: mlir-opt %s -loop-fusion -split-input-file -verify | FileCheck %s
// RUN: mlir-opt %s -loop-fusion -src-loop-depth=1 -dst-loop-depth=1 -split-input-file -verify | FileCheck %s --check-prefix DEPTH1
// TODO(andydavis) Add more tests:
// *) Add nested fusion test cases when non-constant loop bound support is
@@ -550,3 +551,48 @@ mlfunc @remap_ivs() {
return
}
// -----
// DEPTH1: #map0 = (d0) -> (d0)
// DEPTH1: #map1 = (d0, d1, d2) -> (d0, d1, d2)
// DEPTH1-LABEL: mlfunc @fuse_slice_at_depth1() {
mlfunc @fuse_slice_at_depth1() {
%m = alloc() : memref<100x16x100xf32>
%cf7 = constant 7.0 : f32
for %i0 = 0 to 100 {
for %i1 = 0 to 16 {
for %i2 = 0 to 100 {
%a0 = affine_apply (d0, d1, d2) -> (d0, d1, d2) (%i0, %i1, %i2)
store %cf7, %m[%a0#0, %a0#1, %a0#2] : memref<100x16x100xf32>
}
}
}
for %i3 = 0 to 100 {
for %i4 = 0 to 16 {
for %i5 = 0 to 100 {
%a1 = affine_apply (d0, d1, d2) -> (d0, d1, d2) (%i3, %i4, %i5)
%v0 = load %m[%a1#0, %a1#1, %a1#2] : memref<100x16x100xf32>
}
}
}
// DEPTH1: for %i0 = 0 to 100 {
// DEPTH1-NEXT: %1 = affine_apply #map0(%i0)
// DEPTH1-NEXT: for %i1 = 0 to 16 {
// DEPTH1-NEXT: for %i2 = 0 to 100 {
// DEPTH1-NEXT: %2 = affine_apply #map1(%1, %i1, %i2)
// DEPTH1-NEXT: store %cst, %0[%2#0, %2#1, %2#2] : memref<100x16x100xf32>
// DEPTH1-NEXT: }
// DEPTH1-NEXT: }
// DEPTH1-NEXT: for %i3 = 0 to 16 {
// DEPTH1-NEXT: for %i4 = 0 to 100 {
// DEPTH1-NEXT: %3 = affine_apply #map1(%i0, %i3, %i4)
// DEPTH1-NEXT: %4 = load %0[%3#0, %3#1, %3#2] : memref<100x16x100xf32>
// DEPTH1-NEXT: }
// DEPTH1-NEXT: }
// DEPTH1-NEXT: }
// DEPTH1-NEXT: return
return
}