[mlir] Add a simplifying wrapper for generateCopy and expose it.

Summary:
affineDataCopyGenerate is a monolithic function that
combines several steps for good reasons, but it makes customizing
the behavior even harder. The two major steps performed by affineDataCopyGenerate are:
a) Identify interesting memrefs and collect their uses.
b) Create new buffers to forward these uses.

Step (a) actually requires tremendous customization options. One could see
that from the recently added filterMemRef parameter.

This patch adds a function that only does (b), in the hope that (a)
can be directly implemented by the callers. In fact, (a) is quite
simple if the caller has only one buffer to consider, or even one use.

Differential Revision: https://reviews.llvm.org/D75965
This commit is contained in:
Tim Shen 2020-03-10 15:32:53 -07:00
parent d4757a6cf1
commit d00f5632f3
4 changed files with 84 additions and 7 deletions

View File

@ -24,6 +24,7 @@ class AffineForOp;
class FuncOp;
class OpBuilder;
class Value;
struct MemRefRegion;
namespace loop {
class ForOp;
@ -185,6 +186,34 @@ uint64_t affineDataCopyGenerate(AffineForOp forOp,
Optional<Value> filterMemRef,
DenseSet<Operation *> &copyNests);
/// Result for calling generateCopyForMemRegion.
///
/// All members carry default initializers so that a default-constructed
/// result is well-defined even when the call returns before filling in every
/// field (e.g. on failure).
struct CopyGenerateResult {
  // Number of bytes used by alloc.
  uint64_t sizeInBytes = 0;
  // The newly created buffer allocation.
  Operation *alloc = nullptr;
  // Generated loop nest for copying data between the allocated buffer and the
  // original memref. Null when no copy nest was generated.
  Operation *copyNest = nullptr;
};
/// generateCopyForMemRegion is similar to affineDataCopyGenerate, but works
/// with a single memref region. `memrefRegion` is supposed to contain analysis
/// information within analyzedOp. The generated prologue and epilogue always
/// surround `analyzedOp`.
///
/// Note that `analyzedOp` is a single op for API convenience, and the
/// [begin, end) version can be added as needed.
///
/// Also note that certain options in `copyOptions` aren't looked at anymore,
/// like slowMemorySpace.
///
/// On success, `result` is populated: `sizeInBytes` with the size reported by
/// the copy generation, `alloc` with the operation defining the new buffer
/// for `memrefRegion.memref`, and `copyNest` with the generated copy nest (or
/// null if none was generated). If copy generation fails, that failure is
/// returned and `result.alloc` / `result.copyNest` are left unwritten.
LogicalResult generateCopyForMemRegion(const MemRefRegion &memrefRegion,
Operation *analyzedOp,
const AffineCopyOptions &copyOptions,
CopyGenerateResult &result);
/// Tile a nest of standard for loops rooted at `rootForOp` by finding such
/// parametric tile sizes that the outer loops have a fixed number of iterations
/// as defined in `sizes`.

View File

@ -1797,6 +1797,28 @@ uint64_t mlir::affineDataCopyGenerate(AffineForOp forOp,
filterMemRef, copyNests);
}
/// Generates the buffer and copy code for the single region `memrefRegion`
/// around `analyzedOp`, and reports what was created through `result`.
///
/// Returns the failure from generateCopy if copy generation fails, and also
/// returns failure if generateCopy succeeded without recording a buffer for
/// `memrefRegion.memref`. On success, `result.alloc` is the op defining that
/// buffer and `result.copyNest` is the generated copy nest, or null if there
/// was none.
LogicalResult mlir::generateCopyForMemRegion(
    const MemRefRegion &memrefRegion, Operation *analyzedOp,
    const AffineCopyOptions &copyOptions, CopyGenerateResult &result) {
  Block *block = analyzedOp->getBlock();
  // The range handed to generateCopy is the one-op range
  // [analyzedOp, next(analyzedOp)).
  auto begin = analyzedOp->getIterator();
  auto end = std::next(begin);
  DenseMap<Value, Value> fastBufferMap;
  DenseSet<Operation *> copyNests;

  // `begin`/`end` are additionally passed by address; whatever generateCopy
  // writes back into them is not used afterwards.
  auto err = generateCopy(memrefRegion, block, begin, end, block, begin, end,
                          copyOptions, fastBufferMap, copyNests,
                          &result.sizeInBytes, &begin, &end);
  if (failed(err))
    return err;

  // Do not assume a successful generateCopy registered a buffer for this
  // memref: dereferencing the end() iterator would be undefined behaviour.
  // NOTE(review): presumably this happens when there is nothing to copy --
  // confirm against generateCopy.
  auto bufferIt = fastBufferMap.find(memrefRegion.memref);
  if (bufferIt == fastBufferMap.end())
    return failure();
  result.alloc = bufferIt->second.getDefiningOp();

  assert(copyNests.size() <= 1 && "At most one copy nest is expected.");
  result.copyNest = copyNests.empty() ? nullptr : *copyNests.begin();
  return success();
}
/// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
static void
gatherLoopsInBlock(Block *block, unsigned currLoopDepth,

View File

@ -6,7 +6,8 @@
// affine data copy utility on the input loop nest.
// '-test-affine-data-copy-memref-filter' passes the first memref found in an
// affine.load op in the innermost loop as a filter.
// RUN: mlir-opt %s -split-input-file -test-affine-data-copy='memref-filter=1' | FileCheck %s --check-prefix=FILTER
// RUN: mlir-opt %s -split-input-file -test-affine-data-copy='memref-filter' | FileCheck %s --check-prefix=FILTER
// RUN: mlir-opt %s -split-input-file -test-affine-data-copy='for-memref-region' | FileCheck %s --check-prefix=MEMREF_REGION
// -copy-skip-non-stride-loops forces the copies to be placed right inside the
// tile space loops, avoiding the sensitivity of copy placement depth to memory
@ -140,6 +141,7 @@ func @matmul(%A: memref<4096x4096xf32>, %B: memref<4096x4096xf32>, %C: memref<40
//
// CHECK-SMALL-LABEL: func @foo
// FILTER-LABEL: func @foo
// MEMREF_REGION-LABEL: func @foo
func @foo(%arg0: memref<1024x1024xf32>, %arg1: memref<1024x1024xf32>, %arg2: memref<1024x1024xf32>) -> memref<1024x1024xf32> {
affine.for %i = 0 to 1024 {
affine.for %j = 0 to 1024 {
@ -198,3 +200,15 @@ func @foo(%arg0: memref<1024x1024xf32>, %arg1: memref<1024x1024xf32>, %arg2: mem
// FILTER-NEXT: affine.for %{{.*}} = 0 to 1024 {
// FILTER: dealloc %{{.*}} : memref<1024x1024xf32>
// FILTER-NOT: dealloc
// Check that only one memref is copied, because for-memref-region is enabled
// (and the first ever encountered load is analyzed).
// MEMREF_REGION: alloc() : memref<1024x1024xf32>
// MEMREF_REGION-NOT: alloc()
// MEMREF_REGION: affine.for %{{.*}} = 0 to 1024 {
// MEMREF_REGION: affine.for %{{.*}} = 0 to 1024 {
// MEMREF_REGION: affine.for %{{.*}} = 0 to 1024 {
// MEMREF_REGION-NEXT: affine.for %{{.*}} = 0 to 1024 {
// MEMREF_REGION-NEXT: affine.for %{{.*}} = 0 to 1024 {
// MEMREF_REGION: dealloc %{{.*}} : memref<1024x1024xf32>
// MEMREF_REGION-NOT: dealloc

View File

@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "mlir/Analysis/Passes.h"
#include "mlir/Analysis/Utils.h"
#include "mlir/Dialect/AffineOps/AffineOps.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/LoopUtils.h"
@ -37,6 +38,10 @@ private:
llvm::cl::desc(
"Enable memref filter testing in affine data copy optimization"),
llvm::cl::init(false)};
// Pass option "for-memref-region" (off by default). When set and the memref
// filter option is not, runOnFunction exercises generateCopyForMemRegion on
// the region computed from the first affine load found in the innermost loop,
// instead of calling affineDataCopyGenerate.
Option<bool> clTestGenerateCopyForMemRegion{
*this, "for-memref-region",
llvm::cl::desc("Test copy generation for a single memref region"),
llvm::cl::init(false)};
};
} // end anonymous namespace
@ -55,13 +60,13 @@ void TestAffineDataCopy::runOnFunction() {
auto loopNest = depthToLoops[0][0];
auto innermostLoop = depthToLoops[innermostLoopIdx][0];
Optional<Value> memrefFilter;
if (clMemRefFilter) {
AffineLoadOp load;
if (clMemRefFilter || clTestGenerateCopyForMemRegion) {
// Gather MemRef filter. For simplicity, we use the first loaded memref
// found in the innermost loop.
for (auto &op : *innermostLoop.getBody()) {
if (auto load = dyn_cast<AffineLoadOp>(op)) {
memrefFilter = load.getMemRef();
if (auto ld = dyn_cast<AffineLoadOp>(op)) {
load = ld;
break;
}
}
@ -72,8 +77,15 @@ void TestAffineDataCopy::runOnFunction() {
/*fastMemorySpace=*/0,
/*tagMemorySpace=*/0,
/*fastMemCapacityBytes=*/32 * 1024 * 1024UL};
DenseSet<Operation *> copyNests;
affineDataCopyGenerate(loopNest, copyOptions, memrefFilter, copyNests);
if (clMemRefFilter) {
DenseSet<Operation *> copyNests;
affineDataCopyGenerate(loopNest, copyOptions, load.getMemRef(), copyNests);
} else if (clTestGenerateCopyForMemRegion) {
CopyGenerateResult result;
MemRefRegion region(loopNest.getLoc());
region.compute(load, /*loopDepth=*/0);
generateCopyForMemRegion(region, loopNest, copyOptions, result);
}
}
namespace mlir {