forked from OSchip/llvm-project
[mlir] Add a simplifying wrapper for generateCopy and expose it.
Summary: affineDataCopyGenerate is a monolithic function that combines several steps for good reasons, but it makes customizing the behavior even harder. The two major steps performed by affineDataCopyGenerate are: a) Identify interesting memrefs and collect their uses. b) Create new buffers to forward these uses. Step (a) actually requires tremendous customization options. One could see that from the recently added filterMemRef parameter. This patch adds a function that only does (b), in the hope that (a) can be directly implemented by the callers. In fact, (a) is quite simple if the caller has only one buffer to consider, or even one use. Differential Revision: https://reviews.llvm.org/D75965
This commit is contained in:
parent
d4757a6cf1
commit
d00f5632f3
|
@ -24,6 +24,7 @@ class AffineForOp;
|
|||
class FuncOp;
|
||||
class OpBuilder;
|
||||
class Value;
|
||||
struct MemRefRegion;
|
||||
|
||||
namespace loop {
|
||||
class ForOp;
|
||||
|
@ -185,6 +186,34 @@ uint64_t affineDataCopyGenerate(AffineForOp forOp,
|
|||
Optional<Value> filterMemRef,
|
||||
DenseSet<Operation *> &copyNests);
|
||||
|
||||
/// Result for calling generateCopyForMemRegion.
|
||||
struct CopyGenerateResult {
|
||||
// Number of bytes used by alloc.
|
||||
uint64_t sizeInBytes;
|
||||
|
||||
// The newly created buffer allocation.
|
||||
Operation *alloc;
|
||||
|
||||
// Generated loop nest for copying data between the allocated buffer and the
|
||||
// original memref.
|
||||
Operation *copyNest;
|
||||
};
|
||||
|
||||
/// generateCopyForMemRegion is similar to affineDataCopyGenerate, but works
|
||||
/// with a single memref region. `memrefRegion` is supposed to contain analysis
|
||||
/// information within analyzedOp. The generated prologue and epilogue always
|
||||
/// surround `analyzedOp`.
|
||||
///
|
||||
/// Note that `analyzedOp` is a single op for API convenience, and the
|
||||
/// [begin, end) version can be added as needed.
|
||||
///
|
||||
/// Also note that certain options in `copyOptions` aren't looked at anymore,
|
||||
/// like slowMemorySpace.
|
||||
LogicalResult generateCopyForMemRegion(const MemRefRegion &memrefRegion,
|
||||
Operation *analyzedOp,
|
||||
const AffineCopyOptions &copyOptions,
|
||||
CopyGenerateResult &result);
|
||||
|
||||
/// Tile a nest of standard for loops rooted at `rootForOp` by finding such
|
||||
/// parametric tile sizes that the outer loops have a fixed number of iterations
|
||||
/// as defined in `sizes`.
|
||||
|
|
|
@ -1797,6 +1797,28 @@ uint64_t mlir::affineDataCopyGenerate(AffineForOp forOp,
|
|||
filterMemRef, copyNests);
|
||||
}
|
||||
|
||||
/// Generates the copy for the single region `memrefRegion`, using the
/// one-operation range [analyzedOp, next(analyzedOp)) of `analyzedOp`'s block
/// both as the range handed to generateCopy for rewriting and as the placement
/// range for the generated copy code.
///
/// On success, `result` holds the size in bytes reported by generateCopy, the
/// operation defining the fast buffer recorded for `memrefRegion.memref`, and
/// the generated copy nest (null if none was generated).
///
/// Returns failure if generateCopy fails, or if it succeeds without recording
/// a fast buffer for `memrefRegion.memref`.
LogicalResult mlir::generateCopyForMemRegion(
    const MemRefRegion &memrefRegion, Operation *analyzedOp,
    const AffineCopyOptions &copyOptions, CopyGenerateResult &result) {
  Block *block = analyzedOp->getBlock();
  auto begin = analyzedOp->getIterator();
  auto end = std::next(begin);
  DenseMap<Value, Value> fastBufferMap;
  DenseSet<Operation *> copyNests;

  auto err = generateCopy(memrefRegion, block, begin, end, block, begin, end,
                          copyOptions, fastBufferMap, copyNests,
                          &result.sizeInBytes, &begin, &end);
  if (failed(err))
    return err;

  // Do not dereference the lookup result blindly: if no fast buffer was
  // recorded for this memref, `find` returns end() and reading `->second`
  // would be undefined behavior. Report failure to the caller instead.
  auto fastBufferIt = fastBufferMap.find(memrefRegion.memref);
  if (fastBufferIt == fastBufferMap.end())
    return failure();
  result.alloc = fastBufferIt->second.getDefiningOp();

  assert(copyNests.size() <= 1 && "At most one copy nest is expected.");
  result.copyNest = copyNests.empty() ? nullptr : *copyNests.begin();
  return success();
}
|
||||
|
||||
/// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
|
||||
static void
|
||||
gatherLoopsInBlock(Block *block, unsigned currLoopDepth,
|
||||
|
|
|
@ -6,7 +6,8 @@
|
|||
// affine data copy utility on the input loop nest.
|
||||
// '-test-affine-data-copy-memref-filter' passes the first memref found in an
|
||||
// affine.load op in the innermost loop as a filter.
|
||||
// RUN: mlir-opt %s -split-input-file -test-affine-data-copy='memref-filter=1' | FileCheck %s --check-prefix=FILTER
|
||||
// RUN: mlir-opt %s -split-input-file -test-affine-data-copy='memref-filter' | FileCheck %s --check-prefix=FILTER
|
||||
// RUN: mlir-opt %s -split-input-file -test-affine-data-copy='for-memref-region' | FileCheck %s --check-prefix=MEMREF_REGION
|
||||
|
||||
// -copy-skip-non-stride-loops forces the copies to be placed right inside the
|
||||
// tile space loops, avoiding the sensitivity of copy placement depth to memory
|
||||
|
@ -140,6 +141,7 @@ func @matmul(%A: memref<4096x4096xf32>, %B: memref<4096x4096xf32>, %C: memref<40
|
|||
//
|
||||
// CHECK-SMALL-LABEL: func @foo
|
||||
// FILTER-LABEL: func @foo
|
||||
// MEMREF_REGION-LABEL: func @foo
|
||||
func @foo(%arg0: memref<1024x1024xf32>, %arg1: memref<1024x1024xf32>, %arg2: memref<1024x1024xf32>) -> memref<1024x1024xf32> {
|
||||
affine.for %i = 0 to 1024 {
|
||||
affine.for %j = 0 to 1024 {
|
||||
|
@ -198,3 +200,15 @@ func @foo(%arg0: memref<1024x1024xf32>, %arg1: memref<1024x1024xf32>, %arg2: mem
|
|||
// FILTER-NEXT: affine.for %{{.*}} = 0 to 1024 {
|
||||
// FILTER: dealloc %{{.*}} : memref<1024x1024xf32>
|
||||
// FILTER-NOT: dealloc
|
||||
|
||||
// Check that only one memref is copied, because for-memref-region is enabled
|
||||
// (and the first ever encountered load is analyzed).
|
||||
// MEMREF_REGION: alloc() : memref<1024x1024xf32>
|
||||
// MEMREF_REGION-NOT: alloc()
|
||||
// MEMREF_REGION: affine.for %{{.*}} = 0 to 1024 {
|
||||
// MEMREF_REGION: affine.for %{{.*}} = 0 to 1024 {
|
||||
// MEMREF_REGION: affine.for %{{.*}} = 0 to 1024 {
|
||||
// MEMREF_REGION-NEXT: affine.for %{{.*}} = 0 to 1024 {
|
||||
// MEMREF_REGION-NEXT: affine.for %{{.*}} = 0 to 1024 {
|
||||
// MEMREF_REGION: dealloc %{{.*}} : memref<1024x1024xf32>
|
||||
// MEMREF_REGION-NOT: dealloc
|
||||
|
|
|
@ -12,6 +12,7 @@
|
|||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "mlir/Analysis/Passes.h"
|
||||
#include "mlir/Analysis/Utils.h"
|
||||
#include "mlir/Dialect/AffineOps/AffineOps.h"
|
||||
#include "mlir/Pass/Pass.h"
|
||||
#include "mlir/Transforms/LoopUtils.h"
|
||||
|
@ -37,6 +38,10 @@ private:
|
|||
llvm::cl::desc(
|
||||
"Enable memref filter testing in affine data copy optimization"),
|
||||
llvm::cl::init(false)};
|
||||
Option<bool> clTestGenerateCopyForMemRegion{
|
||||
*this, "for-memref-region",
|
||||
llvm::cl::desc("Test copy generation for a single memref region"),
|
||||
llvm::cl::init(false)};
|
||||
};
|
||||
|
||||
} // end anonymous namespace
|
||||
|
@ -55,13 +60,13 @@ void TestAffineDataCopy::runOnFunction() {
|
|||
|
||||
auto loopNest = depthToLoops[0][0];
|
||||
auto innermostLoop = depthToLoops[innermostLoopIdx][0];
|
||||
Optional<Value> memrefFilter;
|
||||
if (clMemRefFilter) {
|
||||
AffineLoadOp load;
|
||||
if (clMemRefFilter || clTestGenerateCopyForMemRegion) {
|
||||
// Gather MemRef filter. For simplicity, we use the first loaded memref
|
||||
// found in the innermost loop.
|
||||
for (auto &op : *innermostLoop.getBody()) {
|
||||
if (auto load = dyn_cast<AffineLoadOp>(op)) {
|
||||
memrefFilter = load.getMemRef();
|
||||
if (auto ld = dyn_cast<AffineLoadOp>(op)) {
|
||||
load = ld;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -72,8 +77,15 @@ void TestAffineDataCopy::runOnFunction() {
|
|||
/*fastMemorySpace=*/0,
|
||||
/*tagMemorySpace=*/0,
|
||||
/*fastMemCapacityBytes=*/32 * 1024 * 1024UL};
|
||||
DenseSet<Operation *> copyNests;
|
||||
affineDataCopyGenerate(loopNest, copyOptions, memrefFilter, copyNests);
|
||||
if (clMemRefFilter) {
|
||||
DenseSet<Operation *> copyNests;
|
||||
affineDataCopyGenerate(loopNest, copyOptions, load.getMemRef(), copyNests);
|
||||
} else if (clTestGenerateCopyForMemRegion) {
|
||||
CopyGenerateResult result;
|
||||
MemRefRegion region(loopNest.getLoc());
|
||||
region.compute(load, /*loopDepth=*/0);
|
||||
generateCopyForMemRegion(region, loopNest, copyOptions, result);
|
||||
}
|
||||
}
|
||||
|
||||
namespace mlir {
|
||||
|
|
Loading…
Reference in New Issue