Generate dealloc's for the alloc's of dma-generate.

- for the DMA buffers being allocated (and their tags), generate corresponding deallocs
- minor related update to replaceAllMemRefUsesWith and PipelineDataTransfer pass

Code generation for DMA transfers was being done with the initial simplifying
assumption that the alloc's would map to scoped allocations, and so no
deallocations would be necessary. Drop this assumption to generalize. Note that
even with scoped allocations, unrolling loops that contain scoped allocations
could create a series of allocations and exhaust fast memory. Having an
end-of-lifetime marker like a dealloc in fact allows new scopes to be created
if necessary when lowering to a backend, while still utilizing scoped
allocation.
DMA buffers created by -dma-generate are guaranteed to have either
non-overlapping lifetimes or nested lifetimes.
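
For illustration, a minimal sketch of the structure -dma-generate now emits for
an incoming transfer (value names and shapes are hypothetical); the deallocs
close both the buffer's and the tag's lifetimes right after their last use:

  func @incoming_dma_sketch(%arg0: memref<256xf32>) {
    %c0 = constant 0 : index
    %c256 = constant 256 : index
    %buf = alloc() : memref<256xf32, 1>  // fast memory buffer
    %tag = alloc() : memref<1xi32>       // DMA completion tag
    dma_start %arg0[%c0], %buf[%c0], %c256, %tag[%c0] : memref<256xf32>, memref<256xf32, 1>, memref<1xi32>
    dma_wait %tag[%c0], %c256 : memref<1xi32>
    for %i = 0 to 256 {
      %v = load %buf[%i] : memref<256xf32, 1>
    }
    dealloc %tag : memref<1xi32>
    dealloc %buf : memref<256xf32, 1>
    return
  }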

PiperOrigin-RevId: 233502632
Author: Uday Bondhugula (2019-02-11 16:33:53 -08:00), committed by jpienaar
Commit: 8b3f841daf (parent: f5eed89df0)
5 changed files with 68 additions and 22 deletions

include/mlir/Transforms/Utils.h

@@ -40,10 +40,10 @@ class Module;
class Function;
-/// Replaces all uses of oldMemRef with newMemRef while optionally remapping the
-/// old memref's indices using the supplied affine map, 'indexRemap'. The new
-/// memref could be of a different shape or rank. 'extraIndices' provides
-/// additional access indices to be added to the start.
+/// Replaces all "dereferencing" uses of oldMemRef with newMemRef while
+/// optionally remapping the old memref's indices using the supplied affine map,
+/// 'indexRemap'. The new memref could be of a different shape or rank.
+/// 'extraIndices' provides additional access indices to be added to the start.
///
/// 'indexRemap' remaps indices of the old memref access to a new set of indices
/// that are used to index the memref. Additional input operands to indexRemap
@@ -57,9 +57,10 @@ class Function;
/// operations that are dominated by the former; similarly, `postDomInstFilter`
/// restricts replacement to only those operations that are postdominated by it.
///
-/// Returns true on success and false if the replacement is not possible
-/// (whenever a memref is used as an operand in a non-deferencing scenario). See
-/// comments at function definition for an example.
+/// Returns true on success and false if the replacement is not possible,
+/// whenever a memref is used as an operand in a non-dereferencing context,
+/// except for dealloc's on the memref, which are left untouched. See comments
+/// at function definition for an example.
//
// Ex: to replace load %A[%i, %j] with load %Abuf[%t mod 2, %ii - %i, %j]:
// The SSA value corresponding to '%t mod 2' should be in 'extraIndices', and
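
A minimal before/after sketch of the dealloc carve-out (value names and memref
types are hypothetical): dereferencing uses are rewritten to the new memref,
while a dealloc on the old memref is left as is.

  // Before replaceAllMemRefUsesWith(%A, %Abuf, ...):
  %v = load %A[%i, %j] : memref<1024x1024xf32>
  dealloc %A : memref<1024x1024xf32>

  // After (assuming an identity index remapping): the load now targets
  // %Abuf, but the dealloc on %A is intentionally not replaced.
  %v = load %Abuf[%i, %j] : memref<1024x1024xf32, 1>
  dealloc %A : memref<1024x1024xf32>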

lib/Transforms/DmaGeneration.cpp

@@ -59,8 +59,6 @@ namespace {
/// by the latter. Only load op's handled for now.
// TODO(bondhugula): We currently can't generate DMAs correctly when stores are
// strided. Check for strided stores.
-// TODO(mlir-team): we don't insert dealloc's for the DMA buffers; this is thus
-// natural only for scoped allocations.
struct DmaGeneration : public FunctionPass {
explicit DmaGeneration(
unsigned slowMemorySpace = 0, unsigned fastMemorySpace = 1,
@@ -331,10 +329,8 @@ bool DmaGeneration::generateDma(const MemRefRegion &region, Block *block,
Value *fastMemRef;
// Check if a buffer was already created.
-// TODO(bondhugula): union across all memory op's per buffer. For now assuming
-// that multiple memory op's on the same memref have the *same* memory
-// footprint.
-if (fastBufferMap.count(memref) == 0) {
+bool existingBuf = fastBufferMap.count(memref) > 0;
+if (!existingBuf) {
auto fastMemRefType = top.getMemRefType(
fastBufferShape, memRefType.getElementType(), {}, fastMemorySpace);
@@ -358,6 +354,7 @@ bool DmaGeneration::generateDma(const MemRefRegion &region, Block *block,
// Create a tag (single element 1-d memref) for the DMA.
auto tagMemRefType = top.getMemRefType({1}, top.getIntegerType(32));
auto tagMemRef = prologue.create<AllocOp>(loc, tagMemRefType);
auto numElementsSSA =
top.create<ConstantIndexOp>(loc, numElements.getValue());
@@ -397,13 +394,23 @@ bool DmaGeneration::generateDma(const MemRefRegion &region, Block *block,
zeroIndex, stride, numEltPerStride);
// Since new ops are being appended (for outgoing DMAs), adjust the end to
// mark end of range of the original.
-if (*nEnd == end)
-*nEnd = Block::iterator(op->getInstruction());
+*nEnd = Block::iterator(op->getInstruction());
}
// Matching DMA wait to block on completion; tag always has a 0 index.
b->create<DmaWaitOp>(loc, tagMemRef, zeroIndex, numElementsSSA);
+// Generate dealloc for the tag.
+auto tagDeallocOp = epilogue.create<DeallocOp>(loc, tagMemRef);
+if (*nEnd == end)
+// Since new ops are being appended (for outgoing DMAs), adjust the end to
+// mark end of range of the original.
+*nEnd = Block::iterator(tagDeallocOp->getInstruction());
+// Generate dealloc for the DMA buffer.
+if (!existingBuf)
+epilogue.create<DeallocOp>(loc, fastMemRef);
// Replace all uses of the old memref with the faster one while remapping
// access indices (subtracting out lower bound offsets for each dimension).
// Ex: to replace load %A[%i, %j] with load %Abuf[%i - %iT, %j - %jT],
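
A hedged sketch (hypothetical value names) of the epilogue this now emits for
an outgoing DMA: the tag dealloc doubles as the new end-of-range marker, and
the buffer dealloc is emitted only when this call created the buffer:

  dma_start %buf[%c0], %arg0[%c0], %c256, %tag[%c0] : memref<256xf32, 1>, memref<256xf32>, memref<1xi32>
  dma_wait %tag[%c0], %c256 : memref<1xi32>
  dealloc %tag : memref<1xi32>
  // Only if !existingBuf, i.e., the buffer was created for this region:
  dealloc %buf : memref<256xf32, 1>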

lib/Transforms/PipelineDataTransfer.cpp

@@ -124,8 +124,9 @@ static bool doubleBuffer(Value *oldMemRef, OpPointer<AffineForOp> forOp) {
// replaceAllMemRefUsesWith will always succeed unless the forOp body has
// non-dereferencing uses of the memref.
-if (!replaceAllMemRefUsesWith(oldMemRef, newMemRef, {ivModTwoOp}, AffineMap(),
-{}, &*forOp->getBody()->begin())) {
+if (!replaceAllMemRefUsesWith(
+oldMemRef, newMemRef, {ivModTwoOp}, AffineMap(), {},
+/*domInstFilter=*/&*forOp->getBody()->begin())) {
LLVM_DEBUG(llvm::dbgs()
<< "memref replacement for double buffering failed\n";);
ivModTwoOp->getInstruction()->erase();
@@ -284,10 +285,20 @@ PipelineDataTransfer::runOnAffineForOp(OpPointer<AffineForOp> forOp) {
// If the old memref has no more uses, remove its 'dead' alloc if it was
// alloc'ed. (note: DMA buffers are rarely function live-in; but a 'dim'
// operation could have been used on it if it was dynamically shaped in
-// order to create the double buffer above)
-if (oldMemRef->use_empty())
-if (auto *allocInst = oldMemRef->getDefiningInst())
+// order to create the double buffer above.)
+// '-canonicalize' does this in a more general way, but we'll anyway do the
+// simple/common case so that the output / test cases look clear.
+if (auto *allocInst = oldMemRef->getDefiningInst()) {
+if (oldMemRef->use_empty()) {
allocInst->erase();
+} else if (oldMemRef->hasOneUse()) {
+auto *singleUse = oldMemRef->use_begin()->getOwner();
+if (singleUse->isa<DeallocOp>()) {
+singleUse->erase();
+oldMemRef->getDefiningInst()->erase();
+}
+}
+}
}
// Double the buffers for tag memrefs.
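
A minimal before/after sketch (hypothetical values) of this cleanup: once
double buffering has rewritten every dereferencing use, the old buffer's sole
remaining use is its dealloc, so both the dealloc and the alloc are erased.

  // Before the cleanup (loads/stores already retargeted to the double
  // buffer %db):
  %0 = alloc() : memref<256xf32, 1>
  %db = alloc() : memref<2x256xf32, 1>
  dealloc %0 : memref<256xf32, 1>

  // After the cleanup, only the double buffer remains:
  %db = alloc() : memref<2x256xf32, 1>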

lib/Transforms/Utils.cpp

@@ -91,6 +91,11 @@ bool mlir::replaceAllMemRefUsesWith(const Value *oldMemRef, Value *newMemRef,
!postDomInfo->postDominates(postDomInstFilter, opInst))
continue;
+// Skip dealloc's - no replacement is necessary, and a replacement doesn't
+// hurt dealloc's.
+if (opInst->isa<DeallocOp>())
+continue;
// Check if the memref was used in a non-dereferencing context. It is fine for
// the memref to be used in a non-dereferencing way outside of the region
// where this replacement is happening.
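
For contrast, a hedged example (the @consumer callee is hypothetical) of a
non-dereferencing use that does make the replacement fail; unlike a dealloc,
a memref passed as a bare operand cannot be rewritten safely:

  call @consumer(%A) : (memref<256xf32>) -> ()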

test/Transforms/dma-generate.mlir

@@ -40,6 +40,10 @@ func @loop_nest_1d() {
// Already in faster memory space.
// CHECK: %11 = load %2[%i0] : memref<256xf32, 1>
// CHECK-NEXT: }
+// CHECK-NEXT: dealloc %6 : memref<1xi32>
+// CHECK-NEXT: dealloc %5 : memref<256xf32, 1>
+// CHECK-NEXT: dealloc %4 : memref<1xi32>
+// CHECK-NEXT: dealloc %3 : memref<256xf32, 1>
// CHECK-NEXT: return
for %i = 0 to 256 {
load %A[%i] : memref<256 x f32>
@@ -95,6 +99,13 @@ func @loop_nest_1d() {
// OUTGOING DMA for C.
// CHECK-NEXT: dma_start [[BUFC]][%c0, %c0], %arg2[%c0, %c0], %c16384, [[TAGC_W]][%c0] : memref<512x32xf32, 1>, memref<512x32xf32>, memref<1xi32>
// CHECK-NEXT: dma_wait [[TAGC_W]][%c0], %c16384 : memref<1xi32>
+// CHECK-NEXT: dealloc [[TAGC_W]] : memref<1xi32>
+// CHECK-NEXT: dealloc [[TAGC]] : memref<1xi32>
+// CHECK-NEXT: dealloc [[BUFC]] : memref<512x32xf32, 1>
+// CHECK-NEXT: dealloc [[TAGA]] : memref<1xi32>
+// CHECK-NEXT: dealloc [[BUFA]] : memref<512x32xf32, 1>
+// CHECK-NEXT: dealloc [[TAGB]] : memref<1xi32>
+// CHECK-NEXT: dealloc [[BUFB]] : memref<512x32xf32, 1>
// CHECK-NEXT: return
// CHECK-NEXT:}
func @loop_nest_high_d(%A: memref<512 x 32 x f32>,
@@ -144,6 +155,8 @@ func @loop_nest_high_d(%A: memref<512 x 32 x f32>,
// ...
// ...
// CHECK: }
+// CHECK-NEXT: dealloc %3 : memref<1xi32>
+// CHECK-NEXT: dealloc %2 : memref<1x2xf32, 1>
// CHECK-NEXT: }
// CHECK-NEXT: return
func @loop_nest_modulo() {
@@ -183,7 +196,6 @@ func @loop_nest_tiled() -> memref<256x1024xf32> {
}
}
}
-// CHECK: return %0 : memref<256x1024xf32>
return %0 : memref<256x1024xf32>
}
@@ -229,7 +241,7 @@ func @dma_with_symbolic_accesses(%A : memref<100x100xf32>, %M : index) {
// CHECK-NEXT: %6 = load %1[%4, %5] : memref<100x100xf32, 1>
// CHECK-NEXT: }
// CHECK-NEXT: }
-// CHECK-NEXT: return
+// CHECK: return
}
// CHECK-LABEL: func @dma_with_symbolic_loop_bounds
@@ -357,6 +369,9 @@ func @multi_load_store_union() {
// CHECK-NEXT: }
// CHECK-NEXT: dma_start %1[%c0, %c0], %0[%c2, %c2_0], %c170372, %3[%c0], %c512, %c446 : memref<382x446xf32, 1>, memref<512x512xf32>, memref<1xi32>
// CHECK-NEXT: dma_wait %3[%c0], %c170372 : memref<1xi32>
+// CHECK-NEXT: dealloc %3 : memref<1xi32>
+// CHECK-NEXT: dealloc %2 : memref<1xi32>
+// CHECK-NEXT: dealloc %1 : memref<382x446xf32, 1>
// CHECK-NEXT: return
// CHECK-NEXT:}
@@ -385,6 +400,8 @@ func @dma_loop_straightline_interspersed() {
// CHECK-NEXT: dma_start %0[%c0], %1[%c0], %c1_1, %2[%c0] : memref<256xf32>, memref<1xf32, 1>, memref<1xi32>
// CHECK-NEXT: dma_wait %2[%c0], %c1_1 : memref<1xi32>
// CHECK-NEXT: %3 = load %1[%c0_2] : memref<1xf32, 1>
+// CHECK-NEXT: dealloc %2 : memref<1xi32>
+// CHECK-NEXT: dealloc %1 : memref<1xf32, 1>
// CHECK-NEXT: %4 = alloc() : memref<254xf32, 1>
// CHECK-NEXT: %5 = alloc() : memref<1xi32>
// CHECK-NEXT: dma_start %0[%c1_0], %4[%c0], %c254, %5[%c0] : memref<256xf32>, memref<254xf32, 1>, memref<1xi32>
@@ -393,6 +410,8 @@ func @dma_loop_straightline_interspersed() {
// CHECK-NEXT: %6 = affine.apply [[MAP_MINUS_ONE]](%i0)
// CHECK-NEXT: %7 = load %4[%6] : memref<254xf32, 1>
// CHECK-NEXT: }
+// CHECK-NEXT: dealloc %5 : memref<1xi32>
+// CHECK-NEXT: dealloc %4 : memref<254xf32, 1>
// CHECK-NEXT: %8 = alloc() : memref<256xf32, 1>
// CHECK-NEXT: %9 = alloc() : memref<1xi32>
// CHECK-NEXT: dma_start %0[%c0], %8[%c0], %c256, %9[%c0] : memref<256xf32>, memref<256xf32, 1>, memref<1xi32>
@@ -402,6 +421,9 @@ func @dma_loop_straightline_interspersed() {
// CHECK-NEXT: store %11, %8[%c0_2] : memref<256xf32, 1>
// CHECK-NEXT: dma_start %8[%c0], %0[%c0], %c1, %10[%c0] : memref<256xf32, 1>, memref<256xf32>, memref<1xi32>
// CHECK-NEXT: dma_wait %10[%c0], %c1 : memref<1xi32>
+// CHECK-NEXT: dealloc %10 : memref<1xi32>
+// CHECK-NEXT: dealloc %9 : memref<1xi32>
+// CHECK-NEXT: dealloc %8 : memref<256xf32, 1>
// CHECK-NEXT: return
// -----