Minor updates + cleanup to dma-generate

- switch some debug info to emitError
- use a single constant op for zero index to make it easier to write/update
  test cases; avoid creating new constant ops for common zero index cases
- test case cleanup

This is in preparation for an upcoming major update to this pass.

PiperOrigin-RevId: 230728379
This commit is contained in:
Uday Bondhugula 2019-01-24 08:43:17 -08:00 committed by jpienaar
parent f319bbbd28
commit 72e5c7f428
2 changed files with 54 additions and 42 deletions

View File

@ -81,8 +81,8 @@ struct DmaGeneration : public FunctionPass {
// Minimum DMA transfer size supported by the target in bytes. // Minimum DMA transfer size supported by the target in bytes.
const int minDmaTransferSize; const int minDmaTransferSize;
// The loop level at which DMAs should be generated. '0' is an outermost loop. // Constant zero index to avoid too many duplicates.
unsigned dmaDepth; Value *zeroIndex = nullptr;
static char passID; static char passID;
}; };
@ -166,8 +166,6 @@ bool DmaGeneration::generateDma(const MemRefRegion &region, ForInst *forInst,
// Indices for the faster buffer being DMAed into/from. // Indices for the faster buffer being DMAed into/from.
SmallVector<Value *, 4> bufIndices; SmallVector<Value *, 4> bufIndices;
Value *zeroIndex = top.create<ConstantIndexOp>(loc, 0);
unsigned rank = memRefType.getRank(); unsigned rank = memRefType.getRank();
SmallVector<int64_t, 4> fastBufferShape; SmallVector<int64_t, 4> fastBufferShape;
@ -216,8 +214,13 @@ bool DmaGeneration::generateDma(const MemRefRegion &region, ForInst *forInst,
// Set DMA start location for this dimension in the lower memory space // Set DMA start location for this dimension in the lower memory space
// memref. // memref.
if (auto caf = offset.dyn_cast<AffineConstantExpr>()) { if (auto caf = offset.dyn_cast<AffineConstantExpr>()) {
memIndices.push_back( auto indexVal = caf.getValue();
top.create<ConstantIndexOp>(loc, caf.getValue())->getResult()); if (indexVal == 0) {
memIndices.push_back(zeroIndex);
} else {
memIndices.push_back(
top.create<ConstantIndexOp>(loc, caf.getValue())->getResult());
}
} else { } else {
// The coordinate for the start location is just the lower bound along the // The coordinate for the start location is just the lower bound along the
// corresponding dimension on the memory region (stored in 'offset'). // corresponding dimension on the memory region (stored in 'offset').
@ -349,7 +352,7 @@ void DmaGeneration::runOnForInst(ForInst *forInst) {
// DMAs will be generated for this depth, i.e., for all data accessed by this // DMAs will be generated for this depth, i.e., for all data accessed by this
// loop. // loop.
dmaDepth = getNestingDepth(*forInst); unsigned dmaDepth = getNestingDepth(*forInst);
regions.clear(); regions.clear();
fastBufferMap.clear(); fastBufferMap.clear();
@ -375,7 +378,7 @@ void DmaGeneration::runOnForInst(ForInst *forInst) {
// instead of O(num of load/store op's). // instead of O(num of load/store op's).
auto region = std::make_unique<MemRefRegion>(); auto region = std::make_unique<MemRefRegion>();
if (!getMemRefRegion(opInst, dmaDepth, region.get())) { if (!getMemRefRegion(opInst, dmaDepth, region.get())) {
LLVM_DEBUG(llvm::dbgs() << "Error obtaining memory region\n"); forInst->emitError("Error obtaining memory region: semi-affine maps?\n");
return; return;
} }
@ -393,14 +396,17 @@ void DmaGeneration::runOnForInst(ForInst *forInst) {
ret = ret | iRet; ret = ret | iRet;
} }
if (!ret) { if (!ret) {
LLVM_DEBUG(llvm::dbgs() forInst->emitError("DMA generation failed for one or more memref's\n");
<< "DMA generation failed for one or more memref's\n";); return;
} }
LLVM_DEBUG(llvm::dbgs() << Twine(llvm::divideCeil(totalSizeInBytes, 1024)) LLVM_DEBUG(llvm::dbgs() << Twine(llvm::divideCeil(totalSizeInBytes, 1024))
<< " KiB of DMA buffers in fast memory space\n";); << " KiB of DMA buffers in fast memory space\n";);
} }
PassResult DmaGeneration::runOnFunction(Function *f) { PassResult DmaGeneration::runOnFunction(Function *f) {
FuncBuilder topBuilder(f);
zeroIndex = topBuilder.create<ConstantIndexOp>(f->getLoc(), 0);
for (auto &block : *f) { for (auto &block : *f) {
for (auto &inst : block) { for (auto &inst : block) {
if (auto *forInst = dyn_cast<ForInst>(&inst)) { if (auto *forInst = dyn_cast<ForInst>(&inst)) {

View File

@ -1,4 +1,4 @@
// RUN: mlir-opt %s -dma-generate | FileCheck %s // RUN: mlir-opt %s -split-input-file -dma-generate -verify | FileCheck %s
// Index of the buffer for the second DMA is remapped. // Index of the buffer for the second DMA is remapped.
// CHECK-DAG: [[MAP:#map[0-9]+]] = (d0) -> (d0 - 256) // CHECK-DAG: [[MAP:#map[0-9]+]] = (d0) -> (d0 - 256)
@ -20,8 +20,8 @@ func @loop_nest_1d() {
// Tag for first DMA. // Tag for first DMA.
// CHECK: %4 = alloc() : memref<1xi32> // CHECK: %4 = alloc() : memref<1xi32>
// First DMA transfer. // First DMA transfer.
// CHECK: dma_start %0[%c0_2], %3[%c0_1], %c256_3, %4[%c0_1] : memref<256xf32>, memref<256xf32, 1>, memref<1xi32> // CHECK: dma_start %0[%c0], %3[%c0], %c256_1, %4[%c0] : memref<256xf32>, memref<256xf32, 1>, memref<1xi32>
// CHECK: dma_wait %4[%c0_1], %c256_3 : memref<1xi32> // CHECK: dma_wait %4[%c0], %c256_1 : memref<1xi32>
// Second DMA buffer. // Second DMA buffer.
// CHECK: %5 = alloc() : memref<256xf32, 1> // CHECK: %5 = alloc() : memref<256xf32, 1>
// Tag for second DMA. // Tag for second DMA.
@ -49,54 +49,54 @@ func @loop_nest_1d() {
} }
// CHECK-LABEL: func @loop_nest_high_d // CHECK-LABEL: func @loop_nest_high_d
// CHECK: %c16384 = constant 16384 : index // CHECK: %c16384 = constant 16384 : index
// CHECK: %0 = alloc() : memref<512x32xf32, 1> // CHECK-DAG: [[BUFB:%[0-9]+]] = alloc() : memref<512x32xf32, 1>
// CHECK: %1 = alloc() : memref<1xi32> // CHECK-DAG: [[BUFA:%[0-9]+]] = alloc() : memref<512x32xf32, 1>
// CHECK-DAG: [[BUFC:%[0-9]+]] = alloc() : memref<512x32xf32, 1>
// CHECK-DAG: [[TAGB:%[0-9]+]] = alloc() : memref<1xi32>
// CHECK-DAG: [[TAGA:%[0-9]+]] = alloc() : memref<1xi32>
// CHECK-DAG: [[TAGC:%[0-9]+]] = alloc() : memref<1xi32>
// CHECK-DAG: [[TAGC_W:%[0-9]+]] = alloc() : memref<1xi32>
// INCOMING DMA for B // INCOMING DMA for B
// CHECK: dma_start %arg1[%c0_11, %c0_12], %0[%c0_10, %c0_10], %c16384_13, %1[%c0_10] : memref<512x32xf32>, memref<512x32xf32, 1>, memref<1xi32> // CHECK-DAG: dma_start %arg1[%c0, %c0], [[BUFB]][%c0, %c0], %c16384_2, [[TAGB]][%c0] : memref<512x32xf32>, memref<512x32xf32, 1>, memref<1xi32>
// CHECK-NEXT: dma_wait %1[%c0_10], %c16384_13 : memref<1xi32> // CHECK-DAG: dma_wait [[TAGB]][%c0], %c16384_2 : memref<1xi32>
// CHECK-NEXT: %2 = alloc() : memref<512x32xf32, 1>
// CHECK-NEXT: %3 = alloc() : memref<1xi32>
// INCOMING DMA for A. // INCOMING DMA for A.
// CHECK-NEXT: dma_start %arg0[%c0_7, %c0_8], %2[%c0_6, %c0_6], %c16384_9, %3[%c0_6] : memref<512x32xf32>, memref<512x32xf32, 1>, memref<1xi32> // CHECK-DAG: dma_start %arg0[%c0, %c0], [[BUFA]][%c0, %c0], %c16384_1, [[TAGA]][%c0] : memref<512x32xf32>, memref<512x32xf32, 1>, memref<1xi32>
// CHECK-NEXT: dma_wait %3[%c0_6], %c16384_9 : memref<1xi32> // CHECK-DAG: dma_wait [[TAGA]][%c0], %c16384_1 : memref<1xi32>
// CHECK-NEXT: %4 = alloc() : memref<512x32xf32, 1>
// CHECK-NEXT: %5 = alloc() : memref<1xi32>
// INCOMING DMA for C. // INCOMING DMA for C.
// CHECK-NEXT: dma_start %arg2[%c0_3, %c0_4], %4[%c0_2, %c0_2], %c16384_5, %5[%c0_2] : memref<512x32xf32>, memref<512x32xf32, 1>, memref<1xi32> // CHECK-DAG: dma_start %arg2[%c0, %c0], [[BUFC]][%c0, %c0], %c16384_0, [[TAGC]][%c0] : memref<512x32xf32>, memref<512x32xf32, 1>, memref<1xi32>
// CHECK-NEXT: dma_wait %5[%c0_2], %c16384_5 : memref<1xi32> // CHECK-DAG: dma_wait [[TAGC]][%c0], %c16384_0 : memref<1xi32>
// CHECK-NEXT: %6 = alloc() : memref<1xi32>
// CHECK-NEXT: for %i0 = 0 to 32 { // CHECK-NEXT: for %i0 = 0 to 32 {
// CHECK-NEXT: for %i1 = 0 to 32 { // CHECK-NEXT: for %i1 = 0 to 32 {
// CHECK-NEXT: for %i2 = 0 to 32 { // CHECK-NEXT: for %i2 = 0 to 32 {
// CHECK-NEXT: for %i3 = 0 to 16 { // CHECK-NEXT: for %i3 = 0 to 16 {
// CHECK-NEXT: %7 = affine_apply #map{{[0-9]+}}(%i1, %i3) // CHECK-NEXT: %7 = affine_apply #map{{[0-9]+}}(%i1, %i3)
// CHECK-NEXT: %8 = affine_apply #map{{[0-9]+}}(%7, %i0) // CHECK-NEXT: %8 = affine_apply #map{{[0-9]+}}(%7, %i0)
// CHECK-NEXT: %9 = load %0[%8#0, %8#1] : memref<512x32xf32, 1> // CHECK-NEXT: %9 = load [[BUFB]][%8#0, %8#1] : memref<512x32xf32, 1>
// CHECK-NEXT: "foo"(%9) : (f32) -> () // CHECK-NEXT: "foo"(%9) : (f32) -> ()
// CHECK-NEXT: } // CHECK-NEXT: }
// CHECK-NEXT: for %i4 = 0 to 16 { // CHECK-NEXT: for %i4 = 0 to 16 {
// CHECK-NEXT: %10 = affine_apply #map{{[0-9]+}}(%i2, %i4) // CHECK-NEXT: %10 = affine_apply #map{{[0-9]+}}(%i2, %i4)
// CHECK-NEXT: %11 = affine_apply #map{{[0-9]+}}(%10, %i1) // CHECK-NEXT: %11 = affine_apply #map{{[0-9]+}}(%10, %i1)
// CHECK-NEXT: %12 = load %2[%11#0, %11#1] : memref<512x32xf32, 1> // CHECK-NEXT: %12 = load [[BUFA]][%11#0, %11#1] : memref<512x32xf32, 1>
// CHECK-NEXT: "bar"(%12) {mxu_id: 0} : (f32) -> () // CHECK-NEXT: "bar"(%12) : (f32) -> ()
// CHECK-NEXT: } // CHECK-NEXT: }
// CHECK-NEXT: for %i5 = 0 to 16 { // CHECK-NEXT: for %i5 = 0 to 16 {
// CHECK-NEXT: %13 = "abc_compute"() : () -> f32 // CHECK-NEXT: %13 = "abc_compute"() : () -> f32
// CHECK-NEXT: %14 = affine_apply #map{{[0-9]+}}(%i2, %i5) // CHECK-NEXT: %14 = affine_apply #map{{[0-9]+}}(%i2, %i5)
// CHECK-NEXT: %15 = affine_apply #map{{[0-9]+}}(%14, %i0) // CHECK-NEXT: %15 = affine_apply #map{{[0-9]+}}(%14, %i0)
// CHECK-NEXT: %16 = load %4[%15#0, %15#1] : memref<512x32xf32, 1> // CHECK-NEXT: %16 = load [[BUFC]][%15#0, %15#1] : memref<512x32xf32, 1>
// CHECK-NEXT: %17 = "addf32"(%13, %16) : (f32, f32) -> f32 // CHECK-NEXT: %17 = "addf32"(%13, %16) : (f32, f32) -> f32
// CHECK-NEXT: %18 = affine_apply #map{{[0-9]+}}(%14, %i0) // CHECK-NEXT: %18 = affine_apply #map{{[0-9]+}}(%14, %i0)
// CHECK-NEXT: store %17, %4[%18#0, %18#1] : memref<512x32xf32, 1> // CHECK-NEXT: store %17, [[BUFC]][%18#0, %18#1] : memref<512x32xf32, 1>
// CHECK-NEXT: } // CHECK-NEXT: }
// CHECK-NEXT: "foobar"() : () -> () // CHECK-NEXT: "foobar"() : () -> ()
// CHECK-NEXT: } // CHECK-NEXT: }
// CHECK-NEXT: } // CHECK-NEXT: }
// CHECK-NEXT: } // CHECK-NEXT: }
// OUTGOING DMA for C. // OUTGOING DMA for C.
// CHECK-NEXT: dma_start %4[%c0, %c0], %arg2[%c0_0, %c0_1], %c16384, %6[%c0] : memref<512x32xf32, 1>, memref<512x32xf32>, memref<1xi32> // CHECK-NEXT: dma_start [[BUFC]][%c0, %c0], %arg2[%c0, %c0], %c16384, [[TAGC_W]][%c0] : memref<512x32xf32, 1>, memref<512x32xf32>, memref<1xi32>
// CHECK-NEXT: dma_wait %6[%c0], %c16384 : memref<1xi32> // CHECK-NEXT: dma_wait [[TAGC_W]][%c0], %c16384 : memref<1xi32>
// CHECK-NEXT: return // CHECK-NEXT: return
// CHECK-NEXT:} // CHECK-NEXT:}
func @loop_nest_high_d(%A: memref<512 x 32 x f32>, func @loop_nest_high_d(%A: memref<512 x 32 x f32>,
@ -115,7 +115,7 @@ func @loop_nest_high_d(%A: memref<512 x 32 x f32>,
for %ii = 0 to 16 { // i intratile. for %ii = 0 to 16 { // i intratile.
%i = affine_apply (d0, d1) -> (16*d0 + d1)(%iT, %ii) %i = affine_apply (d0, d1) -> (16*d0 + d1)(%iT, %ii)
%v1 = load %A[%i, %kT] : memref<512 x 32 x f32> %v1 = load %A[%i, %kT] : memref<512 x 32 x f32>
"bar"(%v1) {mxu_id: 0} : (f32) -> () "bar"(%v1) : (f32) -> ()
} }
for %ii_ = 0 to 16 { // i intratile. for %ii_ = 0 to 16 { // i intratile.
%v2 = "abc_compute"() : () -> f32 %v2 = "abc_compute"() : () -> f32
@ -140,7 +140,7 @@ func @loop_nest_high_d(%A: memref<512 x 32 x f32>,
// CHECK-NEXT: %1 = affine_apply #map{{[0-9]+}}(%i0) // CHECK-NEXT: %1 = affine_apply #map{{[0-9]+}}(%i0)
// CHECK-NEXT: %2 = alloc() : memref<1x2xf32, 1> // CHECK-NEXT: %2 = alloc() : memref<1x2xf32, 1>
// CHECK-NEXT: %3 = alloc() : memref<1xi32> // CHECK-NEXT: %3 = alloc() : memref<1xi32>
// CHECK-NEXT: dma_start %0[%1, %c0_0], %2[%c0, %c0], %c2, %3[%c0] : memref<256x8xf32>, memref<1x2xf32, 1>, memref<1xi32> // CHECK-NEXT: dma_start %0[%1, %c0], %2[%c0, %c0], %c2, %3[%c0] : memref<256x8xf32>, memref<1x2xf32, 1>, memref<1xi32>
// CHECK-NEXT: dma_wait %3[%c0], %c2 : memref<1xi32> // CHECK-NEXT: dma_wait %3[%c0], %c2 : memref<1xi32>
// CHECK-NEXT: for %i1 = 0 to 8 { // CHECK-NEXT: for %i1 = 0 to 8 {
// ... // ...
@ -195,11 +195,11 @@ func @dma_constant_dim_access(%A : memref<100x100xf32>) {
// CHECK: %0 = alloc() : memref<1x100xf32, 1> // CHECK: %0 = alloc() : memref<1x100xf32, 1>
// CHECK-NEXT: %1 = alloc() : memref<1xi32> // CHECK-NEXT: %1 = alloc() : memref<1xi32>
// No strided DMA needed here. // No strided DMA needed here.
// CHECK: dma_start %arg0[%c1, %c0_0], %0[%c0, %c0], %c100, %1[%c0] : memref<100x100xf32>, memref<1x100xf32, 1>, // CHECK: dma_start %arg0[%c1, %c0], %0[%c0, %c0], %c100, %1[%c0] : memref<100x100xf32>, memref<1x100xf32, 1>,
// CHECK-NEXT: dma_wait %1[%c0], %c100 : memref<1xi32> // CHECK-NEXT: dma_wait %1[%c0], %c100 : memref<1xi32>
for %i = 0 to 100 { for %i = 0 to 100 {
for %j = 0 to ()[s0] -> (s0) ()[%N] { for %j = 0 to ()[s0] -> (s0) ()[%N] {
// CHECK: %2 = affine_apply [[MAP_MINUS_ONE]](%c1_1, %i1) // CHECK: %2 = affine_apply [[MAP_MINUS_ONE]](%c1_0, %i1)
// CHECK-NEXT: %3 = load %0[%2#0, %2#1] : memref<1x100xf32, 1> // CHECK-NEXT: %3 = load %0[%2#0, %2#1] : memref<1x100xf32, 1>
load %A[%one, %j] : memref<100 x 100 x f32> load %A[%one, %j] : memref<100 x 100 x f32>
} }
@ -219,7 +219,7 @@ func @dma_with_symbolic_accesses(%A : memref<100x100xf32>, %M : index) {
return return
// CHECK: %1 = alloc() : memref<100x100xf32, 1> // CHECK: %1 = alloc() : memref<100x100xf32, 1>
// CHECK-NEXT: %2 = alloc() : memref<1xi32> // CHECK-NEXT: %2 = alloc() : memref<1xi32>
// CHECK-NEXT: dma_start %arg0[%c0_0, %0], %1[%c0, %c0], %c10000, %2[%c0] // CHECK-NEXT: dma_start %arg0[%c0, %0], %1[%c0, %c0], %c10000, %2[%c0]
// CHECK-NEXT: dma_wait %2[%c0], %c10000 // CHECK-NEXT: dma_wait %2[%c0], %c10000
// CHECK-NEXT: for %i0 = 0 to 100 { // CHECK-NEXT: for %i0 = 0 to 100 {
// CHECK-NEXT: for %i1 = 0 to 100 { // CHECK-NEXT: for %i1 = 0 to 100 {
@ -238,7 +238,7 @@ func @dma_with_symbolic_loop_bounds(%A : memref<100x100xf32>, %M : index, %N: in
// memref size; so the DMA buffer is the entire 100x100. // memref size; so the DMA buffer is the entire 100x100.
// CHECK: %0 = alloc() : memref<100x100xf32, 1> // CHECK: %0 = alloc() : memref<100x100xf32, 1>
// CHECK-NEXT: %1 = alloc() : memref<1xi32> // CHECK-NEXT: %1 = alloc() : memref<1xi32>
// CHECK-NEXT: dma_start %arg0[%c0_0, %c0_1], %0[%c0, %c0], %c10000, %1[%c0] : memref<100x100xf32>, memref<100x100xf32, 1>, memref<1xi32> // CHECK-NEXT: dma_start %arg0[%c0, %c0], %0[%c0, %c0], %c10000, %1[%c0] : memref<100x100xf32>, memref<100x100xf32, 1>, memref<1xi32>
// CHECK-NEXT: dma_wait %1[%c0], %c10000 : memref<1xi32> // CHECK-NEXT: dma_wait %1[%c0], %c10000 : memref<1xi32>
for %i = 0 to 100 { for %i = 0 to 100 {
for %j = %M to %N { for %j = %M to %N {
@ -249,21 +249,26 @@ func @dma_with_symbolic_loop_bounds(%A : memref<100x100xf32>, %M : index, %N: in
return return
} }
// -----
// CHECK-LABEL: func @dma_unknown_size // CHECK-LABEL: func @dma_unknown_size
func @dma_unknown_size(%arg0: memref<?x?xf32>) { func @dma_unknown_size(%arg0: memref<?x?xf32>) {
%M = dim %arg0, 0 : memref<? x ? x f32> %M = dim %arg0, 0 : memref<? x ? x f32>
%N = dim %arg0, 0 : memref<? x ? x f32> %N = dim %arg0, 0 : memref<? x ? x f32>
for %i = 0 to %M { for %i = 0 to %M {
for %j = 0 to %N { for %j = 0 to %N {
// If this loop nest isn't tiled, requires a non-constant size DMA -- not // If this loop nest isn't tiled, the access requires a non-constant DMA
// yet implemented. // size -- not yet implemented.
// CHECK: %2 = load %arg0[%i0, %i1] : memref<?x?xf32> // CHECK: %2 = load %arg0[%i0, %i1] : memref<?x?xf32>
load %arg0[%i, %j] : memref<? x ? x f32> load %arg0[%i, %j] : memref<? x ? x f32>
// expected-error@-6 {{DMA generation failed for one or more memref's}}
} }
} }
return return
} }
// -----
// CHECK-LABEL: func @dma_memref_3d // CHECK-LABEL: func @dma_memref_3d
func @dma_memref_3d(%arg0: memref<1024x1024x1024xf32>) { func @dma_memref_3d(%arg0: memref<1024x1024x1024xf32>) {
for %i = 0 to 1024 { for %i = 0 to 1024 {
@ -274,6 +279,7 @@ func @dma_memref_3d(%arg0: memref<1024x1024x1024xf32>) {
// not yet implemented. // not yet implemented.
// CHECK: %3 = load %arg0[%2#0, %2#1, %2#2] : memref<1024x1024x1024xf32> // CHECK: %3 = load %arg0[%2#0, %2#1, %2#2] : memref<1024x1024x1024xf32>
%v = load %arg0[%idx#0, %idx#1, %idx#2] : memref<1024 x 1024 x 1024 x f32> %v = load %arg0[%idx#0, %idx#1, %idx#2] : memref<1024 x 1024 x 1024 x f32>
// expected-error@-8 {{DMA generation failed for one or more memref's}}
} }
} }
} }