llvm-project/mlir/test/Transforms/pipeline-data-transfer.mlir

227 lines
8.8 KiB
MLIR
Raw Normal View History

Introduce loop body skewing / loop pipelining / loop shifting utility. - loopBodySkew shifts statements of a loop body by stmt-wise delays, and is typically meant to be used to: - allow overlap of non-blocking start/wait until completion operations with other computation - allow shifting of statements (for better register reuse/locality/parallelism) - software pipelining (when applied to the innermost loop) - an additional argument specifies whether to unroll the prologue and epilogue. - add method to check SSA dominance preservation. - add a fake loop pipeline pass to test this utility. Sample input/output are below. While on this, fix/add following: - fix minor bug in getAddMulPureAffineExpr - add additional builder methods for common affine map cases - fix const_operand_iterator's for ForStmt, etc. When there is no such thing as 'const MLValue', the iterator shouldn't be returning const MLValue's. Returning MLValue is const correct. Sample input/output examples: 1) Simplest case: shift second statement by one. Input: for %i = 0 to 7 { %y = "foo"(%i) : (affineint) -> affineint %x = "bar"(%i) : (affineint) -> affineint } Output: #map0 = (d0) -> (d0 - 1) mlfunc @loop_nest_simple1() { %c8 = constant 8 : affineint %c0 = constant 0 : affineint %0 = "foo"(%c0) : (affineint) -> affineint for %i0 = 1 to 7 { %1 = "foo"(%i0) : (affineint) -> affineint %2 = affine_apply #map0(%i0) %3 = "bar"(%2) : (affineint) -> affineint } %4 = affine_apply #map0(%c8) %5 = "bar"(%4) : (affineint) -> affineint return } 2) DMA overlap: shift dma.wait and compute by one. Input for %i = 0 to 7 { %pingpong = affine_apply (d0) -> (d0 mod 2) (%i) "dma.enqueue"(%pingpong) : (affineint) -> affineint %pongping = affine_apply (d0) -> (d0 mod 2) (%i) "dma.wait"(%pongping) : (affineint) -> affineint "compute1"(%pongping) : (affineint) -> affineint } Output #map0 = (d0) -> (d0 mod 2) #map1 = (d0) -> (d0 - 1) #map2 = ()[s0] -> (s0 + 7) mlfunc @loop_nest_dma() { %c8 = constant 8 : affineint %c0 = constant 0 : affineint %0 = affine_apply #map0(%c0) %1 = "dma.enqueue"(%0) : (affineint) -> affineint for %i0 = 1 to 7 { %2 = affine_apply #map0(%i0) %3 = "dma.enqueue"(%2) : (affineint) -> affineint %4 = affine_apply #map1(%i0) %5 = affine_apply #map0(%4) %6 = "dma.wait"(%5) : (affineint) -> affineint %7 = "compute1"(%5) : (affineint) -> affineint } %8 = affine_apply #map1(%c8) %9 = affine_apply #map0(%8) %10 = "dma.wait"(%9) : (affineint) -> affineint %11 = "compute1"(%9) : (affineint) -> affineint return } 3) With arbitrary affine bound maps: Shift last two statements by two. Input: for %i = %N to ()[s0] -> (s0 + 7)()[%N] { %y = "foo"(%i) : (affineint) -> affineint %x = "bar"(%i) : (affineint) -> affineint %z = "foo_bar"(%i) : (affineint) -> (affineint) "bar_foo"(%i) : (affineint) -> (affineint) } Output #map0 = ()[s0] -> (s0 + 1) #map1 = ()[s0] -> (s0 + 2) #map2 = ()[s0] -> (s0 + 7) #map3 = (d0) -> (d0 - 2) #map4 = ()[s0] -> (s0 + 8) #map5 = ()[s0] -> (s0 + 9) for %i0 = %arg0 to #map0()[%arg0] { %0 = "foo"(%i0) : (affineint) -> affineint %1 = "bar"(%i0) : (affineint) -> affineint } for %i1 = #map1()[%arg0] to #map2()[%arg0] { %2 = "foo"(%i1) : (affineint) -> affineint %3 = "bar"(%i1) : (affineint) -> affineint %4 = affine_apply #map3(%i1) %5 = "foo_bar"(%4) : (affineint) -> affineint %6 = "bar_foo"(%4) : (affineint) -> affineint } for %i2 = #map4()[%arg0] to #map5()[%arg0] { %7 = affine_apply #map3(%i2) %8 = "foo_bar"(%7) : (affineint) -> affineint %9 = "bar_foo"(%7) : (affineint) -> affineint } 4) Shift one by zero, second by one, third by two for %i = 0 to 7 { %y = "foo"(%i) : (affineint) -> affineint %x = "bar"(%i) : (affineint) -> affineint %z = "foobar"(%i) : (affineint) -> affineint } #map0 = (d0) -> (d0 - 1) #map1 = (d0) -> (d0 - 2) #map2 = ()[s0] -> (s0 + 7) %c9 = constant 9 : affineint %c8 = constant 8 : affineint %c1 = constant 1 : affineint %c0 = constant 0 : affineint %0 = "foo"(%c0) : (affineint) -> affineint %1 = "foo"(%c1) : (affineint) -> affineint %2 = affine_apply #map0(%c1) %3 = "bar"(%2) : (affineint) -> affineint for %i0 = 2 to 7 { %4 = "foo"(%i0) : (affineint) -> affineint %5 = affine_apply #map0(%i0) %6 = "bar"(%5) : (affineint) -> affineint %7 = affine_apply #map1(%i0) %8 = "foobar"(%7) : (affineint) -> affineint } %9 = affine_apply #map0(%c8) %10 = "bar"(%9) : (affineint) -> affineint %11 = affine_apply #map1(%c8) %12 = "foobar"(%11) : (affineint) -> affineint %13 = affine_apply #map1(%c9) %14 = "foobar"(%13) : (affineint) -> affineint 5) SSA dominance violated; no shifting if a shift is specified for the second statement. for %i = 0 to 7 { %x = "foo"(%i) : (affineint) -> affineint "bar"(%x) : (affineint) -> affineint } PiperOrigin-RevId: 214975731
2018-09-29 03:17:26 +08:00
// RUN: mlir-opt %s -pipeline-data-transfer | FileCheck %s
// CHECK-LABEL: mlfunc @loop_nest_dma() {
mlfunc @loop_nest_dma() {
// CHECK: %0 = alloc() : memref<256xf32>
// CHECK: %1 = alloc() : memref<2x32xf32, 1>
// CHECK: %2 = alloc() : memref<2x1xf32>
// CHECK: dma_start %0[%c0], %1[%3#0, %c0], %c128, %2[%3#1, %c0_0] : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>
// CHECK-NEXT: for %i0 = 1 to 8 {
// CHECK-NEXT: %4 = affine_apply #map0(%i0)
// CHECK-NEXT: dma_start %0[%i0], %1[%4#0, %i0], %c128, %2[%4#1, %c0_0] : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>
// CHECK-NEXT: %5 = affine_apply #map1(%i0)
// CHECK-NEXT: %6 = affine_apply #map2(%5)
// CHECK-NEXT: %7 = affine_apply #map2(%5)
// CHECK-NEXT: dma_wait %2[%6, %c0_0], %c128 : memref<2x1xf32>
// CHECK-NEXT: %8 = load %1[%7, %5] : memref<2x32xf32, 1>
// CHECK-NEXT: %9 = "compute"(%8) : (f32) -> f32
// CHECK-NEXT: store %9, %1[%7, %5] : memref<2x32xf32, 1>
// CHECK-NEXT: for %i1 = 0 to 128 {
// CHECK-NEXT: "do_more_compute"(%5, %i1) : (index, index) -> ()
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: %10 = affine_apply #map1(%c8)
// CHECK-NEXT: %11 = affine_apply #map2(%10)
// CHECK-NEXT: %12 = affine_apply #map2(%10)
// CHECK-NEXT: dma_wait %2[%11, %c0_0], %c128 : memref<2x1xf32>
// CHECK-NEXT: %13 = load %1[%12, %10] : memref<2x32xf32, 1>
// CHECK-NEXT: %14 = "compute"(%13) : (f32) -> f32
// CHECK-NEXT: store %14, %1[%12, %10] : memref<2x32xf32, 1>
// CHECK-NEXT: for %i2 = 0 to 128 {
// CHECK-NEXT: "do_more_compute"(%10, %i2) : (index, index) -> ()
// CHECK-NEXT: }
// CHECK-NEXT: return
Introduce loop body skewing / loop pipelining / loop shifting utility. - loopBodySkew shifts statements of a loop body by stmt-wise delays, and is typically meant to be used to: - allow overlap of non-blocking start/wait until completion operations with other computation - allow shifting of statements (for better register reuse/locality/parallelism) - software pipelining (when applied to the innermost loop) - an additional argument specifies whether to unroll the prologue and epilogue. - add method to check SSA dominance preservation. - add a fake loop pipeline pass to test this utility. Sample input/output are below. While on this, fix/add following: - fix minor bug in getAddMulPureAffineExpr - add additional builder methods for common affine map cases - fix const_operand_iterator's for ForStmt, etc. When there is no such thing as 'const MLValue', the iterator shouldn't be returning const MLValue's. Returning MLValue is const correct. Sample input/output examples: 1) Simplest case: shift second statement by one. Input: for %i = 0 to 7 { %y = "foo"(%i) : (affineint) -> affineint %x = "bar"(%i) : (affineint) -> affineint } Output: #map0 = (d0) -> (d0 - 1) mlfunc @loop_nest_simple1() { %c8 = constant 8 : affineint %c0 = constant 0 : affineint %0 = "foo"(%c0) : (affineint) -> affineint for %i0 = 1 to 7 { %1 = "foo"(%i0) : (affineint) -> affineint %2 = affine_apply #map0(%i0) %3 = "bar"(%2) : (affineint) -> affineint } %4 = affine_apply #map0(%c8) %5 = "bar"(%4) : (affineint) -> affineint return } 2) DMA overlap: shift dma.wait and compute by one. Input for %i = 0 to 7 { %pingpong = affine_apply (d0) -> (d0 mod 2) (%i) "dma.enqueue"(%pingpong) : (affineint) -> affineint %pongping = affine_apply (d0) -> (d0 mod 2) (%i) "dma.wait"(%pongping) : (affineint) -> affineint "compute1"(%pongping) : (affineint) -> affineint } Output #map0 = (d0) -> (d0 mod 2) #map1 = (d0) -> (d0 - 1) #map2 = ()[s0] -> (s0 + 7) mlfunc @loop_nest_dma() { %c8 = constant 8 : affineint %c0 = constant 0 : affineint %0 = affine_apply #map0(%c0) %1 = "dma.enqueue"(%0) : (affineint) -> affineint for %i0 = 1 to 7 { %2 = affine_apply #map0(%i0) %3 = "dma.enqueue"(%2) : (affineint) -> affineint %4 = affine_apply #map1(%i0) %5 = affine_apply #map0(%4) %6 = "dma.wait"(%5) : (affineint) -> affineint %7 = "compute1"(%5) : (affineint) -> affineint } %8 = affine_apply #map1(%c8) %9 = affine_apply #map0(%8) %10 = "dma.wait"(%9) : (affineint) -> affineint %11 = "compute1"(%9) : (affineint) -> affineint return } 3) With arbitrary affine bound maps: Shift last two statements by two. Input: for %i = %N to ()[s0] -> (s0 + 7)()[%N] { %y = "foo"(%i) : (affineint) -> affineint %x = "bar"(%i) : (affineint) -> affineint %z = "foo_bar"(%i) : (affineint) -> (affineint) "bar_foo"(%i) : (affineint) -> (affineint) } Output #map0 = ()[s0] -> (s0 + 1) #map1 = ()[s0] -> (s0 + 2) #map2 = ()[s0] -> (s0 + 7) #map3 = (d0) -> (d0 - 2) #map4 = ()[s0] -> (s0 + 8) #map5 = ()[s0] -> (s0 + 9) for %i0 = %arg0 to #map0()[%arg0] { %0 = "foo"(%i0) : (affineint) -> affineint %1 = "bar"(%i0) : (affineint) -> affineint } for %i1 = #map1()[%arg0] to #map2()[%arg0] { %2 = "foo"(%i1) : (affineint) -> affineint %3 = "bar"(%i1) : (affineint) -> affineint %4 = affine_apply #map3(%i1) %5 = "foo_bar"(%4) : (affineint) -> affineint %6 = "bar_foo"(%4) : (affineint) -> affineint } for %i2 = #map4()[%arg0] to #map5()[%arg0] { %7 = affine_apply #map3(%i2) %8 = "foo_bar"(%7) : (affineint) -> affineint %9 = "bar_foo"(%7) : (affineint) -> affineint } 4) Shift one by zero, second by one, third by two for %i = 0 to 7 { %y = "foo"(%i) : (affineint) -> affineint %x = "bar"(%i) : (affineint) -> affineint %z = "foobar"(%i) : (affineint) -> affineint } #map0 = (d0) -> (d0 - 1) #map1 = (d0) -> (d0 - 2) #map2 = ()[s0] -> (s0 + 7) %c9 = constant 9 : affineint %c8 = constant 8 : affineint %c1 = constant 1 : affineint %c0 = constant 0 : affineint %0 = "foo"(%c0) : (affineint) -> affineint %1 = "foo"(%c1) : (affineint) -> affineint %2 = affine_apply #map0(%c1) %3 = "bar"(%2) : (affineint) -> affineint for %i0 = 2 to 7 { %4 = "foo"(%i0) : (affineint) -> affineint %5 = affine_apply #map0(%i0) %6 = "bar"(%5) : (affineint) -> affineint %7 = affine_apply #map1(%i0) %8 = "foobar"(%7) : (affineint) -> affineint } %9 = affine_apply #map0(%c8) %10 = "bar"(%9) : (affineint) -> affineint %11 = affine_apply #map1(%c8) %12 = "foobar"(%11) : (affineint) -> affineint %13 = affine_apply #map1(%c9) %14 = "foobar"(%13) : (affineint) -> affineint 5) SSA dominance violated; no shifting if a shift is specified for the second statement. for %i = 0 to 7 { %x = "foo"(%i) : (affineint) -> affineint "bar"(%x) : (affineint) -> affineint } PiperOrigin-RevId: 214975731
2018-09-29 03:17:26 +08:00
%A = alloc() : memref<256 x f32, (d0) -> (d0), 0>
%Ah = alloc() : memref<32 x f32, (d0) -> (d0), 1>
Introduce loop body skewing / loop pipelining / loop shifting utility. - loopBodySkew shifts statements of a loop body by stmt-wise delays, and is typically meant to be used to: - allow overlap of non-blocking start/wait until completion operations with other computation - allow shifting of statements (for better register reuse/locality/parallelism) - software pipelining (when applied to the innermost loop) - an additional argument specifies whether to unroll the prologue and epilogue. - add method to check SSA dominance preservation. - add a fake loop pipeline pass to test this utility. Sample input/output are below. While on this, fix/add following: - fix minor bug in getAddMulPureAffineExpr - add additional builder methods for common affine map cases - fix const_operand_iterator's for ForStmt, etc. When there is no such thing as 'const MLValue', the iterator shouldn't be returning const MLValue's. Returning MLValue is const correct. Sample input/output examples: 1) Simplest case: shift second statement by one. Input: for %i = 0 to 7 { %y = "foo"(%i) : (affineint) -> affineint %x = "bar"(%i) : (affineint) -> affineint } Output: #map0 = (d0) -> (d0 - 1) mlfunc @loop_nest_simple1() { %c8 = constant 8 : affineint %c0 = constant 0 : affineint %0 = "foo"(%c0) : (affineint) -> affineint for %i0 = 1 to 7 { %1 = "foo"(%i0) : (affineint) -> affineint %2 = affine_apply #map0(%i0) %3 = "bar"(%2) : (affineint) -> affineint } %4 = affine_apply #map0(%c8) %5 = "bar"(%4) : (affineint) -> affineint return } 2) DMA overlap: shift dma.wait and compute by one. Input for %i = 0 to 7 { %pingpong = affine_apply (d0) -> (d0 mod 2) (%i) "dma.enqueue"(%pingpong) : (affineint) -> affineint %pongping = affine_apply (d0) -> (d0 mod 2) (%i) "dma.wait"(%pongping) : (affineint) -> affineint "compute1"(%pongping) : (affineint) -> affineint } Output #map0 = (d0) -> (d0 mod 2) #map1 = (d0) -> (d0 - 1) #map2 = ()[s0] -> (s0 + 7) mlfunc @loop_nest_dma() { %c8 = constant 8 : affineint %c0 = constant 0 : affineint %0 = affine_apply #map0(%c0) %1 = "dma.enqueue"(%0) : (affineint) -> affineint for %i0 = 1 to 7 { %2 = affine_apply #map0(%i0) %3 = "dma.enqueue"(%2) : (affineint) -> affineint %4 = affine_apply #map1(%i0) %5 = affine_apply #map0(%4) %6 = "dma.wait"(%5) : (affineint) -> affineint %7 = "compute1"(%5) : (affineint) -> affineint } %8 = affine_apply #map1(%c8) %9 = affine_apply #map0(%8) %10 = "dma.wait"(%9) : (affineint) -> affineint %11 = "compute1"(%9) : (affineint) -> affineint return } 3) With arbitrary affine bound maps: Shift last two statements by two. Input: for %i = %N to ()[s0] -> (s0 + 7)()[%N] { %y = "foo"(%i) : (affineint) -> affineint %x = "bar"(%i) : (affineint) -> affineint %z = "foo_bar"(%i) : (affineint) -> (affineint) "bar_foo"(%i) : (affineint) -> (affineint) } Output #map0 = ()[s0] -> (s0 + 1) #map1 = ()[s0] -> (s0 + 2) #map2 = ()[s0] -> (s0 + 7) #map3 = (d0) -> (d0 - 2) #map4 = ()[s0] -> (s0 + 8) #map5 = ()[s0] -> (s0 + 9) for %i0 = %arg0 to #map0()[%arg0] { %0 = "foo"(%i0) : (affineint) -> affineint %1 = "bar"(%i0) : (affineint) -> affineint } for %i1 = #map1()[%arg0] to #map2()[%arg0] { %2 = "foo"(%i1) : (affineint) -> affineint %3 = "bar"(%i1) : (affineint) -> affineint %4 = affine_apply #map3(%i1) %5 = "foo_bar"(%4) : (affineint) -> affineint %6 = "bar_foo"(%4) : (affineint) -> affineint } for %i2 = #map4()[%arg0] to #map5()[%arg0] { %7 = affine_apply #map3(%i2) %8 = "foo_bar"(%7) : (affineint) -> affineint %9 = "bar_foo"(%7) : (affineint) -> affineint } 4) Shift one by zero, second by one, third by two for %i = 0 to 7 { %y = "foo"(%i) : (affineint) -> affineint %x = "bar"(%i) : (affineint) -> affineint %z = "foobar"(%i) : (affineint) -> affineint } #map0 = (d0) -> (d0 - 1) #map1 = (d0) -> (d0 - 2) #map2 = ()[s0] -> (s0 + 7) %c9 = constant 9 : affineint %c8 = constant 8 : affineint %c1 = constant 1 : affineint %c0 = constant 0 : affineint %0 = "foo"(%c0) : (affineint) -> affineint %1 = "foo"(%c1) : (affineint) -> affineint %2 = affine_apply #map0(%c1) %3 = "bar"(%2) : (affineint) -> affineint for %i0 = 2 to 7 { %4 = "foo"(%i0) : (affineint) -> affineint %5 = affine_apply #map0(%i0) %6 = "bar"(%5) : (affineint) -> affineint %7 = affine_apply #map1(%i0) %8 = "foobar"(%7) : (affineint) -> affineint } %9 = affine_apply #map0(%c8) %10 = "bar"(%9) : (affineint) -> affineint %11 = affine_apply #map1(%c8) %12 = "foobar"(%11) : (affineint) -> affineint %13 = affine_apply #map1(%c9) %14 = "foobar"(%13) : (affineint) -> affineint 5) SSA dominance violated; no shifting if a shift is specified for the second statement. for %i = 0 to 7 { %x = "foo"(%i) : (affineint) -> affineint "bar"(%x) : (affineint) -> affineint } PiperOrigin-RevId: 214975731
2018-09-29 03:17:26 +08:00
%tag = alloc() : memref<1 x f32>
%zero = constant 0 : index
%num_elts = constant 128 : index
Introduce loop body skewing / loop pipelining / loop shifting utility. - loopBodySkew shifts statements of a loop body by stmt-wise delays, and is typically meant to be used to: - allow overlap of non-blocking start/wait until completion operations with other computation - allow shifting of statements (for better register reuse/locality/parallelism) - software pipelining (when applied to the innermost loop) - an additional argument specifies whether to unroll the prologue and epilogue. - add method to check SSA dominance preservation. - add a fake loop pipeline pass to test this utility. Sample input/output are below. While on this, fix/add following: - fix minor bug in getAddMulPureAffineExpr - add additional builder methods for common affine map cases - fix const_operand_iterator's for ForStmt, etc. When there is no such thing as 'const MLValue', the iterator shouldn't be returning const MLValue's. Returning MLValue is const correct. Sample input/output examples: 1) Simplest case: shift second statement by one. Input: for %i = 0 to 7 { %y = "foo"(%i) : (affineint) -> affineint %x = "bar"(%i) : (affineint) -> affineint } Output: #map0 = (d0) -> (d0 - 1) mlfunc @loop_nest_simple1() { %c8 = constant 8 : affineint %c0 = constant 0 : affineint %0 = "foo"(%c0) : (affineint) -> affineint for %i0 = 1 to 7 { %1 = "foo"(%i0) : (affineint) -> affineint %2 = affine_apply #map0(%i0) %3 = "bar"(%2) : (affineint) -> affineint } %4 = affine_apply #map0(%c8) %5 = "bar"(%4) : (affineint) -> affineint return } 2) DMA overlap: shift dma.wait and compute by one. Input for %i = 0 to 7 { %pingpong = affine_apply (d0) -> (d0 mod 2) (%i) "dma.enqueue"(%pingpong) : (affineint) -> affineint %pongping = affine_apply (d0) -> (d0 mod 2) (%i) "dma.wait"(%pongping) : (affineint) -> affineint "compute1"(%pongping) : (affineint) -> affineint } Output #map0 = (d0) -> (d0 mod 2) #map1 = (d0) -> (d0 - 1) #map2 = ()[s0] -> (s0 + 7) mlfunc @loop_nest_dma() { %c8 = constant 8 : affineint %c0 = constant 0 : affineint %0 = affine_apply #map0(%c0) %1 = "dma.enqueue"(%0) : (affineint) -> affineint for %i0 = 1 to 7 { %2 = affine_apply #map0(%i0) %3 = "dma.enqueue"(%2) : (affineint) -> affineint %4 = affine_apply #map1(%i0) %5 = affine_apply #map0(%4) %6 = "dma.wait"(%5) : (affineint) -> affineint %7 = "compute1"(%5) : (affineint) -> affineint } %8 = affine_apply #map1(%c8) %9 = affine_apply #map0(%8) %10 = "dma.wait"(%9) : (affineint) -> affineint %11 = "compute1"(%9) : (affineint) -> affineint return } 3) With arbitrary affine bound maps: Shift last two statements by two. Input: for %i = %N to ()[s0] -> (s0 + 7)()[%N] { %y = "foo"(%i) : (affineint) -> affineint %x = "bar"(%i) : (affineint) -> affineint %z = "foo_bar"(%i) : (affineint) -> (affineint) "bar_foo"(%i) : (affineint) -> (affineint) } Output #map0 = ()[s0] -> (s0 + 1) #map1 = ()[s0] -> (s0 + 2) #map2 = ()[s0] -> (s0 + 7) #map3 = (d0) -> (d0 - 2) #map4 = ()[s0] -> (s0 + 8) #map5 = ()[s0] -> (s0 + 9) for %i0 = %arg0 to #map0()[%arg0] { %0 = "foo"(%i0) : (affineint) -> affineint %1 = "bar"(%i0) : (affineint) -> affineint } for %i1 = #map1()[%arg0] to #map2()[%arg0] { %2 = "foo"(%i1) : (affineint) -> affineint %3 = "bar"(%i1) : (affineint) -> affineint %4 = affine_apply #map3(%i1) %5 = "foo_bar"(%4) : (affineint) -> affineint %6 = "bar_foo"(%4) : (affineint) -> affineint } for %i2 = #map4()[%arg0] to #map5()[%arg0] { %7 = affine_apply #map3(%i2) %8 = "foo_bar"(%7) : (affineint) -> affineint %9 = "bar_foo"(%7) : (affineint) -> affineint } 4) Shift one by zero, second by one, third by two for %i = 0 to 7 { %y = "foo"(%i) : (affineint) -> affineint %x = "bar"(%i) : (affineint) -> affineint %z = "foobar"(%i) : (affineint) -> affineint } #map0 = (d0) -> (d0 - 1) #map1 = (d0) -> (d0 - 2) #map2 = ()[s0] -> (s0 + 7) %c9 = constant 9 : affineint %c8 = constant 8 : affineint %c1 = constant 1 : affineint %c0 = constant 0 : affineint %0 = "foo"(%c0) : (affineint) -> affineint %1 = "foo"(%c1) : (affineint) -> affineint %2 = affine_apply #map0(%c1) %3 = "bar"(%2) : (affineint) -> affineint for %i0 = 2 to 7 { %4 = "foo"(%i0) : (affineint) -> affineint %5 = affine_apply #map0(%i0) %6 = "bar"(%5) : (affineint) -> affineint %7 = affine_apply #map1(%i0) %8 = "foobar"(%7) : (affineint) -> affineint } %9 = affine_apply #map0(%c8) %10 = "bar"(%9) : (affineint) -> affineint %11 = affine_apply #map1(%c8) %12 = "foobar"(%11) : (affineint) -> affineint %13 = affine_apply #map1(%c9) %14 = "foobar"(%13) : (affineint) -> affineint 5) SSA dominance violated; no shifting if a shift is specified for the second statement. for %i = 0 to 7 { %x = "foo"(%i) : (affineint) -> affineint "bar"(%x) : (affineint) -> affineint } PiperOrigin-RevId: 214975731
2018-09-29 03:17:26 +08:00
for %i = 0 to 8 {
dma_start %A[%i], %Ah[%i], %num_elts, %tag[%zero] : memref<256 x f32>, memref<32 x f32, 1>, memref<1 x f32>
dma_wait %tag[%zero], %num_elts : memref<1 x f32>
%v = load %Ah[%i] : memref<32 x f32, (d0) -> (d0), 1>
%r = "compute"(%v) : (f32) -> (f32)
store %r, %Ah[%i] : memref<32 x f32, (d0) -> (d0), 1>
for %j = 0 to 128 {
"do_more_compute"(%i, %j) : (index, index) -> ()
}
Introduce loop body skewing / loop pipelining / loop shifting utility. - loopBodySkew shifts statements of a loop body by stmt-wise delays, and is typically meant to be used to: - allow overlap of non-blocking start/wait until completion operations with other computation - allow shifting of statements (for better register reuse/locality/parallelism) - software pipelining (when applied to the innermost loop) - an additional argument specifies whether to unroll the prologue and epilogue. - add method to check SSA dominance preservation. - add a fake loop pipeline pass to test this utility. Sample input/output are below. While on this, fix/add following: - fix minor bug in getAddMulPureAffineExpr - add additional builder methods for common affine map cases - fix const_operand_iterator's for ForStmt, etc. When there is no such thing as 'const MLValue', the iterator shouldn't be returning const MLValue's. Returning MLValue is const correct. Sample input/output examples: 1) Simplest case: shift second statement by one. Input: for %i = 0 to 7 { %y = "foo"(%i) : (affineint) -> affineint %x = "bar"(%i) : (affineint) -> affineint } Output: #map0 = (d0) -> (d0 - 1) mlfunc @loop_nest_simple1() { %c8 = constant 8 : affineint %c0 = constant 0 : affineint %0 = "foo"(%c0) : (affineint) -> affineint for %i0 = 1 to 7 { %1 = "foo"(%i0) : (affineint) -> affineint %2 = affine_apply #map0(%i0) %3 = "bar"(%2) : (affineint) -> affineint } %4 = affine_apply #map0(%c8) %5 = "bar"(%4) : (affineint) -> affineint return } 2) DMA overlap: shift dma.wait and compute by one. Input for %i = 0 to 7 { %pingpong = affine_apply (d0) -> (d0 mod 2) (%i) "dma.enqueue"(%pingpong) : (affineint) -> affineint %pongping = affine_apply (d0) -> (d0 mod 2) (%i) "dma.wait"(%pongping) : (affineint) -> affineint "compute1"(%pongping) : (affineint) -> affineint } Output #map0 = (d0) -> (d0 mod 2) #map1 = (d0) -> (d0 - 1) #map2 = ()[s0] -> (s0 + 7) mlfunc @loop_nest_dma() { %c8 = constant 8 : affineint %c0 = constant 0 : affineint %0 = affine_apply #map0(%c0) %1 = "dma.enqueue"(%0) : (affineint) -> affineint for %i0 = 1 to 7 { %2 = affine_apply #map0(%i0) %3 = "dma.enqueue"(%2) : (affineint) -> affineint %4 = affine_apply #map1(%i0) %5 = affine_apply #map0(%4) %6 = "dma.wait"(%5) : (affineint) -> affineint %7 = "compute1"(%5) : (affineint) -> affineint } %8 = affine_apply #map1(%c8) %9 = affine_apply #map0(%8) %10 = "dma.wait"(%9) : (affineint) -> affineint %11 = "compute1"(%9) : (affineint) -> affineint return } 3) With arbitrary affine bound maps: Shift last two statements by two. Input: for %i = %N to ()[s0] -> (s0 + 7)()[%N] { %y = "foo"(%i) : (affineint) -> affineint %x = "bar"(%i) : (affineint) -> affineint %z = "foo_bar"(%i) : (affineint) -> (affineint) "bar_foo"(%i) : (affineint) -> (affineint) } Output #map0 = ()[s0] -> (s0 + 1) #map1 = ()[s0] -> (s0 + 2) #map2 = ()[s0] -> (s0 + 7) #map3 = (d0) -> (d0 - 2) #map4 = ()[s0] -> (s0 + 8) #map5 = ()[s0] -> (s0 + 9) for %i0 = %arg0 to #map0()[%arg0] { %0 = "foo"(%i0) : (affineint) -> affineint %1 = "bar"(%i0) : (affineint) -> affineint } for %i1 = #map1()[%arg0] to #map2()[%arg0] { %2 = "foo"(%i1) : (affineint) -> affineint %3 = "bar"(%i1) : (affineint) -> affineint %4 = affine_apply #map3(%i1) %5 = "foo_bar"(%4) : (affineint) -> affineint %6 = "bar_foo"(%4) : (affineint) -> affineint } for %i2 = #map4()[%arg0] to #map5()[%arg0] { %7 = affine_apply #map3(%i2) %8 = "foo_bar"(%7) : (affineint) -> affineint %9 = "bar_foo"(%7) : (affineint) -> affineint } 4) Shift one by zero, second by one, third by two for %i = 0 to 7 { %y = "foo"(%i) : (affineint) -> affineint %x = "bar"(%i) : (affineint) -> affineint %z = "foobar"(%i) : (affineint) -> affineint } #map0 = (d0) -> (d0 - 1) #map1 = (d0) -> (d0 - 2) #map2 = ()[s0] -> (s0 + 7) %c9 = constant 9 : affineint %c8 = constant 8 : affineint %c1 = constant 1 : affineint %c0 = constant 0 : affineint %0 = "foo"(%c0) : (affineint) -> affineint %1 = "foo"(%c1) : (affineint) -> affineint %2 = affine_apply #map0(%c1) %3 = "bar"(%2) : (affineint) -> affineint for %i0 = 2 to 7 { %4 = "foo"(%i0) : (affineint) -> affineint %5 = affine_apply #map0(%i0) %6 = "bar"(%5) : (affineint) -> affineint %7 = affine_apply #map1(%i0) %8 = "foobar"(%7) : (affineint) -> affineint } %9 = affine_apply #map0(%c8) %10 = "bar"(%9) : (affineint) -> affineint %11 = affine_apply #map1(%c8) %12 = "foobar"(%11) : (affineint) -> affineint %13 = affine_apply #map1(%c9) %14 = "foobar"(%13) : (affineint) -> affineint 5) SSA dominance violated; no shifting if a shift is specified for the second statement. for %i = 0 to 7 { %x = "foo"(%i) : (affineint) -> affineint "bar"(%x) : (affineint) -> affineint } PiperOrigin-RevId: 214975731
2018-09-29 03:17:26 +08:00
}
return
}
#map0 = (d0, d1) -> (d0, d1)
#map1 = (d0, d1) -> ((d0 * 2048 + d1 * 256) floordiv 32, 0)
#map2 = (d0) -> ((d0 * 2048) floordiv 32, 0)
// CHECK: mlfunc @loop_dma_nested(%arg0 : memref<512x32xvector<8xf32>
mlfunc @loop_dma_nested(%arg0 : memref<512x32xvector<8xf32>, #map0>, %arg1 : memref<512x32xvector<8xf32>, #map0>, %arg2 : memref<512x32xvector<8xf32>, #map0>) {
%num_elts = constant 256 : index
%c0 = constant 0 : index
%0 = alloc() : memref<64x4xvector<8xf32>, #map0, 2>
%1 = alloc() : memref<64x4xvector<8xf32>, #map0, 2>
%2 = alloc() : memref<64x4xvector<8xf32>, #map0, 2>
%3 = alloc() : memref<2xi32>
%4 = alloc() : memref<2xi32>
%5 = alloc() : memref<2xi32>
// Prologue for DMA overlap on arg2.
// CHECK: dma_start %arg2[
// CHECK-NEXT: for %i0 = 1 to 8 {
for %i0 = 0 to 8 {
%6 = affine_apply #map2(%i0)
dma_start %arg2[%6#0, %6#1], %2[%c0, %c0], %num_elts, %5[%c0] : memref<512x32xvector<8xf32>, #map0>, memref<64x4xvector<8xf32>, #map0, 2>, memref<2xi32>
dma_wait %5[%c0], %num_elts : memref<2xi32>
// Steady state for DMA overlap on arg2
// CHECK: dma_start %arg2[
// CHECK: dma_wait %1[
// Prologue for DMA overlap on arg0, arg1 nested within i0
// CHECK: dma_start %arg0[
// CHECK: dma_start %arg1[
// CHECK-NEXT for %i1 = 1 to 8 {
for %i1 = 0 to 8 {
%7 = affine_apply #map1(%i0, %i1)
%8 = affine_apply #map2(%i1)
dma_start %arg0[%7#0, %7#1], %0[%c0, %c0], %num_elts, %3[%c0] : memref<512x32xvector<8xf32>, #map0>, memref<64x4xvector<8xf32>, #map0, 2>, memref<2xi32>
dma_start %arg1[%8#0, %8#1], %1[%c0, %c0], %num_elts, %4[%c0] : memref<512x32xvector<8xf32>, #map0>, memref<64x4xvector<8xf32>, #map0, 2>, memref<2xi32>
dma_wait %3[%c0], %num_elts : memref<2xi32>
dma_wait %4[%c0], %num_elts : memref<2xi32>
// Steady state for DMA overlap on arg0, arg1
// CHECK: dma_start %arg0[
// CHECK: dma_start %arg1[
// CHECK: dma_wait %10[
// CHECK: dma_wait %11[
// CHECK-NEXT: for %i2 = 0 to 4 {
for %i2 = 0 to 4 {
"foo"() : () -> ()
}
}
// epilogue for arg0, arg1
// CHECK: dma_wait %10[
// CHECK: dma_wait %11[
// epilogue for DMA overlap on %arg2
// CHECK: dma_wait %1[%31, %c0_2], %c256 : memref<2x2xi32>
// Within the epilogue for arg2's DMA, we have the DMAs on %arg1, %arg2 nested.
// CHECK: dma_start %arg0[
// CHECK: dma_start %arg1[
// CHECK: for %i4 = 1 to 8 {
// CHECK: dma_start %arg0[
// CHECK: dma_start %arg1[
// CHECK: dma_wait %36[
// CHECK: dma_wait %37[
// CHECK: for %i5 = 0 to 4 {
// CHECK: "foo"() : () -> ()
// CHECK: dma_wait %36[
// CHECK: dma_wait %37[
// CHECK: for %i6 = 0 to 4 {
} // CHECK: }
return // CHECK-NEXT: return
}
// CHECK: mlfunc @loop_dma_dependent
mlfunc @loop_dma_dependent(%arg2 : memref<512x32xvector<8xf32>>) {
%num_elts = constant 256 : index
%c0 = constant 0 : index
%0 = alloc() : memref<64x4xvector<8xf32>, 2>
%1 = alloc() : memref<64x4xvector<8xf32>, 2>
%2 = alloc() : memref<64x4xvector<8xf32>, 2>
%3 = alloc() : memref<2xi32>
%4 = alloc() : memref<2xi32>
%5 = alloc() : memref<2xi32>
// The two DMAs below are dependent (incoming and outgoing on the same
// memref) in the same iteration; so no pipelining here.
// CHECK-NOT: dma_start
// CHECK: for %i0 = 0 to 8 {
for %i0 = 0 to 8 {
%6 = affine_apply #map2(%i0)
dma_start %arg2[%6#0, %6#1], %2[%c0, %c0], %num_elts, %5[%c0] : memref<512x32xvector<8xf32>>, memref<64x4xvector<8xf32>, 2>, memref<2xi32>
dma_wait %5[%c0], %num_elts : memref<2xi32>
dma_start %2[%c0, %c0], %arg2[%6#0, %6#1], %num_elts, %5[%c0] : memref<64x4xvector<8xf32>, 2>, memref<512x32xvector<8xf32>>, memref<2xi32>
dma_wait %5[%c0], %num_elts : memref<2xi32>
} // CHECK: }
return // CHECK-NEXT: return
}
// CHECK-LABEL: mlfunc @escaping_use
mlfunc @escaping_use(%arg0: memref<512 x 32 x f32>) {
%c32 = constant 32 : index
%num_elt = constant 512 : index
%zero = constant 0 : index
%Av = alloc() : memref<32 x 32 x f32, 2>
%tag = alloc() : memref<1 x i32>
// CHECK-NOT: dma_start
// CHECK: for %i0 = 0 to 16 {
for %kTT = 0 to 16 {
dma_start %arg0[%zero, %zero], %Av[%zero, %zero], %num_elt, %tag[%zero] :
memref<512 x 32 x f32>,
memref<32 x 32 x f32, 2>, memref<1 x i32>
dma_wait %tag[%zero], %num_elt : memref<1 x i32>
// escaping use; no DMA pipelining / double buffering will be done.
"foo"(%Av) : (memref<32 x 32 x f32, 2>) -> ()
}
return
// CHECK: "foo"(%{{[0-9]+}}) : (memref<32x32xf32, 2>) -> ()
// CHECK: }
// CHECK-NEXT: return
}
// CHECK-LABEL: mlfunc @live_out_use
mlfunc @live_out_use(%arg0: memref<512 x 32 x f32>) -> f32 {
%c32 = constant 32 : index
%num_elt = constant 512 : index
%zero = constant 0 : index
%Av = alloc() : memref<32 x 32 x f32, 2>
%tag = alloc() : memref<1 x i32>
// CHECK-NOT: dma_start
// CHECK: for %i0 = 0 to 16 {
for %kTT = 0 to 16 {
dma_start %arg0[%zero, %zero], %Av[%zero, %zero], %num_elt, %tag[%zero] :
memref<512 x 32 x f32>,
memref<32 x 32 x f32, 2>, memref<1 x i32>
dma_wait %tag[%zero], %num_elt : memref<1 x i32>
}
// Use live out of 'for' stmt; no DMA pipelining will be done.
%v = load %Av[%zero, %zero] : memref<32 x 32 x f32, 2>
return %v : f32
// CHECK: %{{[0-9]+}} = load %{{[0-9]+}}[%c0, %c0] : memref<32x32xf32, 2>
// CHECK-NEXT: return
}
// CHECK-LABEL: mlfunc @dynamic_shape_dma_buffer
mlfunc @dynamic_shape_dma_buffer(%arg0: memref<512 x 32 x f32>) {
%c32 = constant 32 : index
%num_elt = constant 512 : index
%zero = constant 0 : index
%Av = alloc(%c32, %c32) : memref<? x ? x f32, 2>
%tag = alloc() : memref<1 x i32>
// Double buffering for dynamic shaped buffer.
// CHECK: %0 = alloc(%c32, %c32) : memref<?x?xf32, 2>
// CHECK-NEXT: %1 = dim %0, 0 : memref<?x?xf32, 2>
// CHECK-NEXT: %2 = dim %0, 1 : memref<?x?xf32, 2>
// CHECK-NEXT: %3 = alloc(%1, %2) : memref<2x?x?xf32, 2>
// CHECK: dma_start %arg0[%c0_0, %c0_0], %3[%5#0, %c0_0, %c0_0],
// CHECK-NEXT: for %i0 = 1 to 16 {
for %kTT = 0 to 16 {
dma_start %arg0[%zero, %zero], %Av[%zero, %zero], %num_elt, %tag[%zero] :
memref<512 x 32 x f32>,
memref<? x ? x f32, 2>, memref<1 x i32>
dma_wait %tag[%zero], %num_elt : memref<1 x i32>
}
return
// CHECK: dma_start %arg0[%c0_0, %c0_0], %3[%6#0, %c0_0, %c0_0], %c512, %4[%6#1, %c0_0]
// CHECK: dma_wait %4[%8, %c0_0], %c512 : memref<2x1xi32>
// CHECK: }
// CHECK: dma_wait %4[%11, %c0_0], %c512 : memref<2x1xi32>
// CHECK-NEXT: return
}