diff --git a/mlir/test/Dialect/Arithmetic/one-shot-bufferize.mlir b/mlir/test/Dialect/Arithmetic/one-shot-bufferize.mlir
new file mode 100644
index 000000000000..4523981ea322
--- /dev/null
+++ b/mlir/test/Dialect/Arithmetic/one-shot-bufferize.mlir
@@ -0,0 +1,61 @@
+// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs bufferize-function-boundaries" -split-input-file | FileCheck %s
+
+// Run fuzzer with different seeds.
+// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs test-analysis-only analysis-fuzzer-seed=23 bufferize-function-boundaries" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs test-analysis-only analysis-fuzzer-seed=59 bufferize-function-boundaries" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs test-analysis-only analysis-fuzzer-seed=91 bufferize-function-boundaries" -split-input-file -o /dev/null
+
+// Test bufferization using memref types that have no layout map.
+// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs fully-dynamic-layout-maps=0 bufferize-function-boundaries" -split-input-file -o /dev/null
+
+// CHECK-LABEL: func @write_to_select_op_source
+// CHECK-SAME: %[[t1:.*]]: memref<?xf32, #{{.*}}>, %[[t2:.*]]: memref<?xf32, #{{.*}}>
+func.func @write_to_select_op_source(
+    %t1 : tensor<?xf32> {bufferization.writable = true},
+    %t2 : tensor<?xf32> {bufferization.writable = true},
+    %c : i1)
+  -> (tensor<?xf32>, tensor<?xf32>)
+{
+  %cst = arith.constant 0.0 : f32
+  %idx = arith.constant 0 : index
+  // CHECK: %[[alloc:.*]] = memref.alloc
+  // CHECK: memref.copy %[[t1]], %[[alloc]]
+  // CHECK: memref.store %{{.*}}, %[[alloc]]
+  %w = tensor.insert %cst into %t1[%idx] : tensor<?xf32>
+  // CHECK: %[[select:.*]] = arith.select %{{.*}}, %[[t1]], %[[t2]]
+  %s = arith.select %c, %t1, %t2 : tensor<?xf32>
+  // CHECK: return %[[select]], %[[alloc]]
+  return %s, %w : tensor<?xf32>, tensor<?xf32>
+}
+
+// -----
+
+// Due to the out-of-place bufferization of %t1, buffers with different layout
+// maps are passed to arith.select. A cast must be inserted.
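+//
+// As a rough sketch (value names are illustrative and not checked by this
+// test), the bufferized IR is expected to contain:
+//   %casted = memref.cast %alloc : memref<?xf32> to memref<?xf32, #map>
+//   %sel = arith.select %c, %casted, %t2 : memref<?xf32, #map>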
+
+// CHECK-LABEL: func @write_after_select_read_one
+// CHECK-SAME: %[[t1:.*]]: memref<?xf32, #{{.*}}>, %[[t2:.*]]: memref<?xf32, #{{.*}}>
+func.func @write_after_select_read_one(
+    %t1 : tensor<?xf32> {bufferization.writable = true},
+    %t2 : tensor<?xf32> {bufferization.writable = true},
+    %c : i1)
+  -> (f32, tensor<?xf32>)
+{
+  %cst = arith.constant 0.0 : f32
+  %idx = arith.constant 0 : index
+
+  // CHECK: %[[alloc:.*]] = memref.alloc
+  // CHECK-DAG: %[[casted:.*]] = memref.cast %[[alloc]]
+  // CHECK-DAG: memref.copy %[[t1]], %[[alloc]]
+  // CHECK: %[[select:.*]] = arith.select %{{.*}}, %[[casted]], %[[t2]]
+  %s = arith.select %c, %t1, %t2 : tensor<?xf32>
+
+  // CHECK: memref.store %{{.*}}, %[[select]]
+  %w = tensor.insert %cst into %s[%idx] : tensor<?xf32>
+
+  // CHECK: %[[f:.*]] = memref.load %[[t1]]
+  %f = tensor.extract %t1[%idx] : tensor<?xf32>
+
+  // CHECK: return %[[f]], %[[select]]
+  return %f, %w : f32, tensor<?xf32>
+}
diff --git a/mlir/test/Dialect/Linalg/comprehensive-bufferize-analysis-2fill-extract-matmul-all-perms.mlir b/mlir/test/Dialect/Linalg/one-shot-bufferize-analysis-2fill-extract-matmul-all-perms.mlir
similarity index 99%
rename from mlir/test/Dialect/Linalg/comprehensive-bufferize-analysis-2fill-extract-matmul-all-perms.mlir
rename to mlir/test/Dialect/Linalg/one-shot-bufferize-analysis-2fill-extract-matmul-all-perms.mlir
index 18f379041c5e..b48a8af13ede 100644
--- a/mlir/test/Dialect/Linalg/comprehensive-bufferize-analysis-2fill-extract-matmul-all-perms.mlir
+++ b/mlir/test/Dialect/Linalg/one-shot-bufferize-analysis-2fill-extract-matmul-all-perms.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize=test-analysis-only -split-input-file | FileCheck %s
+// RUN: mlir-opt %s -one-shot-bufferize="test-analysis-only bufferize-function-boundaries" -split-input-file | FileCheck %s
 
 /// All combinations of matmul(fill(extract(init_tensor)), fill(extract(%init_tensor)), %arg2)
 /// These should all be inplaceable except the first op.
diff --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir b/mlir/test/Dialect/Linalg/one-shot-bufferize.mlir
similarity index 53%
rename from mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir
rename to mlir/test/Dialect/Linalg/one-shot-bufferize.mlir
index 451ad9c174ad..7a1072c75d23 100644
--- a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir
+++ b/mlir/test/Dialect/Linalg/one-shot-bufferize.mlir
@@ -1,12 +1,14 @@
-// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize=allow-return-allocs -split-input-file | FileCheck %s
+// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs bufferize-function-boundaries" -split-input-file | FileCheck %s
 
 // Run fuzzer with different seeds.
-// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize="allow-return-allocs test-analysis-only analysis-fuzzer-seed=23" -split-input-file -o /dev/null
-// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize="allow-return-allocs test-analysis-only analysis-fuzzer-seed=59" -split-input-file -o /dev/null
-// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize="allow-return-allocs test-analysis-only analysis-fuzzer-seed=91" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs test-analysis-only analysis-fuzzer-seed=23 bufferize-function-boundaries" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs test-analysis-only analysis-fuzzer-seed=59 bufferize-function-boundaries" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs test-analysis-only analysis-fuzzer-seed=91 bufferize-function-boundaries" -split-input-file -o /dev/null
 
 // Test bufferization using memref types that have no layout map.
-// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize="allow-return-allocs fully-dynamic-layout-maps=0" -split-input-file | FileCheck %s --check-prefix=CHECK-NO-LAYOUT-MAP
+// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs fully-dynamic-layout-maps=0 bufferize-function-boundaries" -split-input-file | FileCheck %s --check-prefix=CHECK-NO-LAYOUT-MAP
+
+// TODO: Some test cases from this file should be moved to other dialects.
 
 // CHECK-DAG: #[[$map_1d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
 
@@ -32,19 +34,6 @@ func.func @fill_inplace(
 
 // -----
 
-// CHECK-LABEL: func @tensor_extract(%{{.*}}: memref<?xf32, #{{.*}}>) -> f32 {
-func.func @tensor_extract(%A : tensor<?xf32> {bufferization.writable = false}) -> (f32) {
-  %c0 = arith.constant 0 : index
-
-// CHECK: %[[RES:.*]] = memref.load {{.*}} : memref<?xf32, #{{.*}}>
-  %0 = tensor.extract %A[%c0] : tensor<?xf32>
-
-// CHECK: return %[[RES]] : f32
-  return %0 : f32
-}
-
-// -----
-
 // CHECK-DAG: #[[$map_1d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
 
 /// No bufferization.writable flag, must allocate.
@@ -160,138 +149,6 @@ func.func @vec_not_inplace(
 
 // -----
 
-// CHECK-DAG: #[[$map_1d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
-
-// CHECK-LABEL: func @insert_slice_fun
-// CHECK-SAME: %[[A0:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>,
-// CHECK-SAME: %[[A1:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>,
-// CHECK-SAME: %[[t0:[a-zA-Z0-9]*]]: memref<4xf32, #[[$map_1d_dyn]]>,
-// CHECK-SAME: %[[t1:[a-zA-Z0-9]*]]: memref<4xf32, #[[$map_1d_dyn]]>
-func.func @insert_slice_fun(
-    %A0 : tensor<?xf32> {bufferization.writable = false},
-    %A1 : tensor<?xf32> {bufferization.writable = true},
-    %t0 : tensor<4xf32> {bufferization.writable = false},
-    %t1 : tensor<4xf32> {bufferization.writable = true})
-  -> (tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>)
-{
-  // Hoisted allocs.
-  // CHECK: %[[REALLOC1:.*]] = memref.alloc
-  // CHECK: %[[REALLOC2:.*]] = memref.alloc
-  // CHECK: %[[REALLOC3:.*]] = memref.alloc
-
-  // Alloc and copy the whole result tensor. Copy the tensor.extract_slice.
-  // CHECK: memref.copy %[[A0]], %[[REALLOC3]]
-  // CHECK: %[[SV_A0:.*]] = memref.subview %[[REALLOC3]]
-  // CHECK: memref.copy %[[t0]], %[[SV_A0]]
-  %r0 = tensor.insert_slice %t0 into %A0[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
-  // Alloc and copy the whole result tensor. Copy the tensor.extract_slice.
-  // CHECK: memref.copy %[[A0]]
-  // CHECK: %[[SV_A0_2:.*]] = memref.subview %[[REALLOC2]]
-  // CHECK: memref.copy %[[t1]], %[[SV_A0_2]]
-  %r1 = tensor.insert_slice %t1 into %A0[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
-  // Still alloc the large tensor because %A1 is read after. Copy the tensor.extract_slice.
-  // CHECK: memref.copy %[[A1]]
-  // CHECK: %[[SV_A1:.*]] = memref.subview %[[REALLOC1]]
-  // CHECK: memref.copy %[[t0]], %[[SV_A1]]
-  %r2 = tensor.insert_slice %t0 into %A1[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
-  // Do not realloc the large tensor. Copy the tensor.extract_slice.
-  // CHECK-NOT: alloc
-  // CHECK: %[[SV_A1_2:.*]] = memref.subview %[[A1]]
-  // CHECK: memref.copy %[[t1]], %[[SV_A1_2]]
-  %r3 = tensor.insert_slice %t1 into %A1[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
-  // CHECK: return %[[REALLOC3]], %[[REALLOC2]], %[[REALLOC1]] :
-  // CHECK-SAME: memref<?xf32>, memref<?xf32>, memref<?xf32>
-  return %r0, %r1, %r2, %r3: tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>
-}
-
-// -----
-
-// CHECK-DAG: #[[$map_1d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
-
-// CHECK-LABEL: func @insert_slice_fun
-// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>
-// CHECK-SAME: %[[t:[a-zA-Z0-9]*]]: memref<4xf32, #[[$map_1d_dyn]]>
-func.func @insert_slice_fun(
-    %A : tensor<?xf32> {bufferization.writable = true},
-    %t : tensor<4xf32> {bufferization.writable = false})
-  -> tensor<?xf32>
-{
-  %f0 = arith.constant 0.0 : f32
-
-  // CHECK-NOT: alloc
-  // CHECK: %[[SV_A:.*]] = memref.subview %[[A]]
-  // CHECK: memref.copy %[[t]], %[[SV_A]]
-  %r0 = tensor.insert_slice %t into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
-  /// Overwrite A inplace.
-  // CHECK: linalg.fill ins({{.*}}{{.*}}outs(%[[A]]
-  %r1 = linalg.fill ins(%f0 : f32) outs(%r0 : tensor<?xf32>) -> tensor<?xf32>
-
-  // CHECK: return
-  // CHECK-NOT: tensor
-  return %r1: tensor<?xf32>
-}
-
-// -----
-
-// CHECK-DAG: #[[$map_1d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
-
-// CHECK-LABEL: func @insert_slice_fun
-// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>
-// CHECK-SAME: %[[t:[a-zA-Z0-9]*]]: memref<4xf32, #[[$map_1d_dyn]]>
-func.func @insert_slice_fun(
-    %A : tensor<?xf32> {bufferization.writable = true},
-    %t : tensor<4xf32> {bufferization.writable = false})
-  -> tensor<?xf32>
-{
-  %f0 = arith.constant 0.0 : f32
-
-  // CHECK: linalg.fill ins({{.*}}{{.*}}outs(%[[A]]
-  %r0 = linalg.fill ins(%f0 : f32) outs(%A : tensor<?xf32>) -> tensor<?xf32>
-
-  // CHECK-NOT: alloc
-  // CHECK: %[[SV_A:.*]] = memref.subview %[[A]]
-  /// Overwrite A inplace by copying into the subview.
-  // CHECK: memref.copy %[[t]], %[[SV_A]]
-  %r1 = tensor.insert_slice %t into %r0[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
-  // CHECK: return
-  // CHECK-NOT: tensor
-  return %r1: tensor<?xf32>
-}
-
-// -----
-
-// CHECK-DAG: #[[$map_1d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
-
-// CHECK-LABEL: func @insert_slice_fun_not_inplace
-// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>
-// CHECK-SAME: %[[t:[a-zA-Z0-9]*]]: memref<4xf32, #[[$map_1d_dyn]]>
-func.func @insert_slice_fun_not_inplace(
-    %A : tensor<?xf32> {bufferization.writable = false},
-    %t : tensor<4xf32> {bufferization.writable = false})
-  -> tensor<?xf32>
-{
-  // CHECK: %[[ALLOC:.*]] = memref.alloc(%{{.*}}) {alignment = 128 : i64} : memref<?xf32>
-  // CHECK: memref.copy %[[A]], %[[ALLOC]] : memref<?xf32, #{{.*}}> to memref<?xf32>
-  // CHECK: %[[SV:.*]] = memref.subview %[[ALLOC]][0] [4] [1] : memref<?xf32> to memref<4xf32>
-  // CHECK: memref.copy %[[t]], %[[SV]] : memref<4xf32, #map> to memref<4xf32>
-  %r0 = tensor.insert_slice %t into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
-  // CHECK: return %{{.*}} : memref<?xf32>
-  return %r0: tensor<?xf32>
-}
-
-// -----
-
-//===----------------------------------------------------------------------===//
-// Cross function boundary cases.
-//===----------------------------------------------------------------------===//
-
 // CHECK: func @matmul(
 // CHECK-SAME: %[[A:[0-9a-zA-Z]*]]: memref<128x256xf32>
 // CHECK-SAME: %[[B:[0-9a-zA-Z]*]]: memref<256x192xf32>
@@ -364,29 +221,7 @@ func.func @matmul(
 
 // -----
 
-// CHECK-LABEL: func @tensor_cast_not_in_place(
-// CHECK-SAME: %[[A:.*]]: memref<?xf32, #{{.*}}>, %[[B:.*]]: memref<?xf32, #{{.*}}>
-// CHECK: %[[alloc:.*]] = memref.alloc
-// CHECK: memref.copy %[[A]], %[[alloc]]
-// CHECK: %[[subview:.*]] = memref.subview %[[A]][{{.*}}] [4] [1] : {{.*}} to memref<4xf32
-// CHECK: memref.copy %[[alloc]], %[[subview]]
-func.func @tensor_cast_not_in_place(
-    %A : tensor<?xf32> {bufferization.writable = true},
-    %B : tensor<?xf32> {bufferization.writable = false}, %idx: index)
-  -> (tensor<?xf32>)
-{
-  %r0 = tensor.cast %A : tensor<?xf32> to tensor<4xf32>
-  %r1 = tensor.insert_slice %r0 into %A[%idx][4][1] : tensor<4xf32> into tensor<?xf32>
-  return %r1 : tensor<?xf32>
-}
-
-// -----
-
-//===----------------------------------------------------------------------===//
-// Insertion point cases.
-//===----------------------------------------------------------------------===//
-
-/// These tests just check the produced IR is valid and does not have dominance
+/// This test just checks that the produced IR is valid and does not have dominance
 /// errors in the def-use chains.
 
 // CHECK-LABEL: func @dominance_violation_bug_1
@@ -406,19 +241,6 @@ func.func @dominance_violation_bug_1(
   return %rA : tensor<?x?xf32>
 }
 
-// -----
-
-// CHECK-LABEL: func @insert_op
-// CHECK-SAME: %[[t1:.*]]: memref<?xf32, #{{.*}}>, %[[s:.*]]: f32, %[[i:.*]]: index
-func.func @insert_op(%t1 : tensor<?xf32> {bufferization.writable = true},
-                     %s : f32, %i : index) -> tensor<?xf32> {
-  // CHECK: memref.store %[[s]], %[[t1]][%[[i]]]
-  %0 = tensor.insert %s into %t1[%i] : tensor<?xf32>
-  // CHECK: return
-  return %0 : tensor<?xf32>
-}
-
 // -----
 
 func.func @gather_like(
@@ -537,85 +359,3 @@ func.func @depthwise_conv_1d_nwc_wc(%arg0: index, %arg1: index, %arg2: tensor<8x
   return %3 : tensor
 }
 
-// -----
-
-// CHECK-LABEL: func @write_to_select_op_source
-// CHECK-SAME: %[[t1:.*]]: memref<?xf32, #{{.*}}>, %[[t2:.*]]: memref<?xf32, #{{.*}}>
-func.func @write_to_select_op_source(
-    %t1 : tensor<?xf32> {bufferization.writable = true},
-    %t2 : tensor<?xf32> {bufferization.writable = true},
-    %c : i1)
-  -> (tensor<?xf32>, tensor<?xf32>)
-{
-  %cst = arith.constant 0.0 : f32
-  %idx = arith.constant 0 : index
-  // CHECK: %[[alloc:.*]] = memref.alloc
-  // CHECK: memref.copy %[[t1]], %[[alloc]]
-  // CHECK: memref.store %{{.*}}, %[[alloc]]
-  %w = tensor.insert %cst into %t1[%idx] : tensor<?xf32>
-  // CHECK: %[[select:.*]] = arith.select %{{.*}}, %[[t1]], %[[t2]]
-  %s = arith.select %c, %t1, %t2 : tensor<?xf32>
-  // CHECK: return %[[select]], %[[alloc]]
-  return %s, %w : tensor<?xf32>, tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @write_after_select_read_one
-// CHECK-SAME: %[[t1:.*]]: memref<?xf32, #{{.*}}>, %[[t2:.*]]: memref<?xf32, #{{.*}}>
-func.func @write_after_select_read_one(
-    %t1 : tensor<?xf32> {bufferization.writable = true},
-    %t2 : tensor<?xf32> {bufferization.writable = true},
-    %c : i1)
-  -> (f32, tensor<?xf32>)
-{
-  %cst = arith.constant 0.0 : f32
-  %idx = arith.constant 0 : index
-
-  // CHECK: %[[alloc:.*]] = memref.alloc
-  // CHECK-DAG: %[[casted:.*]] = memref.cast %[[alloc]]
-  // CHECK-DAG: memref.copy %[[t1]], %[[alloc]]
-  // CHECK: %[[select:.*]] = arith.select %{{.*}}, %[[casted]], %[[t2]]
-  %s = arith.select %c, %t1, %t2 : tensor<?xf32>
-
-  // CHECK: memref.store %{{.*}}, %[[select]]
-  %w = tensor.insert %cst into %s[%idx] : tensor<?xf32>
-
-  // CHECK: %[[f:.*]] = memref.load %[[t1]]
-  %f = tensor.extract %t1[%idx] : tensor<?xf32>
-
-  // CHECK: return %[[f]], %[[select]]
-  return %f, %w : f32, tensor<?xf32>
-}
-
-// -----
-
-// A regression test to make sure that we handle rank-reducing extract_slice
-// correctly.
-
-// CHECK-LABEL: func @rank_reducing
-func.func @rank_reducing(
-    %i: index, %j: index,
-    %arg0: tensor<8x18x32xf32>)
-    -> tensor<?x1x6x8xf32> {
-  %c1 = arith.constant 1 : index
-  %c6 = arith.constant 6 : index
-  %c8 = arith.constant 8 : index
-  %c32 = arith.constant 32 : index
-  %c0 = arith.constant 0 : index
-  %0 = linalg.init_tensor [4, 1, 6, 8] : tensor<4x1x6x8xf32>
-  %1 = tensor.cast %0 : tensor<4x1x6x8xf32> to tensor<?x1x6x8xf32>
-  %2 = linalg.init_tensor [1, 6, 8] : tensor<1x6x8xf32>
-  %5 = scf.for %arg7 = %c0 to %c32 step %c8 iter_args(%arg8 = %1) -> (tensor<?x1x6x8xf32>) {
-    %7 = affine.apply affine_map<(d0) -> (d0 ceildiv 8)>(%arg7)
-    %8 = tensor.extract_slice %arg0[%i, %j, %arg7] [1, 6, 8] [1, 1, 1] : tensor<8x18x32xf32> to tensor<1x6x8xf32>
-    %9 = scf.for %arg9 = %c0 to %c6 step %c1 iter_args(%arg10 = %2) -> (tensor<1x6x8xf32>) {
-      %11 = tensor.extract_slice %8[0, %arg9, 0] [1, 1, 8] [1, 1, 1] : tensor<1x6x8xf32> to tensor<1x1x8xf32>
-      %12 = tensor.insert_slice %11 into %arg10[0, %arg9, 0] [1, 1, 8] [1, 1, 1] : tensor<1x1x8xf32> into tensor<1x6x8xf32>
-      scf.yield %12 : tensor<1x6x8xf32>
-    }
-    %10 = tensor.insert_slice %9 into %arg8[%7, 0, 0, 0] [1, 1, 6, 8] [1, 1, 1, 1] : tensor<1x6x8xf32> into tensor<?x1x6x8xf32>
-    scf.yield %10 : tensor<?x1x6x8xf32>
-  }
-  return %5: tensor<?x1x6x8xf32>
-}
diff --git a/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir b/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir
new file mode 100644
index 000000000000..c9a7afd76fbb
--- /dev/null
+++ b/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir
@@ -0,0 +1,197 @@
+// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs bufferize-function-boundaries" -split-input-file | FileCheck %s
+
+// Run fuzzer with different seeds.
+// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs test-analysis-only analysis-fuzzer-seed=23 bufferize-function-boundaries" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs test-analysis-only analysis-fuzzer-seed=59 bufferize-function-boundaries" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs test-analysis-only analysis-fuzzer-seed=91 bufferize-function-boundaries" -split-input-file -o /dev/null
+
+// Test bufferization using memref types that have no layout map.
+// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs fully-dynamic-layout-maps=0 bufferize-function-boundaries" -split-input-file -o /dev/null
+
+// CHECK-DAG: #[[$map_1d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
+
+// CHECK-LABEL: func @insert_slice_fun
+// CHECK-SAME: %[[A0:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>,
+// CHECK-SAME: %[[A1:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>,
+// CHECK-SAME: %[[t0:[a-zA-Z0-9]*]]: memref<4xf32, #[[$map_1d_dyn]]>,
+// CHECK-SAME: %[[t1:[a-zA-Z0-9]*]]: memref<4xf32, #[[$map_1d_dyn]]>
+func.func @insert_slice_fun(
+    %A0 : tensor<?xf32> {bufferization.writable = false},
+    %A1 : tensor<?xf32> {bufferization.writable = true},
+    %t0 : tensor<4xf32> {bufferization.writable = false},
+    %t1 : tensor<4xf32> {bufferization.writable = true})
+  -> (tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>)
+{
+  // Hoisted allocs.
+  // CHECK: %[[REALLOC1:.*]] = memref.alloc
+  // CHECK: %[[REALLOC2:.*]] = memref.alloc
+  // CHECK: %[[REALLOC3:.*]] = memref.alloc
+
+  // Alloc and copy the whole result tensor. Copy the tensor.extract_slice.
+  // CHECK: memref.copy %[[A0]], %[[REALLOC3]]
+  // CHECK: %[[SV_A0:.*]] = memref.subview %[[REALLOC3]]
+  // CHECK: memref.copy %[[t0]], %[[SV_A0]]
+  %r0 = tensor.insert_slice %t0 into %A0[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+  // Alloc and copy the whole result tensor. Copy the tensor.extract_slice.
+  // CHECK: memref.copy %[[A0]]
+  // CHECK: %[[SV_A0_2:.*]] = memref.subview %[[REALLOC2]]
+  // CHECK: memref.copy %[[t1]], %[[SV_A0_2]]
+  %r1 = tensor.insert_slice %t1 into %A0[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+  // Still alloc the large tensor because %A1 is read after. Copy the tensor.extract_slice.
+  // CHECK: memref.copy %[[A1]]
+  // CHECK: %[[SV_A1:.*]] = memref.subview %[[REALLOC1]]
+  // CHECK: memref.copy %[[t0]], %[[SV_A1]]
+  %r2 = tensor.insert_slice %t0 into %A1[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+  // Do not realloc the large tensor. Copy the tensor.extract_slice.
+  // CHECK-NOT: alloc
+  // CHECK: %[[SV_A1_2:.*]] = memref.subview %[[A1]]
+  // CHECK: memref.copy %[[t1]], %[[SV_A1_2]]
+  %r3 = tensor.insert_slice %t1 into %A1[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+  // CHECK: return %[[REALLOC3]], %[[REALLOC2]], %[[REALLOC1]] :
+  // CHECK-SAME: memref<?xf32>, memref<?xf32>, memref<?xf32>
+  return %r0, %r1, %r2, %r3: tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>
+}
+
+// -----
+
+// CHECK-DAG: #[[$map_1d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
+
+// CHECK-LABEL: func @insert_slice_fun
+// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>
+// CHECK-SAME: %[[t:[a-zA-Z0-9]*]]: memref<4xf32, #[[$map_1d_dyn]]>
+func.func @insert_slice_fun(
+    %A : tensor<?xf32> {bufferization.writable = true},
+    %t : tensor<4xf32> {bufferization.writable = false})
+  -> tensor<?xf32>
+{
+  %f0 = arith.constant 0.0 : f32
+
+  // CHECK-NOT: alloc
+  // CHECK: %[[SV_A:.*]] = memref.subview %[[A]]
+  // CHECK: memref.copy %[[t]], %[[SV_A]]
+  %r0 = tensor.insert_slice %t into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+  /// Overwrite A inplace.
+  // CHECK: linalg.fill ins({{.*}}{{.*}}outs(%[[A]]
+  %r1 = linalg.fill ins(%f0 : f32) outs(%r0 : tensor<?xf32>) -> tensor<?xf32>
+
+  // CHECK: return
+  // CHECK-NOT: tensor
+  return %r1: tensor<?xf32>
+}
+
+// -----
+
+// CHECK-DAG: #[[$map_1d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
+
+// CHECK-LABEL: func @insert_slice_fun
+// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>
+// CHECK-SAME: %[[t:[a-zA-Z0-9]*]]: memref<4xf32, #[[$map_1d_dyn]]>
+func.func @insert_slice_fun(
+    %A : tensor<?xf32> {bufferization.writable = true},
+    %t : tensor<4xf32> {bufferization.writable = false})
+  -> tensor<?xf32>
+{
+  %f0 = arith.constant 0.0 : f32
+
+  // CHECK: linalg.fill ins({{.*}}{{.*}}outs(%[[A]]
+  %r0 = linalg.fill ins(%f0 : f32) outs(%A : tensor<?xf32>) -> tensor<?xf32>
+
+  // CHECK-NOT: alloc
+  // CHECK: %[[SV_A:.*]] = memref.subview %[[A]]
+  /// Overwrite A inplace by copying into the subview.
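+  /// (No new allocation is needed: the fill above already wrote into %A in
+  /// place, so the copy below only overwrites the [0, 4) subview of %A.)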
+  // CHECK: memref.copy %[[t]], %[[SV_A]]
+  %r1 = tensor.insert_slice %t into %r0[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+  // CHECK: return
+  // CHECK-NOT: tensor
+  return %r1: tensor<?xf32>
+}
+
+// -----
+
+// CHECK-DAG: #[[$map_1d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
+
+// CHECK-LABEL: func @insert_slice_fun_not_inplace
+// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>
+// CHECK-SAME: %[[t:[a-zA-Z0-9]*]]: memref<4xf32, #[[$map_1d_dyn]]>
+func.func @insert_slice_fun_not_inplace(
+    %A : tensor<?xf32> {bufferization.writable = false},
+    %t : tensor<4xf32> {bufferization.writable = false})
+  -> tensor<?xf32>
+{
+  // CHECK: %[[ALLOC:.*]] = memref.alloc(%{{.*}}) {alignment = 128 : i64} : memref<?xf32>
+  // CHECK: memref.copy %[[A]], %[[ALLOC]] : memref<?xf32, #{{.*}}> to memref<?xf32>
+  // CHECK: %[[SV:.*]] = memref.subview %[[ALLOC]][0] [4] [1] : memref<?xf32> to memref<4xf32>
+  // CHECK: memref.copy %[[t]], %[[SV]] : memref<4xf32, #map> to memref<4xf32>
+  %r0 = tensor.insert_slice %t into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+  // CHECK: return %{{.*}} : memref<?xf32>
+  return %r0: tensor<?xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @tensor_cast_not_in_place(
+// CHECK-SAME: %[[A:.*]]: memref<?xf32, #{{.*}}>, %[[B:.*]]: memref<?xf32, #{{.*}}>
+// CHECK: %[[alloc:.*]] = memref.alloc
+// CHECK: memref.copy %[[A]], %[[alloc]]
+// CHECK: %[[subview:.*]] = memref.subview %[[A]][{{.*}}] [4] [1] : {{.*}} to memref<4xf32
+// CHECK: memref.copy %[[alloc]], %[[subview]]
+func.func @tensor_cast_not_in_place(
+    %A : tensor<?xf32> {bufferization.writable = true},
+    %B : tensor<?xf32> {bufferization.writable = false}, %idx: index)
+  -> (tensor<?xf32>)
+{
+  %r0 = tensor.cast %A : tensor<?xf32> to tensor<4xf32>
+  %r1 = tensor.insert_slice %r0 into %A[%idx][4][1] : tensor<4xf32> into tensor<?xf32>
+  return %r1 : tensor<?xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @insert_op
+// CHECK-SAME: %[[t1:.*]]: memref<?xf32, #{{.*}}>, %[[s:.*]]: f32, %[[i:.*]]: index
+func.func @insert_op(%t1 : tensor<?xf32> {bufferization.writable = true},
+                     %s : f32, %i : index) -> tensor<?xf32> {
+  // CHECK: memref.store %[[s]], %[[t1]][%[[i]]]
+  %0 = tensor.insert %s into %t1[%i] : tensor<?xf32>
+  // CHECK: return
+  return %0 : tensor<?xf32>
+}
+
+// -----
+
+// A regression test to make sure that we handle rank-reducing extract_slice
+// correctly.
+
+// CHECK-LABEL: func @rank_reducing
+func.func @rank_reducing(
+    %i: index, %j: index,
+    %arg0: tensor<8x18x32xf32>)
+    -> tensor<?x1x6x8xf32> {
+  %c1 = arith.constant 1 : index
+  %c6 = arith.constant 6 : index
+  %c8 = arith.constant 8 : index
+  %c32 = arith.constant 32 : index
+  %c0 = arith.constant 0 : index
+  %0 = linalg.init_tensor [4, 1, 6, 8] : tensor<4x1x6x8xf32>
+  %1 = tensor.cast %0 : tensor<4x1x6x8xf32> to tensor<?x1x6x8xf32>
+  %2 = linalg.init_tensor [1, 6, 8] : tensor<1x6x8xf32>
+  %5 = scf.for %arg7 = %c0 to %c32 step %c8 iter_args(%arg8 = %1) -> (tensor<?x1x6x8xf32>) {
+    %7 = affine.apply affine_map<(d0) -> (d0 ceildiv 8)>(%arg7)
+    %8 = tensor.extract_slice %arg0[%i, %j, %arg7] [1, 6, 8] [1, 1, 1] : tensor<8x18x32xf32> to tensor<1x6x8xf32>
+    %9 = scf.for %arg9 = %c0 to %c6 step %c1 iter_args(%arg10 = %2) -> (tensor<1x6x8xf32>) {
+      %11 = tensor.extract_slice %8[0, %arg9, 0] [1, 1, 8] [1, 1, 1] : tensor<1x6x8xf32> to tensor<1x1x8xf32>
+      %12 = tensor.insert_slice %11 into %arg10[0, %arg9, 0] [1, 1, 8] [1, 1, 1] : tensor<1x1x8xf32> into tensor<1x6x8xf32>
+      scf.yield %12 : tensor<1x6x8xf32>
+    }
+    %10 = tensor.insert_slice %9 into %arg8[%7, 0, 0, 0] [1, 1, 6, 8] [1, 1, 1, 1] : tensor<1x6x8xf32> into tensor<?x1x6x8xf32>
+    scf.yield %10 : tensor<?x1x6x8xf32>
+  }
+  return %5: tensor<?x1x6x8xf32>
+}
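+
+// For reference (an illustrative example only, not part of the checked IR):
+// a rank-reducing extract_slice drops unit result dimensions, e.g.
+//   %slice = tensor.extract_slice %arg0[0, 0, 0] [1, 6, 8] [1, 1, 1]
+//       : tensor<8x18x32xf32> to tensor<6x8xf32>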