[mlir][Linalg] Disable init_tensor elimination by default

init_tensor elimination is arguably a pre-optimization that should be separated from comprehensive bufferization.
In any case, it is still experimental and can easily result in wrong IR that violates SSA def-use ordering.
Isolate the optimization behind a flag, separate the test cases, and add a test case that would result in wrong IR.

Differential Revision: https://reviews.llvm.org/D116936
Nicolas Vasilache 2022-01-10 09:03:23 -05:00
parent c2293bc17d
commit 1a2474b786
6 changed files with 151 additions and 120 deletions
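
Usage note (not part of the commit message): with this change the elimination is opt-in. Based on the RUN lines of the new tests below, it can be re-enabled by passing the new flag to mlir-opt; the input file name here is only a placeholder:

mlir-opt input.mlir -linalg-comprehensive-module-bufferize="allow-return-memref init-tensor-elimination"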


@@ -53,7 +53,11 @@ def LinalgComprehensiveModuleBufferize :
"Use stack allocations for memrefs (for testing purposes only)">,
Option<"analysisFuzzerSeed", "analysis-fuzzer-seed", "unsigned",
/*default=*/"0",
"Analyze ops in random order with a given seed (fuzzer)">
"Analyze ops in random order with a given seed (fuzzer)">,
Option<"initTensorElimination", "init-tensor-elimination", "bool",
/*default=*/"false",
"(Experimental) Try to eliminate init_tensor operations that are "
"anchored at an insert_slice op">,
];
let constructor = "mlir::createLinalgComprehensiveModuleBufferizePass()";
}


@@ -92,8 +92,10 @@ void LinalgComprehensiveModuleBufferize::runOnOperation() {
options->printConflicts = printConflicts;
// Enable InitTensorOp elimination.
options->addPostAnalysisStep<
linalg_ext::InsertSliceAnchoredInitTensorEliminationStep>();
if (initTensorElimination) {
options->addPostAnalysisStep<
linalg_ext::InsertSliceAnchoredInitTensorEliminationStep>();
}
if (!allowReturnMemref)
options->addPostAnalysisStep<scf_ext::AssertDestinationPassingStyle>();


@@ -0,0 +1,55 @@
// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize="test-analysis-only allow-return-memref init-tensor-elimination" -split-input-file | FileCheck %s
// -----
//===----------------------------------------------------------------------===//
// InitTensorOp elimination
//===----------------------------------------------------------------------===//
// CHECK-LABEL: func @buffer_forwarding_conflict
func @buffer_forwarding_conflict(%arg0: tensor<?xf32> {linalg.inplaceable = true}, %arg1: index) -> (tensor<?xf32>, tensor<?xf32>) {
%cst = arith.constant 0.000000e+00 : f32
// CHECK: tensor.extract_slice
// CHECK-SAME: {__inplace_operands_attr__ = ["false", "none"]
// Instead of allocating, share buffer with some inplace bufferization?
%0 = linalg.init_tensor [%arg1] : tensor<?xf32>
// CHECK: linalg.fill
// CHECK-SAME: {__inplace_operands_attr__ = ["none", "true"]
%1 = linalg.fill(%cst, %0) : f32, tensor<?xf32> -> tensor<?xf32>
// CHECK: tensor.insert_slice
// CHECK-SAME: {__inplace_operands_attr__ = ["true", "false", "none"]
%2 = tensor.insert_slice %1 into %arg0[0] [%arg1] [1] : tensor<?xf32> into tensor<?xf32>
// CHECK: tensor.insert_slice
// CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "none"]
%3 = tensor.insert_slice %1 into %arg0[42] [%arg1] [1] : tensor<?xf32> into tensor<?xf32>
// CHECK: return
// CHECK-SAME: __equivalent_func_args__ = [-1, 0]
return %2, %3 : tensor<?xf32>, tensor<?xf32>
}
// -----
// CHECK-LABEL: func @buffer_forwarding_no_conflict
func @buffer_forwarding_no_conflict(%arg0: tensor<?xf32> {linalg.inplaceable = true}, %arg1: index) -> (tensor<?xf32>, tensor<?xf32>) {
%cst = arith.constant 0.000000e+00 : f32
// CHECK: tensor.extract_slice
// CHECK-SAME: {__inplace_operands_attr__ = ["true", "none"]
// Instead of allocating, share buffer with some inplace bufferization?
%0 = linalg.init_tensor [%arg1] : tensor<?xf32>
// CHECK: linalg.fill
// CHECK-SAME: {__inplace_operands_attr__ = ["none", "true"]
%1 = linalg.fill(%cst, %0) : f32, tensor<?xf32> -> tensor<?xf32>
// CHECK: tensor.insert_slice
// CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "none"]
%2 = tensor.insert_slice %1 into %arg0[42] [%arg1] [1] : tensor<?xf32> into tensor<?xf32>
// CHECK: return
// CHECK-SAME: __equivalent_func_args__ = [0, 0]
return %2, %2 : tensor<?xf32>, tensor<?xf32>
}


@@ -1249,60 +1249,6 @@ func @non_reading_scf_for(%t1: tensor<?xf32> {linalg.inplaceable = true},
// -----
//===----------------------------------------------------------------------===//
// InitTensorOp elimination
//===----------------------------------------------------------------------===//
// CHECK-LABEL: func @buffer_forwarding_conflict
func @buffer_forwarding_conflict(%arg0: tensor<?xf32> {linalg.inplaceable = true}, %arg1: index) -> (tensor<?xf32>, tensor<?xf32>) {
%cst = arith.constant 0.000000e+00 : f32
// CHECK: tensor.extract_slice
// CHECK-SAME: {__inplace_operands_attr__ = ["false", "none"]
// Instead of allocating, share buffer with some inplace bufferization?
%0 = linalg.init_tensor [%arg1] : tensor<?xf32>
// CHECK: linalg.fill
// CHECK-SAME: {__inplace_operands_attr__ = ["none", "true"]
%1 = linalg.fill(%cst, %0) : f32, tensor<?xf32> -> tensor<?xf32>
// CHECK: tensor.insert_slice
// CHECK-SAME: {__inplace_operands_attr__ = ["true", "false", "none"]
%2 = tensor.insert_slice %1 into %arg0[0] [%arg1] [1] : tensor<?xf32> into tensor<?xf32>
// CHECK: tensor.insert_slice
// CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "none"]
%3 = tensor.insert_slice %1 into %arg0[42] [%arg1] [1] : tensor<?xf32> into tensor<?xf32>
// CHECK: return
// CHECK-SAME: __equivalent_func_args__ = [-1, 0]
return %2, %3 : tensor<?xf32>, tensor<?xf32>
}
// -----
// CHECK-LABEL: func @buffer_forwarding_no_conflict
func @buffer_forwarding_no_conflict(%arg0: tensor<?xf32> {linalg.inplaceable = true}, %arg1: index) -> (tensor<?xf32>, tensor<?xf32>) {
%cst = arith.constant 0.000000e+00 : f32
// CHECK: tensor.extract_slice
// CHECK-SAME: {__inplace_operands_attr__ = ["true", "none"]
// Instead of allocating, share buffer with some inplace bufferization?
%0 = linalg.init_tensor [%arg1] : tensor<?xf32>
// CHECK: linalg.fill
// CHECK-SAME: {__inplace_operands_attr__ = ["none", "true"]
%1 = linalg.fill(%cst, %0) : f32, tensor<?xf32> -> tensor<?xf32>
// CHECK: tensor.insert_slice
// CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "none"]
%2 = tensor.insert_slice %1 into %arg0[42] [%arg1] [1] : tensor<?xf32> into tensor<?xf32>
// CHECK: return
// CHECK-SAME: __equivalent_func_args__ = [0, 0]
return %2, %2 : tensor<?xf32>, tensor<?xf32>
}
// -----
//===----------------------------------------------------------------------===//
// scf.if cases
//===----------------------------------------------------------------------===//
@@ -1764,3 +1710,26 @@ func @equivalent_func_arg_2(%c0: index, %c10: index, %c1: index, %t0: tensor<?xf
}
return %1: tensor<?xf32>
}
// -----
//===----------------------------------------------------------------------===//
// InitTensorOp elimination would produce SSA violations for the example below.
//===----------------------------------------------------------------------===//
func @depthwise_conv_1d_nwc_wc(%arg0: index, %arg1: index, %arg2: tensor<8x18x32xf32>)
-> tensor<?x1x6x8xf32> {
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%0 = linalg.init_tensor [4, 1, 6, 8] : tensor<4x1x6x8xf32>
%1 = tensor.cast %0 : tensor<4x1x6x8xf32> to tensor<?x1x6x8xf32>
%2 = linalg.init_tensor [1, 6, 8] : tensor<1x6x8xf32>
%3 = scf.for %arg3 = %c0 to %c32 step %c8 iter_args(%arg4 = %1) -> (tensor<?x1x6x8xf32>) {
%4 = affine.apply affine_map<(d0) -> (d0 ceildiv 8)>(%arg3)
%5 = tensor.insert_slice %2 into %arg4[%4,0, 0, 0] [1, 1, 6, 8] [1, 1, 1, 1] :
tensor<1x6x8xf32> into tensor<?x1x6x8xf32>
scf.yield %5 : tensor<?x1x6x8xf32>
}
return %3 : tensor<?x1x6x8xf32>
}
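
For context, a hedged sketch of the failure mode (not part of the commit): the elimination rewrites an init_tensor that only feeds an insert_slice into an extract_slice of that insert_slice's destination. In this example the destination is the loop iter_arg %arg4 and the offset %4 is computed inside the loop, so materializing the extract_slice where the init_tensor was originally defined would yield IR roughly like the following, which is invalid because both values would be used outside their defining region:

// Hypothetical, invalid IR sketch: %arg4 and %4 are only defined inside the
// scf.for region and do not dominate this use.
%2 = tensor.extract_slice %arg4[%4, 0, 0, 0] [1, 1, 6, 8] [1, 1, 1, 1]
    : tensor<?x1x6x8xf32> to tensor<1x6x8xf32>
%3 = scf.for %arg3 = %c0 to %c32 step %c8 iter_args(%arg4 = %1) -> (tensor<?x1x6x8xf32>) {
  %4 = affine.apply affine_map<(d0) -> (d0 ceildiv 8)>(%arg3)
  %5 = tensor.insert_slice %2 into %arg4[%4, 0, 0, 0] [1, 1, 6, 8] [1, 1, 1, 1]
      : tensor<1x6x8xf32> into tensor<?x1x6x8xf32>
  scf.yield %5 : tensor<?x1x6x8xf32>
}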


@@ -0,0 +1,64 @@
// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize="allow-return-memref init-tensor-elimination" -split-input-file | FileCheck %s
// -----
// CHECK: func @buffer_forwarding_conflict(
// CHECK-SAME: %[[FUNC_ARG:[0-9a-zA-Z]*]]: memref<?xf32>
// CHECK-SAME: %[[sz:[0-9a-zA-Z]*]]: index
func @buffer_forwarding_conflict(
%t: tensor<?xf32> {linalg.buffer_layout = affine_map<(d0) -> (d0)>, linalg.inplaceable = true},
%sz: index)
-> (tensor<?xf32>, tensor<?xf32>)
{
%f0 = arith.constant 0.0: f32
// Alloc is needed for the **first** insert_slice (due to backward traversal during analysis).
// CHECK: %[[DIM:.*]] = memref.dim %[[FUNC_ARG]]
// This allocs the whole dim to allow for a full clone of t.
// CHECK: %[[ALLOC:.*]] = memref.alloc(%[[DIM]])
// init_tensor itself does not alloc but forwards to the **second**
// insert_slice. InitTensorOp replaces the init_tensor with an out-of-place
// extract_slice.
// CHECK: %[[EXTRACT_SLICE_ALLOC:.*]] = memref.alloc(%[[sz]])
// CHECK: %[[T_SUBVIEW:.*]] = memref.subview %[[FUNC_ARG]][42] [%[[sz]]] [1]
%a = linalg.init_tensor[%sz] : tensor<?xf32>
// CHECK: linalg.fill({{.*}}, %[[EXTRACT_SLICE_ALLOC]]) : f32, memref<?xf32>
%f = linalg.fill(%f0, %a) : f32, tensor<?xf32> -> tensor<?xf32>
// CHECK: linalg.copy(%[[FUNC_ARG]], %[[ALLOC]]) : memref<?xf32>, memref<?xf32>
// CHECK: %[[SV0_ALLOC:.*]] = memref.subview %[[ALLOC]][0] [%[[sz]]] [1] : memref<?xf32> to memref<?xf32>
// CHECK: linalg.copy(%[[EXTRACT_SLICE_ALLOC]], %[[SV0_ALLOC]]) : memref<?xf32>, memref<?xf32>
%r0 = tensor.insert_slice %f into %t[0][%sz][1]: tensor<?xf32> into tensor<?xf32>
// CHECK: linalg.copy(%[[EXTRACT_SLICE_ALLOC]], %[[T_SUBVIEW]])
%r1 = tensor.insert_slice %f into %t[42][%sz][1]: tensor<?xf32> into tensor<?xf32>
return %r0, %r1: tensor<?xf32>, tensor<?xf32>
}
// -----
// CHECK: func @buffer_forwarding_no_conflict(
// CHECK-SAME: %[[FUNC_ARG:[0-9a-zA-Z]*]]: memref<?xf32>
// CHECK-SAME: %[[sz:[0-9a-zA-Z]*]]: index
func @buffer_forwarding_no_conflict(
%t: tensor<?xf32> {linalg.buffer_layout = affine_map<(d0) -> (d0)>, linalg.inplaceable = true},
%sz: index)
-> (tensor<?xf32>)
{
%f0 = arith.constant 0.0: f32
// init_tensor itself does not alloc but forwards to the insert_slice.
// InitTensorOp replaces the init_tensor with an inplace extract_slice.
// CHECK: %[[T_SUBVIEW:.*]] = memref.subview %[[FUNC_ARG]][42] [%[[sz]]] [1]
%a = linalg.init_tensor[%sz] : tensor<?xf32>
// CHECK: linalg.fill({{.*}}, %[[T_SUBVIEW]]) : f32, memref<?xf32
%f = linalg.fill(%f0, %a) : f32, tensor<?xf32> -> tensor<?xf32>
// Self-copy canonicalizes away later.
%r1 = tensor.insert_slice %f into %t[42][%sz][1]: tensor<?xf32> into tensor<?xf32>
return %r1: tensor<?xf32>
}


@@ -868,69 +868,6 @@ func @dominance_violation_bug_1(
// -----
// CHECK: func @buffer_forwarding_conflict(
// CHECK-SAME: %[[FUNC_ARG:[0-9a-zA-Z]*]]: memref<?xf32>
// CHECK-SAME: %[[sz:[0-9a-zA-Z]*]]: index
func @buffer_forwarding_conflict(
%t: tensor<?xf32> {linalg.buffer_layout = affine_map<(d0) -> (d0)>, linalg.inplaceable = true},
%sz: index)
-> (tensor<?xf32>, tensor<?xf32>)
{
%f0 = arith.constant 0.0: f32
// Alloc is needed for the **first** insert_slice (due to backward traversal during analysis).
// CHECK: %[[DIM:.*]] = memref.dim %[[FUNC_ARG]]
// This allocs the whole dim to allow for a full clone of t.
// CHECK: %[[ALLOC:.*]] = memref.alloc(%[[DIM]])
// init_tensor itself does not alloc but forwards to the **second**
// insert_slice. InitTensorOp replaces the init_tensor with an out-of-place
// extract_slice.
// CHECK: %[[EXTRACT_SLICE_ALLOC:.*]] = memref.alloc(%[[sz]])
// CHECK: %[[T_SUBVIEW:.*]] = memref.subview %[[FUNC_ARG]][42] [%[[sz]]] [1]
%a = linalg.init_tensor[%sz] : tensor<?xf32>
// CHECK: linalg.fill({{.*}}, %[[EXTRACT_SLICE_ALLOC]]) : f32, memref<?xf32>
%f = linalg.fill(%f0, %a) : f32, tensor<?xf32> -> tensor<?xf32>
// CHECK: linalg.copy(%[[FUNC_ARG]], %[[ALLOC]]) : memref<?xf32>, memref<?xf32>
// CHECK: %[[SV0_ALLOC:.*]] = memref.subview %[[ALLOC]][0] [%[[sz]]] [1] : memref<?xf32> to memref<?xf32>
// CHECK: linalg.copy(%[[EXTRACT_SLICE_ALLOC]], %[[SV0_ALLOC]]) : memref<?xf32>, memref<?xf32>
%r0 = tensor.insert_slice %f into %t[0][%sz][1]: tensor<?xf32> into tensor<?xf32>
// CHECK: linalg.copy(%[[EXTRACT_SLICE_ALLOC]], %[[T_SUBVIEW]])
%r1 = tensor.insert_slice %f into %t[42][%sz][1]: tensor<?xf32> into tensor<?xf32>
return %r0, %r1: tensor<?xf32>, tensor<?xf32>
}
// -----
// CHECK: func @buffer_forwarding_no_conflict(
// CHECK-SAME: %[[FUNC_ARG:[0-9a-zA-Z]*]]: memref<?xf32>
// CHECK-SAME: %[[sz:[0-9a-zA-Z]*]]: index
func @buffer_forwarding_no_conflict(
%t: tensor<?xf32> {linalg.buffer_layout = affine_map<(d0) -> (d0)>, linalg.inplaceable = true},
%sz: index)
-> (tensor<?xf32>)
{
%f0 = arith.constant 0.0: f32
// init_tensor itself does not alloc but forwards to the insert_slice.
// InitTensorOp replaces the init_tensor with an inplace extract_slice.
// CHECK: %[[T_SUBVIEW:.*]] = memref.subview %[[FUNC_ARG]][42] [%[[sz]]] [1]
%a = linalg.init_tensor[%sz] : tensor<?xf32>
// CHECK: linalg.fill({{.*}}, %[[T_SUBVIEW]]) : f32, memref<?xf32
%f = linalg.fill(%f0, %a) : f32, tensor<?xf32> -> tensor<?xf32>
// Self-copy canonicalizes away later.
%r1 = tensor.insert_slice %f into %t[42][%sz][1]: tensor<?xf32> into tensor<?xf32>
return %r1: tensor<?xf32>
}
// -----
// CHECK-LABEL: func @scf_if_inplace(
// CHECK-SAME: %[[cond:.*]]: i1, %[[t1:.*]]: memref<?xf32{{.*}}>, %[[v:.*]]: vector
func @scf_if_inplace(%cond: i1,