forked from OSchip/llvm-project
[mlir][bufferize][NFC] Rename `comprehensive-function-bufferize` to `one-shot-bufferize`
The related functionality is moved over to the bufferization dialect. Test cases are cleaned up a bit.

Differential Revision: https://reviews.llvm.org/D120191

This commit is contained in:
parent 5acd9c49a8
commit d2dacde5d8
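For downstream users, the visible effect is a pass-flag rename: pipelines that previously ran `-test-comprehensive-function-bufferize=...` (or `-linalg-comprehensive-module-bufferize=...`) now run `-one-shot-bufferize=...`, as the updated RUN lines below show. A minimal C++ sketch of scheduling the renamed pass (the surrounding tool setup is assumed, not part of this patch):

#include "mlir/Dialect/Bufferization/Transforms/Passes.h"
#include "mlir/Pass/PassManager.h"

// Sketch: schedule One-Shot Bufferize on a module-level pipeline. The pass
// reads its configuration from the options defined in Passes.td below.
void addOneShotBufferize(mlir::PassManager &pm) {
  pm.addPass(mlir::bufferization::createOneShotBufferizePass());
}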
@@ -164,9 +164,8 @@ private:
 LogicalResult analyzeOp(Operation *op, AnalysisBufferizationState &state);

 /// Run One-Shot Bufferize on the given op: Analysis + Bufferization
-LogicalResult
-runOneShotBufferize(Operation *op,
-                    std::unique_ptr<AnalysisBufferizationOptions> options);
+LogicalResult runOneShotBufferize(Operation *op,
+                                  const AnalysisBufferizationOptions &options);

 } // namespace bufferization
 } // namespace mlir
@@ -5,6 +5,7 @@

 namespace mlir {
 namespace bufferization {
+struct AnalysisBufferizationOptions;

 //===----------------------------------------------------------------------===//
 // Passes
@@ -29,6 +30,15 @@ std::unique_ptr<Pass> createBufferResultsToOutParamsPass();
 /// bufferization.to_tensor and bufferization.to_memref operations.
 std::unique_ptr<OperationPass<FuncOp>> createFinalizingBufferizePass();

+/// Create a pass that bufferizes all ops that implement BufferizableOpInterface
+/// with One-Shot Bufferize.
+std::unique_ptr<Pass> createOneShotBufferizePass();
+
+/// Create a pass that bufferizes all ops that implement BufferizableOpInterface
+/// with One-Shot Bufferize and the specified bufferization options.
+std::unique_ptr<Pass>
+createOneShotBufferizePass(const AnalysisBufferizationOptions &options);
+
 /// Creates a pass that promotes heap-based allocations to stack-based ones.
 /// Only buffers smaller than the provided size are promoted.
 /// Dynamic shaped buffers are promoted up to the given rank.
@@ -149,6 +149,88 @@ def FinalizingBufferize : Pass<"finalizing-bufferize", "FuncOp"> {
   let constructor = "mlir::bufferization::createFinalizingBufferizePass()";
 }

+def OneShotBufferize : Pass<"one-shot-bufferize", "ModuleOp"> {
+  let summary = "One-Shot Bufferize";
+  let description = [{
+    This pass bufferizes all ops that implement `BufferizableOpInterface`. It
+    first performs an inplaceability analysis on SSA use-def chains of tensor
+    values to determine which OpOperands may bufferize in-place, i.e., without
+    inserting a buffer copy. It then rewrites the IR, inserting a buffer
+    allocation and copy for each OpOperand that was decided to bufferize
+    out-of-place.
+
+    One-Shot Bufferize (and `BufferizableOpInterface`) was designed for ops that
+    are in destination-passing style. When bufferizing such ops, it is possible
+    to reuse the buffer of a tensor OpOperand for a tensor OpResult. In essence,
+    a possible destination of an operation is already passed as an SSA value.
+
+    `tensor.insert` is an example of an op in destination-passing style. E.g.,
+    when bufferizing `%t0 = tensor.insert %f into %dest[%idx]`, `buffer(%t0)` is
+    identical to `buffer(%dest)` in the absence of RaW conflicts. As a
+    counterexample, `tensor.generate` is not in destination-passing style and
+    always results in a new buffer allocation.
+
+    One-Shot Bufferize deallocates all buffers that it allocates. Yielding newly
+    allocated buffers from a block is not supported yet and such IR will be
+    rejected. For testing purposes and compatibility with partial bufferization,
+    One-Shot Bufferize can be run with `allow-return-memref=1 create-deallocs=0`
+    to allow such IR.
+
+    One-Shot Bufferize will by default reject IR that contains non-bufferizable
+    ops, i.e., ops that do not implement BufferizableOpInterface. Such IR can
+    be allowed with `allow-unknown-ops=1`. In that case, to_memref and to_tensor
+    ops will be generated at the bufferization boundary. This is useful for
+    compatibility with existing partial bufferization passes: These can
+    bufferize the remaining IR after running One-Shot Bufferize.
+
+    Note: Running One-Shot Bufferize after a partial bufferization pass is
+    currently not supported. Running partial bufferization passes after running
+    One-Shot Bufferize is supported and the recommended way to gradually
+    migrate from partial bufferization to One-Shot Bufferize.
+
+    With `dialect-filter`, bufferization can be restricted to a set of dialects.
+    If no filter is specified, all ops that implement `BufferizableOpInterface`
+    are bufferized. Ops from the `std` dialect are an exception: These ops are
+    always ignored, even if no filter is specified. When a dialect filter is
+    specified and `allow-unknown-ops` is not turned on, bufferization fails
+    when encountering an op that is not included in the filter (even if it is
+    bufferizable).
+
+    For testing/debugging purposes, `test-analysis-only=1 print-conflicts=1`
+    prints analysis results and explains why an OpOperand was decided to
+    bufferize out-of-place. This is useful for understanding why One-Shot
+    Bufferize chose to insert a certain buffer copy.
+  }];
+  let options = [
+    Option<"allowReturnMemref", "allow-return-memref", "bool",
+           /*default=*/"false",
+           "Allows the return of memrefs (for testing purposes only)">,
+    Option<"allowUnknownOps", "allow-unknown-ops", "bool",
+           /*default=*/"false",
+           "Allows unknown (not bufferizable) ops in the input IR.">,
+    Option<"analysisFuzzerSeed", "analysis-fuzzer-seed", "unsigned",
+           /*default=*/"0",
+           "Test only: Analyze ops in random order with a given seed (fuzzer)">,
+    Option<"createDeallocs", "create-deallocs", "bool", /*default=*/"true",
+           "Specify if buffers should be deallocated. For compatibility with "
+           "core bufferization passes.">,
+    ListOption<"dialectFilter", "dialect-filter", "std::string",
+               "Restrict bufferization to ops from these dialects.",
+               "llvm::cl::MiscFlags::CommaSeparated">,
+    Option<"fullyDynamicLayoutMaps", "fully-dynamic-layout-maps", "bool",
+           /*default=*/"true",
+           "Generate MemRef types with dynamic offset+strides by default.">,
+    Option<"testAnalysisOnly", "test-analysis-only", "bool",
+           /*default=*/"false",
+           "Test only: Only run inplaceability analysis and annotate IR">,
+    Option<"printConflicts", "print-conflicts", "bool",
+           /*default=*/"false",
+           "Test only: Annotate IR with RaW conflicts. Requires "
+           "test-analysis-only.">,
+  ];
+  let constructor = "mlir::bufferization::createOneShotBufferizePass()";
+}
+
 def PromoteBuffersToStack : Pass<"promote-buffers-to-stack", "FuncOp"> {
   let summary = "Promotes heap-based allocations to automatically managed "
                 "stack-based allocations";
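The same options exposed as textual flags above can also be set programmatically and handed to the second `createOneShotBufferizePass` overload. A hedged sketch (field names follow the assignments in the pass implementation further down; the `PassManager` is assumed to exist in the caller):

#include "mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h"
#include "mlir/Dialect/Bufferization/Transforms/Passes.h"
#include "mlir/Pass/PassManager.h"

// Sketch: programmatic equivalent of
//   -one-shot-bufferize="allow-return-memref allow-unknown-ops create-deallocs=0"
void addConfiguredOneShotBufferize(mlir::PassManager &pm) {
  mlir::bufferization::AnalysisBufferizationOptions options;
  options.allowReturnMemref = true; // testing only: permit returning memrefs
  options.allowUnknownOps = true;   // to_tensor/to_memref at the boundary
  options.createDeallocs = false;   // leave deallocation to a later pass
  pm.addPass(mlir::bufferization::createOneShotBufferizePass(options));
}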
@@ -17,6 +17,7 @@
 #include "mlir/Dialect/AMX/AMXDialect.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
+#include "mlir/Dialect/Arithmetic/Transforms/BufferizableOpInterfaceImpl.h"
 #include "mlir/Dialect/ArmNeon/ArmNeonDialect.h"
 #include "mlir/Dialect/ArmSVE/ArmSVEDialect.h"
 #include "mlir/Dialect/Async/IR/Async.h"
@@ -30,6 +31,7 @@
 #include "mlir/Dialect/LLVMIR/NVVMDialect.h"
 #include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/Linalg/Transforms/BufferizableOpInterfaceImpl.h"
 #include "mlir/Dialect/Math/IR/Math.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/OpenACC/OpenACC.h"
@@ -37,6 +39,7 @@
 #include "mlir/Dialect/PDL/IR/PDL.h"
 #include "mlir/Dialect/PDLInterp/IR/PDLInterp.h"
 #include "mlir/Dialect/Quant/QuantOps.h"
+#include "mlir/Dialect/SCF/BufferizableOpInterfaceImpl.h"
 #include "mlir/Dialect/SCF/SCF.h"
 #include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h"
 #include "mlir/Dialect/Shape/IR/Shape.h"
@@ -45,8 +48,10 @@
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/Dialect/Tensor/IR/TensorInferTypeOpInterfaceImpl.h"
 #include "mlir/Dialect/Tensor/IR/TensorTilingInterfaceImpl.h"
+#include "mlir/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.h"
 #include "mlir/Dialect/Tosa/IR/TosaOps.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/Vector/Transforms/BufferizableOpInterfaceImpl.h"
 #include "mlir/Dialect/X86Vector/X86VectorDialect.h"
 #include "mlir/IR/Dialect.h"

@@ -88,8 +93,13 @@ inline void registerAllDialects(DialectRegistry &registry) {
                   tosa::TosaDialect,
                   x86vector::X86VectorDialect>();
   // clang-format on
+  arith::registerBufferizableOpInterfaceExternalModels(registry);
+  linalg::registerBufferizableOpInterfaceExternalModels(registry);
+  scf::registerBufferizableOpInterfaceExternalModels(registry);
+  tensor::registerBufferizableOpInterfaceExternalModels(registry);
   tensor::registerInferTypeOpInterfaceExternalModels(registry);
   tensor::registerTilingOpInterfaceExternalModels(registry);
+  vector::registerBufferizableOpInterfaceExternalModels(registry);
 }

 /// Append all the MLIR dialects to the registry contained in the given context.
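For context: a tool that does not go through `registerAllDialects()` needs the same external-model registrations so that One-Shot Bufferize treats these dialects as bufferizable. A sketch using the calls added above (the helper name is hypothetical):

#include "mlir/InitAllDialects.h"

// Sketch: register only the bufferization external models, e.g. in a custom
// tool that assembles its own DialectRegistry.
void registerBufferizationExternalModels(mlir::DialectRegistry &registry) {
  mlir::arith::registerBufferizableOpInterfaceExternalModels(registry);
  mlir::linalg::registerBufferizableOpInterfaceExternalModels(registry);
  mlir::scf::registerBufferizableOpInterfaceExternalModels(registry);
  mlir::tensor::registerBufferizableOpInterfaceExternalModels(registry);
  mlir::vector::registerBufferizableOpInterfaceExternalModels(registry);
}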
@@ -11,9 +11,13 @@
 #include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
 #include "mlir/Dialect/Bufferization/IR/Bufferization.h"
 #include "mlir/Dialect/Bufferization/Transforms/Bufferize.h"
+#include "mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h"
 #include "mlir/Dialect/Bufferization/Transforms/Passes.h"
+#include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/IR/Operation.h"
+#include "mlir/Pass/PassManager.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "mlir/Transforms/Passes.h"

 using namespace mlir;
 using namespace mlir::bufferization;
@@ -144,8 +148,81 @@ struct FinalizingBufferizePass
     signalPassFailure();
   }
 };
+
+struct OneShotBufferizePass
+    : public OneShotBufferizeBase<OneShotBufferizePass> {
+  using OneShotBufferizeBase<OneShotBufferizePass>::OneShotBufferizeBase;
+
+  explicit OneShotBufferizePass(const AnalysisBufferizationOptions &options)
+      : options(options) {}
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<bufferization::BufferizationDialect>();
+  }
+
+  void runOnOperation() override {
+    AnalysisBufferizationOptions opt;
+    if (!options) {
+      // Make new bufferization options if none were provided when creating the
+      // pass.
+      opt.allowReturnMemref = allowReturnMemref;
+      opt.allowUnknownOps = allowUnknownOps;
+      opt.analysisFuzzerSeed = analysisFuzzerSeed;
+      opt.createDeallocs = createDeallocs;
+      opt.fullyDynamicLayoutMaps = fullyDynamicLayoutMaps;
+      opt.printConflicts = printConflicts;
+      opt.testAnalysisOnly = testAnalysisOnly;
+
+      BufferizationOptions::OpFilterEntry::FilterFn filterFn =
+          [&](Operation *op) {
+            // Disallow std dialect ops, i.e., ops related to function calls.
+            if (op->getDialect()->getNamespace() ==
+                StandardOpsDialect::getDialectNamespace())
+              return false;
+            // Filter may be specified via options.
+            if (this->dialectFilter.hasValue())
+              return llvm::find(this->dialectFilter,
+                                op->getDialect()->getNamespace()) !=
+                     this->dialectFilter.end();
+            // No filter specified: All other ops are allowed.
+            return true;
+          };
+      opt.allowOperationInFilter(filterFn);
+    } else {
+      opt = *options;
+    }
+
+    ModuleOp moduleOp = getOperation();
+    if (failed(runOneShotBufferize(moduleOp, opt))) {
+      signalPassFailure();
+      return;
+    }
+
+    if (opt.testAnalysisOnly)
+      return;
+
+    OpPassManager cleanupPipeline("builtin.module");
+    cleanupPipeline.addPass(createCanonicalizerPass());
+    cleanupPipeline.addPass(createCSEPass());
+    cleanupPipeline.addPass(createLoopInvariantCodeMotionPass());
+    (void)runPipeline(cleanupPipeline, moduleOp);
+  }
+
+private:
+  llvm::Optional<AnalysisBufferizationOptions> options;
+};
 } // namespace

+std::unique_ptr<Pass> mlir::bufferization::createOneShotBufferizePass() {
+  return std::make_unique<OneShotBufferizePass>();
+}
+
+std::unique_ptr<Pass> mlir::bufferization::createOneShotBufferizePass(
+    const AnalysisBufferizationOptions &options) {
+  return std::make_unique<OneShotBufferizePass>(options);
+}
+
 std::unique_ptr<OperationPass<FuncOp>>
 mlir::bufferization::createFinalizingBufferizePass() {
   return std::make_unique<FinalizingBufferizePass>();
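The `dialect-filter` handling above builds a `FilterFn` and installs it with `allowOperationInFilter`. A hedged sketch of doing the same from client code (the helper name is hypothetical; the call itself is taken from the pass implementation above):

#include "mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h"
#include "mlir/IR/Operation.h"

// Sketch: restrict bufferization to the tensor dialect with a custom filter.
void restrictToTensorDialect(
    mlir::bufferization::AnalysisBufferizationOptions &options) {
  options.allowOperationInFilter([](mlir::Operation *op) {
    // Allow only ops from the tensor dialect; everything else is skipped
    // (and wrapped in to_tensor/to_memref when allow-unknown-ops is set).
    return op->getDialect()->getNamespace() == "tensor";
  });
}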
@@ -799,11 +799,11 @@ LogicalResult bufferization::analyzeOp(Operation *op,
 }

 LogicalResult bufferization::runOneShotBufferize(
-    Operation *op, std::unique_ptr<AnalysisBufferizationOptions> options) {
-  AnalysisBufferizationState state(op, *options);
+    Operation *op, const AnalysisBufferizationOptions &options) {
+  AnalysisBufferizationState state(op, options);
   if (failed(analyzeOp(op, state)))
     return failure();
-  if (options->testAnalysisOnly)
+  if (options.testAnalysisOnly)
     return success();
   return bufferizeOp(op, state);
 }
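A sketch of calling the updated entry point directly, e.g. from a custom driver (the wrapper function is hypothetical): the options are now passed by const reference, so a stack-allocated struct can be reused instead of handing over a `std::unique_ptr`.

#include "mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h"
#include "mlir/Support/LogicalResult.h"

// Sketch: run the analysis only and annotate the IR, without rewriting.
mlir::LogicalResult annotateWithAnalysis(mlir::Operation *op) {
  mlir::bufferization::AnalysisBufferizationOptions options;
  options.testAnalysisOnly = true; // annotate IR, do not rewrite
  return mlir::bufferization::runOneShotBufferize(op, options);
}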
@@ -1,10 +1,10 @@
 // RUN: mlir-opt %s \
-// RUN: -test-comprehensive-function-bufferize="allow-return-memref allow-unknown-ops create-deallocs=0" \
+// RUN: -one-shot-bufferize="allow-return-memref allow-unknown-ops create-deallocs=0" \
 // RUN: -split-input-file | \
 // RUN: FileCheck %s --check-prefix=CHECK-NODEALLOC

 // RUN: mlir-opt %s \
-// RUN: -test-comprehensive-function-bufferize="allow-return-memref allow-unknown-ops create-deallocs=0" \
+// RUN: -one-shot-bufferize="allow-return-memref allow-unknown-ops create-deallocs=0" \
 // RUN: -buffer-deallocation | \
 // RUN: FileCheck %s --check-prefix=CHECK-BUFFERDEALLOC

@@ -1,30 +1,28 @@
-// RUN: mlir-opt %s -allow-unregistered-dialect -linalg-comprehensive-module-bufferize="allow-return-memref allow-unknown-ops" -split-input-file | FileCheck %s
+// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="allow-return-memref allow-unknown-ops" -split-input-file | FileCheck %s

 // Test bufferization using memref types that have no layout map.
-// RUN: mlir-opt %s -allow-unregistered-dialect -linalg-comprehensive-module-bufferize="allow-return-memref allow-unknown-ops fully-dynamic-layout-maps=0" -split-input-file | FileCheck %s --check-prefix=CHECK-NO-LAYOUT-MAP
+// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="allow-return-memref allow-unknown-ops fully-dynamic-layout-maps=0" -split-input-file | FileCheck %s --check-prefix=CHECK-NO-LAYOUT-MAP

 // Run fuzzer with different seeds.
-// RUN: mlir-opt %s -allow-unregistered-dialect -linalg-comprehensive-module-bufferize="allow-return-memref test-analysis-only analysis-fuzzer-seed=23" -split-input-file -o /dev/null
-// RUN: mlir-opt %s -allow-unregistered-dialect -linalg-comprehensive-module-bufferize="allow-return-memref test-analysis-only analysis-fuzzer-seed=59" -split-input-file -o /dev/null
-// RUN: mlir-opt %s -allow-unregistered-dialect -linalg-comprehensive-module-bufferize="allow-return-memref test-analysis-only analysis-fuzzer-seed=91" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="allow-return-memref test-analysis-only analysis-fuzzer-seed=23" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="allow-return-memref test-analysis-only analysis-fuzzer-seed=59" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="allow-return-memref test-analysis-only analysis-fuzzer-seed=91" -split-input-file -o /dev/null

-// RUN: mlir-opt %s -allow-unregistered-dialect -test-comprehensive-function-bufferize="dialect-filter=tensor allow-unknown-ops allow-return-memref" -canonicalize -split-input-file | FileCheck %s --check-prefix=CHECK-TENSOR
-// RUN: mlir-opt %s -allow-unregistered-dialect -test-comprehensive-function-bufferize="dialect-filter=scf allow-unknown-ops allow-return-memref" -canonicalize -split-input-file | FileCheck %s --check-prefix=CHECK-SCF
+// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="dialect-filter=tensor allow-unknown-ops allow-return-memref" -canonicalize -split-input-file | FileCheck %s --check-prefix=CHECK-TENSOR
+// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="dialect-filter=scf allow-unknown-ops allow-return-memref" -canonicalize -split-input-file | FileCheck %s --check-prefix=CHECK-SCF

 // CHECK: #[[$MAP:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>

 // CHECK-LABEL: func @use_of_unknown_op_1(
-// CHECK-SAME: %[[m1:.*]]: memref<?xf32, #[[$MAP]]>
+// CHECK-SAME: %[[t1:.*]]: tensor<?xf32>
 // CHECK-NO-LAYOUT-MAP-LABEL: func @use_of_unknown_op_1(
-// CHECK-NO-LAYOUT-MAP-SAME: %[[m1:.*]]: memref<?xf32>)
-func @use_of_unknown_op_1(%t1: tensor<?xf32> {linalg.inplaceable = true})
+// CHECK-NO-LAYOUT-MAP-SAME: %[[t1:.*]]: tensor<?xf32>
+func @use_of_unknown_op_1(%t1: tensor<?xf32>)
     -> vector<5xf32> {
   // ToTensorOp is generated because the function is bufferized and has a
   // memref block argument.
-  // CHECK: %[[m1_tensor:.*]] = bufferization.to_tensor %[[m1]] : memref<?xf32, #[[$MAP]]>
-  // CHECK: %[[dummy:.*]] = "test.dummy_op"(%[[m1_tensor]])
-  // CHECK-NO-LAYOUT-MAP: %[[m1_tensor:.*]] = bufferization.to_tensor %[[m1]] : memref<?xf32>
-  // CHECK-NO-LAYOUT-MAP: %[[dummy:.*]] = "test.dummy_op"(%[[m1_tensor]])
+  // CHECK: %[[dummy:.*]] = "test.dummy_op"(%[[t1]])
+  // CHECK-NO-LAYOUT-MAP: %[[dummy:.*]] = "test.dummy_op"(%[[t1]])
   %0 = "test.dummy_op"(%t1) : (tensor<?xf32>) -> tensor<?xf32>

   %idx = arith.constant 0 : index
@@ -40,36 +38,34 @@ func @use_of_unknown_op_1(%t1: tensor<?xf32> {linalg.inplaceable = true})
 // -----

 // CHECK-LABEL: func @use_of_unknown_op_2(
-// CHECK-SAME: %[[m1:.*]]: memref<?xf32
-func @use_of_unknown_op_2(%t1: tensor<?xf32> {linalg.inplaceable = true})
-    -> tensor<?xf32> {
-  // CHECK: %[[m1_tensor:.*]] = bufferization.to_tensor %[[m1]]
-
-  // CHECK: %[[dummy1:.*]] = "test.dummy_op"(%[[m1_tensor]])
+// CHECK-SAME: %[[t1:.*]]: tensor<?xf32>
+func @use_of_unknown_op_2(%t1: tensor<?xf32>) -> tensor<?xf32> {
+  // CHECK: %[[dummy1:.*]] = "test.dummy_op"(%[[t1]])
   %0 = "test.dummy_op"(%t1) : (tensor<?xf32>) -> tensor<?xf32>
   // CHECK: %[[dummy2:.*]] = "test.another_dummy_op"(%[[dummy1]])
   %1 = "test.another_dummy_op"(%0) : (tensor<?xf32>) -> tensor<?xf32>

-  // CHECK: %[[dummy2_memref:.*]] = bufferization.to_memref %[[dummy2]]
-  // CHECK: return %[[dummy2_memref]]
+  // CHECK: return %[[dummy2]]
   return %1 : tensor<?xf32>
 }

 // -----

+// CHECK: #[[$MAP2:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
+
 // CHECK-LABEL: func @use_of_unknown_op_3(
-// CHECK-SAME: %[[m1:.*]]: memref<?xf32
-func @use_of_unknown_op_3(%t1: tensor<?xf32> {linalg.inplaceable = true})
+// CHECK-SAME: %[[t1:.*]]: tensor<?xf32>
+func @use_of_unknown_op_3(%t1: tensor<?xf32>)
     -> (vector<5xf32>, vector<5xf32>) {
   %idx = arith.constant 0 : index
   %cst = arith.constant 0.0 : f32
-  // CHECK: %[[m1_tensor:.*]] = bufferization.to_tensor %[[m1]]
+  // CHECK: %[[m1:.*]] = bufferization.to_memref %[[t1]]
   // CHECK: %[[v1:.*]] = vector.transfer_read %[[m1]]
   %1 = vector.transfer_read %t1[%idx], %cst : tensor<?xf32>, vector<5xf32>

-  // CHECK: %[[dummy:.*]] = "test.dummy_op"(%[[m1_tensor]])
+  // CHECK: %[[dummy:.*]] = "test.dummy_op"(%[[t1]])
   %0 = "test.dummy_op"(%t1) : (tensor<?xf32>) -> tensor<?xf32>
-  // CHECK: %[[dummy_memref:.*]] = bufferization.to_memref %[[dummy]]
+  // CHECK: %[[dummy_memref:.*]] = bufferization.to_memref %[[dummy]] : memref<?xf32, #[[$MAP2]]>
   // CHECK: %[[v2:.*]] = vector.transfer_read %[[dummy_memref]]
   %2 = vector.transfer_read %0[%idx], %cst : tensor<?xf32>, vector<5xf32>

@@ -80,14 +76,13 @@ func @use_of_unknown_op_3(%t1: tensor<?xf32> {linalg.inplaceable = true})
 // -----

 // CHECK-LABEL: func @use_of_unknown_op_4(
-// CHECK-SAME: %[[m1:.*]]: memref<?xf32
-func @use_of_unknown_op_4(%t1: tensor<?xf32> {linalg.inplaceable = true})
+// CHECK-SAME: %[[t1:.*]]: tensor<?xf32>
+func @use_of_unknown_op_4(%t1: tensor<?xf32>)
     -> (vector<5xf32>, tensor<?xf32>) {
   %idx = arith.constant 0 : index
   %cst = arith.constant 0.0 : f32

-  // CHECK: %[[m1_tensor:.*]] = bufferization.to_tensor %[[m1]]
-  // CHECK: %[[dummy:.*]] = "test.dummy_op"(%[[m1_tensor]])
+  // CHECK: %[[dummy:.*]] = "test.dummy_op"(%[[t1]])
   %0 = "test.dummy_op"(%t1) : (tensor<?xf32>) -> tensor<?xf32>

   // CHECK: %[[dummy_memref:.*]] = bufferization.to_memref %[[dummy]]
@@ -97,40 +92,39 @@ func @use_of_unknown_op_4(%t1: tensor<?xf32> {linalg.inplaceable = true})
   // CHECK: %[[another_dummy:.*]] = "test.another_dummy_op"(%[[dummy]])
   %2 = "test.another_dummy_op"(%0) : (tensor<?xf32>) -> tensor<?xf32>

-  // CHECK: %[[another_dummy_memref:.*]] = bufferization.to_memref %[[another_dummy]]
-  // CHECK: return %[[v1]], %[[another_dummy_memref]]
+  // CHECK: return %[[v1]], %[[another_dummy]]
   return %1, %2 : vector<5xf32>, tensor<?xf32>
 }

 // -----

 // CHECK-LABEL: func @use_of_bufferizable_op_in_unbufferizable_op
-// CHECK-SAME: %[[m1:.*]]: memref<?xf32
+// CHECK-SAME: %[[t1:.*]]: tensor<?xf32>
 func @use_of_bufferizable_op_in_unbufferizable_op(
     %t1: tensor<?xf32>, %o: index, %s: index) -> (tensor<?xf32>, tensor<?xf32>) {
+  // CHECK: %[[m1:.*]] = bufferization.to_memref %[[t1]]
   // CHECK: %[[subview:.*]] = memref.subview %[[m1]]
   %0 = tensor.extract_slice %t1[%o][%s][1] : tensor<?xf32> to tensor<?xf32>
   // CHECK: %[[subview_tensor:.*]] = bufferization.to_tensor %[[subview]]
   // CHECK: %[[dummy:.*]] = "test.dummy_op"(%[[subview_tensor]])
   %1 = "test.dummy_op"(%0) : (tensor<?xf32>) -> tensor<?xf32>
-  // CHECK: %[[dummy_memref:.*]] = bufferization.to_memref %[[dummy]]
-  // CHECK: return %[[subview]], %[[dummy_memref]]
+  // CHECK: return %[[subview_tensor]], %[[dummy]]
   return %0, %1 : tensor<?xf32>, tensor<?xf32>
 }

 // -----

 // CHECK-LABEL: func @unused_unknown_op(
-// CHECK-SAME: %[[m1:.*]]: memref<?xf32
+// CHECK-SAME: %[[t1:.*]]: tensor<?xf32>
 func @unused_unknown_op(%t1 : tensor<?xf32>) -> vector<5xf32> {
   %idx = arith.constant 0 : index
   %cst = arith.constant 0.0 : f32
-  // ToTensorOp is inserted to pass in the result of the above bufferized op.
-  // CHECK: %[[m1_tensor:.*]] = bufferization.to_tensor %[[m1]]
+  // CHECK: %[[m1:.*]] = bufferization.to_memref %[[t1]]
   // CHECK: vector.transfer_read %[[m1]]
   %1 = vector.transfer_read %t1[%idx], %cst : tensor<?xf32>, vector<5xf32>

-  // CHECK: "test.dummy_op"(%[[m1_tensor]])
+  // CHECK: "test.dummy_op"(%[[t1]])
   "test.dummy_op"(%t1) : (tensor<?xf32>) -> ()

   return %1 : vector<5xf32>
@@ -138,25 +132,60 @@ func @unused_unknown_op(%t1 : tensor<?xf32>) -> vector<5xf32> {

 // -----

+// CHECK: #[[$MAP3:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
+
+// CHECK-LABEL: func @unknown_op_may_read(
+func @unknown_op_may_read(%v: vector<5xf32>)
+    -> (tensor<10xf32>, tensor<10xf32>) {
+  %idx = arith.constant 0 : index
+  %cst = arith.constant 5.0 : f32
+
+  // One alloc for the init_tensor, another one because the transfer_write
+  // bufferizes out-of-place.
+  // CHECK: %[[m1:.*]] = memref.alloc() {{.*}} : memref<10xf32>
+  // CHECK: %[[alloc:.*]] = memref.alloc() {{.*}} : memref<10xf32>
+  // CHECK: %[[alloc_casted:.*]] = memref.cast %[[alloc]] : memref<10xf32> to memref<10xf32, #[[$MAP3]]>
+  // CHECK: %[[m1_casted:.*]] = memref.cast %[[m1]] : memref<10xf32> to memref<10xf32, #[[$MAP3]]>
+  %t1 = linalg.init_tensor [10] : tensor<10xf32>
+
+  // CHECK: linalg.fill(%{{.*}}, %[[m1]])
+  // CHECK: %[[filled_tensor:.*]] = bufferization.to_tensor %[[m1_casted]]
+  %filled = linalg.fill(%cst, %t1) : f32, tensor<10xf32> -> tensor<10xf32>
+
+  // The transfer_write is out-of-place because "dummy_op" may read.
+  // CHECK: memref.copy %[[m1]], %[[alloc]]
+  // CHECK: vector.transfer_write %{{.*}}, %[[alloc]]
+  // CHECK: %[[alloc_tensor:.*]] = bufferization.to_tensor %[[alloc_casted]]
+  %1 = vector.transfer_write %v, %filled[%idx] : vector<5xf32>, tensor<10xf32>
+
+  // CHECK: %[[dummy:.*]] = "test.dummy_op"(%[[filled_tensor]])
+  %2 = "test.dummy_op"(%filled) : (tensor<10xf32>) -> (tensor<10xf32>)
+
+  // CHECK: memref.dealloc %[[alloc]]
+  // CHECK: memref.dealloc %[[m1]]
+  // CHECK: return %[[alloc_tensor]], %[[dummy]]
+  return %1, %2 : tensor<10xf32>, tensor<10xf32>
+}
+
+// -----
+
 // CHECK-LABEL: func @unknown_op_not_writable
-// CHECK-SAME: %[[m1:.*]]: memref<?xf32
+// CHECK-SAME: %[[t1:.*]]: tensor<?xf32>
 func @unknown_op_not_writable(
     %t1 : tensor<?xf32>, %v : vector<5xf32>, %idx : index) -> tensor<?xf32> {
-  // CHECK: %[[m1_tensor:.*]] = bufferization.to_tensor %[[m1]]
-  // CHECK: %[[dummy:.*]] = "test.dummy_op"(%[[m1_tensor]])
+  // CHECK: %[[dummy:.*]] = "test.dummy_op"(%[[t1]])
   // CHECK: %[[dummy_memref:.*]] = bufferization.to_memref %[[dummy]]
   %0 = "test.dummy_op"(%t1) : (tensor<?xf32>) -> (tensor<?xf32>)

   // The result of an unknown op is not writable. Always generate a copy.
-  // Note: This copy is essential for partial bufferization. Otherwise, we could
-  // introducing a RaW conflict.
   // CHECK: %[[dim:.*]] = tensor.dim %[[dummy]]
   // CHECK: %[[alloc:.*]] = memref.alloc(%[[dim]])
   // CHECK: memref.copy %[[dummy_memref]], %[[alloc]]
   // CHECK: vector.transfer_write %{{.*}}, %[[alloc]]
   %1 = vector.transfer_write %v, %0[%idx] : vector<5xf32>, tensor<?xf32>

-  // CHECK: return %[[alloc]]
+  // CHECK: %[[alloc_tensor:.*]] = bufferization.to_tensor %[[alloc]]
+  // CHECK: return %[[alloc_tensor]]
   return %1 : tensor<?xf32>
 }
@@ -1,9 +1,9 @@
-// RUN: mlir-opt %s -test-comprehensive-function-bufferize="allow-return-memref allow-unknown-ops" -split-input-file | FileCheck %s
+// RUN: mlir-opt %s -one-shot-bufferize="allow-return-memref allow-unknown-ops" -split-input-file | FileCheck %s

 // Run fuzzer with different seeds.
-// RUN: mlir-opt %s -test-comprehensive-function-bufferize="allow-return-memref test-analysis-only analysis-fuzzer-seed=23" -split-input-file -o /dev/null
-// RUN: mlir-opt %s -test-comprehensive-function-bufferize="allow-return-memref test-analysis-only analysis-fuzzer-seed=59" -split-input-file -o /dev/null
-// RUN: mlir-opt %s -test-comprehensive-function-bufferize="allow-return-memref test-analysis-only analysis-fuzzer-seed=91" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -one-shot-bufferize="allow-return-memref test-analysis-only analysis-fuzzer-seed=23" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -one-shot-bufferize="allow-return-memref test-analysis-only analysis-fuzzer-seed=59" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -one-shot-bufferize="allow-return-memref test-analysis-only analysis-fuzzer-seed=91" -split-input-file -o /dev/null

 // CHECK-LABEL: func @use_tensor_func_arg(
 // CHECK-SAME: %[[A:.*]]: tensor<?xf32>
@@ -68,31 +68,4 @@ func @empty_func() -> () {
   return
 }

-// -----
-
-// CHECK-LABEL: func @rank_reducing
-func @rank_reducing(
-    %i: index, %j: index,
-    %arg0: tensor<8x18x32xf32>)
-  -> tensor<?x1x6x8xf32> {
-  %c1 = arith.constant 1 : index
-  %c6 = arith.constant 6 : index
-  %c8 = arith.constant 8 : index
-  %c32 = arith.constant 32 : index
-  %c0 = arith.constant 0 : index
-  %0 = linalg.init_tensor [4, 1, 6, 8] : tensor<4x1x6x8xf32>
-  %1 = tensor.cast %0 : tensor<4x1x6x8xf32> to tensor<?x1x6x8xf32>
-  %2 = linalg.init_tensor [1, 6, 8] : tensor<1x6x8xf32>
-  %5 = scf.for %arg7 = %c0 to %c32 step %c8 iter_args(%arg8 = %1) -> (tensor<?x1x6x8xf32>) {
-    %7 = affine.apply affine_map<(d0) -> (d0 ceildiv 8)>(%arg7)
-    %8 = tensor.extract_slice %arg0[%i, %j, %arg7] [1, 6, 8] [1, 1, 1] : tensor<8x18x32xf32> to tensor<1x6x8xf32>
-    %9 = scf.for %arg9 = %c0 to %c6 step %c1 iter_args(%arg10 = %2) -> (tensor<1x6x8xf32>) {
-      %11 = tensor.extract_slice %8[0, %arg9, 0] [1, 1, 8] [1, 1, 1] : tensor<1x6x8xf32> to tensor<1x1x8xf32>
-      %12 = tensor.insert_slice %11 into %arg10[0, %arg9, 0] [1, 1, 8] [1, 1, 1] : tensor<1x1x8xf32> into tensor<1x6x8xf32>
-      scf.yield %12 : tensor<1x6x8xf32>
-    }
-    %10 = tensor.insert_slice %9 into %arg8[%7, 0, 0, 0] [1, 1, 6, 8] [1, 1, 1, 1] : tensor<1x6x8xf32> into tensor<?x1x6x8xf32>
-    scf.yield %10 : tensor<?x1x6x8xf32>
-  }
-  return %5: tensor<?x1x6x8xf32>
-}
@@ -1355,3 +1355,35 @@ func @write_after_select_read_one(
   // CHECK: return %[[f]], %[[select]]
   return %f, %w : f32, tensor<?xf32>
 }
+
+// -----
+
+// A regression test to make sure that we handle rank-reducing extract_slice
+// correctly.
+
+// CHECK-LABEL: func @rank_reducing
+func @rank_reducing(
+    %i: index, %j: index,
+    %arg0: tensor<8x18x32xf32>)
+  -> tensor<?x1x6x8xf32> {
+  %c1 = arith.constant 1 : index
+  %c6 = arith.constant 6 : index
+  %c8 = arith.constant 8 : index
+  %c32 = arith.constant 32 : index
+  %c0 = arith.constant 0 : index
+  %0 = linalg.init_tensor [4, 1, 6, 8] : tensor<4x1x6x8xf32>
+  %1 = tensor.cast %0 : tensor<4x1x6x8xf32> to tensor<?x1x6x8xf32>
+  %2 = linalg.init_tensor [1, 6, 8] : tensor<1x6x8xf32>
+  %5 = scf.for %arg7 = %c0 to %c32 step %c8 iter_args(%arg8 = %1) -> (tensor<?x1x6x8xf32>) {
+    %7 = affine.apply affine_map<(d0) -> (d0 ceildiv 8)>(%arg7)
+    %8 = tensor.extract_slice %arg0[%i, %j, %arg7] [1, 6, 8] [1, 1, 1] : tensor<8x18x32xf32> to tensor<1x6x8xf32>
+    %9 = scf.for %arg9 = %c0 to %c6 step %c1 iter_args(%arg10 = %2) -> (tensor<1x6x8xf32>) {
+      %11 = tensor.extract_slice %8[0, %arg9, 0] [1, 1, 8] [1, 1, 1] : tensor<1x6x8xf32> to tensor<1x1x8xf32>
+      %12 = tensor.insert_slice %11 into %arg10[0, %arg9, 0] [1, 1, 8] [1, 1, 1] : tensor<1x1x8xf32> into tensor<1x6x8xf32>
+      scf.yield %12 : tensor<1x6x8xf32>
+    }
+    %10 = tensor.insert_slice %9 into %arg8[%7, 0, 0, 0] [1, 1, 6, 8] [1, 1, 1, 1] : tensor<1x6x8xf32> into tensor<?x1x6x8xf32>
+    scf.yield %10 : tensor<?x1x6x8xf32>
+  }
+  return %5: tensor<?x1x6x8xf32>
+}
@@ -1,6 +1,5 @@
 # Exclude tests from libMLIR.so
 add_mlir_library(MLIRLinalgTestPasses
-  TestComprehensiveBufferize.cpp
   TestLinalgCodegenStrategy.cpp
   TestLinalgDistribution.cpp
   TestLinalgElementwiseFusion.cpp
@@ -1,138 +0,0 @@
-//===- TestComprehensiveBufferize.cpp - Test Comprehensive Bufferize ------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements logic for testing Comprehensive Bufferize.
-//
-//===----------------------------------------------------------------------===//
-
-#include "mlir/Dialect/Affine/IR/AffineOps.h"
-#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
-#include "mlir/Dialect/Arithmetic/Transforms/BufferizableOpInterfaceImpl.h"
-#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
-#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
-#include "mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h"
-#include "mlir/Dialect/Linalg/ComprehensiveBufferize/AffineInterfaceImpl.h"
-#include "mlir/Dialect/Linalg/IR/Linalg.h"
-#include "mlir/Dialect/Linalg/Passes.h"
-#include "mlir/Dialect/Linalg/Transforms/BufferizableOpInterfaceImpl.h"
-#include "mlir/Dialect/SCF/BufferizableOpInterfaceImpl.h"
-#include "mlir/Dialect/StandardOps/IR/Ops.h"
-#include "mlir/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.h"
-#include "mlir/Dialect/Vector/IR/VectorOps.h"
-#include "mlir/Dialect/Vector/Transforms/BufferizableOpInterfaceImpl.h"
-#include "mlir/Pass/PassManager.h"
-#include "mlir/Transforms/Passes.h"
-
-using namespace mlir;
-using namespace mlir::linalg;
-using namespace mlir::linalg::comprehensive_bufferize;
-using namespace mlir::bufferization;
-
-namespace {
-/// A helper struct for FunctionBufferize and ModuleBufferize. Both passes are
-/// mostly identical.
-struct TestComprehensiveFunctionBufferize
-    : public PassWrapper<TestComprehensiveFunctionBufferize,
-                         OperationPass<FuncOp>> {
-  StringRef getArgument() const final {
-    return "test-comprehensive-function-bufferize";
-  }
-
-  StringRef getDescription() const final {
-    return "Test Comprehensive Bufferize of FuncOps (body only).";
-  }
-
-  TestComprehensiveFunctionBufferize() = default;
-  TestComprehensiveFunctionBufferize(
-      const TestComprehensiveFunctionBufferize &pass)
-      : PassWrapper(pass) {}
-
-  void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<bufferization::BufferizationDialect, linalg::LinalgDialect,
-                    memref::MemRefDialect, tensor::TensorDialect,
-                    vector::VectorDialect, scf::SCFDialect, StandardOpsDialect,
-                    arith::ArithmeticDialect, AffineDialect>();
-    affine_ext::registerBufferizableOpInterfaceExternalModels(registry);
-    arith::registerBufferizableOpInterfaceExternalModels(registry);
-    linalg::registerBufferizableOpInterfaceExternalModels(registry);
-    scf::registerBufferizableOpInterfaceExternalModels(registry);
-    tensor::registerBufferizableOpInterfaceExternalModels(registry);
-    vector::registerBufferizableOpInterfaceExternalModels(registry);
-  }
-
-  void runOnOperation() override;
-
-  Option<bool> allowReturnMemref{
-      *this, "allow-return-memref",
-      llvm::cl::desc("Allow returning/yielding memrefs from functions/blocks"),
-      llvm::cl::init(false)};
-  Option<bool> allowUnknownOps{
-      *this, "allow-unknown-ops",
-      llvm::cl::desc(
-          "Allows the return of memrefs (for testing purposes only)"),
-      llvm::cl::init(false)};
-  Option<bool> testAnalysisOnly{
-      *this, "test-analysis-only",
-      llvm::cl::desc(
-          "Only runs inplaceability analysis (for testing purposes only)"),
-      llvm::cl::init(false)};
-  Option<unsigned> analysisFuzzerSeed{
-      *this, "analysis-fuzzer-seed",
-      llvm::cl::desc("Analyze ops in random order with a given seed (fuzzer)"),
-      llvm::cl::init(0)};
-  ListOption<std::string> dialectFilter{
-      *this, "dialect-filter",
-      llvm::cl::desc("Bufferize only ops from the specified dialects"),
-      llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated};
-  Option<bool> fullyDynamicLayoutMaps{
-      *this, "fully-dynamic-layout-maps",
-      llvm::cl::desc("Use fully dynamic layout maps on memref types"),
-      llvm::cl::init(true)};
-  Option<bool> createDeallocs{
-      *this, "create-deallocs",
-      llvm::cl::desc("Specify if buffers should be deallocated"),
-      llvm::cl::init(true)};
-};
-} // namespace
-
-void TestComprehensiveFunctionBufferize::runOnOperation() {
-  auto options = std::make_unique<AnalysisBufferizationOptions>();
-  options->allowReturnMemref = allowReturnMemref;
-  options->allowUnknownOps = allowUnknownOps;
-  options->testAnalysisOnly = testAnalysisOnly;
-  options->analysisFuzzerSeed = analysisFuzzerSeed;
-  options->fullyDynamicLayoutMaps = fullyDynamicLayoutMaps;
-  options->createDeallocs = createDeallocs;
-
-  if (dialectFilter.hasValue()) {
-    options->hasFilter = true;
-    for (const std::string &dialectNamespace : dialectFilter)
-      options->allowDialectInFilter(dialectNamespace);
-  }
-
-  Operation *op = getOperation();
-  if (failed(runOneShotBufferize(op, std::move(options))))
-    return;
-
-  if (testAnalysisOnly)
-    return;
-
-  OpPassManager cleanupPipeline("builtin.func");
-  cleanupPipeline.addPass(createCanonicalizerPass());
-  cleanupPipeline.addPass(createCSEPass());
-  cleanupPipeline.addPass(createLoopInvariantCodeMotionPass());
-  (void)this->runPipeline(cleanupPipeline, op);
-}
-
-namespace mlir {
-namespace test {
-void registerTestComprehensiveFunctionBufferize() {
-  PassRegistration<TestComprehensiveFunctionBufferize>();
-}
-} // namespace test
-} // namespace mlir
@@ -64,7 +64,6 @@ void registerTestAffineLoopParametricTilingPass();
 void registerTestAliasAnalysisPass();
 void registerTestBuiltinAttributeInterfaces();
 void registerTestCallGraphPass();
-void registerTestComprehensiveFunctionBufferize();
 void registerTestConstantFold();
 void registerTestGpuSerializeToCubinPass();
 void registerTestGpuSerializeToHsacoPass();
@@ -159,7 +158,6 @@ void registerTestPasses() {
 #if MLIR_ROCM_CONVERSIONS_ENABLED
   mlir::test::registerTestGpuSerializeToHsacoPass();
 #endif
-  mlir::test::registerTestComprehensiveFunctionBufferize();
  mlir::test::registerTestDecomposeCallGraphTypes();
  mlir::test::registerTestDataLayoutQuery();
  mlir::test::registerTestDominancePass();