From a77524cd2c20f4d0aa4ddd164f12f7af64b5bfc8 Mon Sep 17 00:00:00 2001
From: Nicolas Vasilache <nicolas.vasilache@gmail.com>
Date: Tue, 29 Jun 2021 15:39:14 +0000
Subject: [PATCH] [mlir][Linalg] Add a ComprehensiveModuleBufferizePass and
 support for CallOp analysis(9/n)

This revision adds the minimal plumbing to create a simple ComprehensiveModuleBufferizePass that can behave conservatively in the presence of CallOps.

A topological sort of caller/callee is performed and, if the call-graph is cycle-free, analysis can proceed.

Differential revision: https://reviews.llvm.org/D104859
---
 mlir/include/mlir/Dialect/Linalg/Passes.h     |   8 +
 mlir/include/mlir/Dialect/Linalg/Passes.td    |  21 +-
 mlir/include/mlir/IR/FunctionSupport.h        |   4 +
 .../Transforms/ComprehensiveBufferize.cpp     | 242 +++++++++++++++---
 ...mprehensive-module-bufferize-analysis.mlir |  84 ++++++
 ...omprehensive-module-bufferize-invalid.mlir |  15 ++
 6 files changed, 336 insertions(+), 38 deletions(-)
 create mode 100644 mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir
 create mode 100644 mlir/test/Dialect/Linalg/comprehensive-module-bufferize-invalid.mlir
diff --git a/mlir/include/mlir/Dialect/Linalg/Passes.h b/mlir/include/mlir/Dialect/Linalg/Passes.h
index b3433e5c58a5..d80eb9a0652d 100644
--- a/mlir/include/mlir/Dialect/Linalg/Passes.h
+++ b/mlir/include/mlir/Dialect/Linalg/Passes.h
@@ -62,6 +62,14 @@ std::unique_ptr<OperationPass<FuncOp>> createConvertLinalgToAffineLoopsPass();
 ///   b) whose buffer uses would be free of memory hazards.
 std::unique_ptr<Pass> createLinalgComprehensiveFuncBufferizePass();
 
+/// This pass implements a cross-dialect bufferization approach and performs an
+/// analysis to determine which op operands and results may be bufferized in the
+/// same buffers. The analysis is performed on topologically sorted CallOp and
+/// FuncOp within a module. It provides analyses and bufferization across
+/// function boundaries. Within a single function body, the bufferization used
+/// is that provided by `LinalgComprehensiveFuncBufferizePass`.
+std::unique_ptr<Pass> createLinalgComprehensiveModuleBufferizePass();
+
 /// Create a pass to convert Linalg operations which work on tensors to use
 /// buffers instead.
 std::unique_ptr<OperationPass<FuncOp>> createLinalgBufferizePass();
diff --git a/mlir/include/mlir/Dialect/Linalg/Passes.td b/mlir/include/mlir/Dialect/Linalg/Passes.td
index d36d655638a2..3d9833061a09 100644
--- a/mlir/include/mlir/Dialect/Linalg/Passes.td
+++ b/mlir/include/mlir/Dialect/Linalg/Passes.td
@@ -32,7 +32,7 @@ def LinalgComprehensiveFuncBufferize :
     This pass implements a cross-dialect bufferization approach and performs an
     analysis to determine which op operands and results may be bufferized in the
     same buffers. The analysis is performed on SSA use-def chains starting from
-    function operands that are annotated with the 'inplaceable' attribute
+    function operands that are annotated with the 'inplaceable' attribute.
   }];
   let options = [
     Option<"testAnalysisOnly", "test-analysis-only", "bool",
@@ -42,6 +42,25 @@ def LinalgComprehensiveFuncBufferize :
   let constructor = "mlir::createLinalgComprehensiveFuncBufferizePass()";
 }
 
+def LinalgComprehensiveModuleBufferize :
+    Pass<"linalg-comprehensive-module-bufferize", "ModuleOp"> {
+  let summary = "Bufferize (tensor into memref) for a Module.";
+  let description = [{
+    This pass implements a cross-dialect bufferization approach and performs an
+    analysis to determine which op operands and results may be bufferized in the
+    same buffers. The analysis is performed on topologically sorted CallOp and
+    FuncOp within a module. It provides analyses and bufferization across
+    function boundaries. Within a single function body, the bufferization used
+    is that provided by `-linalg-comprehensive-func-bufferize`.
+  }];
+  let options = [
+    Option<"testAnalysisOnly", "test-analysis-only", "bool",
+            /*default=*/"false",
+           "Only runs inplaceability analysis (for testing purposes only)">
+  ];
+  let constructor = "mlir::createLinalgComprehensiveModuleBufferizePass()";
+}
+
 def LinalgFoldUnitExtentDims : FunctionPass<"linalg-fold-unit-extent-dims"> {
   let summary = "Remove unit-extent dimension in Linalg ops on tensors";
   let constructor = "mlir::createLinalgFoldUnitExtentDimsPass()";
diff --git a/mlir/include/mlir/IR/FunctionSupport.h b/mlir/include/mlir/IR/FunctionSupport.h
index c081fb24bf26..f27c857a175a 100644
--- a/mlir/include/mlir/IR/FunctionSupport.h
+++ b/mlir/include/mlir/IR/FunctionSupport.h
@@ -375,6 +375,10 @@ public:
   /// attribute that was erased, or nullptr if there was no attribute with such
   /// name.
   Attribute removeArgAttr(unsigned index, Identifier name);
+  Attribute removeArgAttr(unsigned index, StringRef name) {
+    return removeArgAttr(
+        index, Identifier::get(name, this->getOperation()->getContext()));
+  }
 
   //===--------------------------------------------------------------------===//
   // Result Attributes
diff --git a/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferize.cpp b/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferize.cpp
index 996e90662a45..dd49ae43d8fe 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferize.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferize.cpp
@@ -16,7 +16,7 @@
 // Composability with extensible set of ops is not a first-class concern.
 //
 // Bufferization occurs by:
-//  a. performing an inPlace analysis `inPlaceAnalysisFuncOpInternals`
+//  a. performing an inPlace analysis `inPlaceAnalysisFuncOpBody`
 //     which marks each operation within the function with the
 //     `kInPlaceResultsAttrName` attribute.
 //  b. traversing each operation in the function and rewriting it in
@@ -132,6 +132,19 @@ using namespace tensor;
 #define DBGS() (llvm::dbgs() << '[' << DEBUG_TYPE << "] ")
 #define LDBG(X) LLVM_DEBUG(DBGS() << X)
 
+//===----------------------------------------------------------------------===//
+// Generic helpers.
+//===----------------------------------------------------------------------===//
+
+/// Return the FuncOp called by `callOp`.
+static FuncOp getCalledFunction(CallOpInterface callOp) {
+  SymbolRefAttr sym = callOp.getCallableForCallee().dyn_cast<SymbolRefAttr>();
+  if (!sym)
+    return nullptr;
+  return dyn_cast_or_null<FuncOp>(
+      SymbolTable::lookupNearestSymbolFrom(callOp, sym));
+}
+
 //===----------------------------------------------------------------------===//
 // Bufferization-specific BlockAndValueMapping support with debugging.
 //===----------------------------------------------------------------------===//
@@ -167,6 +180,7 @@ static Value lookup(BlockAndValueMapping &bvm, Value key) {
     parentOp = key.getDefiningOp()->getParentOfType<FuncOp>();
   }
   LDBG("In func:\n" << *parentOp << "NO VALUE FOR KEY: " << key << '\n');
+  (void)parentOp;
   return Value();
 }
 
@@ -276,6 +290,25 @@ static InPlaceSpec getInPlace(BlockArgument bbArg) {
   return InPlaceSpec::None;
 }
 
+/// Set the attribute that triggers inplace bufferization on a FuncOp argument
+/// `bbArg`.
+static void
+setInPlaceFuncArgument(BlockArgument bbArg,
+                       InPlaceSpec inPlaceSpec = InPlaceSpec::True) {
+  auto funcOp = cast<FuncOp>(bbArg.getOwner()->getParentOp());
+  funcOp.setArgAttr(
+      bbArg.getArgNumber(), LinalgDialect::kInplaceableAttrName,
+      BoolAttr::get(bbArg.getContext(), inPlaceSpec == InPlaceSpec::True));
+}
+
+/// Remove the attribute that triggers inplace bufferization on a FuncOp
+/// argument `bbArg`.
+static void removeInPlaceFuncArgument(BlockArgument bbArg) {
+  auto funcOp = cast<FuncOp>(bbArg.getOwner()->getParentOp());
+  funcOp.removeArgAttr(bbArg.getArgNumber(),
+                       LinalgDialect::kInplaceableAttrName);
+}
+
 LLVM_ATTRIBUTE_UNUSED static InPlaceSpec getInPlace(Value v) {
   if (auto bbArg = v.dyn_cast<BlockArgument>())
     return getInPlace(bbArg);
@@ -305,7 +338,8 @@ LLVM_ATTRIBUTE_UNUSED static InPlaceSpec getInPlace(Value v) {
 static bool hasKnownBufferizationAliasingBehavior(Operation *op) {
   return
       // clang-format off
-      isa<scf::ForOp,
+      isa<CallOpInterface,
+          scf::ForOp,
           LinalgOp,
           ReturnOp,
           ExtractSliceOp,
@@ -386,6 +420,10 @@ static OpResult getInplaceableOpResult(OpOperand &opOperand) {
         // ExtractSliceOp is special, when bufferized inplace it just returns an
         // alias to its operand. Its result is never inplaceable on its operand.
         .Case([&](ExtractSliceOp op) { return OpResult(); })
+        // CallOpInterface is special, it needs to wait for the callee to be
+        // bufferized and needs to inspect the BufferAliasInfo object. It can't
+        // make a proper determination by itself and needs to be conservative.
+        .Case([&](CallOpInterface op) { return OpResult(); })
         // Other ops.
         .Default([&](Operation *op) { return OpResult(); });
   // clang-format on
@@ -458,6 +496,12 @@ static bool bufferizesToMemoryRead(OpOperand &opOperand) {
   // matching bbArg may.
   if (isa<scf::ForOp>(opOperand.getOwner()))
     return false;
+  // CallOpInterface alone doesn't bufferize to a memory read, one of the uses
+  // of the matching bbArg may. It is the responsibility of the caller to
+  // inspect bbArgs. In the absence of a BufferizationAliasInfo, we need to be
+  // conservative.
+  if (auto callOp = dyn_cast<CallOpInterface>(opOperand.getOwner()))
+    return true;
   if (auto linalgOp = dyn_cast<LinalgOp>(opOperand.getOwner()))
     return linalgOp.isInputTensor(&opOperand) ||
            linalgOp.isInitTensor(&opOperand);
@@ -473,6 +517,19 @@ static bool bufferizesToMemoryRead(OpOperand &opOperand) {
 static bool
 bufferizesToMemoryWrite(OpOperand &opOperand,
                         InPlaceSpec inPlaceSpec = InPlaceSpec::None) {
+  // These terminators are not writes.
+  if (isa<ReturnOp, linalg::YieldOp, scf::YieldOp>(opOperand.getOwner()))
+    return false;
+  // ExtractSliceOp alone doesn't bufferize to a memory write, one of its uses
+  // may.
+  if (isa<ExtractSliceOp>(opOperand.getOwner()))
+    return false;
+  // CallOpInterface alone doesn't bufferize to a memory write, one of the uses
+  // of the matching bbArg may. It is the responsibility of the caller to
+  // inspect bbArgs. In the absence of a BufferizationAliasInfo, we need to be
+  // conservative.
+  if (auto callOp = dyn_cast<CallOpInterface>(opOperand.getOwner()))
+    return true;
   Optional<OpResult> maybeOpResult = getAliasingOpResult(opOperand);
   // Unknown op that returns a tensor. The inplace analysis does not support
   // it. Conservatively return true.
@@ -482,13 +539,6 @@ bufferizesToMemoryWrite(OpOperand &opOperand,
   // This does not bufferize to a write.
   if (!*maybeOpResult)
     return false;
-  // These terminators are not writes.
-  if (isa<ReturnOp, linalg::YieldOp, scf::YieldOp>(opOperand.getOwner()))
-    return false;
-  // ExtractSliceOp alone doesn't bufferize to a memory write, one of its uses
-  // may.
-  if (maybeOpResult->getDefiningOp<ExtractSliceOp>())
-    return false;
   // If we have a matching OpResult, this is a write.
   // Additionally allow to restrict to only inPlace write, if so specified.
   return inPlaceSpec == InPlaceSpec::None ||
@@ -521,7 +571,11 @@ public:
     Equivalent
   };
 
-  explicit BufferizationAliasInfo(FuncOp funcOp);
+  explicit BufferizationAliasInfo(Operation *rootOp);
+
+  /// Add a new entry for `v` in the `aliasInfo` and `equivalentInfo`. In the
+  /// beginning the alias and equivalence sets only contain `v` itself.
+  void createAliasInfoEntry(Value v);
 
   /// Return true if the buffer to which `operand` would bufferize aliases a
   /// buffer that is known to not be writeable. This implies that the matching
@@ -664,33 +718,28 @@ private:
 };
 } // namespace
 
-BufferizationAliasInfo::BufferizationAliasInfo(FuncOp funcOp) {
-  funcOp.walk([&](Operation *op) {
-    for (Value v : op->getResults()) {
-      if (!v.getType().isa<TensorType>())
-        continue;
-      assert(getInPlace(v) == InPlaceSpec::None &&
-             "unexpected inplace in analysis.");
-      DenseSet<Value> selfSet;
-      selfSet.insert(v);
-      aliasInfo.try_emplace(v, selfSet);
-      equivalentInfo.insert(v);
-    }
-    for (Region &r : op->getRegions()) {
-      for (Block &b : r.getBlocks()) {
-        for (auto bbArg : b.getArguments()) {
-          if (!bbArg.getType().isa<TensorType>())
-            continue;
-          DenseSet<Value> selfSet;
-          selfSet.insert(bbArg);
-          aliasInfo.try_emplace(bbArg, selfSet);
-          equivalentInfo.insert(bbArg);
-        }
-      }
-    }
+BufferizationAliasInfo::BufferizationAliasInfo(Operation *rootOp) {
+  rootOp->walk([&](Operation *op) {
+    for (Value v : op->getResults())
+      if (v.getType().isa<TensorType>())
+        createAliasInfoEntry(v);
+    for (Region &r : op->getRegions())
+      for (Block &b : r.getBlocks())
+        for (auto bbArg : b.getArguments())
+          if (bbArg.getType().isa<TensorType>())
+            createAliasInfoEntry(bbArg);
   });
 }
 
+/// Add a new entry for `v` in the `aliasInfo` and `equivalentInfo`. In the
+/// beginning the alias and equivalence sets only contain `v` itself.
+void BufferizationAliasInfo::createAliasInfoEntry(Value v) {
+  DenseSet<Value> selfSet;
+  selfSet.insert(v);
+  aliasInfo.try_emplace(v, selfSet);
+  equivalentInfo.insert(v);
+}
+
 /// Return true if the buffer to which `operand` would bufferize aliases a
 /// buffer that is known to not be writeable. This implies that the matching
 /// OpResult cannot be bufferized inplace.
@@ -1679,8 +1728,8 @@ bufferizationSanityCheck(scf::YieldOp yieldOp,
 
 /// Analyze the `funcOp` body to determine which OpResults are inplaceable.
 static LogicalResult
-inPlaceAnalysisFuncOpInternals(FuncOp funcOp, BufferizationAliasInfo &aliasInfo,
-                               const DominanceInfo &domInfo) {
+inPlaceAnalysisFuncOpBody(FuncOp funcOp, BufferizationAliasInfo &aliasInfo,
+                          const DominanceInfo &domInfo) {
   LLVM_DEBUG(llvm::dbgs() << "\n\n");
   LDBG("Begin InPlaceAnalysisFuncOpInternals:\n" << funcOp << '\n');
   assert(funcOp && funcOp->getNumRegions() > 0 && !funcOp.body().empty() &&
@@ -1816,7 +1865,7 @@ void LinalgComprehensiveFuncBufferize::runOnFunction() {
   BufferizationAliasInfo aliasInfo(funcOp);
   // If the analysis fails, just return. This is expected to reset the IR and no
   // single OpResult should be marked inPlace.
-  if (failed(inPlaceAnalysisFuncOpInternals(funcOp, aliasInfo, domInfo))) {
+  if (failed(inPlaceAnalysisFuncOpBody(funcOp, aliasInfo, domInfo))) {
     signalPassFailure();
     return;
   }
@@ -1836,3 +1885,122 @@ void LinalgComprehensiveFuncBufferize::runOnFunction() {
 std::unique_ptr<Pass> mlir::createLinalgComprehensiveFuncBufferizePass() {
   return std::make_unique<LinalgComprehensiveFuncBufferize>();
 }
+
+//===----------------------------------------------------------------------===//
+// Bufferization entry-point for modules.
+//===----------------------------------------------------------------------===//
+
+/// Store all functions of the `moduleOp` in `orderedFuncOps`, sorted by
+/// callee-caller order (i.e. callees without callers first).
+/// Store the map of FuncOp to all its callers in `callerMap`.
+/// Return `failure()` if a cycle of calls is detected or if we are unable to
+/// retrieve the called FuncOp from any CallOpInterface.
+static LogicalResult
+getFuncOpsOrderedByCalls(ModuleOp moduleOp,
+                         SmallVectorImpl<FuncOp> &orderedFuncOps,
+                         DenseMap<FuncOp, DenseSet<Operation *>> &callerMap) {
+  // For each FuncOp, the set of functions called by it (i.e. the union of
+  // symbols of all nested CallOpInterfaceOp).
+  DenseMap<FuncOp, DenseSet<FuncOp>> calledBy;
+  // For each FuncOp, the number of CallOpInterface it contains.
+  DenseMap<FuncOp, unsigned> numberCallOpsContainedInFuncOp;
+  WalkResult res = moduleOp.walk([&](FuncOp funcOp) {
+    numberCallOpsContainedInFuncOp[funcOp] = 0;
+    return funcOp.walk([&](CallOpInterface callOp) {
+      FuncOp calledFunction = getCalledFunction(callOp);
+      if (!calledFunction)
+        return WalkResult::interrupt();
+      auto it = callerMap.try_emplace(calledFunction, DenseSet<Operation *>{});
+      it.first->getSecond().insert(callOp);
+      if (calledBy[calledFunction].count(funcOp) == 0) {
+        calledBy[calledFunction].insert(funcOp);
+        numberCallOpsContainedInFuncOp[funcOp]++;
+      }
+      return WalkResult::advance();
+    });
+  });
+  if (res.wasInterrupted())
+    return failure();
+  // Iteratively remove function operation that do not call any of the
+  // functions remaining in the callCounter map and add them to the worklist.
+  while (!numberCallOpsContainedInFuncOp.empty()) {
+    auto it = llvm::find_if(numberCallOpsContainedInFuncOp,
+                            [](auto entry) { return entry.getSecond() == 0; });
+    if (it == numberCallOpsContainedInFuncOp.end())
+      return moduleOp.emitOpError(
+          "expected callgraph to be free of circular dependencies.");
+    orderedFuncOps.push_back(it->getFirst());
+    for (auto callee : calledBy[it->getFirst()])
+      numberCallOpsContainedInFuncOp[callee]--;
+    numberCallOpsContainedInFuncOp.erase(it);
+  }
+  return success();
+}
+
+namespace {
+struct LinalgComprehensiveModuleBufferize
+    : public LinalgComprehensiveModuleBufferizeBase<
+          LinalgComprehensiveModuleBufferize> {
+
+  void runOnOperation() override;
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<linalg::LinalgDialect, memref::MemRefDialect>();
+  }
+};
+} // end namespace
+
+void LinalgComprehensiveModuleBufferize::runOnOperation() {
+  ModuleOp moduleOp = getOperation();
+
+  SmallVector<FuncOp> orderedFuncOps;
+  DenseMap<FuncOp, DenseSet<Operation *>> callerMap;
+  if (failed(getFuncOpsOrderedByCalls(moduleOp, orderedFuncOps, callerMap)))
+    return signalPassFailure();
+
+  DominanceInfo domInfo(moduleOp);
+  BufferizationAliasInfo aliasInfo(moduleOp);
+  // Interestingly, all function args that are not visible outside of a module
+  // can be fully bufferized inplace by guaranteeing the CallOp is bufferized
+  // inplace. Therefore, we just bufferize funcOp as if none of its results were
+  // inplaceable, detect which operands are cloned internally and decide what to
+  // do at call sites.
+  for (FuncOp funcOp : orderedFuncOps) {
+    // No body => no analysis.
+    if (funcOp.body().empty())
+      continue;
+
+    // In a first approximation:
+    // =========================
+    // If the function is called, we can allocate on the caller side which lets
+    // us force inplace arguments at function boundaries.
+    // TODO: do not rely on this behavior.
+    if (callerMap.find(funcOp) != callerMap.end())
+      for (BlockArgument bbArg : funcOp.getArguments())
+        if (bbArg.getType().isa<TensorType>())
+          setInPlaceFuncArgument(bbArg);
+
+    // If the analysis fails, just return.
+    if (failed(inPlaceAnalysisFuncOpBody(funcOp, aliasInfo, domInfo))) {
+      signalPassFailure();
+      return;
+    }
+
+    // TODO: Bufferization phase.
+  }
+  // Don't drop the attributes if we only want to report the analysis.
+  if (testAnalysisOnly)
+    return;
+
+  // Post-pass cleanup of inplaceable attributes.
+  moduleOp.walk(
+      [&](Operation *op) { op->removeAttr(kInPlaceResultsAttrName); });
+  moduleOp.walk([&](FuncOp op) {
+    for (BlockArgument bbArg : op.getArguments())
+      removeInPlaceFuncArgument(bbArg);
+  });
+}
+
+std::unique_ptr<Pass> mlir::createLinalgComprehensiveModuleBufferizePass() {
+  return std::make_unique<LinalgComprehensiveModuleBufferize>();
+}
diff --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir
new file mode 100644
index 000000000000..108119467ea6
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir
@@ -0,0 +1,84 @@
+// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize=test-analysis-only -split-input-file | FileCheck %s
+
+func private @foo(tensor<64xf32>)
+
+// CHECK-LABEL: dependence_through_call
+func @dependence_through_call(%I : tensor<64xf32> {linalg.inplaceable = true}) {
+  %f1 = constant 1.000000e+00 : f32
+  %f2 = constant 2.000000e+00 : f32
+
+  // 2. %B already bufferizes inplace, %A would alias and have a different
+  // value. The calls to `foo` are determined to read conservatively, so %A
+  // cannot bufferize inplace.
+  //     CHECK: fill
+  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
+  %A = linalg.fill(%f1, %I) : f32, tensor<64xf32> -> tensor<64xf32>
+
+  // 1. Bufferizes inplace: no alias to %A is yet possible.
+  //     CHECK: fill
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  %B = linalg.fill(%f2, %I) : f32, tensor<64xf32> -> tensor<64xf32>
+
+  call @foo(%A) : (tensor<64xf32>) -> ()
+  call @foo(%B) : (tensor<64xf32>) -> ()
+
+  return
+}
+
+// -----
+
+func private @foo(tensor<64xf32>)
+
+func private @bar(%A : tensor<64xf32>) {
+  call @foo(%A) : (tensor<64xf32>) -> ()
+  return
+}
+
+func @read_dependence_through_scf_and_call(
+    %I : tensor<64xf32> {linalg.inplaceable = true},
+    %I2 : tensor<64xf32> {linalg.inplaceable = true}) {
+  %c0 = constant 0 : index
+  %c1 = constant 1 : index
+  %c10 = constant 10 : index
+  %f1 = constant 1.000000e+00 : f32
+  %f2 = constant 2.000000e+00 : f32
+
+  // 5. %B bufferizes inplace, %A would alias and have a different value.
+  // The calls to `foo` are determined to read conservatively, so %A cannot
+  // bufferize inplace.
+  //     CHECK: fill
+  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
+  %A = linalg.fill(%f1, %I) : f32, tensor<64xf32> -> tensor<64xf32>
+
+  // 4. Bufferizes inplace: no alias to %A is yet possible.
+  //     CHECK: fill
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  %B = linalg.fill(%f2, %I) : f32, tensor<64xf32> -> tensor<64xf32>
+
+  // 3. Does not read or write, bufferizes inplace.
+  //     CHECK: scf.for
+  //     CHECK: {__inplace_results_attr__ = ["true", "true"]}
+  %r:2 = scf.for %i = %c0 to %c10 step %c1 iter_args(%0 = %A, %1 = %B)
+    -> (tensor<64xf32>, tensor<64xf32>)
+  {
+    scf.yield %0, %1 : tensor<64xf32>, tensor<64xf32>
+  }
+  call @foo(%r#0) : (tensor<64xf32>) -> ()
+  call @foo(%r#1) : (tensor<64xf32>) -> ()
+
+  // 2. %B2 already bufferizes inplace, %A2 would alias and have a different
+  // value. The calls to `foo` are determined to read conservatively, so %A2
+  // cannot bufferize inplace.
+  //     CHECK: fill
+  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
+  %A2 = linalg.fill(%f1, %I2) : f32, tensor<64xf32> -> tensor<64xf32>
+
+  // 1. Bufferizes inplace: no alias to %A2 is yet possible.
+  //     CHECK: fill
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  %B2 = linalg.fill(%f2, %I2) : f32, tensor<64xf32> -> tensor<64xf32>
+
+  call @bar(%A2) : (tensor<64xf32>) -> ()
+  call @bar(%B2) : (tensor<64xf32>) -> ()
+  return
+}
diff --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-invalid.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-invalid.mlir
new file mode 100644
index 000000000000..0e378a89ef58
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-invalid.mlir
@@ -0,0 +1,15 @@
+// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize -split-input-file -verify-diagnostics
+
+// -----
+
+// expected-error @-3 {{expected callgraph to be free of circular dependencies}}
+
+func @foo() {
+  call @bar() : () -> ()
+  return
+}
+
+func @bar() {
+  call @foo() : () -> ()
+  return
+}