[mlir][scf] Simplify affine.min ops after loop peeling
Simplify affine.min ops, enabling various other canonicalizations inside the peeled loop body. affine.min ops such as:

```
#map = affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>
%r = affine.min #map(%iv)[%step, %ub]
```

are rewritten into (in the case of the peeled loop):

```
%r = %step
```

To determine how an affine.min op should be rewritten and to prove its correctness, FlatAffineConstraints is utilized.

Differential Revision: https://reviews.llvm.org/D107222
commit 8e8b70aa84 (parent 3330b2532f)
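For illustration, a minimal before/after sketch of the overall effect, modeled on the `fully_dynamic_bounds` test case further below. The function names, the `#new_ub` alias, and the elided bodies are illustrative, not part of the commit:

```
// Before: the tile size must be clamped in every iteration.
#map = affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>
func @before(%lb: index, %ub: index, %step: index) {
  scf.for %iv = %lb to %ub step %step {
    %tile = affine.min #map(%iv)[%step, %ub]
    // ... use %tile ...
  }
  return
}

// After -for-loop-peeling: every iteration of the main loop is "full",
// so the affine.min folds to %step; the remainder goes into an scf.if.
#new_ub = affine_map<()[s0, s1, s2] -> (s1 - (s1 - s0) mod s2)>
func @after(%lb: index, %ub: index, %step: index) {
  %split = affine.apply #new_ub()[%lb, %ub, %step]
  scf.for %iv = %lb to %split step %step {
    // ... use %step directly as the tile size ...
  }
  %has_more = cmpi slt, %split, %ub : index
  scf.if %has_more {
    // ... last (partial) iteration; tile size is %ub - %split ...
  }
  return
}
```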
@@ -250,6 +250,12 @@ public:
   void addSymbolId(unsigned pos);
   void addLocalId(unsigned pos);
   virtual unsigned addId(IdKind kind, unsigned pos);
+  /// Add identifiers of the specified kind at the end of the table. Return the
+  /// position of the column. The coefficient column corresponding to the
+  /// added identifier is initialized to zero.
+  unsigned addDimId();
+  unsigned addSymbolId();
+  unsigned addLocalId();
 
   /// Composes an affine map whose dimensions and symbols match one to one with
   /// the dimensions and symbols of this FlatAffineConstraints. The results of
@@ -659,6 +665,12 @@ public:
   using FlatAffineConstraints::addSymbolId;
   unsigned addId(IdKind kind, unsigned pos) override;
   unsigned addId(IdKind kind, unsigned pos, Value val);
+  /// Add identifiers of the specified kind at the end of the table. Return the
+  /// position of the column. The coefficient column corresponding to the
+  /// added identifier is initialized to zero. `val` is the Value corresponding
+  /// to the identifier that can optionally be provided.
+  unsigned addDimId(Value val);
+  unsigned addSymbolId(Value val);
 
   /// Add the specified values as a dim or symbol id depending on its nature, if
   /// it already doesn't exist in the system. `val` has to be either a terminal
@@ -766,6 +778,18 @@ public:
     return {values.data(), values.size()};
   }
 
+  inline ArrayRef<Optional<Value>> getMaybeDimValues() const {
+    return {values.data(), getNumDimIds()};
+  }
+
+  inline ArrayRef<Optional<Value>> getMaybeSymbolValues() const {
+    return {values.data() + getNumDimIds(), getNumSymbolIds()};
+  }
+
+  inline ArrayRef<Optional<Value>> getMaybeDimAndSymbolValues() const {
+    return {values.data(), getNumDimIds() + getNumSymbolIds()};
+  }
+
   /// Sets the Value associated with the pos^th identifier.
   inline void setValue(unsigned pos, Value val) {
     assert(pos < numIds && "invalid id position");
@@ -44,9 +44,10 @@ void naivelyFuseParallelOps(Region &region);
 /// by an scf.if for the last (partial) iteration (if any). This transformation
 /// is called "loop peeling".
 ///
-/// Other patterns can simplify/canonicalize operations in the body of the loop
-/// and the scf.if. This is beneficial for a wide range of transformations such
-/// as vectorization or loop tiling.
+/// This transformation is beneficial for a wide range of transformations such
+/// as vectorization or loop tiling: It enables additional canonicalizations
+/// inside the peeled loop body such as rewriting masked loads into unmasked
+/// loads.
 ///
 /// E.g., assuming a lower bound of 0 (for illustration purposes):
 /// ```
@@ -65,11 +66,22 @@ void naivelyFuseParallelOps(Region &region);
 /// }
 /// ```
 ///
-/// This function rewrites the given scf.for loop in-place and creates a new
-/// scf.if operation (returned via `ifOp`) for the last iteration.
+/// After loop peeling, this function tries to simplify/canonicalize affine.min
+/// operations in the body of the loop and the scf.if, taking advantage of the
+/// fact that every iteration of the peeled loop is a "full" iteration. This
+/// canonicalization is expected to enable further canonicalization
+/// opportunities through other patterns.
 ///
-/// TODO: Simplify affine.min ops inside the new loop/if statement.
-LogicalResult peelForLoop(RewriterBase &b, ForOp forOp, scf::IfOp &ifOp);
+/// The return value indicates whether the loop was rewritten or not. Loops are
+/// not rewritten if:
+/// * Loop step size is 1 or
+/// * Loop bounds and step size are static, and step already divides the
+///   iteration space evenly.
+///
+/// Note: This function rewrites the given scf.for loop in-place and creates a
+/// new scf.if operation for the last iteration. It replaces all uses of the
+/// unpeeled loop with the results of the newly generated scf.if.
+LogicalResult peelAndCanonicalizeForLoop(RewriterBase &rewriter, ForOp forOp);
 
 /// Tile a parallel loop of the form
 ///   scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
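The concrete example elided by the diff above presumably resembles the following sketch (lower bound 0, step 4, modeled on the `@dynamic_upper_bound` test case further below; all names illustrative):

```
// Original loop; 4 does not necessarily divide %ub evenly:
scf.for %iv = %c0 to %ub step %c4 { (body) }

// After peeling: the main loop runs up to the largest multiple of 4 that is
// <= %ub; the scf.if handles the remaining partial iteration, if any:
%split = affine.apply affine_map<()[s0] -> ((s0 floordiv 4) * 4)>()[%ub]
scf.for %iv = %c0 to %split step %c4 { (body) }
%has_more = cmpi slt, %split, %ub : index
scf.if %has_more { (body of last, partial iteration) }
```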
@@ -143,12 +143,15 @@ public:
   /// `*this` and apply replace with `map` on its subexpressions.
   AffineExpr replace(const DenseMap<AffineExpr, AffineExpr> &map) const;
 
-  /// Replace dims[0 .. numDims - 1] by dims[shift .. shift + numDims - 1].
-  AffineExpr shiftDims(unsigned numDims, unsigned shift) const;
+  /// Replace dims[offset ... numDims)
+  /// by dims[offset + shift ... shift + numDims).
+  AffineExpr shiftDims(unsigned numDims, unsigned shift,
+                       unsigned offset = 0) const;
 
-  /// Replace symbols[0 .. numSymbols - 1] by
-  /// symbols[shift .. shift + numSymbols - 1].
-  AffineExpr shiftSymbols(unsigned numSymbols, unsigned shift) const;
+  /// Replace symbols[offset ... numSymbols)
+  /// by symbols[offset + shift ... shift + numSymbols).
+  AffineExpr shiftSymbols(unsigned numSymbols, unsigned shift,
+                          unsigned offset = 0) const;
 
   AffineExpr operator+(int64_t v) const;
   AffineExpr operator+(AffineExpr other) const;
@@ -207,24 +207,28 @@ public:
   AffineMap replace(const DenseMap<AffineExpr, AffineExpr> &map,
                     unsigned numResultDims, unsigned numResultSyms) const;
 
-  /// Replace dims[0 .. numDims - 1] by dims[shift .. shift + numDims - 1].
-  AffineMap shiftDims(unsigned shift) const {
-    return AffineMap::get(
-        getNumDims() + shift, getNumSymbols(),
-        llvm::to_vector<4>(llvm::map_range(
-            getResults(),
-            [&](AffineExpr e) { return e.shiftDims(getNumDims(), shift); })),
-        getContext());
+  /// Replace dims[offset ... numDims)
+  /// by dims[offset + shift ... shift + numDims).
+  AffineMap shiftDims(unsigned shift, unsigned offset = 0) const {
+    assert(offset <= getNumDims());
+    return AffineMap::get(getNumDims() + shift, getNumSymbols(),
+                          llvm::to_vector<4>(llvm::map_range(
+                              getResults(),
+                              [&](AffineExpr e) {
+                                return e.shiftDims(getNumDims(), shift, offset);
+                              })),
+                          getContext());
   }
 
-  /// Replace symbols[0 .. numSymbols - 1] by
-  /// symbols[shift .. shift + numSymbols - 1].
-  AffineMap shiftSymbols(unsigned shift) const {
+  /// Replace symbols[offset ... numSymbols)
+  /// by symbols[offset + shift ... shift + numSymbols).
+  AffineMap shiftSymbols(unsigned shift, unsigned offset = 0) const {
     return AffineMap::get(getNumDims(), getNumSymbols() + shift,
                           llvm::to_vector<4>(llvm::map_range(
                               getResults(),
                               [&](AffineExpr e) {
-                                return e.shiftSymbols(getNumSymbols(), shift);
+                                return e.shiftSymbols(getNumSymbols(), shift,
+                                                      offset);
                               })),
                           getContext());
   }
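A small sketch of the new `offset` semantics (maps and names chosen for illustration, not taken from the commit): dims below `offset` stay in place, the remaining dims are shifted by `shift`, and the result is read against `getNumDims() + shift` dims. So `shiftDims(/*shift=*/2, /*offset=*/1)` turns the first map below into the second:

```
#before = affine_map<(d0, d1) -> (d0 + d1)>
// 4 dims after shifting; d0 is below the offset and stays, d1 becomes d3:
#after = affine_map<(d0, d1, d2, d3) -> (d0 + d3)>

func @shift_dims_example(%a: index, %b: index, %c: index, %d: index) -> (index, index) {
  %0 = affine.apply #before(%a, %b)
  %1 = affine.apply #after(%a, %b, %c, %d)
  return %0, %1 : index, index
}
```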
@@ -263,22 +263,52 @@ void FlatAffineConstraints::addLocalId(unsigned pos) {
   addId(IdKind::Local, pos);
 }
 
+unsigned FlatAffineConstraints::addLocalId() {
+  unsigned pos = getNumLocalIds();
+  addId(IdKind::Local, pos);
+  return pos;
+}
+
 void FlatAffineConstraints::addDimId(unsigned pos) {
   addId(IdKind::Dimension, pos);
 }
 
+unsigned FlatAffineConstraints::addDimId() {
+  unsigned pos = getNumDimIds();
+  addId(IdKind::Dimension, pos);
+  return pos;
+}
+
 void FlatAffineValueConstraints::addDimId(unsigned pos, Value val) {
   addId(IdKind::Dimension, pos, val);
 }
 
+unsigned FlatAffineValueConstraints::addDimId(Value val) {
+  unsigned pos = getNumDimIds();
+  addId(IdKind::Dimension, pos, val);
+  return pos;
+}
+
 void FlatAffineConstraints::addSymbolId(unsigned pos) {
   addId(IdKind::Symbol, pos);
 }
 
+unsigned FlatAffineConstraints::addSymbolId() {
+  unsigned pos = getNumSymbolIds();
+  addId(IdKind::Symbol, pos);
+  return pos;
+}
+
 void FlatAffineValueConstraints::addSymbolId(unsigned pos, Value val) {
   addId(IdKind::Symbol, pos, val);
 }
 
+unsigned FlatAffineValueConstraints::addSymbolId(Value val) {
+  unsigned pos = getNumSymbolIds();
+  addId(IdKind::Symbol, pos, val);
+  return pos;
+}
+
 unsigned FlatAffineConstraints::addId(IdKind kind, unsigned pos) {
   if (kind == IdKind::Dimension)
     assert(pos <= getNumDimIds());
@@ -357,6 +357,9 @@ static bool isDimOpValidSymbol(OpTy dimOp, Region *region) {
 // *) It is a result of the dim op on a memref whose corresponding size is a
 //    valid symbol.
 bool mlir::isValidSymbol(Value value) {
+  if (!value)
+    return false;
+
   // The value must be an index type.
   if (!value.getType().isIndex())
     return false;
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "PassDetail.h"
+#include "mlir/Analysis/AffineStructures.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/SCF/Passes.h"
 #include "mlir/Dialect/SCF/SCF.h"
@@ -97,8 +98,16 @@ static void specializeForLoopForUnrolling(ForOp op) {
 /// Rewrite a for loop with bounds/step that potentially do not divide evenly
 /// into a for loop where the step divides the iteration space evenly, followed
 /// by an scf.if for the last (partial) iteration (if any).
-LogicalResult mlir::scf::peelForLoop(RewriterBase &b, ForOp forOp,
-                                     scf::IfOp &ifOp) {
+///
+/// This function rewrites the given scf.for loop in-place and creates a new
+/// scf.if operation for the last iteration. It replaces all uses of the
+/// unpeeled loop with the results of the newly generated scf.if.
+///
+/// The newly generated scf.if operation is returned via `ifOp`. The boundary
+/// at which the loop is split (new upper bound) is returned via `splitBound`.
+/// The return value indicates whether the loop was rewritten or not.
+static LogicalResult peelForLoop(RewriterBase &b, ForOp forOp, scf::IfOp &ifOp,
+                                 Value &splitBound) {
   RewriterBase::InsertionGuard guard(b);
   auto lbInt = getConstantIntValue(forOp.lowerBound());
   auto ubInt = getConstantIntValue(forOp.upperBound());
@@ -117,7 +126,7 @@ LogicalResult mlir::scf::peelForLoop(RewriterBase &b, ForOp forOp,
   // New upper bound: %ub - (%ub - %lb) mod %step
   auto modMap = AffineMap::get(3, 0, {dim1 - ((dim1 - dim0) % dim2)});
   b.setInsertionPoint(forOp);
-  Value splitBound = b.createOrFold<AffineApplyOp>(
+  splitBound = b.createOrFold<AffineApplyOp>(
       loc, modMap,
       ValueRange{forOp.lowerBound(), forOp.upperBound(), forOp.step()});
 
@@ -153,6 +162,226 @@ LogicalResult mlir::scf::peelForLoop(RewriterBase &b, ForOp forOp,
   return success();
 }
 
+static void unpackOptionalValues(ArrayRef<Optional<Value>> source,
+                                 SmallVector<Value> &target) {
+  target = llvm::to_vector<4>(llvm::map_range(source, [](Optional<Value> val) {
+    return val.hasValue() ? *val : Value();
+  }));
+}
+
+/// Bound an identifier `pos` in a given FlatAffineValueConstraints with
+/// constraints drawn from an affine map. Before adding the constraint, the
+/// dimensions/symbols of the affine map are aligned with `constraints`.
+/// `operands` are the SSA Value operands used with the affine map.
+/// Note: This function adds a new symbol column to the `constraints` for each
+/// dimension/symbol that exists in the affine map but not in `constraints`.
+static LogicalResult alignAndAddBound(FlatAffineValueConstraints &constraints,
+                                      FlatAffineConstraints::BoundType type,
+                                      unsigned pos, AffineMap map,
+                                      ValueRange operands) {
+  SmallVector<Value> dims, syms, newSyms;
+  unpackOptionalValues(constraints.getMaybeDimValues(), dims);
+  unpackOptionalValues(constraints.getMaybeSymbolValues(), syms);
+
+  AffineMap alignedMap =
+      alignAffineMapWithValues(map, operands, dims, syms, &newSyms);
+  for (unsigned i = syms.size(); i < newSyms.size(); ++i)
+    constraints.addSymbolId(constraints.getNumSymbolIds(), newSyms[i]);
+  return constraints.addBound(type, pos, alignedMap);
+}
+
+/// This function tries to canonicalize affine.min operations by proving that
+/// its value is bounded by the same lower and upper bound. In that case, the
+/// operation can be folded away.
+///
+/// Bounds are computed by FlatAffineValueConstraints. Invariants required for
+/// finding/proving bounds should be supplied via `constraints`.
+///
+/// 1. Add dimensions for `minOp` and `minOpUb` (upper bound of `minOp`).
+/// 2. Compute an upper bound of `minOp` and bind it to `minOpUb`. SSA values
+///    that are used in `minOp` but are not part of `dims`, are added as extra
+///    symbols to the constraint set.
+/// 3. For each result of `minOp`: Add result as a dimension `r_i`. Prove that
+///    r_i >= minOpUb. If this is the case, ub(minOp) == lb(minOp) and `minOp`
+///    can be replaced with that bound.
+///
+/// In summary, the following constraints are added throughout this function.
+/// Note: `invar` are dimensions added by the caller to express the invariants.
+///
+/// invar | minOp | minOpUb | r_i | extra syms... | const | eq/ineq
+/// ------+-------+---------+-----+---------------+-------+-------------------
+///  (various eq./ineq. constraining `invar`, added by the caller)
+///   ... |   0   |    0    |  0  |       0       |  ...  |  ...
+/// ------+-------+---------+-----+---------------+-------+-------------------
+///  (various ineq. constraining `minOp` in terms of `minOp` operands (`invar`
+///   and extra `minOp` operands "extra syms" that are not in `invar`)).
+///   ... |  -1   |    0    |  0  |      ...      |  ...  |  >= 0
+/// ------+-------+---------+-----+---------------+-------+-------------------
+///  (set `minOpUb` to `minOp` upper bound in terms of `invar` and extra syms)
+///   ... |   0   |   -1    |  0  |      ...      |  ...  |  = 0
+/// ------+-------+---------+-----+---------------+-------+-------------------
+///  (for each `minOp` map result r_i: copy previous constraints, set r_i to
+///   corresponding map result, prove r_i >= minOpUb via contradiction)
+///   ... |   0   |    0    | -1  |      ...      |  ...  |  = 0
+///    0  |   0   |    1    | -1  |       0       |  -1   |  >= 0
+///
+static LogicalResult
+canonicalizeAffineMinOp(RewriterBase &rewriter, AffineMinOp minOp,
+                        FlatAffineValueConstraints constraints) {
+  RewriterBase::InsertionGuard guard(rewriter);
+  AffineMap minOpMap = minOp.getAffineMap();
+  unsigned numResults = minOpMap.getNumResults();
+
+  // Add a few extra dimensions.
+  unsigned dimMinOp = constraints.addDimId();   // `minOp`
+  unsigned dimMinOpUb = constraints.addDimId(); // `minOp` upper bound
+  unsigned resultDimStart = constraints.getNumDimIds();
+  for (unsigned i = 0; i < numResults; ++i)
+    constraints.addDimId();
+
+  // Add an inequality for each result expr_i of minOpMap: minOp <= expr_i
+  if (failed(alignAndAddBound(constraints, FlatAffineConstraints::UB, dimMinOp,
+                              minOpMap, minOp.operands())))
+    return failure();
+
+  // Try to compute an upper bound for minOp, expressed in terms of the other
+  // `dims` and extra symbols.
+  SmallVector<AffineMap> minOpValLb(1), minOpValUb(1);
+  constraints.getSliceBounds(dimMinOp, 1, minOp.getContext(), &minOpValLb,
+                             &minOpValUb);
+  // TODO: `getSliceBounds` may return multiple bounds at the moment. This is
+  // a TODO of `getSliceBounds` and not handled here.
+  if (!minOpValUb[0] || minOpValUb[0].getNumResults() != 1)
+    return failure(); // No or multiple upper bounds found.
+
+  // Add an equality: dimMinOpUb = minOpValUb[0]
+  // Add back dimension for minOp. (Was removed by `getSliceBounds`.)
+  AffineMap alignedUbMap = minOpValUb[0].shiftDims(/*shift=*/1,
+                                                   /*offset=*/dimMinOp);
+  if (failed(constraints.addBound(FlatAffineConstraints::EQ, dimMinOpUb,
+                                  alignedUbMap)))
+    return failure();
+
+  // If the constraint system is empty, there is an inconsistency. (E.g., this
+  // can happen if loop lb > ub.)
+  if (constraints.isEmpty())
+    return failure();
+
+  // Prove that each result of minOpMap has a lower bound that is equal to (or
+  // greater than) the upper bound of minOp (`dimMinOpUb`). In that case,
+  // minOp can be replaced with the bound. I.e., prove that for each result
+  // expr_i (represented by dimension r_i):
+  //
+  // r_i >= minOpUb
+  //
+  // To prove this inequality, add its negation to the constraint set and prove
+  // that the constraint set is empty.
+  for (unsigned i = resultDimStart; i < resultDimStart + numResults; ++i) {
+    FlatAffineValueConstraints newConstr(constraints);
+
+    // Add an equality: r_i = expr_i
+    // Note: These equalities could have been added earlier and used to express
+    // minOp <= expr_i. However, then we run the risk that `getSliceBounds`
+    // computes minOpUb in terms of r_i dims, which is not desired.
+    if (failed(alignAndAddBound(newConstr, FlatAffineConstraints::EQ, i,
+                                minOpMap.getSubMap({i - resultDimStart}),
+                                minOp.operands())))
+      return failure();
+
+    // Add inequality: r_i < minOpUb (equiv.: minOpUb - r_i - 1 >= 0)
+    SmallVector<int64_t> ineq(newConstr.getNumCols(), 0);
+    ineq[dimMinOpUb] = 1;
+    ineq[i] = -1;
+    ineq[newConstr.getNumCols() - 1] = -1;
+    newConstr.addInequality(ineq);
+    if (!newConstr.isEmpty())
+      return failure();
+  }
+
+  // Lower and upper bound of `minOp` are equal. Replace `minOp` with its bound.
+  AffineMap newMap = alignedUbMap;
+  SmallVector<Value> newOperands;
+  unpackOptionalValues(constraints.getMaybeDimAndSymbolValues(), newOperands);
+  mlir::canonicalizeMapAndOperands(&newMap, &newOperands);
+  rewriter.setInsertionPoint(minOp);
+  rewriter.replaceOpWithNewOp<AffineApplyOp>(minOp, newMap, newOperands);
+  return success();
+}
+
+/// Try to simplify an affine.min operation `minOp` after loop peeling. This
+/// function detects affine.min operations such as (ub is the previous upper
+/// bound of the unpeeled loop):
+/// ```
+/// #map = affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>
+/// %r = affine.min #map(%iv)[%step, %ub]
+/// ```
+/// and rewrites them into (in the case of the peeled loop):
+/// ```
+/// %r = %step
+/// ```
+/// affine.min operations inside the generated scf.if operation are rewritten in
+/// a similar way.
+///
+/// This function builds up a set of constraints, capable of proving that:
+/// * Inside the peeled loop: min(step, ub - iv) == step
+/// * Inside the scf.if operation: min(step, ub - iv) == ub - iv
+///
+/// Note: `ub` is the previous upper bound of the loop (before peeling).
+/// `insideLoop` must be true for affine.min ops inside the loop and false for
+/// affine.min ops inside the scf.if op.
+static LogicalResult rewritePeeledAffineOp(RewriterBase &rewriter,
+                                           AffineMinOp minOp, Value iv,
+                                           Value ub, Value step,
+                                           bool insideLoop) {
+  FlatAffineValueConstraints constraints;
+  constraints.addDimId(0, iv);
+  constraints.addDimId(1, ub);
+  constraints.addDimId(2, step);
+  if (auto constUb = getConstantIntValue(ub))
+    constraints.addBound(FlatAffineConstraints::EQ, 1, *constUb);
+  if (auto constStep = getConstantIntValue(step))
+    constraints.addBound(FlatAffineConstraints::EQ, 2, *constStep);
+
+  // Add loop peeling invariant. This is the main piece of knowledge that
+  // enables AffineMinOp simplification.
+  if (insideLoop) {
+    // ub - iv >= step (equiv.: -iv + ub - step + 0 >= 0)
+    // Intuitively: Inside the peeled loop, every iteration is a "full"
+    // iteration, i.e., step divides the iteration space `ub - lb` evenly.
+    constraints.addInequality({-1, 1, -1, 0});
+  } else {
+    // ub - iv < step (equiv.: iv + -ub + step - 1 >= 0)
+    // Intuitively: `iv` is the split bound here, i.e., the iteration variable
+    // value of the very last iteration (in the unpeeled loop). At that point,
+    // there are less than `step` elements remaining. (Otherwise, the peeled
+    // loop would run for at least one more iteration.)
+    constraints.addInequality({1, -1, 1, -1});
+  }
+
+  return canonicalizeAffineMinOp(rewriter, minOp, constraints);
+}
+
+LogicalResult mlir::scf::peelAndCanonicalizeForLoop(RewriterBase &rewriter,
+                                                    ForOp forOp) {
+  Value ub = forOp.upperBound();
+  scf::IfOp ifOp;
+  Value splitBound;
+  if (failed(peelForLoop(rewriter, forOp, ifOp, splitBound)))
+    return failure();
+
+  // Rewrite affine.min ops.
+  forOp.walk([&](AffineMinOp minOp) {
+    (void)rewritePeeledAffineOp(rewriter, minOp, forOp.getInductionVar(), ub,
+                                forOp.step(), /*insideLoop=*/true);
+  });
+  ifOp.walk([&](AffineMinOp minOp) {
+    (void)rewritePeeledAffineOp(rewriter, minOp, splitBound, ub, forOp.step(),
+                                /*insideLoop=*/false);
+  });
+
+  return success();
+}
+
 static constexpr char kPeeledLoopLabel[] = "__peeled_loop__";
 
 namespace {
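For the scf.if branch, a self-contained sketch of the intended rewrite, modeled on the CHECK lines of the updated test file below (`@if_branch_rewrite`, `%split`, and the `#rem` alias are hypothetical names, not from the commit): since `%split` is the induction value the scf.if sees, `ub - split < step` holds there and the affine.min resolves to its `-d0 + s1` result:

```
#min = affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>
#rem = affine_map<()[s0, s1, s2] -> (-(s0 - (s0 - s1) mod s2) + s0)>
func @if_branch_rewrite(%lb: index, %ub: index, %step: index, %split: index) -> (index, index) {
  // Before: the clamped value, as it appears inside the scf.if.
  %before = affine.min #min(%split)[%step, %ub]
  // After: an affine.apply computing %ub - %split, i.e. (ub - lb) mod step.
  %after = affine.apply #rem()[%ub, %lb, %step]
  return %before, %after : index, index
}
```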
@@ -163,15 +392,12 @@ struct ForLoopPeelingPattern : public OpRewritePattern<ForOp> {
                                 PatternRewriter &rewriter) const override {
     if (forOp->hasAttr(kPeeledLoopLabel))
       return failure();
-
-    scf::IfOp ifOp;
-    if (failed(peelForLoop(rewriter, forOp, ifOp)))
+    if (failed(peelAndCanonicalizeForLoop(rewriter, forOp)))
       return failure();
     // Apply label, so that the same loop is not rewritten a second time.
     rewriter.updateRootInPlace(forOp, [&]() {
       forOp->setAttr(kPeeledLoopLabel, rewriter.getUnitAttr());
     });
-
     return success();
   }
 };
@@ -101,19 +101,26 @@ AffineExpr::replaceSymbols(ArrayRef<AffineExpr> symReplacements) const {
   return replaceDimsAndSymbols({}, symReplacements);
 }
 
-/// Replace symbols[0 .. numDims - 1] by symbols[shift .. shift + numDims - 1].
-AffineExpr AffineExpr::shiftDims(unsigned numDims, unsigned shift) const {
+/// Replace dims[offset ... numDims)
+/// by dims[offset + shift ... shift + numDims).
+AffineExpr AffineExpr::shiftDims(unsigned numDims, unsigned shift,
+                                 unsigned offset) const {
   SmallVector<AffineExpr, 4> dims;
-  for (unsigned idx = 0; idx < numDims; ++idx)
+  for (unsigned idx = 0; idx < offset; ++idx)
+    dims.push_back(getAffineDimExpr(idx, getContext()));
+  for (unsigned idx = offset; idx < numDims; ++idx)
     dims.push_back(getAffineDimExpr(idx + shift, getContext()));
   return replaceDimsAndSymbols(dims, {});
 }
 
-/// Replace symbols[0 .. numSymbols - 1] by
-/// symbols[shift .. shift + numSymbols - 1].
-AffineExpr AffineExpr::shiftSymbols(unsigned numSymbols, unsigned shift) const {
+/// Replace symbols[offset ... numSymbols)
+/// by symbols[offset + shift ... shift + numSymbols).
+AffineExpr AffineExpr::shiftSymbols(unsigned numSymbols, unsigned shift,
+                                    unsigned offset) const {
   SmallVector<AffineExpr, 4> symbols;
-  for (unsigned idx = 0; idx < numSymbols; ++idx)
+  for (unsigned idx = 0; idx < offset; ++idx)
+    symbols.push_back(getAffineSymbolExpr(idx, getContext()));
+  for (unsigned idx = offset; idx < numSymbols; ++idx)
     symbols.push_back(getAffineSymbolExpr(idx + shift, getContext()));
   return replaceDimsAndSymbols({}, symbols);
 }
@@ -1,22 +1,20 @@
 // RUN: mlir-opt %s -for-loop-peeling -canonicalize -split-input-file | FileCheck %s
 
 // CHECK-DAG: #[[MAP0:.*]] = affine_map<()[s0, s1, s2] -> (s1 - (s1 - s0) mod s2)>
-// CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>
-// CHECK-DAG: #[[MAP2:.*]] = affine_map<()[s0, s1, s2] -> (s0, s2 - (s2 - (s2 - s1) mod s0))>
+// CHECK-DAG: #[[MAP1:.*]] = affine_map<()[s0, s1, s2] -> (-(s0 - (s0 - s1) mod s2) + s0)>
 // CHECK: func @fully_dynamic_bounds(
 // CHECK-SAME: %[[LB:.*]]: index, %[[UB:.*]]: index, %[[STEP:.*]]: index
 // CHECK: %[[C0_I32:.*]] = constant 0 : i32
 // CHECK: %[[NEW_UB:.*]] = affine.apply #[[MAP0]]()[%[[LB]], %[[UB]], %[[STEP]]]
 // CHECK: %[[LOOP:.*]] = scf.for %[[IV:.*]] = %[[LB]] to %[[NEW_UB]]
 // CHECK-SAME: step %[[STEP]] iter_args(%[[ACC:.*]] = %[[C0_I32]]) -> (i32) {
-// CHECK: %[[MINOP:.*]] = affine.min #[[MAP1]](%[[IV]])[%[[STEP]], %[[UB]]]
-// CHECK: %[[CAST:.*]] = index_cast %[[MINOP]] : index to i32
+// CHECK: %[[CAST:.*]] = index_cast %[[STEP]] : index to i32
 // CHECK: %[[ADD:.*]] = addi %[[ACC]], %[[CAST]] : i32
 // CHECK: scf.yield %[[ADD]]
 // CHECK: }
 // CHECK: %[[HAS_MORE:.*]] = cmpi slt, %[[NEW_UB]], %[[UB]]
 // CHECK: %[[RESULT:.*]] = scf.if %[[HAS_MORE]] -> (i32) {
-// CHECK: %[[REM:.*]] = affine.min #[[MAP2]]()[%[[STEP]], %[[LB]], %[[UB]]]
+// CHECK: %[[REM:.*]] = affine.apply #[[MAP1]]()[%[[UB]], %[[LB]], %[[STEP]]]
 // CHECK: %[[CAST2:.*]] = index_cast %[[REM]]
 // CHECK: %[[ADD2:.*]] = addi %[[LOOP]], %[[CAST2]]
 // CHECK: scf.yield %[[ADD2]]
@@ -38,18 +36,16 @@ func @fully_dynamic_bounds(%lb : index, %ub: index, %step: index) -> i32 {
 
 // -----
 
-// CHECK-DAG: #[[MAP:.*]] = affine_map<(d0) -> (4, -d0 + 17)>
 // CHECK: func @fully_static_bounds(
 // CHECK-DAG: %[[C0_I32:.*]] = constant 0 : i32
 // CHECK-DAG: %[[C1_I32:.*]] = constant 1 : i32
+// CHECK-DAG: %[[C4_I32:.*]] = constant 4 : i32
 // CHECK-DAG: %[[C0:.*]] = constant 0 : index
 // CHECK-DAG: %[[C4:.*]] = constant 4 : index
 // CHECK-DAG: %[[C16:.*]] = constant 16 : index
 // CHECK: %[[LOOP:.*]] = scf.for %[[IV:.*]] = %[[C0]] to %[[C16]]
 // CHECK-SAME: step %[[C4]] iter_args(%[[ACC:.*]] = %[[C0_I32]]) -> (i32) {
-// CHECK: %[[MINOP:.*]] = affine.min #[[MAP]](%[[IV]])
-// CHECK: %[[CAST:.*]] = index_cast %[[MINOP]] : index to i32
-// CHECK: %[[ADD:.*]] = addi %[[ACC]], %[[CAST]] : i32
+// CHECK: %[[ADD:.*]] = addi %[[ACC]], %[[C4_I32]] : i32
 // CHECK: scf.yield %[[ADD]]
 // CHECK: }
 // CHECK: %[[RESULT:.*]] = addi %[[LOOP]], %[[C1_I32]] : i32
@@ -73,24 +69,22 @@ func @fully_static_bounds() -> i32 {
 // -----
 
 // CHECK-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> ((s0 floordiv 4) * 4)>
-// CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0)[s0] -> (4, -d0 + s0)>
-// CHECK-DAG: #[[MAP2:.*]] = affine_map<()[s0] -> (4, s0 mod 4)>
+// CHECK-DAG: #[[MAP1:.*]] = affine_map<()[s0] -> (s0 mod 4)>
 // CHECK: func @dynamic_upper_bound(
 // CHECK-SAME: %[[UB:.*]]: index
 // CHECK-DAG: %[[C0_I32:.*]] = constant 0 : i32
+// CHECK-DAG: %[[C4_I32:.*]] = constant 4 : i32
 // CHECK-DAG: %[[C0:.*]] = constant 0 : index
 // CHECK-DAG: %[[C4:.*]] = constant 4 : index
 // CHECK: %[[NEW_UB:.*]] = affine.apply #[[MAP0]]()[%[[UB]]]
 // CHECK: %[[LOOP:.*]] = scf.for %[[IV:.*]] = %[[C0]] to %[[NEW_UB]]
 // CHECK-SAME: step %[[C4]] iter_args(%[[ACC:.*]] = %[[C0_I32]]) -> (i32) {
-// CHECK: %[[MINOP:.*]] = affine.min #[[MAP1]](%[[IV]])[%[[UB]]]
-// CHECK: %[[CAST:.*]] = index_cast %[[MINOP]] : index to i32
-// CHECK: %[[ADD:.*]] = addi %[[ACC]], %[[CAST]] : i32
+// CHECK: %[[ADD:.*]] = addi %[[ACC]], %[[C4_I32]] : i32
 // CHECK: scf.yield %[[ADD]]
 // CHECK: }
 // CHECK: %[[HAS_MORE:.*]] = cmpi slt, %[[NEW_UB]], %[[UB]]
 // CHECK: %[[RESULT:.*]] = scf.if %[[HAS_MORE]] -> (i32) {
-// CHECK: %[[REM:.*]] = affine.min #[[MAP2]]()[%[[UB]]]
+// CHECK: %[[REM:.*]] = affine.apply #[[MAP1]]()[%[[UB]]]
 // CHECK: %[[CAST2:.*]] = index_cast %[[REM]]
 // CHECK: %[[ADD2:.*]] = addi %[[LOOP]], %[[CAST2]]
 // CHECK: scf.yield %[[ADD2]]
@@ -116,23 +110,21 @@ func @dynamic_upper_bound(%ub : index) -> i32 {
 // -----
 
 // CHECK-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> ((s0 floordiv 4) * 4)>
-// CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0)[s0] -> (4, -d0 + s0)>
-// CHECK-DAG: #[[MAP2:.*]] = affine_map<()[s0] -> (4, s0 mod 4)>
+// CHECK-DAG: #[[MAP1:.*]] = affine_map<()[s0] -> (s0 mod 4)>
 // CHECK: func @no_loop_results(
 // CHECK-SAME: %[[UB:.*]]: index, %[[MEMREF:.*]]: memref<i32>
+// CHECK-DAG: %[[C4_I32:.*]] = constant 4 : i32
 // CHECK-DAG: %[[C0:.*]] = constant 0 : index
 // CHECK-DAG: %[[C4:.*]] = constant 4 : index
 // CHECK: %[[NEW_UB:.*]] = affine.apply #[[MAP0]]()[%[[UB]]]
 // CHECK: scf.for %[[IV:.*]] = %[[C0]] to %[[NEW_UB]] step %[[C4]] {
-// CHECK: %[[MINOP:.*]] = affine.min #[[MAP1]](%[[IV]])[%[[UB]]]
 // CHECK: %[[LOAD:.*]] = memref.load %[[MEMREF]][]
-// CHECK: %[[CAST:.*]] = index_cast %[[MINOP]] : index to i32
-// CHECK: %[[ADD:.*]] = addi %[[LOAD]], %[[CAST]] : i32
+// CHECK: %[[ADD:.*]] = addi %[[LOAD]], %[[C4_I32]] : i32
 // CHECK: memref.store %[[ADD]], %[[MEMREF]]
 // CHECK: }
 // CHECK: %[[HAS_MORE:.*]] = cmpi slt, %[[NEW_UB]], %[[UB]]
 // CHECK: scf.if %[[HAS_MORE]] {
-// CHECK: %[[REM:.*]] = affine.min #[[MAP2]]()[%[[UB]]]
+// CHECK: %[[REM:.*]] = affine.apply #[[MAP1]]()[%[[UB]]]
 // CHECK: %[[LOAD2:.*]] = memref.load %[[MEMREF]][]
 // CHECK: %[[CAST2:.*]] = index_cast %[[REM]]
 // CHECK: %[[ADD2:.*]] = addi %[[LOAD2]], %[[CAST2]]
@@ -153,3 +145,81 @@ func @no_loop_results(%ub : index, %d : memref<i32>) {
   }
   return
 }
+
+// -----
+
+// Test rewriting of affine.min ops. Make sure that more general cases than
+// the ones above are successfully rewritten. Also make sure that the pattern
+// does not rewrite affine.min ops that should not be rewritten.
+
+// CHECK-DAG: #[[MAP1:.*]] = affine_map<()[s0] -> (s0 + 1)>
+// CHECK-DAG: #[[MAP2:.*]] = affine_map<(d0)[s0, s1] -> (s0, -d0 + s1 - 1)>
+// CHECK-DAG: #[[MAP3:.*]] = affine_map<(d0)[s0, s1, s2] -> (s0, -d0 + s1, s2)>
+// CHECK-DAG: #[[MAP4:.*]] = affine_map<()[s0, s1, s2] -> (-(s0 - (s0 - s1) mod s2) + s0)>
+// CHECK-DAG: #[[MAP5:.*]] = affine_map<()[s0, s1, s2] -> (-(s0 - (s0 - s1) mod s2) + s0 + 1)>
+// CHECK-DAG: #[[MAP6:.*]] = affine_map<()[s0, s1, s2] -> (-(s0 - (s0 - s1) mod s2) + s0 - 1)>
+// CHECK-DAG: #[[MAP7:.*]] = affine_map<()[s0, s1, s2, s3] -> (s0, s2 - (s2 - (s2 - s1) mod s0), s3)>
+// CHECK: func @test_affine_min_rewrite(
+// CHECK-SAME: %[[LB:.*]]: index, %[[UB:.*]]: index, %[[STEP:.*]]: index,
+// CHECK-SAME: %[[MEMREF:.*]]: memref<?xindex>, %[[SOME_VAL:.*]]: index
+// CHECK: scf.for %[[IV:.*]] = %[[LB]] to %{{.*}} step %[[STEP]] {
+// (affine.min folded away)
+// CHECK: memref.store %[[STEP]]
+// (affine.min folded away)
+// CHECK: memref.store %[[STEP]]
+// CHECK: %[[RES2:.*]] = affine.apply #[[MAP1]]()[%[[STEP]]]
+// CHECK: memref.store %[[RES2]]
+// CHECK: %[[RES3:.*]] = affine.min #[[MAP2]](%[[IV]])[%[[STEP]], %[[UB]]]
+// CHECK: memref.store %[[RES3]]
+// CHECK: %[[RES4:.*]] = affine.min #[[MAP3]](%[[IV]])[%[[STEP]], %[[UB]], %[[SOME_VAL]]]
+// CHECK: memref.store %[[RES4]]
+// CHECK: }
+// CHECK: scf.if {{.*}} {
+// CHECK: %[[RES_IF_0:.*]] = affine.apply #[[MAP4]]()[%[[UB]], %[[LB]], %[[STEP]]]
+// CHECK: memref.store %[[RES_IF_0]]
+// CHECK: %[[RES_IF_1:.*]] = affine.apply #[[MAP5]]()[%[[UB]], %[[LB]], %[[STEP]]]
+// CHECK: memref.store %[[RES_IF_1]]
+// CHECK: %[[RES_IF_2:.*]] = affine.apply #[[MAP5]]()[%[[UB]], %[[LB]], %[[STEP]]]
+// CHECK: memref.store %[[RES_IF_2]]
+// CHECK: %[[RES_IF_3:.*]] = affine.apply #[[MAP6]]()[%[[UB]], %[[LB]], %[[STEP]]]
+// CHECK: memref.store %[[RES_IF_3]]
+// CHECK: %[[RES_IF_4:.*]] = affine.min #[[MAP7]]()[%[[STEP]], %[[LB]], %[[UB]], %[[SOME_VAL]]]
+// CHECK: memref.store %[[RES_IF_4]]
+#map0 = affine_map<(d0, d1)[s0] -> (s0, d0 - d1)>
+#map1 = affine_map<(d0, d1)[s0] -> (d0 - d1 + 1, s0)>
+#map2 = affine_map<(d0, d1)[s0] -> (s0 + 1, d0 - d1 + 1)>
+#map3 = affine_map<(d0, d1)[s0] -> (s0, d0 - d1 - 1)>
+#map4 = affine_map<(d0, d1, d2)[s0] -> (s0, d0 - d1, d2)>
+func @test_affine_min_rewrite(%lb : index, %ub: index,
+                              %step: index, %d : memref<?xindex>,
+                              %some_val: index) {
+  %c0 = constant 0 : index
+  %c1 = constant 1 : index
+  %c2 = constant 2 : index
+  %c3 = constant 3 : index
+  %c4 = constant 4 : index
+  scf.for %iv = %lb to %ub step %step {
+    // Most common case: Rewrite min(%ub - %iv, %step) to %step.
+    %m0 = affine.min #map0(%ub, %iv)[%step]
+    memref.store %m0, %d[%c0] : memref<?xindex>
+
+    // Increase %ub - %iv a little bit, pattern should still apply.
+    %m1 = affine.min #map1(%ub, %iv)[%step]
+    memref.store %m1, %d[%c1] : memref<?xindex>
+
+    // Rewrite min(%ub - %iv + 1, %step + 1) to %step + 1.
+    %m2 = affine.min #map2(%ub, %iv)[%step]
+    memref.store %m2, %d[%c2] : memref<?xindex>
+
+    // min(%ub - %iv - 1, %step) cannot be simplified because %ub - %iv - 1
+    // can be smaller than %step. (Can be simplified in if-statement.)
+    %m3 = affine.min #map3(%ub, %iv)[%step]
+    memref.store %m3, %d[%c3] : memref<?xindex>
+
+    // min(%ub - %iv, %step, %some_val) cannot be simplified because the range
+    // of %some_val is unknown.
+    %m4 = affine.min #map4(%ub, %iv, %some_val)[%step]
+    memref.store %m4, %d[%c4] : memref<?xindex>
+  }
+  return
+}
@@ -1479,6 +1479,7 @@ cc_library(
     includes = ["include"],
     deps = [
         ":Affine",
+        ":Analysis",
         ":DialectUtils",
         ":IR",
         ":MemRefDialect",