Allocate private/local buffers for slices accurately during fusion

- the size of the private memref created for the slice is now based on the
  memref region accessed at the depth at which the slice is being
  materialized, i.e., a region symbolic in the outer IVs up until that depth,
  as opposed to the region accessed over the entire iteration domain.

- this leads to a significant contraction of the temporary / intermediate
  memref whenever the memref isn't reduced to a single scalar (through store
  forwarding); see the sketch below.
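  For illustration, a minimal sketch mirroring the updated
  should_fuse_raw_dep_for_locality test below (value names such as %buf are
  illustrative; %m and %cst come from an alloc and a constant, respectively):

    // Before fusion: both loops access the full memref<10xf32>.
    for %i0 = 0 to 10 {
      store %cst, %m[%i0] : memref<10xf32>
    }
    for %i1 = 0 to 10 {
      %v0 = load %m[%i1] : memref<10xf32>
    }
    // After fusion at depth 1: the region accessed is symbolic in %i0, so the
    // private buffer contracts to a single element, and accesses are remapped
    // by (d0, d1) -> (-d0 + d1) with the outer IV %i0 as an extra operand.
    %buf = alloc() : memref<1xf32>
    for %i0 = 0 to 10 {
      %a0 = affine_apply (d0, d1) -> (-d0 + d1) (%i0, %i0)
      store %cst, %buf[%a0] : memref<1xf32>
      %a1 = affine_apply (d0, d1) -> (-d0 + d1) (%i0, %i0)
      %v0 = load %buf[%a1] : memref<1xf32>
    }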

Other changes

- update promoteIfSingleIteration to avoid introducing an unnecessary
  identity-map affine_apply from the IV; this makes it much easier to write and
  read test cases and pass output for all passes that use
  promoteIfSingleIteration; loop-fusion test cases become much simpler (see the
  schematic after this list).

- fix a replaceAllMemRefUsesWith bug that was exposed by the above update:
  'domInstFilter' could itself be one of the ops erased due to a memref
  replacement in it, so such ops are now collected and erased after the walk.

- fix a getConstantBoundOnDimSize bug: a division by the coefficient of the
  identifier was missing (the coefficient need not always be 1); add an
  'lbFloorDivisor' output argument (see the worked example after this list).

- rename getBoundingConstantSizeAndShape -> getConstantBoundingSizeAndShape
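  For the promoteIfSingleIteration change, a schematic before/after taken from
  the old vs. new loop-fusion CHECK lines (isolating just this change; the
  buffer contraction above additionally rewrites these tests):

    // Before: promoting the single-iteration slice loop introduced an
    // identity-map affine_apply on the lower-bound operand %i0 and used it.
    %1 = affine_apply (d0) -> (d0) (%i0)
    store %cst, %0[%1] : memref<10xf32>
    // After: uses of the promoted IV are replaced with %i0 directly.
    store %cst, %0[%i0] : memref<10xf32>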
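  For the getConstantBoundOnDimSize fix, a worked example based on the comments
  added in the diff below: for the bounds s0 - 7 <= 8*j <= s0, the extent of j
  is now floordiv(0 + 7 + 1, 8) = 1 rather than 0 + 7 + 1 = 8 as before, and
  the lower bound is reported as ceil((s0 - 7) / 8) = floor(s0 / 8), i.e.,
  lb = s0 with lbFloorDivisor = 8.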

PiperOrigin-RevId: 230405218
Uday Bondhugula 2019-01-22 13:58:52 -08:00 committed by jpienaar
parent 71495d58a7
commit 94a03f864f
11 changed files with 355 additions and 218 deletions

View File

@ -547,18 +547,22 @@ public:
/// Clears this list of constraints and copies other into it.
void clearAndCopyFrom(const FlatAffineConstraints &other);
/// Returns the smallest known constant bound for the extent of the
/// specified identifier (pos^th), i.e., the smallest known constant that is
/// greater than or equal to 'exclusive upper bound' - 'lower bound' of the
/// identifier; returns None if it's not a constant. This method employs
/// Returns the smallest known constant bound for the extent of the specified
/// identifier (pos^th), i.e., the smallest known constant that is greater
/// than or equal to 'exclusive upper bound' - 'lower bound' of the
/// identifier. Returns None if it's not a constant. This method employs
/// trivial (low complexity / cost) checks and detection. Symbolic identifiers
/// are treated specially, i.e., it looks for constant differences between
/// affine expressions involving only the symbolic identifiers. See comments
/// at function definition for examples. 'lb', if provided, is set to the
/// lower bound associated with the constant difference.
/// at function definition for examples. 'lb' and 'lbDivisor', if provided,
/// are used to express the lower bound associated with the constant
/// difference: 'lb' has the coefficients and lbDivisor, the divisor. For eg.,
/// if the lower bound is [(s0 + s2 - 1) floordiv 32] for a system with three
/// symbolic identifiers, *lb = [1, 0, 1], lbDivisor = 32.
Optional<int64_t>
getConstantBoundOnDimSize(unsigned pos,
SmallVectorImpl<int64_t> *lb = nullptr) const;
SmallVectorImpl<int64_t> *lb = nullptr,
int64_t *lbDivisor = nullptr) const;
/// Returns the constant lower bound for the pos^th identifier if there is
/// one; None otherwise.

View File

@ -79,9 +79,10 @@ struct MemRefRegion {
/// bounded by a known constant, None otherwise. The 'shape' vector is set to
/// the corresponding dimension-wise bounds major to minor. We use int64_t
/// instead of uint64_t since index types can be at most int64_t.
Optional<int64_t> getBoundingConstantSizeAndShape(
Optional<int64_t> getConstantBoundingSizeAndShape(
SmallVectorImpl<int> *shape = nullptr,
std::vector<SmallVector<int64_t, 4>> *lbs = nullptr) const;
std::vector<SmallVector<int64_t, 4>> *lbs = nullptr,
SmallVectorImpl<int64_t> *lbDivisors = nullptr) const;
/// A wrapper around FlatAffineConstraints::getConstantBoundOnDimSize(). 'pos'
/// corresponds to the position of the memref shape's dimension (major to
@ -89,7 +90,8 @@ struct MemRefRegion {
//'cst'.
Optional<int64_t>
getConstantBoundOnDimSize(unsigned pos,
SmallVectorImpl<int64_t> *lb = nullptr) const {
SmallVectorImpl<int64_t> *lb = nullptr,
int64_t *lbDivisor = nullptr) const {
assert(pos < getRank() && "invalid position");
return cst.getConstantBoundOnDimSize(pos, lb);
}

View File

@ -40,16 +40,29 @@ class OperationInst;
class Function;
/// Replace all uses of oldMemRef with newMemRef while optionally remapping the
/// old memref's indices using the supplied affine map and adding any additional
/// indices. Additional indices are added at the start. The new memref could be
/// of a different shape or rank. 'extraOperands' is an optional argument that
/// corresponds to additional operands (inputs) for indexRemap at the beginning
/// of its input list. An additional optional argument 'domInstFilter' restricts
/// the replacement to only those operations that are dominated by the former.
/// Returns true on success and false if the replacement is not possible
/// (whenever a memref is used as an operand in a non-deferencing scenario). See
/// comments at function definition for an example.
/// Replaces all uses of oldMemRef with newMemRef while optionally remapping the
/// old memref's indices using the supplied affine map, 'indexRemap'. The new
/// memref could be of a different shape or rank. 'extraIndices' provides
/// additional access indices to be added to the start. 'indexRemap' remaps
/// indices of the old memref access to a new set of indices that are used to
/// index the memref. Additional input operands to indexRemap can be optionally
/// provided, and they are added at the start of its input list. 'indexRemap' is
/// expected to have only dimensional inputs, and the number of its inputs equal
/// to extraOperands.size() plus rank of the memref. 'extraOperands' is an
/// optional argument that corresponds to additional operands (inputs) for
/// indexRemap at the beginning of its input list. An additional optional
/// argument 'domInstFilter' restricts the replacement to only those operations
/// that are dominated by the former. Returns true on success and false if the
/// replacement is not possible (whenever a memref is used as an operand in a
/// non-dereferencing scenario). See comments at function definition for an
/// example.
// Ex: to replace load %A[%i, %j] with load %Abuf[%t mod 2, %ii - %i, %j]:
// The SSA value corresponding to '%t mod 2' should be in 'extraIndices', and
// index remap will perform (%i, %j) -> (%ii - %i, %j), i.e., indexRemap = (d0,
// d1, d2) -> (d0 - d1, d2), and %ii will be the extra operand. Without any
// extra operands, note that 'indexRemap' would just be applied to existing
// indices (%i, %j).
// TODO(bondhugula): allow extraIndices to be added at any position.
bool replaceAllMemRefUsesWith(const Value *oldMemRef, Value *newMemRef,
ArrayRef<Value *> extraIndices = {},
AffineMap indexRemap = AffineMap::Null(),

View File

@ -1531,6 +1531,7 @@ AffineApplyNormalizer::AffineApplyNormalizer(AffineMap map,
affineMap = simplifyAffineMap(map.compose(exprsMap));
LLVM_DEBUG(affineMap.print(dbgs() << "\nSimplified result: "));
LLVM_DEBUG(dbgs() << "\n");
}
/// Implements `map` and `operands` composition and simplification to support

View File

@ -1438,9 +1438,11 @@ void FlatAffineConstraints::constantFoldIdRange(unsigned pos, unsigned num) {
/// the coefficients of the symbolic identifiers and the constant coefficient.
// Egs: 0 <= i <= 15, return 16.
// s0 + 2 <= i <= s0 + 17, returns 16. (s0 has to be a symbol)
// i + s0 + 16 <= d0 <= i + s0 + 31, returns 16.
// s0 + s1 + 16 <= d0 <= s0 + s1 + 31, returns 16.
// s0 - 7 <= 8*j <= s0 returns 1 with lb = s0, lbDivisor = 8 (since lb =
// ceil((s0 - 7) / 8) = floor(s0 / 8)).
Optional<int64_t> FlatAffineConstraints::getConstantBoundOnDimSize(
unsigned pos, SmallVectorImpl<int64_t> *lb) const {
unsigned pos, SmallVectorImpl<int64_t> *lb, int64_t *lbFloorDivisor) const {
assert(pos < getNumDimIds() && "Invalid identifier position");
assert(getNumLocalIds() == 0);
@ -1463,6 +1465,9 @@ Optional<int64_t> FlatAffineConstraints::getConstantBoundOnDimSize(
(*lb)[c] = v < 0 ? atEq(eqRow, getNumDimIds() + c) / -v
: -atEq(eqRow, getNumDimIds() + c) / v;
}
assert(lbFloorDivisor &&
"both lb and divisor or none should be provided");
*lbFloorDivisor = 1;
}
return 1;
}
@ -1519,8 +1524,9 @@ Optional<int64_t> FlatAffineConstraints::getConstantBoundOnDimSize(
}
if (j < getNumCols() - 1)
continue;
int64_t diff =
atIneq(ubPos, getNumCols() - 1) + atIneq(lbPos, getNumCols() - 1) + 1;
int64_t diff = floorDiv(atIneq(ubPos, getNumCols() - 1) +
atIneq(lbPos, getNumCols() - 1) + 1,
atIneq(lbPos, pos));
if (minDiff == None || diff < minDiff) {
minDiff = diff;
minLbPosition = lbPos;
@ -1530,8 +1536,16 @@ Optional<int64_t> FlatAffineConstraints::getConstantBoundOnDimSize(
if (lb && minDiff.hasValue()) {
// Set lb to the symbolic lower bound.
lb->resize(getNumSymbolIds() + 1);
// The lower bound is the ceildiv of the lb constraint over the coefficient
// of the variable at 'pos'. We express the ceildiv equivalently as a floor
// for uniformity. For eg., if the lower bound constraint was: 32*d0 - N +
// 31 >= 0, the lower bound for d0 is ceil(N - 31, 32), i.e., floor(N, 32).
*lbFloorDivisor = atIneq(minLbPosition, pos);
for (unsigned c = 0, e = getNumSymbolIds() + 1; c < e; c++) {
(*lb)[c] = -atIneq(minLbPosition, getNumDimIds() + c);
// ceildiv(val, d) = floordiv(val + d - 1, d); hence, the addition of
// 'atIneq(minLbPosition, pos) - 1'.
(*lb)[c] = -atIneq(minLbPosition, getNumDimIds() + c) +
atIneq(minLbPosition, pos) - 1;
}
}
return minDiff;

View File

@ -55,9 +55,9 @@ unsigned MemRefRegion::getRank() const {
return memref->getType().cast<MemRefType>().getRank();
}
Optional<int64_t> MemRefRegion::getBoundingConstantSizeAndShape(
SmallVectorImpl<int> *shape,
std::vector<SmallVector<int64_t, 4>> *lbs) const {
Optional<int64_t> MemRefRegion::getConstantBoundingSizeAndShape(
SmallVectorImpl<int> *shape, std::vector<SmallVector<int64_t, 4>> *lbs,
SmallVectorImpl<int64_t> *lbDivisors) const {
auto memRefType = memref->getType().cast<MemRefType>();
unsigned rank = memRefType.getRank();
shape->reserve(rank);
@ -66,11 +66,13 @@ Optional<int64_t> MemRefRegion::getBoundingConstantSizeAndShape(
// dimension.
int64_t numElements = 1;
int64_t diffConstant;
int64_t lbDivisor;
for (unsigned d = 0; d < rank; d++) {
SmallVector<int64_t, 4> lb;
Optional<int64_t> diff = cst.getConstantBoundOnDimSize(d, &lb);
Optional<int64_t> diff = cst.getConstantBoundOnDimSize(d, &lb, &lbDivisor);
if (diff.hasValue()) {
diffConstant = diff.getValue();
assert(lbDivisor > 0);
} else {
// If no constant bound is found, then it can always be bound by the
// memref's dim size if the latter has a constant size along this dim.
@ -80,10 +82,13 @@ Optional<int64_t> MemRefRegion::getBoundingConstantSizeAndShape(
diffConstant = dimSize;
// Lower bound becomes 0.
lb.resize(cst.getNumSymbolIds() + 1, 0);
lbDivisor = 1;
}
numElements *= diffConstant;
if (lbs) {
lbs->push_back(lb);
assert(lbDivisors && "both lbs and lbDivisor or none");
lbDivisors->push_back(lbDivisor);
}
if (shape) {
shape->push_back(diffConstant);

View File

@ -204,9 +204,10 @@ bool DmaGeneration::generateDma(const MemRefRegion &region, ForInst *forInst,
// Compute the extents of the buffer.
std::vector<SmallVector<int64_t, 4>> lbs;
SmallVector<int64_t, 8> lbDivisors;
lbs.reserve(rank);
Optional<int64_t> numElements =
region.getBoundingConstantSizeAndShape(&fastBufferShape, &lbs);
Optional<int64_t> numElements = region.getConstantBoundingSizeAndShape(
&fastBufferShape, &lbs, &lbDivisors);
if (!numElements.hasValue()) {
LLVM_DEBUG(llvm::dbgs() << "Non-constant region size not supported\n");
*sizeInBytes = 0;
@ -219,10 +220,11 @@ bool DmaGeneration::generateDma(const MemRefRegion &region, ForInst *forInst,
return false;
}
const FlatAffineConstraints *cst = region.getConstraints();
// 'outerIVs' holds the values that this memory region is symbolic/parametric
// on; this would correspond to loop IVs surrounding the level at which the
// DMA generation is being done.
const FlatAffineConstraints *cst = region.getConstraints();
SmallVector<Value *, 8> outerIVs;
cst->getIdValues(rank, cst->getNumIds(), &outerIVs);
@ -241,7 +243,9 @@ bool DmaGeneration::generateDma(const MemRefRegion &region, ForInst *forInst,
for (unsigned j = 0, e = cst->getNumCols() - rank - 1; j < e; j++) {
offset = offset + lbs[d][j] * top.getAffineDimExpr(j);
}
offset = offset + lbs[d][cst->getNumCols() - 1 - rank];
assert(lbDivisors[d] > 0);
offset =
(offset + lbs[d][cst->getNumCols() - 1 - rank]).floorDiv(lbDivisors[d]);
// Set DMA start location for this dimension in the lower memory space
// memref.

View File

@ -681,9 +681,12 @@ static bool getSliceUnion(const ComputationSliceState &sliceStateA,
// Creates and returns a private (single-user) memref for fused loop rooted
// at 'forInst', with (potentially reduced) memref size based on the
// MemRefRegion written to by 'srcStoreOpInst'.
// MemRefRegion written to by 'srcStoreOpInst' at depth 'dstLoopDepth'.
// TODO(bondhugula): consider refactoring the common code from generateDma and
// this one.
static Value *createPrivateMemRef(ForInst *forInst,
OperationInst *srcStoreOpInst) {
OperationInst *srcStoreOpInst,
unsigned dstLoopDepth) {
// Create builder to insert alloc op just before 'forInst'.
FuncBuilder b(forInst);
// Builder to create constants at the top level.
@ -693,28 +696,39 @@ static Value *createPrivateMemRef(ForInst *forInst,
auto oldMemRefType = oldMemRef->getType().cast<MemRefType>();
unsigned rank = oldMemRefType.getRank();
// Compute MemRefRegion for 'srcStoreOpInst'.
// Compute MemRefRegion for 'srcStoreOpInst' at depth 'dstLoopDepth'.
MemRefRegion region;
getMemRefRegion(srcStoreOpInst, 0, &region);
getMemRefRegion(srcStoreOpInst, dstLoopDepth, &region);
SmallVector<int, 4> newShape;
std::vector<SmallVector<int64_t, 4>> lbs;
SmallVector<int64_t, 8> lbDivisors;
lbs.reserve(rank);
// Query 'region' for 'newShape' and lower bounds of MemRefRegion accessed
// by 'srcStoreOpInst'.
// by 'srcStoreOpInst' at depth 'dstLoopDepth'.
Optional<int64_t> numElements =
region.getBoundingConstantSizeAndShape(&newShape, &lbs);
region.getConstantBoundingSizeAndShape(&newShape, &lbs, &lbDivisors);
assert(numElements.hasValue());
// Build 'rank' AffineExprs from MemRefRegion 'lbs'
const FlatAffineConstraints *cst = region.getConstraints();
// 'outerIVs' holds the values that this memory region is symbolic/parametric
// on; this would correspond to loop IVs surrounding the level at which the
// slice is being materialized.
SmallVector<Value *, 8> outerIVs;
cst->getIdValues(rank, cst->getNumIds(), &outerIVs);
// Build 'rank' AffineExprs from MemRefRegion 'lbs'
SmallVector<AffineExpr, 4> offsets;
offsets.reserve(rank);
for (unsigned d = 0; d < rank; ++d) {
assert(lbs[d].size() == cst->getNumCols() - rank && "incorrect bound size");
AffineExpr offset = top.getAffineConstantExpr(0);
for (unsigned j = 0, e = cst->getNumCols() - rank - 1; j < e; j++) {
offset = offset + lbs[d][j] * top.getAffineDimExpr(j);
}
offset = offset + lbs[d][cst->getNumCols() - 1 - rank];
assert(lbDivisors[d] > 0);
offset =
(offset + lbs[d][cst->getNumCols() - 1 - rank]).floorDiv(lbDivisors[d]);
offsets.push_back(offset);
}
@ -743,18 +757,23 @@ static Value *createPrivateMemRef(ForInst *forInst,
if (auto constExpr = offsets[i].dyn_cast<AffineConstantExpr>())
if (constExpr.getValue() == 0)
++zeroOffsetCount;
auto dimExpr = b.getAffineDimExpr(i);
remapExprs.push_back(dimExpr - offsets[i]);
auto dimExpr = b.getAffineDimExpr(outerIVs.size() + i);
auto remapExpr =
simplifyAffineExpr(dimExpr - offsets[i], outerIVs.size() + rank, 0);
remapExprs.push_back(remapExpr);
}
auto indexRemap = zeroOffsetCount == rank
? AffineMap::Null()
: b.getAffineMap(rank, 0, remapExprs, {});
auto indexRemap =
zeroOffsetCount == rank
? AffineMap::Null()
: b.getAffineMap(outerIVs.size() + rank, 0, remapExprs, {});
// Replace all users of 'oldMemRef' with 'newMemRef'.
bool ret = replaceAllMemRefUsesWith(oldMemRef, newMemRef, {}, indexRemap, {},
&*forInst->getBody()->begin());
assert(ret);
bool ret =
replaceAllMemRefUsesWith(oldMemRef, newMemRef, {}, indexRemap,
/*extraOperands=*/outerIVs,
/*domInstFilter=*/&*forInst->getBody()->begin());
assert(ret && "replaceAllMemrefUsesWith should always succeed here");
(void)ret;
(void)indexRemap;
return newMemRef;
}
@ -1044,8 +1063,8 @@ public:
storesForMemref.push_back(storeOpInst);
}
assert(storesForMemref.size() == 1);
auto *newMemRef =
createPrivateMemRef(dstForInst, storesForMemref[0]);
auto *newMemRef = createPrivateMemRef(
dstForInst, storesForMemref[0], dstLoopDepth);
visitedMemrefs.insert(newMemRef);
// Collect dst loop stats after memref privatizaton transformation.

View File

@ -109,9 +109,14 @@ bool mlir::promoteIfSingleIteration(ForInst *forInst) {
const AffineBound lb = forInst->getLowerBound();
SmallVector<Value *, 4> lbOperands(lb.operand_begin(), lb.operand_end());
FuncBuilder builder(forInst->getBlock(), Block::iterator(forInst));
auto affineApplyOp = builder.create<AffineApplyOp>(
forInst->getLoc(), lb.getMap(), lbOperands);
forInst->replaceAllUsesWith(affineApplyOp->getResult(0));
if (lb.getMap() == builder.getDimIdentityMap()) {
// No need of generating an affine_apply.
forInst->replaceAllUsesWith(lbOperands[0]);
} else {
auto affineApplyOp = builder.create<AffineApplyOp>(
forInst->getLoc(), lb.getMap(), lbOperands);
forInst->replaceAllUsesWith(affineApplyOp->getResult(0));
}
}
}
// Move the loop body instructions to the loop's containing block.

View File

@ -43,23 +43,6 @@ static bool isMemRefDereferencingOp(const OperationInst &op) {
return false;
}
/// Replaces all uses of oldMemRef with newMemRef while optionally remapping
/// old memref's indices to the new memref using the supplied affine map
/// and adding any additional indices. The new memref could be of a different
/// shape or rank, but of the same elemental type. Additional indices are added
/// at the start. 'extraOperands' is another optional argument that corresponds
/// to additional operands (inputs) for indexRemap at the beginning of its input
/// list. An optional argument 'domOpFilter' restricts the replacement to only
/// those operations that are dominated by the former. The replacement succeeds
/// and returns true if all uses of the memref in the region where the
/// replacement is asked for are "dereferencing" memref uses.
// Ex: to replace load %A[%i, %j] with load %Abuf[%t mod 2, %ii - %i, %j]:
// The SSA value corresponding to '%t mod 2' should be in 'extraIndices', and
// index remap will (%i, %j) -> (%ii - %i, %j), i.e., (d0, d1, d2) -> (d0 - d1,
// d2) will be the 'indexRemap', and %ii is the extra operand. Without any
// extra operands, note that 'indexRemap' would just be applied to the existing
// indices (%i, %j).
//
bool mlir::replaceAllMemRefUsesWith(const Value *oldMemRef, Value *newMemRef,
ArrayRef<Value *> extraIndices,
AffineMap indexRemap,
@ -84,6 +67,9 @@ bool mlir::replaceAllMemRefUsesWith(const Value *oldMemRef, Value *newMemRef,
if (domInstFilter)
domInfo = std::make_unique<DominanceInfo>(domInstFilter->getFunction());
// The ops where memref replacement succeeds are replaced with new ones.
SmallVector<OperationInst *, 8> opsToErase;
// Walk all uses of old memref. Operation using the memref gets replaced.
for (auto it = oldMemRef->use_begin(); it != oldMemRef->use_end();) {
InstOperand &use = *(it++);
@ -171,8 +157,14 @@ bool mlir::replaceAllMemRefUsesWith(const Value *oldMemRef, Value *newMemRef,
for (auto *res : opInst->getResults()) {
res->replaceAllUsesWith(repOp->getResult(r++));
}
opInst->erase();
// Collect and erase at the end since one of these op's could be
// domInstFilter!
opsToErase.push_back(opInst);
}
for (auto *opInst : opsToErase)
opInst->erase();
return true;
}

View File

@ -9,7 +9,7 @@
// -----
// CHECK: [[MAP0:#map[0-9]+]] = (d0) -> (d0)
// CHECK: [[MAP0:#map[0-9]+]] = (d0, d1) -> (-d0 + d1)
// CHECK-LABEL: func @should_fuse_raw_dep_for_locality() {
func @should_fuse_raw_dep_for_locality() {
@ -23,9 +23,10 @@ func @should_fuse_raw_dep_for_locality() {
%v0 = load %m[%i1] : memref<10xf32>
}
// CHECK: for %i0 = 0 to 10 {
// CHECK-NEXT: %1 = affine_apply [[MAP0]](%i0)
// CHECK-NEXT: store %cst, %0[%1] : memref<10xf32>
// CHECK-NEXT: %2 = load %0[%i0] : memref<10xf32>
// CHECK-NEXT: %1 = affine_apply [[MAP0]](%i0, %i0)
// CHECK-NEXT: store %cst, %0[%1] : memref<1xf32>
// CHECK-NEXT: %2 = affine_apply [[MAP0]](%i0, %i0)
// CHECK-NEXT: %3 = load %0[%2] : memref<1xf32>
// CHECK-NEXT: }
// CHECK-NEXT: return
return
@ -33,7 +34,7 @@ func @should_fuse_raw_dep_for_locality() {
// -----
// CHECK: [[MAP0:#map[0-9]+]] = (d0) -> (d0)
// CHECK-DAG: [[MAP0:#map[0-9]+]] = (d0, d1) -> (-d0 + d1)
// CHECK-LABEL: func @should_fuse_reduction_to_pointwise() {
func @should_fuse_reduction_to_pointwise() {
@ -59,15 +60,17 @@ func @should_fuse_reduction_to_pointwise() {
// Should fuse in entire inner loop on %i1 from source loop nest, as %i1
// is not used in the access function of the store/load on %b.
// CHECK: for %i0 = 0 to 10 {
// CHECK-NEXT: %3 = affine_apply [[MAP0]](%i0)
// CHECK-NEXT: for %i1 = 0 to 10 {
// CHECK-NEXT: %4 = load %2[%3] : memref<10xf32>
// CHECK-NEXT: %5 = load %0[%3, %i1] : memref<10x10xf32>
// CHECK-NEXT: %3 = affine_apply [[MAP0]](%i0, %i0)
// CHECK-NEXT: %4 = load %2[%3] : memref<1xf32>
// CHECK-NEXT: %5 = load %0[%i0, %i1] : memref<10x10xf32>
// CHECK-NEXT: %6 = addf %4, %5 : f32
// CHECK-NEXT: store %6, %2[%3] : memref<10xf32>
// CHECK-NEXT: %7 = affine_apply [[MAP0]](%i0, %i0)
// CHECK-NEXT: store %6, %2[%7] : memref<1xf32>
// CHECK-NEXT: }
// CHECK-NEXT: %7 = load %2[%i0] : memref<10xf32>
// CHECK-NEXT: store %7, %1[%i0] : memref<10xf32>
// CHECK-NEXT: %8 = affine_apply [[MAP0]](%i0, %i0)
// CHECK-NEXT: %9 = load %2[%8] : memref<1xf32>
// CHECK-NEXT: store %9, %1[%i0] : memref<10xf32>
// CHECK-NEXT: }
// CHECK-NEXT: return
return
@ -75,9 +78,9 @@ func @should_fuse_reduction_to_pointwise() {
// -----
// CHECK: [[MAP_SHIFT_MINUS_ONE_R1:#map[0-9]+]] = (d0) -> (d0 - 1)
// CHECK: [[MAP_SHIFT_BY_ONE:#map[0-9]+]] = (d0, d1) -> (d0 + 1, d1 + 1)
// CHECK: [[MAP_SHIFT_MINUS_ONE_R2:#map[0-9]+]] = (d0, d1) -> (d0 - 1, d1 - 1)
// CHECK-DAG: [[MAP_SHIFT_MINUS_ONE_R1:#map[0-9]+]] = (d0) -> (d0 - 1)
// CHECK-DAG: [[MAP_SHIFT_BY_ONE:#map[0-9]+]] = (d0, d1) -> (d0 + 1, d1 + 1)
// CHECK-DAG: [[MAP_SHIFT_MINUS_IV_R2:#map[0-9]+]] = (d0, d1, d2, d3) -> (-d0 + d2, -d1 + d3)
// CHECK-LABEL: func @should_fuse_loop_nests_with_shifts() {
func @should_fuse_loop_nests_with_shifts() {
@ -104,16 +107,16 @@ func @should_fuse_loop_nests_with_shifts() {
// *) Fifth affine apply shifts the loads access function by '-1', because
// of the offset induced by reducing the memref shape from 10x10 to 9x9.
// NOTE: Should create a private memref with reduced shape 9x9xf32.
// CHECK: %0 = alloc() : memref<9x9xf32>
// CHECK: %0 = alloc() : memref<1x1xf32>
// CHECK-NEXT: for %i0 = 1 to 10 {
// CHECK-NEXT: for %i1 = 1 to 10 {
// CHECK-NEXT: %1 = affine_apply [[MAP_SHIFT_MINUS_ONE_R1]](%i0)
// CHECK-NEXT: %2 = affine_apply [[MAP_SHIFT_MINUS_ONE_R1]](%i1)
// CHECK-NEXT: %3 = affine_apply [[MAP_SHIFT_BY_ONE]](%1, %2)
// CHECK-NEXT: %4 = affine_apply [[MAP_SHIFT_MINUS_ONE_R2]](%3#0, %3#1)
// CHECK-NEXT: store %cst, %0[%4#0, %4#1] : memref<9x9xf32>
// CHECK-NEXT: %5 = affine_apply [[MAP_SHIFT_MINUS_ONE_R2]](%i0, %i1)
// CHECK-NEXT: %6 = load %0[%5#0, %5#1] : memref<9x9xf32>
// CHECK-NEXT: %4 = affine_apply [[MAP_SHIFT_MINUS_IV_R2]](%i0, %i1, %3#0, %3#1)
// CHECK-NEXT: store %cst, %0[%4#0, %4#1] : memref<1x1xf32>
// CHECK-NEXT: %5 = affine_apply [[MAP_SHIFT_MINUS_IV_R2]](%i0, %i1, %i0, %i1)
// CHECK-NEXT: %6 = load %0[%5#0, %5#1] : memref<1x1xf32>
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: return
@ -122,7 +125,7 @@ func @should_fuse_loop_nests_with_shifts() {
// -----
// CHECK-DAG: [[MAP_ID:#map[0-9]+]] = (d0) -> (d0)
// CHECK-DAG: [[MAP0:#map[0-9]+]] = (d0, d1, d2, d3) -> (-d0 + d2, -d1 + d3)
// CHECK-LABEL: func @should_fuse_loop_nest() {
func @should_fuse_loop_nest() {
@ -147,18 +150,18 @@ func @should_fuse_loop_nest() {
}
}
// Expecting private memref for '%b' first, then private memref for '%a'.
// CHECK: [[NEWB:%[0-9]+]] = alloc() : memref<10x10xf32>
// CHECK-NEXT: [[NEWA:%[0-9]+]] = alloc() : memref<10x10xf32>
// CHECK: [[NEWB:%[0-9]+]] = alloc() : memref<1x1xf32>
// CHECK-NEXT: [[NEWA:%[0-9]+]] = alloc() : memref<1x1xf32>
// CHECK-NEXT: for %i0 = 0 to 10 {
// CHECK-NEXT: for %i1 = 0 to 10 {
// CHECK-NEXT: %2 = affine_apply [[MAP_ID]](%i1)
// CHECK-NEXT: %3 = affine_apply [[MAP_ID]](%i0)
// CHECK-NEXT: store %cst, [[NEWA]][%2, %3] : memref<10x10xf32>
// CHECK-NEXT: %4 = affine_apply [[MAP_ID]](%i0)
// CHECK-NEXT: %5 = affine_apply [[MAP_ID]](%i1)
// CHECK-NEXT: %6 = load [[NEWA]][%5, %4] : memref<10x10xf32>
// CHECK-NEXT: store %6, [[NEWB]][%4, %5] : memref<10x10xf32>
// CHECK-NEXT: %7 = load [[NEWB]][%i0, %i1] : memref<10x10xf32>
// CHECK-NEXT: %2 = affine_apply [[MAP0]](%i1, %i0, %i1, %i0)
// CHECK-NEXT: store %cst, [[NEWA]][%2#0, %2#1] : memref<1x1xf32>
// CHECK-NEXT: %3 = affine_apply [[MAP0]](%i1, %i0, %i1, %i0)
// CHECK-NEXT: %4 = load [[NEWA]][%3#0, %3#1] : memref<1x1xf32>
// CHECK-NEXT: %5 = affine_apply [[MAP0]](%i0, %i1, %i0, %i1)
// CHECK-NEXT: store %4, [[NEWB]][%5#0, %5#1] : memref<1x1xf32>
// CHECK-NEXT: %6 = affine_apply [[MAP0]](%i0, %i1, %i0, %i1)
// CHECK-NEXT: %7 = load [[NEWB]][%6#0, %6#1] : memref<1x1xf32>
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: return
@ -167,7 +170,7 @@ func @should_fuse_loop_nest() {
// -----
// CHECK: [[MAP0:#map[0-9]+]] = (d0) -> (d0)
// CHECK-DAG: [[MAP0:#map[0-9]+]] = (d0, d1) -> (-d0 + d1)
// CHECK-LABEL: func @should_fuse_across_intermediate_loop_with_no_deps() {
func @should_fuse_across_intermediate_loop_with_no_deps() {
@ -193,12 +196,13 @@ func @should_fuse_across_intermediate_loop_with_no_deps() {
// CHECK: for %i0 = 0 to 10 {
// CHECK-NEXT: store %cst, %1[%i0] : memref<10xf32>
// CHECK-NEXT: }
// CHECK: %2 = alloc() : memref<10xf32>
// CHECK: %2 = alloc() : memref<1xf32>
// CHECK: for %i1 = 0 to 10 {
// CHECK-NEXT: %3 = affine_apply [[MAP0]](%i1)
// CHECK-NEXT: %4 = load %0[%3] : memref<10xf32>
// CHECK-NEXT: store %4, %2[%3] : memref<10xf32>
// CHECK-NEXT: %5 = load %2[%i1] : memref<10xf32>
// CHECK-NEXT: %3 = load %0[%i1] : memref<10xf32>
// CHECK-NEXT: %4 = affine_apply [[MAP0]](%i1, %i1)
// CHECK-NEXT: store %3, %2[%4] : memref<1xf32>
// CHECK-NEXT: %5 = affine_apply [[MAP0]](%i1, %i1)
// CHECK-NEXT: %6 = load %2[%5] : memref<1xf32>
// CHECK-NEXT: }
// CHECK-NEXT: return
return
@ -206,7 +210,7 @@ func @should_fuse_across_intermediate_loop_with_no_deps() {
// -----
// CHECK: [[MAP0:#map[0-9]+]] = (d0) -> (d0)
// CHECK: [[MAP0:#map[0-9]+]] = (d0, d1) -> (-d0 + d1)
// CHECK-LABEL: func @should_fuse_all_loops() {
func @should_fuse_all_loops() {
@ -228,15 +232,17 @@ func @should_fuse_all_loops() {
// Should fuse first and second loops into third.
// Expecting private memref for '%b' first, then private memref for '%a'.
// CHECK: [[NEWB:%[0-9]+]] = alloc() : memref<10xf32>
// CHECK-NEXT: [[NEWA:%[0-9]+]] = alloc() : memref<10xf32>
// CHECK: [[NEWB:%[0-9]+]] = alloc() : memref<1xf32>
// CHECK-NEXT: [[NEWA:%[0-9]+]] = alloc() : memref<1xf32>
// CHECK-NEXT: for %i0 = 0 to 10 {
// CHECK-NEXT: %2 = affine_apply [[MAP0]](%i0)
// CHECK-NEXT: store %cst, [[NEWA]][%2] : memref<10xf32>
// CHECK-NEXT: %3 = affine_apply [[MAP0]](%i0)
// CHECK-NEXT: store %cst, [[NEWB]][%3] : memref<10xf32>
// CHECK-NEXT: %4 = load [[NEWA]][%i0] : memref<10xf32>
// CHECK-NEXT: %5 = load [[NEWB]][%i0] : memref<10xf32>
// CHECK-NEXT: %2 = affine_apply [[MAP0]](%i0, %i0)
// CHECK-NEXT: store %cst, [[NEWA]][%2] : memref<1xf32>
// CHECK-NEXT: %3 = affine_apply [[MAP0]](%i0, %i0)
// CHECK-NEXT: store %cst, [[NEWB]][%3] : memref<1xf32>
// CHECK-NEXT: %4 = affine_apply [[MAP0]](%i0, %i0)
// CHECK-NEXT: %5 = load [[NEWA]][%4] : memref<1xf32>
// CHECK-NEXT: %6 = affine_apply [[MAP0]](%i0, %i0)
// CHECK-NEXT: %7 = load [[NEWB]][%6] : memref<1xf32>
// CHECK-NEXT: }
// CHECK-NEXT: return
return
@ -244,7 +250,7 @@ func @should_fuse_all_loops() {
// -----
// CHECK: [[MAP0:#map[0-9]+]] = (d0) -> (d0)
// CHECK: [[MAP0:#map[0-9]+]] = (d0, d1) -> (-d0 + d1)
// CHECK-LABEL: func @should_fuse_first_and_second_loops() {
func @should_fuse_first_and_second_loops() {
@ -267,15 +273,16 @@ func @should_fuse_first_and_second_loops() {
// Should fuse first loop into the second (last loop should not be fused).
// Should create private memref '%2' for fused loop.
// CHECK: %2 = alloc() : memref<10xf32>
// CHECK: %2 = alloc() : memref<1xf32>
// CHECK-NEXT: for %i0 = 0 to 10 {
// CHECK-NEXT: %3 = affine_apply [[MAP0]](%i0)
// CHECK-NEXT: store %cst, %2[%3] : memref<10xf32>
// CHECK-NEXT: %4 = load %2[%i0] : memref<10xf32>
// CHECK-NEXT: %3 = affine_apply [[MAP0]](%i0, %i0)
// CHECK-NEXT: store %cst, %2[%3] : memref<1xf32>
// CHECK-NEXT: %4 = affine_apply [[MAP0]](%i0, %i0)
// CHECK-NEXT: %5 = load %2[%4] : memref<1xf32>
// CHECK-NEXT: store %cst, %0[%i0] : memref<10xf32>
// CHECK-NEXT: }
// CHECK: for %i1 = 0 to 10 {
// CHECK-NEXT: %5 = load %1[%i1] : memref<10xf32>
// CHECK-NEXT: %6 = load %1[%i1] : memref<10xf32>
// CHECK-NEXT: }
// CHECK-NEXT: return
@ -326,7 +333,7 @@ func @should_not_fuse_would_create_cycle() {
}
// -----
// CHECK: #map0 = (d0) -> (d0)
// CHECK: [[MAP0:#map[0-9]+]] = (d0, d1) -> (-d0 + d1)
// CHECK-LABEL: func @should_fuse_across_waw_dep_with_private_memref() {
func @should_fuse_across_waw_dep_with_private_memref() {
@ -349,11 +356,12 @@ func @should_fuse_across_waw_dep_with_private_memref() {
// CHECK: for %i1 = 0 to 10 {
// CHECK-NEXT: store %cst, %0[%i1] : memref<10xf32>
// CHECK-NEXT: }
// CHECK: %1 = alloc() : memref<10xf32>
// CHECK: %1 = alloc() : memref<1xf32>
// CHECK-NEXT: for %i2 = 0 to 10 {
// CHECK-NEXT: %2 = affine_apply #map0(%i2)
// CHECK-NEXT: store %cst, %1[%2] : memref<10xf32>
// CHECK-NEXT: %3 = load %1[%i2] : memref<10xf32>
// CHECK-NEXT: %2 = affine_apply #map0(%i2, %i2)
// CHECK-NEXT: store %cst, %1[%2] : memref<1xf32>
// CHECK-NEXT: %3 = affine_apply #map0(%i2, %i2)
// CHECK-NEXT: %4 = load %1[%3] : memref<1xf32>
// CHECK-NEXT: }
// CHECK-NEXT: return
return
@ -412,18 +420,19 @@ func @should_fuse_with_private_memref_if_top_level_access() {
// CHECK: for %i0 = 0 to 10 {
// CHECK-NEXT: store %cst, %0[%i0] : memref<10xf32>
// CHECK-NEXT: }
// CHECK: %1 = alloc() : memref<10xf32>
// CHECK: %1 = alloc() : memref<1xf32>
// CHECK-NEXT: for %i1 = 0 to 10 {
// CHECK-NEXT: %2 = affine_apply #map0(%i1)
// CHECK-NEXT: store %cst, %1[%2] : memref<10xf32>
// CHECK-NEXT: %3 = load %1[%i1] : memref<10xf32>
// CHECK-NEXT: %2 = affine_apply #map0(%i1, %i1)
// CHECK-NEXT: store %cst, %1[%2] : memref<1xf32>
// CHECK-NEXT: %3 = affine_apply #map0(%i1, %i1)
// CHECK-NEXT: %4 = load %1[%3] : memref<1xf32>
// CHECK-NEXT: }
return
}
// -----
// CHECK: [[MAP0:#map[0-9]+]] = (d0) -> (d0)
// CHECK: [[MAP0:#map[0-9]+]] = (d0, d1) -> (-d0 + d1)
// CHECK-LABEL: func @should_fuse_no_top_level_access() {
func @should_fuse_no_top_level_access() {
@ -437,9 +446,10 @@ func @should_fuse_no_top_level_access() {
%v0 = load %m[%i1] : memref<10xf32>
}
// CHECK: for %i0 = 0 to 10 {
// CHECK-NEXT: %1 = affine_apply #map0(%i0)
// CHECK-NEXT: store %cst, %0[%1] : memref<10xf32>
// CHECK-NEXT: %2 = load %0[%i0] : memref<10xf32>
// CHECK-NEXT: %1 = affine_apply #map0(%i0, %i0)
// CHECK-NEXT: store %cst, %0[%1] : memref<1xf32>
// CHECK-NEXT: %2 = affine_apply #map0(%i0, %i0)
// CHECK-NEXT: %3 = load %0[%2] : memref<1xf32>
// CHECK-NEXT: }
// CHECK-NEXT: return
return
@ -506,10 +516,12 @@ func @should_not_fuse_if_inst_in_loop_nest() {
// -----
// CHECK: [[MAP0:#map[0-9]+]] = (d0) -> (d0)
// CHECK: [[MAP1:#map[0-9]+]] = (d0, d1, d2) -> (d0, d1, d2)
// CHECK: [[MAP0:#map[0-9]+]] = (d0, d1, d2) -> (d0, d1, d2)
// CHECK: [[MAP1:#map[0-9]+]] = (d0, d1, d2, d3, d4, d5) -> (-d0 + d3, -d1 + d4, -d2 + d5)
// CHECK: [[MAP_PERMUTE:#map[0-9]+]] = (d0, d1, d2) -> (d1, d2, d0)
#map0 = (d0, d1, d2) -> (d0, d1, d2)
// CHECK-LABEL: func @remap_ivs() {
func @remap_ivs() {
%m = alloc() : memref<10x20x30xf32>
@ -534,13 +546,12 @@ func @remap_ivs() {
// CHECK: for %i0 = 0 to 30 {
// CHECK-NEXT: for %i1 = 0 to 10 {
// CHECK-NEXT: for %i2 = 0 to 20 {
// CHECK-NEXT: %1 = affine_apply [[MAP0]](%i1)
// CHECK-NEXT: %2 = affine_apply [[MAP0]](%i2)
// CHECK-NEXT: %3 = affine_apply [[MAP0]](%i0)
// CHECK-NEXT: %4 = affine_apply [[MAP1]](%1, %2, %3)
// CHECK-NEXT: store %cst, %0[%4#0, %4#1, %4#2] : memref<10x20x30xf32>
// CHECK-NEXT: %5 = affine_apply [[MAP_PERMUTE]](%i0, %i1, %i2)
// CHECK-NEXT: %6 = load %0[%5#0, %5#1, %5#2] : memref<10x20x30xf32>
// CHECK-NEXT: %1 = affine_apply [[MAP0]](%i1, %i2, %i0)
// CHECK-NEXT: %2 = affine_apply [[MAP1]](%i1, %i2, %i0, %1#0, %1#1, %1#2)
// CHECK-NEXT: store %cst, %0[%2#0, %2#1, %2#2] : memref<1x1x1xf32>
// CHECK-NEXT: %3 = affine_apply [[MAP_PERMUTE]](%i0, %i1, %i2)
// CHECK-NEXT: %4 = affine_apply [[MAP1]](%i1, %i2, %i0, %3#0, %3#1, %3#2)
// CHECK-NEXT: %5 = load %0[%4#0, %4#1, %4#2] : memref<1x1x1xf32>
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: }
@ -581,8 +592,10 @@ func @fuse_reshape_64_16_4(%in : memref<64xf32>) {
}
// -----
// CHECK: #map0 = (d0) -> (d0 floordiv 4)
// CHECK: #map1 = (d0) -> (d0 mod 4)
// CHECK-DAG: #map0 = (d0) -> (d0 floordiv 4)
// CHECK-DAG: #map1 = (d0) -> (d0 mod 4)
// CHECK-DAG: [[MAP2:#map[0-9]+]] = (d0, d1) -> (d0 * 4 + d1)
// CHECK-DAG: [[MAP3:#map[0-9]+]] = (d0, d1) -> (-d0 + d1)
// Reshape a 16x4xf32 to 64xf32.
// CHECK-LABEL: func @fuse_reshape_16_4_64
@ -606,10 +619,12 @@ func @fuse_reshape_16_4_64() {
// CHECK-NEXT: %2 = affine_apply #map0(%i0)
// CHECK-NEXT: %3 = affine_apply #map1(%i0)
// CHECK-NEXT: %4 = load %0[%2, %3] : memref<16x4xf32>
// CHECK-NEXT: %5 = affine_apply #map2(%2, %3)
// CHECK-NEXT: store %4, %1[%5] : memref<64xf32>
// CHECK-NEXT: %6 = load %1[%i0] : memref<64xf32>
// CHECK-NEXT: "foo"(%6) : (f32) -> ()
// CHECK-NEXT: %5 = affine_apply [[MAP2]](%2, %3)
// CHECK-NEXT: %6 = affine_apply [[MAP3]](%i0, %5)
// CHECK-NEXT: store %4, %1[%6] : memref<1xf32>
// CHECK-NEXT: %7 = affine_apply [[MAP3]](%i0, %i0)
// CHECK-NEXT: %8 = load %1[%7] : memref<1xf32>
// CHECK-NEXT: "foo"(%8) : (f32) -> ()
// CHECK-NEXT: }
// CHECK-NEXT: return
return
@ -674,8 +689,8 @@ func @R6_to_R2_reshape_square() -> memref<64x9xi32> {
// is eliminated if -memref-dataflow-opt is also supplied.
//
// CHECK: %0 = alloc() : memref<64x9xi32>
// CHECK-NEXT: %1 = alloc() : memref<64x9xi32>
// CHECK-NEXT: %2 = alloc() : memref<2x2x3x3x16x1xi32>
// CHECK-NEXT: %1 = alloc() : memref<1x1xi32>
// CHECK-NEXT: %2 = alloc() : memref<1x2x3x3x16x1xi32>
// CHECK-NEXT: for %i0 = 0 to 64 {
// CHECK-NEXT: for %i1 = 0 to 9 {
// CHECK-NEXT: %3 = affine_apply #map0(%i0, %i1)
@ -684,16 +699,18 @@ func @R6_to_R2_reshape_square() -> memref<64x9xi32> {
// CHECK-NEXT: %6 = affine_apply #map3(%i0, %i1)
// CHECK-NEXT: %7 = affine_apply #map4(%i0, %i1)
// CHECK-NEXT: %8 = "foo"(%3, %4, %5, %6, %7, %c0) : (index, index, index, index, index, index) -> i32
// CHECK-NEXT: store %8, %2[%3, %4, %5, %6, %7, %c0] : memref<2x2x3x3x16x1xi32>
// CHECK-NEXT: %9 = affine_apply #map5(%i0)
// CHECK-NEXT: %10 = affine_apply #map5(%i1)
// CHECK-NEXT: %11 = affine_apply #map6(%9, %10)
// CHECK-NEXT: %12 = affine_apply #map7(%11)
// CHECK-NEXT: %13 = load %2[%12#0, %12#1, %12#2, %12#3, %12#4, %12#5] : memref<2x2x3x3x16x1xi32>
// CHECK-NEXT: store %13, %1[%9, %10] : memref<64x9xi32>
// CHECK-NEXT: %14 = load %1[%i0, %i1] : memref<64x9xi32>
// CHECK-NEXT: %15 = muli %14, %14 : i32
// CHECK-NEXT: store %15, %0[%i0, %i1] : memref<64x9xi32>
// CHECK-NEXT: %9 = affine_apply #map5(%i0, %i1, %3, %4, %5, %6, %7, %c0)
// CHECK-NEXT: store %8, %2[%9#0, %9#1, %9#2, %9#3, %9#4, %9#5] : memref<1x2x3x3x16x1xi32>
// CHECK-NEXT: %10 = affine_apply #map6(%i0, %i1)
// CHECK-NEXT: %11 = affine_apply #map7(%10)
// CHECK-NEXT: %12 = affine_apply #map5(%i0, %i1, %11#0, %11#1, %11#2, %11#3, %11#4, %11#5)
// CHECK-NEXT: %13 = load %2[%12#0, %12#1, %12#2, %12#3, %12#4, %12#5] : memref<1x2x3x3x16x1xi32>
// CHECK-NEXT: %14 = affine_apply #map8(%i0, %i1, %i0, %i1)
// CHECK-NEXT: store %13, %1[%14#0, %14#1] : memref<1x1xi32>
// CHECK-NEXT: %15 = affine_apply #map8(%i0, %i1, %i0, %i1)
// CHECK-NEXT: %16 = load %1[%15#0, %15#1] : memref<1x1xi32>
// CHECK-NEXT: %17 = muli %16, %16 : i32
// CHECK-NEXT: store %17, %0[%i0, %i1] : memref<64x9xi32>
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: return %0 : memref<64x9xi32>
@ -725,7 +742,7 @@ func @fuse_symbolic_bounds(%M : index, %N : index) {
}
// -----
// CHECK: #map0 = (d0) -> (d0)
// CHECK-DAG: #map0 = (d0, d1) -> (-d0 + d1)
// CHECK-LABEL: func @should_fuse_reduction_at_depth1
func @should_fuse_reduction_at_depth1() {
@ -753,18 +770,21 @@ func @should_fuse_reduction_at_depth1() {
// decrease the reduction memref size and possibly place it in a faster
// memory space.
// CHECK: for %i0 = 0 to 10 {
// CHECK-NEXT: %2 = affine_apply #map0(%i0)
// CHECK-NEXT: for %i1 = 0 to 100 {
// CHECK-NEXT: %3 = load %1[%2] : memref<10xf32>
// CHECK-NEXT: %4 = load %0[%2, %i1] : memref<10x100xf32>
// CHECK-NEXT: %2 = affine_apply #map0(%i0, %i0)
// CHECK-NEXT: %3 = load %1[%2] : memref<1xf32>
// CHECK-NEXT: %4 = load %0[%i0, %i1] : memref<10x100xf32>
// CHECK-NEXT: %5 = "maxf"(%3, %4) : (f32, f32) -> f32
// CHECK-NEXT: store %5, %1[%2] : memref<10xf32>
// CHECK-NEXT: %6 = affine_apply #map0(%i0, %i0)
// CHECK-NEXT: store %5, %1[%6] : memref<1xf32>
// CHECK-NEXT: }
// CHECK-NEXT: for %i2 = 0 to 100 {
// CHECK-NEXT: %6 = load %1[%i0] : memref<10xf32>
// CHECK-NEXT: %7 = load %0[%i0, %i2] : memref<10x100xf32>
// CHECK-NEXT: %8 = subf %7, %6 : f32
// CHECK-NEXT: store %8, %1[%i0] : memref<10xf32>
// CHECK-NEXT: %7 = affine_apply #map0(%i0, %i0)
// CHECK-NEXT: %8 = load %1[%7] : memref<1xf32>
// CHECK-NEXT: %9 = load %0[%i0, %i2] : memref<10x100xf32>
// CHECK-NEXT: %10 = subf %9, %8 : f32
// CHECK-NEXT: %11 = affine_apply #map0(%i0, %i0)
// CHECK-NEXT: store %10, %1[%11] : memref<1xf32>
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: return
@ -772,7 +792,7 @@ func @should_fuse_reduction_at_depth1() {
}
// -----
// CHECK: #map0 = (d0) -> (d0)
// CHECK: #map0 = (d0, d1, d2) -> (-d0 + d1, d2)
// CHECK-LABEL: func @should_fuse_at_src_depth1_and_dst_depth1
func @should_fuse_at_src_depth1_and_dst_depth1() {
@ -802,18 +822,19 @@ func @should_fuse_at_src_depth1_and_dst_depth1() {
// the fusion algorithm should detect that the source loop should be sliced
// at depth 1 and the slice should be inserted at depth 1.
// CHECK: for %i0 = 0 to 100 {
// CHECK-NEXT: %2 = affine_apply #map0(%i0)
// CHECK-NEXT: for %i1 = 0 to 16 {
// CHECK-NEXT: %3 = load %0[%2, %i1] : memref<100x16xf32>
// CHECK-NEXT: "op0"(%3) : (f32) -> ()
// CHECK-NEXT: %2 = load %0[%i0, %i1] : memref<100x16xf32>
// CHECK-NEXT: "op0"(%2) : (f32) -> ()
// CHECK-NEXT: }
// CHECK-NEXT: for %i2 = 0 to 16 {
// CHECK-NEXT: %4 = "op1"() : () -> f32
// CHECK-NEXT: store %4, %1[%2, %i2] : memref<100x16xf32>
// CHECK-NEXT: %3 = "op1"() : () -> f32
// CHECK-NEXT: %4 = affine_apply #map0(%i0, %i0, %i2)
// CHECK-NEXT: store %3, %1[%4#0, %4#1] : memref<1x16xf32>
// CHECK-NEXT: }
// CHECK-NEXT: for %i3 = 0 to 16 {
// CHECK-NEXT: %5 = load %1[%i0, %i3] : memref<100x16xf32>
// CHECK-NEXT: "op2"(%5) : (f32) -> ()
// CHECK-NEXT: %5 = affine_apply #map0(%i0, %i0, %i3)
// CHECK-NEXT: %6 = load %1[%5#0, %5#1] : memref<1x16xf32>
// CHECK-NEXT: "op2"(%6) : (f32) -> ()
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: return
@ -822,6 +843,7 @@ func @should_fuse_at_src_depth1_and_dst_depth1() {
// -----
// CHECK: #map0 = (d0, d1) -> (d0 * 10 + d1)
// CHECK: #map1 = (d0, d1, d2) -> (d0 * -10 - d1 + d2)
// CHECK-LABEL: func @should_fuse_src_depth1_at_dst_depth2
func @should_fuse_src_depth1_at_dst_depth2() {
@ -843,9 +865,11 @@ func @should_fuse_src_depth1_at_dst_depth2() {
// CHECK: for %i0 = 0 to 10 {
// CHECK-NEXT: for %i1 = 0 to 10 {
// CHECK-NEXT: %1 = affine_apply #map0(%i0, %i1)
// CHECK-NEXT: store %cst, %0[%1] : memref<100xf32>
// CHECK-NEXT: %2 = affine_apply #map0(%i0, %i1)
// CHECK-NEXT: %3 = load %0[%2] : memref<100xf32>
// CHECK-NEXT: %2 = affine_apply #map1(%i0, %i1, %1)
// CHECK-NEXT: store %cst, %0[%2] : memref<1xf32>
// CHECK-NEXT: %3 = affine_apply #map0(%i0, %i1)
// CHECK-NEXT: %4 = affine_apply #map1(%i0, %i1, %3)
// CHECK-NEXT: %5 = load %0[%4] : memref<1xf32>
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: return
@ -879,7 +903,8 @@ func @fusion_at_depth0_not_currently_supported() {
}
// -----
// CHECK: #map0 = (d0) -> (d0)
// CHECK-DAG: #map0 = (d0, d1, d2, d3, d4, d5, d6, d7, d8, d9) -> (-d0 + d4, -d1 + d5, -d2 + d6, -d3 + d7, d8, d9)
// CHECK-LABEL: func @should_fuse_deep_loop_nests
func @should_fuse_deep_loop_nests() {
@ -945,18 +970,15 @@ func @should_fuse_deep_loop_nests() {
// CHECK-NEXT: for %i1 = 0 to 3 {
// CHECK-NEXT: for %i2 = 0 to 2 {
// CHECK-NEXT: for %i3 = 0 to 2 {
// CHECK-NEXT: %3 = affine_apply #map0(%i2)
// CHECK-NEXT: %4 = affine_apply #map0(%i3)
// CHECK-NEXT: %5 = affine_apply #map0(%i0)
// CHECK-NEXT: %6 = affine_apply #map0(%i1)
// CHECK-NEXT: for %i4 = 0 to 16 {
// CHECK-NEXT: for %i5 = 0 to 10 {
// CHECK-NEXT: %7 = load %0[%3, %4, %5, %6, %i4, %i5] : memref<2x2x3x3x16x10xf32, 2>
// CHECK-NEXT: %3 = load %0[%i2, %i3, %i0, %i1, %i4, %i5] : memref<2x2x3x3x16x10xf32, 2>
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: for %i6 = 0 to 16 {
// CHECK-NEXT: for %i7 = 0 to 10 {
// CHECK-NEXT: store %cst, %2[%3, %4, %5, %6, %i6, %i7] : memref<2x2x3x3x16x10xf32, 2>
// CHECK-NEXT: %4 = affine_apply #map0(%i2, %i3, %i0, %i1, %i2, %i3, %i0, %i1, %i6, %i7)
// CHECK-NEXT: store %cst, %2[%4#0, %4#1, %4#2, %4#3, %4#4, %4#5] : memref<1x1x1x1x16x10xf32, 2>
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: for %i8 = 0 to 3 {
@ -965,12 +987,13 @@ func @should_fuse_deep_loop_nests() {
// CHECK-NEXT: for %i11 = 0 to 2 {
// CHECK-NEXT: for %i12 = 0 to 16 {
// CHECK-NEXT: for %i13 = 0 to 10 {
// CHECK-NEXT: %8 = load %0[%i10, %i11, %i8, %i9, %i12, %i13] : memref<2x2x3x3x16x10xf32, 2>
// CHECK-NEXT: %5 = load %0[%i10, %i11, %i8, %i9, %i12, %i13] : memref<2x2x3x3x16x10xf32, 2>
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: for %i14 = 0 to 16 {
// CHECK-NEXT: for %i15 = 0 to 10 {
// CHECK-NEXT: %9 = load %2[%i2, %i3, %i0, %i1, %i14, %i15] : memref<2x2x3x3x16x10xf32, 2>
// CHECK-NEXT: %6 = affine_apply #map0(%i2, %i3, %i0, %i1, %i2, %i3, %i0, %i1, %i14, %i15)
// CHECK-NEXT: %7 = load %2[%6#0, %6#1, %6#2, %6#3, %6#4, %6#5] : memref<1x1x1x1x16x10xf32, 2>
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: }
@ -986,7 +1009,7 @@ func @should_fuse_deep_loop_nests() {
}
// -----
// CHECK: #map0 = (d0) -> (d0)
// CHECK: #map0 = (d0, d1, d2) -> (-d0 + d1, d2)
// CHECK-LABEL: func @should_fuse_at_depth1_and_reduce_slice_trip_count
func @should_fuse_at_depth1_and_reduce_slice_trip_count() {
@ -1019,17 +1042,18 @@ func @should_fuse_at_depth1_and_reduce_slice_trip_count() {
// NOTE: the size of the private memref created for the fused loop nest
// is reduced from the original shape from 4x256 to 4x16 because of the
// data accessed by the load.
// CHECK: %1 = alloc() : memref<4x16xf32>
// CHECK: %1 = alloc() : memref<1x16xf32>
// CHECK-NEXT: for %i0 = 0 to 4 {
// CHECK-NEXT: %2 = affine_apply #map0(%i0)
// CHECK-NEXT: for %i1 = 0 to 256 {
// CHECK-NEXT: %3 = load %0[%2, %i1] : memref<4x256xf32>
// CHECK-NEXT: %2 = load %0[%i0, %i1] : memref<4x256xf32>
// CHECK-NEXT: }
// CHECK-NEXT: for %i2 = 0 to 16 {
// CHECK-NEXT: store %cst, %1[%2, %i2] : memref<4x16xf32>
// CHECK-NEXT: %3 = affine_apply #map0(%i0, %i0, %i2)
// CHECK-NEXT: store %cst, %1[%3#0, %3#1] : memref<1x16xf32>
// CHECK-NEXT: }
// CHECK-NEXT: for %i3 = 0 to 16 {
// CHECK-NEXT: %4 = load %1[%i0, %i3] : memref<4x16xf32>
// CHECK-NEXT: %4 = affine_apply #map0(%i0, %i0, %i3)
// CHECK-NEXT: %5 = load %1[%4#0, %4#1] : memref<1x16xf32>
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: return
@ -1120,7 +1144,7 @@ func @should_fuse_at_depth1_with_trip_count_19() {
// -----
// CHECK: #map0 = (d0) -> (d0)
// CHECK: [[MAP0:#map[0-9]+]] = (d0, d1) -> (-d0 + d1)
// CHECK-LABEL: func @should_fuse_with_private_memrefs_with_diff_shapes() {
func @should_fuse_with_private_memrefs_with_diff_shapes() {
@ -1138,17 +1162,19 @@ func @should_fuse_with_private_memrefs_with_diff_shapes() {
}
// Should create two new private memrefs customized to the shapes accessed
// by loops %i1 and %i2.
// CHECK: %0 = alloc() : memref<17xf32>
// CHECK: %0 = alloc() : memref<1xf32>
// CHECK-NEXT: for %i0 = 0 to 17 {
// CHECK-NEXT: %1 = affine_apply #map0(%i0)
// CHECK-NEXT: store %cst, %0[%1] : memref<17xf32>
// CHECK-NEXT: %2 = load %0[%i0] : memref<17xf32>
// CHECK-NEXT: %1 = affine_apply #map0(%i0, %i0)
// CHECK-NEXT: store %cst, %0[%1] : memref<1xf32>
// CHECK-NEXT: %2 = affine_apply #map0(%i0, %i0)
// CHECK-NEXT: %3 = load %0[%2] : memref<1xf32>
// CHECK-NEXT: }
// CHECK-NEXT: %3 = alloc() : memref<82xf32>
// CHECK-NEXT: %4 = alloc() : memref<1xf32>
// CHECK-NEXT: for %i1 = 0 to 82 {
// CHECK-NEXT: %4 = affine_apply #map0(%i1)
// CHECK-NEXT: store %cst, %3[%4] : memref<82xf32>
// CHECK-NEXT: %5 = load %3[%i1] : memref<82xf32>
// CHECK-NEXT: %5 = affine_apply #map0(%i1, %i1)
// CHECK-NEXT: store %cst, %4[%5] : memref<1xf32>
// CHECK-NEXT: %6 = affine_apply #map0(%i1, %i1)
// CHECK-NEXT: %7 = load %4[%6] : memref<1xf32>
// CHECK-NEXT: }
// CHECK-NEXT: return
return
@ -1156,7 +1182,7 @@ func @should_fuse_with_private_memrefs_with_diff_shapes() {
// -----
// CHECK: #map0 = (d0) -> (d0)
// CHECK: [[MAP0:#map[0-9]+]] = (d0, d1) -> (-d0 + d1)
// CHECK-LABEL: func @fusion_should_not_remove_memref_arg(%arg0: memref<10xf32>) {
func @fusion_should_not_remove_memref_arg(%arg0: memref<10xf32>) {
@ -1173,11 +1199,12 @@ func @fusion_should_not_remove_memref_arg(%arg0: memref<10xf32>) {
// CHECK: for %i0 = 0 to 10 {
// CHECK-NEXT: store %cst, %arg0[%i0] : memref<10xf32>
// CHECK-NEXT: }
// CHECK-NEXT: %0 = alloc() : memref<10xf32>
// CHECK-NEXT: %0 = alloc() : memref<1xf32>
// CHECK-NEXT: for %i1 = 0 to 10 {
// CHECK-NEXT: %1 = affine_apply #map0(%i1)
// CHECK-NEXT: store %cst, %0[%1] : memref<10xf32>
// CHECK-NEXT: %2 = load %0[%i1] : memref<10xf32>
// CHECK-NEXT: %1 = affine_apply [[MAP0]](%i1, %i1)
// CHECK-NEXT: store %cst, %0[%1] : memref<1xf32>
// CHECK-NEXT: %2 = affine_apply [[MAP0]](%i1, %i1)
// CHECK-NEXT: %3 = load %0[%2] : memref<1xf32>
// CHECK-NEXT: }
// CHECK-NEXT: return
return
@ -1185,7 +1212,7 @@ func @fusion_should_not_remove_memref_arg(%arg0: memref<10xf32>) {
// -----
// CHECK: #map0 = (d0) -> (d0)
// CHECK: [[MAP0:#map[0-9]+]] = (d0, d1) -> (-d0 + d1)
// CHECK-LABEL: func @fusion_should_not_remove_escaping_memref()
func @fusion_should_not_remove_escaping_memref() -> memref<10xf32> {
@ -1202,12 +1229,63 @@ func @fusion_should_not_remove_escaping_memref() -> memref<10xf32> {
// CHECK: for %i0 = 0 to 10 {
// CHECK-NEXT: store %cst, %0[%i0] : memref<10xf32>
// CHECK-NEXT: }
// CHECK-NEXT: %1 = alloc() : memref<10xf32>
// CHECK-NEXT: %1 = alloc() : memref<1xf32>
// CHECK-NEXT: for %i1 = 0 to 10 {
// CHECK-NEXT: %2 = affine_apply #map0(%i1)
// CHECK-NEXT: store %cst, %1[%2] : memref<10xf32>
// CHECK-NEXT: %3 = load %1[%i1] : memref<10xf32>
// CHECK-NEXT: %2 = affine_apply [[MAP0]](%i1, %i1)
// CHECK-NEXT: store %cst, %1[%2] : memref<1xf32>
// CHECK-NEXT: %3 = affine_apply [[MAP0]](%i1, %i1)
// CHECK-NEXT: %4 = load %1[%3] : memref<1xf32>
// CHECK-NEXT: }
// CHECK-NEXT: return %0 : memref<10xf32>
return %m : memref<10xf32>
}
// -----
// This should fuse with the %in becoming a 1x1x1.
func @R3_to_R2_reshape() {
%in = alloc() : memref<2x3x16xi32>
%c0 = constant 0 : index
for %i0 = 0 to 2 {
for %i1 = 0 to 3 {
for %i2 = 0 to 16 {
%val = "foo"(%i0, %i1, %i2) : (index, index, index) -> i32
store %val, %in[%i0, %i1, %i2] : memref<2x3x16xi32>
}
}
}
for %ii = 0 to 32 {
for %jj = 0 to 3 {
%a0 = affine_apply (d0, d1) -> (d0 * 3 + d1) (%ii, %jj)
%a1 = affine_apply (d0) -> (d0 floordiv (3 * 16)) (%a0)
%v = load %in[%a1#0, %jj, %c0]
: memref<2x3x16xi32>
}
}
return
}
// CHECK: #map0 = (d0, d1) -> ((d0 * 3 + d1) floordiv 48)
// CHECK-NEXT: #map1 = ()[s0] -> (s0)
// CHECK-NEXT: #map2 = (d0, d1, d2, d3, d4) -> (d2 - (d0 * 25 + d1 * 24) floordiv 24, -d1 + d3, d4)
// CHECK-NEXT: #map3 = (d0, d1) -> (d0 * 3 + d1)
// CHECK-NEXT: #map4 = (d0) -> (d0 floordiv 48)
// CHECK-LABEL: func @R3_to_R2_reshape()
// CHECK: %0 = alloc() : memref<1x1x1xi32>
// CHECK-NEXT: for %i0 = 0 to 32 {
// CHECK-NEXT: for %i1 = 0 to 3 {
// CHECK-NEXT: %1 = affine_apply #map0(%i0, %i1)
// CHECK-NEXT: %2 = affine_apply #map1()[%c0]
// CHECK-NEXT: %3 = "foo"(%1, %i1, %2) : (index, index, index) -> i32
// CHECK-NEXT: %4 = affine_apply #map2(%i0, %i1, %1, %i1, %2)
// CHECK-NEXT: store %3, %0[%4#0, %4#1, %4#2] : memref<1x1x1xi32>
// CHECK-NEXT: %5 = affine_apply #map3(%i0, %i1)
// CHECK-NEXT: %6 = affine_apply #map4(%5)
// CHECK-NEXT: %7 = affine_apply #map2(%i0, %i1, %6, %i1, %c0)
// CHECK-NEXT: %8 = load %0[%7#0, %7#1, %7#2] : memref<1x1x1xi32>
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: return
// CHECK-NEXT: }