Update replaceAllMemRefUsesWith to generate single result affine_apply's for index remapping

- generate a sequence of single result affine_apply's for the index remapping
  (instead of one multi result affine_apply)
- update dma-generate and loop-fusion test cases; while on this, change test
  cases to use single result affine apply ops
- some fusion comment fix/cleanup

PiperOrigin-RevId: 230985830
2019-01-25 16:00:50 -08:00 · 2019-01-25 16:00:50 -08:00 · b4a1443508
parent 629f5b7fcb
commit b4a1443508
4 changed files with 218 additions and 144 deletions
--- a/mlir/lib/Transforms/LoopFusion.cpp
+++ b/mlir/lib/Transforms/LoopFusion.cpp
@ -819,8 +819,9 @@ static uint64_t getSliceIterationCount(

 // Checks the profitability of fusing a backwards slice of the loop nest
 // surrounding 'srcOpInst' into the loop nest surrounding 'dstOpInsts'.
-// Returns true if it profitable to fuse the candidate loop nests. Returns
-// false otherwise.
+// Returns true if it is profitable to fuse the candidate loop nests. Returns
+// false otherwise. `dstLoopDepth` is set to the most profitable depth at which
+// to materialize the source loop nest slice.
 // The profitability model executes the following steps:
 // *) Computes the backward computation slice at 'srcOpInst'. This
 //    computation slice of the loop nest surrounding 'srcOpInst' is
@ -837,7 +838,7 @@ static uint64_t getSliceIterationCount(
 //    NOTE: If the dst loop nest includes multiple loads in 'dstOpInsts' for
 //    the same memref as is written by 'srcOpInst', then the union of slice
 //    loop bounds is used to compute the slice and associated slice cost.
-//    NOTE: 'dstLoopDepth' refers the loop depth within the destination loop
+//    NOTE: 'dstLoopDepth' refers to the loop depth within the destination loop
 //    nest, at which the src computation slice is inserted/fused.
 //    NOTE: We attempt to maximize the dst loop depth, but there are cases
 //    where a particular setting for 'dstLoopNest' might fuse an unsliced
@ -933,18 +934,17 @@ static bool isFusionProfitable(OperationInst *srcOpInst,
      // Compute slice bound union of 'tmpSliceState' and 'sliceStates[i - 1]'.
      getSliceUnion(tmpSliceState, &sliceStates[i - 1]);
    }
-    // Build trip count map for computation slice.
+    // Build trip count map for computation slice. We'll skip cases where the
+    // trip count was non-constant.
    sliceTripCountMap.clear();
    if (!buildSliceTripCountMap(srcOpInst, &sliceStates[i - 1],
                                &sliceTripCountMap))
-      // We'll skip cases where we the trip count was non-constant.
      continue;

    // Checks whether a store to load forwarding will happen.
    int64_t sliceIterationCount = getSliceIterationCount(sliceTripCountMap);
-    bool storeLoadFwdGuaranteed = (sliceIterationCount == 1);
-
    assert(sliceIterationCount > 0);
+    bool storeLoadFwdGuaranteed = (sliceIterationCount == 1);

    // Compute cost of fusion for this dest loop depth.

@ -1217,18 +1217,18 @@ public:
          if (mdg->hasDependenceTargetInRange(srcNode->id, dstNode->id, memref))
            continue;

-          // Check if fusion would be profitable.
+          // Check if fusion would be profitable and at what depth.
          // Get unique 'srcNode' store op.
          auto *srcStoreOpInst = srcNode->stores.front();
-          unsigned dstLoopDepth;
+          unsigned bestDstLoopDepth;
          mlir::ComputationSliceState sliceState;
          if (!isFusionProfitable(srcStoreOpInst, dstLoadOpInsts, &sliceState,
-                                  &dstLoopDepth))
+                                  &bestDstLoopDepth))
            continue;

          // Fuse computation slice of 'srcLoopNest' into 'dstLoopNest'.
          auto *sliceLoopNest = mlir::insertBackwardComputationSlice(
-              srcStoreOpInst, dstLoadOpInsts[0], dstLoopDepth, &sliceState);
+              srcStoreOpInst, dstLoadOpInsts[0], bestDstLoopDepth, &sliceState);
          if (sliceLoopNest != nullptr) {
            // Update edges between 'srcNode' and 'dstNode'.
            mdg->updateEdges(srcNode->id, dstNode->id);
@ -1250,7 +1250,7 @@ public:
            }
            assert(storesForMemref.size() == 1);
            auto *newMemRef = createPrivateMemRef(
-                dstForInst, storesForMemref[0], dstLoopDepth);
+                dstForInst, storesForMemref[0], bestDstLoopDepth);
            visitedMemrefs.insert(newMemRef);

            // Collect dst loop stats after memref privatization transformation.
--- a/mlir/lib/Transforms/Utils/Utils.cpp
+++ b/mlir/lib/Transforms/Utils/Utils.cpp
@ -128,11 +128,16 @@ bool mlir::replaceAllMemRefUsesWith(const Value *oldMemRef, Value *newMemRef,
                             oldMemRefRank);
    if (indexRemap &&
        indexRemap != builder.getMultiDimIdentityMap(indexRemap.getNumDims())) {
-      auto remapOp = builder.create<AffineApplyOp>(opInst->getLoc(), indexRemap,
-                                                   remapOperands);
+
      // Remapped indices.
-      state.operands.append(remapOp->getInstruction()->result_begin(),
-                            remapOp->getInstruction()->result_end());
+      for (auto resultExpr : indexRemap.getResults()) {
+        auto singleResMap =
+            builder.getAffineMap(indexRemap.getNumDims(),
+                                 indexRemap.getNumSymbols(), resultExpr, {});
+        auto afOp = builder.create<AffineApplyOp>(opInst->getLoc(),
+                                                  singleResMap, remapOperands);
+        state.operands.push_back(afOp->getResult(0));
+      }
    } else {
      // No remapping specified.
      state.operands.append(remapOperands.begin(), remapOperands.end());
--- a/mlir/test/Transforms/dma-generate.mlir
+++ b/mlir/test/Transforms/dma-generate.mlir
@ -1,12 +1,16 @@
 // RUN: mlir-opt %s -split-input-file -dma-generate -verify | FileCheck %s

 // Index of the buffer for the second DMA is remapped.
-// CHECK-DAG: [[MAP:#map[0-9]+]] = (d0) -> (d0 - 256)
+// CHECK-DAG: [[MAP_MINUS_256:#map[0-9]+]] = (d0) -> (d0 - 256)
+// CHECK-DAG: [[MAP_PLUS_256:#map[0-9]+]] = (d0) -> (d0 + 256)
 // CHECK-DAG: #map{{[0-9]+}} = (d0, d1) -> (d0 * 16 + d1)
-// CHECK-DAG: [[MAP_INDEX_DIFF:#map[0-9]+]] = (d0, d1, d2, d3) -> (d2 - d0, d3 - d1)
-// CHECK-DAG: [[MAP_MINUS_ONE:#map[0-9]+]] = (d0, d1) -> (d0 - 1, d1)
-// CHECK-DAG: [[MAP_ORIG_ACCESS:#map[0-9]+]] = (d0, d1)[s0, s1] -> (d0, d1 + s0 + s1)
-// CHECK-DAG: [[MAP_SUB_OFFSET:#map[0-9]+]] = (d0, d1, d2) -> (d1, d2 - (d0 + 9))
+// CHECK-DAG: [[MAP_INDEX_DIFF_EVEN:#map[0-9]+]] = (d0, d1, d2, d3) -> (d2 - d0)
+// CHECK-DAG: [[MAP_INDEX_DIFF_ODD:#map[0-9]+]] = (d0, d1, d2, d3) -> (d3 - d1)
+// CHECK-DAG: [[MAP_D0_MINUS_ONE:#map[0-9]+]] = (d0, d1) -> (d0 - 1)
+// CHECK-DAG: [[MAP_D1:#map[0-9]+]] = (d0, d1) -> (d1)
+// CHECK-DAG: [[MAP_SYM_SHIFT:#map[0-9]+]] = (d0, d1)[s0, s1] -> (d1 + s0 + s1)
+// CHECK-DAG: [[MAP_3D_D1:#map[0-9]+]] = (d0, d1, d2) -> (d1)
+// CHECK-DAG: [[MAP_SUB_OFFSET:#map[0-9]+]] = (d0, d1, d2) -> (d2 - (d0 + 9))

 // CHECK-LABEL: func @loop_nest_1d() {
 func @loop_nest_1d() {
@ -30,8 +34,8 @@ func @loop_nest_1d() {
  // CHECK-NEXT:  dma_wait %6[%c0], %c256_0 : memref<1xi32>
  // CHECK: for %i0 = 0 to 256 {
      // CHECK-NEXT: %7 = load %3[%i0] : memref<256xf32, 1>
-      // CHECK:      %8 = affine_apply #map{{[0-9]+}}(%i0)
-      // CHECK:      %9 = affine_apply [[MAP]](%8)
+      // CHECK:      %8 = affine_apply [[MAP_PLUS_256]](%i0)
+      // CHECK:      %9 = affine_apply [[MAP_MINUS_256]](%8)
      // CHECK-NEXT: %10 = load %5[%9] : memref<256xf32, 1>
      // Already in faster memory space.
      // CHECK:     %11 = load %2[%i0] : memref<256xf32, 1>
@ -171,8 +175,9 @@ func @loop_nest_tiled() -> memref<256x1024xf32> {
 // CHECK-NEXT:   for %i3 = #map
      for %i2 = (d0) -> (d0)(%i0) to (d0) -> (d0 + 32)(%i0) {
        for %i3 = (d0) -> (d0)(%i1) to (d0) -> (d0 + 32)(%i1) {
-          // CHECK:      %5 = affine_apply [[MAP_INDEX_DIFF]](%i0, %i1, %i2, %i3)
-          // CHECK-NEXT: %6 = load %3[%5#0, %5#1] : memref<32x32xf32, 1>
+          // CHECK-NEXT: %5 = affine_apply [[MAP_INDEX_DIFF_EVEN]](%i0, %i1, %i2, %i3)
+          // CHECK-NEXT: %6 = affine_apply [[MAP_INDEX_DIFF_ODD]](%i0, %i1, %i2, %i3)
+          // CHECK-NEXT: %7 = load %3[%5, %6] : memref<32x32xf32, 1>
          %1 = load %0[%i2, %i3] : memref<256x1024xf32>
        } // CHECK-NEXT: }
      }
@ -193,8 +198,9 @@ func @dma_constant_dim_access(%A : memref<100x100xf32>) {
  // CHECK-NEXT: dma_wait %1[%c0], %c100 : memref<1xi32>
  for %i = 0 to 100 {
    for %j = 0 to ()[s0] -> (s0) ()[%N] {
-      // CHECK:      %2 = affine_apply [[MAP_MINUS_ONE]](%c1_0, %i1)
-      // CHECK-NEXT: %3 = load %0[%2#0, %2#1] : memref<1x100xf32, 1>
+      // CHECK:      %2 = affine_apply [[MAP_D0_MINUS_ONE]](%c1_0, %i1)
+      // CHECK:      %3 = affine_apply [[MAP_D1]](%c1_0, %i1)
+      // CHECK-NEXT: %4 = load %0[%2, %3] : memref<1x100xf32, 1>
      load %A[%one, %j] : memref<100 x 100 x f32>
    }
  }
@ -206,8 +212,8 @@ func @dma_with_symbolic_accesses(%A : memref<100x100xf32>, %M : index) {
  %N = constant 9 : index
  for %i = 0 to 100 {
    for %j = 0 to 100 {
-      %idx = affine_apply (d0, d1) [s0, s1] -> (d0, d1 + s0 + s1)(%i, %j)[%M, %N]
-      load %A[%idx#0, %idx#1] : memref<100 x 100 x f32>
+      %idy = affine_apply (d0, d1) [s0, s1] -> (d1 + s0 + s1)(%i, %j)[%M, %N]
+      load %A[%i, %idy] : memref<100 x 100 x f32>
    }
  }
  return
@ -217,9 +223,10 @@ func @dma_with_symbolic_accesses(%A : memref<100x100xf32>, %M : index) {
 // CHECK-NEXT:  dma_wait %2[%c0], %c10000
 // CHECK-NEXT:  for %i0 = 0 to 100 {
 // CHECK-NEXT:    for %i1 = 0 to 100 {
-// CHECK-NEXT:      %3 = affine_apply [[MAP_ORIG_ACCESS]](%i0, %i1)[%arg1, %c9]
-// CHECK-NEXT:      %4 = affine_apply [[MAP_SUB_OFFSET]](%arg1, %3#0, %3#1)
-// CHECK-NEXT:      %5 = load %1[%4#0, %4#1] : memref<100x100xf32, 1>
+// CHECK-NEXT:      %3 = affine_apply [[MAP_SYM_SHIFT]](%i0, %i1)[%arg1, %c9]
+// CHECK-NEXT:      %4 = affine_apply [[MAP_3D_D1]](%arg1, %i0, %3)
+// CHECK-NEXT:      %5 = affine_apply [[MAP_SUB_OFFSET]](%arg1, %i0, %3)
+// CHECK-NEXT:      %6 = load %1[%4, %5] : memref<100x100xf32, 1>
 // CHECK-NEXT:    }
 // CHECK-NEXT:  }
 // CHECK-NEXT:  return
@ -236,8 +243,8 @@ func @dma_with_symbolic_loop_bounds(%A : memref<100x100xf32>, %M : index, %N: in
 // CHECK-NEXT:  dma_wait %1[%c0], %c10000 : memref<1xi32>
  for %i = 0 to 100 {
    for %j = %M to %N {
-      %idx = affine_apply (d0, d1) [s0] -> (d0, d1 + s0)(%i, %j)[%K]
-      load %A[%idx#0, %idx#1] : memref<100 x 100 x f32>
+      %idy = affine_apply (d1) [s0] -> (d1 + s0)(%j)[%K]
+      load %A[%i, %idy] : memref<100 x 100 x f32>
    }
  }
  return
@ -268,12 +275,14 @@ func @dma_memref_3d(%arg0: memref<1024x1024x1024xf32>) {
  for %i = 0 to 1024 {
    for %j = 0 to 1024 {
      for %k = 0 to 1024 {
-        %idx = affine_apply (d0, d1, d2) -> (d0 mod 128, d1 mod 128, d2 mod 128)(%i, %j, %k)
+        %idx = affine_apply (d0) -> (d0 mod 128)(%i)
+        %idy = affine_apply (d0) -> (d0 mod 128)(%j)
+        %idz = affine_apply (d0) -> (d0 mod 128)(%k)
        // DMA with nested striding (or emulating with loop around strided DMA)
        // not yet implemented.
-        // CHECK: %3 = load %arg0[%2#0, %2#1, %2#2] : memref<1024x1024x1024xf32>
-        %v = load %arg0[%idx#0, %idx#1, %idx#2] : memref<1024 x 1024 x 1024 x f32>
-        // expected-error@-8 {{DMA generation failed for one or more memref's}}
+        // CHECK: %5 = load %arg0[%2, %3, %4] : memref<1024x1024x1024xf32>
+        %v = load %arg0[%idx, %idy, %idz] : memref<1024 x 1024 x 1024 x f32>
+        // expected-error@-10 {{DMA generation failed for one or more memref's}}
      }
    }
  }
@ -285,8 +294,9 @@ func @dma_memref_3d(%arg0: memref<1024x1024x1024xf32>) {
 // CHECK:      #map0 = (d0) -> (d0 + 64)
 // CHECK-NEXT: #map1 = (d0) -> (d0 + 128)
 // CHECK-NEXT: #map2 = (d0) -> (d0 + 2)
-// CHECK-NEXT: #map3 = (d0, d1) -> (d0 - 2, d1 - 2)
-// CHECK-NEXT: #map4 = (d0) -> (d0 + 192)
+// CHECK-NEXT: #map3 = (d0, d1) -> (d0 - 2)
+// CHECK-NEXT: #map4 = (d0, d1) -> (d1 - 2)
+// CHECK-NEXT: #map5 = (d0) -> (d0 + 192)

 // The first load accesses ([2,258), [128,384))
 // The second load accesses ([64,320), [2,258))
@ -330,15 +340,19 @@ func @multi_load_store_union() {
 // CHECK-NEXT:      %6 = affine_apply #map2(%i0)
 // CHECK-NEXT:      %7 = affine_apply #map2(%i1)
 // CHECK-NEXT:      %8 = affine_apply #map3(%6, %5)
-// CHECK-NEXT:      %9 = load %1[%8#0, %8#1] : memref<382x446xf32, 1>
-// CHECK-NEXT:      %10 = affine_apply #map3(%4, %7)
-// CHECK-NEXT:      %11 = load %1[%10#0, %10#1] : memref<382x446xf32, 1>
-// CHECK-NEXT:      %12 = affine_apply #map1(%i0)
-// CHECK-NEXT:      %13 = affine_apply #map4(%i1)
-// CHECK-NEXT:      %14 = affine_apply #map3(%6, %13)
-// CHECK-NEXT:      store %9, %1[%14#0, %14#1] : memref<382x446xf32, 1>
-// CHECK-NEXT:      %15 = affine_apply #map3(%12, %7)
-// CHECK-NEXT:      store %11, %1[%15#0, %15#1] : memref<382x446xf32, 1>
+// CHECK-NEXT:      %9 = affine_apply #map4(%6, %5)
+// CHECK-NEXT:      %10 = load %1[%8, %9] : memref<382x446xf32, 1>
+// CHECK-NEXT:      %11 = affine_apply #map3(%4, %7)
+// CHECK-NEXT:      %12 = affine_apply #map4(%4, %7)
+// CHECK-NEXT:      %13 = load %1[%11, %12] : memref<382x446xf32, 1>
+// CHECK-NEXT:      %14 = affine_apply #map1(%i0)
+// CHECK-NEXT:      %15 = affine_apply #map5(%i1)
+// CHECK-NEXT:      %16 = affine_apply #map3(%6, %15)
+// CHECK-NEXT:      %17 = affine_apply #map4(%6, %15)
+// CHECK-NEXT:      store %10, %1[%16, %17] : memref<382x446xf32, 1>
+// CHECK-NEXT:      %18 = affine_apply #map3(%14, %7)
+// CHECK-NEXT:      %19 = affine_apply #map4(%14, %7)
+// CHECK-NEXT:      store %13, %1[%18, %19] : memref<382x446xf32, 1>
 // CHECK-NEXT:    }
 // CHECK-NEXT:  }
 // CHECK-NEXT:  dma_start %1[%c0, %c0], %0[%c2, %c2_0], %c170372, %3[%c0], %c512, %c446 : memref<382x446xf32, 1>, memref<512x512xf32>, memref<1xi32>
--- a/mlir/test/Transforms/loop-fusion.mlir
+++ b/mlir/test/Transforms/loop-fusion.mlir
@ -79,8 +79,9 @@ func @should_fuse_reduction_to_pointwise() {
 // -----

 // CHECK-DAG: [[MAP_SHIFT_MINUS_ONE_R1:#map[0-9]+]] = (d0) -> (d0 - 1)
-// CHECK-DAG: [[MAP_SHIFT_BY_ONE:#map[0-9]+]] = (d0, d1) -> (d0 + 1, d1 + 1)
-// CHECK-DAG: [[MAP_SHIFT_MINUS_IV_R2:#map[0-9]+]] = (d0, d1, d2, d3) -> (-d0 + d2, -d1 + d3)
+// CHECK-DAG: [[MAP_SHIFT_BY_ONE:#map[0-9]+]] = (d0) -> (d0 + 1)
+// CHECK-DAG: [[MAP_SHIFT_MINUS_IV_R2_EVEN:#map[0-9]+]] = (d0, d1, d2, d3) -> (-d0 + d2)
+// CHECK-DAG: [[MAP_SHIFT_MINUS_IV_R2_ODD:#map[0-9]+]] = (d0, d1, d2, d3) -> (-d1 + d3)

 // CHECK-LABEL: func @should_fuse_loop_nests_with_shifts() {
 func @should_fuse_loop_nests_with_shifts() {
@ -89,8 +90,9 @@ func @should_fuse_loop_nests_with_shifts() {

  for %i0 = 0 to 9 {
    for %i1 = 0 to 9 {
-      %a0 = affine_apply (d0, d1) -> (d0 + 1, d1 + 1) (%i0, %i1)
-      store %cf7, %a[%a0#0, %a0#1] : memref<10x10xf32>
+      %idx = affine_apply (d0) -> (d0 + 1) (%i0)
+      %idy = affine_apply (d0) -> (d0 + 1) (%i1)
+      store %cf7, %a[%idx, %idy] : memref<10x10xf32>
    }
  }
  for %i2 = 1 to 10 {
@ -112,11 +114,14 @@ func @should_fuse_loop_nests_with_shifts() {
  // CHECK-NEXT:   for %i1 = 1 to 10 {
  // CHECK-NEXT:     %1 = affine_apply [[MAP_SHIFT_MINUS_ONE_R1]](%i0)
  // CHECK-NEXT:     %2 = affine_apply [[MAP_SHIFT_MINUS_ONE_R1]](%i1)
-  // CHECK-NEXT:     %3 = affine_apply [[MAP_SHIFT_BY_ONE]](%1, %2)
-  // CHECK-NEXT:     %4 = affine_apply [[MAP_SHIFT_MINUS_IV_R2]](%i0, %i1, %3#0, %3#1)
-  // CHECK-NEXT:     store %cst, %0[%4#0, %4#1] : memref<1x1xf32>
-  // CHECK-NEXT:     %5 = affine_apply [[MAP_SHIFT_MINUS_IV_R2]](%i0, %i1, %i0, %i1)
-  // CHECK-NEXT:     %6 = load %0[%5#0, %5#1] : memref<1x1xf32>
+  // CHECK-NEXT:     %3 = affine_apply [[MAP_SHIFT_BY_ONE]](%1)
+  // CHECK-NEXT:     %4 = affine_apply [[MAP_SHIFT_BY_ONE]](%2)
+  // CHECK-NEXT:     %5 = affine_apply [[MAP_SHIFT_MINUS_IV_R2_EVEN]](%i0, %i1, %3, %4)
+  // CHECK-NEXT:     %6 = affine_apply [[MAP_SHIFT_MINUS_IV_R2_ODD]](%i0, %i1, %3, %4)
+  // CHECK-NEXT:     store %cst, %0[%5, %6] : memref<1x1xf32>
+  // CHECK-NEXT:     %7 = affine_apply [[MAP_SHIFT_MINUS_IV_R2_EVEN]](%i0, %i1, %i0, %i1)
+  // CHECK-NEXT:     %8 = affine_apply [[MAP_SHIFT_MINUS_IV_R2_ODD]](%i0, %i1, %i0, %i1)
+  // CHECK-NEXT:     %9 = load %0[%7, %8] : memref<1x1xf32>
  // CHECK-NEXT:   }
  // CHECK-NEXT: }
  // CHECK-NEXT: return
@ -125,7 +130,8 @@ func @should_fuse_loop_nests_with_shifts() {

 // -----

-// CHECK-DAG: [[MAP0:#map[0-9]+]] = (d0, d1, d2, d3) -> (-d0 + d2, -d1 + d3)
+// CHECK-DAG: [[MAP_D2_D0_DIFF:#map[0-9]+]] = (d0, d1, d2, d3) -> (-d0 + d2)
+// CHECK-DAG: [[MAP_D3_D1_DIFF:#map[0-9]+]] = (d0, d1, d2, d3) -> (-d1 + d3)

 // CHECK-LABEL: func @should_fuse_loop_nest() {
 func @should_fuse_loop_nest() {
@ -154,14 +160,18 @@ func @should_fuse_loop_nest() {
  // CHECK-NEXT: [[NEWA:%[0-9]+]] = alloc() : memref<1x1xf32>
  // CHECK-NEXT: for %i0 = 0 to 10 {
  // CHECK-NEXT:   for %i1 = 0 to 10 {
-  // CHECK-NEXT:     %2 = affine_apply [[MAP0]](%i1, %i0, %i1, %i0)
-  // CHECK-NEXT:     store %cst, [[NEWA]][%2#0, %2#1] : memref<1x1xf32>
-  // CHECK-NEXT:     %3 = affine_apply [[MAP0]](%i1, %i0, %i1, %i0)
-  // CHECK-NEXT:     %4 = load [[NEWA]][%3#0, %3#1] : memref<1x1xf32>
-  // CHECK-NEXT:     %5 = affine_apply [[MAP0]](%i0, %i1, %i0, %i1)
-  // CHECK-NEXT:     store %4, [[NEWB]][%5#0, %5#1] : memref<1x1xf32>
-  // CHECK-NEXT:     %6 = affine_apply [[MAP0]](%i0, %i1, %i0, %i1)
-  // CHECK-NEXT:     %7 = load [[NEWB]][%6#0, %6#1] : memref<1x1xf32>
+  // CHECK-NEXT:     %2 = affine_apply [[MAP_D2_D0_DIFF]](%i1, %i0, %i1, %i0)
+  // CHECK-NEXT:     %3 = affine_apply [[MAP_D3_D1_DIFF]](%i1, %i0, %i1, %i0)
+  // CHECK-NEXT:     store %cst, [[NEWA]][%2, %3] : memref<1x1xf32>
+  // CHECK-NEXT:     %4 = affine_apply [[MAP_D2_D0_DIFF]](%i1, %i0, %i1, %i0)
+  // CHECK-NEXT:     %5 = affine_apply [[MAP_D3_D1_DIFF]](%i1, %i0, %i1, %i0)
+  // CHECK-NEXT:     %6 = load [[NEWA]][%4, %5] : memref<1x1xf32>
+  // CHECK-NEXT:     %7 = affine_apply [[MAP_D2_D0_DIFF]](%i0, %i1, %i0, %i1)
+  // CHECK-NEXT:     %8 = affine_apply [[MAP_D3_D1_DIFF]](%i0, %i1, %i0, %i1)
+  // CHECK-NEXT:     store %6, [[NEWB]][%7, %8] : memref<1x1xf32>
+  // CHECK-NEXT:     %9 = affine_apply [[MAP_D2_D0_DIFF]](%i0, %i1, %i0, %i1)
+  // CHECK-NEXT:     %10 = affine_apply [[MAP_D3_D1_DIFF]](%i0, %i1, %i0, %i1)
+  // CHECK-NEXT:     %11 = load [[NEWB]][%9, %10] : memref<1x1xf32>
  // CHECK-NEXT:   }
  // CHECK-NEXT: }
  // CHECK-NEXT: return
@ -516,42 +526,42 @@ func @should_not_fuse_if_inst_in_loop_nest() {

 // -----

-// CHECK: [[MAP0:#map[0-9]+]] = (d0, d1, d2) -> (d0, d1, d2)
-// CHECK: [[MAP1:#map[0-9]+]] = (d0, d1, d2, d3, d4, d5) -> (-d0 + d3, -d1 + d4, -d2 + d5)
-// CHECK: [[MAP_PERMUTE:#map[0-9]+]] = (d0, d1, d2) -> (d1, d2, d0)
+// CHECK: [[MAP0:#map[0-9]+]] = (d0, d1, d2, d3, d4, d5) -> (-d0 + d3)
+// CHECK: [[MAP1:#map[0-9]+]] = (d0, d1, d2, d3, d4, d5) -> (-d1 + d4)
+// CHECK: [[MAP2:#map[0-9]+]] = (d0, d1, d2, d3, d4, d5) -> (-d2 + d5)

-#map0 = (d0, d1, d2) -> (d0, d1, d2)
-
-// CHECK-LABEL: func @remap_ivs() {
-func @remap_ivs() {
+// CHECK-LABEL: func @permute_and_fuse() {
+func @permute_and_fuse() {
  %m = alloc() : memref<10x20x30xf32>

  %cf7 = constant 7.0 : f32
  for %i0 = 0 to 10 {
    for %i1 = 0 to 20 {
      for %i2 = 0 to 30 {
-        %a0 = affine_apply (d0, d1, d2) -> (d0, d1, d2) (%i0, %i1, %i2)
-        store %cf7, %m[%a0#0, %a0#1, %a0#2] : memref<10x20x30xf32>
+        store %cf7, %m[%i0, %i1, %i2] : memref<10x20x30xf32>
      }
    }
  }
  for %i3 = 0 to 30 {
    for %i4 = 0 to 10 {
      for %i5 = 0 to 20 {
-        %a1 = affine_apply (d0, d1, d2) -> (d1, d2, d0) (%i3, %i4, %i5)
-        %v0 = load %m[%a1#0, %a1#1, %a1#2] : memref<10x20x30xf32>
+        %v0 = load %m[%i4, %i5, %i3] : memref<10x20x30xf32>
+        "foo"(%v0) : (f32) -> ()
      }
    }
  }
 // CHECK:       for %i0 = 0 to 30 {
 // CHECK-NEXT:    for %i1 = 0 to 10 {
 // CHECK-NEXT:      for %i2 = 0 to 20 {
-// CHECK-NEXT:        %1 = affine_apply [[MAP0]](%i1, %i2, %i0)
-// CHECK-NEXT:        %2 = affine_apply [[MAP1]](%i1, %i2, %i0, %1#0, %1#1, %1#2)
-// CHECK-NEXT:        store %cst, %0[%2#0, %2#1, %2#2] : memref<1x1x1xf32>
-// CHECK-NEXT:        %3 = affine_apply [[MAP_PERMUTE]](%i0, %i1, %i2)
-// CHECK-NEXT:        %4 = affine_apply [[MAP1]](%i1, %i2, %i0, %3#0, %3#1, %3#2)
-// CHECK-NEXT:        %5 = load %0[%4#0, %4#1, %4#2] : memref<1x1x1xf32>
+// CHECK-NEXT:        %1 = affine_apply [[MAP0]](%i1, %i2, %i0, %i1, %i2, %i0)
+// CHECK-NEXT:        %2 = affine_apply [[MAP1]](%i1, %i2, %i0, %i1, %i2, %i0)
+// CHECK-NEXT:        %3 = affine_apply [[MAP2]](%i1, %i2, %i0, %i1, %i2, %i0)
+// CHECK-NEXT:        store %cst, %0[%1, %2, %3] : memref<1x1x1xf32>
+// CHECK-NEXT:        %4 = affine_apply [[MAP0]](%i1, %i2, %i0, %i1, %i2, %i0)
+// CHECK-NEXT:        %5 = affine_apply [[MAP1]](%i1, %i2, %i0, %i1, %i2, %i0)
+// CHECK-NEXT:        %6 = affine_apply [[MAP2]](%i1, %i2, %i0, %i1, %i2, %i0)
+// CHECK-NEXT:        %7 = load %0[%4, %5, %6] : memref<1x1x1xf32>
+// CHECK-NEXT:        "foo"(%7) : (f32) -> ()
 // CHECK-NEXT:      }
 // CHECK-NEXT:    }
 // CHECK-NEXT:  }
@ -563,7 +573,8 @@ func @remap_ivs() {
 // -----

 // CHECK-DAG: #map0 = (d0, d1) -> (d0 * 4 + d1)
-// CHECK-DAG: #map1 = (d0) -> (d0 floordiv 4, d0 mod 4)
+// CHECK-DAG: #map1 = (d0) -> (d0 floordiv 4)
+// CHECK-DAG: #map2 = (d0) -> (d0 mod 4)

 // Reshape from a 64 x f32 to 16 x 4 x f32.
 // CHECK-LABEL: func @fuse_reshape_64_16_4
@ -572,8 +583,9 @@ func @fuse_reshape_64_16_4(%in : memref<64xf32>) {

  for %i0 = 0 to 64 {
    %v = load %in[%i0] : memref<64xf32>
-    %idx = affine_apply (d0) -> (d0 floordiv 4, d0 mod 4) (%i0)
-    store %v, %out[%idx#0, %idx#1] : memref<16x4xf32>
+    %idx = affine_apply (d0) -> (d0 floordiv 4) (%i0)
+    %idy = affine_apply (d0) -> (d0 mod 4) (%i0)
+    store %v, %out[%idx, %idy] : memref<16x4xf32>
  }

  for %i1 = 0 to 16 {
@ -661,16 +673,13 @@ func @R6_to_R2_reshape_square() -> memref<64x9xi32> {
    for %jj = 0 to 9 {
      // Convert output coordinates to linear index.
      %a0 = affine_apply (d0, d1) -> (d0 * 9 + d1) (%ii, %jj)
-      %a1 = affine_apply (d0) -> (
-          d0 floordiv (2 * 3 * 3 * 16 * 1),
-          (d0 mod 288) floordiv (3 * 3 * 16 * 1),
-          ((d0 mod 288) mod 144) floordiv (3 * 16 * 1),
-          (((d0 mod 288) mod 144) mod 48) floordiv (16 * 1),
-          ((((d0 mod 288) mod 144) mod 48) mod 16),
-          ((((d0 mod 144) mod 144) mod 48) mod 16) mod 1
-        ) (%a0)
-      %v = load %in[%a1#0, %a1#1, %a1#2, %a1#3, %a1#4, %a1#5]
-        : memref<2x2x3x3x16x1xi32>
+      %0 = affine_apply (d0) -> (d0 floordiv (2 * 3 * 3 * 16 * 1))(%a0)
+      %1 = affine_apply (d0) -> ((d0 mod 288) floordiv (3 * 3 * 16 * 1))(%a0)
+      %2 = affine_apply (d0) -> (((d0 mod 288) mod 144) floordiv (3 * 16 * 1))(%a0)
+      %3 = affine_apply (d0) -> ((((d0 mod 288) mod 144) mod 48) floordiv (16 * 1))(%a0)
+      %4 = affine_apply (d0) -> ((((d0 mod 288) mod 144) mod 48) mod 16)(%a0)
+      %5 = affine_apply (d0) -> (((((d0 mod 144) mod 144) mod 48) mod 16) mod 1)(%a0)
+      %v = load %in[%0, %1, %2, %3, %4, %5] : memref<2x2x3x3x16x1xi32>
      store %v, %out[%ii, %jj] : memref<64x9xi32>
    }
  }
@ -699,17 +708,34 @@ func @R6_to_R2_reshape_square() -> memref<64x9xi32> {
 // CHECK-NEXT:      %7 = affine_apply #map4(%i0, %i1)
 // CHECK-NEXT:      %8 = "foo"(%3, %4, %5, %6, %7, %c0) : (index, index, index, index, index, index) -> i32
 // CHECK-NEXT:      %9 = affine_apply #map5(%i0, %i1, %3, %4, %5, %6, %7, %c0)
-// CHECK-NEXT:      store %8, %2[%9#0, %9#1, %9#2, %9#3, %9#4, %9#5] : memref<1x2x3x3x16x1xi32>
-// CHECK-NEXT:      %10 = affine_apply #map6(%i0, %i1)
-// CHECK-NEXT:      %11 = affine_apply #map7(%10)
-// CHECK-NEXT:      %12 = affine_apply #map5(%i0, %i1, %11#0, %11#1, %11#2, %11#3, %11#4, %11#5)
-// CHECK-NEXT:      %13 = load %2[%12#0, %12#1, %12#2, %12#3, %12#4, %12#5] : memref<1x2x3x3x16x1xi32>
-// CHECK-NEXT:      %14 = affine_apply #map8(%i0, %i1, %i0, %i1)
-// CHECK-NEXT:      store %13, %1[%14#0, %14#1] : memref<1x1xi32>
-// CHECK-NEXT:      %15 = affine_apply #map8(%i0, %i1, %i0, %i1)
-// CHECK-NEXT:      %16 = load %1[%15#0, %15#1] : memref<1x1xi32>
-// CHECK-NEXT:      %17 = muli %16, %16 : i32
-// CHECK-NEXT:      store %17, %0[%i0, %i1] : memref<64x9xi32>
+// CHECK-NEXT:      %10 = affine_apply #map6(%i0, %i1, %3, %4, %5, %6, %7, %c0)
+// CHECK-NEXT:      %11 = affine_apply #map7(%i0, %i1, %3, %4, %5, %6, %7, %c0)
+// CHECK-NEXT:      %12 = affine_apply #map8(%i0, %i1, %3, %4, %5, %6, %7, %c0)
+// CHECK-NEXT:      %13 = affine_apply #map9(%i0, %i1, %3, %4, %5, %6, %7, %c0)
+// CHECK-NEXT:      %14 = affine_apply #map10(%i0, %i1, %3, %4, %5, %6, %7, %c0)
+// CHECK-NEXT:      store %8, %2[%9, %10, %11, %12, %13, %14] : memref<1x2x3x3x16x1xi32>
+// CHECK-NEXT:      %15 = affine_apply #map11(%i0, %i1)
+// CHECK-NEXT:      %16 = affine_apply #map12(%15)
+// CHECK-NEXT:      %17 = affine_apply #map13(%15)
+// CHECK-NEXT:      %18 = affine_apply #map14(%15)
+// CHECK-NEXT:      %19 = affine_apply #map15(%15)
+// CHECK-NEXT:      %20 = affine_apply #map16(%15)
+// CHECK-NEXT:      %21 = affine_apply #map17(%15)
+// CHECK-NEXT:      %22 = affine_apply #map5(%i0, %i1, %16, %17, %18, %19, %20, %21)
+// CHECK-NEXT:      %23 = affine_apply #map6(%i0, %i1, %16, %17, %18, %19, %20, %21)
+// CHECK-NEXT:      %24 = affine_apply #map7(%i0, %i1, %16, %17, %18, %19, %20, %21)
+// CHECK-NEXT:      %25 = affine_apply #map8(%i0, %i1, %16, %17, %18, %19, %20, %21)
+// CHECK-NEXT:      %26 = affine_apply #map9(%i0, %i1, %16, %17, %18, %19, %20, %21)
+// CHECK-NEXT:      %27 = affine_apply #map10(%i0, %i1, %16, %17, %18, %19, %20, %21)
+// CHECK-NEXT:      %28 = load %2[%22, %23, %24, %25, %26, %27] : memref<1x2x3x3x16x1xi32>
+// CHECK-NEXT:      %29 = affine_apply #map18(%i0, %i1, %i0, %i1)
+// CHECK-NEXT:      %30 = affine_apply #map19(%i0, %i1, %i0, %i1)
+// CHECK-NEXT:      store %28, %1[%29, %30] : memref<1x1xi32>
+// CHECK-NEXT:      %31 = affine_apply #map18(%i0, %i1, %i0, %i1)
+// CHECK-NEXT:      %32 = affine_apply #map19(%i0, %i1, %i0, %i1)
+// CHECK-NEXT:      %33 = load %1[%31, %32] : memref<1x1xi32>
+// CHECK-NEXT:      %34 = muli %33, %33 : i32
+// CHECK-NEXT:      store %34, %0[%i0, %i1] : memref<64x9xi32>
 // CHECK-NEXT:    }
 // CHECK-NEXT:  }
 // CHECK-NEXT:  return %0 : memref<64x9xi32>
@ -732,8 +758,8 @@ func @fuse_symbolic_bounds(%M : index, %N : index) {

  for %i2 = 0 to %M {
    for %i3 = 0 to %N {
-      %idx = affine_apply (d0, d1)[s0] -> (d0, d1 + s0) (%i2, %i3)[%s]
-      %v = load %m[%idx#0, %idx#1] : memref<? x ? x f32>
+      %idy = affine_apply (d0)[s0] -> (d0 + s0) (%i3)[%s]
+      %v = load %m[%i2, %idy] : memref<? x ? x f32>
    }
  }

@ -791,7 +817,8 @@ func @should_fuse_reduction_at_depth1() {
 }

 // -----
-// CHECK: #map0 = (d0, d1, d2) -> (-d0 + d1, d2)
+// CHECK: #map0 = (d0, d1, d2) -> (-d0 + d1)
+// CHECK: #map1 = (d0, d1, d2) -> (d2)

 // CHECK-LABEL: func @should_fuse_at_src_depth1_and_dst_depth1
 func @should_fuse_at_src_depth1_and_dst_depth1() {
@ -828,12 +855,14 @@ func @should_fuse_at_src_depth1_and_dst_depth1() {
  // CHECK-NEXT:    for %i2 = 0 to 16 {
  // CHECK-NEXT:      %3 = "op1"() : () -> f32
  // CHECK-NEXT:      %4 = affine_apply #map0(%i0, %i0, %i2)
-  // CHECK-NEXT:      store %3, %1[%4#0, %4#1] : memref<1x16xf32>
+  // CHECK-NEXT:      %5 = affine_apply #map1(%i0, %i0, %i2)
+  // CHECK-NEXT:      store %3, %1[%4, %5] : memref<1x16xf32>
  // CHECK-NEXT:    }
  // CHECK-NEXT:    for %i3 = 0 to 16 {
-  // CHECK-NEXT:      %5 = affine_apply #map0(%i0, %i0, %i3)
-  // CHECK-NEXT:      %6 = load %1[%5#0, %5#1] : memref<1x16xf32>
-  // CHECK-NEXT:      "op2"(%6) : (f32) -> ()
+  // CHECK-NEXT:      %6 = affine_apply #map0(%i0, %i0, %i3)
+  // CHECK-NEXT:      %7 = affine_apply #map1(%i0, %i0, %i3)
+  // CHECK-NEXT:      %8 = load %1[%6, %7] : memref<1x16xf32>
+  // CHECK-NEXT:      "op2"(%8) : (f32) -> ()
  // CHECK-NEXT:    }
  // CHECK-NEXT:  }
  // CHECK-NEXT:  return
@ -903,7 +932,12 @@ func @fusion_at_depth0_not_currently_supported() {

 // -----

-// CHECK-DAG: #map0 = (d0, d1, d2, d3, d4, d5, d6, d7, d8, d9) -> (-d0 + d4, -d1 + d5, -d2 + d6, -d3 + d7, d8, d9)
+// CHECK: #map0 = (d0, d1, d2, d3, d4, d5, d6, d7, d8, d9) -> (-d0 + d4)
+// CHECK: #map1 = (d0, d1, d2, d3, d4, d5, d6, d7, d8, d9) -> (-d1 + d5)
+// CHECK: #map2 = (d0, d1, d2, d3, d4, d5, d6, d7, d8, d9) -> (-d2 + d6)
+// CHECK: #map3 = (d0, d1, d2, d3, d4, d5, d6, d7, d8, d9) -> (-d3 + d7)
+// CHECK: #map4 = (d0, d1, d2, d3, d4, d5, d6, d7, d8, d9) -> (d8)
+// CHECK: #map5 = (d0, d1, d2, d3, d4, d5, d6, d7, d8, d9) -> (d9)

 // CHECK-LABEL: func @should_fuse_deep_loop_nests
 func @should_fuse_deep_loop_nests() {
@ -965,7 +999,9 @@ func @should_fuse_deep_loop_nests() {
 // The first four loops of the source loop nest can be sliced with iteration
 // bounds which are a function of the first four loops of destination loop nest,
 // where the destination loops nests have been interchanged.
-// CHECK:       for %i0 = 0 to 3 {
+
+// CHECK:       %2 = alloc() : memref<1x1x1x1x16x10xf32, 2>
+// CHECK-NEXT:  for %i0 = 0 to 3 {
 // CHECK-NEXT:    for %i1 = 0 to 3 {
 // CHECK-NEXT:      for %i2 = 0 to 2 {
 // CHECK-NEXT:        for %i3 = 0 to 2 {
@ -979,20 +1015,30 @@ func @should_fuse_deep_loop_nests() {
 // CHECK-NEXT:              for %i8 = 0 to 16 {
 // CHECK-NEXT:                for %i9 = 0 to 10 {
 // CHECK-NEXT:                  %4 = affine_apply #map0(%i2, %i3, %i0, %i1, %i2, %i3, %i0, %i1, %i8, %i9)
-// CHECK-NEXT:                  store %cst, %2[%4#0, %4#1, %4#2, %4#3, %4#4, %4#5] : memref<1x1x1x1x16x10xf32, 2>
+// CHECK-NEXT:                  %5 = affine_apply #map1(%i2, %i3, %i0, %i1, %i2, %i3, %i0, %i1, %i8, %i9)
+// CHECK-NEXT:                  %6 = affine_apply #map2(%i2, %i3, %i0, %i1, %i2, %i3, %i0, %i1, %i8, %i9)
+// CHECK-NEXT:                  %7 = affine_apply #map3(%i2, %i3, %i0, %i1, %i2, %i3, %i0, %i1, %i8, %i9)
+// CHECK-NEXT:                  %8 = affine_apply #map4(%i2, %i3, %i0, %i1, %i2, %i3, %i0, %i1, %i8, %i9)
+// CHECK-NEXT:                  %9 = affine_apply #map5(%i2, %i3, %i0, %i1, %i2, %i3, %i0, %i1, %i8, %i9)
+// CHECK-NEXT:                  store %cst, %2[%4, %5, %6, %7, %8, %9] : memref<1x1x1x1x16x10xf32, 2>
 // CHECK-NEXT:                }
 // CHECK-NEXT:              }
 // CHECK-NEXT:              for %i10 = 0 to 2 {
 // CHECK-NEXT:                for %i11 = 0 to 2 {
 // CHECK-NEXT:                  for %i12 = 0 to 16 {
 // CHECK-NEXT:                    for %i13 = 0 to 10 {
-// CHECK-NEXT:                      %5 = load %0[%i10, %i11, %i4, %i5, %i12, %i13] : memref<2x2x3x3x16x10xf32, 2>
+// CHECK-NEXT:                      %10 = load %0[%i10, %i11, %i4, %i5, %i12, %i13] : memref<2x2x3x3x16x10xf32, 2>
 // CHECK-NEXT:                    }
 // CHECK-NEXT:                  }
 // CHECK-NEXT:                  for %i14 = 0 to 16 {
 // CHECK-NEXT:                    for %i15 = 0 to 10 {
-// CHECK-NEXT:                      %6 = affine_apply #map0(%i2, %i3, %i0, %i1, %i2, %i3, %i0, %i1, %i14, %i15)
-// CHECK-NEXT:                      %7 = load %2[%6#0, %6#1, %6#2, %6#3, %6#4, %6#5] : memref<1x1x1x1x16x10xf32, 2>
+// CHECK-NEXT:                      %11 = affine_apply #map0(%i2, %i3, %i0, %i1, %i2, %i3, %i0, %i1, %i14, %i15)
+// CHECK-NEXT:                      %12 = affine_apply #map1(%i2, %i3, %i0, %i1, %i2, %i3, %i0, %i1, %i14, %i15)
+// CHECK-NEXT:                      %13 = affine_apply #map2(%i2, %i3, %i0, %i1, %i2, %i3, %i0, %i1, %i14, %i15)
+// CHECK-NEXT:                      %14 = affine_apply #map3(%i2, %i3, %i0, %i1, %i2, %i3, %i0, %i1, %i14, %i15)
+// CHECK-NEXT:                      %15 = affine_apply #map4(%i2, %i3, %i0, %i1, %i2, %i3, %i0, %i1, %i14, %i15)
+// CHECK-NEXT:                      %16 = affine_apply #map5(%i2, %i3, %i0, %i1, %i2, %i3, %i0, %i1, %i14, %i15)
+// CHECK-NEXT:                      %17 = load %2[%11, %12, %13, %14, %15, %16] : memref<1x1x1x1x16x10xf32, 2>
 // CHECK-NEXT:                    }
 // CHECK-NEXT:                  }
 // CHECK-NEXT:                }
@ -1008,7 +1054,8 @@ func @should_fuse_deep_loop_nests() {
 }

 // -----
-// CHECK: #map0 = (d0, d1, d2) -> (-d0 + d1, d2)
+// CHECK: #map0 = (d0, d1, d2) -> (-d0 + d1)
+// CHECK: #map1 = (d0, d1, d2) -> (d2)

 // CHECK-LABEL: func @should_fuse_at_depth1_and_reduce_slice_trip_count
 func @should_fuse_at_depth1_and_reduce_slice_trip_count() {
@ -1048,11 +1095,13 @@ func @should_fuse_at_depth1_and_reduce_slice_trip_count() {
  // CHECK-NEXT:    }
  // CHECK-NEXT:    for %i2 = 0 to 16 {
  // CHECK-NEXT:      %3 = affine_apply #map0(%i0, %i0, %i2)
-  // CHECK-NEXT:      store %cst, %1[%3#0, %3#1] : memref<1x16xf32>
+  // CHECK-NEXT:      %4 = affine_apply #map1(%i0, %i0, %i2)
+  // CHECK-NEXT:      store %cst, %1[%3, %4] : memref<1x16xf32>
  // CHECK-NEXT:    }
  // CHECK-NEXT:    for %i3 = 0 to 16 {
-  // CHECK-NEXT:      %4 = affine_apply #map0(%i0, %i0, %i3)
-  // CHECK-NEXT:      %5 = load %1[%4#0, %4#1] : memref<1x16xf32>
+  // CHECK-NEXT:      %5 = affine_apply #map0(%i0, %i0, %i3)
+  // CHECK-NEXT:      %6 = affine_apply #map1(%i0, %i0, %i3)
+  // CHECK-NEXT:      %7 = load %1[%5, %6] : memref<1x16xf32>
  // CHECK-NEXT:    }
  // CHECK-NEXT:  }
  // CHECK-NEXT:  return
@ -1259,8 +1308,8 @@ func @R3_to_R2_reshape() {
  for %ii = 0 to 32 {
    for %jj = 0 to 3 {
      %a0 = affine_apply (d0, d1) -> (d0 * 3 + d1) (%ii, %jj)
-      %a1 = affine_apply (d0) -> (d0 floordiv (3 * 16)) (%a0)
-      %v = load %in[%a1#0, %jj, %c0]
+      %idx = affine_apply (d0) -> (d0 floordiv (3 * 16)) (%a0)
+      %v = load %in[%idx, %jj, %c0]
        : memref<2x3x16xi32>
    }
  }
@ -1268,9 +1317,12 @@ func @R3_to_R2_reshape() {
 }
 // CHECK:      #map0 = (d0, d1) -> ((d0 * 3 + d1) floordiv 48)
 // CHECK-NEXT: #map1 = ()[s0] -> (s0)
-// CHECK-NEXT: #map2 = (d0, d1, d2, d3, d4) -> (d2 - (d0 * 25 + d1 * 24) floordiv 24, -d1 + d3, d4)
-// CHECK-NEXT: #map3 = (d0, d1) -> (d0 * 3 + d1)
-// CHECK-NEXT: #map4 = (d0) -> (d0 floordiv 48)
+// CHECK-NEXT: #map2 = (d0, d1, d2, d3, d4) -> (d2 - (d0 * 25 + d1 * 24) floordiv 24)
+// CHECK-NEXT: #map3 = (d0, d1, d2, d3, d4) -> (-d1 + d3)
+// CHECK-NEXT: #map4 = (d0, d1, d2, d3, d4) -> (d4)
+// CHECK-NEXT: #map5 = (d0, d1) -> (d0 * 3 + d1)
+// CHECK-NEXT: #map6 = (d0) -> (d0 floordiv 48)
+
 // CHECK-LABEL: func @R3_to_R2_reshape()
 // CHECK:        %0 = alloc() : memref<1x1x1xi32>
 // CHECK-NEXT:   for %i0 = 0 to 32 {
@ -1279,15 +1331,18 @@ func @R3_to_R2_reshape() {
 // CHECK-NEXT:      %2 = affine_apply #map1()[%c0]
 // CHECK-NEXT:      %3 = "foo"(%1, %i1, %2) : (index, index, index) -> i32
 // CHECK-NEXT:      %4 = affine_apply #map2(%i0, %i1, %1, %i1, %2)
-// CHECK-NEXT:       store %3, %0[%4#0, %4#1, %4#2] : memref<1x1x1xi32>
-// CHECK-NEXT:       %5 = affine_apply #map3(%i0, %i1)
-// CHECK-NEXT:       %6 = affine_apply #map4(%5)
-// CHECK-NEXT:       %7 = affine_apply #map2(%i0, %i1, %6, %i1, %c0)
-// CHECK-NEXT:       %8 = load %0[%7#0, %7#1, %7#2] : memref<1x1x1xi32>
+// CHECK-NEXT:      %5 = affine_apply #map3(%i0, %i1, %1, %i1, %2)
+// CHECK-NEXT:      %6 = affine_apply #map4(%i0, %i1, %1, %i1, %2)
+// CHECK-NEXT:      store %3, %0[%4, %5, %6] : memref<1x1x1xi32>
+// CHECK-NEXT:      %7 = affine_apply #map5(%i0, %i1)
+// CHECK-NEXT:      %8 = affine_apply #map6(%7)
+// CHECK-NEXT:      %9 = affine_apply #map2(%i0, %i1, %8, %i1, %c0)
+// CHECK-NEXT:      %10 = affine_apply #map3(%i0, %i1, %8, %i1, %c0)
+// CHECK-NEXT:      %11 = affine_apply #map4(%i0, %i1, %8, %i1, %c0)
+// CHECK-NEXT:      %12 = load %0[%9, %10, %11] : memref<1x1x1xi32>
 // CHECK-NEXT:    }
 // CHECK-NEXT:  }
 // CHECK-NEXT:  return
-// CHECK-NEXT: }

 // -----