[MLIR][Linalg] Hoist padding across multiple levels of tiling

This revision introduces proper backward slice computation during the hoisting of
PadTensorOp. This allows hoisting padding even across multiple levels of tiling.
Such hoisting requires the proper handling of loop bounds that may depend on enclosing
loop variables.
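
For reference, the hoisting entry point has the signature
`mlir::linalg::hoistPaddingOnTensors(PadTensorOp &padTensorOp, unsigned nLoops)`
(see the diff below). A minimal sketch of a possible driver, assuming a FuncOp-level
walk; the helper name is hypothetical, and the in-tree test instead goes through the
`test-hoist-padding-2-level` pattern flag shown in the updated RUN line:

#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
#include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
#include "mlir/IR/BuiltinOps.h"
#include "llvm/ADT/SmallVector.h"

// Collect the pad ops first, then try to hoist each one above (at most) two
// enclosing tiling loops; ops whose prerequisites fail are simply left in place.
static void hoistAllPadTensorOps(mlir::FuncOp func) {
  llvm::SmallVector<mlir::linalg::PadTensorOp, 4> padOps;
  func.walk([&](mlir::linalg::PadTensorOp padOp) { padOps.push_back(padOp); });
  for (mlir::linalg::PadTensorOp padOp : padOps)
    (void)mlir::linalg::hoistPaddingOnTensors(padOp, /*nLoops=*/2);
}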

Differential Revision: https://reviews.llvm.org/D98965
Nicolas Vasilache 2021-03-23 17:17:30 +00:00
parent e150be612b
commit 2240568579
3 changed files with 283 additions and 45 deletions


@@ -75,6 +75,11 @@ def AffineApplyOp : Affine_Op<"apply", [NoSideEffect]> {
OpBuilder<(ins "AffineMap":$map, "ValueRange":$mapOperands),
[{
build($_builder, $_state, $_builder.getIndexType(), map, mapOperands);
}]>,
OpBuilder<(ins "ArrayRef<AffineExpr> ":$exprList,"ValueRange":$mapOperands),
[{
build($_builder, $_state, $_builder.getIndexType(),
AffineMap::inferFromExprList(exprList).front(), mapOperands);
}]>
];
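
The new builder creates an `affine.apply` directly from an `AffineExpr` list, inferring
the map via `AffineMap::inferFromExprList`; the hoisting code below relies on it to
materialize loop trip counts. A minimal sketch, assuming an `OpBuilder` `b`, a
`Location` `loc`, and loop-independent index values `lbVal`, `ubVal` and `stepVal`
(the value names are hypothetical):

// Materialize (ub - lb).ceilDiv(step) as a single affine.apply.
MLIRContext *ctx = b.getContext();
AffineExpr lb, ub, step;
bindDims(ctx, lb, ub);   // lb -> d0, ub -> d1.
bindSymbols(ctx, step);  // step -> s0.
// The inferred map is affine_map<(d0, d1)[s0] -> ((d1 - d0) ceildiv s0)>.
Value tripCount = b.create<AffineApplyOp>(
    loc, (ub - lb).ceilDiv(step), ValueRange{lbVal, ubVal, stepVal});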


@@ -13,7 +13,9 @@
#include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/SCF/Utils.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
@@ -503,6 +505,134 @@ void mlir::linalg::hoistRedundantVectorTransfers(FuncOp func) {
}
}
/// Return true if `v` is only transitively defined by ops whose type is in
/// `OpTypeList`.
template <typename... OpTypeList>
static bool backwardsSliceOnlyHasOpsOfType(scf::ForOp outerLimit, Value v) {
// Compute a backward slice up to, but not including, `outerLimit`.
llvm::SetVector<Operation *> backwardSlice;
getBackwardSlice(v, &backwardSlice, [&](Operation *op) {
return outerLimit->isProperAncestor(op);
});
// Traverse the backward slice and ensure we can perform the computation to
// hoist.
for (Operation *op : backwardSlice) {
if (isa<OpTypeList...>(op))
continue;
LLVM_DEBUG(DBGS() << "Abort: unadmissible op in slice " << *op << "\n");
return false;
}
return true;
}
static bool isDefinedOutsideOrConstant(scf::ForOp outer, Value v) {
return outer.isDefinedOutsideOfLoop(v) || v.getDefiningOp<ConstantOp>();
}
/// Compute the tightest lower bound with quantities that are all defined
/// outside of `outer`.
/// Return null if such a bound cannot be computed.
static Value computeLoopIndependentLowerBound(OpBuilder &b, scf::ForOp outer,
Value v) {
if (isDefinedOutsideOrConstant(outer, v))
return v;
return Value();
}
/// Compute the tightest upper bound with quantities that are all defined
/// outside of `outer`.
/// Expects all ops in the backward slice of `v` up to `outer` to be either
/// scf.for, affine.min or affine.apply.
static Value computeLoopIndependentUpperBound(OpBuilder &b, scf::ForOp outer,
Value v) {
if (isDefinedOutsideOrConstant(outer, v))
return v;
LLVM_DEBUG(DBGS() << "Begin loopIndependentUpperBound for: " << v << "\n");
bool ok =
backwardsSliceOnlyHasOpsOfType<scf::ForOp, AffineMinOp, AffineApplyOp>(
outer, v);
(void)ok;
assert(ok && "expected to only be defined by scf::ForOp, AffineMinOp and AffineApplyOp");
// Compute a backward slice up to, but not including, `outer`.
llvm::SetVector<Operation *> backwardSlice;
getBackwardSlice(v, &backwardSlice,
[&](Operation *op) { return outer->isProperAncestor(op); });
backwardSlice.insert(v.getDefiningOp());
OpBuilder::InsertionGuard g(b);
b.setInsertionPoint(outer);
Value res = v;
BlockAndValueMapping bvm;
for (Operation *op : backwardSlice) {
if (isa<scf::ForOp>(op))
continue;
if (isa<AffineApplyOp>(op)) {
b.clone(*op, bvm);
continue;
}
auto sliceMinOp = cast<AffineMinOp>(op);
// Perform the substitution of the operands of AffineMinOp.
auto mapAndOperands = substituteMin(
sliceMinOp, [&](Operation *op) { return outer->isAncestor(op); });
SmallVector<Value> resultOperands = mapAndOperands.dims;
llvm::append_range(resultOperands, mapAndOperands.symbols);
AffineMap map = mapAndOperands.map;
canonicalizeMapAndOperands(&map, &resultOperands);
map = simplifyAffineMap(map);
res = b.create<AffineMinOp>(
outer->getLoc(), map,
llvm::to_vector<4>(llvm::map_range(resultOperands, [&](Value operand) {
return bvm.lookupOrDefault(operand);
})));
bvm.map(sliceMinOp, res);
}
LLVM_DEBUG(DBGS() << "End loopIndependentUpperBound with: " << res << "\n");
return res;
}
/// Return the number of iterations in the loop (ub - lb).ceilDiv(step).
/// The returned Value is guaranteed not to depend on any loop contained in the
/// range [`outer`, `forOp`].
/// Return null if such a loop-independent quantity cannot be computed.
static Value buildLoopTripCount(OpBuilder &b, scf::ForOp outer,
scf::ForOp forOp) {
MLIRContext *ctx = forOp->getContext();
AffineExpr lb, ub, step;
bindDims(ctx, lb, ub);
bindSymbols(ctx, step);
Value lbVal = computeLoopIndependentLowerBound(b, outer, forOp.lowerBound()),
ubVal = computeLoopIndependentUpperBound(b, outer, forOp.upperBound()),
stepVal = forOp.step();
if (!lbVal || !ubVal || !stepVal)
return Value();
auto loc = forOp->getLoc();
Value res = b.create<AffineApplyOp>(loc, (ub - lb).ceilDiv(step),
ValueRange{lbVal, ubVal, stepVal});
return res;
}
/// Return the current iteration number in the loop (iv - lb).ceilDiv(step).
/// The returned Value is guaranteed not to depend on any loop contained in the
/// range [`outer`, `forOp`].
/// Return null if such a loop-independent quantity cannot be computed.
static Value buildLoopIterationCount(OpBuilder &b, scf::ForOp outer,
scf::ForOp forOp) {
MLIRContext *ctx = forOp->getContext();
AffineExpr iv, lb, step;
bindDims(ctx, iv, lb);
bindSymbols(ctx, step);
Value ivVal = forOp.getInductionVar(),
lbVal = computeLoopIndependentLowerBound(b, outer, forOp.lowerBound()),
stepVal = forOp.step();
if (!ivVal || !lbVal || !stepVal)
return Value();
auto loc = forOp->getLoc();
return b.create<AffineApplyOp>(loc, (iv - lb).ceilDiv(step),
ValueRange{ivVal, lbVal, stepVal});
}
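// For example, with lb = %c0 and step = %c2 (the innermost packing loop in the
// updated test), the expression above canonicalizes to
//   affine.apply affine_map<(d0) -> (d0 ceildiv 2)>(%iv)
// which is the `d0 ceildiv 2` map checked in hoist-padding.mlir.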
/// Ensure prerequisites that guarantee pad op hoisting can occur.
/// Return failure in the cases when we cannot perform hoisting, i.e. if any of:
/// 1. There exists a use of `padTensorOp` that is not a linalg input operand.
@@ -510,8 +640,10 @@ void mlir::linalg::hoistRedundantVectorTransfers(FuncOp func) {
/// 3. There exists an op with a region that is dominated by
/// `outermostEnclosingForOp` and that isn't a LoopLikeInterface or a
/// LinalgOp.
/// 3. There exists an op with side effects that is dominated by
/// `outermostEnclosingForOp` and that isn't a LoopLikeInterface.
/// 4. There exists an op with side effects that is dominated by
/// `outermostEnclosingForOp` and that isn't a LoopLikeInterface.
/// 5. The lower bound, upper bound or step of one of the loops involved in the
/// hoisting cannot be computed independently of `outermostEnclosingForOp`.
///
/// While ensuring prerequisites:
/// 1. Fill the `backwardSlice` to contain the topologically sorted ops
@@ -523,7 +655,8 @@ void mlir::linalg::hoistRedundantVectorTransfers(FuncOp func) {
static LogicalResult
hoistPaddingOnTensorsPrerequisites(linalg::PadTensorOp padTensorOp, int nLevels,
llvm::SetVector<Operation *> &backwardSlice,
llvm::SetVector<Operation *> &packingLoops) {
llvm::SetVector<Operation *> &packingLoops,
SmallVector<Value> &dynamicTensorSizes) {
// Bail on any use that isn't an input of a Linalg op.
// Hoisting of inplace updates happens after vectorization.
for (OpOperand &use : padTensorOp.result().getUses()) {
@@ -583,36 +716,39 @@ hoistPaddingOnTensorsPrerequisites(linalg::PadTensorOp padTensorOp, int nLevels,
// `outermostEnclosingForOp`.
assert(outermostEnclosingForOp == backwardSlice.front());
scf::ForOp outer = cast<scf::ForOp>(outermostEnclosingForOp);
if (llvm::any_of(packingLoops, [&](Operation *op) {
scf::ForOp forOp = cast<scf::ForOp>(op);
Value lb = forOp.lowerBound(), ub = forOp.upperBound(),
step = forOp.step();
return !isDefinedOutsideOrConstant(outer, lb) ||
!(isDefinedOutsideOrConstant(outer, ub) ||
backwardsSliceOnlyHasOpsOfType<scf::ForOp, AffineMinOp,
AffineApplyOp>(outer, ub)) ||
!isDefinedOutsideOrConstant(outer, step);
}))
return failure();
// Set the insertion point just before the outermost enclosing loop, i.e. the loop we hoist above.
OpBuilder b(outermostEnclosingForOp);
dynamicTensorSizes =
llvm::to_vector<4>(llvm::map_range(packingLoops, [&](Operation *op) {
return buildLoopTripCount(b, cast<scf::ForOp>(outermostEnclosingForOp),
cast<scf::ForOp>(op));
}));
// Assert all loop trip counts can be computed.
if (!llvm::all_of(dynamicTensorSizes, [](Value v) { return v; }))
llvm_unreachable("loop independence prerequisite not met");
return success();
}
/// Return the number of iterations in the loop (ub - lb).ceilDiv(step).
static Value buildLoopTripCount(OpBuilder &b, scf::ForOp forOp) {
MLIRContext *ctx = forOp->getContext();
AffineExpr lb, ub, step;
bindDims(ctx, lb, ub);
bindSymbols(ctx, step);
return b.create<AffineApplyOp>(
forOp->getLoc(), AffineMap::get(2, 1, {(ub - lb).ceilDiv(step)}, ctx),
ValueRange{forOp.lowerBound(), forOp.upperBound(), forOp.step()});
}
/// Return the current iteration number in the loop (iv - lb).ceilDiv(step).
static Value buildLoopIterationCount(OpBuilder &b, scf::ForOp forOp) {
MLIRContext *ctx = forOp->getContext();
AffineExpr iv, lb, step;
bindDims(ctx, iv, lb);
bindSymbols(ctx, step);
return b.create<AffineApplyOp>(
forOp->getLoc(), AffineMap::get(2, 1, {(iv - lb).ceilDiv(step)}, ctx),
ValueRange{forOp.getInductionVar(), forOp.lowerBound(), forOp.step()});
}
LogicalResult mlir::linalg::hoistPaddingOnTensors(PadTensorOp &padTensorOp,
unsigned nLoops) {
SmallVector<Value> dynamicTensorSizes;
llvm::SetVector<Operation *> backwardSlice, packingLoops;
if (failed(hoistPaddingOnTensorsPrerequisites(padTensorOp, nLoops,
backwardSlice, packingLoops)))
backwardSlice, packingLoops,
dynamicTensorSizes)))
return failure();
// Update actual number of loops, which may be smaller.
@@ -636,12 +772,8 @@ LogicalResult mlir::linalg::hoistPaddingOnTensors(PadTensorOp &padTensorOp,
llvm::append_range(packedShape, paddedTensorType.getShape());
auto packedTensorType =
RankedTensorType::get(packedShape, paddedTensorType.getElementType());
auto dynamicSizes =
llvm::to_vector<4>(llvm::map_range(packingLoops, [&](Operation *op) {
return buildLoopTripCount(b, cast<scf::ForOp>(op));
}));
Value packedTensor = b.create<linalg::InitTensorOp>(
loc, dynamicSizes, packedTensorType.getShape(),
loc, dynamicTensorSizes, packedTensorType.getShape(),
packedTensorType.getElementType());
// Clone the operations involved in the backward slice, iteratively stepping
@@ -656,9 +788,9 @@ LogicalResult mlir::linalg::hoistPaddingOnTensors(PadTensorOp &padTensorOp,
clonedLoopIvs.reserve(nLoops);
leadingPackedTensorIndexings.reserve(nLoops);
BlockAndValueMapping bvm;
// Stack step 1. iteratively clone loops and push `packedTensor`.
// Insert `padTensorOp` into the backwardSlice so we clone it too.
backwardSlice.insert(padTensorOp);
// Stack step 1. iteratively clone loops and push `packedTensor`.
for (Operation *op : backwardSlice) {
if (op->getNumRegions() == 0 || isa<linalg::PadTensorOp>(op)) {
b.clone(*op, bvm);
@@ -670,15 +802,23 @@ LogicalResult mlir::linalg::hoistPaddingOnTensors(PadTensorOp &padTensorOp,
// Unused loop, just skip it.
if (!packingLoops.contains(forOp))
continue;
auto clonedForOp =
b.create<scf::ForOp>(loc, forOp.lowerBound(), forOp.upperBound(),
forOp.step(), packedTensor);
b.create<scf::ForOp>(loc, bvm.lookupOrDefault(forOp.lowerBound()),
bvm.lookupOrDefault(forOp.upperBound()),
bvm.lookupOrDefault(forOp.step()), packedTensor);
bvm.map(forOp.getInductionVar(), clonedForOp.getInductionVar());
assert(clonedForOp->getNumRegions() == 1);
clonedLoopIvs.push_back(clonedForOp.getInductionVar());
b.setInsertionPointToStart(&clonedForOp->getRegion(0).front());
leadingPackedTensorIndexings.push_back(
buildLoopIterationCount(b, clonedForOp));
bvm.map(forOp.getInductionVar(), clonedLoopIvs.back());
Value loopIndependentIterationCount = buildLoopIterationCount(
b, cast<scf::ForOp>(outermostEnclosingForOp), clonedForOp);
// Assert the loop-independent iteration count can be computed.
if (!loopIndependentIterationCount)
llvm_unreachable("loop independence prerequisite not met");
leadingPackedTensorIndexings.push_back(loopIndependentIterationCount);
packedTensor = clonedForOp.getRegionIterArgs().front();
}
@@ -716,8 +856,13 @@ LogicalResult mlir::linalg::hoistPaddingOnTensors(PadTensorOp &padTensorOp,
b.setInsertionPoint(padTensorOp);
SmallVector<Value> loopIterationCounts =
llvm::to_vector<4>(llvm::map_range(packingLoops, [&](Operation *loop) {
return buildLoopIterationCount(b, cast<scf::ForOp>(loop));
return buildLoopIterationCount(
b, cast<scf::ForOp>(outermostEnclosingForOp),
cast<scf::ForOp>(loop));
}));
// Assert all loop iteration counts can be computed.
if (llvm::any_of(loopIterationCounts, [](Value v) { return !v; }))
llvm_unreachable("loop independence prerequisite not met");
// offsets = [originalLoopIvs, 0 .. 0].
offsets.assign(loopIterationCounts.begin(), loopIterationCounts.end());
offsets.append(paddedRank, b.getIndexAttr(0));


@@ -1,15 +1,14 @@
// RUN: mlir-opt %s -test-linalg-transform-patterns=test-hoist-padding-2-level -canonicalize | FileCheck %s
#map0 = affine_map<(d0)[s0] -> (2, -d0 + s0)>
#map1 = affine_map<(d0)[s0] -> (4, -d0 + s0)>
#map2 = affine_map<(d0)[s0] -> (3, -d0 + s0)>
#map3 = affine_map<(d0, d1) -> (2, d0 - d1)>
#map4 = affine_map<(d0, d1) -> (3, d0 - d1)>
// RUN: mlir-opt %s -split-input-file -test-linalg-transform-patterns=test-hoist-padding-2-level -canonicalize | FileCheck %s
// CHECK-DAG: #[[$DIV3:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 3)>
// CHECK-DAG: #[[$DIV4:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 4)>
// CHECK-DAG: #[[$DIVS3:[0-9a-z]+]] = affine_map<()[s0] -> (s0 ceildiv 3)>
// CHECK-DAG: #[[$DIVS4:[0-9a-z]+]] = affine_map<()[s0] -> (s0 ceildiv 4)>
#map0 = affine_map<(d0)[s0] -> (2, -d0 + s0)>
#map1 = affine_map<(d0)[s0] -> (4, -d0 + s0)>
#map2 = affine_map<(d0)[s0] -> (3, -d0 + s0)>
#map3 = affine_map<(d0, d1) -> (2, d0 - d1)>
#map4 = affine_map<(d0, d1) -> (3, d0 - d1)>
// CHECK-LABEL: func @matmul_tensors
// CHECK-SAME: %[[TA:[0-9a-z]+]]: tensor
@@ -129,3 +128,92 @@ func @matmul_tensors(
}
return %3 : tensor<?x?xf32>
}
// -----
// CHECK-DAG: #[[$MIN_REST8:[0-9a-z]+]] = affine_map<(d0)[s0] -> (8, -d0 + s0)>
// CHECK-DAG: #[[$MIN_MOD4:[0-9a-z]+]] = affine_map<(d0) -> (4, d0 - ((d0 - 1) floordiv 4) * 4)>
// CHECK-DAG: #[[$DIV4:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 4)>
// CHECK-DAG: #[[$DIV2:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 2)>
#map0 = affine_map<(d0)[s0] -> (8, -d0 + s0)>
#map1 = affine_map<(d0, d1) -> (4, d0 - d1)>
#map2 = affine_map<(d0, d1) -> (2, d0 - d1)>
// CHECK-LABEL: func @dot
func @dot(%arg0: tensor<?xf32>, %arg1: tensor<?xf32>, %arg2: tensor<f32>)
-> tensor<f32>
{
%c8 = constant 8 : index
%c4 = constant 4 : index
%cst = constant 0.000000e+00 : f32
%c2 = constant 2 : index
%c0 = constant 0 : index
%1 = memref.dim %arg0, %c0 : tensor<?xf32>
%2 = memref.dim %arg0, %c0 : tensor<?xf32>
%3 = memref.dim %arg1, %c0 : tensor<?xf32>
// CHECK: scf.for %[[I:[0-9a-z]+]] =
//
// CHECK: %[[MR8:.*]] = affine.min #[[$MIN_REST8]](%[[I]])
// CHECK: %[[D0:.*]] = affine.apply #[[$DIV4]](%[[MR8]])
// CHECK: %[[MM4:.*]] = affine.min #[[$MIN_MOD4]](%[[MR8]])
// CHECK: %[[D1:.*]] = affine.apply #[[$DIV2]](%[[MM4]])
// Init tensor and pack.
// CHECK: %[[INIT_PACKED_A:.*]] = linalg.init_tensor [%[[D0]], %[[D1]], 2] : tensor<?x?x2xf32>
// CHECK: %[[PACKED_A:.*]] = scf.for %[[II:[0-9a-z]+]] = {{.*}} iter_args(%{{.*}} = %[[INIT_PACKED_A]]) -> (tensor<?x?x2xf32>) {
// CHECK: scf.for %[[III:[0-9a-z]+]] =
// CHECK: subtensor_insert %{{.*}} into %{{.*}}[%{{.*}}, %{{.*}}, 0] [1, 1, 2] [1, 1, 1] : tensor<2xf32> into tensor<?x?x2xf32>
//
// CHECK: %[[D0_2:.*]] = affine.apply #[[$DIV4]](%[[MR8]])
// CHECK: %[[MM4_2:.*]] = affine.min #[[$MIN_MOD4]](%[[MR8]])
// CHECK: %[[D1_2:.*]] = affine.apply #[[$DIV2]](%[[MM4_2]])
// Init tensor and pack.
// CHECK: %[[INIT_PACKED_B:.*]] = linalg.init_tensor [%[[D0_2]], %[[D1_2]], 2] : tensor<?x?x2xf32>
// CHECK: %[[PACKED_B:.*]] = scf.for %[[II_2:[0-9a-z]+]] = {{.*}} iter_args(%{{.*}} = %[[INIT_PACKED_B]]) -> (tensor<?x?x2xf32>) {
// CHECK: scf.for %[[III_2:[0-9a-z]+]] =
// CHECK: subtensor_insert %{{.*}} into %{{.*}}[%{{.*}}, %{{.*}}, 0] [1, 1, 2] [1, 1, 1] : tensor<2xf32> into tensor<?x?x2xf32>
// Compute.
// CHECK: scf.for %[[II_3:[0-9a-z]+]] =
// CHECK: scf.for %[[III_3:[0-9a-z]+]] = {{.*}} iter_args(%[[C:.*]] = %{{.*}}) -> (tensor<f32>) {
// CHECK: %[[IDX0:.*]] = affine.apply #[[$DIV4]](%[[II_3]])
// CHECK: %[[IDX1:.*]] = affine.apply #[[$DIV2]](%[[III_3]])
// CHECK: %[[A:.*]] = subtensor %[[PACKED_A]][%[[IDX0]], %[[IDX1]], 0] [1, 1, 2] [1, 1, 1] : tensor<?x?x2xf32> to tensor<2xf32>
// CHECK: %[[IDX0_2:.*]] = affine.apply #[[$DIV4]](%[[II_3]])
// CHECK: %[[IDX1_2:.*]] = affine.apply #[[$DIV2]](%[[III_3]])
// CHECK: %[[B:.*]] = subtensor %[[PACKED_B]][%[[IDX0_2]], %[[IDX1_2]], 0] [1, 1, 2] [1, 1, 1] : tensor<?x?x2xf32> to tensor<2xf32>
// CHECK: linalg.dot ins(%[[A]], %[[B]] : tensor<2xf32>, tensor<2xf32>) outs(%[[C]] : tensor<f32>) -> tensor<f32>
%4 = scf.for %arg3 = %c0 to %1 step %c8 iter_args(%arg4 = %arg2) -> (tensor<f32>) {
%5 = affine.min #map0(%arg3)[%2]
%6 = subtensor %arg0[%arg3] [%5] [1] : tensor<?xf32> to tensor<?xf32>
%7 = affine.min #map0(%arg3)[%3]
%8 = subtensor %arg1[%arg3] [%7] [1] : tensor<?xf32> to tensor<?xf32>
%9 = scf.for %arg5 = %c0 to %5 step %c4 iter_args(%arg6 = %arg4) -> (tensor<f32>) {
%10 = affine.min #map1(%5, %arg5)
%11 = subtensor %6[%arg5] [%10] [1] : tensor<?xf32> to tensor<?xf32>
%12 = affine.min #map1(%7, %arg5)
%13 = subtensor %8[%arg5] [%12] [1] : tensor<?xf32> to tensor<?xf32>
%14 = scf.for %arg7 = %c0 to %10 step %c2 iter_args(%arg8 = %arg6) -> (tensor<f32>) {
%15 = affine.min #map2(%10, %arg7)
%16 = subtensor %11[%arg7] [%15] [1] : tensor<?xf32> to tensor<?xf32>
%17 = affine.min #map2(%12, %arg7)
%18 = subtensor %13[%arg7] [%17] [1] : tensor<?xf32> to tensor<?xf32>
%19 = subi %c2, %15 : index
%20 = linalg.pad_tensor %16 low[%c0] high[%19] {
^bb0(%arg9: index): // no predecessors
linalg.yield %cst : f32
} : tensor<?xf32> to tensor<2xf32>
%21 = subi %c2, %17 : index
%22 = linalg.pad_tensor %18 low[%c0] high[%21] {
^bb0(%arg9: index): // no predecessors
linalg.yield %cst : f32
} : tensor<?xf32> to tensor<2xf32>
%23 = linalg.dot ins(%20, %22 : tensor<2xf32>, tensor<2xf32>) outs(%arg8 : tensor<f32>) -> tensor<f32>
scf.yield %23 : tensor<f32>
}
scf.yield %14 : tensor<f32>
}
scf.yield %9 : tensor<f32>
}
return %4 : tensor<f32>
}
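
As a quick check of the packed-tensor sizes in the CHECK lines above, take an
outer-loop iteration with a full remaining tile of 8 elements:
  MR8 = min(8, s0 - i)                       = 8
  D0  = 8 ceildiv 4                          = 2
  MM4 = min(4, 8 - ((8 - 1) floordiv 4) * 4) = min(4, 4) = 4
  D1  = 4 ceildiv 2                          = 2
so both packed tensors have runtime shape 2x2x2: the four padded tensor<2xf32>
slices produced by one outer tile, computed once and then reused by the compute loops.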