[mlir] Add for loop specialization

Summary:
We already had a parallel loop specialization pass that is used to
enable unrolling and consecutive vectorization by rewriting loops
whose bound is defined as a min of a constant and a dynamic value
into two loops: one with a static bound (the constant) and one with
the minimum as its bound, wrapped into a conditional to dispatch
between the two.
This adds the same rewriting for for loops.

Differential Revision: https://reviews.llvm.org/D82189
This commit is contained in:
Stephan Herhut 2020-06-19 14:14:30 +02:00
parent 46ea465b5b
commit 4bcd08eb1c
7 changed files with 109 additions and 17 deletions

View File

@ -20,6 +20,10 @@ namespace mlir {
class Pass;
/// Creates a pass that specializes for loop for unrolling and
/// vectorization.
std::unique_ptr<Pass> createForLoopSpecializationPass();
/// Creates a loop fusion pass which fuses parallel loops.
std::unique_ptr<Pass> createParallelLoopFusionPass();

View File

@ -1,4 +1,4 @@
//===-- Passes.td - Loop pass definition file --------------*- tablegen -*-===//
//===-- Passes.td - SCF pass definition file ---------------*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@ -11,18 +11,24 @@
include "mlir/Pass/PassBase.td"
def LoopParallelLoopFusion : Pass<"parallel-loop-fusion"> {
// Specialization of scf.for loops; registered under -for-loop-specialization.
def SCFForLoopSpecialization
: FunctionPass<"for-loop-specialization"> {
let summary = "Specialize `for` loops for vectorization";
let constructor = "mlir::createForLoopSpecializationPass()";
}
// Fusion of adjacent scf.parallel loops; registered under -parallel-loop-fusion.
def SCFParallelLoopFusion : Pass<"parallel-loop-fusion"> {
let summary = "Fuse adjacent parallel loops";
let constructor = "mlir::createParallelLoopFusionPass()";
}
def LoopParallelLoopSpecialization
// Specialization of scf.parallel loops; registered under
// -parallel-loop-specialization.
def SCFParallelLoopSpecialization
: FunctionPass<"parallel-loop-specialization"> {
let summary = "Specialize parallel loops for vectorization";
let constructor = "mlir::createParallelLoopSpecializationPass()";
}
def LoopParallelLoopTiling : FunctionPass<"parallel-loop-tiling"> {
def SCFParallelLoopTiling : FunctionPass<"parallel-loop-tiling"> {
let summary = "Tile parallel loops";
let constructor = "mlir::createParallelLoopTilingPass()";
let options = [

View File

@ -1,6 +1,6 @@
add_mlir_dialect_library(MLIRSCFTransforms
LoopSpecialization.cpp
ParallelLoopFusion.cpp
ParallelLoopSpecialization.cpp
ParallelLoopTiling.cpp
Utils.cpp

View File

@ -1,4 +1,4 @@
//===- ParallelLoopSpecialization.cpp - scf.parallel specialization ------===//
//===- LoopSpecialization.cpp - scf.parallel/scf.for specialization -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@ -6,7 +6,8 @@
//
//===----------------------------------------------------------------------===//
//
// Specializes parallel loops for easier unrolling and vectorization.
// Specializes parallel loops and for loops for easier unrolling and
// vectorization.
//
//===----------------------------------------------------------------------===//
@ -19,13 +20,14 @@
#include "mlir/IR/BlockAndValueMapping.h"
using namespace mlir;
using scf::ForOp;
using scf::ParallelOp;
/// Rewrite a loop with bounds defined by an affine.min with a constant into 2
/// loops after checking if the bounds are equal to that constant. This is
/// beneficial if the loop will almost always have the constant bound and that
/// version can be fully unrolled and vectorized.
static void specializeLoopForUnrolling(ParallelOp op) {
/// Rewrite a parallel loop with bounds defined by an affine.min with a constant
/// into 2 loops after checking if the bounds are equal to that constant. This
/// is beneficial if the loop will almost always have the constant bound and
/// that version can be fully unrolled and vectorized.
static void specializeParallelLoopForUnrolling(ParallelOp op) {
SmallVector<int64_t, 2> constantIndices;
constantIndices.reserve(op.upperBound().size());
for (auto bound : op.upperBound()) {
@ -33,7 +35,7 @@ static void specializeLoopForUnrolling(ParallelOp op) {
if (!minOp)
return;
int64_t minConstant = std::numeric_limits<int64_t>::max();
for (auto expr : minOp.map().getResults()) {
for (AffineExpr expr : minOp.map().getResults()) {
if (auto constantIndex = expr.dyn_cast<AffineConstantExpr>())
minConstant = std::min(minConstant, constantIndex.getValue());
}
@ -58,11 +60,48 @@ static void specializeLoopForUnrolling(ParallelOp op) {
op.erase();
}
/// Rewrite a for loop with bounds defined by an affine.min with a constant into
/// 2 loops after checking if the bounds are equal to that constant. This is
/// beneficial if the loop will almost always have the constant bound and that
/// version can be fully unrolled and vectorized.
static void specializeForLoopForUnrolling(ForOp op) {
// Only loops whose upper bound is the result of an affine.min are candidates.
auto bound = op.upperBound();
auto minOp = bound.getDefiningOp<AffineMinOp>();
if (!minOp)
return;
// Take the smallest constant result of the affine.min map; it becomes the
// static bound of the specialized fast-path copy of the loop.
int64_t minConstant = std::numeric_limits<int64_t>::max();
for (AffineExpr expr : minOp.map().getResults()) {
if (auto constantIndex = expr.dyn_cast<AffineConstantExpr>())
minConstant = std::min(minConstant, constantIndex.getValue());
}
// No constant result in the map: nothing to specialize on.
if (minConstant == std::numeric_limits<int64_t>::max())
return;
// Build, in front of the loop:
//   %cst = constant <minConstant>
//   %eq  = cmpi "eq", %bound, %cst
//   scf.if %eq { <loop with %cst bound> } else { <original loop> }
OpBuilder b(op);
BlockAndValueMapping map;
Value constant = b.create<ConstantIndexOp>(op.getLoc(), minConstant);
Value cond =
b.create<CmpIOp>(op.getLoc(), CmpIPredicate::eq, bound, constant);
map.map(bound, constant);
auto ifOp = b.create<scf::IfOp>(op.getLoc(), cond, /*withElseRegion=*/true);
// Then-branch clone remaps the dynamic bound to the constant (via `map`),
// enabling unrolling/vectorization of the static-trip-count version.
ifOp.getThenBodyBuilder().clone(*op.getOperation(), map);
// Else-branch clone is unchanged and keeps the dynamic affine.min bound.
ifOp.getElseBodyBuilder().clone(*op.getOperation());
op.erase();
}
namespace {
struct ParallelLoopSpecialization
: public LoopParallelLoopSpecializationBase<ParallelLoopSpecialization> {
: public SCFParallelLoopSpecializationBase<ParallelLoopSpecialization> {
void runOnFunction() override {
getFunction().walk([](ParallelOp op) { specializeLoopForUnrolling(op); });
getFunction().walk(
[](ParallelOp op) { specializeParallelLoopForUnrolling(op); });
}
};
/// Function pass that specializes every scf.for loop in the function whose
/// upper bound is an affine.min of a constant and a dynamic value.
struct ForLoopSpecialization
: public SCFForLoopSpecializationBase<ForLoopSpecialization> {
void runOnFunction() override {
getFunction().walk([](ForOp op) { specializeForLoopForUnrolling(op); });
}
};
} // namespace
@ -70,3 +109,7 @@ struct ParallelLoopSpecialization
/// Public factory for the scf.parallel specialization pass.
std::unique_ptr<Pass> mlir::createParallelLoopSpecializationPass() {
return std::make_unique<ParallelLoopSpecialization>();
}
/// Public factory for the scf.for specialization pass.
std::unique_ptr<Pass> mlir::createForLoopSpecializationPass() {
return std::make_unique<ForLoopSpecialization>();
}

View File

@ -160,7 +160,7 @@ void mlir::scf::naivelyFuseParallelOps(Region &region) {
namespace {
struct ParallelLoopFusion
: public LoopParallelLoopFusionBase<ParallelLoopFusion> {
: public SCFParallelLoopFusionBase<ParallelLoopFusion> {
void runOnOperation() override {
getOperation()->walk([&](Operation *child) {
for (Region &region : child->getRegions())

View File

@ -119,7 +119,7 @@ static bool getInnermostNestedLoops(Block *block,
namespace {
struct ParallelLoopTiling
: public LoopParallelLoopTilingBase<ParallelLoopTiling> {
: public SCFParallelLoopTilingBase<ParallelLoopTiling> {
ParallelLoopTiling() = default;
explicit ParallelLoopTiling(ArrayRef<int64_t> tileSizes) {
this->tileSizes = tileSizes;

View File

@ -0,0 +1,39 @@
// RUN: mlir-opt %s -for-loop-specialization -split-input-file | FileCheck %s
#map0 = affine_map<()[s0, s1] -> (1024, s0 - s1)>
#map1 = affine_map<()[s0, s1] -> (64, s0 - s1)>
// Loop whose bound is affine.min(1024, d0 - outer): the pass should emit a
// fast path with the static bound 1024 and a fallback with the dynamic min,
// dispatched by an equality compare (see CHECK lines below).
func @for(%outer: index, %A: memref<?xf32>, %B: memref<?xf32>,
%C: memref<?xf32>, %result: memref<?xf32>) {
%c0 = constant 0 : index
%c1 = constant 1 : index
%d0 = dim %A, %c0 : memref<?xf32>
%b0 = affine.min #map0()[%d0, %outer]
scf.for %i0 = %c0 to %b0 step %c1 {
%B_elem = load %B[%i0] : memref<?xf32>
%C_elem = load %C[%i0] : memref<?xf32>
%sum_elem = addf %B_elem, %C_elem : f32
store %sum_elem, %result[%i0] : memref<?xf32>
}
return
}
// CHECK-LABEL: func @for(
// CHECK-SAME: [[ARG0:%.*]]: index, [[ARG1:%.*]]: memref<?xf32>, [[ARG2:%.*]]: memref<?xf32>, [[ARG3:%.*]]: memref<?xf32>, [[ARG4:%.*]]: memref<?xf32>) {
// CHECK: [[CST_0:%.*]] = constant 0 : index
// CHECK: [[CST_1:%.*]] = constant 1 : index
// CHECK: [[DIM_0:%.*]] = dim [[ARG1]], [[CST_0]] : memref<?xf32>
// CHECK: [[MIN:%.*]] = affine.min #map0(){{\[}}[[DIM_0]], [[ARG0]]]
// CHECK: [[CST_1024:%.*]] = constant 1024 : index
// CHECK: [[PRED:%.*]] = cmpi "eq", [[MIN]], [[CST_1024]] : index
// CHECK: scf.if [[PRED]] {
// CHECK: scf.for [[IDX0:%.*]] = [[CST_0]] to [[CST_1024]] step [[CST_1]] {
// CHECK: store
// CHECK: }
// CHECK: } else {
// CHECK: scf.for [[IDX0:%.*]] = [[CST_0]] to [[MIN]] step [[CST_1]] {
// CHECK: store
// CHECK: }
// CHECK: }
// CHECK: return
// CHECK: }