[mlir][GPU] Improve constant sinking in kernel outlining

The previous implementation did not support sinking simple expressions. In particular,
it is often beneficial to sink dim operations.
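For example, a size computed above the launch can now be rematerialized inside it, so that outlining later passes one value less across the launch boundary. A minimal before/after sketch (hypothetical IR in the style of the tests in this patch):

  // Before sinking: %c0 and %size are defined above the launch.
  %c0 = constant 0 : index
  %size = dim %arg0, %c0 : memref<?xf32>
  gpu.launch blocks(%bx, %by, %bz) in (...) threads(%tx, %ty, %tz) in (...) {
    "use"(%size) : (index) -> ()
    gpu.terminator
  }

  // After sinking: both ops are cloned into the region, and %size no
  // longer needs to be passed into the outlined kernel.
  gpu.launch blocks(%bx, %by, %bz) in (...) threads(%tx, %ty, %tz) in (...) {
    %c0_0 = constant 0 : index
    %size_0 = dim %arg0, %c0_0 : memref<?xf32>
    "use"(%size_0) : (index) -> ()
    gpu.terminator
  }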

Differential Revision: https://reviews.llvm.org/D88439
Stephan Herhut 2020-09-29 13:20:37 +02:00
parent 7bae2bc5a8
commit edeff6e642
2 changed files with 125 additions and 58 deletions


@@ -18,6 +18,7 @@
#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/SymbolTable.h"
#include "mlir/Support/LLVM.h"
#include "mlir/Transforms/RegionUtils.h"
using namespace mlir;
@@ -32,10 +33,10 @@ static void createForAllDimensions(OpBuilder &builder, Location loc,
  }
}
// Add operations generating block/thread ids and grid/block dimensions at the
// beginning of the `launchFuncOpBody` region. Add mapping from argument in
// entry block of `launchOpBody`, to the corresponding result value of the added
// operations.
/// Adds operations generating block/thread ids and grid/block dimensions at
/// the beginning of the `launchFuncOpBody` region, and adds a mapping from
/// the arguments of the entry block of `launchOpBody` to the corresponding
/// result values of the added operations.
static void injectGpuIndexOperations(Location loc, Region &launchFuncOpBody,
                                     Region &launchOpBody,
                                     BlockAndValueMapping &map) {
@@ -53,8 +54,48 @@ static void injectGpuIndexOperations(Location loc, Region &launchFuncOpBody,
    map.map(firstBlock.getArgument(indexOp.index()), indexOp.value());
}
/// Identifies operations that are beneficial to sink into kernels. These
/// operations must not have side effects, as otherwise sinking (and hence
/// duplicating) them is not legal.
static bool isSinkingBeneficiary(Operation *op) {
  return isa<ConstantOp, DimOp>(op);
  return isa<ConstantOp, DimOp, SelectOp, CmpIOp>(op);
}
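Because `select` and `cmpi` have no side effects, whole index computations built from them can now follow their operands into the kernel. A small sketch in the style of the tests in this patch (hypothetical IR):

  %c0 = constant 0 : index
  %c1 = constant 1 : index
  %d = dim %arg0, %c0 : memref<?xf32>
  %p = cmpi "sgt", %d, %c1 : index
  %m = select %p, %d, %c1 : index
  // All five ops are sinking beneficiaries; if %arg0 is already a kernel
  // argument, the whole chain can be sunk without adding new arguments.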
/// For a given operation `op`, computes whether it is beneficial to sink the
/// operation into the kernel. An operation can be sunk if doing so does not
/// introduce new kernel arguments. Whether a value is already available in the
/// kernel (and hence does not introduce new arguments) is checked by
/// querying `availableValues`.
/// If an operand is not yet available, we recursively check whether it can be
/// made available by sinking its defining op.
/// Operations that are identified for sinking are added to `beneficiaryOps` in
/// the order they should appear in the kernel. Furthermore, `availableValues`
/// is updated with results that will be available after sinking the identified
/// ops.
static bool extractBeneficiaryOps(Operation *op,
                                  llvm::SetVector<Operation *> &beneficiaryOps,
                                  llvm::SetVector<Value> &availableValues) {
  if (beneficiaryOps.count(op))
    return true;
  if (!isSinkingBeneficiary(op))
    return false;
  for (Value operand : op->getOperands()) {
    // It is already visible in the kernel, keep going.
    if (availableValues.count(operand))
      continue;
    // Else check whether it can be made available via sinking.
    Operation *definingOp = operand.getDefiningOp();
    if (!definingOp ||
        !extractBeneficiaryOps(definingOp, beneficiaryOps, availableValues))
      return false;
  }
  // We will sink the operation, mark its results as now available.
  beneficiaryOps.insert(op);
  for (Value result : op->getResults())
    availableValues.insert(result);
  return true;
}
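As a worked example of the recursion (hypothetical IR): assume the launch region uses both %arg0 and %size, which are defined above it as

  %c0 = constant 0 : index
  %size = dim %arg0, %c0 : memref<?xf32>

When `extractBeneficiaryOps` visits the `dim`, its operand %arg0 is already in `availableValues` (it is used in the region, hence a sink candidate), while %c0 is not; the function therefore recurses into the defining `constant`, which has no operands and is recorded first. The resulting order in `beneficiaryOps`, constant before dim, is exactly the defs-before-uses order the cloning loop below relies on.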
LogicalResult mlir::sinkOperationsIntoLaunchOp(gpu::LaunchOp launchOp) {
@@ -65,59 +106,30 @@ LogicalResult mlir::sinkOperationsIntoLaunchOp(gpu::LaunchOp launchOp) {
  llvm::SetVector<Value> sinkCandidates;
  getUsedValuesDefinedAbove(launchOpBody, sinkCandidates);
  llvm::SetVector<Value> sunkValues;
  llvm::SetVector<Operation *> sunkOperations;
  for (Value operand : sinkCandidates) {
  SmallVector<Value, 4> worklist(sinkCandidates.begin(), sinkCandidates.end());
  llvm::SetVector<Operation *> toBeSunk;
  for (Value operand : worklist) {
    Operation *operandOp = operand.getDefiningOp();
    if (!operandOp || !isSinkingBeneficiary(operandOp))
    if (!operandOp)
      continue;
    // Only sink operations that do not create new sinkCandidates.
    if (!llvm::all_of(operandOp->getOperands(), [&sinkCandidates](Value value) {
          return sinkCandidates.count(value);
        }))
      continue;
    sunkValues.insert(operand);
    sunkOperations.insert(operandOp);
    extractBeneficiaryOps(operandOp, toBeSunk, sinkCandidates);
  }
  // Insert operations so that the defs get cloned before uses.
  BlockAndValueMapping map;
  OpBuilder builder(launchOpBody);
  DenseSet<Operation *> processed;
  SmallVector<Operation *, 2> clonedOps;
  while (processed.size() != sunkOperations.size()) {
    auto startSize = processed.size();
    for (Operation *sunkOperation : sunkOperations) {
      if (processed.count(sunkOperation))
        continue;
      // Operation can't be cloned yet if any of its operands is also being
      // sunk, but isn't cloned yet.
      if (llvm::any_of(
              sunkOperation->getOperands(), [&sunkValues, &map](Value value) {
                return sunkValues.count(value) && !map.lookupOrNull(value);
              }))
        continue;
      Operation *clonedOp = builder.clone(*sunkOperation, map);
      // Only replace uses within the launch op.
      for (auto result : llvm::enumerate(sunkOperation->getResults())) {
        auto replacement = clonedOp->getResult(result.index());
        for (auto &use : llvm::make_early_inc_range(result.value().getUses()))
          if (use.getOwner()->getParentOfType<gpu::LaunchOp>() == launchOp)
            use.set(replacement);
      }
      processed.insert(sunkOperation);
    }
    if (startSize == processed.size())
      return launchOp.emitError(
          "found illegal cyclic dependency between operations while sinking");
  for (Operation *op : toBeSunk) {
    Operation *clonedOp = builder.clone(*op, map);
    // Only replace uses within the launch op.
    for (auto pair : llvm::zip(op->getResults(), clonedOp->getResults()))
      replaceAllUsesInRegionWith(std::get<0>(pair), std::get<1>(pair),
                                 launchOp.body());
  }
  return success();
}
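One subtlety: the cloning loop uses `replaceAllUsesInRegionWith`, so uses of a sunk value outside the launch region keep referring to the original op; only uses inside the region are rewired to the clones. A sketch of the net effect (hypothetical IR):

  %c0 = constant 0 : index
  %d = dim %arg0, %c0 : memref<?xf32>
  "outside_use"(%d) : (index) -> ()  // still uses the original %d
  gpu.launch blocks(%bx, %by, %bz) in (...) threads(%tx, %ty, %tz) in (...) {
    %c0_0 = constant 0 : index      // clones, inserted defs before uses
    %d_0 = dim %arg0, %c0_0 : memref<?xf32>
    "inside_use"(%d_0) : (index) -> ()
    gpu.terminator
  }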
// Outline the `gpu.launch` operation body into a kernel function. Replace
// `gpu.terminator` operations by `gpu.return` in the generated function.
/// Outline the `gpu.launch` operation body into a kernel function. Replace
/// `gpu.terminator` operations by `gpu.return` in the generated function.
static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp,
                                            StringRef kernelFnName,
                                            llvm::SetVector<Value> &operands) {
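To make the terminator rewrite concrete: a launch body that ends in `gpu.terminator` is emitted as a `gpu.func` that ends in `gpu.return`. A sketch of the generated function (hypothetical names, assuming a single memref operand):

  gpu.func @foo_kernel(%karg0: memref<?xf32>) kernel {
    "use"(%karg0) : (memref<?xf32>) -> ()
    gpu.return
  }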
@@ -191,9 +203,9 @@ gpu::GPUFuncOp mlir::outlineKernelFunc(gpu::LaunchOp launchOp,
  return funcOp;
}
// Replace `gpu.launch` operations with an `gpu.launch_func` operation launching
// `kernelFunc`. The kernel func contains the body of the `gpu.launch` with
// constant region arguments inlined.
/// Replaces `gpu.launch` operations with a `gpu.launch_func` operation
/// launching `kernelFunc`. The kernel func contains the body of the
/// `gpu.launch` with constant region arguments inlined.
static void convertToLaunchFuncOp(gpu::LaunchOp launchOp,
                                  gpu::GPUFuncOp kernelFunc,
                                  ValueRange operands) {
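Concretely, the `gpu.launch` op and its region are replaced by a single operation carrying the six grid/block sizes followed by the remaining outside values, as also checked by the tests below (hypothetical IR, generic form):

  "gpu.launch_func"(%c8, %c8, %c8, %c8, %c8, %c8, %arg0)
      {kernel = @foo_kernel::@foo_kernel}
      : (index, index, index, index, index, index, memref<?xf32>) -> ()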
@@ -257,7 +269,7 @@ public:
  }
private:
  // Returns a gpu.module containing kernelFunc and all callees (recursive).
  /// Returns a gpu.module containing kernelFunc and all callees (recursive).
  gpu::GPUModuleOp createKernelModule(gpu::GPUFuncOp kernelFunc,
                                      const SymbolTable &parentSymbolTable) {
    // TODO: This code cannot use an OpBuilder because it must be inserted into


@@ -60,7 +60,7 @@ func @launch() {
// -----
// CHECK: module attributes {gpu.container_module}
// CHECK-LABEL: @multiple_launches
func @multiple_launches() {
  // CHECK: %[[CST:.*]] = constant 8 : index
  %cst = constant 8 : index
@@ -88,13 +88,14 @@ func @multiple_launches() {
// -----
func @extra_constants(%arg0 : memref<?xf32>) {
// CHECK-LABEL: @extra_constants_not_inlined
func @extra_constants_not_inlined(%arg0: memref<?xf32>) {
  // CHECK: %[[CST:.*]] = constant 8 : index
  %cst = constant 8 : index
  %cst2 = constant 2 : index
  %c0 = constant 0 : index
  %cst3 = dim %arg0, %c0 : memref<?xf32>
  // CHECK: "gpu.launch_func"(%[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %{{.*}}, %{{.*}}) {kernel = @extra_constants_kernel::@extra_constants_kernel} : (index, index, index, index, index, index, memref<?xf32>, index) -> ()
  %cst3 = "secret_constant"() : () -> index
  // CHECK: "gpu.launch_func"(%[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %{{.*}}, %{{.*}}) {kernel = @extra_constants_not_inlined_kernel::@extra_constants_not_inlined_kernel} : (index, index, index, index, index, index, memref<?xf32>, index) -> ()
  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst,
                                       %grid_z = %cst)
             threads(%tx, %ty, %tz) in (%block_x = %cst, %block_y = %cst,
@@ -105,9 +106,62 @@ func @extra_constants(%arg0 : memref<?xf32>) {
  return
}
// CHECK-LABEL: func @extra_constants_kernel(%{{.*}}: memref<?xf32>, %{{.*}}: index)
// CHECK: constant
// CHECK: constant
// CHECK-LABEL: func @extra_constants_not_inlined_kernel(%{{.*}}: memref<?xf32>, %{{.*}}: index)
// CHECK: constant 2
// -----
// CHECK-LABEL: @extra_constants
// CHECK-SAME: %[[ARG0:.*]]: memref<?xf32>
func @extra_constants(%arg0: memref<?xf32>) {
  // CHECK: %[[CST:.*]] = constant 8 : index
  %cst = constant 8 : index
  %cst2 = constant 2 : index
  %c0 = constant 0 : index
  %cst3 = dim %arg0, %c0 : memref<?xf32>
  // CHECK: "gpu.launch_func"(%[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[ARG0]]) {kernel = @extra_constants_kernel::@extra_constants_kernel} : (index, index, index, index, index, index, memref<?xf32>) -> ()
  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst,
                                       %grid_z = %cst)
             threads(%tx, %ty, %tz) in (%block_x = %cst, %block_y = %cst,
                                        %block_z = %cst) {
    "use"(%cst2, %arg0, %cst3) : (index, memref<?xf32>, index) -> ()
    gpu.terminator
  }
  return
}
// CHECK-LABEL: func @extra_constants_kernel
// CHECK-SAME: %[[KARG0:.*]]: memref<?xf32>
// CHECK: constant 2
// CHECK: constant 0
// CHECK: dim %[[KARG0]]
// -----
// CHECK-LABEL: @extra_constants_noarg
// CHECK-SAME: %[[ARG0:.*]]: memref<?xf32>, %[[ARG1:.*]]: memref<?xf32>
func @extra_constants_noarg(%arg0: memref<?xf32>, %arg1: memref<?xf32>) {
  // CHECK: %[[CST:.*]] = constant 8 : index
  %cst = constant 8 : index
  %cst2 = constant 2 : index
  %c0 = constant 0 : index
  // CHECK: dim %[[ARG1]]
  %cst3 = dim %arg1, %c0 : memref<?xf32>
  // CHECK: "gpu.launch_func"(%[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[ARG0]], %{{.*}}) {kernel = @extra_constants_noarg_kernel::@extra_constants_noarg_kernel} : (index, index, index, index, index, index, memref<?xf32>, index) -> ()
  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst,
                                       %grid_z = %cst)
             threads(%tx, %ty, %tz) in (%block_x = %cst, %block_y = %cst,
                                        %block_z = %cst) {
    "use"(%cst2, %arg0, %cst3) : (index, memref<?xf32>, index) -> ()
    gpu.terminator
  }
  return
}
// CHECK-LABEL: func @extra_constants_noarg_kernel
// CHECK-SAME: %[[KARG0:.*]]: memref<?xf32>, %[[KARG1:.*]]: index
// CHECK: %[[KCST:.*]] = constant 2
// CHECK: "use"(%[[KCST]], %[[KARG0]], %[[KARG1]])
// -----
@@ -135,6 +189,7 @@ func @multiple_uses(%arg0 : memref<?xf32>) {
llvm.mlir.global internal @global(42 : i64) : !llvm.i64
// CHECK-LABEL: @function_call
func @function_call(%arg0 : memref<?xf32>) {
  %cst = constant 8 : index
  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst,