[mlir][GPU] Improve constant sinking in kernel outlining

The previous implementation did not support sinking simple expressions. In particular,
it is often beneficial to sink dim operations.
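For example, a size computed above the launch can now be rematerialized inside it, so that outlining later passes one value less across the launch boundary. A minimal before/after sketch (hypothetical IR in the style of the tests in this patch):

  // Before sinking: %c0 and %size are defined above the launch.
  %c0 = constant 0 : index
  %size = dim %arg0, %c0 : memref<?xf32>
  gpu.launch blocks(%bx, %by, %bz) in (...) threads(%tx, %ty, %tz) in (...) {
    "use"(%size) : (index) -> ()
    gpu.terminator
  }

  // After sinking: both ops are cloned into the region, and %size no
  // longer needs to be passed into the outlined kernel.
  gpu.launch blocks(%bx, %by, %bz) in (...) threads(%tx, %ty, %tz) in (...) {
    %c0_0 = constant 0 : index
    %size_0 = dim %arg0, %c0_0 : memref<?xf32>
    "use"(%size_0) : (index) -> ()
    gpu.terminator
  }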

Differential Revision: https://reviews.llvm.org/D88439
Stephan Herhut 2020-09-29 13:20:37 +02:00
parent 7bae2bc5a8
commit edeff6e642
2 changed files with 125 additions and 58 deletions


@@ -18,6 +18,7 @@
#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/SymbolTable.h"
#include "mlir/Support/LLVM.h"
#include "mlir/Transforms/RegionUtils.h"
using namespace mlir;
@@ -32,10 +33,10 @@ static void createForAllDimensions(OpBuilder &builder, Location loc,
  }
}
// Add operations generating block/thread ids and grid/block dimensions at the
// beginning of the `launchFuncOpBody` region. Add mapping from argument in
// entry block of `launchOpBody`, to the corresponding result value of the added
// operations.
/// Adds operations generating block/thread ids and grid/block dimensions at
/// the beginning of the `launchFuncOpBody` region, and adds a mapping from
/// the arguments of the entry block of `launchOpBody` to the corresponding
/// result values of the added operations.
static void injectGpuIndexOperations(Location loc, Region &launchFuncOpBody,
                                     Region &launchOpBody,
                                     BlockAndValueMapping &map) {
@@ -53,8 +54,48 @@ static void injectGpuIndexOperations(Location loc, Region &launchFuncOpBody,
    map.map(firstBlock.getArgument(indexOp.index()), indexOp.value());
}
/// Identifies operations that are beneficial to sink into kernels. These
/// operations must not have side effects, as otherwise sinking (and hence
/// duplicating) them is not legal.
static bool isSinkingBeneficiary(Operation *op) {
  return isa<ConstantOp, DimOp>(op);
  return isa<ConstantOp, DimOp, SelectOp, CmpIOp>(op);
}
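Because `select` and `cmpi` have no side effects, whole index computations built from them can now follow their operands into the kernel. A small sketch in the style of the tests in this patch (hypothetical IR):

  %c0 = constant 0 : index
  %c1 = constant 1 : index
  %d = dim %arg0, %c0 : memref<?xf32>
  %p = cmpi "sgt", %d, %c1 : index
  %m = select %p, %d, %c1 : index
  // All five ops are sinking beneficiaries; if %arg0 is already a kernel
  // argument, the whole chain can be sunk without adding new arguments.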
/// For a given operation `op`, computes whether it is beneficial to sink the
/// operation into the kernel. An operation can be sunk if doing so does not
/// introduce new kernel arguments. Whether a value is already available in the
/// kernel (and hence does not introduce new arguments) is checked by
/// querying `availableValues`.
/// If an operand is not yet available, we recursively check whether it can be
/// made available by sinking its defining op.
/// Operations that are identified for sinking are added to `beneficiaryOps` in
/// the order they should appear in the kernel. Furthermore, `availableValues`
/// is updated with results that will be available after sinking the identified
/// ops.
static bool extractBeneficiaryOps(Operation *op,
                                  llvm::SetVector<Operation *> &beneficiaryOps,
                                  llvm::SetVector<Value> &availableValues) {
  if (beneficiaryOps.count(op))
    return true;
  if (!isSinkingBeneficiary(op))
    return false;
  for (Value operand : op->getOperands()) {
    // It is already visible in the kernel, keep going.
    if (availableValues.count(operand))
      continue;
    // Else check whether it can be made available via sinking.
    Operation *definingOp = operand.getDefiningOp();
    if (!definingOp ||
        !extractBeneficiaryOps(definingOp, beneficiaryOps, availableValues))
      return false;
  }
  // We will sink the operation, mark its results as now available.
  beneficiaryOps.insert(op);
  for (Value result : op->getResults())
    availableValues.insert(result);
  return true;
}
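As a worked example of the recursion (hypothetical IR): assume the launch region uses both %arg0 and %size, which are defined above it as

  %c0 = constant 0 : index
  %size = dim %arg0, %c0 : memref<?xf32>

When `extractBeneficiaryOps` visits the `dim`, its operand %arg0 is already in `availableValues` (it is used in the region, hence a sink candidate), while %c0 is not; the function therefore recurses into the defining `constant`, which has no operands and is recorded first. The resulting order in `beneficiaryOps`, constant before dim, is exactly the defs-before-uses order the cloning loop below relies on.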
LogicalResult mlir::sinkOperationsIntoLaunchOp(gpu::LaunchOp launchOp) {
@@ -65,59 +106,30 @@ LogicalResult mlir::sinkOperationsIntoLaunchOp(gpu::LaunchOp launchOp) {
  llvm::SetVector<Value> sinkCandidates;
  getUsedValuesDefinedAbove(launchOpBody, sinkCandidates);
  llvm::SetVector<Value> sunkValues;
  llvm::SetVector<Operation *> sunkOperations;
  for (Value operand : sinkCandidates) {
  SmallVector<Value, 4> worklist(sinkCandidates.begin(), sinkCandidates.end());
  llvm::SetVector<Operation *> toBeSunk;
  for (Value operand : worklist) {
    Operation *operandOp = operand.getDefiningOp();
    if (!operandOp || !isSinkingBeneficiary(operandOp))
    if (!operandOp)
      continue;
    // Only sink operations that do not create new sinkCandidates.
    if (!llvm::all_of(operandOp->getOperands(), [&sinkCandidates](Value value) {
          return sinkCandidates.count(value);
        }))
      continue;
    sunkValues.insert(operand);
    sunkOperations.insert(operandOp);
    extractBeneficiaryOps(operandOp, toBeSunk, sinkCandidates);
  }
  // Insert operations so that the defs get cloned before uses.
  BlockAndValueMapping map;
  OpBuilder builder(launchOpBody);
  DenseSet<Operation *> processed;
  SmallVector<Operation *, 2> clonedOps;
  while (processed.size() != sunkOperations.size()) {
    auto startSize = processed.size();
    for (Operation *sunkOperation : sunkOperations) {
      if (processed.count(sunkOperation))
        continue;
      // Operation can't be cloned yet if any of its operands is also being
      // sunk, but isn't cloned yet.
      if (llvm::any_of(
              sunkOperation->getOperands(), [&sunkValues, &map](Value value) {
                return sunkValues.count(value) && !map.lookupOrNull(value);
              }))
        continue;
      Operation *clonedOp = builder.clone(*sunkOperation, map);
      // Only replace uses within the launch op.
      for (auto result : llvm::enumerate(sunkOperation->getResults())) {
        auto replacement = clonedOp->getResult(result.index());
        for (auto &use : llvm::make_early_inc_range(result.value().getUses()))
          if (use.getOwner()->getParentOfType<gpu::LaunchOp>() == launchOp)
            use.set(replacement);
      }
      processed.insert(sunkOperation);
    }
    if (startSize == processed.size())
      return launchOp.emitError(
          "found illegal cyclic dependency between operations while sinking");
  for (Operation *op : toBeSunk) {
    Operation *clonedOp = builder.clone(*op, map);
    // Only replace uses within the launch op.
    for (auto pair : llvm::zip(op->getResults(), clonedOp->getResults()))
      replaceAllUsesInRegionWith(std::get<0>(pair), std::get<1>(pair),
                                 launchOp.body());
  }
  return success();
}
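One subtlety: the cloning loop uses `replaceAllUsesInRegionWith`, so uses of a sunk value outside the launch region keep referring to the original op; only uses inside the region are rewired to the clones. A sketch of the net effect (hypothetical IR):

  %c0 = constant 0 : index
  %d = dim %arg0, %c0 : memref<?xf32>
  "outside_use"(%d) : (index) -> ()  // still uses the original %d
  gpu.launch blocks(%bx, %by, %bz) in (...) threads(%tx, %ty, %tz) in (...) {
    %c0_0 = constant 0 : index      // clones, inserted defs before uses
    %d_0 = dim %arg0, %c0_0 : memref<?xf32>
    "inside_use"(%d_0) : (index) -> ()
    gpu.terminator
  }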
// Outline the `gpu.launch` operation body into a kernel function. Replace
// `gpu.terminator` operations by `gpu.return` in the generated function.
/// Outline the `gpu.launch` operation body into a kernel function. Replace
/// `gpu.terminator` operations by `gpu.return` in the generated function.
static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp,
                                            StringRef kernelFnName,
                                            llvm::SetVector<Value> &operands) {
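To make the terminator rewrite concrete: a launch body that ends in `gpu.terminator` is emitted as a `gpu.func` that ends in `gpu.return`. A sketch of the generated function (hypothetical names, assuming a single memref operand):

  gpu.func @foo_kernel(%karg0: memref<?xf32>) kernel {
    "use"(%karg0) : (memref<?xf32>) -> ()
    gpu.return
  }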
@@ -191,9 +203,9 @@ gpu::GPUFuncOp mlir::outlineKernelFunc(gpu::LaunchOp launchOp,
  return funcOp;
}
// Replace `gpu.launch` operations with an `gpu.launch_func` operation launching
// `kernelFunc`. The kernel func contains the body of the `gpu.launch` with
// constant region arguments inlined.
/// Replaces `gpu.launch` operations with a `gpu.launch_func` operation
/// launching `kernelFunc`. The kernel func contains the body of the
/// `gpu.launch` with constant region arguments inlined.
static void convertToLaunchFuncOp(gpu::LaunchOp launchOp,
                                  gpu::GPUFuncOp kernelFunc,
                                  ValueRange operands) {
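Concretely, the `gpu.launch` op and its region are replaced by a single operation carrying the six grid/block sizes followed by the remaining outside values, as also checked by the tests below (hypothetical IR, generic form):

  "gpu.launch_func"(%c8, %c8, %c8, %c8, %c8, %c8, %arg0)
      {kernel = @foo_kernel::@foo_kernel}
      : (index, index, index, index, index, index, memref<?xf32>) -> ()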
@@ -257,7 +269,7 @@ public:
  }
private:
  // Returns a gpu.module containing kernelFunc and all callees (recursive).
  /// Returns a gpu.module containing kernelFunc and all callees (recursive).
  gpu::GPUModuleOp createKernelModule(gpu::GPUFuncOp kernelFunc,
                                      const SymbolTable &parentSymbolTable) {
    // TODO: This code cannot use an OpBuilder because it must be inserted into


@@ -60,7 +60,7 @@ func @launch() {
// -----
// CHECK: module attributes {gpu.container_module}
// CHECK-LABEL: @multiple_launches
func @multiple_launches() {
  // CHECK: %[[CST:.*]] = constant 8 : index
  %cst = constant 8 : index
@@ -88,13 +88,14 @@ func @multiple_launches() {
// -----
func @extra_constants(%arg0 : memref<?xf32>) {
// CHECK-LABEL: @extra_constants_not_inlined
func @extra_constants_not_inlined(%arg0: memref<?xf32>) {
  // CHECK: %[[CST:.*]] = constant 8 : index
  %cst = constant 8 : index
  %cst2 = constant 2 : index
  %c0 = constant 0 : index
  %cst3 = dim %arg0, %c0 : memref<?xf32>
  // CHECK: "gpu.launch_func"(%[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %{{.*}}, %{{.*}}) {kernel = @extra_constants_kernel::@extra_constants_kernel} : (index, index, index, index, index, index, memref<?xf32>, index) -> ()
  %cst3 = "secret_constant"() : () -> index
  // CHECK: "gpu.launch_func"(%[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %{{.*}}, %{{.*}}) {kernel = @extra_constants_not_inlined_kernel::@extra_constants_not_inlined_kernel} : (index, index, index, index, index, index, memref<?xf32>, index) -> ()
  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst,
                                       %grid_z = %cst)
             threads(%tx, %ty, %tz) in (%block_x = %cst, %block_y = %cst,
@@ -105,9 +106,62 @@ func @extra_constants(%arg0 : memref<?xf32>) {
  return
}
// CHECK-LABEL: func @extra_constants_kernel(%{{.*}}: memref<?xf32>, %{{.*}}: index)
// CHECK: constant
// CHECK: constant
// CHECK-LABEL: func @extra_constants_not_inlined_kernel(%{{.*}}: memref<?xf32>, %{{.*}}: index)
// CHECK: constant 2
// -----
// CHECK-LABEL: @extra_constants
// CHECK-SAME: %[[ARG0:.*]]: memref<?xf32>
func @extra_constants(%arg0: memref<?xf32>) {
  // CHECK: %[[CST:.*]] = constant 8 : index
  %cst = constant 8 : index
  %cst2 = constant 2 : index
  %c0 = constant 0 : index
  %cst3 = dim %arg0, %c0 : memref<?xf32>
  // CHECK: "gpu.launch_func"(%[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[ARG0]]) {kernel = @extra_constants_kernel::@extra_constants_kernel} : (index, index, index, index, index, index, memref<?xf32>) -> ()
  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst,
                                       %grid_z = %cst)
             threads(%tx, %ty, %tz) in (%block_x = %cst, %block_y = %cst,
                                        %block_z = %cst) {
    "use"(%cst2, %arg0, %cst3) : (index, memref<?xf32>, index) -> ()
    gpu.terminator
  }
  return
}
// CHECK-LABEL: func @extra_constants_kernel
// CHECK-SAME: %[[KARG0:.*]]: memref<?xf32>
// CHECK: constant 2
// CHECK: constant 0
// CHECK: dim %[[KARG0]]
// -----
// CHECK-LABEL: @extra_constants_noarg
// CHECK-SAME: %[[ARG0:.*]]: memref<?xf32>, %[[ARG1:.*]]: memref<?xf32>
func @extra_constants_noarg(%arg0: memref<?xf32>, %arg1: memref<?xf32>) {
  // CHECK: %[[CST:.*]] = constant 8 : index
  %cst = constant 8 : index
  %cst2 = constant 2 : index
  %c0 = constant 0 : index
  // CHECK: dim %[[ARG1]]
  %cst3 = dim %arg1, %c0 : memref<?xf32>
  // CHECK: "gpu.launch_func"(%[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[ARG0]], %{{.*}}) {kernel = @extra_constants_noarg_kernel::@extra_constants_noarg_kernel} : (index, index, index, index, index, index, memref<?xf32>, index) -> ()
  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst,
                                       %grid_z = %cst)
             threads(%tx, %ty, %tz) in (%block_x = %cst, %block_y = %cst,
                                        %block_z = %cst) {
    "use"(%cst2, %arg0, %cst3) : (index, memref<?xf32>, index) -> ()
    gpu.terminator
  }
  return
}
// CHECK-LABEL: func @extra_constants_noarg_kernel
// CHECK-SAME: %[[KARG0:.*]]: memref<?xf32>, %[[KARG1:.*]]: index
// CHECK: %[[KCST:.*]] = constant 2
// CHECK: "use"(%[[KCST]], %[[KARG0]], %[[KARG1]])
// -----
@@ -135,6 +189,7 @@ func @multiple_uses(%arg0 : memref<?xf32>) {
llvm.mlir.global internal @global(42 : i64) : !llvm.i64
// CHECK-LABEL: @function_call
func @function_call(%arg0 : memref<?xf32>) {
  %cst = constant 8 : index
  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst,