[mlir][GPU] Improve constant sinking in kernel outlining
The previous implementation did not support sinking simple expressions. In particular, it is often beneficial to sink dim operations.

Differential Revision: https://reviews.llvm.org/D88439
commit edeff6e642
parent 7bae2bc5a8
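The motivation can be illustrated with a sketch in the style of the tests below (hypothetical IR, not part of the commit; the function name and the unregistered "use" op are placeholders). A dim defined above a gpu.launch used to cross the launch boundary and become an extra kernel argument during outlining; with this change, the constant index and the dim are sunk and recomputed inside the kernel:

func @sink_example(%arg0: memref<?xf32>) {
  %c1 = constant 1 : index
  %c0 = constant 0 : index
  %size = dim %arg0, %c0 : memref<?xf32>
  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1,
                                       %grid_z = %c1)
             threads(%tx, %ty, %tz) in (%block_x = %c1, %block_y = %c1,
                                        %block_z = %c1) {
    "use"(%size) : (index) -> ()
    gpu.terminator
  }
  return
}

// After sinking, %c0 and the dim are cloned into the launch region, so the
// outlined kernel recomputes the size instead of receiving it as an index
// argument:
//   %0 = constant 0 : index
//   %1 = dim %arg0, %0 : memref<?xf32>
//   "use"(%1) : (index) -> ()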
mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp

@@ -18,6 +18,7 @@
 #include "mlir/IR/BlockAndValueMapping.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/SymbolTable.h"
+#include "mlir/Support/LLVM.h"
 #include "mlir/Transforms/RegionUtils.h"
 
 using namespace mlir;

@@ -32,10 +33,10 @@ static void createForAllDimensions(OpBuilder &builder, Location loc,
   }
 }
 
-// Add operations generating block/thread ids and grid/block dimensions at the
-// beginning of the `launchFuncOpBody` region. Add mapping from argument in
-// entry block of `launchOpBody`, to the corresponding result value of the added
-// operations.
+/// Adds operations generating block/thread ids and grid/block dimensions at the
+/// beginning of the `launchFuncOpBody` region. Adds a mapping from argument in
+/// entry block of `launchOpBody` to the corresponding result value of the
+/// added operations.
 static void injectGpuIndexOperations(Location loc, Region &launchFuncOpBody,
                                      Region &launchOpBody,
                                      BlockAndValueMapping &map) {

@@ -53,8 +54,48 @@ static void injectGpuIndexOperations(Location loc, Region &launchFuncOpBody,
     map.map(firstBlock.getArgument(indexOp.index()), indexOp.value());
 }
 
 /// Identifies operations that are beneficial to sink into kernels. These
 /// operations may not have side-effects, as otherwise sinking (and hence
 /// duplicating them) is not legal.
 static bool isSinkingBeneficiary(Operation *op) {
-  return isa<ConstantOp, DimOp>(op);
+  return isa<ConstantOp, DimOp, SelectOp, CmpIOp>(op);
 }
 
+/// For a given operation `op`, computes whether it is beneficial to sink the
+/// operation into the kernel. An operation can be sunk if doing so does not
+/// introduce new kernel arguments. Whether a value is already available in the
+/// kernel (and hence does not introduce new arguments) is checked by
+/// querying `availableValues`.
+/// If an operand is not yet available, we recursively check whether it can be
+/// made available by sinking its defining op.
+/// Operations that are identified for sinking are added to `beneficiaryOps` in
+/// the order they should appear in the kernel. Furthermore, `availableValues` is
+/// updated with results that will be available after sinking the identified
+/// ops.
+static bool extractBeneficiaryOps(Operation *op,
+                                  llvm::SetVector<Operation *> &beneficiaryOps,
+                                  llvm::SetVector<Value> &availableValues) {
+  if (beneficiaryOps.count(op))
+    return true;
+
+  if (!isSinkingBeneficiary(op))
+    return false;
+
+  for (Value operand : op->getOperands()) {
+    // It is already visible in the kernel, keep going.
+    if (availableValues.count(operand))
+      continue;
+    // Else check whether it can be made available via sinking.
+    Operation *definingOp = operand.getDefiningOp();
+    if (!definingOp ||
+        !extractBeneficiaryOps(definingOp, beneficiaryOps, availableValues))
+      return false;
+  }
+  // We will sink the operation, mark its results as now available.
+  beneficiaryOps.insert(op);
+  for (Value result : op->getResults())
+    availableValues.insert(result);
+  return true;
+}
+
 LogicalResult mlir::sinkOperationsIntoLaunchOp(gpu::LaunchOp launchOp) {
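The recursion in extractBeneficiaryOps is what lets chains of simple expressions be sunk as a unit. A hedged example (illustrative IR, not from the commit): to sink either dim below, its %c0 operand must first be made available; and since SelectOp and CmpIOp are now beneficiaries, a size clamped via compare-and-select can be sunk as a whole:

%c0 = constant 0 : index            // sunk first, as an operand of the dims
%s0 = dim %arg0, %c0 : memref<?xf32>
%s1 = dim %arg1, %c0 : memref<?xf32>
%lt = cmpi "slt", %s0, %s1 : index  // newly a sinking beneficiary
%min = select %lt, %s0, %s1 : index // newly a sinking beneficiary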
@@ -65,59 +106,30 @@ LogicalResult mlir::sinkOperationsIntoLaunchOp(gpu::LaunchOp launchOp) {
   llvm::SetVector<Value> sinkCandidates;
   getUsedValuesDefinedAbove(launchOpBody, sinkCandidates);
 
-  llvm::SetVector<Value> sunkValues;
-  llvm::SetVector<Operation *> sunkOperations;
-  for (Value operand : sinkCandidates) {
+  SmallVector<Value, 4> worklist(sinkCandidates.begin(), sinkCandidates.end());
+  llvm::SetVector<Operation *> toBeSunk;
+  for (Value operand : worklist) {
     Operation *operandOp = operand.getDefiningOp();
-    if (!operandOp || !isSinkingBeneficiary(operandOp))
+    if (!operandOp)
       continue;
-    // Only sink operations that do not create new sinkCandidates.
-    if (!llvm::all_of(operandOp->getOperands(), [&sinkCandidates](Value value) {
-          return sinkCandidates.count(value);
-        }))
-      continue;
-    sunkValues.insert(operand);
-    sunkOperations.insert(operandOp);
+    extractBeneficiaryOps(operandOp, toBeSunk, sinkCandidates);
   }
 
   // Insert operations so that the defs get cloned before uses.
   BlockAndValueMapping map;
   OpBuilder builder(launchOpBody);
-  DenseSet<Operation *> processed;
-  SmallVector<Operation *, 2> clonedOps;
-  while (processed.size() != sunkOperations.size()) {
-    auto startSize = processed.size();
-    for (Operation *sunkOperation : sunkOperations) {
-      if (processed.count(sunkOperation))
-        continue;
-
-      // Operation can't be cloned yet if any of its operands is also being sunk,
-      // but isn't cloned yet.
-      if (llvm::any_of(
-              sunkOperation->getOperands(), [&sunkValues, &map](Value value) {
-                return sunkValues.count(value) && !map.lookupOrNull(value);
-              }))
-        continue;
-
-      Operation *clonedOp = builder.clone(*sunkOperation, map);
-      // Only replace uses within the launch op.
-      for (auto result : llvm::enumerate(sunkOperation->getResults())) {
-        auto replacement = clonedOp->getResult(result.index());
-        for (auto &use : llvm::make_early_inc_range(result.value().getUses()))
-          if (use.getOwner()->getParentOfType<gpu::LaunchOp>() == launchOp)
-            use.set(replacement);
-      }
-      processed.insert(sunkOperation);
-    }
-    if (startSize == processed.size())
-      return launchOp.emitError(
-          "found illegal cyclic dependency between operations while sinking");
+  for (Operation *op : toBeSunk) {
+    Operation *clonedOp = builder.clone(*op, map);
+    // Only replace uses within the launch op.
+    for (auto pair : llvm::zip(op->getResults(), clonedOp->getResults()))
+      replaceAllUsesInRegionWith(std::get<0>(pair), std::get<1>(pair),
+                                 launchOp.body());
   }
   return success();
 }
 
-// Outline the `gpu.launch` operation body into a kernel function. Replace
-// `gpu.terminator` operations by `gpu.return` in the generated function.
+/// Outline the `gpu.launch` operation body into a kernel function. Replace
+/// `gpu.terminator` operations by `gpu.return` in the generated function.
 static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp,
                                             StringRef kernelFnName,
                                             llvm::SetVector<Value> &operands) {
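An observation about the simplified loop above (not commit text): extractBeneficiaryOps appends an operation to toBeSunk only after all of its to-be-sunk operands, so the set is already ordered defs-before-uses. A single pass of builder.clone therefore suffices, and the old fixed-point iteration with its cyclic-dependency error path disappears. Inside the launch body, the clones would appear in this order (launch configuration elided in the sketch):

gpu.launch ... {
  %0 = constant 0 : index            // def cloned before its use below
  %1 = dim %arg0, %0 : memref<?xf32> // %0 already remapped via `map`
  "use"(%1) : (index) -> ()
  gpu.terminator
}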
@@ -191,9 +203,9 @@ gpu::GPUFuncOp mlir::outlineKernelFunc(gpu::LaunchOp launchOp,
   return funcOp;
 }
 
-// Replace `gpu.launch` operations with a `gpu.launch_func` operation launching
-// `kernelFunc`. The kernel func contains the body of the `gpu.launch` with
-// constant region arguments inlined.
+/// Replace `gpu.launch` operations with a `gpu.launch_func` operation
+/// launching `kernelFunc`. The kernel func contains the body of the
+/// `gpu.launch` with constant region arguments inlined.
 static void convertToLaunchFuncOp(gpu::LaunchOp launchOp,
                                   gpu::GPUFuncOp kernelFunc,
                                   ValueRange operands) {

@@ -257,7 +269,7 @@ public:
   }
 
 private:
-  // Returns a gpu.module containing kernelFunc and all callees (recursive).
+  /// Returns a gpu.module containing kernelFunc and all callees (recursive).
   gpu::GPUModuleOp createKernelModule(gpu::GPUFuncOp kernelFunc,
                                       const SymbolTable &parentSymbolTable) {
     // TODO: This code cannot use an OpBuilder because it must be inserted into

mlir/test/Dialect/GPU/outlining.mlir
@@ -60,7 +60,7 @@ func @launch() {
 
 // -----
 
 // CHECK: module attributes {gpu.container_module}
 
 // CHECK-LABEL: @multiple_launches
 func @multiple_launches() {
   // CHECK: %[[CST:.*]] = constant 8 : index
   %cst = constant 8 : index

@@ -88,13 +88,14 @@ func @multiple_launches() {
 
 // -----
 
-func @extra_constants(%arg0 : memref<?xf32>) {
+// CHECK-LABEL: @extra_constants_not_inlined
+func @extra_constants_not_inlined(%arg0: memref<?xf32>) {
   // CHECK: %[[CST:.*]] = constant 8 : index
   %cst = constant 8 : index
   %cst2 = constant 2 : index
   %c0 = constant 0 : index
-  %cst3 = dim %arg0, %c0 : memref<?xf32>
-  // CHECK: "gpu.launch_func"(%[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %{{.*}}, %{{.*}}) {kernel = @extra_constants_kernel::@extra_constants_kernel} : (index, index, index, index, index, index, memref<?xf32>, index) -> ()
+  %cst3 = "secret_constant"() : () -> index
+  // CHECK: "gpu.launch_func"(%[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %{{.*}}, %{{.*}}) {kernel = @extra_constants_not_inlined_kernel::@extra_constants_not_inlined_kernel} : (index, index, index, index, index, index, memref<?xf32>, index) -> ()
   gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst,
                                        %grid_z = %cst)
              threads(%tx, %ty, %tz) in (%block_x = %cst, %block_y = %cst,
@@ -105,9 +106,62 @@ func @extra_constants(%arg0 : memref<?xf32>) {
   return
 }
 
-// CHECK-LABEL: func @extra_constants_kernel(%{{.*}}: memref<?xf32>, %{{.*}}: index)
-// CHECK: constant
-// CHECK: constant
+// CHECK-LABEL: func @extra_constants_not_inlined_kernel(%{{.*}}: memref<?xf32>, %{{.*}}: index)
+// CHECK: constant 2
+
+// -----
+
+// CHECK-LABEL: @extra_constants
+// CHECK-SAME: %[[ARG0:.*]]: memref<?xf32>
+func @extra_constants(%arg0: memref<?xf32>) {
+  // CHECK: %[[CST:.*]] = constant 8 : index
+  %cst = constant 8 : index
+  %cst2 = constant 2 : index
+  %c0 = constant 0 : index
+  %cst3 = dim %arg0, %c0 : memref<?xf32>
+  // CHECK: "gpu.launch_func"(%[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[ARG0]]) {kernel = @extra_constants_kernel::@extra_constants_kernel} : (index, index, index, index, index, index, memref<?xf32>) -> ()
+  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst,
+                                       %grid_z = %cst)
+             threads(%tx, %ty, %tz) in (%block_x = %cst, %block_y = %cst,
+                                        %block_z = %cst) {
+    "use"(%cst2, %arg0, %cst3) : (index, memref<?xf32>, index) -> ()
+    gpu.terminator
+  }
+  return
+}
+
+// CHECK-LABEL: func @extra_constants_kernel
+// CHECK-SAME: %[[KARG0:.*]]: memref<?xf32>
+// CHECK: constant 2
+// CHECK: constant 0
+// CHECK: dim %[[KARG0]]
+
+// -----
+
+// CHECK-LABEL: @extra_constants_noarg
+// CHECK-SAME: %[[ARG0:.*]]: memref<?xf32>, %[[ARG1:.*]]: memref<?xf32>
+func @extra_constants_noarg(%arg0: memref<?xf32>, %arg1: memref<?xf32>) {
+  // CHECK: %[[CST:.*]] = constant 8 : index
+  %cst = constant 8 : index
+  %cst2 = constant 2 : index
+  %c0 = constant 0 : index
+  // CHECK: dim %[[ARG1]]
+  %cst3 = dim %arg1, %c0 : memref<?xf32>
+  // CHECK: "gpu.launch_func"(%[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[ARG0]], %{{.*}}) {kernel = @extra_constants_noarg_kernel::@extra_constants_noarg_kernel} : (index, index, index, index, index, index, memref<?xf32>, index) -> ()
+  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst,
+                                       %grid_z = %cst)
+             threads(%tx, %ty, %tz) in (%block_x = %cst, %block_y = %cst,
+                                        %block_z = %cst) {
+    "use"(%cst2, %arg0, %cst3) : (index, memref<?xf32>, index) -> ()
+    gpu.terminator
+  }
+  return
+}
+
+// CHECK-LABEL: func @extra_constants_noarg_kernel
+// CHECK-SAME: %[[KARG0:.*]]: memref<?xf32>, %[[KARG1:.*]]: index
+// CHECK: %[[KCST:.*]] = constant 2
+// CHECK: "use"(%[[KCST]], %[[KARG0]], %[[KARG1]])
 
 // -----
 
@@ -135,6 +189,7 @@ func @multiple_uses(%arg0 : memref<?xf32>) {
 
 llvm.mlir.global internal @global(42 : i64) : !llvm.i64
 
+//CHECK-LABEL: @function_call
 func @function_call(%arg0 : memref<?xf32>) {
   %cst = constant 8 : index
   gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst,