diff --git a/mlir/g3doc/Dialects/GPU.md b/mlir/g3doc/Dialects/GPU.md
index 9adb053b942e..2ebadbe26f47 100644
--- a/mlir/g3doc/Dialects/GPU.md
+++ b/mlir/g3doc/Dialects/GPU.md
@@ -141,10 +141,8 @@ Example:
 func @kernel_1(%arg0 : f32, %arg1 : !llvm<"float*">)
     attributes { nvvm.kernel: true } {
 
-  // Operations that produce block/thread IDs and dimensions will be injected
-  // when outlining the `gpu.launch` body to a function called by
-  // `gpu.launch_func`.
-  // TODO(tjoerg): Implement gpu.launch body outlining.
+  // Operations that produce block/thread IDs and dimensions are injected when
+  // outlining the `gpu.launch` body to a function called by `gpu.launch_func`.
   %tIdX = "gpu.thread_id"() {dimension: "x"} : () -> (index)
   %tIdY = "gpu.thread_id"() {dimension: "y"} : () -> (index)
   %tIdZ = "gpu.thread_id"() {dimension: "z"} : () -> (index)
diff --git a/mlir/include/mlir/GPU/GPUDialect.h b/mlir/include/mlir/GPU/GPUDialect.h
index ccd4e7d6b361..ea7762cc35ae 100644
--- a/mlir/include/mlir/GPU/GPUDialect.h
+++ b/mlir/include/mlir/GPU/GPUDialect.h
@@ -77,6 +77,11 @@ public:
   KernelDim3 getGridSize();
   /// Get the SSA values corresponding to kernel block size.
   KernelDim3 getBlockSize();
+  /// Get the operand values passed as kernel arguments.
+  Operation::operand_range getKernelOperandValues();
+  /// Append the operand types passed as kernel arguments to `out`.
+  void getKernelOperandTypes(SmallVectorImpl<Type> &out);
+
   /// Get the SSA values passed as operands to specify the grid size.
   KernelDim3 getGridSizeOperandValues();
   /// Get the SSA values passed as operands to specify the block size.
diff --git a/mlir/include/mlir/GPU/Passes.h b/mlir/include/mlir/GPU/Passes.h
new file mode 100644
index 000000000000..9dd4ca03302f
--- /dev/null
+++ b/mlir/include/mlir/GPU/Passes.h
@@ -0,0 +1,33 @@
+//===- Passes.h - Pass Entrypoints ------------------------------*- C++ -*-===//
+//
+// Copyright 2019 The MLIR Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+//
+// This header file defines prototypes that expose pass constructors.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_GPU_PASSES_H_
+#define MLIR_GPU_PASSES_H_
+
+namespace mlir {
+
+class ModulePassBase;
+
+ModulePassBase *createGpuKernelOutliningPass();
+
+} // namespace mlir
+
+#endif // MLIR_GPU_PASSES_H_
diff --git a/mlir/lib/GPU/CMakeLists.txt b/mlir/lib/GPU/CMakeLists.txt
index f3b62cffeb8d..f2906268e743 100644
--- a/mlir/lib/GPU/CMakeLists.txt
+++ b/mlir/lib/GPU/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_llvm_library(MLIRGPU
   IR/GPUDialect.cpp
   IR/DialectRegistration.cpp
+  Transforms/KernelOutlining.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/GPU
diff --git a/mlir/lib/GPU/IR/GPUDialect.cpp b/mlir/lib/GPU/IR/GPUDialect.cpp
index 27ee0586bac5..ff4c493ee5ee 100644
--- a/mlir/lib/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/GPU/IR/GPUDialect.cpp
@@ -99,6 +99,18 @@ KernelDim3 LaunchOp::getBlockSize() {
   return KernelDim3{args[9], args[10], args[11]};
 }
 
+Operation::operand_range LaunchOp::getKernelOperandValues() {
+  return {getOperation()->operand_begin() + kNumConfigOperands,
+          getOperation()->operand_end()};
+}
+
+void LaunchOp::getKernelOperandTypes(SmallVectorImpl<Type> &out) {
+  out.reserve(getNumOperands() - kNumConfigOperands + out.size());
+  for (unsigned i = kNumConfigOperands; i < getNumOperands(); ++i) {
+    out.push_back(getOperand(i)->getType());
+  }
+}
+
 KernelDim3 LaunchOp::getGridSizeOperandValues() {
   return KernelDim3{getOperand(0), getOperand(1), getOperand(2)};
 }
diff --git a/mlir/lib/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/GPU/Transforms/KernelOutlining.cpp
new file mode 100644
index 000000000000..006ba4fceb39
--- /dev/null
+++ b/mlir/lib/GPU/Transforms/KernelOutlining.cpp
@@ -0,0 +1,111 @@
+//===- KernelOutlining.cpp - Implementation of GPU kernel outlining -------===//
+//
+// Copyright 2019 The MLIR Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+//
+// This file implements the GPU dialect kernel outlining pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/GPU/GPUDialect.h"
+#include "mlir/IR/BlockAndValueMapping.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/StandardOps/Ops.h"
+
+using namespace mlir;
+
+namespace {
+
+template <typename OpTy>
+void createForAllDimensions(FuncBuilder &builder, Location loc,
+                            SmallVectorImpl<Value *> &values) {
+  for (StringRef dim : {"x", "y", "z"}) {
+    Value *v = builder.create<OpTy>(loc, builder.getIndexType(),
+                                    builder.getStringAttr(dim));
+    values.push_back(v);
+  }
+}
+
+// Add operations generating block/thread ids and grid/block dimensions at the
+// beginning of `kernelFunc` and replace uses of the respective function args.
+void injectGpuIndexOperations(Location loc, Function &kernelFunc) {
+  FuncBuilder funcBuilder(kernelFunc);
+  SmallVector<Value *, 12> indexOps;
+  createForAllDimensions<gpu::BlockIdOp>(funcBuilder, loc, indexOps);
+  createForAllDimensions<gpu::ThreadIdOp>(funcBuilder, loc, indexOps);
+  createForAllDimensions<gpu::GridDimOp>(funcBuilder, loc, indexOps);
+  createForAllDimensions<gpu::BlockDimOp>(funcBuilder, loc, indexOps);
+  // Replace the leading 12 function args with the respective thread/block index
+  // operations. Iterate backwards since args are erased and indices change.
+  for (int i = 11; i >= 0; --i) {
+    auto &firstBlock = kernelFunc.front();
+    firstBlock.getArgument(i)->replaceAllUsesWith(indexOps[i]);
+    firstBlock.eraseArgument(i);
+  }
+}
+
+// Outline the `gpu.launch` operation body into a kernel function.
+Function *outlineKernelFunc(Module &module, gpu::LaunchOp &launchOp) {
+  Location loc = launchOp.getLoc();
+  SmallVector<Type, 4> kernelOperandTypes;
+  launchOp.getKernelOperandTypes(kernelOperandTypes);
+  FunctionType type =
+      FunctionType::get(kernelOperandTypes, {}, module.getContext());
+  std::string kernelFuncName =
+      Twine(launchOp.getOperation()->getFunction()->getName(), "_kernel").str();
+  Function *outlinedFunc = new mlir::Function(loc, kernelFuncName, type);
+  outlinedFunc->getBody().takeBody(launchOp.getBody());
+  Builder builder(&module);
+  outlinedFunc->setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
+                        builder.getUnitAttr());
+  injectGpuIndexOperations(loc, *outlinedFunc);
+  module.getFunctions().push_back(outlinedFunc);
+  return outlinedFunc;
+}
+
+// Replace `gpu.launch` operations with a `gpu.launch_func` operation launching
+// `kernelFunc`.
+void convertToLaunchFuncOp(gpu::LaunchOp &launchOp, Function &kernelFunc) {
+  FuncBuilder funcBuilder(launchOp);
+  SmallVector<Value *, 4> kernelOperandValues(
+      launchOp.getKernelOperandValues());
+  funcBuilder.create<gpu::LaunchFuncOp>(
+      launchOp.getLoc(), &kernelFunc, launchOp.getGridSizeOperandValues(),
+      launchOp.getBlockSizeOperandValues(), kernelOperandValues);
+  launchOp.erase();
+}
+
+} // namespace
+
+class GpuKernelOutliningPass : public ModulePass<GpuKernelOutliningPass> {
+public:
+  void runOnModule() override {
+    for (auto &func : getModule()) {
+      func.walk<mlir::gpu::LaunchOp>([&](mlir::gpu::LaunchOp op) {
+        Function *outlinedFunc = outlineKernelFunc(getModule(), op);
+        convertToLaunchFuncOp(op, *outlinedFunc);
+      });
+    }
+  }
+};
+
+ModulePassBase *createGpuKernelOutliningPass() {
+  return new GpuKernelOutliningPass();
+}
+
+static PassRegistration<GpuKernelOutliningPass>
+    pass("gpu-kernel-outlining",
+         "Outline gpu.launch bodies to kernel functions.");
diff --git a/mlir/test/GPU/outlining.mlir b/mlir/test/GPU/outlining.mlir
new file mode 100644
index 000000000000..7c6e9fcb5a8d
--- /dev/null
+++ b/mlir/test/GPU/outlining.mlir
@@ -0,0 +1,68 @@
+// RUN: mlir-opt -gpu-kernel-outlining -split-input-file %s | FileCheck %s
+
+func @launch() {
+  %0 = "op"() : () -> (f32)
+  %1 = "op"() : () -> (memref<?xf32>)
+  %gDimX = constant 8 : index
+  %gDimY = constant 12 : index
+  %gDimZ = constant 16 : index
+  %bDimX = constant 20 : index
+  %bDimY = constant 24 : index
+  %bDimZ = constant 28 : index
+
+  // CHECK: "gpu.launch_func"(%c8, %c12, %c16, %c20, %c24, %c28, %0, %1) {kernel: @launch_kernel} : (index, index, index, index, index, index, f32, memref<?xf32>) -> ()
+  // CHECK-NOT: gpu.launch blocks
+  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY,
+                                       %grid_z = %gDimZ)
+             threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY,
+                                        %block_z = %bDimZ)
+             args(%arg0 = %0, %arg1 = %1) : f32, memref<?xf32> {
+    "use"(%arg0): (f32) -> ()
+    "some_op"(%bx, %block_x) : (index, index) -> ()
+    %42 = load %arg1[%tx] : memref<?xf32>
+    return
+  }
+  return
+}
+
+// CHECK: func @launch_kernel(%arg0: f32, %arg1: memref<?xf32>)
+// CHECK-NEXT: attributes {gpu.kernel}
+// CHECK-NEXT: %0 = "gpu.block_id"() {dimension: "x"} : () -> index
+// CHECK-NEXT: %1 = "gpu.block_id"() {dimension: "y"} : () -> index
+// CHECK-NEXT: %2 = "gpu.block_id"() {dimension: "z"} : () -> index
+// CHECK-NEXT: %3 = "gpu.thread_id"() {dimension: "x"} : () -> index
+// CHECK-NEXT: %4 = "gpu.thread_id"() {dimension: "y"} : () -> index
+// CHECK-NEXT: %5 = "gpu.thread_id"() {dimension: "z"} : () -> index
+// CHECK-NEXT: %6 = "gpu.grid_dim"() {dimension: "x"} : () -> index
+// CHECK-NEXT: %7 = "gpu.grid_dim"() {dimension: "y"} : () -> index
+// CHECK-NEXT: %8 = "gpu.grid_dim"() {dimension: "z"} : () -> index
+// CHECK-NEXT: %9 = "gpu.block_dim"() {dimension: "x"} : () -> index
+// CHECK-NEXT: %10 = "gpu.block_dim"() {dimension: "y"} : () -> index
+// CHECK-NEXT: %11 = "gpu.block_dim"() {dimension: "z"} : () -> index
+// CHECK-NEXT: "use"(%arg0) : (f32) -> ()
+// CHECK-NEXT: "some_op"(%0, %9) : (index, index) -> ()
+// CHECK-NEXT: %12 = load %arg1[%3] : memref<?xf32>
+
+// -----
+
+func @multiple_launches() {
+  %cst = constant 8 : index
+  // CHECK: "gpu.launch_func"(%c8, %c8, %c8, %c8, %c8, %c8) {kernel: @multiple_launches_kernel} : (index, index, index, index, index, index) -> ()
+  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst,
+                                       %grid_z = %cst)
+             threads(%tx, %ty, %tz) in (%block_x = %cst, %block_y = %cst,
+                                        %block_z = %cst) {
+    return
+  }
+  // CHECK: "gpu.launch_func"(%c8, %c8, %c8, %c8, %c8, %c8) {kernel: @multiple_launches_kernel_0} : (index, index, index, index, index, index) -> ()
+  gpu.launch blocks(%bx2, %by2, %bz2) in (%grid_x2 = %cst, %grid_y2 = %cst,
+                                          %grid_z2 = %cst)
+             threads(%tx2, %ty2, %tz2) in (%block_x2 = %cst, %block_y2 = %cst,
+                                           %block_z2 = %cst) {
+    return
+  }
+  return
+}
+
+// CHECK: func @multiple_launches_kernel()
+// CHECK: func @multiple_launches_kernel_0()