forked from OSchip/llvm-project
Outline GPU kernel function into a nested module.
Roll forward of commit 5684a12.
When outlining GPU kernels, put the kernel function inside a nested module. Then use a nested pipeline to generate the cubins, independently per kernel. In a final pass, move the cubins back to the parent module.
PiperOrigin-RevId: 270639748
This commit is contained in:
parent
c900d4994e
commit
b8676da1fc
|
@ -26,10 +26,6 @@ class OwningRewritePatternList;
|
|||
class ModuleOp;
|
||||
template <typename OpT> class OpPassBase;
|
||||
|
||||
/// Collect a set of patterns to convert from the GPU dialect to NVVM.
|
||||
void populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter,
|
||||
OwningRewritePatternList &patterns);
|
||||
|
||||
/// Creates a pass that lowers GPU dialect operations to NVVM counterparts.
|
||||
std::unique_ptr<OpPassBase<ModuleOp>> createLowerGpuOpsToNVVMOpsPass();
|
||||
|
||||
|
|
|
@ -41,9 +41,12 @@ public:
|
|||
/// Get the canonical string name of the dialect.
|
||||
static StringRef getDialectName();
|
||||
|
||||
/// Get the name of the attribute used to annotate outlined kernel functions.
|
||||
/// Get the name of the attribute used to annotate external kernel functions.
|
||||
static StringRef getKernelFuncAttrName() { return "gpu.kernel"; }
|
||||
|
||||
/// Get the name of the attribute used to annotate kernel modules.
|
||||
static StringRef getKernelModuleAttrName() { return "gpu.kernel_module"; }
|
||||
|
||||
/// Returns whether the given function is a kernel function, i.e., has the
|
||||
/// 'gpu.kernel' attribute.
|
||||
static bool isKernel(FuncOp function);
|
||||
|
|
|
@ -49,26 +49,37 @@ namespace {
|
|||
// TODO(herhut): Move to shared location.
|
||||
static constexpr const char *kCubinAnnotation = "nvvm.cubin";
|
||||
|
||||
/// A pass converting tagged kernel functions to cubin blobs.
|
||||
/// A pass converting tagged kernel modules to cubin blobs.
|
||||
///
|
||||
/// If tagged as a kernel module, each contained function is translated to NVVM
|
||||
/// IR and further to PTX. A user provided CubinGenerator compiles the PTX to
|
||||
/// GPU binary code, which is then attached as an attribute to the function. The
|
||||
/// function body is erased.
|
||||
class GpuKernelToCubinPass : public ModulePass<GpuKernelToCubinPass> {
|
||||
public:
|
||||
GpuKernelToCubinPass(
|
||||
CubinGenerator cubinGenerator = compilePtxToCubinForTesting)
|
||||
: cubinGenerator(cubinGenerator) {}
|
||||
|
||||
// Run the dialect converter on the module.
|
||||
void runOnModule() override {
|
||||
if (!getModule().getAttrOfType<UnitAttr>(
|
||||
gpu::GPUDialect::getKernelModuleAttrName()))
|
||||
return;
|
||||
|
||||
// Make sure the NVPTX target is initialized.
|
||||
LLVMInitializeNVPTXTarget();
|
||||
LLVMInitializeNVPTXTargetInfo();
|
||||
LLVMInitializeNVPTXTargetMC();
|
||||
LLVMInitializeNVPTXAsmPrinter();
|
||||
|
||||
auto llvmModule = translateModuleToNVVMIR(getModule());
|
||||
if (!llvmModule)
|
||||
return signalPassFailure();
|
||||
|
||||
for (auto function : getModule().getOps<FuncOp>()) {
|
||||
if (!gpu::GPUDialect::isKernel(function) || function.isExternal()) {
|
||||
if (!gpu::GPUDialect::isKernel(function))
|
||||
continue;
|
||||
}
|
||||
if (failed(translateGpuKernelToCubinAnnotation(function)))
|
||||
if (failed(translateGpuKernelToCubinAnnotation(*llvmModule, function)))
|
||||
signalPassFailure();
|
||||
}
|
||||
}
|
||||
|
@ -79,8 +90,13 @@ private:
|
|||
|
||||
std::string translateModuleToPtx(llvm::Module &module,
|
||||
llvm::TargetMachine &target_machine);
|
||||
|
||||
/// Converts llvmModule to cubin using the user-provided generator.
|
||||
OwnedCubin convertModuleToCubin(llvm::Module &llvmModule, FuncOp &function);
|
||||
LogicalResult translateGpuKernelToCubinAnnotation(FuncOp &function);
|
||||
|
||||
/// Translates llvmModule to cubin and assigns it to attribute of function.
|
||||
LogicalResult translateGpuKernelToCubinAnnotation(llvm::Module &llvmModule,
|
||||
FuncOp &function);
|
||||
|
||||
CubinGenerator cubinGenerator;
|
||||
};
|
||||
|
@ -135,22 +151,13 @@ OwnedCubin GpuKernelToCubinPass::convertModuleToCubin(llvm::Module &llvmModule,
|
|||
return cubinGenerator(ptx, function);
|
||||
}
|
||||
|
||||
LogicalResult
|
||||
GpuKernelToCubinPass::translateGpuKernelToCubinAnnotation(FuncOp &function) {
|
||||
Builder builder(function.getContext());
|
||||
|
||||
OwningModuleRef module = ModuleOp::create(function.getLoc());
|
||||
|
||||
// TODO(herhut): Also handle called functions.
|
||||
module->push_back(function.clone());
|
||||
|
||||
auto llvmModule = translateModuleToNVVMIR(*module);
|
||||
auto cubin = convertModuleToCubin(*llvmModule, function);
|
||||
|
||||
if (!cubin) {
|
||||
LogicalResult GpuKernelToCubinPass::translateGpuKernelToCubinAnnotation(
|
||||
llvm::Module &llvmModule, FuncOp &function) {
|
||||
auto cubin = convertModuleToCubin(llvmModule, function);
|
||||
if (!cubin)
|
||||
return function.emitError("translation to CUDA binary failed.");
|
||||
}
|
||||
|
||||
Builder builder(function.getContext());
|
||||
function.setAttr(kCubinAnnotation,
|
||||
builder.getStringAttr({cubin->data(), cubin->size()}));
|
||||
|
||||
|
|
|
@ -43,8 +43,15 @@ constexpr const char *kCubinGetterAnnotation = "nvvm.cubingetter";
|
|||
constexpr const char *kCubinGetterSuffix = "_cubin";
|
||||
constexpr const char *kCubinStorageSuffix = "_cubin_cst";
|
||||
|
||||
/// A pass generating global strings and getter functions for all cubin blobs
|
||||
/// annotated on functions via the nvvm.cubin attribute.
|
||||
/// A pass which moves cubin from function attributes in nested modules
|
||||
/// to global strings and generates getter functions.
|
||||
///
|
||||
/// The GpuKernelToCubinPass annotates kernel functions with compiled device
|
||||
/// code blobs. These functions reside in nested modules generated by
|
||||
/// GpuKernelOutliningPass. This pass consumes these modules and moves the cubin
|
||||
/// blobs back to the parent module as global strings and generates accessor
|
||||
/// functions for them. The external kernel functions (also generated by the
|
||||
/// outlining pass) are annotated with the symbol of the cubin accessor.
|
||||
class GpuGenerateCubinAccessorsPass
|
||||
: public ModulePass<GpuGenerateCubinAccessorsPass> {
|
||||
private:
|
||||
|
@ -55,18 +62,25 @@ private:
|
|||
}
|
||||
|
||||
// Inserts a global constant string containing `blob` into the parent module
|
||||
// of `orig` and generates the function that returns the address of the first
|
||||
// character of this string.
|
||||
// of `kernelFunc` and generates the function that returns the address of the
|
||||
// first character of this string.
|
||||
// TODO(herhut): consider fusing this pass with launch-func-to-cuda.
|
||||
void generate(FuncOp orig, StringAttr blob) {
|
||||
Location loc = orig.getLoc();
|
||||
SmallString<128> nameBuffer(orig.getName());
|
||||
auto module = orig.getParentOfType<ModuleOp>();
|
||||
void generate(FuncOp kernelFunc, StringAttr blob) {
|
||||
auto stubFunc = getModule().lookupSymbol<FuncOp>(kernelFunc.getName());
|
||||
if (!stubFunc) {
|
||||
kernelFunc.emitError(
|
||||
"corresponding external function not found in parent module");
|
||||
return signalPassFailure();
|
||||
}
|
||||
|
||||
Location loc = stubFunc.getLoc();
|
||||
SmallString<128> nameBuffer(stubFunc.getName());
|
||||
auto module = stubFunc.getParentOfType<ModuleOp>();
|
||||
assert(module && "function must belong to a module");
|
||||
|
||||
// Insert the getter function just after the original function.
|
||||
OpBuilder moduleBuilder(module.getBody(), module.getBody()->begin());
|
||||
moduleBuilder.setInsertionPoint(orig.getOperation()->getNextNode());
|
||||
moduleBuilder.setInsertionPoint(stubFunc.getOperation()->getNextNode());
|
||||
auto getterType = moduleBuilder.getFunctionType(
|
||||
llvm::None, LLVM::LLVMType::getInt8PtrTy(llvmDialect));
|
||||
nameBuffer.append(kCubinGetterSuffix);
|
||||
|
@ -75,7 +89,7 @@ private:
|
|||
Block *entryBlock = result.addEntryBlock();
|
||||
|
||||
// Drop the getter suffix before appending the storage suffix.
|
||||
nameBuffer.resize(orig.getName().size());
|
||||
nameBuffer.resize(stubFunc.getName().size());
|
||||
nameBuffer.append(kCubinStorageSuffix);
|
||||
|
||||
// Obtain the address of the first character of the global string containing
|
||||
|
@ -86,21 +100,23 @@ private:
|
|||
builder.create<LLVM::ReturnOp>(loc, startPtr);
|
||||
|
||||
// Store the name of the getter on the function for easier lookup.
|
||||
orig.setAttr(kCubinGetterAnnotation, builder.getSymbolRefAttr(result));
|
||||
stubFunc.setAttr(kCubinGetterAnnotation, builder.getSymbolRefAttr(result));
|
||||
}
|
||||
|
||||
public:
|
||||
// Perform the conversion on the module. This may insert globals, so it
|
||||
// cannot be done on multiple functions in parallel.
|
||||
void runOnModule() override {
|
||||
llvmDialect =
|
||||
getModule().getContext()->getRegisteredDialect<LLVM::LLVMDialect>();
|
||||
llvmDialect = getContext().getRegisteredDialect<LLVM::LLVMDialect>();
|
||||
|
||||
for (auto func : getModule().getOps<FuncOp>()) {
|
||||
StringAttr cubinBlob = func.getAttrOfType<StringAttr>(kCubinAnnotation);
|
||||
if (!cubinBlob)
|
||||
auto modules = getModule().getOps<ModuleOp>();
|
||||
for (auto module : llvm::make_early_inc_range(modules)) {
|
||||
if (!module.getAttrOfType<UnitAttr>(
|
||||
gpu::GPUDialect::getKernelModuleAttrName()))
|
||||
continue;
|
||||
generate(func, cubinBlob);
|
||||
for (auto func : module.getOps<FuncOp>()) {
|
||||
if (StringAttr blob = func.getAttrOfType<StringAttr>(kCubinAnnotation))
|
||||
generate(func, blob);
|
||||
}
|
||||
module.erase();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -23,6 +23,7 @@
|
|||
#include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"
|
||||
|
||||
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
|
||||
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h"
|
||||
#include "mlir/Dialect/GPU/GPUDialect.h"
|
||||
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
|
||||
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
|
||||
|
@ -38,23 +39,6 @@ using namespace mlir;
|
|||
|
||||
namespace {
|
||||
|
||||
// Rewriting that replaces the types of a LaunchFunc operation with their
|
||||
// LLVM counterparts.
|
||||
struct GPULaunchFuncOpLowering : public LLVMOpLowering {
|
||||
public:
|
||||
explicit GPULaunchFuncOpLowering(LLVMTypeConverter &lowering_)
|
||||
: LLVMOpLowering(gpu::LaunchFuncOp::getOperationName(),
|
||||
lowering_.getDialect()->getContext(), lowering_) {}
|
||||
|
||||
// Convert the kernel arguments to an LLVM type, preserve the rest.
|
||||
PatternMatchResult
|
||||
matchAndRewrite(Operation *op, ArrayRef<Value *> operands,
|
||||
ConversionPatternRewriter &rewriter) const override {
|
||||
rewriter.clone(*op)->setOperands(operands);
|
||||
return rewriter.replaceOp(op, llvm::None), matchSuccess();
|
||||
}
|
||||
};
|
||||
|
||||
// Rewriting that replaces Op with XOp, YOp, or ZOp depending on the dimension
|
||||
// that Op operates on. Op is assumed to return an `std.index` value and
|
||||
// XOp, YOp and ZOp are assumed to return an `llvm.i32` value. Depending on
|
||||
|
@ -119,20 +103,31 @@ public:
|
|||
}
|
||||
};
|
||||
|
||||
// A pass that replaces all occurrences of GPU operations with their
|
||||
// A pass that replaces all occurrences of GPU device operations with their
|
||||
// corresponding NVVM equivalent.
|
||||
//
|
||||
// This pass does not handle launching of kernels. Instead, it is meant to be
|
||||
// used on the body region of a launch or the body region of a kernel
|
||||
// function.
|
||||
// This pass only handles device code and is not meant to be run on GPU host
|
||||
// code.
|
||||
class LowerGpuOpsToNVVMOpsPass : public ModulePass<LowerGpuOpsToNVVMOpsPass> {
|
||||
public:
|
||||
void runOnModule() override {
|
||||
ModuleOp m = getModule();
|
||||
if (!m.getAttrOfType<UnitAttr>(gpu::GPUDialect::getKernelModuleAttrName()))
|
||||
return;
|
||||
|
||||
OwningRewritePatternList patterns;
|
||||
LLVMTypeConverter converter(m.getContext());
|
||||
populateGpuToNVVMConversionPatterns(converter, patterns);
|
||||
populateStdToLLVMConversionPatterns(converter, patterns);
|
||||
patterns.insert<
|
||||
GPUIndexIntrinsicOpLowering<gpu::ThreadId, NVVM::ThreadIdXOp,
|
||||
NVVM::ThreadIdYOp, NVVM::ThreadIdZOp>,
|
||||
GPUIndexIntrinsicOpLowering<gpu::BlockDim, NVVM::BlockDimXOp,
|
||||
NVVM::BlockDimYOp, NVVM::BlockDimZOp>,
|
||||
GPUIndexIntrinsicOpLowering<gpu::BlockId, NVVM::BlockIdXOp,
|
||||
NVVM::BlockIdYOp, NVVM::BlockIdZOp>,
|
||||
GPUIndexIntrinsicOpLowering<gpu::GridDim, NVVM::GridDimXOp,
|
||||
NVVM::GridDimYOp, NVVM::GridDimZOp>>(
|
||||
converter);
|
||||
|
||||
ConversionTarget target(getContext());
|
||||
target.addLegalDialect<LLVM::LLVMDialect>();
|
||||
|
@ -146,22 +141,6 @@ public:
|
|||
|
||||
} // anonymous namespace
|
||||
|
||||
/// Collect a set of patterns to convert from the GPU dialect to NVVM.
|
||||
void mlir::populateGpuToNVVMConversionPatterns(
|
||||
LLVMTypeConverter &converter, OwningRewritePatternList &patterns) {
|
||||
patterns
|
||||
.insert<GPULaunchFuncOpLowering,
|
||||
GPUIndexIntrinsicOpLowering<gpu::ThreadId, NVVM::ThreadIdXOp,
|
||||
NVVM::ThreadIdYOp, NVVM::ThreadIdZOp>,
|
||||
GPUIndexIntrinsicOpLowering<gpu::BlockDim, NVVM::BlockDimXOp,
|
||||
NVVM::BlockDimYOp, NVVM::BlockDimZOp>,
|
||||
GPUIndexIntrinsicOpLowering<gpu::BlockId, NVVM::BlockIdXOp,
|
||||
NVVM::BlockIdYOp, NVVM::BlockIdZOp>,
|
||||
GPUIndexIntrinsicOpLowering<gpu::GridDim, NVVM::GridDimXOp,
|
||||
NVVM::GridDimYOp, NVVM::GridDimZOp>>(
|
||||
converter);
|
||||
}
|
||||
|
||||
std::unique_ptr<OpPassBase<ModuleOp>> mlir::createLowerGpuOpsToNVVMOpsPass() {
|
||||
return std::make_unique<LowerGpuOpsToNVVMOpsPass>();
|
||||
}
|
||||
|
|
|
@ -93,7 +93,7 @@ static gpu::LaunchFuncOp inlineConstants(FuncOp kernelFunc,
|
|||
}
|
||||
|
||||
// Outline the `gpu.launch` operation body into a kernel function. Replace
|
||||
// `gpu.return` operations by `std.return` in the generated functions.
|
||||
// `gpu.return` operations by `std.return` in the generated function.
|
||||
static FuncOp outlineKernelFunc(gpu::LaunchOp launchOp) {
|
||||
Location loc = launchOp.getLoc();
|
||||
SmallVector<Type, 4> kernelOperandTypes(launchOp.getKernelOperandTypes());
|
||||
|
@ -107,7 +107,7 @@ static FuncOp outlineKernelFunc(gpu::LaunchOp launchOp) {
|
|||
outlinedFunc.setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
|
||||
builder.getUnitAttr());
|
||||
injectGpuIndexOperations(loc, outlinedFunc);
|
||||
outlinedFunc.walk([](mlir::gpu::Return op) {
|
||||
outlinedFunc.walk([](gpu::Return op) {
|
||||
OpBuilder replacer(op);
|
||||
replacer.create<ReturnOp>(op.getLoc());
|
||||
op.erase();
|
||||
|
@ -131,15 +131,44 @@ static void convertToLaunchFuncOp(gpu::LaunchOp &launchOp, FuncOp kernelFunc) {
|
|||
|
||||
namespace {
|
||||
|
||||
/// Pass that moves the kernel of each LaunchOp into its separate nested module.
|
||||
///
|
||||
/// This pass moves the kernel code of each LaunchOp into a function created
|
||||
/// inside a nested module. It also creates an external function of the same
|
||||
/// name in the parent module.
|
||||
///
|
||||
/// The kernel modules are intended to be compiled to a cubin blob independently
|
||||
/// in a separate pass. The external functions can then be annotated with the
|
||||
/// symbol of the cubin accessor function.
|
||||
class GpuKernelOutliningPass : public ModulePass<GpuKernelOutliningPass> {
|
||||
public:
|
||||
void runOnModule() override {
|
||||
ModuleManager moduleManager(getModule());
|
||||
auto context = getModule().getContext();
|
||||
Builder builder(context);
|
||||
for (auto func : getModule().getOps<FuncOp>()) {
|
||||
func.walk([&](mlir::gpu::LaunchOp op) {
|
||||
// Insert just after the function.
|
||||
Block::iterator insertPt(func.getOperation()->getNextNode());
|
||||
func.walk([&](gpu::LaunchOp op) {
|
||||
// TODO(b/141098412): Handle called functions and globals.
|
||||
FuncOp outlinedFunc = outlineKernelFunc(op);
|
||||
moduleManager.insert(outlinedFunc);
|
||||
|
||||
// Potentially renames outlinedFunc to make symbol unique.
|
||||
moduleManager.insert(insertPt, outlinedFunc);
|
||||
|
||||
// Potentially changes signature, pulling in constants.
|
||||
convertToLaunchFuncOp(op, outlinedFunc);
|
||||
|
||||
// Create clone and move body from outlinedFunc.
|
||||
auto kernelFunc = outlinedFunc.cloneWithoutRegions();
|
||||
kernelFunc.getBody().takeBody(outlinedFunc.getBody());
|
||||
|
||||
// Create nested module and insert kernelFunc.
|
||||
auto kernelModule = ModuleOp::create(UnknownLoc::get(context));
|
||||
kernelModule.setAttr(gpu::GPUDialect::getKernelModuleAttrName(),
|
||||
builder.getUnitAttr());
|
||||
kernelModule.push_back(kernelFunc);
|
||||
getModule().insert(insertPt, kernelModule);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
|
|
@ -2,9 +2,14 @@
|
|||
|
||||
// CHECK: llvm.mlir.global constant @[[global:.*]]("CUBIN")
|
||||
|
||||
module attributes {gpu.kernel_module} {
|
||||
func @kernel(!llvm.float, !llvm<"float*">)
|
||||
attributes {nvvm.cubin = "CUBIN"}
|
||||
}
|
||||
|
||||
func @kernel(!llvm.float, !llvm<"float*">)
|
||||
// CHECK: attributes {gpu.kernel, nvvm.cubin = "CUBIN", nvvm.cubingetter = @[[getter:.*]]}
|
||||
attributes {gpu.kernel, nvvm.cubin = "CUBIN"}
|
||||
// CHECK: attributes {gpu.kernel, nvvm.cubingetter = @[[getter:.*]]}
|
||||
attributes {gpu.kernel}
|
||||
|
||||
// CHECK: func @[[getter]]() -> !llvm<"i8*">
|
||||
// CHECK: %[[addressof:.*]] = llvm.mlir.addressof @[[global]]
|
||||
|
|
|
@ -1,8 +1,26 @@
|
|||
// RUN: mlir-opt %s --test-kernel-to-cubin | FileCheck %s
|
||||
// RUN: mlir-opt %s --test-kernel-to-cubin -split-input-file | FileCheck %s
|
||||
|
||||
func @kernel(%arg0 : !llvm.float, %arg1 : !llvm<"float*">)
|
||||
// CHECK: attributes {gpu.kernel, nvvm.cubin = "CUBIN"}
|
||||
attributes { gpu.kernel } {
|
||||
// CHECK-NOT: llvm.return
|
||||
llvm.return
|
||||
}
|
||||
module attributes {gpu.kernel_module} {
|
||||
func @kernel(%arg0 : !llvm.float, %arg1 : !llvm<"float*">)
|
||||
// CHECK: attributes {gpu.kernel, nvvm.cubin = "CUBIN"}
|
||||
attributes { gpu.kernel } {
|
||||
// CHECK-NOT: llvm.return
|
||||
llvm.return
|
||||
}
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
module attributes {gpu.kernel_module} {
|
||||
// CHECK: func @kernel_a
|
||||
func @kernel_a()
|
||||
attributes { gpu.kernel } {
|
||||
llvm.return
|
||||
}
|
||||
|
||||
// CHECK: func @kernel_b
|
||||
func @kernel_b()
|
||||
attributes { gpu.kernel } {
|
||||
llvm.return
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,35 +1,37 @@
|
|||
// RUN: mlir-opt %s -lower-gpu-ops-to-nvvm-ops | FileCheck %s
|
||||
|
||||
// CHECK-LABEL: func @gpu_index_ops()
|
||||
func @gpu_index_ops()
|
||||
attributes { gpu.kernel } {
|
||||
// CHECK: = nvvm.read.ptx.sreg.tid.x : !llvm.i32
|
||||
%tIdX = "gpu.thread_id"() {dimension = "x"} : () -> (index)
|
||||
// CHECK: = nvvm.read.ptx.sreg.tid.y : !llvm.i32
|
||||
%tIdY = "gpu.thread_id"() {dimension = "y"} : () -> (index)
|
||||
// CHECK: = nvvm.read.ptx.sreg.tid.z : !llvm.i32
|
||||
%tIdZ = "gpu.thread_id"() {dimension = "z"} : () -> (index)
|
||||
module attributes {gpu.kernel_module} {
|
||||
// CHECK-LABEL: func @gpu_index_ops()
|
||||
func @gpu_index_ops()
|
||||
attributes { gpu.kernel } {
|
||||
// CHECK: = nvvm.read.ptx.sreg.tid.x : !llvm.i32
|
||||
%tIdX = "gpu.thread_id"() {dimension = "x"} : () -> (index)
|
||||
// CHECK: = nvvm.read.ptx.sreg.tid.y : !llvm.i32
|
||||
%tIdY = "gpu.thread_id"() {dimension = "y"} : () -> (index)
|
||||
// CHECK: = nvvm.read.ptx.sreg.tid.z : !llvm.i32
|
||||
%tIdZ = "gpu.thread_id"() {dimension = "z"} : () -> (index)
|
||||
|
||||
// CHECK: = nvvm.read.ptx.sreg.ntid.x : !llvm.i32
|
||||
%bDimX = "gpu.block_dim"() {dimension = "x"} : () -> (index)
|
||||
// CHECK: = nvvm.read.ptx.sreg.ntid.y : !llvm.i32
|
||||
%bDimY = "gpu.block_dim"() {dimension = "y"} : () -> (index)
|
||||
// CHECK: = nvvm.read.ptx.sreg.ntid.z : !llvm.i32
|
||||
%bDimZ = "gpu.block_dim"() {dimension = "z"} : () -> (index)
|
||||
// CHECK: = nvvm.read.ptx.sreg.ntid.x : !llvm.i32
|
||||
%bDimX = "gpu.block_dim"() {dimension = "x"} : () -> (index)
|
||||
// CHECK: = nvvm.read.ptx.sreg.ntid.y : !llvm.i32
|
||||
%bDimY = "gpu.block_dim"() {dimension = "y"} : () -> (index)
|
||||
// CHECK: = nvvm.read.ptx.sreg.ntid.z : !llvm.i32
|
||||
%bDimZ = "gpu.block_dim"() {dimension = "z"} : () -> (index)
|
||||
|
||||
// CHECK: = nvvm.read.ptx.sreg.ctaid.x : !llvm.i32
|
||||
%bIdX = "gpu.block_id"() {dimension = "x"} : () -> (index)
|
||||
// CHECK: = nvvm.read.ptx.sreg.ctaid.y : !llvm.i32
|
||||
%bIdY = "gpu.block_id"() {dimension = "y"} : () -> (index)
|
||||
// CHECK: = nvvm.read.ptx.sreg.ctaid.z : !llvm.i32
|
||||
%bIdZ = "gpu.block_id"() {dimension = "z"} : () -> (index)
|
||||
// CHECK: = nvvm.read.ptx.sreg.ctaid.x : !llvm.i32
|
||||
%bIdX = "gpu.block_id"() {dimension = "x"} : () -> (index)
|
||||
// CHECK: = nvvm.read.ptx.sreg.ctaid.y : !llvm.i32
|
||||
%bIdY = "gpu.block_id"() {dimension = "y"} : () -> (index)
|
||||
// CHECK: = nvvm.read.ptx.sreg.ctaid.z : !llvm.i32
|
||||
%bIdZ = "gpu.block_id"() {dimension = "z"} : () -> (index)
|
||||
|
||||
// CHECK: = nvvm.read.ptx.sreg.nctaid.x : !llvm.i32
|
||||
%gDimX = "gpu.grid_dim"() {dimension = "x"} : () -> (index)
|
||||
// CHECK: = nvvm.read.ptx.sreg.nctaid.y : !llvm.i32
|
||||
%gDimY = "gpu.grid_dim"() {dimension = "y"} : () -> (index)
|
||||
// CHECK: = nvvm.read.ptx.sreg.nctaid.z : !llvm.i32
|
||||
%gDimZ = "gpu.grid_dim"() {dimension = "z"} : () -> (index)
|
||||
// CHECK: = nvvm.read.ptx.sreg.nctaid.x : !llvm.i32
|
||||
%gDimX = "gpu.grid_dim"() {dimension = "x"} : () -> (index)
|
||||
// CHECK: = nvvm.read.ptx.sreg.nctaid.y : !llvm.i32
|
||||
%gDimY = "gpu.grid_dim"() {dimension = "y"} : () -> (index)
|
||||
// CHECK: = nvvm.read.ptx.sreg.nctaid.z : !llvm.i32
|
||||
%gDimZ = "gpu.grid_dim"() {dimension = "z"} : () -> (index)
|
||||
|
||||
std.return
|
||||
std.return
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
// RUN: mlir-opt -gpu-kernel-outlining -split-input-file %s | FileCheck %s
|
||||
// RUN: mlir-opt -gpu-kernel-outlining -split-input-file -verify-diagnostics %s | FileCheck %s
|
||||
|
||||
// CHECK-LABEL: func @launch()
|
||||
func @launch() {
|
||||
|
@ -35,7 +35,11 @@ func @launch() {
|
|||
}
|
||||
|
||||
// CHECK-LABEL: func @launch_kernel
|
||||
// CHECK-SAME: (%[[ARG0:.*]]: f32, %[[ARG1:.*]]: memref<?xf32, 1>)
|
||||
// CHECK-SAME: (f32, memref<?xf32, 1>)
|
||||
// CHECK-NEXT: attributes {gpu.kernel}
|
||||
|
||||
// CHECK-LABEL: func @launch_kernel
|
||||
// CHECK-SAME: (%[[KERNEL_ARG0:.*]]: f32, %[[KERNEL_ARG1:.*]]: memref<?xf32, 1>)
|
||||
// CHECK-NEXT: attributes {gpu.kernel}
|
||||
// CHECK-NEXT: %[[BID:.*]] = "gpu.block_id"() {dimension = "x"} : () -> index
|
||||
// CHECK-NEXT: = "gpu.block_id"() {dimension = "y"} : () -> index
|
||||
|
@ -49,9 +53,9 @@ func @launch() {
|
|||
// CHECK-NEXT: %[[BDIM:.*]] = "gpu.block_dim"() {dimension = "x"} : () -> index
|
||||
// CHECK-NEXT: = "gpu.block_dim"() {dimension = "y"} : () -> index
|
||||
// CHECK-NEXT: = "gpu.block_dim"() {dimension = "z"} : () -> index
|
||||
// CHECK-NEXT: "use"(%[[ARG0]]) : (f32) -> ()
|
||||
// CHECK-NEXT: "use"(%[[KERNEL_ARG0]]) : (f32) -> ()
|
||||
// CHECK-NEXT: "some_op"(%[[BID]], %[[BDIM]]) : (index, index) -> ()
|
||||
// CHECK-NEXT: = load %[[ARG1]][%[[TID]]] : memref<?xf32, 1>
|
||||
// CHECK-NEXT: = load %[[KERNEL_ARG1]][%[[TID]]] : memref<?xf32, 1>
|
||||
|
||||
// -----
|
||||
|
||||
|
@ -75,8 +79,8 @@ func @multiple_launches() {
|
|||
return
|
||||
}
|
||||
|
||||
// CHECK-LABEL: func @multiple_launches_kernel()
|
||||
// CHECK-LABEL: func @multiple_launches_kernel_0()
|
||||
// CHECK: func @multiple_launches_kernel()
|
||||
// CHECK: func @multiple_launches_kernel_0()
|
||||
|
||||
// -----
|
||||
|
||||
|
@ -100,3 +104,23 @@ func @extra_constants(%arg0 : memref<?xf32>) {
|
|||
// CHECK-LABEL: func @extra_constants_kernel(%{{.*}}: memref<?xf32>)
|
||||
// CHECK: constant
|
||||
// CHECK: constant
|
||||
|
||||
// -----
|
||||
|
||||
func @function_call(%arg0 : memref<?xf32>) {
|
||||
%cst = constant 8 : index
|
||||
gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst,
|
||||
%grid_z = %cst)
|
||||
threads(%tx, %ty, %tz) in (%block_x = %cst, %block_y = %cst,
|
||||
%block_z = %cst) {
|
||||
// TODO(b/141098412): Support function calls.
|
||||
// expected-error @+1 {{'device_function' does not reference a valid function}}
|
||||
call @device_function() : () -> ()
|
||||
gpu.return
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func @device_function() {
|
||||
gpu.return
|
||||
}
|
||||
|
|
|
@ -108,50 +108,22 @@ OwnedCubin compilePtxToCubin(const std::string ptx, FuncOp &function) {
|
|||
return result;
|
||||
}
|
||||
|
||||
namespace {
|
||||
// A pass that lowers all Standard and Gpu operations to LLVM dialect. It does
|
||||
// not lower the GPULaunch operation to actual code but does translate the
|
||||
// signature of its kernel argument.
|
||||
class LowerStandardAndGpuToLLVMAndNVVM
|
||||
: public ModulePass<LowerStandardAndGpuToLLVMAndNVVM> {
|
||||
public:
|
||||
void runOnModule() override {
|
||||
ModuleOp m = getModule();
|
||||
|
||||
OwningRewritePatternList patterns;
|
||||
LLVMTypeConverter converter(m.getContext());
|
||||
populateStdToLLVMConversionPatterns(converter, patterns);
|
||||
populateGpuToNVVMConversionPatterns(converter, patterns);
|
||||
|
||||
ConversionTarget target(getContext());
|
||||
target.addLegalDialect<LLVM::LLVMDialect>();
|
||||
target.addLegalDialect<NVVM::NVVMDialect>();
|
||||
target.addLegalOp<ModuleOp>();
|
||||
target.addLegalOp<ModuleTerminatorOp>();
|
||||
target.addDynamicallyLegalOp<FuncOp>(
|
||||
[&](FuncOp op) { return converter.isSignatureLegal(op.getType()); });
|
||||
if (failed(applyFullConversion(m, target, patterns, &converter)))
|
||||
signalPassFailure();
|
||||
}
|
||||
};
|
||||
} // end anonymous namespace
|
||||
|
||||
static LogicalResult runMLIRPasses(ModuleOp m) {
|
||||
PassManager pm(m.getContext());
|
||||
applyPassManagerCLOptions(pm);
|
||||
|
||||
pm.addPass(createGpuKernelOutliningPass());
|
||||
pm.addPass(static_cast<std::unique_ptr<OpPassBase<ModuleOp>>>(
|
||||
std::make_unique<LowerStandardAndGpuToLLVMAndNVVM>()));
|
||||
pm.addPass(createConvertGPUKernelToCubinPass(&compilePtxToCubin));
|
||||
auto &kernelPm = pm.nest<ModuleOp>();
|
||||
kernelPm.addPass(createLowerGpuOpsToNVVMOpsPass());
|
||||
kernelPm.addPass(createConvertGPUKernelToCubinPass(&compilePtxToCubin));
|
||||
pm.addPass(createLowerToLLVMPass());
|
||||
pm.addPass(createGenerateCubinAccessorPass());
|
||||
pm.addPass(createConvertGpuLaunchFuncToCudaCallsPass());
|
||||
|
||||
if (failed(pm.run(m)))
|
||||
return failure();
|
||||
|
||||
return success();
|
||||
return pm.run(m);
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
registerPassManagerCLOptions();
|
||||
return mlir::JitRunnerMain(argc, argv, &runMLIRPasses);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue