forked from OSchip/llvm-project
Fuse GenerateCubinAccessors pass into LaunchFuncToCuda
Now that the accessor function is a trivial getter of the global variable, it makes less sense to have the getter generation as a separate pass. Move the getter generation into the lowering of `gpu.launch_func` to CUDA calls. This change is mostly code motion, but the process can be simplified further by generating the addressof in place instead of using a call. This will be done in a follow-up. PiperOrigin-RevId: 273492517
This commit is contained in:
parent
90d65d32d6
commit
16af5924cb
|
@ -61,10 +61,6 @@ createConvertGPUKernelToCubinPass(CubinGenerator cubinGenerator);
|
|||
std::unique_ptr<OpPassBase<ModuleOp>>
|
||||
createConvertGpuLaunchFuncToCudaCallsPass();
|
||||
|
||||
/// Creates a pass to augment a module with getter functions for all contained
|
||||
/// cubins as encoded via the 'nvvm.cubin' attribute.
|
||||
std::unique_ptr<OpPassBase<ModuleOp>> createGenerateCubinAccessorPass();
|
||||
|
||||
} // namespace mlir
|
||||
|
||||
#endif // MLIR_CONVERSION_GPUTOCUDA_GPUTOCUDAPASS_H_
|
||||
|
|
|
@ -4,7 +4,6 @@ if(MLIR_CUDA_CONVERSIONS_ENABLED)
|
|||
add_llvm_library(MLIRGPUtoCUDATransforms
|
||||
ConvertKernelFuncToCubin.cpp
|
||||
ConvertLaunchFuncToCudaCalls.cpp
|
||||
GenerateCubinAccessors.cpp
|
||||
)
|
||||
target_link_libraries(MLIRGPUtoCUDATransforms
|
||||
MLIRGPU
|
||||
|
|
|
@ -51,7 +51,10 @@ static constexpr const char *cuGetStreamHelperName = "mcuGetStreamHelper";
|
|||
static constexpr const char *cuStreamSynchronizeName = "mcuStreamSynchronize";
|
||||
static constexpr const char *kMcuMemHostRegisterPtr = "mcuMemHostRegisterPtr";
|
||||
|
||||
static constexpr const char *kCubinAnnotation = "nvvm.cubin";
|
||||
static constexpr const char *kCubinGetterAnnotation = "nvvm.cubingetter";
|
||||
static constexpr const char *kCubinGetterSuffix = "_cubin";
|
||||
static constexpr const char *kCubinStorageSuffix = "_cubin_cst";
|
||||
|
||||
namespace {
|
||||
|
||||
|
@ -121,6 +124,7 @@ private:
|
|||
Value *setupParamsArray(gpu::LaunchFuncOp launchOp, OpBuilder &builder);
|
||||
Value *generateKernelNameConstant(FuncOp kernelFunction, Location &loc,
|
||||
OpBuilder &builder);
|
||||
FuncOp generateCubinAccessor(FuncOp kernelFunc, StringAttr blob);
|
||||
void translateGpuLaunchCalls(mlir::gpu::LaunchFuncOp launchOp);
|
||||
|
||||
public:
|
||||
|
@ -131,10 +135,24 @@ public:
|
|||
// Cache the used LLVM types.
|
||||
initializeCachedTypes();
|
||||
|
||||
for (auto func : getModule().getOps<FuncOp>()) {
|
||||
func.walk(
|
||||
[this](mlir::gpu::LaunchFuncOp op) { translateGpuLaunchCalls(op); });
|
||||
}
|
||||
getModule().walk([this](mlir::gpu::LaunchFuncOp op) {
|
||||
auto gpuModule =
|
||||
getModule().lookupSymbol<ModuleOp>(op.getKernelModuleName());
|
||||
auto kernelFunc = gpuModule.lookupSymbol<FuncOp>(op.kernel());
|
||||
auto cubinAttr = kernelFunc.getAttrOfType<StringAttr>(kCubinAnnotation);
|
||||
if (!cubinAttr)
|
||||
return signalPassFailure();
|
||||
FuncOp getter = generateCubinAccessor(kernelFunc, cubinAttr);
|
||||
|
||||
// Store the name of the getter on the function for easier lookup and
|
||||
// remove the original CUBIN annotation.
|
||||
kernelFunc.setAttr(
|
||||
kCubinGetterAnnotation,
|
||||
SymbolRefAttr::get(getter.getName(), getter.getContext()));
|
||||
kernelFunc.removeAttr(kCubinAnnotation);
|
||||
|
||||
translateGpuLaunchCalls(op);
|
||||
});
|
||||
|
||||
// GPU kernel modules are no longer necessary since we have a global
|
||||
// constant with the CUBIN data.
|
||||
|
@ -317,6 +335,42 @@ Value *GpuLaunchFuncToCudaCallsPass::generateKernelNameConstant(
|
|||
llvmDialect);
|
||||
}
|
||||
|
||||
// Inserts a global constant string containing `blob` into the grand-parent
|
||||
// module of `kernelFunc` and generates the function that returns the address of
|
||||
// the first character of this string.
|
||||
FuncOp GpuLaunchFuncToCudaCallsPass::generateCubinAccessor(FuncOp kernelFunc,
|
||||
StringAttr blob) {
|
||||
Location loc = kernelFunc.getLoc();
|
||||
SmallString<128> nameBuffer(kernelFunc.getName());
|
||||
ModuleOp module = getModule();
|
||||
assert(kernelFunc.getParentOp() &&
|
||||
kernelFunc.getParentOp()->getParentOp() == module &&
|
||||
"expected one level of module nesting");
|
||||
|
||||
// Insert the getter function just after the GPU kernel module containing
|
||||
// `kernelFunc`.
|
||||
OpBuilder moduleBuilder(module.getBody());
|
||||
moduleBuilder.setInsertionPointAfter(kernelFunc.getParentOp());
|
||||
auto getterType = moduleBuilder.getFunctionType(
|
||||
llvm::None, LLVM::LLVMType::getInt8PtrTy(llvmDialect));
|
||||
nameBuffer.append(kCubinGetterSuffix);
|
||||
auto result = moduleBuilder.create<FuncOp>(
|
||||
loc, StringRef(nameBuffer), getterType, ArrayRef<NamedAttribute>());
|
||||
Block *entryBlock = result.addEntryBlock();
|
||||
|
||||
// Drop the getter suffix before appending the storage suffix.
|
||||
nameBuffer.resize(kernelFunc.getName().size());
|
||||
nameBuffer.append(kCubinStorageSuffix);
|
||||
|
||||
// Obtain the address of the first character of the global string containing
|
||||
// the cubin and return from the getter.
|
||||
OpBuilder builder(entryBlock);
|
||||
Value *startPtr = LLVM::createGlobalString(
|
||||
loc, builder, StringRef(nameBuffer), blob.getValue(), llvmDialect);
|
||||
builder.create<LLVM::ReturnOp>(loc, startPtr);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Emits LLVM IR to launch a kernel function. Expects the module that contains
|
||||
// the compiled kernel function as a cubin in the 'nvvm.cubin' attribute of the
|
||||
// kernel function in the IR.
|
||||
|
|
|
@ -1,136 +0,0 @@
|
|||
//===- GenerateCubinAccessors.cpp - MLIR GPU lowering passes --------------===//
|
||||
//
|
||||
// Copyright 2019 The MLIR Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
// =============================================================================
|
||||
//
|
||||
// This file implements a pass to generate LLVMIR functions that return the
|
||||
// data stored in nvvm.cubin char* blob.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h"
|
||||
#include "mlir/Dialect/GPU/GPUDialect.h"
|
||||
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
|
||||
#include "mlir/IR/Attributes.h"
|
||||
#include "mlir/IR/Builders.h"
|
||||
#include "mlir/IR/Function.h"
|
||||
#include "mlir/IR/Identifier.h"
|
||||
#include "mlir/IR/Module.h"
|
||||
#include "mlir/IR/StandardTypes.h"
|
||||
#include "mlir/Pass/Pass.h"
|
||||
#include "mlir/Pass/PassRegistry.h"
|
||||
|
||||
#include "llvm/ADT/STLExtras.h"
|
||||
|
||||
namespace mlir {
|
||||
namespace {
|
||||
|
||||
// TODO(herhut): Move to shared location.
|
||||
constexpr const char *kCubinAnnotation = "nvvm.cubin";
|
||||
constexpr const char *kCubinGetterAnnotation = "nvvm.cubingetter";
|
||||
constexpr const char *kCubinGetterSuffix = "_cubin";
|
||||
constexpr const char *kCubinStorageSuffix = "_cubin_cst";
|
||||
|
||||
/// A pass which moves cubin from function attributes in nested modules
|
||||
/// to global strings and generates getter functions.
|
||||
///
|
||||
/// The GpuKernelToCubinPass annotates kernel functions with compiled device
|
||||
/// code blobs. These functions reside in nested modules generated by
|
||||
/// GpuKernelOutliningPass. This pass consumes these modules and moves the cubin
|
||||
/// blobs back to the parent module as global strings and generates accessor
|
||||
/// functions for them. The external kernel functions (also generated by the
|
||||
/// outlining pass) are annotated with the symbol of the cubin accessor.
|
||||
class GpuGenerateCubinAccessorsPass
|
||||
: public ModulePass<GpuGenerateCubinAccessorsPass> {
|
||||
private:
|
||||
LLVM::LLVMType getIndexType() {
|
||||
unsigned bits =
|
||||
llvmDialect->getLLVMModule().getDataLayout().getPointerSizeInBits();
|
||||
return LLVM::LLVMType::getIntNTy(llvmDialect, bits);
|
||||
}
|
||||
|
||||
// Inserts a global constant string containing `blob` into the grand-parent
|
||||
// module of `kernelFunc` and generates the function that returns the address
|
||||
// of the first character of this string. Returns the generator function.
|
||||
// TODO(herhut): consider fusing this pass with launch-func-to-cuda.
|
||||
FuncOp generate(FuncOp kernelFunc, StringAttr blob) {
|
||||
Location loc = kernelFunc.getLoc();
|
||||
SmallString<128> nameBuffer(kernelFunc.getName());
|
||||
ModuleOp module = getModule();
|
||||
assert(kernelFunc.getParentOp() &&
|
||||
kernelFunc.getParentOp()->getParentOp() == module &&
|
||||
"expected one level of module nesting");
|
||||
|
||||
// Insert the getter function just after the original function.
|
||||
OpBuilder moduleBuilder(module.getBody());
|
||||
moduleBuilder.setInsertionPointAfter(kernelFunc.getParentOp());
|
||||
auto getterType = moduleBuilder.getFunctionType(
|
||||
llvm::None, LLVM::LLVMType::getInt8PtrTy(llvmDialect));
|
||||
nameBuffer.append(kCubinGetterSuffix);
|
||||
auto result = moduleBuilder.create<FuncOp>(
|
||||
loc, StringRef(nameBuffer), getterType, ArrayRef<NamedAttribute>());
|
||||
Block *entryBlock = result.addEntryBlock();
|
||||
|
||||
// Drop the getter suffix before appending the storage suffix.
|
||||
nameBuffer.resize(kernelFunc.getName().size());
|
||||
nameBuffer.append(kCubinStorageSuffix);
|
||||
|
||||
// Obtain the address of the first character of the global string containing
|
||||
// the cubin and return from the getter.
|
||||
OpBuilder builder(entryBlock);
|
||||
Value *startPtr = LLVM::createGlobalString(
|
||||
loc, builder, StringRef(nameBuffer), blob.getValue(), llvmDialect);
|
||||
builder.create<LLVM::ReturnOp>(loc, startPtr);
|
||||
return result;
|
||||
}
|
||||
|
||||
public:
|
||||
void runOnModule() override {
|
||||
llvmDialect = getContext().getRegisteredDialect<LLVM::LLVMDialect>();
|
||||
|
||||
for (auto module : getModule().getOps<ModuleOp>()) {
|
||||
if (!module.getAttrOfType<UnitAttr>(
|
||||
gpu::GPUDialect::getKernelModuleAttrName()))
|
||||
continue;
|
||||
for (auto func : module.getOps<FuncOp>()) {
|
||||
if (StringAttr blob =
|
||||
func.getAttrOfType<StringAttr>(kCubinAnnotation)) {
|
||||
FuncOp getter = generate(func, blob);
|
||||
|
||||
// Store the name of the getter on the function for easier lookup and
|
||||
// remove the CUBIN.
|
||||
func.setAttr(kCubinGetterAnnotation,
|
||||
SymbolRefAttr::get(getter.getName(), func.getContext()));
|
||||
func.removeAttr(kCubinAnnotation);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
LLVM::LLVMDialect *llvmDialect;
|
||||
};
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
std::unique_ptr<OpPassBase<ModuleOp>> createGenerateCubinAccessorPass() {
|
||||
return std::make_unique<GpuGenerateCubinAccessorsPass>();
|
||||
}
|
||||
|
||||
static PassRegistration<GpuGenerateCubinAccessorsPass>
|
||||
pass("generate-cubin-accessors",
|
||||
"Generate LLVMIR functions that give access to cubin data");
|
||||
|
||||
} // namespace mlir
|
|
@ -1,20 +0,0 @@
|
|||
// RUN: mlir-opt %s --generate-cubin-accessors | FileCheck %s
|
||||
|
||||
module attributes {gpu.container_module} {
|
||||
|
||||
// CHECK: llvm.mlir.global constant @[[global:.*]]("CUBIN")
|
||||
|
||||
module attributes {gpu.kernel_module} {
|
||||
// CHECK-LABEL: func @kernel
|
||||
func @kernel(!llvm.float, !llvm<"float*">)
|
||||
// CHECK: attributes {nvvm.cubingetter = @[[getter:.*]]}
|
||||
attributes {nvvm.cubin = "CUBIN"}
|
||||
}
|
||||
|
||||
// CHECK: func @[[getter]]() -> !llvm<"i8*">
|
||||
// CHECK: %[[addressof:.*]] = llvm.mlir.addressof @[[global]]
|
||||
// CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : index)
|
||||
// CHECK: %[[gep:.*]] = llvm.getelementptr %[[addressof]][%[[c0]], %[[c0]]]
|
||||
// CHECK-SAME: -> !llvm<"i8*">
|
||||
// CHECK: llvm.return %[[gep]] : !llvm<"i8*">
|
||||
}
|
|
@ -3,22 +3,28 @@
|
|||
module attributes {gpu.container_module} {
|
||||
|
||||
// CHECK: llvm.mlir.global constant @[[kernel_name:.*]]("kernel\00")
|
||||
|
||||
func @cubin_getter() -> !llvm<"i8*">
|
||||
// CHECK: llvm.mlir.global constant @[[global:.*]]("CUBIN")
|
||||
|
||||
module @kernel_module attributes {gpu.kernel_module} {
|
||||
func @kernel(!llvm.float, !llvm<"float*">)
|
||||
attributes { gpu.kernel, nvvm.cubingetter = @cubin_getter }
|
||||
attributes { gpu.kernel, nvvm.cubin = "CUBIN" }
|
||||
}
|
||||
|
||||
// CHECK: func @[[getter:.*]]() -> !llvm<"i8*">
|
||||
// CHECK: %[[addressof:.*]] = llvm.mlir.addressof @[[global]]
|
||||
// CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : index)
|
||||
// CHECK: %[[gep:.*]] = llvm.getelementptr %[[addressof]][%[[c0]], %[[c0]]]
|
||||
// CHECK-SAME: -> !llvm<"i8*">
|
||||
// CHECK: llvm.return %[[gep]] : !llvm<"i8*">
|
||||
|
||||
func @foo() {
|
||||
%0 = "op"() : () -> (!llvm.float)
|
||||
%1 = "op"() : () -> (!llvm<"float*">)
|
||||
%cst = constant 8 : index
|
||||
|
||||
// CHECK: [[cubin_ptr:%.*]] = llvm.call @[[getter]]
|
||||
// CHECK: [[module_ptr:%.*]] = llvm.alloca {{.*}} x !llvm<"i8*"> : (!llvm.i32) -> !llvm<"i8**">
|
||||
// CHECK: llvm.call @mcuModuleLoad([[module_ptr]], {{.*}}) : (!llvm<"i8**">, !llvm<"i8*">) -> !llvm.i32
|
||||
// CHECK: llvm.call @mcuModuleLoad([[module_ptr]], [[cubin_ptr]]) : (!llvm<"i8**">, !llvm<"i8*">) -> !llvm.i32
|
||||
// CHECK: [[func_ptr:%.*]] = llvm.alloca {{.*}} x !llvm<"i8*"> : (!llvm.i32) -> !llvm<"i8**">
|
||||
// CHECK: llvm.call @mcuModuleGetFunction([[func_ptr]], {{.*}}, {{.*}}) : (!llvm<"i8**">, !llvm<"i8*">, !llvm<"i8*">) -> !llvm.i32
|
||||
// CHECK: llvm.call @mcuGetStreamHelper
|
||||
|
|
|
@ -117,7 +117,6 @@ static LogicalResult runMLIRPasses(ModuleOp m) {
|
|||
kernelPm.addPass(createLowerGpuOpsToNVVMOpsPass());
|
||||
kernelPm.addPass(createConvertGPUKernelToCubinPass(&compilePtxToCubin));
|
||||
pm.addPass(createLowerToLLVMPass());
|
||||
pm.addPass(createGenerateCubinAccessorPass());
|
||||
pm.addPass(createConvertGpuLaunchFuncToCudaCallsPass());
|
||||
|
||||
return pm.run(m);
|
||||
|
|
Loading…
Reference in New Issue