[mlir] Add base class for GpuKernelToBlobPass

Instead of configuring kernel-to-cubin/rocdl lowering through callbacks, introduce a base class that target-specific passes can derive from.

Put the base class in GPU/Transforms, according to the discussion in D98203.

The mlir-cuda-runner will go away shortly, and the mlir-rocdl-runner as well at some point. I therefore kept the existing code path working and will remove it in a separate step.

Depends On D98168

Reviewed By: herhut

Differential Revision: https://reviews.llvm.org/D98279
This commit is contained in:
Christian Sigg 2021-03-10 10:35:20 +01:00
parent 8d9b9c0edc
commit 4d295cf5b5
7 changed files with 173 additions and 116 deletions

View File

@ -9,9 +9,14 @@
#define MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_
#include "mlir/Support/LLVM.h"
#include "llvm/IR/Module.h"
#include "llvm/ADT/StringRef.h"
#include <vector>
namespace llvm {
class LLVMContext;
class Module;
} // namespace llvm
namespace mlir {
class LLVMTypeConverter;
@ -26,9 +31,6 @@ class OperationPass;
namespace gpu {
class GPUModuleOp;
/// Returns the default annotation name for GPU binary blobs.
std::string getDefaultGpuBinaryAnnotation();
} // namespace gpu
namespace LLVM {

View File

@ -13,8 +13,15 @@
#ifndef MLIR_DIALECT_GPU_PASSES_H_
#define MLIR_DIALECT_GPU_PASSES_H_
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Pass/Pass.h"
namespace llvm {
class TargetMachine;
class LLVMContext;
class Module;
} // namespace llvm
namespace mlir {
/// Replaces `gpu.launch` with `gpu.launch_func` by moving the region into
/// a separate kernel function.
@ -33,6 +40,45 @@ inline void populateGpuRewritePatterns(MLIRContext *context,
populateGpuAllReducePatterns(context, patterns);
}
namespace gpu {
/// Returns the default annotation name for GPU binary blobs.
std::string getDefaultGpuBinaryAnnotation();
/// Base class for passes that turn a `gpu.module` into a binary blob.
///
/// `runOnOperation` drives the whole pipeline: the module is translated to
/// LLVM IR by the derived class, lowered to target ISA with an LLVM target
/// machine built from the `triple`/`chip`/`features` options, serialized by the
/// derived class, and the resulting bytes are attached to the module as a
/// string attribute named by the `gpu-binary-annotation` option.
class SerializeToBlobPass : public OperationPass<gpu::GPUModuleOp> {
public:
  /// `passID` is the TypeID of the concrete pass deriving from this class.
  SerializeToBlobPass(TypeID passID);
  /// Copies the pass including the current values of all options.
  SerializeToBlobPass(const SerializeToBlobPass &other);

  /// Not overridable: derived passes customize the two hooks below instead.
  void runOnOperation() final;

protected:
  // Pass options; accessible to derived passes so they can set defaults.
  Option<std::string> triple{*this, "triple",
                             llvm::cl::desc("Target triple")};
  Option<std::string> chip{*this, "chip",
                           llvm::cl::desc("Target architecture")};
  Option<std::string> features{*this, "features",
                               llvm::cl::desc("Target features")};
  Option<std::string> gpuBinaryAnnotation{
      *this, "gpu-binary-annotation",
      llvm::cl::desc("Annotation attribute string for GPU binary"),
      llvm::cl::init(getDefaultGpuBinaryAnnotation())};

private:
  /// Builds the LLVM target machine used to generate the ISA. Returns null
  /// after emitting an error if the target or the machine is unavailable.
  std::unique_ptr<llvm::TargetMachine> createTargetMachine();

  /// Hook: translates the operation returned by `getOperation()` into an LLVM
  /// module owned by `llvmContext`. A null result fails the pass.
  virtual std::unique_ptr<llvm::Module>
  translateToLLVMIR(llvm::LLVMContext &llvmContext) = 0;

  /// Hook: converts the textual target ISA into its binary form. A null
  /// result fails the pass.
  virtual std::unique_ptr<std::vector<char>>
  serializeISA(const std::string &isa) = 0;
};
} // namespace gpu
//===----------------------------------------------------------------------===//
// Registration
//===----------------------------------------------------------------------===//

View File

@ -24,8 +24,6 @@ add_mlir_conversion_library(MLIRGPUToGPURuntimeTransforms
intrinsics_gen
LINK_COMPONENTS
Core
MC
${AMDGPU_LIBS}
${NVPTX_LIBS}

View File

@ -15,6 +15,7 @@
#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/GPU/Passes.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/Builders.h"
@ -25,14 +26,10 @@
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/Twine.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/Mutex.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetMachine.h"
using namespace mlir;
@ -45,126 +42,43 @@ namespace {
/// GPU binary code, which is then attached as an attribute to the function.
/// The function body is erased.
class GpuKernelToBlobPass
: public PassWrapper<GpuKernelToBlobPass, OperationPass<gpu::GPUModuleOp>> {
: public PassWrapper<GpuKernelToBlobPass, gpu::SerializeToBlobPass> {
public:
GpuKernelToBlobPass(LoweringCallback loweringCallback,
BlobGenerator blobGenerator, StringRef triple,
StringRef targetChip, StringRef features,
StringRef gpuBinaryAnnotation)
: loweringCallback(loweringCallback), blobGenerator(blobGenerator),
triple(triple), targetChip(targetChip), features(features) {
: loweringCallback(loweringCallback), blobGenerator(blobGenerator) {
if (!triple.empty())
this->triple = triple.str();
if (!targetChip.empty())
this->chip = targetChip.str();
if (!features.empty())
this->features = features.str();
if (!gpuBinaryAnnotation.empty())
this->gpuBinaryAnnotation = gpuBinaryAnnotation.str();
}
GpuKernelToBlobPass(const GpuKernelToBlobPass &other)
: loweringCallback(other.loweringCallback),
blobGenerator(other.blobGenerator), triple(other.triple),
targetChip(other.targetChip), features(other.features) {}
void runOnOperation() override {
gpu::GPUModuleOp module = getOperation();
// Lower the module to an LLVM IR module using a separate context to enable
// multi-threaded processing.
llvm::LLVMContext llvmContext;
std::unique_ptr<llvm::Module> llvmModule =
loweringCallback(module, llvmContext, "LLVMDialectModule");
if (!llvmModule)
return signalPassFailure();
// Translate the llvm module to a target blob and attach the result as
// attribute to the module.
if (auto blobAttr = translateGPUModuleToBinaryAnnotation(
*llvmModule, module.getLoc(), module.getName()))
module->setAttr(gpuBinaryAnnotation, blobAttr);
else
signalPassFailure();
private:
// Translates the 'getOperation()' result to an LLVM module.
std::unique_ptr<llvm::Module>
translateToLLVMIR(llvm::LLVMContext &llvmContext) override {
return loweringCallback(getOperation(), llvmContext, "LLVMDialectModule");
}
private:
std::string translateModuleToISA(llvm::Module &module,
llvm::TargetMachine &targetMachine);
/// Converts llvmModule to a blob with target instructions using the
/// user-provided generator. Location is used for error reporting and name is
/// forwarded to the blob generator to use in its logging mechanisms.
OwnedBlob convertModuleToBlob(llvm::Module &llvmModule, Location loc,
StringRef name);
/// Translates llvmModule to a blob with target instructions and returns the
/// result as attribute.
StringAttr translateGPUModuleToBinaryAnnotation(llvm::Module &llvmModule,
Location loc, StringRef name);
// Serializes the target ISA to binary form.
std::unique_ptr<std::vector<char>>
serializeISA(const std::string &isa) override {
return blobGenerator(isa, getOperation().getLoc(),
getOperation().getName());
}
LoweringCallback loweringCallback;
BlobGenerator blobGenerator;
llvm::Triple triple;
std::string targetChip;
std::string features;
Option<std::string> gpuBinaryAnnotation{
*this, "gpu-binary-annotation",
llvm::cl::desc("Annotation attribute string for GPU binary"),
llvm::cl::init(gpu::getDefaultGpuBinaryAnnotation())};
};
} // anonymous namespace
// Default name of the attribute that carries the GPU binary blob.
std::string gpu::getDefaultGpuBinaryAnnotation() { return "gpu.binary"; }
// Runs the code generation passes of `targetMachine` on `module`, requesting
// assembly-file output, and returns the emitted text.
std::string
GpuKernelToBlobPass::translateModuleToISA(llvm::Module &module,
llvm::TargetMachine &targetMachine) {
std::string targetISA;
{
// The streams live in a nested scope so that they are destroyed before
// `targetISA` is returned (presumably so buffered output is flushed into the
// string first).
llvm::raw_string_ostream stream(targetISA);
llvm::buffer_ostream pstream(stream);
llvm::legacy::PassManager codegenPasses;
targetMachine.addPassesToEmitFile(codegenPasses, pstream, nullptr,
llvm::CGFT_AssemblyFile);
codegenPasses.run(module);
}
return targetISA;
}
// Lowers `llvmModule` to target ISA and hands the ISA to the user-provided
// blob generator. `loc` is used for error reporting and, together with `name`,
// forwarded to the generator. Returns an empty blob if the target or the
// target machine cannot be created.
OwnedBlob GpuKernelToBlobPass::convertModuleToBlob(llvm::Module &llvmModule,
Location loc,
StringRef name) {
std::unique_ptr<llvm::TargetMachine> targetMachine;
{
std::string error;
// Look the target up by triple only; the architecture name is left empty.
const llvm::Target *target =
llvm::TargetRegistry::lookupTarget("", triple, error);
if (target == nullptr) {
emitError(loc, "cannot initialize target triple");
return {};
}
targetMachine.reset(target->createTargetMachine(triple.str(), targetChip,
features, {}, {}));
if (targetMachine == nullptr) {
emitError(loc, "cannot initialize target machine");
return {};
}
}
// The module adopts the data layout of the target machine before codegen.
llvmModule.setDataLayout(targetMachine->createDataLayout());
auto targetISA = translateModuleToISA(llvmModule, *targetMachine);
return blobGenerator(targetISA, loc, name);
}
// Converts `llvmModule` to a blob and wraps its bytes in a StringAttr created
// in the context of `loc`. Returns a null attribute if no blob was produced.
StringAttr GpuKernelToBlobPass::translateGPUModuleToBinaryAnnotation(
llvm::Module &llvmModule, Location loc, StringRef name) {
auto blob = convertModuleToBlob(llvmModule, loc, name);
if (!blob)
return {};
return StringAttr::get(loc->getContext(), {blob->data(), blob->size()});
}
std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
mlir::createConvertGPUKernelToBlobPass(LoweringCallback loweringCallback,
BlobGenerator blobGenerator,

View File

@ -20,6 +20,7 @@
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
#include "mlir/Dialect/Async/IR/Async.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/GPU/Passes.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/Builders.h"
@ -27,10 +28,6 @@
#include "mlir/IR/BuiltinTypes.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FormatVariadic.h"

View File

@ -5,10 +5,15 @@ add_mlir_dialect_library(MLIRGPU
Transforms/KernelOutlining.cpp
Transforms/MemoryPromotion.cpp
Transforms/ParallelLoopMapper.cpp
Transforms/SerializeToBlob.cpp
ADDITIONAL_HEADER_DIRS
${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU
LINK_COMPONENTS
Core
MC
DEPENDS
MLIRGPUOpsIncGen
MLIRGPUOpInterfacesIncGen

View File

@ -0,0 +1,95 @@
//===- SerializeToBlob.cpp - MLIR GPU lowering pass -----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a base class for a pass to serialize a gpu module
// into a binary blob that can be executed on a GPU. The binary blob is added
// as a string attribute to the gpu module.
//
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/GPU/Passes.h"
#include "mlir/Pass/Pass.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetMachine.h"
using namespace mlir;
/// Returns the default name of the module attribute that stores the GPU
/// binary blob.
std::string gpu::getDefaultGpuBinaryAnnotation() {
  return "gpu.binary";
}
/// Constructs the base pass; `passID` identifies the concrete derived pass and
/// is forwarded unchanged to the OperationPass base.
gpu::SerializeToBlobPass::SerializeToBlobPass(TypeID passID)
    : OperationPass<gpu::GPUModuleOp>(passID) {
}
/// Copy constructor: duplicates the pass together with its option values.
gpu::SerializeToBlobPass::SerializeToBlobPass(const SerializeToBlobPass &other)
    : OperationPass<gpu::GPUModuleOp>(other) {
  // Pass::Option cannot be copy-constructed, so every option value is
  // transferred explicitly by assignment.
  gpuBinaryAnnotation = other.gpuBinaryAnnotation;
  features = other.features;
  chip = other.chip;
  triple = other.triple;
}
/// Runs LLVM code generation on `llvmModule` for `targetMachine` and returns
/// the emitted target assembly as text.
///
/// As a side effect the module's data layout is replaced by the one created
/// by `targetMachine`.
static std::string translateToISA(llvm::Module &llvmModule,
                                  llvm::TargetMachine &targetMachine) {
  llvmModule.setDataLayout(targetMachine.createDataLayout());

  std::string targetISA;
  {
    // The streams only write into `targetISA` when they are flushed or
    // destroyed. Keep them in a nested scope so that this happens before the
    // string is returned; otherwise the result depends on whether the
    // compiler elides the copy/move of the returned local.
    llvm::raw_string_ostream stream(targetISA);
    llvm::buffer_ostream pstream(stream);
    llvm::legacy::PassManager codegenPasses;
    // NOTE(review): the return value of addPassesToEmitFile is ignored here,
    // as it was before; confirm whether emission failure needs to be
    // surfaced to the caller.
    targetMachine.addPassesToEmitFile(codegenPasses, pstream, nullptr,
                                      llvm::CGFT_AssemblyFile);
    codegenPasses.run(llvmModule);
  }
  return targetISA;
}
void gpu::SerializeToBlobPass::runOnOperation() {
// Lower the module to an LLVM IR module using a separate context to enable
// multi-threaded processing.
llvm::LLVMContext llvmContext;
std::unique_ptr<llvm::Module> llvmModule = translateToLLVMIR(llvmContext);
if (!llvmModule)
return signalPassFailure();
// Lower the LLVM IR module to target ISA.
std::unique_ptr<llvm::TargetMachine> targetMachine = createTargetMachine();
if (!targetMachine)
return signalPassFailure();
std::string targetISA = translateToISA(*llvmModule, *targetMachine);
// Serialize the target ISA.
std::unique_ptr<std::vector<char>> blob = serializeISA(targetISA);
if (!blob)
return signalPassFailure();
// Add the blob as module attribute.
auto attr = StringAttr::get(&getContext(), {blob->data(), blob->size()});
getOperation()->setAttr(gpuBinaryAnnotation, attr);
}
std::unique_ptr<llvm::TargetMachine>
gpu::SerializeToBlobPass::createTargetMachine() {
Location loc = getOperation().getLoc();
std::string error;
const llvm::Target *target =
llvm::TargetRegistry::lookupTarget(triple, error);
if (!target) {
emitError(loc, Twine("failed to lookup target: ") + error);
return {};
}
llvm::TargetMachine *machine =
target->createTargetMachine(triple, chip, features, {}, {});
if (!machine) {
emitError(loc, "failed to create target machine");
return {};
}
return std::unique_ptr<llvm::TargetMachine>{machine};
}