forked from OSchip/llvm-project
[mlir] Add base class for GpuKernelToBlobPass
Instead of configuring kernel-to-cubin/rocdl lowering through callbacks, introduce a base class that target-specific passes can derive from. Put the base class in GPU/Transforms, according to the discussion in D98203. The mlir-cuda-runner will go away shortly, and the mlir-rocdl-runner as well at some point. I therefore kept the existing code path working and will remove it in a separate step. Depends On D98168 Reviewed By: herhut Differential Revision: https://reviews.llvm.org/D98279
This commit is contained in:
parent
8d9b9c0edc
commit
4d295cf5b5
|
@ -9,9 +9,14 @@
|
|||
#define MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_
|
||||
|
||||
#include "mlir/Support/LLVM.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/ADT/StringRef.h"
|
||||
#include <vector>
|
||||
|
||||
namespace llvm {
|
||||
class LLVMContext;
|
||||
class Module;
|
||||
} // namespace llvm
|
||||
|
||||
namespace mlir {
|
||||
|
||||
class LLVMTypeConverter;
|
||||
|
@ -26,9 +31,6 @@ class OperationPass;
|
|||
|
||||
namespace gpu {
|
||||
class GPUModuleOp;
|
||||
|
||||
/// Returns the default annotation name for GPU binary blobs.
|
||||
std::string getDefaultGpuBinaryAnnotation();
|
||||
} // namespace gpu
|
||||
|
||||
namespace LLVM {
|
||||
|
|
|
@ -13,8 +13,15 @@
|
|||
#ifndef MLIR_DIALECT_GPU_PASSES_H_
|
||||
#define MLIR_DIALECT_GPU_PASSES_H_
|
||||
|
||||
#include "mlir/Dialect/GPU/GPUDialect.h"
|
||||
#include "mlir/Pass/Pass.h"
|
||||
|
||||
namespace llvm {
|
||||
class TargetMachine;
|
||||
class LLVMContext;
|
||||
class Module;
|
||||
} // namespace llvm
|
||||
|
||||
namespace mlir {
|
||||
/// Replaces `gpu.launch` with `gpu.launch_func` by moving the region into
|
||||
/// a separate kernel function.
|
||||
|
@ -33,6 +40,45 @@ inline void populateGpuRewritePatterns(MLIRContext *context,
|
|||
populateGpuAllReducePatterns(context, patterns);
|
||||
}
|
||||
|
||||
namespace gpu {
|
||||
/// Returns the default annotation name for GPU binary blobs.
|
||||
std::string getDefaultGpuBinaryAnnotation();
|
||||
|
||||
/// Base pass class to serialize kernel functions through LLVM into
|
||||
/// user-specified IR and add the resulting blob as module attribute.
///
/// The pass operates on `gpu::GPUModuleOp`. Derived passes implement the two
/// pure-virtual hooks declared below (`translateToLLVMIR` and `serializeISA`);
/// `runOnOperation()` is `final` and is the only entry point that calls them.
/// The code-generation target is selected through the `triple`, `chip` and
/// `features` pass options.
|
||||
class SerializeToBlobPass : public OperationPass<gpu::GPUModuleOp> {
|
||||
public:
|
||||
/// Constructs the pass; `passID` is handed to the `OperationPass` base.
SerializeToBlobPass(TypeID passID);
|
||||
/// Copy-constructs the pass and carries over the values of all four options
/// declared in this class.
SerializeToBlobPass(const SerializeToBlobPass &other);
|
||||
|
||||
/// Translates the operation to LLVM IR, generates target ISA from it,
/// serializes the ISA and stores the bytes as a string attribute named by
/// `gpuBinaryAnnotation`. Signals pass failure when translation, target
/// machine creation or serialization yields a null result.
void runOnOperation() final;
|
||||
|
||||
private:
|
||||
/// Creates the LLVM target machine to generate the ISA, using the `triple`,
/// `chip` and `features` options. Emits an error at the operation's location
/// and returns null if the target cannot be looked up or the machine cannot
/// be created.
|
||||
std::unique_ptr<llvm::TargetMachine> createTargetMachine();
|
||||
|
||||
/// Translates the 'getOperation()' result to an LLVM module owned by
/// `llvmContext`. Implemented by derived passes; a null result makes
/// `runOnOperation()` signal failure.
|
||||
virtual std::unique_ptr<llvm::Module>
|
||||
translateToLLVMIR(llvm::LLVMContext &llvmContext) = 0;
|
||||
|
||||
/// Serializes the target ISA (`isa`, as produced by code generation) to
/// binary form. Implemented by derived passes; a null result makes
/// `runOnOperation()` signal failure.
|
||||
virtual std::unique_ptr<std::vector<char>>
|
||||
serializeISA(const std::string &isa) = 0;
|
||||
|
||||
protected:
|
||||
/// Target triple used for both the target lookup and the target machine.
Option<std::string> triple{*this, "triple",
|
||||
::llvm::cl::desc("Target triple")};
|
||||
/// Target architecture, passed as the CPU/chip name when the target machine
/// is created.
Option<std::string> chip{*this, "chip",
|
||||
::llvm::cl::desc("Target architecture")};
|
||||
/// Target feature string passed when the target machine is created.
Option<std::string> features{*this, "features",
|
||||
::llvm::cl::desc("Target features")};
|
||||
/// Name of the attribute that receives the serialized blob; defaults to the
/// value of `getDefaultGpuBinaryAnnotation()`.
Option<std::string> gpuBinaryAnnotation{
|
||||
*this, "gpu-binary-annotation",
|
||||
llvm::cl::desc("Annotation attribute string for GPU binary"),
|
||||
llvm::cl::init(getDefaultGpuBinaryAnnotation())};
|
||||
};
|
||||
} // namespace gpu
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Registration
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
|
|
@ -24,8 +24,6 @@ add_mlir_conversion_library(MLIRGPUToGPURuntimeTransforms
|
|||
intrinsics_gen
|
||||
|
||||
LINK_COMPONENTS
|
||||
Core
|
||||
MC
|
||||
${AMDGPU_LIBS}
|
||||
${NVPTX_LIBS}
|
||||
|
||||
|
|
|
@ -15,6 +15,7 @@
|
|||
#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
|
||||
|
||||
#include "mlir/Dialect/GPU/GPUDialect.h"
|
||||
#include "mlir/Dialect/GPU/Passes.h"
|
||||
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
|
||||
#include "mlir/IR/Attributes.h"
|
||||
#include "mlir/IR/Builders.h"
|
||||
|
@ -25,14 +26,10 @@
|
|||
|
||||
#include "llvm/ADT/Optional.h"
|
||||
#include "llvm/ADT/Twine.h"
|
||||
#include "llvm/IR/Constants.h"
|
||||
#include "llvm/IR/LegacyPassManager.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/Support/Error.h"
|
||||
#include "llvm/Support/Mutex.h"
|
||||
#include "llvm/Support/TargetRegistry.h"
|
||||
#include "llvm/Support/TargetSelect.h"
|
||||
#include "llvm/Target/TargetMachine.h"
|
||||
|
||||
using namespace mlir;
|
||||
|
||||
|
@ -45,126 +42,43 @@ namespace {
|
|||
/// GPU binary code, which is then attached as an attribute to the function.
|
||||
/// The function body is erased.
|
||||
class GpuKernelToBlobPass
|
||||
: public PassWrapper<GpuKernelToBlobPass, OperationPass<gpu::GPUModuleOp>> {
|
||||
: public PassWrapper<GpuKernelToBlobPass, gpu::SerializeToBlobPass> {
|
||||
public:
|
||||
GpuKernelToBlobPass(LoweringCallback loweringCallback,
|
||||
BlobGenerator blobGenerator, StringRef triple,
|
||||
StringRef targetChip, StringRef features,
|
||||
StringRef gpuBinaryAnnotation)
|
||||
: loweringCallback(loweringCallback), blobGenerator(blobGenerator),
|
||||
triple(triple), targetChip(targetChip), features(features) {
|
||||
: loweringCallback(loweringCallback), blobGenerator(blobGenerator) {
|
||||
if (!triple.empty())
|
||||
this->triple = triple.str();
|
||||
if (!targetChip.empty())
|
||||
this->chip = targetChip.str();
|
||||
if (!features.empty())
|
||||
this->features = features.str();
|
||||
if (!gpuBinaryAnnotation.empty())
|
||||
this->gpuBinaryAnnotation = gpuBinaryAnnotation.str();
|
||||
}
|
||||
|
||||
GpuKernelToBlobPass(const GpuKernelToBlobPass &other)
|
||||
: loweringCallback(other.loweringCallback),
|
||||
blobGenerator(other.blobGenerator), triple(other.triple),
|
||||
targetChip(other.targetChip), features(other.features) {}
|
||||
|
||||
void runOnOperation() override {
|
||||
gpu::GPUModuleOp module = getOperation();
|
||||
|
||||
// Lower the module to an LLVM IR module using a separate context to enable
|
||||
// multi-threaded processing.
|
||||
llvm::LLVMContext llvmContext;
|
||||
std::unique_ptr<llvm::Module> llvmModule =
|
||||
loweringCallback(module, llvmContext, "LLVMDialectModule");
|
||||
if (!llvmModule)
|
||||
return signalPassFailure();
|
||||
|
||||
// Translate the llvm module to a target blob and attach the result as
|
||||
// attribute to the module.
|
||||
if (auto blobAttr = translateGPUModuleToBinaryAnnotation(
|
||||
*llvmModule, module.getLoc(), module.getName()))
|
||||
module->setAttr(gpuBinaryAnnotation, blobAttr);
|
||||
else
|
||||
signalPassFailure();
|
||||
private:
|
||||
// Translates the 'getOperation()' result to an LLVM module.
|
||||
std::unique_ptr<llvm::Module>
|
||||
translateToLLVMIR(llvm::LLVMContext &llvmContext) override {
|
||||
return loweringCallback(getOperation(), llvmContext, "LLVMDialectModule");
|
||||
}
|
||||
|
||||
private:
|
||||
std::string translateModuleToISA(llvm::Module &module,
|
||||
llvm::TargetMachine &targetMachine);
|
||||
|
||||
/// Converts llvmModule to a blob with target instructions using the
|
||||
/// user-provided generator. Location is used for error reporting and name is
|
||||
/// forwarded to the blob generator to use in its logging mechanisms.
|
||||
OwnedBlob convertModuleToBlob(llvm::Module &llvmModule, Location loc,
|
||||
StringRef name);
|
||||
|
||||
/// Translates llvmModule to a blob with target instructions and returns the
|
||||
/// result as attribute.
|
||||
StringAttr translateGPUModuleToBinaryAnnotation(llvm::Module &llvmModule,
|
||||
Location loc, StringRef name);
|
||||
// Serializes the target ISA to binary form.
|
||||
std::unique_ptr<std::vector<char>>
|
||||
serializeISA(const std::string &isa) override {
|
||||
return blobGenerator(isa, getOperation().getLoc(),
|
||||
getOperation().getName());
|
||||
}
|
||||
|
||||
LoweringCallback loweringCallback;
|
||||
BlobGenerator blobGenerator;
|
||||
|
||||
llvm::Triple triple;
|
||||
std::string targetChip;
|
||||
std::string features;
|
||||
|
||||
Option<std::string> gpuBinaryAnnotation{
|
||||
*this, "gpu-binary-annotation",
|
||||
llvm::cl::desc("Annotation attribute string for GPU binary"),
|
||||
llvm::cl::init(gpu::getDefaultGpuBinaryAnnotation())};
|
||||
};
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
std::string gpu::getDefaultGpuBinaryAnnotation() { return "gpu.binary"; }
|
||||
|
||||
std::string
|
||||
GpuKernelToBlobPass::translateModuleToISA(llvm::Module &module,
|
||||
llvm::TargetMachine &targetMachine) {
|
||||
std::string targetISA;
|
||||
{
|
||||
llvm::raw_string_ostream stream(targetISA);
|
||||
llvm::buffer_ostream pstream(stream);
|
||||
llvm::legacy::PassManager codegenPasses;
|
||||
targetMachine.addPassesToEmitFile(codegenPasses, pstream, nullptr,
|
||||
llvm::CGFT_AssemblyFile);
|
||||
codegenPasses.run(module);
|
||||
}
|
||||
|
||||
return targetISA;
|
||||
}
|
||||
|
||||
OwnedBlob GpuKernelToBlobPass::convertModuleToBlob(llvm::Module &llvmModule,
|
||||
Location loc,
|
||||
StringRef name) {
|
||||
std::unique_ptr<llvm::TargetMachine> targetMachine;
|
||||
{
|
||||
std::string error;
|
||||
const llvm::Target *target =
|
||||
llvm::TargetRegistry::lookupTarget("", triple, error);
|
||||
if (target == nullptr) {
|
||||
emitError(loc, "cannot initialize target triple");
|
||||
return {};
|
||||
}
|
||||
targetMachine.reset(target->createTargetMachine(triple.str(), targetChip,
|
||||
features, {}, {}));
|
||||
if (targetMachine == nullptr) {
|
||||
emitError(loc, "cannot initialize target machine");
|
||||
return {};
|
||||
}
|
||||
}
|
||||
|
||||
llvmModule.setDataLayout(targetMachine->createDataLayout());
|
||||
|
||||
auto targetISA = translateModuleToISA(llvmModule, *targetMachine);
|
||||
|
||||
return blobGenerator(targetISA, loc, name);
|
||||
}
|
||||
|
||||
StringAttr GpuKernelToBlobPass::translateGPUModuleToBinaryAnnotation(
|
||||
llvm::Module &llvmModule, Location loc, StringRef name) {
|
||||
auto blob = convertModuleToBlob(llvmModule, loc, name);
|
||||
if (!blob)
|
||||
return {};
|
||||
return StringAttr::get(loc->getContext(), {blob->data(), blob->size()});
|
||||
}
|
||||
|
||||
std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
|
||||
mlir::createConvertGPUKernelToBlobPass(LoweringCallback loweringCallback,
|
||||
BlobGenerator blobGenerator,
|
||||
|
|
|
@ -20,6 +20,7 @@
|
|||
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
|
||||
#include "mlir/Dialect/Async/IR/Async.h"
|
||||
#include "mlir/Dialect/GPU/GPUDialect.h"
|
||||
#include "mlir/Dialect/GPU/Passes.h"
|
||||
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
|
||||
#include "mlir/IR/Attributes.h"
|
||||
#include "mlir/IR/Builders.h"
|
||||
|
@ -27,10 +28,6 @@
|
|||
#include "mlir/IR/BuiltinTypes.h"
|
||||
|
||||
#include "llvm/ADT/STLExtras.h"
|
||||
#include "llvm/IR/DataLayout.h"
|
||||
#include "llvm/IR/DerivedTypes.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/IR/Type.h"
|
||||
#include "llvm/Support/Error.h"
|
||||
#include "llvm/Support/FormatVariadic.h"
|
||||
|
||||
|
|
|
@ -5,10 +5,15 @@ add_mlir_dialect_library(MLIRGPU
|
|||
Transforms/KernelOutlining.cpp
|
||||
Transforms/MemoryPromotion.cpp
|
||||
Transforms/ParallelLoopMapper.cpp
|
||||
Transforms/SerializeToBlob.cpp
|
||||
|
||||
ADDITIONAL_HEADER_DIRS
|
||||
${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU
|
||||
|
||||
LINK_COMPONENTS
|
||||
Core
|
||||
MC
|
||||
|
||||
DEPENDS
|
||||
MLIRGPUOpsIncGen
|
||||
MLIRGPUOpInterfacesIncGen
|
||||
|
|
|
@ -0,0 +1,95 @@
|
|||
//===- SerializeToBlob.cpp - MLIR GPU lowering pass -----------------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file implements a base class for a pass to serialize a gpu module
|
||||
// into a binary blob that can be executed on a GPU. The binary blob is added
|
||||
// as a string attribute to the gpu module.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "mlir/Dialect/GPU/Passes.h"
|
||||
#include "mlir/Pass/Pass.h"
|
||||
#include "llvm/IR/LegacyPassManager.h"
|
||||
#include "llvm/Support/TargetRegistry.h"
|
||||
#include "llvm/Support/TargetSelect.h"
|
||||
#include "llvm/Target/TargetMachine.h"
|
||||
|
||||
using namespace mlir;
|
||||
|
||||
/// Returns the attribute name under which a serialized GPU binary is stored
/// when the `gpu-binary-annotation` option is left at its default.
std::string gpu::getDefaultGpuBinaryAnnotation() {
  return std::string("gpu.binary");
}
|
||||
|
||||
// Constructs the base pass. The only work done here is forwarding `passID`
// to the OperationPass<gpu::GPUModuleOp> base class.
gpu::SerializeToBlobPass::SerializeToBlobPass(TypeID passID)
|
||||
: OperationPass<gpu::GPUModuleOp>(passID) {}
|
||||
|
||||
// Copy constructor: copy-constructs the OperationPass base from `other`, then
// transfers the value of each option declared by SerializeToBlobPass.
gpu::SerializeToBlobPass::SerializeToBlobPass(const SerializeToBlobPass &other)
|
||||
: OperationPass<gpu::GPUModuleOp>(other) {
|
||||
// Pass::Option has no copy constructor, so the options cannot be copied in
// the member-initializer list; assign each one after construction instead.
// Any option added to this class must also be added to this list.
|
||||
triple = other.triple;
|
||||
chip = other.chip;
|
||||
features = other.features;
|
||||
gpuBinaryAnnotation = other.gpuBinaryAnnotation;
|
||||
}
|
||||
|
||||
/// Runs LLVM code generation on `llvmModule` for `targetMachine` and returns
/// the emitted assembly text (the file type requested is
/// `llvm::CGFT_AssemblyFile`).
///
/// The module's data layout is overwritten with the layout created by
/// `targetMachine` before the code generation passes run.
static std::string translateToISA(llvm::Module &llvmModule,
                                  llvm::TargetMachine &targetMachine) {
  llvmModule.setDataLayout(targetMachine.createDataLayout());

  std::string targetISA;
  {
    // Keep both streams in an inner scope: they write their buffered contents
    // through to `targetISA` when destroyed, and that has to happen before the
    // string is returned. Returning while the streams are still alive only
    // yields the complete output if the compiler elides the return copy.
    llvm::raw_string_ostream stream(targetISA);
    llvm::buffer_ostream pstream(stream);
    llvm::legacy::PassManager codegenPasses;
    // NOTE(review): addPassesToEmitFile reports through its bool result when
    // the target cannot emit the requested file type; the result is ignored
    // here, as before. Consider surfacing it once the signature can report
    // failure.
    targetMachine.addPassesToEmitFile(codegenPasses, pstream, nullptr,
                                      llvm::CGFT_AssemblyFile);
    codegenPasses.run(llvmModule);
  }
  return targetISA;
}
|
||||
|
||||
void gpu::SerializeToBlobPass::runOnOperation() {
|
||||
// Lower the module to an LLVM IR module using a separate context to enable
|
||||
// multi-threaded processing.
|
||||
llvm::LLVMContext llvmContext;
|
||||
std::unique_ptr<llvm::Module> llvmModule = translateToLLVMIR(llvmContext);
|
||||
if (!llvmModule)
|
||||
return signalPassFailure();
|
||||
|
||||
// Lower the LLVM IR module to target ISA.
|
||||
std::unique_ptr<llvm::TargetMachine> targetMachine = createTargetMachine();
|
||||
if (!targetMachine)
|
||||
return signalPassFailure();
|
||||
|
||||
std::string targetISA = translateToISA(*llvmModule, *targetMachine);
|
||||
|
||||
// Serialize the target ISA.
|
||||
std::unique_ptr<std::vector<char>> blob = serializeISA(targetISA);
|
||||
if (!blob)
|
||||
return signalPassFailure();
|
||||
|
||||
// Add the blob as module attribute.
|
||||
auto attr = StringAttr::get(&getContext(), {blob->data(), blob->size()});
|
||||
getOperation()->setAttr(gpuBinaryAnnotation, attr);
|
||||
}
|
||||
|
||||
std::unique_ptr<llvm::TargetMachine>
|
||||
gpu::SerializeToBlobPass::createTargetMachine() {
|
||||
Location loc = getOperation().getLoc();
|
||||
std::string error;
|
||||
const llvm::Target *target =
|
||||
llvm::TargetRegistry::lookupTarget(triple, error);
|
||||
if (!target) {
|
||||
emitError(loc, Twine("failed to lookup target: ") + error);
|
||||
return {};
|
||||
}
|
||||
llvm::TargetMachine *machine =
|
||||
target->createTargetMachine(triple, chip, features, {}, {});
|
||||
if (!machine) {
|
||||
emitError(loc, "failed to create target machine");
|
||||
return {};
|
||||
}
|
||||
|
||||
return std::unique_ptr<llvm::TargetMachine>{machine};
|
||||
}
|
Loading…
Reference in New Issue