forked from OSchip/llvm-project
[MLIR][GPU] Run generic LLVM optimizations when serializing (on AMD)
- Adds hooks that allow SerializeTo* passes to arbitrarily transform the produced LLVM Module before it is passed to the code generation passes.
- Uses these hooks within the SerializeToHsaco pass in order to run LLVM optimizations and to set the optimization level on the TargetMachine.
- Adds an optLevel parameter to SerializeToHsaco.
Future work may include moving much of what's been added to SerializeToHsaco to SerializeToBlob, but that would require confirmation from the NVVM backend maintainers that it would be appropriate to do so.
Depends on D114107.
Reviewed By: mehdi_amini
Differential Revision: https://reviews.llvm.org/D114113
This commit is contained in:
parent
47555d73f6
commit
bd22554af0
|
@ -54,14 +54,23 @@ public:
|
||||||
protected:
|
protected:
|
||||||
void getDependentDialects(DialectRegistry &registry) const override;
|
void getDependentDialects(DialectRegistry &registry) const override;
|
||||||
|
|
||||||
private:
|
/// Hook allowing the application of optimizations before codegen
|
||||||
/// Creates the LLVM target machine to generate the ISA.
|
/// By default, does nothing
|
||||||
std::unique_ptr<llvm::TargetMachine> createTargetMachine();
|
virtual LogicalResult optimizeLlvm(llvm::Module &llvmModule,
|
||||||
|
llvm::TargetMachine &targetMachine);
|
||||||
|
|
||||||
/// Translates the 'getOperation()' result to an LLVM module.
|
/// Translates the 'getOperation()' result to an LLVM module.
|
||||||
virtual std::unique_ptr<llvm::Module>
|
virtual std::unique_ptr<llvm::Module>
|
||||||
translateToLLVMIR(llvm::LLVMContext &llvmContext);
|
translateToLLVMIR(llvm::LLVMContext &llvmContext);
|
||||||
|
|
||||||
|
private:
|
||||||
|
/// Creates the LLVM target machine to generate the ISA.
|
||||||
|
std::unique_ptr<llvm::TargetMachine> createTargetMachine();
|
||||||
|
|
||||||
|
/// Translates the module to ISA
|
||||||
|
Optional<std::string> translateToISA(llvm::Module &llvmModule,
|
||||||
|
llvm::TargetMachine &targetMachine);
|
||||||
|
|
||||||
/// Serializes the target ISA to binary form.
|
/// Serializes the target ISA to binary form.
|
||||||
virtual std::unique_ptr<std::vector<char>>
|
virtual std::unique_ptr<std::vector<char>>
|
||||||
serializeISA(const std::string &isa) = 0;
|
serializeISA(const std::string &isa) = 0;
|
||||||
|
|
|
@ -162,6 +162,7 @@ if(MLIR_ENABLE_ROCM_RUNNER)
|
||||||
target_link_libraries(MLIRGPUOps
|
target_link_libraries(MLIRGPUOps
|
||||||
PRIVATE
|
PRIVATE
|
||||||
lldELF
|
lldELF
|
||||||
|
MLIRExecutionEngine
|
||||||
MLIRROCDLToLLVMIRTranslation
|
MLIRROCDLToLLVMIRTranslation
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
@ -31,18 +31,28 @@ gpu::SerializeToBlobPass::SerializeToBlobPass(TypeID passID)
|
||||||
gpu::SerializeToBlobPass::SerializeToBlobPass(const SerializeToBlobPass &other)
|
gpu::SerializeToBlobPass::SerializeToBlobPass(const SerializeToBlobPass &other)
|
||||||
: OperationPass<gpu::GPUModuleOp>(other) {}
|
: OperationPass<gpu::GPUModuleOp>(other) {}
|
||||||
|
|
||||||
static std::string translateToISA(llvm::Module &llvmModule,
|
Optional<std::string>
|
||||||
llvm::TargetMachine &targetMachine) {
|
gpu::SerializeToBlobPass::translateToISA(llvm::Module &llvmModule,
|
||||||
|
llvm::TargetMachine &targetMachine) {
|
||||||
llvmModule.setDataLayout(targetMachine.createDataLayout());
|
llvmModule.setDataLayout(targetMachine.createDataLayout());
|
||||||
|
|
||||||
|
if (failed(optimizeLlvm(llvmModule, targetMachine)))
|
||||||
|
return llvm::None;
|
||||||
|
|
||||||
std::string targetISA;
|
std::string targetISA;
|
||||||
llvm::raw_string_ostream stream(targetISA);
|
llvm::raw_string_ostream stream(targetISA);
|
||||||
llvm::buffer_ostream pstream(stream);
|
|
||||||
llvm::legacy::PassManager codegenPasses;
|
llvm::legacy::PassManager codegenPasses;
|
||||||
targetMachine.addPassesToEmitFile(codegenPasses, pstream, nullptr,
|
|
||||||
llvm::CGFT_AssemblyFile);
|
{ // Drop pstream after this to prevent the ISA from being stuck buffering
|
||||||
codegenPasses.run(llvmModule);
|
llvm::buffer_ostream pstream(stream);
|
||||||
return targetISA;
|
if (targetMachine.addPassesToEmitFile(codegenPasses, pstream, nullptr,
|
||||||
|
llvm::CGFT_AssemblyFile))
|
||||||
|
return llvm::None;
|
||||||
|
|
||||||
|
codegenPasses.run(llvmModule);
|
||||||
|
}
|
||||||
|
return stream.str();
|
||||||
}
|
}
|
||||||
|
|
||||||
void gpu::SerializeToBlobPass::runOnOperation() {
|
void gpu::SerializeToBlobPass::runOnOperation() {
|
||||||
|
@ -58,7 +68,13 @@ void gpu::SerializeToBlobPass::runOnOperation() {
|
||||||
if (!targetMachine)
|
if (!targetMachine)
|
||||||
return signalPassFailure();
|
return signalPassFailure();
|
||||||
|
|
||||||
std::string targetISA = translateToISA(*llvmModule, *targetMachine);
|
Optional<std::string> maybeTargetISA =
|
||||||
|
translateToISA(*llvmModule, *targetMachine);
|
||||||
|
|
||||||
|
if (!maybeTargetISA.hasValue())
|
||||||
|
return signalPassFailure();
|
||||||
|
|
||||||
|
std::string targetISA = std::move(maybeTargetISA.getValue());
|
||||||
|
|
||||||
// Serialize the target ISA.
|
// Serialize the target ISA.
|
||||||
std::unique_ptr<std::vector<char>> blob = serializeISA(targetISA);
|
std::unique_ptr<std::vector<char>> blob = serializeISA(targetISA);
|
||||||
|
@ -71,6 +87,14 @@ void gpu::SerializeToBlobPass::runOnOperation() {
|
||||||
getOperation()->setAttr(gpuBinaryAnnotation, attr);
|
getOperation()->setAttr(gpuBinaryAnnotation, attr);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
LogicalResult
|
||||||
|
gpu::SerializeToBlobPass::optimizeLlvm(llvm::Module &llvmModule,
|
||||||
|
llvm::TargetMachine &targetMachine) {
|
||||||
|
// TODO: If serializeToCubin ends up defining optimizations, factor them
|
||||||
|
// into here from SerializeToHsaco
|
||||||
|
return success();
|
||||||
|
}
|
||||||
|
|
||||||
void gpu::SerializeToBlobPass::getDependentDialects(
|
void gpu::SerializeToBlobPass::getDependentDialects(
|
||||||
DialectRegistry &registry) const {
|
DialectRegistry &registry) const {
|
||||||
registerLLVMDialectTranslation(registry);
|
registerLLVMDialectTranslation(registry);
|
||||||
|
|
|
@ -15,6 +15,7 @@
|
||||||
#include "mlir/IR/MLIRContext.h"
|
#include "mlir/IR/MLIRContext.h"
|
||||||
|
|
||||||
#if MLIR_GPU_TO_HSACO_PASS_ENABLE
|
#if MLIR_GPU_TO_HSACO_PASS_ENABLE
|
||||||
|
#include "mlir/ExecutionEngine/OptUtils.h"
|
||||||
#include "mlir/Pass/Pass.h"
|
#include "mlir/Pass/Pass.h"
|
||||||
#include "mlir/Support/FileUtilities.h"
|
#include "mlir/Support/FileUtilities.h"
|
||||||
#include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h"
|
#include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h"
|
||||||
|
@ -53,12 +54,24 @@ namespace {
|
||||||
class SerializeToHsacoPass
|
class SerializeToHsacoPass
|
||||||
: public PassWrapper<SerializeToHsacoPass, gpu::SerializeToBlobPass> {
|
: public PassWrapper<SerializeToHsacoPass, gpu::SerializeToBlobPass> {
|
||||||
public:
|
public:
|
||||||
SerializeToHsacoPass(StringRef triple, StringRef arch, StringRef features);
|
SerializeToHsacoPass(StringRef triple, StringRef arch, StringRef features,
|
||||||
|
int optLevel);
|
||||||
|
SerializeToHsacoPass(const SerializeToHsacoPass &other);
|
||||||
StringRef getArgument() const override { return "gpu-to-hsaco"; }
|
StringRef getArgument() const override { return "gpu-to-hsaco"; }
|
||||||
StringRef getDescription() const override {
|
StringRef getDescription() const override {
|
||||||
return "Lower GPU kernel function to HSACO binary annotations";
|
return "Lower GPU kernel function to HSACO binary annotations";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected:
|
||||||
|
Option<int> optLevel{
|
||||||
|
*this, "opt-level",
|
||||||
|
llvm::cl::desc("Optimization level for HSACO compilation"),
|
||||||
|
llvm::cl::init(2)};
|
||||||
|
|
||||||
|
/// Adds LLVM optimization passes
|
||||||
|
LogicalResult optimizeLlvm(llvm::Module &llvmModule,
|
||||||
|
llvm::TargetMachine &targetMachine) override;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void getDependentDialects(DialectRegistry &registry) const override;
|
void getDependentDialects(DialectRegistry &registry) const override;
|
||||||
|
|
||||||
|
@ -72,6 +85,8 @@ private:
|
||||||
};
|
};
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
|
SerializeToHsacoPass::SerializeToHsacoPass(const SerializeToHsacoPass &other)
|
||||||
|
: PassWrapper<SerializeToHsacoPass, gpu::SerializeToBlobPass>(other) {}
|
||||||
static std::string getDefaultChip() {
|
static std::string getDefaultChip() {
|
||||||
const char kDefaultChip[] = "gfx900";
|
const char kDefaultChip[] = "gfx900";
|
||||||
|
|
||||||
|
@ -137,10 +152,12 @@ static void maybeSetOption(Pass::Option<std::string> &option,
|
||||||
}
|
}
|
||||||
|
|
||||||
SerializeToHsacoPass::SerializeToHsacoPass(StringRef triple, StringRef arch,
|
SerializeToHsacoPass::SerializeToHsacoPass(StringRef triple, StringRef arch,
|
||||||
StringRef features) {
|
StringRef features, int optLevel) {
|
||||||
maybeSetOption(this->triple, [&triple] { return triple.str(); });
|
maybeSetOption(this->triple, [&triple] { return triple.str(); });
|
||||||
maybeSetOption(this->chip, [&arch] { return arch.str(); });
|
maybeSetOption(this->chip, [&arch] { return arch.str(); });
|
||||||
maybeSetOption(this->features, [&features] { return features.str(); });
|
maybeSetOption(this->features, [&features] { return features.str(); });
|
||||||
|
if (this->optLevel.getNumOccurrences() == 0)
|
||||||
|
this->optLevel.setValue(optLevel);
|
||||||
}
|
}
|
||||||
|
|
||||||
void SerializeToHsacoPass::getDependentDialects(
|
void SerializeToHsacoPass::getDependentDialects(
|
||||||
|
@ -149,6 +166,30 @@ void SerializeToHsacoPass::getDependentDialects(
|
||||||
gpu::SerializeToBlobPass::getDependentDialects(registry);
|
gpu::SerializeToBlobPass::getDependentDialects(registry);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
LogicalResult
|
||||||
|
SerializeToHsacoPass::optimizeLlvm(llvm::Module &llvmModule,
|
||||||
|
llvm::TargetMachine &targetMachine) {
|
||||||
|
int optLevel = this->optLevel.getValue();
|
||||||
|
if (optLevel < 0 || optLevel > 3)
|
||||||
|
return getOperation().emitError()
|
||||||
|
<< "Invalid HSA optimization level" << optLevel << "\n";
|
||||||
|
|
||||||
|
targetMachine.setOptLevel(static_cast<llvm::CodeGenOpt::Level>(optLevel));
|
||||||
|
|
||||||
|
auto transformer =
|
||||||
|
makeOptimizingTransformer(optLevel, /*sizeLevel=*/0, &targetMachine);
|
||||||
|
auto error = transformer(&llvmModule);
|
||||||
|
if (error) {
|
||||||
|
InFlightDiagnostic mlirError = getOperation()->emitError();
|
||||||
|
llvm::handleAllErrors(
|
||||||
|
std::move(error), [&mlirError](const llvm::ErrorInfoBase &ei) {
|
||||||
|
mlirError << "Could not optimize LLVM IR: " << ei.message() << "\n";
|
||||||
|
});
|
||||||
|
return mlirError;
|
||||||
|
}
|
||||||
|
return success();
|
||||||
|
}
|
||||||
|
|
||||||
std::unique_ptr<SmallVectorImpl<char>>
|
std::unique_ptr<SmallVectorImpl<char>>
|
||||||
SerializeToHsacoPass::assembleIsa(const std::string &isa) {
|
SerializeToHsacoPass::assembleIsa(const std::string &isa) {
|
||||||
auto loc = getOperation().getLoc();
|
auto loc = getOperation().getLoc();
|
||||||
|
@ -286,7 +327,7 @@ void mlir::registerGpuSerializeToHsacoPass() {
|
||||||
LLVMInitializeAMDGPUTargetMC();
|
LLVMInitializeAMDGPUTargetMC();
|
||||||
|
|
||||||
return std::make_unique<SerializeToHsacoPass>("amdgcn-amd-amdhsa", "",
|
return std::make_unique<SerializeToHsacoPass>("amdgcn-amd-amdhsa", "",
|
||||||
"");
|
"", 2);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
#else // MLIR_GPU_TO_HSACO_PASS_ENABLE
|
#else // MLIR_GPU_TO_HSACO_PASS_ENABLE
|
||||||
|
|
Loading…
Reference in New Issue