[mlir][gpu] Introduce mlir-rocm-runner.

Summary:
`mlir-rocm-runner` is introduced in this commit to execute GPU modules on the
ROCm platform. A small wrapper library that encapsulates ROCm's HIP runtime
API is also included in the commit.

Due to the behavior of ROCm, raw pointers inside memrefs passed to `gpu.launch`
must be modified on the host side to properly capture the pointer values
addressable on the GPU.
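
For illustration, a minimal C++ sketch of the underlying HIP behavior (the
helper name here is hypothetical and error handling is elided; the commit's
actual wrappers such as `mgpuMemGetDeviceMemRef1dFloat` appear further below):
the host buffer is pinned with the runtime, and the GPU-addressable pointer is
queried explicitly so it can be written back into the memref descriptor.

  // Sketch only: why raw host pointers cannot be handed to gpu.launch as-is.
  #include "hip/hip_runtime.h"
  #include <cstddef>

  static float *getGpuAddressablePointer(float *hostPtr, size_t sizeBytes) {
    // Pin the host allocation so the device can access it.
    hipHostRegister(hostPtr, sizeBytes, /*flags=*/0);
    // The device-visible address is in general not equal to hostPtr; this is
    // the value that must replace the raw pointers inside the memref.
    float *devicePtr = nullptr;
    hipHostGetDevicePointer(reinterpret_cast<void **>(&devicePtr), hostPtr,
                            /*flags=*/0);
    return devicePtr;
  }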

LLVM MC is used to assemble the AMD GCN ISA emitted by
`ConvertGPUKernelToBlobPass` into binary form, and LLD is used to produce a
shared ELF object which can be loaded by the ROCm HIP runtime.

gfx900 is the default target used right now, although it can be altered via an
option in `mlir-rocm-runner`. Future revisions may consider using the ROCm
Agent Enumerator to detect the right target on the system.
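
As a hedged sketch of what such detection could look like (hypothetical, not
part of this commit), one might shell out to `rocm_agent_enumerator` and fall
back to gfx900 when no GPU agent is reported:

  // Hypothetical target auto-detection; the tool name comes from the commit
  // message, everything else here is an assumption.
  #include <cstdio>
  #include <string>

  static std::string detectTargetChip() {
    std::string chip = "gfx900"; // current default in mlir-rocm-runner
    if (FILE *p = popen("rocm_agent_enumerator", "r")) {
      char line[64];
      while (fgets(line, sizeof(line), p)) {
        std::string s(line);
        while (!s.empty() && (s.back() == '\n' || s.back() == '\r'))
          s.pop_back();
        // One agent per line; gfx000 denotes the host CPU.
        if (!s.empty() && s != "gfx000")
          chip = s;
      }
      pclose(p);
    }
    return chip;
  }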

Note that AMDGPU Code Object V2 is used in this revision. Future enhancements
may upgrade to AMDGPU Code Object V3.

Bitcode libraries in ROCm-Device-Libs, which implement the math routines
exposed in the `rocdl` dialect, are not yet linked; this is left as a TODO in
the logic.
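
For reference, a rough sketch of what that TODO might involve (an assumption
about future work, not what this commit implements): loading a Device-Libs
bitcode file such as ocml.amdgcn.bc and linking it into the module produced by
`translateModuleToROCDLIR`. The library path below is illustrative only.

  // Hypothetical Device-Libs linking step; the path is an assumption.
  #include "llvm/Bitcode/BitcodeReader.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/IR/Module.h"
  #include "llvm/Linker/Linker.h"
  #include "llvm/Support/Error.h"
  #include "llvm/Support/MemoryBuffer.h"
  #include <utility>

  static bool linkOcmlInto(llvm::Module &module) {
    auto buf = llvm::MemoryBuffer::getFile("/opt/rocm/lib/ocml.amdgcn.bc");
    if (!buf)
      return false;
    auto lib =
        llvm::parseBitcodeFile((*buf)->getMemBufferRef(), module.getContext());
    if (!lib) {
      llvm::consumeError(lib.takeError());
      return false;
    }
    // LinkOnlyNeeded pulls in only the math routines the kernel references.
    return !llvm::Linker::linkModules(module, std::move(*lib),
                                      llvm::Linker::Flags::LinkOnlyNeeded);
  }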

Reviewers: herhut

Subscribers: mgorny, tpr, dexonsmith, mehdi_amini, rriddle, jpienaar, shauheen, antiagainst, nicolasvasilache, csigg, arpith-jacob, mgester, lucyrfox, aartbik, liufengdb, stephenneuendorffer, Joonsoo, grosul1, frgossen, Kayjukh, jurahul, llvm-commits

Tags: #mlir, #llvm

Differential Revision: https://reviews.llvm.org/D80676
Wen-Heng (Jack) Chung 2020-05-20 16:07:49 -05:00
parent 164f4b9d26
commit 2fd6403a6d
14 changed files with 634 additions and 4 deletions


@@ -41,6 +41,7 @@ endif()
add_definitions(-DMLIR_ROCM_CONVERSIONS_ENABLED=${MLIR_ROCM_CONVERSIONS_ENABLED})
set(MLIR_CUDA_RUNNER_ENABLED 0 CACHE BOOL "Enable building the mlir CUDA runner")
set(MLIR_ROCM_RUNNER_ENABLED 0 CACHE BOOL "Enable building the mlir ROCm runner")
set(MLIR_VULKAN_RUNNER_ENABLED 0 CACHE BOOL "Enable building the mlir Vulkan runner")
option(MLIR_INCLUDE_TESTS


@@ -43,7 +43,8 @@ using LoweringCallback =
/// instead uses a small wrapper library that exports a stable and conveniently
/// typed ABI on top of GPU runtimes such as CUDA or ROCm (HIP).
std::unique_ptr<OperationPass<ModuleOp>>
createConvertGpuLaunchFuncToGpuRuntimeCallsPass();
createConvertGpuLaunchFuncToGpuRuntimeCallsPass(
StringRef gpuBinaryAnnotation = "");
/// Creates a pass to convert kernel functions into GPU target object blobs.
///


@@ -123,6 +123,11 @@ private:
void translateGpuLaunchCalls(mlir::gpu::LaunchFuncOp launchOp);
public:
GpuLaunchFuncToGpuRuntimeCallsPass() = default;
GpuLaunchFuncToGpuRuntimeCallsPass(StringRef gpuBinaryAnnotation) {
this->gpuBinaryAnnotation = gpuBinaryAnnotation.str();
}
// Run the dialect converter on the module.
void runOnOperation() override {
// Cache the LLVMDialect for the current module.
@@ -457,6 +462,8 @@ void GpuLaunchFuncToGpuRuntimeCallsPass::translateGpuLaunchCalls(
}
std::unique_ptr<mlir::OperationPass<mlir::ModuleOp>>
mlir::createConvertGpuLaunchFuncToGpuRuntimeCallsPass() {
return std::make_unique<GpuLaunchFuncToGpuRuntimeCallsPass>();
mlir::createConvertGpuLaunchFuncToGpuRuntimeCallsPass(
StringRef gpuBinaryAnnotation) {
return std::make_unique<GpuLaunchFuncToGpuRuntimeCallsPass>(
gpuBinaryAnnotation);
}


@@ -13,8 +13,9 @@ set(MLIR_DIALECT_LINALG_INTEGRATION_TEST_LIB_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTOR
set(MLIR_RUNNER_UTILS_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTORY})
# Passed to lit.site.cfg.py.in to set up the path where to find the libraries
# for the mlir cuda runner tests.
# for the mlir cuda / rocm / vulkan runner tests.
set(MLIR_CUDA_WRAPPER_LIBRARY_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTORY})
set(MLIR_ROCM_WRAPPER_LIBRARY_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTORY})
set(MLIR_VULKAN_WRAPPER_LIBRARY_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTORY})
configure_lit_site_cfg(
@@ -64,6 +65,12 @@ if(MLIR_CUDA_RUNNER_ENABLED)
)
endif()
if(MLIR_ROCM_RUNNER_ENABLED)
list(APPEND MLIR_TEST_DEPENDS
mlir-rocm-runner
)
endif()
if(MLIR_VULKAN_RUNNER_ENABLED)
list(APPEND MLIR_TEST_DEPENDS
mlir-vulkan-runner


@@ -70,6 +70,7 @@ tools.extend([
ToolSubst('%cuda_wrapper_library_dir', config.cuda_wrapper_library_dir, unresolved='ignore'),
ToolSubst('%linalg_test_lib_dir', config.linalg_test_lib_dir, unresolved='ignore'),
ToolSubst('%mlir_runner_utils_dir', config.mlir_runner_utils_dir, unresolved='ignore'),
ToolSubst('%rocm_wrapper_library_dir', config.rocm_wrapper_library_dir, unresolved='ignore'),
ToolSubst('%vulkan_wrapper_library_dir', config.vulkan_wrapper_library_dir, unresolved='ignore')
])


@@ -39,6 +39,8 @@ config.run_cuda_tests = @MLIR_CUDA_CONVERSIONS_ENABLED@
config.cuda_wrapper_library_dir = "@MLIR_CUDA_WRAPPER_LIBRARY_DIR@"
config.enable_cuda_runner = @MLIR_CUDA_RUNNER_ENABLED@
config.run_rocm_tests = @MLIR_ROCM_CONVERSIONS_ENABLED@
config.rocm_wrapper_library_dir = "@MLIR_ROCM_WRAPPER_LIBRARY_DIR@"
config.enable_rocm_runner = @MLIR_ROCM_RUNNER_ENABLED@
config.vulkan_wrapper_library_dir = "@MLIR_VULKAN_WRAPPER_LIBRARY_DIR@"
config.enable_vulkan_runner = @MLIR_VULKAN_RUNNER_ENABLED@


@@ -0,0 +1,32 @@
// RUN: mlir-rocm-runner %s --shared-libs=%rocm_wrapper_library_dir/librocm-runtime-wrappers%shlibext,%linalg_test_lib_dir/libmlir_runner_utils%shlibext --entry-point-result=void | FileCheck %s
func @other_func(%arg0 : f32, %arg1 : memref<?xf32>) {
%cst = constant 1 : index
%cst2 = dim %arg1, 0 : memref<?xf32>
gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst, %grid_z = %cst)
threads(%tx, %ty, %tz) in (%block_x = %cst2, %block_y = %cst, %block_z = %cst) {
store %arg0, %arg1[%tx] : memref<?xf32>
gpu.terminator
}
return
}
// CHECK: [1, 1, 1, 1, 1]
func @main() {
%arg0 = alloc() : memref<5xf32>
%21 = constant 5 : i32
%22 = memref_cast %arg0 : memref<5xf32> to memref<?xf32>
%cast = memref_cast %22 : memref<?xf32> to memref<*xf32>
call @mgpuMemHostRegisterFloat(%cast) : (memref<*xf32>) -> ()
%23 = memref_cast %22 : memref<?xf32> to memref<*xf32>
call @print_memref_f32(%23) : (memref<*xf32>) -> ()
%24 = constant 1.0 : f32
%25 = call @mgpuMemGetDeviceMemRef1dFloat(%22) : (memref<?xf32>) -> (memref<?xf32>)
call @other_func(%24, %25) : (f32, memref<?xf32>) -> ()
call @print_memref_f32(%23) : (memref<*xf32>) -> ()
return
}
func @mgpuMemHostRegisterFloat(%ptr : memref<*xf32>)
func @mgpuMemGetDeviceMemRef1dFloat(%ptr : memref<?xf32>) -> (memref<?xf32>)
func @print_memref_f32(%ptr : memref<*xf32>)


@@ -0,0 +1,2 @@
if not config.enable_rocm_runner:
config.unsupported = True


@@ -0,0 +1,30 @@
// RUN: mlir-rocm-runner %s --print-ir-after-all --shared-libs=%rocm_wrapper_library_dir/librocm-runtime-wrappers%shlibext,%linalg_test_lib_dir/libmlir_runner_utils%shlibext --entry-point-result=void | FileCheck %s --dump-input=always
// CHECK: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
func @main() {
%arg = alloc() : memref<13xi32>
%dst = memref_cast %arg : memref<13xi32> to memref<?xi32>
%one = constant 1 : index
%sx = dim %dst, 0 : memref<?xi32>
%cast_dst = memref_cast %dst : memref<?xi32> to memref<*xi32>
call @mgpuMemHostRegisterInt32(%cast_dst) : (memref<*xi32>) -> ()
%dst_device = call @mgpuMemGetDeviceMemRef1dInt32(%dst) : (memref<?xi32>) -> (memref<?xi32>)
gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %one, %grid_y = %one, %grid_z = %one)
threads(%tx, %ty, %tz) in (%block_x = %sx, %block_y = %one, %block_z = %one) {
%t0 = index_cast %tx : index to i32
store %t0, %dst_device[%tx] : memref<?xi32>
gpu.terminator
}
gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %one, %grid_y = %one, %grid_z = %one)
threads(%tx, %ty, %tz) in (%block_x = %sx, %block_y = %one, %block_z = %one) {
%t0 = index_cast %tx : index to i32
store %t0, %dst_device[%tx] : memref<?xi32>
gpu.terminator
}
call @print_memref_i32(%cast_dst) : (memref<*xi32>) -> ()
return
}
func @mgpuMemHostRegisterInt32(%ptr : memref<*xi32>)
func @mgpuMemGetDeviceMemRef1dInt32(%ptr : memref<?xi32>) -> (memref<?xi32>)
func @print_memref_i32(%ptr : memref<*xi32>)


@@ -0,0 +1,42 @@
// RUN: mlir-rocm-runner %s --shared-libs=%rocm_wrapper_library_dir/librocm-runtime-wrappers%shlibext,%linalg_test_lib_dir/libmlir_runner_utils%shlibext --entry-point-result=void | FileCheck %s
func @vecadd(%arg0 : memref<?xf32>, %arg1 : memref<?xf32>, %arg2 : memref<?xf32>) {
%cst = constant 1 : index
%cst2 = dim %arg0, 0 : memref<?xf32>
gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst, %grid_z = %cst)
threads(%tx, %ty, %tz) in (%block_x = %cst2, %block_y = %cst, %block_z = %cst) {
%a = load %arg0[%tx] : memref<?xf32>
%b = load %arg1[%tx] : memref<?xf32>
%c = addf %a, %b : f32
store %c, %arg2[%tx] : memref<?xf32>
gpu.terminator
}
return
}
// CHECK: [2.46, 2.46, 2.46, 2.46, 2.46]
func @main() {
%0 = alloc() : memref<5xf32>
%1 = alloc() : memref<5xf32>
%2 = alloc() : memref<5xf32>
%3 = memref_cast %0 : memref<5xf32> to memref<?xf32>
%4 = memref_cast %1 : memref<5xf32> to memref<?xf32>
%5 = memref_cast %2 : memref<5xf32> to memref<?xf32>
%6 = memref_cast %3 : memref<?xf32> to memref<*xf32>
%7 = memref_cast %4 : memref<?xf32> to memref<*xf32>
%8 = memref_cast %5 : memref<?xf32> to memref<*xf32>
call @mgpuMemHostRegisterFloat(%6) : (memref<*xf32>) -> ()
call @mgpuMemHostRegisterFloat(%7) : (memref<*xf32>) -> ()
call @mgpuMemHostRegisterFloat(%8) : (memref<*xf32>) -> ()
%9 = call @mgpuMemGetDeviceMemRef1dFloat(%3) : (memref<?xf32>) -> (memref<?xf32>)
%10 = call @mgpuMemGetDeviceMemRef1dFloat(%4) : (memref<?xf32>) -> (memref<?xf32>)
%11 = call @mgpuMemGetDeviceMemRef1dFloat(%5) : (memref<?xf32>) -> (memref<?xf32>)
call @vecadd(%9, %10, %11) : (memref<?xf32>, memref<?xf32>, memref<?xf32>) -> ()
call @print_memref_f32(%8) : (memref<*xf32>) -> ()
return
}
func @mgpuMemHostRegisterFloat(%ptr : memref<*xf32>)
func @mgpuMemGetDeviceMemRef1dFloat(%ptr : memref<?xf32>) -> (memref<?xf32>)
func @print_memref_f32(%ptr : memref<*xf32>)


@@ -2,6 +2,7 @@ add_subdirectory(mlir-cuda-runner)
add_subdirectory(mlir-cpu-runner)
add_subdirectory(mlir-linalg-ods-gen)
add_subdirectory(mlir-opt)
add_subdirectory(mlir-rocm-runner)
add_subdirectory(mlir-translate)
add_subdirectory(mlir-vulkan-runner)
add_subdirectory(mlir-shlib)


@@ -0,0 +1,113 @@
set(LLVM_OPTIONAL_SOURCES
rocm-runtime-wrappers.cpp
mlir-rocm-runner.cpp
)
if(MLIR_ROCM_RUNNER_ENABLED)
if (NOT ("AMDGPU" IN_LIST LLVM_TARGETS_TO_BUILD))
message(SEND_ERROR
"Building the mlir rocm runner requires the AMDGPU backend")
endif()
# Ensure lld is enabled.
if (NOT "lld" IN_LIST LLVM_ENABLE_PROJECTS)
message(SEND_ERROR "lld is not enabled. Please revise LLVM_ENABLE_PROJECTS")
endif()
# lld header files.
include_directories(${MLIR_SOURCE_DIR}/../lld/include)
# Configure ROCm support.
if (NOT DEFINED ROCM_PATH)
if (NOT DEFINED ENV{ROCM_PATH})
set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to which ROCm has been installed")
else()
set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to which ROCm has been installed")
endif()
set(HIP_PATH "${ROCM_PATH}/hip" CACHE PATH " Path to which HIP has been installed")
endif()
set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH})
find_package(HIP)
if (NOT HIP_FOUND)
message(SEND_ERROR "Build the mlir rocm runner requires a working ROCm and HIP install")
else()
message(STATUS "ROCm HIP version: ${HIP_VERSION}")
endif()
# Locate HIP runtime library.
find_library(ROCM_RUNTIME_LIBRARY hip_hcc
PATHS "${HIP_PATH}/lib")
if (NOT ROCM_RUNTIME_LIBRARY)
message(SEND_ERROR "Could not locate ROCm HIP runtime library")
else()
message(STATUS "ROCm HIP runtime lib: ${ROCM_RUNTIME_LIBRARY}")
endif()
# Set HIP compile-time flags.
add_definitions(-D__HIP_PLATFORM_HCC__)
add_llvm_library(rocm-runtime-wrappers SHARED
rocm-runtime-wrappers.cpp
)
target_include_directories(rocm-runtime-wrappers
PRIVATE
"${HIP_PATH}/../include"
"${HIP_PATH}/include"
LLVMSupport
)
target_link_libraries(rocm-runtime-wrappers
PUBLIC
LLVMSupport
${ROCM_RUNTIME_LIBRARY}
)
get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
get_property(conversion_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS)
set(LIBS
${dialect_libs}
${conversion_libs}
lldCommon
lldDriver
lldELF
LLVMCore
LLVMLTO
LLVMMC
LLVMMCParser
LLVMOption
LLVMSupport
MLIRJitRunner
MLIRAnalysis
MLIREDSC
MLIRExecutionEngine
MLIRIR
MLIRParser
MLIRROCDLIR
MLIRSupport
MLIRTargetLLVMIR
MLIRTargetROCDLIR
MLIRTransforms
MLIRTranslation
${ROCM_RUNTIME_LIBRARY}
)
# Manually expand the target library, since our MLIR libraries
# aren't plugged into the LLVM dependency tracking. If we don't
# do this then we can't insert the CodeGen library after ourselves
llvm_expand_pseudo_components(TARGET_LIBS AllTargetsCodeGens AllTargetsAsmParsers)
# Prepend LLVM in front of every target, this is how the library
# are named with CMake
SET(targets_to_link)
FOREACH(t ${TARGET_LIBS})
LIST(APPEND targets_to_link "LLVM${t}")
ENDFOREACH(t)
add_llvm_tool(mlir-rocm-runner
mlir-rocm-runner.cpp
DEPENDS
rocm-runtime-wrappers
)
llvm_update_compile_flags(mlir-rocm-runner)
target_link_libraries(mlir-rocm-runner PRIVATE ${LIBS} ${targets_to_link})
endif()


@@ -0,0 +1,248 @@
//===- mlir-rocm-runner.cpp - MLIR ROCM Execution Driver-------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is a command line utility that executes an MLIR file on the GPU by
// translating MLIR to ROCDL/LLVM IR before JIT-compiling and executing the
// latter.
//
//===----------------------------------------------------------------------===//
#include "llvm/ADT/STLExtras.h"
#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
#include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/GPU/Passes.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
#include "mlir/ExecutionEngine/JitRunner.h"
#include "mlir/ExecutionEngine/OptUtils.h"
#include "mlir/IR/Function.h"
#include "mlir/IR/Module.h"
#include "mlir/InitAllDialects.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Support/FileUtilities.h"
#include "mlir/Target/ROCDLIR.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/Passes.h"
#include "llvm/Support/FileUtilities.h"
#include "llvm/Support/InitLLVM.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
// MC headers.
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCInstPrinter.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCParser/AsmLexer.h"
#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCTargetOptionsCommandFlags.h"
// lld headers.
#include "lld/Common/Driver.h"
using namespace mlir;
using namespace llvm;
using Blob = SmallVector<char, 0>;
static cl::opt<std::string> tripleName("triple", cl::desc("target triple"),
cl::value_desc("triple string"),
cl::init("amdgcn-amd-amdhsa"));
// TODO(whchung): Add feature to automatically detect available AMD GCN ISA
// version via `rocm-agent-enumerator` utility.
static cl::opt<std::string> targetChip("target", cl::desc("target chip"),
cl::value_desc("AMDGPU ISA version"),
cl::init("gfx900"));
static cl::opt<std::string> features("feature", cl::desc("target features"),
cl::value_desc("AMDGPU target features"),
cl::init("-code-object-v3"));
static LogicalResult assembleIsa(const std::string isa, StringRef name,
Blob &result) {
raw_svector_ostream os(result);
std::string error;
Triple theTriple(Triple::normalize(tripleName));
const Target *theTarget =
TargetRegistry::lookupTarget(theTriple.normalize(), error);
if (!theTarget) {
WithColor::error(errs(), name) << error;
return failure();
}
SourceMgr srcMgr;
srcMgr.AddNewSourceBuffer(MemoryBuffer::getMemBuffer(isa), SMLoc());
const MCTargetOptions mcOptions;
std::unique_ptr<MCRegisterInfo> mri(theTarget->createMCRegInfo(tripleName));
std::unique_ptr<MCAsmInfo> mai(
theTarget->createMCAsmInfo(*mri, tripleName, mcOptions));
mai->setRelaxELFRelocations(true);
MCObjectFileInfo mofi;
MCContext ctx(mai.get(), mri.get(), &mofi, &srcMgr, &mcOptions);
mofi.InitMCObjectFileInfo(theTriple, false, ctx, false);
SmallString<128> cwd;
if (!sys::fs::current_path(cwd))
ctx.setCompilationDir(cwd);
std::unique_ptr<MCStreamer> mcStreamer;
std::unique_ptr<MCInstrInfo> mcii(theTarget->createMCInstrInfo());
std::unique_ptr<MCSubtargetInfo> sti(
theTarget->createMCSubtargetInfo(tripleName, targetChip, features));
MCCodeEmitter *ce = theTarget->createMCCodeEmitter(*mcii, *mri, ctx);
MCAsmBackend *mab = theTarget->createMCAsmBackend(*sti, *mri, mcOptions);
mcStreamer.reset(theTarget->createMCObjectStreamer(
theTriple, ctx, std::unique_ptr<MCAsmBackend>(mab),
mab->createObjectWriter(os), std::unique_ptr<MCCodeEmitter>(ce), *sti,
mcOptions.MCRelaxAll, mcOptions.MCIncrementalLinkerCompatible,
/*DWARFMustBeAtTheEnd*/ false));
mcStreamer->setUseAssemblerInfoForParsing(true);
std::unique_ptr<MCAsmParser> parser(
createMCAsmParser(srcMgr, ctx, *mcStreamer, *mai));
std::unique_ptr<MCTargetAsmParser> tap(
theTarget->createMCAsmParser(*sti, *parser, *mcii, mcOptions));
if (!tap) {
WithColor::error(errs(), name) << "assembler initialization error.\n";
return failure();
}
parser->setTargetParser(*tap);
parser->Run(false);
return success();
}
static LogicalResult createHsaco(const Blob &isaBlob, StringRef name,
Blob &hsacoBlob) {
// Save the ISA binary to a temp file.
int tempIsaBinaryFd = -1;
SmallString<128> tempIsaBinaryFilename;
std::error_code ec = sys::fs::createTemporaryFile(
"kernel", "o", tempIsaBinaryFd, tempIsaBinaryFilename);
if (ec) {
WithColor::error(errs(), name)
<< "temporary file for ISA binary creation error.\n";
return failure();
}
FileRemover cleanupIsaBinary(tempIsaBinaryFilename);
raw_fd_ostream tempIsaBinaryOs(tempIsaBinaryFd, true);
tempIsaBinaryOs << isaBlob;
tempIsaBinaryOs.close();
// Create a temp file for HSA code object.
int tempHsacoFD = -1;
SmallString<128> tempHsacoFilename;
ec = sys::fs::createTemporaryFile("kernel", "hsaco", tempHsacoFD,
tempHsacoFilename);
if (ec) {
WithColor::error(errs(), name)
<< "temporary file for HSA code object creation error.\n";
return failure();
}
FileRemover cleanupHsaco(tempHsacoFilename);
// Invoke lld. Expect a true return value from lld.
bool ret = lld::elf::link({"ld.lld", "-shared", tempIsaBinaryFilename.c_str(),
"-o", tempHsacoFilename.c_str()},
/*canEarlyExit=*/false, llvm::outs(), llvm::errs());
if (!ret) {
WithColor::error(errs(), name) << "lld invocation error.\n";
return failure();
}
// Load the HSA code object.
auto hsacoFile = mlir::openInputFile(tempHsacoFilename);
if (!hsacoFile) {
WithColor::error(errs(), name)
<< "read HSA code object from temp file error.\n";
return failure();
}
hsacoBlob.assign(hsacoFile->getBuffer().begin(),
hsacoFile->getBuffer().end());
return success();
}
static std::unique_ptr<llvm::Module> compileModuleToROCDLIR(Operation *m) {
auto llvmModule = translateModuleToROCDLIR(m);
// TODO(whchung): Link with ROCm-Device-Libs in case needed (ex: the Module
// depends on math functions).
return llvmModule;
}
static OwnedBlob compileISAToHsaco(const std::string isa, Location loc,
StringRef name) {
// ISA -> ISA in binary form via MC.
// Use lld to create HSA code object.
Blob isaBlob;
Blob hsacoBlob;
if (succeeded(assembleIsa(isa, name, isaBlob)) &&
succeeded(createHsaco(isaBlob, name, hsacoBlob)))
return std::make_unique<std::vector<char>>(hsacoBlob.begin(),
hsacoBlob.end());
WithColor::error(errs(), name) << "producing HSA code object error.\n";
return {};
}
static LogicalResult runMLIRPasses(ModuleOp m) {
PassManager pm(m.getContext());
applyPassManagerCLOptions(pm);
pm.addPass(createGpuKernelOutliningPass());
auto &kernelPm = pm.nest<gpu::GPUModuleOp>();
kernelPm.addPass(createStripDebugInfoPass());
kernelPm.addPass(createLowerGpuOpsToROCDLOpsPass());
kernelPm.addPass(createConvertGPUKernelToBlobPass(
compileModuleToROCDLIR, compileISAToHsaco, tripleName, targetChip,
features, /*gpuBinaryAnnotation=*/"rocdl.hsaco"));
pm.addPass(createLowerToLLVMPass());
pm.addPass(createConvertGpuLaunchFuncToGpuRuntimeCallsPass(
/*gpuBinaryAnnotation=*/"rocdl.hsaco"));
return pm.run(m);
}
int main(int argc, char **argv) {
registerPassManagerCLOptions();
mlir::registerAllDialects();
llvm::InitLLVM y(argc, argv);
llvm::InitializeAllTargetInfos();
llvm::InitializeAllTargetMCs();
llvm::InitializeAllAsmParsers();
// Initialize LLVM AMDGPU backend.
LLVMInitializeAMDGPUTarget();
LLVMInitializeAMDGPUTargetInfo();
LLVMInitializeAMDGPUTargetMC();
LLVMInitializeAMDGPUAsmPrinter();
mlir::initializeLLVMPasses();
return mlir::JitRunnerMain(argc, argv, &runMLIRPasses);
}


@@ -0,0 +1,143 @@
//===- rocm-runtime-wrappers.cpp - MLIR ROCM runner wrapper library -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Implements C wrappers around the ROCM library for easy linking in ORC jit.
// Also adds some debugging helpers that are helpful when writing MLIR code to
// run on GPUs.
//
//===----------------------------------------------------------------------===//
#include <cassert>
#include <numeric>
#include "mlir/ExecutionEngine/CRunnerUtils.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/raw_ostream.h"
#include "hip/hip_runtime.h"
namespace {
int32_t reportErrorIfAny(hipError_t result, const char *where) {
if (result != hipSuccess) {
llvm::errs() << "HIP failed with " << result << " in " << where << "\n";
}
return result;
}
} // anonymous namespace
extern "C" int32_t mgpuModuleLoad(void **module, void *data) {
int32_t err = reportErrorIfAny(
hipModuleLoadData(reinterpret_cast<hipModule_t *>(module), data),
"ModuleLoad");
return err;
}
extern "C" int32_t mgpuModuleGetFunction(void **function, void *module,
const char *name) {
return reportErrorIfAny(
hipModuleGetFunction(reinterpret_cast<hipFunction_t *>(function),
reinterpret_cast<hipModule_t>(module), name),
"GetFunction");
}
// The wrapper uses intptr_t instead of ROCM's unsigned int to match
// the type of MLIR's index type. This avoids the need for casts in the
// generated MLIR code.
extern "C" int32_t mgpuLaunchKernel(void *function, intptr_t gridX,
intptr_t gridY, intptr_t gridZ,
intptr_t blockX, intptr_t blockY,
intptr_t blockZ, int32_t smem, void *stream,
void **params, void **extra) {
return reportErrorIfAny(
hipModuleLaunchKernel(reinterpret_cast<hipFunction_t>(function), gridX,
gridY, gridZ, blockX, blockY, blockZ, smem,
reinterpret_cast<hipStream_t>(stream), params,
extra),
"LaunchKernel");
}
extern "C" void *mgpuGetStreamHelper() {
hipStream_t stream;
reportErrorIfAny(hipStreamCreate(&stream), "StreamCreate");
return stream;
}
extern "C" int32_t mgpuStreamSynchronize(void *stream) {
return reportErrorIfAny(
hipStreamSynchronize(reinterpret_cast<hipStream_t>(stream)),
"StreamSync");
}
/// Helper functions for writing mlir example code
// Allows to register byte array with the ROCM runtime. Helpful until we have
// transfer functions implemented.
extern "C" void mgpuMemHostRegister(void *ptr, uint64_t sizeBytes) {
reportErrorIfAny(hipHostRegister(ptr, sizeBytes, /*flags=*/0),
"MemHostRegister");
}
// Allows to register a MemRef with the ROCM runtime. Initializes array with
// value. Helpful until we have transfer functions implemented.
template <typename T>
void mgpuMemHostRegisterMemRef(T *pointer, llvm::ArrayRef<int64_t> sizes,
llvm::ArrayRef<int64_t> strides, T value) {
assert(sizes.size() == strides.size());
llvm::SmallVector<int64_t, 4> denseStrides(strides.size());
std::partial_sum(sizes.rbegin(), sizes.rend(), denseStrides.rbegin(),
std::multiplies<int64_t>());
auto count = denseStrides.front();
// Only densely packed tensors are currently supported.
std::rotate(denseStrides.begin(), denseStrides.begin() + 1,
denseStrides.end());
denseStrides.back() = 1;
assert(strides == llvm::makeArrayRef(denseStrides));
std::fill_n(pointer, count, value);
mgpuMemHostRegister(pointer, count * sizeof(T));
}
extern "C" void mgpuMemHostRegisterFloat(int64_t rank, void *ptr) {
auto *desc = static_cast<StridedMemRefType<float, 1> *>(ptr);
auto sizes = llvm::ArrayRef<int64_t>(desc->sizes, rank);
auto strides = llvm::ArrayRef<int64_t>(desc->sizes + rank, rank);
mgpuMemHostRegisterMemRef(desc->data + desc->offset, sizes, strides, 1.23f);
}
extern "C" void mgpuMemHostRegisterInt32(int64_t rank, void *ptr) {
auto *desc = static_cast<StridedMemRefType<int32_t, 1> *>(ptr);
auto sizes = llvm::ArrayRef<int64_t>(desc->sizes, rank);
auto strides = llvm::ArrayRef<int64_t>(desc->sizes + rank, rank);
mgpuMemHostRegisterMemRef(desc->data + desc->offset, sizes, strides, 123);
}
template <typename T>
void mgpuMemGetDevicePointer(T *hostPtr, T **devicePtr) {
reportErrorIfAny(hipSetDevice(0), "hipSetDevice");
reportErrorIfAny(
hipHostGetDevicePointer((void **)devicePtr, hostPtr, /*flags=*/0),
"hipHostGetDevicePointer");
}
extern "C" StridedMemRefType<float, 1>
mgpuMemGetDeviceMemRef1dFloat(float *allocated, float *aligned, int64_t offset,
int64_t size, int64_t stride) {
float *devicePtr = nullptr;
mgpuMemGetDevicePointer(aligned, &devicePtr);
return {devicePtr, devicePtr, offset, {size}, {stride}};
}
extern "C" StridedMemRefType<int32_t, 1>
mgpuMemGetDeviceMemRef1dInt32(int32_t *allocated, int32_t *aligned,
int64_t offset, int64_t size, int64_t stride) {
int32_t *devicePtr = nullptr;
mgpuMemGetDevicePointer(aligned, &devicePtr);
return {devicePtr, devicePtr, offset, {size}, {stride}};
}