[mlir][gpu] Introduce mlir-rocm-runner.
Summary: `mlir-rocm-runner` is introduced in this commit to execute GPU modules on the ROCm platform. A small wrapper that encapsulates ROCm's HIP runtime API is also part of the commit.

Due to the behavior of ROCm, raw pointers inside memrefs passed to `gpu.launch` must be modified on the host side to properly capture pointer values addressable on the GPU.

LLVM MC is used to assemble the AMD GCN ISA produced by `ConvertGPUKernelToBlobPass` into binary form, and LLD is used to produce a shared ELF object that can be loaded by the ROCm HIP runtime.

gfx900 is the default target right now, although it can be changed via an option in `mlir-rocm-runner`. Future revisions may use ROCm Agent Enumerator to detect the right target on the system. Note that AMDGPU Code Object V2 is used in this revision; future enhancements may upgrade to AMDGPU Code Object V3.

Bitcode libraries in ROCm-Device-Libs, which implement the math routines exposed in the `rocdl` dialect, are not yet linked; this is left as a TODO in the logic.

Reviewers: herhut

Subscribers: mgorny, tpr, dexonsmith, mehdi_amini, rriddle, jpienaar, shauheen, antiagainst, nicolasvasilache, csigg, arpith-jacob, mgester, lucyrfox, aartbik, liufengdb, stephenneuendorffer, Joonsoo, grosul1, frgossen, Kayjukh, jurahul, llvm-commits

Tags: #mlir, #llvm

Differential Revision: https://reviews.llvm.org/D80676
This commit is contained in: parent 164f4b9d26, commit 2fd6403a6d
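For readers unfamiliar with the host-side pointer handling described above, here is a minimal standalone sketch of the HIP calls that the `rocm-runtime-wrappers` library in this commit builds on (`hipHostRegister` followed by `hipHostGetDevicePointer`). It is illustrative only: the buffer size, flags, and error handling are placeholder choices and not part of the change.

#include <cstdio>
#include <vector>

#include "hip/hip_runtime.h"

int main() {
  // Plain host allocation, analogous to the memref backing store in the tests.
  std::vector<float> host(5, 1.23f);

  if (hipSetDevice(0) != hipSuccess)
    return 1;

  // Page-lock and register the host buffer so the GPU can access it.
  if (hipHostRegister(host.data(), host.size() * sizeof(float),
                      /*flags=*/0) != hipSuccess)
    return 1;

  // Query the device-addressable pointer. This is the value that must replace
  // the raw host pointer in the memref descriptor before gpu.launch, which is
  // what the mgpuMemGetDeviceMemRef1d* wrappers in this commit do.
  float *devicePtr = nullptr;
  if (hipHostGetDevicePointer(reinterpret_cast<void **>(&devicePtr),
                              host.data(), /*flags=*/0) != hipSuccess)
    return 1;

  std::printf("host %p -> device %p\n", static_cast<void *>(host.data()),
              static_cast<void *>(devicePtr));

  hipHostUnregister(host.data());
  return 0;
}

Built with hipcc, this prints the host pointer and its device-visible counterpart; the runner's wrappers perform the same registration and query, but on the pointers stored inside an MLIR memref descriptor.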
@@ -41,6 +41,7 @@ endif()
add_definitions(-DMLIR_ROCM_CONVERSIONS_ENABLED=${MLIR_ROCM_CONVERSIONS_ENABLED})

set(MLIR_CUDA_RUNNER_ENABLED 0 CACHE BOOL "Enable building the mlir CUDA runner")
set(MLIR_ROCM_RUNNER_ENABLED 0 CACHE BOOL "Enable building the mlir ROCm runner")
set(MLIR_VULKAN_RUNNER_ENABLED 0 CACHE BOOL "Enable building the mlir Vulkan runner")

option(MLIR_INCLUDE_TESTS
@@ -43,7 +43,8 @@ using LoweringCallback =
/// instead uses a small wrapper library that exports a stable and conveniently
/// typed ABI on top of GPU runtimes such as CUDA or ROCm (HIP).
std::unique_ptr<OperationPass<ModuleOp>>
createConvertGpuLaunchFuncToGpuRuntimeCallsPass();
createConvertGpuLaunchFuncToGpuRuntimeCallsPass(
    StringRef gpuBinaryAnnotation = "");

/// Creates a pass to convert kernel functions into GPU target object blobs.
///
@@ -123,6 +123,11 @@ private:
  void translateGpuLaunchCalls(mlir::gpu::LaunchFuncOp launchOp);

public:
  GpuLaunchFuncToGpuRuntimeCallsPass() = default;
  GpuLaunchFuncToGpuRuntimeCallsPass(StringRef gpuBinaryAnnotation) {
    this->gpuBinaryAnnotation = gpuBinaryAnnotation.str();
  }

  // Run the dialect converter on the module.
  void runOnOperation() override {
    // Cache the LLVMDialect for the current module.
@@ -457,6 +462,8 @@ void GpuLaunchFuncToGpuRuntimeCallsPass::translateGpuLaunchCalls(
}

std::unique_ptr<mlir::OperationPass<mlir::ModuleOp>>
mlir::createConvertGpuLaunchFuncToGpuRuntimeCallsPass() {
  return std::make_unique<GpuLaunchFuncToGpuRuntimeCallsPass>();
mlir::createConvertGpuLaunchFuncToGpuRuntimeCallsPass(
    StringRef gpuBinaryAnnotation) {
  return std::make_unique<GpuLaunchFuncToGpuRuntimeCallsPass>(
      gpuBinaryAnnotation);
}
@@ -13,8 +13,9 @@ set(MLIR_DIALECT_LINALG_INTEGRATION_TEST_LIB_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTOR
set(MLIR_RUNNER_UTILS_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTORY})

# Passed to lit.site.cfg.py.in to set up the path where to find the libraries
# for the mlir cuda runner tests.
# for the mlir cuda / rocm / vulkan runner tests.
set(MLIR_CUDA_WRAPPER_LIBRARY_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTORY})
set(MLIR_ROCM_WRAPPER_LIBRARY_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTORY})
set(MLIR_VULKAN_WRAPPER_LIBRARY_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTORY})

configure_lit_site_cfg(
@@ -64,6 +65,12 @@ if(MLIR_CUDA_RUNNER_ENABLED)
  )
endif()

if(MLIR_ROCM_RUNNER_ENABLED)
  list(APPEND MLIR_TEST_DEPENDS
    mlir-rocm-runner
  )
endif()

if(MLIR_VULKAN_RUNNER_ENABLED)
  list(APPEND MLIR_TEST_DEPENDS
    mlir-vulkan-runner
@@ -70,6 +70,7 @@ tools.extend([
    ToolSubst('%cuda_wrapper_library_dir', config.cuda_wrapper_library_dir, unresolved='ignore'),
    ToolSubst('%linalg_test_lib_dir', config.linalg_test_lib_dir, unresolved='ignore'),
    ToolSubst('%mlir_runner_utils_dir', config.mlir_runner_utils_dir, unresolved='ignore'),
    ToolSubst('%rocm_wrapper_library_dir', config.rocm_wrapper_library_dir, unresolved='ignore'),
    ToolSubst('%vulkan_wrapper_library_dir', config.vulkan_wrapper_library_dir, unresolved='ignore')
])
@@ -39,6 +39,8 @@ config.run_cuda_tests = @MLIR_CUDA_CONVERSIONS_ENABLED@
config.cuda_wrapper_library_dir = "@MLIR_CUDA_WRAPPER_LIBRARY_DIR@"
config.enable_cuda_runner = @MLIR_CUDA_RUNNER_ENABLED@
config.run_rocm_tests = @MLIR_ROCM_CONVERSIONS_ENABLED@
config.rocm_wrapper_library_dir = "@MLIR_ROCM_WRAPPER_LIBRARY_DIR@"
config.enable_rocm_runner = @MLIR_ROCM_RUNNER_ENABLED@
config.vulkan_wrapper_library_dir = "@MLIR_VULKAN_WRAPPER_LIBRARY_DIR@"
config.enable_vulkan_runner = @MLIR_VULKAN_RUNNER_ENABLED@
@@ -0,0 +1,32 @@
// RUN: mlir-rocm-runner %s --shared-libs=%rocm_wrapper_library_dir/librocm-runtime-wrappers%shlibext,%linalg_test_lib_dir/libmlir_runner_utils%shlibext --entry-point-result=void | FileCheck %s

func @other_func(%arg0 : f32, %arg1 : memref<?xf32>) {
  %cst = constant 1 : index
  %cst2 = dim %arg1, 0 : memref<?xf32>
  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst, %grid_z = %cst)
             threads(%tx, %ty, %tz) in (%block_x = %cst2, %block_y = %cst, %block_z = %cst) {
    store %arg0, %arg1[%tx] : memref<?xf32>
    gpu.terminator
  }
  return
}

// CHECK: [1, 1, 1, 1, 1]
func @main() {
  %arg0 = alloc() : memref<5xf32>
  %21 = constant 5 : i32
  %22 = memref_cast %arg0 : memref<5xf32> to memref<?xf32>
  %cast = memref_cast %22 : memref<?xf32> to memref<*xf32>
  call @mgpuMemHostRegisterFloat(%cast) : (memref<*xf32>) -> ()
  %23 = memref_cast %22 : memref<?xf32> to memref<*xf32>
  call @print_memref_f32(%23) : (memref<*xf32>) -> ()
  %24 = constant 1.0 : f32
  %25 = call @mgpuMemGetDeviceMemRef1dFloat(%22) : (memref<?xf32>) -> (memref<?xf32>)
  call @other_func(%24, %25) : (f32, memref<?xf32>) -> ()
  call @print_memref_f32(%23) : (memref<*xf32>) -> ()
  return
}

func @mgpuMemHostRegisterFloat(%ptr : memref<*xf32>)
func @mgpuMemGetDeviceMemRef1dFloat(%ptr : memref<?xf32>) -> (memref<?xf32>)
func @print_memref_f32(%ptr : memref<*xf32>)
@@ -0,0 +1,2 @@
if not config.enable_rocm_runner:
    config.unsupported = True
@@ -0,0 +1,30 @@
// RUN: mlir-rocm-runner %s --print-ir-after-all --shared-libs=%rocm_wrapper_library_dir/librocm-runtime-wrappers%shlibext,%linalg_test_lib_dir/libmlir_runner_utils%shlibext --entry-point-result=void | FileCheck %s --dump-input=always

// CHECK: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
func @main() {
  %arg = alloc() : memref<13xi32>
  %dst = memref_cast %arg : memref<13xi32> to memref<?xi32>
  %one = constant 1 : index
  %sx = dim %dst, 0 : memref<?xi32>
  %cast_dst = memref_cast %dst : memref<?xi32> to memref<*xi32>
  call @mgpuMemHostRegisterInt32(%cast_dst) : (memref<*xi32>) -> ()
  %dst_device = call @mgpuMemGetDeviceMemRef1dInt32(%dst) : (memref<?xi32>) -> (memref<?xi32>)
  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %one, %grid_y = %one, %grid_z = %one)
             threads(%tx, %ty, %tz) in (%block_x = %sx, %block_y = %one, %block_z = %one) {
    %t0 = index_cast %tx : index to i32
    store %t0, %dst_device[%tx] : memref<?xi32>
    gpu.terminator
  }
  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %one, %grid_y = %one, %grid_z = %one)
             threads(%tx, %ty, %tz) in (%block_x = %sx, %block_y = %one, %block_z = %one) {
    %t0 = index_cast %tx : index to i32
    store %t0, %dst_device[%tx] : memref<?xi32>
    gpu.terminator
  }
  call @print_memref_i32(%cast_dst) : (memref<*xi32>) -> ()
  return
}

func @mgpuMemHostRegisterInt32(%ptr : memref<*xi32>)
func @mgpuMemGetDeviceMemRef1dInt32(%ptr : memref<?xi32>) -> (memref<?xi32>)
func @print_memref_i32(%ptr : memref<*xi32>)
@@ -0,0 +1,42 @@
// RUN: mlir-rocm-runner %s --shared-libs=%rocm_wrapper_library_dir/librocm-runtime-wrappers%shlibext,%linalg_test_lib_dir/libmlir_runner_utils%shlibext --entry-point-result=void | FileCheck %s

func @vecadd(%arg0 : memref<?xf32>, %arg1 : memref<?xf32>, %arg2 : memref<?xf32>) {
  %cst = constant 1 : index
  %cst2 = dim %arg0, 0 : memref<?xf32>
  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst, %grid_z = %cst)
             threads(%tx, %ty, %tz) in (%block_x = %cst2, %block_y = %cst, %block_z = %cst) {
    %a = load %arg0[%tx] : memref<?xf32>
    %b = load %arg1[%tx] : memref<?xf32>
    %c = addf %a, %b : f32
    store %c, %arg2[%tx] : memref<?xf32>
    gpu.terminator
  }
  return
}

// CHECK: [2.46, 2.46, 2.46, 2.46, 2.46]
func @main() {
  %0 = alloc() : memref<5xf32>
  %1 = alloc() : memref<5xf32>
  %2 = alloc() : memref<5xf32>
  %3 = memref_cast %0 : memref<5xf32> to memref<?xf32>
  %4 = memref_cast %1 : memref<5xf32> to memref<?xf32>
  %5 = memref_cast %2 : memref<5xf32> to memref<?xf32>
  %6 = memref_cast %3 : memref<?xf32> to memref<*xf32>
  %7 = memref_cast %4 : memref<?xf32> to memref<*xf32>
  %8 = memref_cast %5 : memref<?xf32> to memref<*xf32>
  call @mgpuMemHostRegisterFloat(%6) : (memref<*xf32>) -> ()
  call @mgpuMemHostRegisterFloat(%7) : (memref<*xf32>) -> ()
  call @mgpuMemHostRegisterFloat(%8) : (memref<*xf32>) -> ()
  %9 = call @mgpuMemGetDeviceMemRef1dFloat(%3) : (memref<?xf32>) -> (memref<?xf32>)
  %10 = call @mgpuMemGetDeviceMemRef1dFloat(%4) : (memref<?xf32>) -> (memref<?xf32>)
  %11 = call @mgpuMemGetDeviceMemRef1dFloat(%5) : (memref<?xf32>) -> (memref<?xf32>)

  call @vecadd(%9, %10, %11) : (memref<?xf32>, memref<?xf32>, memref<?xf32>) -> ()
  call @print_memref_f32(%8) : (memref<*xf32>) -> ()
  return
}

func @mgpuMemHostRegisterFloat(%ptr : memref<*xf32>)
func @mgpuMemGetDeviceMemRef1dFloat(%ptr : memref<?xf32>) -> (memref<?xf32>)
func @print_memref_f32(%ptr : memref<*xf32>)
@@ -2,6 +2,7 @@ add_subdirectory(mlir-cuda-runner)
add_subdirectory(mlir-cpu-runner)
add_subdirectory(mlir-linalg-ods-gen)
add_subdirectory(mlir-opt)
add_subdirectory(mlir-rocm-runner)
add_subdirectory(mlir-translate)
add_subdirectory(mlir-vulkan-runner)
add_subdirectory(mlir-shlib)
@@ -0,0 +1,113 @@
set(LLVM_OPTIONAL_SOURCES
  rocm-runtime-wrappers.cpp
  mlir-rocm-runner.cpp
  )

if(MLIR_ROCM_RUNNER_ENABLED)
  if (NOT ("AMDGPU" IN_LIST LLVM_TARGETS_TO_BUILD))
    message(SEND_ERROR
      "Building the mlir rocm runner requires the AMDGPU backend")
  endif()

  # Ensure lld is enabled.
  if (NOT "lld" IN_LIST LLVM_ENABLE_PROJECTS)
    message(SEND_ERROR "lld is not enabled. Please revise LLVM_ENABLE_PROJECTS")
  endif()

  # lld header files.
  include_directories(${MLIR_SOURCE_DIR}/../lld/include)

  # Configure ROCm support.
  if (NOT DEFINED ROCM_PATH)
    if (NOT DEFINED ENV{ROCM_PATH})
      set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to which ROCm has been installed")
    else()
      set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to which ROCm has been installed")
    endif()
    set(HIP_PATH "${ROCM_PATH}/hip" CACHE PATH "Path to which HIP has been installed")
  endif()
  set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH})
  find_package(HIP)
  if (NOT HIP_FOUND)
    message(SEND_ERROR "Building the mlir rocm runner requires a working ROCm and HIP install")
  else()
    message(STATUS "ROCm HIP version: ${HIP_VERSION}")
  endif()

  # Locate HIP runtime library.
  find_library(ROCM_RUNTIME_LIBRARY hip_hcc
    PATHS "${HIP_PATH}/lib")
  if (NOT ROCM_RUNTIME_LIBRARY)
    message(SEND_ERROR "Could not locate ROCm HIP runtime library")
  else()
    message(STATUS "ROCm HIP runtime lib: ${ROCM_RUNTIME_LIBRARY}")
  endif()

  # Set HIP compile-time flags.
  add_definitions(-D__HIP_PLATFORM_HCC__)

  add_llvm_library(rocm-runtime-wrappers SHARED
    rocm-runtime-wrappers.cpp
  )
  target_include_directories(rocm-runtime-wrappers
    PRIVATE
    "${HIP_PATH}/../include"
    "${HIP_PATH}/include"
    LLVMSupport
  )
  target_link_libraries(rocm-runtime-wrappers
    PUBLIC
    LLVMSupport
    ${ROCM_RUNTIME_LIBRARY}
  )

  get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
  get_property(conversion_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS)
  set(LIBS
    ${dialect_libs}
    ${conversion_libs}
    lldCommon
    lldDriver
    lldELF
    LLVMCore
    LLVMLTO
    LLVMMC
    LLVMMCParser
    LLVMOption
    LLVMSupport
    MLIRJitRunner
    MLIRAnalysis
    MLIREDSC
    MLIRExecutionEngine
    MLIRIR
    MLIRParser
    MLIRROCDLIR
    MLIRSupport
    MLIRTargetLLVMIR
    MLIRTargetROCDLIR
    MLIRTransforms
    MLIRTranslation
    ${ROCM_RUNTIME_LIBRARY}
  )

  # Manually expand the target library, since our MLIR libraries
  # aren't plugged into the LLVM dependency tracking. If we don't
  # do this then we can't insert the CodeGen library after ourselves
  llvm_expand_pseudo_components(TARGET_LIBS AllTargetsCodeGens AllTargetsAsmParsers)
  # Prepend LLVM in front of every target, this is how the library
  # are named with CMake
  SET(targets_to_link)
  FOREACH(t ${TARGET_LIBS})
    LIST(APPEND targets_to_link "LLVM${t}")
  ENDFOREACH(t)

  add_llvm_tool(mlir-rocm-runner
    mlir-rocm-runner.cpp

    DEPENDS
    rocm-runtime-wrappers
  )
  llvm_update_compile_flags(mlir-rocm-runner)
  target_link_libraries(mlir-rocm-runner PRIVATE ${LIBS} ${targets_to_link})

endif()
@@ -0,0 +1,248 @@
//===- mlir-rocm-runner.cpp - MLIR ROCM Execution Driver ------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is a command line utility that executes an MLIR file on the GPU by
// translating MLIR to ROCDL/LLVM IR before JIT-compiling and executing the
// latter.
//
//===----------------------------------------------------------------------===//

#include "llvm/ADT/STLExtras.h"

#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
#include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/GPU/Passes.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
#include "mlir/ExecutionEngine/JitRunner.h"
#include "mlir/ExecutionEngine/OptUtils.h"
#include "mlir/IR/Function.h"
#include "mlir/IR/Module.h"
#include "mlir/InitAllDialects.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Support/FileUtilities.h"
#include "mlir/Target/ROCDLIR.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/Passes.h"
#include "llvm/Support/FileUtilities.h"
#include "llvm/Support/InitLLVM.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"

// MC headers.
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCInstPrinter.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCParser/AsmLexer.h"
#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCTargetOptionsCommandFlags.h"

// lld headers.
#include "lld/Common/Driver.h"

using namespace mlir;
using namespace llvm;

using Blob = SmallVector<char, 0>;

static cl::opt<std::string> tripleName("triple", cl::desc("target triple"),
                                       cl::value_desc("triple string"),
                                       cl::init("amdgcn-amd-amdhsa"));

// TODO(whchung): Add feature to automatically detect available AMD GCN ISA
// version via `rocm-agent-enumerator` utility.
static cl::opt<std::string> targetChip("target", cl::desc("target chip"),
                                       cl::value_desc("AMDGPU ISA version"),
                                       cl::init("gfx900"));

static cl::opt<std::string> features("feature", cl::desc("target features"),
                                     cl::value_desc("AMDGPU target features"),
                                     cl::init("-code-object-v3"));

static LogicalResult assembleIsa(const std::string isa, StringRef name,
                                 Blob &result) {
  raw_svector_ostream os(result);

  std::string error;
  Triple theTriple(Triple::normalize(tripleName));
  const Target *theTarget =
      TargetRegistry::lookupTarget(theTriple.normalize(), error);
  if (!theTarget) {
    WithColor::error(errs(), name) << error;
    return failure();
  }

  SourceMgr srcMgr;
  srcMgr.AddNewSourceBuffer(MemoryBuffer::getMemBuffer(isa), SMLoc());

  const MCTargetOptions mcOptions;
  std::unique_ptr<MCRegisterInfo> mri(theTarget->createMCRegInfo(tripleName));
  std::unique_ptr<MCAsmInfo> mai(
      theTarget->createMCAsmInfo(*mri, tripleName, mcOptions));
  mai->setRelaxELFRelocations(true);

  MCObjectFileInfo mofi;
  MCContext ctx(mai.get(), mri.get(), &mofi, &srcMgr, &mcOptions);
  mofi.InitMCObjectFileInfo(theTriple, false, ctx, false);

  SmallString<128> cwd;
  if (!sys::fs::current_path(cwd))
    ctx.setCompilationDir(cwd);

  std::unique_ptr<MCStreamer> mcStreamer;
  std::unique_ptr<MCInstrInfo> mcii(theTarget->createMCInstrInfo());
  std::unique_ptr<MCSubtargetInfo> sti(
      theTarget->createMCSubtargetInfo(tripleName, targetChip, features));

  MCCodeEmitter *ce = theTarget->createMCCodeEmitter(*mcii, *mri, ctx);
  MCAsmBackend *mab = theTarget->createMCAsmBackend(*sti, *mri, mcOptions);
  mcStreamer.reset(theTarget->createMCObjectStreamer(
      theTriple, ctx, std::unique_ptr<MCAsmBackend>(mab),
      mab->createObjectWriter(os), std::unique_ptr<MCCodeEmitter>(ce), *sti,
      mcOptions.MCRelaxAll, mcOptions.MCIncrementalLinkerCompatible,
      /*DWARFMustBeAtTheEnd*/ false));
  mcStreamer->setUseAssemblerInfoForParsing(true);

  std::unique_ptr<MCAsmParser> parser(
      createMCAsmParser(srcMgr, ctx, *mcStreamer, *mai));
  std::unique_ptr<MCTargetAsmParser> tap(
      theTarget->createMCAsmParser(*sti, *parser, *mcii, mcOptions));

  if (!tap) {
    WithColor::error(errs(), name) << "assembler initialization error.\n";
    return failure();
  }

  parser->setTargetParser(*tap);
  parser->Run(false);

  return success();
}

static LogicalResult createHsaco(const Blob &isaBlob, StringRef name,
                                 Blob &hsacoBlob) {
  // Save the ISA binary to a temp file.
  int tempIsaBinaryFd = -1;
  SmallString<128> tempIsaBinaryFilename;
  std::error_code ec = sys::fs::createTemporaryFile(
      "kernel", "o", tempIsaBinaryFd, tempIsaBinaryFilename);
  if (ec) {
    WithColor::error(errs(), name)
        << "temporary file for ISA binary creation error.\n";
    return failure();
  }
  FileRemover cleanupIsaBinary(tempIsaBinaryFilename);
  raw_fd_ostream tempIsaBinaryOs(tempIsaBinaryFd, true);
  tempIsaBinaryOs << isaBlob;
  tempIsaBinaryOs.close();

  // Create a temp file for HSA code object.
  int tempHsacoFD = -1;
  SmallString<128> tempHsacoFilename;
  ec = sys::fs::createTemporaryFile("kernel", "hsaco", tempHsacoFD,
                                    tempHsacoFilename);
  if (ec) {
    WithColor::error(errs(), name)
        << "temporary file for HSA code object creation error.\n";
    return failure();
  }
  FileRemover cleanupHsaco(tempHsacoFilename);

  // Invoke lld. Expect a true return value from lld.
  bool ret = lld::elf::link({"ld.lld", "-shared", tempIsaBinaryFilename.c_str(),
                             "-o", tempHsacoFilename.c_str()},
                            /*canEarlyExit=*/false, llvm::outs(), llvm::errs());
  if (!ret) {
    WithColor::error(errs(), name) << "lld invocation error.\n";
    return failure();
  }

  // Load the HSA code object.
  auto hsacoFile = mlir::openInputFile(tempHsacoFilename);
  if (!hsacoFile) {
    WithColor::error(errs(), name)
        << "read HSA code object from temp file error.\n";
    return failure();
  }
  hsacoBlob.assign(hsacoFile->getBuffer().begin(),
                   hsacoFile->getBuffer().end());

  return success();
}

static std::unique_ptr<llvm::Module> compileModuleToROCDLIR(Operation *m) {
  auto llvmModule = translateModuleToROCDLIR(m);
  // TODO(whchung): Link with ROCm-Device-Libs in case needed (ex: the Module
  // depends on math functions).
  return llvmModule;
}

static OwnedBlob compileISAToHsaco(const std::string isa, Location loc,
                                   StringRef name) {
  // ISA -> ISA in binary form via MC.
  // Use lld to create HSA code object.
  Blob isaBlob;
  Blob hsacoBlob;

  if (succeeded(assembleIsa(isa, name, isaBlob)) &&
      succeeded(createHsaco(isaBlob, name, hsacoBlob)))
    return std::make_unique<std::vector<char>>(hsacoBlob.begin(),
                                               hsacoBlob.end());

  WithColor::error(errs(), name) << "producing HSA code object error.\n";
  return {};
}

static LogicalResult runMLIRPasses(ModuleOp m) {
  PassManager pm(m.getContext());
  applyPassManagerCLOptions(pm);

  pm.addPass(createGpuKernelOutliningPass());
  auto &kernelPm = pm.nest<gpu::GPUModuleOp>();
  kernelPm.addPass(createStripDebugInfoPass());
  kernelPm.addPass(createLowerGpuOpsToROCDLOpsPass());
  kernelPm.addPass(createConvertGPUKernelToBlobPass(
      compileModuleToROCDLIR, compileISAToHsaco, tripleName, targetChip,
      features, /*gpuBinaryAnnotation=*/"rocdl.hsaco"));
  pm.addPass(createLowerToLLVMPass());
  pm.addPass(createConvertGpuLaunchFuncToGpuRuntimeCallsPass(
      /*gpuBinaryAnnotation=*/"rocdl.hsaco"));

  return pm.run(m);
}

int main(int argc, char **argv) {
  registerPassManagerCLOptions();
  mlir::registerAllDialects();
  llvm::InitLLVM y(argc, argv);
  llvm::InitializeAllTargetInfos();
  llvm::InitializeAllTargetMCs();
  llvm::InitializeAllAsmParsers();

  // Initialize LLVM AMDGPU backend.
  LLVMInitializeAMDGPUTarget();
  LLVMInitializeAMDGPUTargetInfo();
  LLVMInitializeAMDGPUTargetMC();
  LLVMInitializeAMDGPUAsmPrinter();

  mlir::initializeLLVMPasses();
  return mlir::JitRunnerMain(argc, argv, &runMLIRPasses);
}
@@ -0,0 +1,143 @@
//===- rocm-runtime-wrappers.cpp - MLIR ROCM runner wrapper library -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Implements C wrappers around the ROCM library for easy linking in ORC jit.
// Also adds some debugging helpers that are helpful when writing MLIR code to
// run on GPUs.
//
//===----------------------------------------------------------------------===//

#include <cassert>
#include <numeric>

#include "mlir/ExecutionEngine/CRunnerUtils.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/raw_ostream.h"

#include "hip/hip_runtime.h"

namespace {
int32_t reportErrorIfAny(hipError_t result, const char *where) {
  if (result != hipSuccess) {
    llvm::errs() << "HIP failed with " << result << " in " << where << "\n";
  }
  return result;
}
} // anonymous namespace

extern "C" int32_t mgpuModuleLoad(void **module, void *data) {
  int32_t err = reportErrorIfAny(
      hipModuleLoadData(reinterpret_cast<hipModule_t *>(module), data),
      "ModuleLoad");
  return err;
}

extern "C" int32_t mgpuModuleGetFunction(void **function, void *module,
                                         const char *name) {
  return reportErrorIfAny(
      hipModuleGetFunction(reinterpret_cast<hipFunction_t *>(function),
                           reinterpret_cast<hipModule_t>(module), name),
      "GetFunction");
}

// The wrapper uses intptr_t instead of ROCM's unsigned int to match
// the type of MLIR's index type. This avoids the need for casts in the
// generated MLIR code.
extern "C" int32_t mgpuLaunchKernel(void *function, intptr_t gridX,
                                    intptr_t gridY, intptr_t gridZ,
                                    intptr_t blockX, intptr_t blockY,
                                    intptr_t blockZ, int32_t smem, void *stream,
                                    void **params, void **extra) {
  return reportErrorIfAny(
      hipModuleLaunchKernel(reinterpret_cast<hipFunction_t>(function), gridX,
                            gridY, gridZ, blockX, blockY, blockZ, smem,
                            reinterpret_cast<hipStream_t>(stream), params,
                            extra),
      "LaunchKernel");
}

extern "C" void *mgpuGetStreamHelper() {
  hipStream_t stream;
  reportErrorIfAny(hipStreamCreate(&stream), "StreamCreate");
  return stream;
}

extern "C" int32_t mgpuStreamSynchronize(void *stream) {
  return reportErrorIfAny(
      hipStreamSynchronize(reinterpret_cast<hipStream_t>(stream)),
      "StreamSync");
}

/// Helper functions for writing mlir example code

// Allows to register byte array with the ROCM runtime. Helpful until we have
// transfer functions implemented.
extern "C" void mgpuMemHostRegister(void *ptr, uint64_t sizeBytes) {
  reportErrorIfAny(hipHostRegister(ptr, sizeBytes, /*flags=*/0),
                   "MemHostRegister");
}

// Allows to register a MemRef with the ROCM runtime. Initializes array with
// value. Helpful until we have transfer functions implemented.
template <typename T>
void mgpuMemHostRegisterMemRef(T *pointer, llvm::ArrayRef<int64_t> sizes,
                               llvm::ArrayRef<int64_t> strides, T value) {
  assert(sizes.size() == strides.size());
  llvm::SmallVector<int64_t, 4> denseStrides(strides.size());

  std::partial_sum(sizes.rbegin(), sizes.rend(), denseStrides.rbegin(),
                   std::multiplies<int64_t>());
  auto count = denseStrides.front();

  // Only densely packed tensors are currently supported.
  std::rotate(denseStrides.begin(), denseStrides.begin() + 1,
              denseStrides.end());
  denseStrides.back() = 1;
  assert(strides == llvm::makeArrayRef(denseStrides));

  std::fill_n(pointer, count, value);
  mgpuMemHostRegister(pointer, count * sizeof(T));
}

extern "C" void mgpuMemHostRegisterFloat(int64_t rank, void *ptr) {
  auto *desc = static_cast<StridedMemRefType<float, 1> *>(ptr);
  auto sizes = llvm::ArrayRef<int64_t>(desc->sizes, rank);
  auto strides = llvm::ArrayRef<int64_t>(desc->sizes + rank, rank);
  mgpuMemHostRegisterMemRef(desc->data + desc->offset, sizes, strides, 1.23f);
}

extern "C" void mgpuMemHostRegisterInt32(int64_t rank, void *ptr) {
  auto *desc = static_cast<StridedMemRefType<int32_t, 1> *>(ptr);
  auto sizes = llvm::ArrayRef<int64_t>(desc->sizes, rank);
  auto strides = llvm::ArrayRef<int64_t>(desc->sizes + rank, rank);
  mgpuMemHostRegisterMemRef(desc->data + desc->offset, sizes, strides, 123);
}

template <typename T>
void mgpuMemGetDevicePointer(T *hostPtr, T **devicePtr) {
  reportErrorIfAny(hipSetDevice(0), "hipSetDevice");
  reportErrorIfAny(
      hipHostGetDevicePointer((void **)devicePtr, hostPtr, /*flags=*/0),
      "hipHostGetDevicePointer");
}

extern "C" StridedMemRefType<float, 1>
mgpuMemGetDeviceMemRef1dFloat(float *allocated, float *aligned, int64_t offset,
                              int64_t size, int64_t stride) {
  float *devicePtr = nullptr;
  mgpuMemGetDevicePointer(aligned, &devicePtr);
  return {devicePtr, devicePtr, offset, {size}, {stride}};
}

extern "C" StridedMemRefType<int32_t, 1>
mgpuMemGetDeviceMemRef1dInt32(int32_t *allocated, int32_t *aligned,
                              int64_t offset, int64_t size, int64_t stride) {
  int32_t *devicePtr = nullptr;
  mgpuMemGetDevicePointer(aligned, &devicePtr);
  return {devicePtr, devicePtr, offset, {size}, {stride}};
}