llvm-project/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp

//===- cuda-runtime-wrappers.cpp - MLIR CUDA runner wrapper library -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Implements C wrappers around the CUDA library for easy linking in ORC jit.
// Also adds some debugging helpers that are helpful when writing MLIR code to
// run on GPUs.
//
//===----------------------------------------------------------------------===//

#include <cassert>
#include <numeric>

#include "mlir/ExecutionEngine/CRunnerUtils.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/raw_ostream.h"

#include "cuda.h"

// Evaluates `expr` (an expression yielding a CUresult) exactly once by passing
// it to an immediately-invoked lambda. A zero result is treated as success and
// nothing happens. For any other result the error name is looked up with
// cuGetErrorName (falling back to "<unknown>" if no name is returned) and a
// message of the form
//   '<source text of expr>' failed with '<error name>'
// is written to llvm::errs(). The error is only reported: the macro neither
// aborts nor propagates the result to the caller.
//
// Note that the source text of `expr` is stringified into the message, so
// changing the spelling of the argument changes the diagnostic output.
#define CUDA_REPORT_IF_ERROR(expr)                                             \
  [](CUresult result) {                                                        \
    if (!result)                                                               \
      return;                                                                  \
    const char *name = nullptr;                                                \
    cuGetErrorName(result, &name);                                             \
    if (!name)                                                                 \
      name = "<unknown>";                                                      \
    llvm::errs() << "'" << #expr << "' failed with '" << name << "'\n";        \
  }(expr)

// Loads a CUDA module from the in-memory image pointed to by `data` using
// cuModuleLoadData and returns the resulting module handle. Failures are
// reported to llvm::errs() by CUDA_REPORT_IF_ERROR and are not signalled to
// the caller; the handle is initialized to nullptr before the call, so that is
// presumably what is returned on failure (depends on cuModuleLoadData leaving
// the output untouched).
extern "C" CUmodule mgpuModuleLoad(void *data) {
  CUmodule module = nullptr;
  CUDA_REPORT_IF_ERROR(cuModuleLoadData(&module, data));
  return module;
}

// Looks up the function called `name` in `module` using cuModuleGetFunction
// and returns its handle. Failures are reported to llvm::errs() by
// CUDA_REPORT_IF_ERROR and are not signalled to the caller; the handle is
// initialized to nullptr before the call, so that is presumably what is
// returned when the lookup fails.
extern "C" CUfunction mgpuModuleGetFunction(CUmodule module, const char *name) {
  CUfunction function = nullptr;
  CUDA_REPORT_IF_ERROR(cuModuleGetFunction(&function, module, name));
  return function;
}

// Launches `function` on `stream` by forwarding all arguments to
// cuLaunchKernel. Failures are reported to llvm::errs() by
// CUDA_REPORT_IF_ERROR; nothing is returned to the caller.
//
// The wrapper uses intptr_t instead of CUDA's unsigned int to match
// the type of MLIR's index type. This avoids the need for casts in the
// generated MLIR code. The grid/block dimensions (and `smem`) are therefore
// implicitly converted at the call below; values outside the range of the
// CUDA parameter types are not checked here.
extern "C" void mgpuLaunchKernel(CUfunction function, intptr_t gridX,
                                 intptr_t gridY, intptr_t gridZ,
                                 intptr_t blockX, intptr_t blockY,
                                 intptr_t blockZ, int32_t smem, CUstream stream,
                                 void **params, void **extra) {
  CUDA_REPORT_IF_ERROR(cuLaunchKernel(function, gridX, gridY, gridZ, blockX,
                                      blockY, blockZ, smem, stream, params,
                                      extra));
}

// Creates a new CUDA stream with the CU_STREAM_NON_BLOCKING flag and returns
// its handle. Failures are reported to llvm::errs() by CUDA_REPORT_IF_ERROR
// and are not signalled to the caller; the handle is initialized to nullptr
// before the call, so that is presumably what is returned on failure.
extern "C" CUstream mgpuStreamCreate() {
  CUstream stream = nullptr;
  CUDA_REPORT_IF_ERROR(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING));
  return stream;
}

// Calls cuStreamSynchronize on `stream`. Failures are reported to llvm::errs()
// by CUDA_REPORT_IF_ERROR; nothing is returned to the caller.
extern "C" void mgpuStreamSynchronize(CUstream stream) {
  CUDA_REPORT_IF_ERROR(cuStreamSynchronize(stream));
}

/// Helper functions for writing MLIR example code.

// Registers the `sizeBytes` bytes of host memory starting at `ptr` with the
// CUDA runtime via cuMemHostRegister (flags = 0). Helpful until we have
// transfer functions implemented. Failures are reported to llvm::errs() by
// CUDA_REPORT_IF_ERROR; nothing is returned to the caller.
extern "C" void mgpuMemHostRegister(void *ptr, uint64_t sizeBytes) {
  CUDA_REPORT_IF_ERROR(cuMemHostRegister(ptr, sizeBytes, /*flags=*/0));
}

// Registers the memory backing a MemRef with the CUDA runtime and initializes
// every element with `value`. Helpful until we have transfer functions
// implemented.
//
// Only densely packed MemRefs (innermost dimension has stride 1, each outer
// stride is the product of all inner sizes) are supported. This is checked
// with an assert, so it is only enforced in assert-enabled builds. A rank-0
// MemRef is treated as holding exactly one element.
template <typename T>
void mgpuMemHostRegisterMemRef(const DynamicMemRefType<T> &memRef, T value) {
  llvm::ArrayRef<int64_t> sizes(memRef.sizes, memRef.rank);
  llvm::ArrayRef<int64_t> strides(memRef.strides, memRef.rank);

  // Walk the dimensions innermost-first. The dense stride of a dimension is
  // the number of elements spanned by all dimensions nested inside it, which
  // is exactly the running product before that dimension's own size is folded
  // in. Once the loop finishes, `count` is the total element count. For rank 0
  // the loop does not execute, leaving an empty stride list and count == 1,
  // so no element of an empty vector is ever accessed.
  llvm::SmallVector<int64_t, 4> denseStrides(memRef.rank);
  int64_t count = 1;
  for (int64_t dim = memRef.rank - 1; dim >= 0; --dim) {
    denseStrides[dim] = count;
    count *= sizes[dim];
  }

  // Only densely packed tensors are currently supported.
  assert(strides == llvm::makeArrayRef(denseStrides) &&
         "only densely packed MemRefs are supported");

  // Fill the payload first, then hand the same byte range to the runtime.
  auto *pointer = memRef.data + memRef.offset;
  std::fill_n(pointer, count, value);
  mgpuMemHostRegister(pointer, count * sizeof(T));
}

extern "C" void mgpuMemHostRegisterFloat(int64_t rank, void *ptr) {
  UnrankedMemRefType<float> memRef = {rank, ptr};
  mgpuMemHostRegisterMemRef(DynamicMemRefType<float>(memRef), 1.23f);
}

extern "C" void mgpuMemHostRegisterInt32(int64_t rank, void *ptr) {
  UnrankedMemRefType<int32_t> memRef = {rank, ptr};
  mgpuMemHostRegisterMemRef(DynamicMemRefType<int32_t>(memRef), 123);
}
Add an mlir-cuda-runner tool. This tool allows to execute MLIR IR snippets written in the GPU dialect on a CUDA capable GPU. For this to work, a working CUDA install is required and the build has to be configured with MLIR_CUDA_RUNNER_ENABLED set to 1. PiperOrigin-RevId: 256551415 2019-07-04 22:49:52 +08:00			`//===- cuda-runtime-wrappers.cpp - MLIR CUDA runner wrapper library -------===//`
			`//`
Mass update the MLIR license header to mention "Part of the LLVM project" This is an artifact from merging MLIR into LLVM, the file headers are now aligned with the rest of the project. 2020-01-26 11:58:30 +08:00			`// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.`
Adjust License.txt file to use the LLVM license PiperOrigin-RevId: 286906740 2019-12-24 01:35:36 +08:00			`// See https://llvm.org/LICENSE.txt for license information.`
			`// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception`
Add an mlir-cuda-runner tool. This tool allows to execute MLIR IR snippets written in the GPU dialect on a CUDA capable GPU. For this to work, a working CUDA install is required and the build has to be configured with MLIR_CUDA_RUNNER_ENABLED set to 1. PiperOrigin-RevId: 256551415 2019-07-04 22:49:52 +08:00			`//`
Adjust License.txt file to use the LLVM license PiperOrigin-RevId: 286906740 2019-12-24 01:35:36 +08:00			`//===----------------------------------------------------------------------===//`
Add an mlir-cuda-runner tool. This tool allows to execute MLIR IR snippets written in the GPU dialect on a CUDA capable GPU. For this to work, a working CUDA install is required and the build has to be configured with MLIR_CUDA_RUNNER_ENABLED set to 1. PiperOrigin-RevId: 256551415 2019-07-04 22:49:52 +08:00			`//`
			`// Implements C wrappers around the CUDA library for easy linking in ORC jit.`
			`// Also adds some debugging helpers that are helpful when writing MLIR code to`
			`// run on GPUs.`
			`//`
			`//===----------------------------------------------------------------------===//`

Make type and rank explicit in mcuMemHostRegister function. Fix registered size of indirect MemRefType kernel arguments. PiperOrigin-RevId: 281362940 2019-11-20 05:12:19 +08:00			`#include <cassert>`
			`#include <numeric>`
Add an mlir-cuda-runner tool. This tool allows to execute MLIR IR snippets written in the GPU dialect on a CUDA capable GPU. For this to work, a working CUDA install is required and the build has to be configured with MLIR_CUDA_RUNNER_ENABLED set to 1. PiperOrigin-RevId: 256551415 2019-07-04 22:49:52 +08:00
Unrank mcuMemHostRegister tensor argument. Reviewers: herhut Reviewed By: herhut Subscribers: mehdi_amini, rriddle, jpienaar, shauheen, antiagainst, nicolasvasilache, arpith-jacob, mgester, lucyrfox, aartbik, liufengdb, stephenneuendorffer, Joonsoo, grosul1, frgossen, Kayjukh, jurahul, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D80118 2020-05-18 20:13:14 +08:00			`#include "mlir/ExecutionEngine/CRunnerUtils.h"`
Fix all-reduce int tests by host-registering memrefs. Reduce amount of boiler plate to register host memory. Summary: Fix all-reduce int tests by host-registering memrefs. Reviewers: herhut Reviewed By: herhut Subscribers: clementval, mehdi_amini, rriddle, jpienaar, burmako, shauheen, antiagainst, nicolasvasilache, arpith-jacob, mgester, lucyrfox, liufengdb, Joonsoo, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D76563 2020-03-23 03:18:23 +08:00			`#include "llvm/ADT/ArrayRef.h"`
Add an mlir-cuda-runner tool. This tool allows to execute MLIR IR snippets written in the GPU dialect on a CUDA capable GPU. For this to work, a working CUDA install is required and the build has to be configured with MLIR_CUDA_RUNNER_ENABLED set to 1. PiperOrigin-RevId: 256551415 2019-07-04 22:49:52 +08:00			`#include "llvm/Support/raw_ostream.h"`

			`#include "cuda.h"`

Clean up cuda-runtime-wrappers API. Do not return error code, instead return created resource handles or void. Error reporting is done by the library function. Reviewed By: herhut Differential Revision: https://reviews.llvm.org/D84660 2020-07-28 22:29:29 +08:00			`#define CUDA_REPORT_IF_ERROR(expr) \`
			`[](CUresult result) { \`
			`if (!result) \`
			`return; \`
			`const char *name = nullptr; \`
			`cuGetErrorName(result, &name); \`
			`if (!name) \`
			`name = "<unknown>"; \`
			`llvm::errs() << "'" << #expr << "' failed with '" << name << "'\n"; \`
			`}(expr)`

			`extern "C" CUmodule mgpuModuleLoad(void *data) {`
			`CUmodule module = nullptr;`
			`CUDA_REPORT_IF_ERROR(cuModuleLoadData(&module, data));`
			`return module;`
Add an mlir-cuda-runner tool. This tool allows to execute MLIR IR snippets written in the GPU dialect on a CUDA capable GPU. For this to work, a working CUDA install is required and the build has to be configured with MLIR_CUDA_RUNNER_ENABLED set to 1. PiperOrigin-RevId: 256551415 2019-07-04 22:49:52 +08:00			`}`

Clean up cuda-runtime-wrappers API. Do not return error code, instead return created resource handles or void. Error reporting is done by the library function. Reviewed By: herhut Differential Revision: https://reviews.llvm.org/D84660 2020-07-28 22:29:29 +08:00			`extern "C" CUfunction mgpuModuleGetFunction(CUmodule module, const char *name) {`
			`CUfunction function = nullptr;`
			`CUDA_REPORT_IF_ERROR(cuModuleGetFunction(&function, module, name));`
			`return function;`
Add an mlir-cuda-runner tool. This tool allows to execute MLIR IR snippets written in the GPU dialect on a CUDA capable GPU. For this to work, a working CUDA install is required and the build has to be configured with MLIR_CUDA_RUNNER_ENABLED set to 1. PiperOrigin-RevId: 256551415 2019-07-04 22:49:52 +08:00			`}`

			`// The wrapper uses intptr_t instead of CUDA's unsigned int to match`
			`// the type of MLIR's index type. This avoids the need for casts in the`
			`// generated MLIR code.`
Clean up cuda-runtime-wrappers API. Do not return error code, instead return created resource handles or void. Error reporting is done by the library function. Reviewed By: herhut Differential Revision: https://reviews.llvm.org/D84660 2020-07-28 22:29:29 +08:00			`extern "C" void mgpuLaunchKernel(CUfunction function, intptr_t gridX,`
			`intptr_t gridY, intptr_t gridZ,`
			`intptr_t blockX, intptr_t blockY,`
			`intptr_t blockZ, int32_t smem, CUstream stream,`
			`void params, void extra) {`
			`CUDA_REPORT_IF_ERROR(cuLaunchKernel(function, gridX, gridY, gridZ, blockX,`
			`blockY, blockZ, smem, stream, params,`
			`extra));`
Add an mlir-cuda-runner tool. This tool allows to execute MLIR IR snippets written in the GPU dialect on a CUDA capable GPU. For this to work, a working CUDA install is required and the build has to be configured with MLIR_CUDA_RUNNER_ENABLED set to 1. PiperOrigin-RevId: 256551415 2019-07-04 22:49:52 +08:00			`}`

Clean up cuda-runtime-wrappers API. Do not return error code, instead return created resource handles or void. Error reporting is done by the library function. Reviewed By: herhut Differential Revision: https://reviews.llvm.org/D84660 2020-07-28 22:29:29 +08:00			`extern "C" CUstream mgpuStreamCreate() {`
			`CUstream stream = nullptr;`
			`CUDA_REPORT_IF_ERROR(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING));`
Add an mlir-cuda-runner tool. This tool allows to execute MLIR IR snippets written in the GPU dialect on a CUDA capable GPU. For this to work, a working CUDA install is required and the build has to be configured with MLIR_CUDA_RUNNER_ENABLED set to 1. PiperOrigin-RevId: 256551415 2019-07-04 22:49:52 +08:00			`return stream;`
			`}`

Clean up cuda-runtime-wrappers API. Do not return error code, instead return created resource handles or void. Error reporting is done by the library function. Reviewed By: herhut Differential Revision: https://reviews.llvm.org/D84660 2020-07-28 22:29:29 +08:00			`extern "C" void mgpuStreamSynchronize(CUstream stream) {`
			`CUDA_REPORT_IF_ERROR(cuStreamSynchronize(stream));`
Add an mlir-cuda-runner tool. This tool allows to execute MLIR IR snippets written in the GPU dialect on a CUDA capable GPU. For this to work, a working CUDA install is required and the build has to be configured with MLIR_CUDA_RUNNER_ENABLED set to 1. PiperOrigin-RevId: 256551415 2019-07-04 22:49:52 +08:00			`}`

			`/// Helper functions for writing mlir example code`

Make type and rank explicit in mcuMemHostRegister function. Fix registered size of indirect MemRefType kernel arguments. PiperOrigin-RevId: 281362940 2019-11-20 05:12:19 +08:00			`// Allows to register byte array with the CUDA runtime. Helpful until we have`
			`// transfer functions implemented.`
[mlir][gpu] Refactor ConvertGpuLaunchFuncToCudaCalls pass. Due to similar APIs between CUDA and ROCm (HIP), ConvertGpuLaunchFuncToCudaCalls pass could be used on both platforms with some refactoring. In this commit: - Migrate ConvertLaunchFuncToCudaCalls from GPUToCUDA to GPUCommon, and rename. - Rename runtime wrapper APIs be platform-neutral. - Let GPU binary annotation attribute be specifiable as a PassOption. - Naming changes within the implementation and tests. Subsequent patches would introduce ROCm-specific tests and runtime wrapper APIs. Differential Revision: https://reviews.llvm.org/D80167 2020-05-19 02:01:54 +08:00			`extern "C" void mgpuMemHostRegister(void *ptr, uint64_t sizeBytes) {`
Clean up cuda-runtime-wrappers API. Do not return error code, instead return created resource handles or void. Error reporting is done by the library function. Reviewed By: herhut Differential Revision: https://reviews.llvm.org/D84660 2020-07-28 22:29:29 +08:00			`CUDA_REPORT_IF_ERROR(cuMemHostRegister(ptr, sizeBytes, /flags=/0));`
Make type and rank explicit in mcuMemHostRegister function. Fix registered size of indirect MemRefType kernel arguments. PiperOrigin-RevId: 281362940 2019-11-20 05:12:19 +08:00			`}`

			`// Allows to register a MemRef with the CUDA runtime. Initializes array with`
			`// value. Helpful until we have transfer functions implemented.`
Fix all-reduce int tests by host-registering memrefs. Reduce amount of boiler plate to register host memory. Summary: Fix all-reduce int tests by host-registering memrefs. Reviewers: herhut Reviewed By: herhut Subscribers: clementval, mehdi_amini, rriddle, jpienaar, burmako, shauheen, antiagainst, nicolasvasilache, arpith-jacob, mgester, lucyrfox, liufengdb, Joonsoo, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D76563 2020-03-23 03:18:23 +08:00			`template <typename T>`
Clean up cuda-runtime-wrappers API. Do not return error code, instead return created resource handles or void. Error reporting is done by the library function. Reviewed By: herhut Differential Revision: https://reviews.llvm.org/D84660 2020-07-28 22:29:29 +08:00			`void mgpuMemHostRegisterMemRef(const DynamicMemRefType<T> &memRef, T value) {`
			`llvm::SmallVector<int64_t, 4> denseStrides(memRef.rank);`
			`llvm::ArrayRef<int64_t> sizes(memRef.sizes, memRef.rank);`
			`llvm::ArrayRef<int64_t> strides(memRef.strides, memRef.rank);`
Fix all-reduce int tests by host-registering memrefs. Reduce amount of boiler plate to register host memory. Summary: Fix all-reduce int tests by host-registering memrefs. Reviewers: herhut Reviewed By: herhut Subscribers: clementval, mehdi_amini, rriddle, jpienaar, burmako, shauheen, antiagainst, nicolasvasilache, arpith-jacob, mgester, lucyrfox, liufengdb, Joonsoo, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D76563 2020-03-23 03:18:23 +08:00
			`std::partial_sum(sizes.rbegin(), sizes.rend(), denseStrides.rbegin(),`
			`std::multiplies<int64_t>());`
			`auto count = denseStrides.front();`

			`// Only densely packed tensors are currently supported.`
			`std::rotate(denseStrides.begin(), denseStrides.begin() + 1,`
			`denseStrides.end());`
			`denseStrides.back() = 1;`
			`assert(strides == llvm::makeArrayRef(denseStrides));`

Clean up cuda-runtime-wrappers API. Do not return error code, instead return created resource handles or void. Error reporting is done by the library function. Reviewed By: herhut Differential Revision: https://reviews.llvm.org/D84660 2020-07-28 22:29:29 +08:00			`auto *pointer = memRef.data + memRef.offset;`
Fix all-reduce int tests by host-registering memrefs. Reduce amount of boiler plate to register host memory. Summary: Fix all-reduce int tests by host-registering memrefs. Reviewers: herhut Reviewed By: herhut Subscribers: clementval, mehdi_amini, rriddle, jpienaar, burmako, shauheen, antiagainst, nicolasvasilache, arpith-jacob, mgester, lucyrfox, liufengdb, Joonsoo, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D76563 2020-03-23 03:18:23 +08:00			`std::fill_n(pointer, count, value);`
[mlir][gpu] Refactor ConvertGpuLaunchFuncToCudaCalls pass. Due to similar APIs between CUDA and ROCm (HIP), ConvertGpuLaunchFuncToCudaCalls pass could be used on both platforms with some refactoring. In this commit: - Migrate ConvertLaunchFuncToCudaCalls from GPUToCUDA to GPUCommon, and rename. - Rename runtime wrapper APIs be platform-neutral. - Let GPU binary annotation attribute be specifiable as a PassOption. - Naming changes within the implementation and tests. Subsequent patches would introduce ROCm-specific tests and runtime wrapper APIs. Differential Revision: https://reviews.llvm.org/D80167 2020-05-19 02:01:54 +08:00			`mgpuMemHostRegister(pointer, count * sizeof(T));`
Promote MemRefDescriptor to a pointer to struct when passing function boundaries in LLVMLowering. The strided MemRef RFC discusses a normalized descriptor and interaction with library calls (https://groups.google.com/a/tensorflow.org/forum/#!topic/mlir/MaL8m2nXuio). Lowering of nested LLVM structs as value types does not play nicely with externally compiled C/C++ functions due to ABI issues. Solving the ABI problem generally is a very complex problem and most likely involves taking a dependence on clang that we do not want atm. A simple workaround is to pass pointers to memref descriptors at function boundaries, which this CL implement. PiperOrigin-RevId: 271591708 2019-09-28 00:55:38 +08:00			`}`
[mlir] use unpacked memref descriptors at function boundaries The existing (default) calling convention for memrefs in standard-to-LLVM conversion was motivated by interfacing with LLVM IR produced from C sources. In particular, it passes a pointer to the memref descriptor structure when calling the function. Therefore, the descriptor is allocated on stack before the call. This convention leads to several problems. PR44644 indicates a problem with stack exhaustion when calling functions with memref-typed arguments in a loop. Allocating outside of the loop may lead to concurrent access problems in case the loop is parallel. When targeting GPUs, the contents of the stack-allocated memory for the descriptor (passed by pointer) needs to be explicitly copied to the device. Using an aggregate type makes it impossible to attach pointer-specific argument attributes pertaining to alignment and aliasing in the LLVM dialect. Change the default calling convention for memrefs in standard-to-LLVM conversion to transform a memref into a list of arguments, each of primitive type, that are comprised in the memref descriptor. This avoids stack allocation for ranked memrefs (and thus stack exhaustion and potential concurrent access problems) and simplifies the device function invocation on GPUs. Provide an option in the standard-to-LLVM conversion to generate auxiliary wrapper function with the same interface as the previous calling convention, compatible with LLVM IR porduced from C sources. These auxiliary functions pack the individual values into a descriptor structure or unpack it. They also handle descriptor stack allocation if necessary, serving as an allocation scope: the memory reserved by `alloca` will be freed on exiting the auxiliary function. The effect of this change on MLIR-generated only LLVM IR is minimal. 
When interfacing MLIR-generated LLVM IR with C-generated LLVM IR, the integration only needs to require auxiliary functions and change the function name to call the wrapper function instead of the original function. This also opens the door to forwarding aliasing and alignment information from memrefs to LLVM IR pointers in the standrd-to-LLVM conversion. 2020-02-10 21:12:47 +08:00
[MLIR] NFC: Rename mcuMemHostRegister* to mgpuMemHostRegister* to make it consistent with the other cuda-runner functions and ROCm. Summary: Rename mcuMemHostRegister* to mgpuMemHostRegister*. Reviewers: herhut Reviewed By: herhut Subscribers: yaxunl, mehdi_amini, rriddle, jpienaar, shauheen, antiagainst, nicolasvasilache, arpith-jacob, mgester, lucyrfox, aartbik, liufengdb, stephenneuendorffer, Joonsoo, grosul1, Kayjukh, jurahul, msifontes Tags: #mlir Differential Revision: https://reviews.llvm.org/D84583 2020-07-25 20:56:35 +08:00			`extern "C" void mgpuMemHostRegisterFloat(int64_t rank, void *ptr) {`
Clean up cuda-runtime-wrappers API. Do not return error code, instead return created resource handles or void. Error reporting is done by the library function. Reviewed By: herhut Differential Revision: https://reviews.llvm.org/D84660 2020-07-28 22:29:29 +08:00			`UnrankedMemRefType<float> memRef = {rank, ptr};`
			`mgpuMemHostRegisterMemRef(DynamicMemRefType<float>(memRef), 1.23f);`
Fix all-reduce int tests by host-registering memrefs. Reduce amount of boiler plate to register host memory. Summary: Fix all-reduce int tests by host-registering memrefs. Reviewers: herhut Reviewed By: herhut Subscribers: clementval, mehdi_amini, rriddle, jpienaar, burmako, shauheen, antiagainst, nicolasvasilache, arpith-jacob, mgester, lucyrfox, liufengdb, Joonsoo, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D76563 2020-03-23 03:18:23 +08:00			`}`

[MLIR] NFC: Rename mcuMemHostRegister* to mgpuMemHostRegister* to make it consistent with the other cuda-runner functions and ROCm. Summary: Rename mcuMemHostRegister* to mgpuMemHostRegister*. Reviewers: herhut Reviewed By: herhut Subscribers: yaxunl, mehdi_amini, rriddle, jpienaar, shauheen, antiagainst, nicolasvasilache, arpith-jacob, mgester, lucyrfox, aartbik, liufengdb, stephenneuendorffer, Joonsoo, grosul1, Kayjukh, jurahul, msifontes Tags: #mlir Differential Revision: https://reviews.llvm.org/D84583 2020-07-25 20:56:35 +08:00			`extern "C" void mgpuMemHostRegisterInt32(int64_t rank, void *ptr) {`
Clean up cuda-runtime-wrappers API. Do not return error code, instead return created resource handles or void. Error reporting is done by the library function. Reviewed By: herhut Differential Revision: https://reviews.llvm.org/D84660 2020-07-28 22:29:29 +08:00			`UnrankedMemRefType<int32_t> memRef = {rank, ptr};`
			`mgpuMemHostRegisterMemRef(DynamicMemRefType<int32_t>(memRef), 123);`
Add an mlir-cuda-runner tool. This tool allows to execute MLIR IR snippets written in the GPU dialect on a CUDA capable GPU. For this to work, a working CUDA install is required and the build has to be configured with MLIR_CUDA_RUNNER_ENABLED set to 1. PiperOrigin-RevId: 256551415 2019-07-04 22:49:52 +08:00			`}`