[mlir] Add base class for GpuKernelToBlobPass

Instead of configuring kernel-to-cubin/rocdl lowering through callbacks, introduce a base class that target-specific passes can derive from. Put the base class in GPU/Transforms, according to the discussion in D98203.

The mlir-cuda-runner will go away shortly, and the mlir-rocdl-runner as well at some point. I therefore kept the existing code path working and will remove it in a separate step.

Depends On D98168

Reviewed By: herhut

Differential Revision: https://reviews.llvm.org/D98279
2021-03-10 10:35:20 +01:00 · 2021-03-10 10:35:20 +01:00 · 4d295cf5b5
parent 8d9b9c0edc
commit 4d295cf5b5
7 changed files with 173 additions and 116 deletions
--- a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
+++ b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
@ -9,9 +9,14 @@
 #define MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_

 #include "mlir/Support/LLVM.h"
-#include "llvm/IR/Module.h"
+#include "llvm/ADT/StringRef.h"
 #include <vector>

+namespace llvm {
+class LLVMContext;
+class Module;
+} // namespace llvm
+
 namespace mlir {

 class LLVMTypeConverter;
@ -26,9 +31,6 @@ class OperationPass;

 namespace gpu {
 class GPUModuleOp;
-
-/// Returns the default annotation name for GPU binary blobs.
-std::string getDefaultGpuBinaryAnnotation();
 } // namespace gpu

 namespace LLVM {
--- a/mlir/include/mlir/Dialect/GPU/Passes.h
+++ b/mlir/include/mlir/Dialect/GPU/Passes.h
@ -13,8 +13,15 @@
 #ifndef MLIR_DIALECT_GPU_PASSES_H_
 #define MLIR_DIALECT_GPU_PASSES_H_

+#include "mlir/Dialect/GPU/GPUDialect.h"
 #include "mlir/Pass/Pass.h"

+namespace llvm {
+class TargetMachine;
+class LLVMContext;
+class Module;
+} // namespace llvm
+
 namespace mlir {
 /// Replaces `gpu.launch` with `gpu.launch_func` by moving the region into
 /// a separate kernel function.
@ -33,6 +40,45 @@ inline void populateGpuRewritePatterns(MLIRContext *context,
  populateGpuAllReducePatterns(context, patterns);
 }

+namespace gpu {
+/// Returns the default annotation name for GPU binary blobs.
+std::string getDefaultGpuBinaryAnnotation();
+
+/// Base pass class to serialize kernel functions through LLVM into
+/// user-specified IR and add the resulting blob as module attribute.
+///
+/// Derived passes supply the translation of the GPU module to LLVM IR and the
+/// serialization of the generated target ISA by overriding the two pure
+/// virtual hooks below. The code generation target is configured through the
+/// 'triple', 'chip' and 'features' pass options.
+class SerializeToBlobPass : public OperationPass<gpu::GPUModuleOp> {
+public:
+  /// Creates the pass; `passID` is forwarded to the OperationPass base class.
+  /// Marked explicit so that a TypeID never converts to a pass implicitly.
+  explicit SerializeToBlobPass(TypeID passID);
+  /// Copies the pass together with the values of all of its options.
+  SerializeToBlobPass(const SerializeToBlobPass &other);
+
+  /// Translates the module to LLVM IR, lowers it to the target ISA, lets the
+  /// derived class serialize the ISA and attaches the blob to the module.
+  void runOnOperation() final;
+
+private:
+  // Creates the LLVM target machine to generate the ISA.
+  std::unique_ptr<llvm::TargetMachine> createTargetMachine();
+
+  // Translates the 'getOperation()' result to an LLVM module.
+  virtual std::unique_ptr<llvm::Module>
+  translateToLLVMIR(llvm::LLVMContext &llvmContext) = 0;
+
+  // Serializes the target ISA to binary form.
+  virtual std::unique_ptr<std::vector<char>>
+  serializeISA(const std::string &isa) = 0;
+
+protected:
+  // Pass options; derived passes may set them from their constructors.
+  Option<std::string> triple{*this, "triple",
+                             ::llvm::cl::desc("Target triple")};
+  Option<std::string> chip{*this, "chip",
+                           ::llvm::cl::desc("Target architecture")};
+  Option<std::string> features{*this, "features",
+                               ::llvm::cl::desc("Target features")};
+  Option<std::string> gpuBinaryAnnotation{
+      *this, "gpu-binary-annotation",
+      ::llvm::cl::desc("Annotation attribute string for GPU binary"),
+      ::llvm::cl::init(getDefaultGpuBinaryAnnotation())};
+};
+} // namespace gpu
+
 //===----------------------------------------------------------------------===//
 // Registration
 //===----------------------------------------------------------------------===//
--- a/mlir/lib/Conversion/GPUCommon/CMakeLists.txt
+++ b/mlir/lib/Conversion/GPUCommon/CMakeLists.txt
@ -24,8 +24,6 @@ add_mlir_conversion_library(MLIRGPUToGPURuntimeTransforms
  intrinsics_gen

  LINK_COMPONENTS
-  Core
-  MC
  ${AMDGPU_LIBS}
  ${NVPTX_LIBS}

--- a/mlir/lib/Conversion/GPUCommon/ConvertKernelFuncToBlob.cpp
+++ b/mlir/lib/Conversion/GPUCommon/ConvertKernelFuncToBlob.cpp
@ -15,6 +15,7 @@
 #include "mlir/Conversion/GPUCommon/GPUCommonPass.h"

 #include "mlir/Dialect/GPU/GPUDialect.h"
+#include "mlir/Dialect/GPU/Passes.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/Builders.h"
@ -25,14 +26,10 @@

 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/Twine.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/LegacyPassManager.h"
-#include "llvm/IR/Module.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/Mutex.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/TargetSelect.h"
-#include "llvm/Target/TargetMachine.h"

 using namespace mlir;

@ -45,126 +42,43 @@ namespace {
 /// GPU binary code, which is then attached as an attribute to the function.
 /// The function body is erased.
 class GpuKernelToBlobPass
-    : public PassWrapper<GpuKernelToBlobPass, OperationPass<gpu::GPUModuleOp>> {
+    : public PassWrapper<GpuKernelToBlobPass, gpu::SerializeToBlobPass> {
 public:
  GpuKernelToBlobPass(LoweringCallback loweringCallback,
                      BlobGenerator blobGenerator, StringRef triple,
                      StringRef targetChip, StringRef features,
                      StringRef gpuBinaryAnnotation)
-      : loweringCallback(loweringCallback), blobGenerator(blobGenerator),
-        triple(triple), targetChip(targetChip), features(features) {
+      : loweringCallback(loweringCallback), blobGenerator(blobGenerator) {
+    if (!triple.empty())
+      this->triple = triple.str();
+    if (!targetChip.empty())
+      this->chip = targetChip.str();
+    if (!features.empty())
+      this->features = features.str();
    if (!gpuBinaryAnnotation.empty())
      this->gpuBinaryAnnotation = gpuBinaryAnnotation.str();
  }

-  GpuKernelToBlobPass(const GpuKernelToBlobPass &other)
-      : loweringCallback(other.loweringCallback),
-        blobGenerator(other.blobGenerator), triple(other.triple),
-        targetChip(other.targetChip), features(other.features) {}
-
-  void runOnOperation() override {
-    gpu::GPUModuleOp module = getOperation();
-
-    // Lower the module to an LLVM IR module using a separate context to enable
-    // multi-threaded processing.
-    llvm::LLVMContext llvmContext;
-    std::unique_ptr<llvm::Module> llvmModule =
-        loweringCallback(module, llvmContext, "LLVMDialectModule");
-    if (!llvmModule)
-      return signalPassFailure();
-
-    // Translate the llvm module to a target blob and attach the result as
-    // attribute to the module.
-    if (auto blobAttr = translateGPUModuleToBinaryAnnotation(
-            *llvmModule, module.getLoc(), module.getName()))
-      module->setAttr(gpuBinaryAnnotation, blobAttr);
-    else
-      signalPassFailure();
+private:
+  // Lowers the GPU module this pass runs on to an LLVM module by delegating
+  // to the lowering callback passed to the constructor.
+  std::unique_ptr<llvm::Module>
+  translateToLLVMIR(llvm::LLVMContext &llvmContext) override {
+    gpu::GPUModuleOp gpuModule = getOperation();
+    return loweringCallback(gpuModule, llvmContext, "LLVMDialectModule");
+  }

-private:
-  std::string translateModuleToISA(llvm::Module &module,
-                                   llvm::TargetMachine &targetMachine);
-
-  /// Converts llvmModule to a blob with target instructions using the
-  /// user-provided generator. Location is used for error reporting and name is
-  /// forwarded to the blob generator to use in its logging mechanisms.
-  OwnedBlob convertModuleToBlob(llvm::Module &llvmModule, Location loc,
-                                StringRef name);
-
-  /// Translates llvmModule to a blob with target instructions and returns the
-  /// result as attribute.
-  StringAttr translateGPUModuleToBinaryAnnotation(llvm::Module &llvmModule,
-                                                  Location loc, StringRef name);
+  // Converts the target ISA to its binary form with the blob generator passed
+  // to the constructor; the module's location and name are handed along.
+  std::unique_ptr<std::vector<char>>
+  serializeISA(const std::string &isa) override {
+    gpu::GPUModuleOp gpuModule = getOperation();
+    return blobGenerator(isa, gpuModule.getLoc(), gpuModule.getName());
+  }

  LoweringCallback loweringCallback;
  BlobGenerator blobGenerator;
-
-  llvm::Triple triple;
-  std::string targetChip;
-  std::string features;
-
-  Option<std::string> gpuBinaryAnnotation{
-      *this, "gpu-binary-annotation",
-      llvm::cl::desc("Annotation attribute string for GPU binary"),
-      llvm::cl::init(gpu::getDefaultGpuBinaryAnnotation())};
 };

 } // anonymous namespace

-std::string gpu::getDefaultGpuBinaryAnnotation() { return "gpu.binary"; }
-
-std::string
-GpuKernelToBlobPass::translateModuleToISA(llvm::Module &module,
-                                          llvm::TargetMachine &targetMachine) {
-  std::string targetISA;
-  {
-    llvm::raw_string_ostream stream(targetISA);
-    llvm::buffer_ostream pstream(stream);
-    llvm::legacy::PassManager codegenPasses;
-    targetMachine.addPassesToEmitFile(codegenPasses, pstream, nullptr,
-                                      llvm::CGFT_AssemblyFile);
-    codegenPasses.run(module);
-  }
-
-  return targetISA;
-}
-
-OwnedBlob GpuKernelToBlobPass::convertModuleToBlob(llvm::Module &llvmModule,
-                                                   Location loc,
-                                                   StringRef name) {
-  std::unique_ptr<llvm::TargetMachine> targetMachine;
-  {
-    std::string error;
-    const llvm::Target *target =
-        llvm::TargetRegistry::lookupTarget("", triple, error);
-    if (target == nullptr) {
-      emitError(loc, "cannot initialize target triple");
-      return {};
-    }
-    targetMachine.reset(target->createTargetMachine(triple.str(), targetChip,
-                                                    features, {}, {}));
-    if (targetMachine == nullptr) {
-      emitError(loc, "cannot initialize target machine");
-      return {};
-    }
-  }
-
-  llvmModule.setDataLayout(targetMachine->createDataLayout());
-
-  auto targetISA = translateModuleToISA(llvmModule, *targetMachine);
-
-  return blobGenerator(targetISA, loc, name);
-}
-
-StringAttr GpuKernelToBlobPass::translateGPUModuleToBinaryAnnotation(
-    llvm::Module &llvmModule, Location loc, StringRef name) {
-  auto blob = convertModuleToBlob(llvmModule, loc, name);
-  if (!blob)
-    return {};
-  return StringAttr::get(loc->getContext(), {blob->data(), blob->size()});
-}
-
 std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
 mlir::createConvertGPUKernelToBlobPass(LoweringCallback loweringCallback,
                                       BlobGenerator blobGenerator,
--- a/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp
+++ b/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp
@ -20,6 +20,7 @@
 #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
 #include "mlir/Dialect/Async/IR/Async.h"
 #include "mlir/Dialect/GPU/GPUDialect.h"
+#include "mlir/Dialect/GPU/Passes.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/Builders.h"
@ -27,10 +28,6 @@
 #include "mlir/IR/BuiltinTypes.h"

 #include "llvm/ADT/STLExtras.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/FormatVariadic.h"

--- a/mlir/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/CMakeLists.txt
@ -5,10 +5,15 @@ add_mlir_dialect_library(MLIRGPU
  Transforms/KernelOutlining.cpp
  Transforms/MemoryPromotion.cpp
  Transforms/ParallelLoopMapper.cpp
+  Transforms/SerializeToBlob.cpp

  ADDITIONAL_HEADER_DIRS
  ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU

+  LINK_COMPONENTS
+  Core
+  MC
+
  DEPENDS
  MLIRGPUOpsIncGen
  MLIRGPUOpInterfacesIncGen
--- a/mlir/lib/Dialect/GPU/Transforms/SerializeToBlob.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SerializeToBlob.cpp
@ -0,0 +1,95 @@
+//===- SerializeToBlob.cpp - MLIR GPU lowering pass -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a base class for a pass to serialize a gpu module
+// into a binary blob that can be executed on a GPU. The binary blob is added
+// as a string attribute to the gpu module.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/GPU/Passes.h"
+#include "mlir/Pass/Pass.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace mlir;
+
+/// Returns the default name of the attribute that holds the GPU binary blob.
+std::string gpu::getDefaultGpuBinaryAnnotation() {
+  return std::string("gpu.binary");
+}
+
+gpu::SerializeToBlobPass::SerializeToBlobPass(TypeID passID)
+    : OperationPass<gpu::GPUModuleOp>(passID) {}
+
+gpu::SerializeToBlobPass::SerializeToBlobPass(const SerializeToBlobPass &other)
+    : OperationPass<gpu::GPUModuleOp>(other) {
+  // Pass::Option is not copy-constructible; assign each option value instead.
+  triple = other.triple;
+  chip = other.chip;
+  features = other.features;
+  gpuBinaryAnnotation = other.gpuBinaryAnnotation;
+}
+
+/// Runs the code generator of `targetMachine` over `llvmModule` and returns
+/// the emitted assembly (the target ISA) as a string. The data layout of the
+/// module is replaced by the one of the target machine beforehand.
+static std::string translateToISA(llvm::Module &llvmModule,
+                                  llvm::TargetMachine &targetMachine) {
+  llvmModule.setDataLayout(targetMachine.createDataLayout());
+
+  std::string targetISA;
+  {
+    // Keep the streams in their own scope: buffer_ostream hands its contents
+    // to the wrapped stream only when it is destroyed, so both streams have
+    // to be gone before 'targetISA' is read. Without this scope the returned
+    // string is complete only if the compiler elides the return-value copy.
+    llvm::raw_string_ostream stream(targetISA);
+    llvm::buffer_ostream pstream(stream);
+    llvm::legacy::PassManager codegenPasses;
+    // NOTE(review): the result of addPassesToEmitFile is ignored here, as it
+    // was before; confirm whether a failure should be reported to the caller.
+    targetMachine.addPassesToEmitFile(codegenPasses, pstream, nullptr,
+                                      llvm::CGFT_AssemblyFile);
+    codegenPasses.run(llvmModule);
+  }
+  return targetISA;
+}
+
+/// Drives the serialization: GPU module -> LLVM IR -> target ISA -> blob.
+/// The blob is stored on the GPU module as a string attribute whose name is
+/// taken from the 'gpuBinaryAnnotation' option. A failure in any stage
+/// signals pass failure and stops.
+void gpu::SerializeToBlobPass::runOnOperation() {
+  // The translation uses its own LLVM context so that modules can be
+  // processed from multiple threads.
+  llvm::LLVMContext translationContext;
+  auto llvmIR = translateToLLVMIR(translationContext);
+  if (!llvmIR) {
+    signalPassFailure();
+    return;
+  }
+
+  // Set up the code generator for the requested target.
+  auto machine = createTargetMachine();
+  if (!machine) {
+    signalPassFailure();
+    return;
+  }
+
+  // Emit the target ISA and let the derived class turn it into a blob.
+  const std::string isa = translateToISA(*llvmIR, *machine);
+  auto binary = serializeISA(isa);
+  if (!binary) {
+    signalPassFailure();
+    return;
+  }
+
+  // Attach the blob to the module.
+  auto blobAttr =
+      StringAttr::get(&getContext(), {binary->data(), binary->size()});
+  getOperation()->setAttr(gpuBinaryAnnotation, blobAttr);
+}
+
+/// Builds the LLVM target machine from the 'triple', 'chip' and 'features'
+/// options. Emits an error at the location of the GPU module and returns
+/// null if the target cannot be looked up or the machine cannot be created.
+std::unique_ptr<llvm::TargetMachine>
+gpu::SerializeToBlobPass::createTargetMachine() {
+  Location moduleLoc = getOperation().getLoc();
+
+  // Find the target registered for the requested triple.
+  std::string lookupError;
+  const llvm::Target *registeredTarget =
+      llvm::TargetRegistry::lookupTarget(triple, lookupError);
+  if (registeredTarget == nullptr) {
+    emitError(moduleLoc, Twine("failed to lookup target: ") + lookupError);
+    return {};
+  }
+
+  // Take ownership of the machine right away; null means creation failed.
+  std::unique_ptr<llvm::TargetMachine> targetMachine(
+      registeredTarget->createTargetMachine(triple, chip, features, {}, {}));
+  if (targetMachine == nullptr) {
+    emitError(moduleLoc, "failed to create target machine");
+    return {};
+  }
+  return targetMachine;
+}