Automated rollback of commit 5684a12434

PiperOrigin-RevId: 270126672
2019-09-19 14:33:54 -07:00 · 2019-09-19 14:33:54 -07:00 · 2df646bef6
parent c8961d408e
commit 2df646bef6
11 changed files with 165 additions and 216 deletions
--- a/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h
+++ b/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h
@ -26,6 +26,10 @@ class OwningRewritePatternList;
 class ModuleOp;
 template <typename OpT> class OpPassBase;

+/// Collect a set of patterns to convert from the GPU dialect to NVVM.
+void populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter,
+                                         OwningRewritePatternList &patterns);
+
 /// Creates a pass that lowers GPU dialect operations to NVVM counterparts.
 std::unique_ptr<OpPassBase<ModuleOp>> createLowerGpuOpsToNVVMOpsPass();

--- a/mlir/include/mlir/Dialect/GPU/GPUDialect.h
+++ b/mlir/include/mlir/Dialect/GPU/GPUDialect.h
@ -41,12 +41,9 @@ public:
  /// Get the canonical string name of the dialect.
  static StringRef getDialectName();

-  /// Get the name of the attribute used to annotate external kernel functions.
+  /// Get the name of the attribute used to annotate outlined kernel functions.
  static StringRef getKernelFuncAttrName() { return "gpu.kernel"; }

-  /// Get the name of the attribute used to annotate kernel modules.
-  static StringRef getKernelModuleAttrName() { return "gpu.kernel_module"; }
-
  /// Returns whether the given function is a kernel function, i.e., has the
  /// 'gpu.kernel' attribute.
  static bool isKernel(FuncOp function);
--- a/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp
+++ b/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp
@ -49,37 +49,26 @@ namespace {
 // TODO(herhut): Move to shared location.
 static constexpr const char *kCubinAnnotation = "nvvm.cubin";

-/// A pass converting tagged kernel modules to cubin blobs.
-///
-/// If tagged as a kernel module, each contained function is translated to NVVM
-/// IR and further to PTX. A user provided CubinGenerator compiles the PTX to
-/// GPU binary code, which is then attached as an attribute to the function. The
-/// function body is erased.
+/// A pass converting tagged kernel functions to cubin blobs.
 class GpuKernelToCubinPass : public ModulePass<GpuKernelToCubinPass> {
 public:
  GpuKernelToCubinPass(
      CubinGenerator cubinGenerator = compilePtxToCubinForTesting)
      : cubinGenerator(cubinGenerator) {}

+  // Run the dialect converter on the module.
  void runOnModule() override {
-    if (!getModule().getAttrOfType<UnitAttr>(
-            gpu::GPUDialect::getKernelModuleAttrName()))
-      return;
-
    // Make sure the NVPTX target is initialized.
    LLVMInitializeNVPTXTarget();
    LLVMInitializeNVPTXTargetInfo();
    LLVMInitializeNVPTXTargetMC();
    LLVMInitializeNVPTXAsmPrinter();

-    auto llvmModule = translateModuleToNVVMIR(getModule());
-    if (!llvmModule)
-      return signalPassFailure();
-
    for (auto function : getModule().getOps<FuncOp>()) {
-      if (!gpu::GPUDialect::isKernel(function))
+      if (!gpu::GPUDialect::isKernel(function) || function.isExternal()) {
        continue;
-      if (failed(translateGpuKernelToCubinAnnotation(*llvmModule, function)))
+      }
+      if (failed(translateGpuKernelToCubinAnnotation(function)))
        signalPassFailure();
    }
  }
@ -90,13 +79,8 @@ private:

  std::string translateModuleToPtx(llvm::Module &module,
                                   llvm::TargetMachine &target_machine);
-
-  /// Converts llvmModule to cubin using the user-provded generator.
  OwnedCubin convertModuleToCubin(llvm::Module &llvmModule, FuncOp &function);
-
-  /// Translates llvmModule to cubin and assigns it to attribute of function.
-  LogicalResult translateGpuKernelToCubinAnnotation(llvm::Module &llvmModule,
-                                                    FuncOp &function);
+  LogicalResult translateGpuKernelToCubinAnnotation(FuncOp &function);

  CubinGenerator cubinGenerator;
 };
@ -151,13 +135,22 @@ OwnedCubin GpuKernelToCubinPass::convertModuleToCubin(llvm::Module &llvmModule,
  return cubinGenerator(ptx, function);
 }

-LogicalResult GpuKernelToCubinPass::translateGpuKernelToCubinAnnotation(
-    llvm::Module &llvmModule, FuncOp &function) {
-  auto cubin = convertModuleToCubin(llvmModule, function);
-  if (!cubin)
-    return function.emitError("translation to CUDA binary failed.");
-
+LogicalResult
+GpuKernelToCubinPass::translateGpuKernelToCubinAnnotation(FuncOp &function) {
  Builder builder(function.getContext());
+
+  OwningModuleRef module = ModuleOp::create(function.getLoc());
+
+  // TODO(herhut): Also handle called functions.
+  module->push_back(function.clone());
+
+  auto llvmModule = translateModuleToNVVMIR(*module);
+  auto cubin = convertModuleToCubin(*llvmModule, function);
+
+  if (!cubin) {
+    return function.emitError("translation to CUDA binary failed.");
+  }
+
  function.setAttr(kCubinAnnotation,
                   builder.getStringAttr({cubin->data(), cubin->size()}));

--- a/mlir/lib/Conversion/GPUToCUDA/GenerateCubinAccessors.cpp
+++ b/mlir/lib/Conversion/GPUToCUDA/GenerateCubinAccessors.cpp
@ -43,15 +43,8 @@ constexpr const char *kCubinGetterAnnotation = "nvvm.cubingetter";
 constexpr const char *kCubinGetterSuffix = "_cubin";
 constexpr const char *kCubinStorageSuffix = "_cubin_cst";

-/// A pass which moves cubin from function attributes in nested modules
-/// to global strings and generates getter functions.
-///
-/// The GpuKernelToCubinPass annotates kernels functions with compiled device
-/// code blobs. These functions reside in nested modules generated by
-/// GpuKernelOutliningPass. This pass consumes these modules and moves the cubin
-/// blobs back to the parent module as global strings and generates accessor
-/// functions for them. The external kernel functions (also generated by the
-/// outlining pass) are annotated with the symbol of the cubin accessor.
+/// A pass generating global strings and getter functions for all cubin blobs
+/// annotated on functions via the nvvm.cubin attribute.
 class GpuGenerateCubinAccessorsPass
    : public ModulePass<GpuGenerateCubinAccessorsPass> {
 private:
@ -62,25 +55,18 @@ private:
  }

  // Inserts a global constant string containing `blob` into the parent module
-  // of `kernelFunc` and generates the function that returns the address of the
-  // first character of this string.
+  // of `orig` and generates the function that returns the address of the first
+  // character of this string.
  // TODO(herhut): consider fusing this pass with launch-func-to-cuda.
-  void generate(FuncOp kernelFunc, StringAttr blob) {
-    auto stubFunc = getModule().lookupSymbol<FuncOp>(kernelFunc.getName());
-    if (!stubFunc) {
-      kernelFunc.emitError(
-          "corresponding external function not found in parent module");
-      return signalPassFailure();
-    }
-
-    Location loc = stubFunc.getLoc();
-    SmallString<128> nameBuffer(stubFunc.getName());
-    auto module = stubFunc.getParentOfType<ModuleOp>();
+  void generate(FuncOp orig, StringAttr blob) {
+    Location loc = orig.getLoc();
+    SmallString<128> nameBuffer(orig.getName());
+    auto module = orig.getParentOfType<ModuleOp>();
    assert(module && "function must belong to a module");

    // Insert the getter function just after the original function.
    OpBuilder moduleBuilder(module.getBody(), module.getBody()->begin());
-    moduleBuilder.setInsertionPoint(stubFunc.getOperation()->getNextNode());
+    moduleBuilder.setInsertionPoint(orig.getOperation()->getNextNode());
    auto getterType = moduleBuilder.getFunctionType(
        llvm::None, LLVM::LLVMType::getInt8PtrTy(llvmDialect));
    nameBuffer.append(kCubinGetterSuffix);
@ -89,7 +75,7 @@ private:
    Block *entryBlock = result.addEntryBlock();

    // Drop the getter suffix before appending the storage suffix.
-    nameBuffer.resize(stubFunc.getName().size());
+    nameBuffer.resize(orig.getName().size());
    nameBuffer.append(kCubinStorageSuffix);

    // Obtain the address of the first character of the global string containing
@ -100,23 +86,21 @@ private:
    builder.create<LLVM::ReturnOp>(loc, startPtr);

    // Store the name of the getter on the function for easier lookup.
-    stubFunc.setAttr(kCubinGetterAnnotation, builder.getSymbolRefAttr(result));
+    orig.setAttr(kCubinGetterAnnotation, builder.getSymbolRefAttr(result));
  }

 public:
+  // Perform the conversion on the module.  This may insert globals, so it
+  // cannot be done on multiple functions in parallel.
  void runOnModule() override {
-    llvmDialect = getContext().getRegisteredDialect<LLVM::LLVMDialect>();
+    llvmDialect =
+        getModule().getContext()->getRegisteredDialect<LLVM::LLVMDialect>();

-    auto modules = getModule().getOps<ModuleOp>();
-    for (auto module : llvm::make_early_inc_range(modules)) {
-      if (!module.getAttrOfType<UnitAttr>(
-              gpu::GPUDialect::getKernelModuleAttrName()))
+    for (auto func : getModule().getOps<FuncOp>()) {
+      StringAttr cubinBlob = func.getAttrOfType<StringAttr>(kCubinAnnotation);
+      if (!cubinBlob)
        continue;
-      for (auto func : module.getOps<FuncOp>()) {
-        if (StringAttr blob = func.getAttrOfType<StringAttr>(kCubinAnnotation))
-          generate(func, blob);
-      }
-      module.erase();
+      generate(func, cubinBlob);
    }
  }

--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@ -23,7 +23,6 @@
 #include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"

 #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
-#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h"
 #include "mlir/Dialect/GPU/GPUDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/LLVMIR/NVVMDialect.h"
@ -39,6 +38,23 @@ using namespace mlir;

 namespace {

+// Rewriting that replaces the types of a LaunchFunc operation with their
+// LLVM counterparts.
+struct GPULaunchFuncOpLowering : public LLVMOpLowering {
+public:
+  explicit GPULaunchFuncOpLowering(LLVMTypeConverter &lowering_)
+      : LLVMOpLowering(gpu::LaunchFuncOp::getOperationName(),
+                       lowering_.getDialect()->getContext(), lowering_) {}
+
+  // Convert the kernel arguments to an LLVM type, preserve the rest.
+  PatternMatchResult
+  matchAndRewrite(Operation *op, ArrayRef<Value *> operands,
+                  ConversionPatternRewriter &rewriter) const override {
+    rewriter.clone(*op)->setOperands(operands);
+    return rewriter.replaceOp(op, llvm::None), matchSuccess();
+  }
+};
+
 // Rewriting that replaces Op with XOp, YOp, or ZOp depending on the dimension
 // that Op operates on.  Op is assumed to return an `std.index` value and
 // XOp, YOp and ZOp are assumed to return an `llvm.i32` value.  Depending on
@ -103,31 +119,20 @@ public:
  }
 };

-// A pass that replaces all occurences of GPU device operations with their
+// A pass that replaces all occurences of GPU operations with their
 // corresponding NVVM equivalent.
 //
-// This pass only handles device code and is not meant to be run on GPU host
-// code.
+// This pass does not handle launching of kernels. Instead, it is meant to be
+// used on the body region of a launch or the body region of a kernel
+// function.
 class LowerGpuOpsToNVVMOpsPass : public ModulePass<LowerGpuOpsToNVVMOpsPass> {
 public:
  void runOnModule() override {
    ModuleOp m = getModule();
-    if (!m.getAttrOfType<UnitAttr>(gpu::GPUDialect::getKernelModuleAttrName()))
-      return;

    OwningRewritePatternList patterns;
    LLVMTypeConverter converter(m.getContext());
-    populateStdToLLVMConversionPatterns(converter, patterns);
-    patterns.insert<
-        GPUIndexIntrinsicOpLowering<gpu::ThreadId, NVVM::ThreadIdXOp,
-                                    NVVM::ThreadIdYOp, NVVM::ThreadIdZOp>,
-        GPUIndexIntrinsicOpLowering<gpu::BlockDim, NVVM::BlockDimXOp,
-                                    NVVM::BlockDimYOp, NVVM::BlockDimZOp>,
-        GPUIndexIntrinsicOpLowering<gpu::BlockId, NVVM::BlockIdXOp,
-                                    NVVM::BlockIdYOp, NVVM::BlockIdZOp>,
-        GPUIndexIntrinsicOpLowering<gpu::GridDim, NVVM::GridDimXOp,
-                                    NVVM::GridDimYOp, NVVM::GridDimZOp>>(
-        converter);
+    populateGpuToNVVMConversionPatterns(converter, patterns);

    ConversionTarget target(getContext());
    target.addLegalDialect<LLVM::LLVMDialect>();
@ -141,6 +146,22 @@ public:

 } // anonymous namespace

+/// Collect a set of patterns to convert from the GPU dialect to NVVM.
+void mlir::populateGpuToNVVMConversionPatterns(
+    LLVMTypeConverter &converter, OwningRewritePatternList &patterns) {
+  patterns
+      .insert<GPULaunchFuncOpLowering,
+              GPUIndexIntrinsicOpLowering<gpu::ThreadId, NVVM::ThreadIdXOp,
+                                          NVVM::ThreadIdYOp, NVVM::ThreadIdZOp>,
+              GPUIndexIntrinsicOpLowering<gpu::BlockDim, NVVM::BlockDimXOp,
+                                          NVVM::BlockDimYOp, NVVM::BlockDimZOp>,
+              GPUIndexIntrinsicOpLowering<gpu::BlockId, NVVM::BlockIdXOp,
+                                          NVVM::BlockIdYOp, NVVM::BlockIdZOp>,
+              GPUIndexIntrinsicOpLowering<gpu::GridDim, NVVM::GridDimXOp,
+                                          NVVM::GridDimYOp, NVVM::GridDimZOp>>(
+          converter);
+}
+
 std::unique_ptr<OpPassBase<ModuleOp>> mlir::createLowerGpuOpsToNVVMOpsPass() {
  return std::make_unique<LowerGpuOpsToNVVMOpsPass>();
 }
--- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
@ -93,7 +93,7 @@ static gpu::LaunchFuncOp inlineConstants(FuncOp kernelFunc,
 }

 // Outline the `gpu.launch` operation body into a kernel function. Replace
-// `gpu.return` operations by `std.return` in the generated function.
+// `gpu.return` operations by `std.return` in the generated functions.
 static FuncOp outlineKernelFunc(gpu::LaunchOp launchOp) {
  Location loc = launchOp.getLoc();
  SmallVector<Type, 4> kernelOperandTypes(launchOp.getKernelOperandTypes());
@ -107,7 +107,7 @@ static FuncOp outlineKernelFunc(gpu::LaunchOp launchOp) {
  outlinedFunc.setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
                       builder.getUnitAttr());
  injectGpuIndexOperations(loc, outlinedFunc);
-  outlinedFunc.walk([](gpu::Return op) {
+  outlinedFunc.walk([](mlir::gpu::Return op) {
    OpBuilder replacer(op);
    replacer.create<ReturnOp>(op.getLoc());
    op.erase();
@ -131,44 +131,15 @@ static void convertToLaunchFuncOp(gpu::LaunchOp &launchOp, FuncOp kernelFunc) {

 namespace {

-/// Pass that moves the kernel of each LaunchOp into its separate nested module.
-///
-/// This pass moves the kernel code of each LaunchOp into a function created
-/// inside a nested module. It also creates an external function of the same
-/// name in the parent module.
-///
-/// The kernel modules are intended to be compiled to a cubin blob independently
-/// in a separate pass. The external functions can then be annotated with the
-/// symbol of the cubin accessor function.
 class GpuKernelOutliningPass : public ModulePass<GpuKernelOutliningPass> {
 public:
  void runOnModule() override {
    ModuleManager moduleManager(getModule());
-    auto context = getModule().getContext();
-    Builder builder(context);
    for (auto func : getModule().getOps<FuncOp>()) {
-      // Insert just after the function.
-      Block::iterator insertPt(func.getOperation()->getNextNode());
-      func.walk([&](gpu::LaunchOp op) {
-        // TODO(b/141098412): Handle called functions and globals.
+      func.walk([&](mlir::gpu::LaunchOp op) {
        FuncOp outlinedFunc = outlineKernelFunc(op);
-
-        // Potentially renames outlinedFunc to make symbol unique.
-        moduleManager.insert(insertPt, outlinedFunc);
-
-        // Potentially changes signature, pulling in constants.
+        moduleManager.insert(outlinedFunc);
        convertToLaunchFuncOp(op, outlinedFunc);
-
-        // Create clone and move body from outlinedFunc.
-        auto kernelFunc = outlinedFunc.cloneWithoutRegions();
-        kernelFunc.getBody().takeBody(outlinedFunc.getBody());
-
-        // Create nested module and insert kernelFunc.
-        auto kernelModule = ModuleOp::create(UnknownLoc::get(context));
-        kernelModule.setAttr(gpu::GPUDialect::getKernelModuleAttrName(),
-                             builder.getUnitAttr());
-        kernelModule.push_back(kernelFunc);
-        getModule().insert(insertPt, kernelModule);
      });
    }
  }
--- a/mlir/test/Conversion/GPUToCUDA/insert-cubin-getter.mlir
+++ b/mlir/test/Conversion/GPUToCUDA/insert-cubin-getter.mlir
@ -2,14 +2,9 @@

 // CHECK: llvm.mlir.global constant @[[global:.*]]("CUBIN")

-module attributes {gpu.kernel_module} {
-  func @kernel(!llvm.float, !llvm<"float*">)
-  attributes  {nvvm.cubin = "CUBIN"}
-}
-
 func @kernel(!llvm.float, !llvm<"float*">)
-// CHECK: attributes  {gpu.kernel, nvvm.cubingetter = @[[getter:.*]]}
-  attributes  {gpu.kernel}
+// CHECK: attributes  {gpu.kernel, nvvm.cubin = "CUBIN", nvvm.cubingetter = @[[getter:.*]]}
+  attributes  {gpu.kernel, nvvm.cubin = "CUBIN"}

 // CHECK: func @[[getter]]() -> !llvm<"i8*">
 // CHECK: %[[addressof:.*]] = llvm.mlir.addressof @[[global]]
--- a/mlir/test/Conversion/GPUToCUDA/lower-nvvm-kernel-to-cubin.mlir
+++ b/mlir/test/Conversion/GPUToCUDA/lower-nvvm-kernel-to-cubin.mlir
@ -1,26 +1,8 @@
-// RUN: mlir-opt %s --test-kernel-to-cubin -split-input-file | FileCheck %s
+// RUN: mlir-opt %s --test-kernel-to-cubin | FileCheck %s

-module attributes {gpu.kernel_module} {
-  func @kernel(%arg0 : !llvm.float, %arg1 : !llvm<"float*">)
-    // CHECK: attributes  {gpu.kernel, nvvm.cubin = "CUBIN"}
-    attributes  { gpu.kernel } {
-    // CHECK-NOT: llvm.return
-    llvm.return
-  }
-}
-
-// -----
-
-module attributes {gpu.kernel_module} {
-  // CHECK: func @kernel_a
-  func @kernel_a()
-    attributes  { gpu.kernel } {
-    llvm.return
-  }
-
-  // CHECK: func @kernel_b
-  func @kernel_b()
-    attributes  { gpu.kernel } {
-    llvm.return
-  }
-}
+func @kernel(%arg0 : !llvm.float, %arg1 : !llvm<"float*">)
+// CHECK: attributes  {gpu.kernel, nvvm.cubin = "CUBIN"}
+  attributes  { gpu.kernel } {
+// CHECK-NOT: llvm.return
+  llvm.return
+}
--- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
@ -1,37 +1,35 @@
 // RUN: mlir-opt %s -lower-gpu-ops-to-nvvm-ops | FileCheck %s

-module attributes {gpu.kernel_module} {
-  // CHECK-LABEL: func @gpu_index_ops()
-  func @gpu_index_ops()
-      attributes { gpu.kernel } {
-    // CHECK: = nvvm.read.ptx.sreg.tid.x : !llvm.i32
-    %tIdX = "gpu.thread_id"() {dimension = "x"} : () -> (index)
-    // CHECK: = nvvm.read.ptx.sreg.tid.y : !llvm.i32
-    %tIdY = "gpu.thread_id"() {dimension = "y"} : () -> (index)
-    // CHECK: = nvvm.read.ptx.sreg.tid.z : !llvm.i32
-    %tIdZ = "gpu.thread_id"() {dimension = "z"} : () -> (index)
+// CHECK-LABEL: func @gpu_index_ops()
+func @gpu_index_ops()
+    attributes { gpu.kernel } {
+  // CHECK: = nvvm.read.ptx.sreg.tid.x : !llvm.i32
+  %tIdX = "gpu.thread_id"() {dimension = "x"} : () -> (index)
+  // CHECK: = nvvm.read.ptx.sreg.tid.y : !llvm.i32
+  %tIdY = "gpu.thread_id"() {dimension = "y"} : () -> (index)
+  // CHECK: = nvvm.read.ptx.sreg.tid.z : !llvm.i32
+  %tIdZ = "gpu.thread_id"() {dimension = "z"} : () -> (index)

-    // CHECK: = nvvm.read.ptx.sreg.ntid.x : !llvm.i32
-    %bDimX = "gpu.block_dim"() {dimension = "x"} : () -> (index)
-    // CHECK: = nvvm.read.ptx.sreg.ntid.y : !llvm.i32
-    %bDimY = "gpu.block_dim"() {dimension = "y"} : () -> (index)
-    // CHECK: = nvvm.read.ptx.sreg.ntid.z : !llvm.i32
-    %bDimZ = "gpu.block_dim"() {dimension = "z"} : () -> (index)
+  // CHECK: = nvvm.read.ptx.sreg.ntid.x : !llvm.i32
+  %bDimX = "gpu.block_dim"() {dimension = "x"} : () -> (index)
+  // CHECK: = nvvm.read.ptx.sreg.ntid.y : !llvm.i32
+  %bDimY = "gpu.block_dim"() {dimension = "y"} : () -> (index)
+  // CHECK: = nvvm.read.ptx.sreg.ntid.z : !llvm.i32
+  %bDimZ = "gpu.block_dim"() {dimension = "z"} : () -> (index)

-    // CHECK: = nvvm.read.ptx.sreg.ctaid.x : !llvm.i32
-    %bIdX = "gpu.block_id"() {dimension = "x"} : () -> (index)
-    // CHECK: = nvvm.read.ptx.sreg.ctaid.y : !llvm.i32
-    %bIdY = "gpu.block_id"() {dimension = "y"} : () -> (index)
-    // CHECK: = nvvm.read.ptx.sreg.ctaid.z : !llvm.i32
-    %bIdZ = "gpu.block_id"() {dimension = "z"} : () -> (index)
+  // CHECK: = nvvm.read.ptx.sreg.ctaid.x : !llvm.i32
+  %bIdX = "gpu.block_id"() {dimension = "x"} : () -> (index)
+  // CHECK: = nvvm.read.ptx.sreg.ctaid.y : !llvm.i32
+  %bIdY = "gpu.block_id"() {dimension = "y"} : () -> (index)
+  // CHECK: = nvvm.read.ptx.sreg.ctaid.z : !llvm.i32
+  %bIdZ = "gpu.block_id"() {dimension = "z"} : () -> (index)

-    // CHECK: = nvvm.read.ptx.sreg.nctaid.x : !llvm.i32
-    %gDimX = "gpu.grid_dim"() {dimension = "x"} : () -> (index)
-    // CHECK: = nvvm.read.ptx.sreg.nctaid.y : !llvm.i32
-    %gDimY = "gpu.grid_dim"() {dimension = "y"} : () -> (index)
-    // CHECK: = nvvm.read.ptx.sreg.nctaid.z : !llvm.i32
-    %gDimZ = "gpu.grid_dim"() {dimension = "z"} : () -> (index)
+  // CHECK: = nvvm.read.ptx.sreg.nctaid.x : !llvm.i32
+  %gDimX = "gpu.grid_dim"() {dimension = "x"} : () -> (index)
+  // CHECK: = nvvm.read.ptx.sreg.nctaid.y : !llvm.i32
+  %gDimY = "gpu.grid_dim"() {dimension = "y"} : () -> (index)
+  // CHECK: = nvvm.read.ptx.sreg.nctaid.z : !llvm.i32
+  %gDimZ = "gpu.grid_dim"() {dimension = "z"} : () -> (index)

-    std.return
-  }
+  std.return
 }
--- a/mlir/test/Dialect/GPU/outlining.mlir
+++ b/mlir/test/Dialect/GPU/outlining.mlir
@ -1,4 +1,4 @@
-// RUN: mlir-opt -gpu-kernel-outlining -split-input-file -verify-diagnostics %s | FileCheck %s
+// RUN: mlir-opt -gpu-kernel-outlining -split-input-file %s | FileCheck %s

 // CHECK-LABEL: func @launch()
 func @launch() {
@ -35,11 +35,7 @@ func @launch() {
 }

 // CHECK-LABEL: func @launch_kernel
-// CHECK-SAME: (f32, memref<?xf32, 1>)
-// CHECK-NEXT: attributes {gpu.kernel}
-
-// CHECK-LABEL: func @launch_kernel
-// CHECK-SAME: (%[[KERNEL_ARG0:.*]]: f32, %[[KERNEL_ARG1:.*]]: memref<?xf32, 1>)
+// CHECK-SAME: (%[[ARG0:.*]]: f32, %[[ARG1:.*]]: memref<?xf32, 1>)
 // CHECK-NEXT: attributes {gpu.kernel}
 // CHECK-NEXT: %[[BID:.*]] = "gpu.block_id"() {dimension = "x"} : () -> index
 // CHECK-NEXT: = "gpu.block_id"() {dimension = "y"} : () -> index
@ -53,9 +49,9 @@ func @launch() {
 // CHECK-NEXT: %[[BDIM:.*]] = "gpu.block_dim"() {dimension = "x"} : () -> index
 // CHECK-NEXT: = "gpu.block_dim"() {dimension = "y"} : () -> index
 // CHECK-NEXT: = "gpu.block_dim"() {dimension = "z"} : () -> index
-// CHECK-NEXT: "use"(%[[KERNEL_ARG0]]) : (f32) -> ()
+// CHECK-NEXT: "use"(%[[ARG0]]) : (f32) -> ()
 // CHECK-NEXT: "some_op"(%[[BID]], %[[BDIM]]) : (index, index) -> ()
-// CHECK-NEXT: = load %[[KERNEL_ARG1]][%[[TID]]] : memref<?xf32, 1>
+// CHECK-NEXT: = load %[[ARG1]][%[[TID]]] : memref<?xf32, 1>

 // -----

@ -79,8 +75,8 @@ func @multiple_launches() {
  return
 }

-// CHECK: func @multiple_launches_kernel()
-// CHECK: func @multiple_launches_kernel_0()
+// CHECK-LABEL: func @multiple_launches_kernel()
+// CHECK-LABEL: func @multiple_launches_kernel_0()

 // -----

@ -104,23 +100,3 @@ func @extra_constants(%arg0 : memref<?xf32>) {
 // CHECK-LABEL: func @extra_constants_kernel(%{{.*}}: memref<?xf32>)
 // CHECK: constant
 // CHECK: constant
-
-// -----
-
-func @function_call(%arg0 : memref<?xf32>) {
-  %cst = constant 8 : index
-  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst,
-                                       %grid_z = %cst)
-             threads(%tx, %ty, %tz) in (%block_x = %cst, %block_y = %cst,
-                                        %block_z = %cst) {
-    // TODO(b/141098412): Support function calls.
-    // expected-error @+1 {{'device_function' does not reference a valid function}}
-    call @device_function() : () -> ()
-    gpu.return
-  }
-  return
-}
-
-func @device_function() {
-  gpu.return
-}
--- a/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp
+++ b/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp
@ -108,22 +108,50 @@ OwnedCubin compilePtxToCubin(const std::string ptx, FuncOp &function) {
  return result;
 }

+namespace {
+// A pass that lowers all Standard and Gpu operations to LLVM dialect. It does
+// not lower the GPULaunch operation to actual code but dows translate the
+// signature of its kernel argument.
+class LowerStandardAndGpuToLLVMAndNVVM
+    : public ModulePass<LowerStandardAndGpuToLLVMAndNVVM> {
+public:
+  void runOnModule() override {
+    ModuleOp m = getModule();
+
+    OwningRewritePatternList patterns;
+    LLVMTypeConverter converter(m.getContext());
+    populateStdToLLVMConversionPatterns(converter, patterns);
+    populateGpuToNVVMConversionPatterns(converter, patterns);
+
+    ConversionTarget target(getContext());
+    target.addLegalDialect<LLVM::LLVMDialect>();
+    target.addLegalDialect<NVVM::NVVMDialect>();
+    target.addLegalOp<ModuleOp>();
+    target.addLegalOp<ModuleTerminatorOp>();
+    target.addDynamicallyLegalOp<FuncOp>(
+        [&](FuncOp op) { return converter.isSignatureLegal(op.getType()); });
+    if (failed(applyFullConversion(m, target, patterns, &converter)))
+      signalPassFailure();
+  }
+};
+} // end anonymous namespace
+
 static LogicalResult runMLIRPasses(ModuleOp m) {
  PassManager pm(m.getContext());
-  applyPassManagerCLOptions(pm);

  pm.addPass(createGpuKernelOutliningPass());
-  auto &kernelPm = pm.nest<ModuleOp>();
-  kernelPm.addPass(createLowerGpuOpsToNVVMOpsPass());
-  kernelPm.addPass(createConvertGPUKernelToCubinPass(&compilePtxToCubin));
-  pm.addPass(createLowerToLLVMPass());
+  pm.addPass(static_cast<std::unique_ptr<OpPassBase<ModuleOp>>>(
+      std::make_unique<LowerStandardAndGpuToLLVMAndNVVM>()));
+  pm.addPass(createConvertGPUKernelToCubinPass(&compilePtxToCubin));
  pm.addPass(createGenerateCubinAccessorPass());
  pm.addPass(createConvertGpuLaunchFuncToCudaCallsPass());

-  return pm.run(m);
+  if (failed(pm.run(m)))
+    return failure();
+
+  return success();
 }

 int main(int argc, char **argv) {
-  registerPassManagerCLOptions();
  return mlir::JitRunnerMain(argc, argv, &runMLIRPasses);
 }