forked from OSchip/llvm-project
Use named modules for gpu.launch_func
The kernel function called by gpu.launch_func is now placed into an isolated nested module during the outlining stage to simplify separate compilation. Until recently, modules did not have names and could not be referenced. This limitation was circumvented by introducing a stub kernel with the same name at the same nesting level as the module containing the actual kernel. This relation is only effective in one direction: from the actual kernel function to its launch_func "caller". Leverage the recently introduced symbol name attributes on modules to refer to a specific nested module from `gpu.launch_func`. This removes the implicit connection between the identically named stub and kernel functions. It also enables support for `gpu.launch_func`s to call different kernels located in the same module. PiperOrigin-RevId: 273491891
This commit is contained in:
parent
780f107a57
commit
90d65d32d6
|
@ -120,12 +120,17 @@ to have that information by construction.
|
|||
|
||||
### `gpu.launch_func`
|
||||
|
||||
Launch a kernel given as a function on the specified grid of thread blocks.
|
||||
`gpu.launch` operations are lowered to `gpu.launch_func` operations by outlining
|
||||
the kernel body into a function, which is closer to the NVVM model. The
|
||||
`gpu.launch_func` operation has a function attribute named `kernel` to specify
|
||||
the kernel function to launch. The kernel function itself has a `nvvm.kernel`
|
||||
attribute.
|
||||
Launch a kernel function on the specified grid of thread blocks. `gpu.launch`
|
||||
operations are lowered to `gpu.launch_func` operations by outlining the kernel
|
||||
body into a function in a dedicated module, which reflects the separate
|
||||
compilation process. The kernel function is required to have the `gpu.kernel`
|
||||
attribute. The module containing the kernel function is required to have the
|
||||
`gpu.kernel_module` attribute and must be named. And finally, the module
|
||||
containing the kernel module (which thus cannot be the top-level module) is
|
||||
required to have the `gpu.container_module` attribute. The `gpu.launch_func`
|
||||
operation has a string attribute named `kernel` to specify the name of the
|
||||
kernel function to launch and an attribute named `kernel_module` to specify the
|
||||
name of the module containing that kernel function.
|
||||
|
||||
The operation takes at least six operands, with the first three operands being
|
||||
grid sizes along x,y,z dimensions and the following three being block sizes
|
||||
|
@ -138,36 +143,43 @@ A custom syntax for this operation is currently not available.
|
|||
Example:
|
||||
|
||||
```mlir {.mlir}
|
||||
func @kernel_1(%arg0 : f32, %arg1 : !llvm<"float*">)
|
||||
attributes { nvvm.kernel: true } {
|
||||
module attributes {gpu.container_module} {
|
||||
|
||||
// Operations that produce block/thread IDs and dimensions are injected when
|
||||
// outlining the `gpu.launch` body to a function called by `gpu.launch_func`.
|
||||
%tIdX = "gpu.thread_id"() {dimension: "x"} : () -> (index)
|
||||
%tIdY = "gpu.thread_id"() {dimension: "y"} : () -> (index)
|
||||
%tIdZ = "gpu.thread_id"() {dimension: "z"} : () -> (index)
|
||||
// This module creates a separate compilation unit for the GPU compiler.
|
||||
module @kernels attributes {gpu.kernel_module} {
|
||||
func @kernel_1(%arg0 : f32, %arg1 : !llvm<"float*">)
|
||||
attributes { nvvm.kernel: true } {
|
||||
|
||||
%bDimX = "gpu.block_dim"() {dimension: "x"} : () -> (index)
|
||||
%bDimY = "gpu.block_dim"() {dimension: "y"} : () -> (index)
|
||||
%bDimZ = "gpu.block_dim"() {dimension: "z"} : () -> (index)
|
||||
// Operations that produce block/thread IDs and dimensions are injected when
|
||||
// outlining the `gpu.launch` body to a function called by `gpu.launch_func`.
|
||||
%tIdX = "gpu.thread_id"() {dimension: "x"} : () -> (index)
|
||||
%tIdY = "gpu.thread_id"() {dimension: "y"} : () -> (index)
|
||||
%tIdZ = "gpu.thread_id"() {dimension: "z"} : () -> (index)
|
||||
|
||||
%bIdX = "gpu.block_id"() {dimension: "x"} : () -> (index)
|
||||
%bIdY = "gpu.block_id"() {dimension: "y"} : () -> (index)
|
||||
%bIdZ = "gpu.block_id"() {dimension: "z"} : () -> (index)
|
||||
%bDimX = "gpu.block_dim"() {dimension: "x"} : () -> (index)
|
||||
%bDimY = "gpu.block_dim"() {dimension: "y"} : () -> (index)
|
||||
%bDimZ = "gpu.block_dim"() {dimension: "z"} : () -> (index)
|
||||
|
||||
%gDimX = "gpu.grid_dim"() {dimension: "x"} : () -> (index)
|
||||
%gDimY = "gpu.grid_dim"() {dimension: "y"} : () -> (index)
|
||||
%gDimZ = "gpu.grid_dim"() {dimension: "z"} : () -> (index)
|
||||
%bIdX = "gpu.block_id"() {dimension: "x"} : () -> (index)
|
||||
%bIdY = "gpu.block_id"() {dimension: "y"} : () -> (index)
|
||||
%bIdZ = "gpu.block_id"() {dimension: "z"} : () -> (index)
|
||||
|
||||
"some_op"(%bx, %tx) : (index, index) -> ()
|
||||
%42 = load %arg1[%bx] : memref<?xf32, 1>
|
||||
%gDimX = "gpu.grid_dim"() {dimension: "x"} : () -> (index)
|
||||
%gDimY = "gpu.grid_dim"() {dimension: "y"} : () -> (index)
|
||||
%gDimZ = "gpu.grid_dim"() {dimension: "z"} : () -> (index)
|
||||
|
||||
"some_op"(%bx, %tx) : (index, index) -> ()
|
||||
%42 = load %arg1[%bx] : memref<?xf32, 1>
|
||||
}
|
||||
}
|
||||
|
||||
"gpu.launch_func"(%cst, %cst, %cst, // Grid sizes.
|
||||
%cst, %cst, %cst, // Block sizes.
|
||||
%arg0, %arg1) // Arguments passed to the kernel function.
|
||||
{ kernel_module = @kernels, // Module containing the kernel function.
|
||||
kernel = "kernel_1" } // Kernel function.
|
||||
: (index, index, index, index, index, index, f32, !llvm<"float*">) -> ()
|
||||
}
|
||||
|
||||
"gpu.launch_func"(%cst, %cst, %cst, // Grid sizes.
|
||||
%cst, %cst, %cst, // Block sizes.
|
||||
%arg0, %arg1) // Arguments passed to the kernel function.
|
||||
{kernel: @kernel_1 : (f32, !llvm<"float*">) -> ()} // Kernel function.
|
||||
: (index, index, index, index, index, index, f32, !llvm<"float*">) -> ()
|
||||
```
|
||||
|
||||
### `gpu.thread_id`
|
||||
|
|
|
@ -38,6 +38,12 @@ public:
|
|||
/// Create the dialect in the given `context`.
|
||||
GPUDialect(MLIRContext *context);
|
||||
|
||||
/// Get the name of the attribute used to annotate the modules that contain
|
||||
/// kernel modules.
|
||||
static StringRef getContainerModuleAttrName() {
|
||||
return "gpu.container_module";
|
||||
}
|
||||
|
||||
/// Get the canonical string name of the dialect.
|
||||
static StringRef getDialectName();
|
||||
|
||||
|
@ -50,6 +56,9 @@ public:
|
|||
/// Returns whether the given function is a kernel function, i.e., has the
|
||||
/// 'gpu.kernel' attribute.
|
||||
static bool isKernel(FuncOp function);
|
||||
|
||||
LogicalResult verifyOperationAttribute(Operation *op,
|
||||
NamedAttribute attr) override;
|
||||
};
|
||||
|
||||
/// Utility class for the GPU dialect to represent triples of `Value`s
|
||||
|
@ -147,6 +156,9 @@ public:
|
|||
StringRef kernel();
|
||||
/// The number of operands passed to the kernel function.
|
||||
unsigned getNumKernelOperands();
|
||||
/// The name of the kernel module specified by the operation's `kernel_module`
|
||||
/// attribute.
|
||||
StringRef getKernelModuleName();
|
||||
/// The i-th operand passed to the kernel function.
|
||||
Value *getKernelOperand(unsigned i);
|
||||
|
||||
|
@ -164,8 +176,17 @@ public:
|
|||
static constexpr unsigned kNumConfigOperands = 6;
|
||||
|
||||
private:
|
||||
/// The name of the function attribute specifying the kernel to launch.
|
||||
// This needs to quietly verify if attributes with names defined below are
|
||||
// present since it is run before the verifier of this op.
|
||||
friend LogicalResult GPUDialect::verifyOperationAttribute(Operation *,
|
||||
NamedAttribute);
|
||||
|
||||
/// The name of the symbolRef attribute specifying the kernel to launch.
|
||||
static StringRef getKernelAttrName() { return "kernel"; }
|
||||
|
||||
/// The name of the symbolRef attribute specifying the name of the module
|
||||
/// containing the kernel to launch.
|
||||
static StringRef getKernelModuleAttrName() { return "kernel_module"; }
|
||||
};
|
||||
|
||||
#define GET_OP_CLASSES
|
||||
|
|
|
@ -135,6 +135,12 @@ public:
|
|||
func.walk(
|
||||
[this](mlir::gpu::LaunchFuncOp op) { translateGpuLaunchCalls(op); });
|
||||
}
|
||||
|
||||
// GPU kernel modules are no longer necessary since we have a global
|
||||
// constant with the CUBIN data.
|
||||
for (auto m : llvm::make_early_inc_range(getModule().getOps<ModuleOp>()))
|
||||
if (m.getAttrOfType<UnitAttr>(gpu::GPUDialect::getKernelModuleAttrName()))
|
||||
m.erase();
|
||||
}
|
||||
|
||||
private:
|
||||
|
@ -342,11 +348,12 @@ void GpuLaunchFuncToCudaCallsPass::translateGpuLaunchCalls(
|
|||
// Emit a call to the cubin getter to retrieve a pointer to the data that
|
||||
// represents the cubin at runtime.
|
||||
// TODO(herhut): This should rather be a static global once supported.
|
||||
auto kernelFunction = getModule().lookupSymbol<FuncOp>(launchOp.kernel());
|
||||
if (!kernelFunction) {
|
||||
launchOp.emitError("missing kernel function ") << launchOp.kernel();
|
||||
return signalPassFailure();
|
||||
}
|
||||
auto kernelModule =
|
||||
getModule().lookupSymbol<ModuleOp>(launchOp.getKernelModuleName());
|
||||
assert(kernelModule && "expected a kernel module");
|
||||
auto kernelFunction = kernelModule.lookupSymbol<FuncOp>(launchOp.kernel());
|
||||
assert(kernelFunction && "expected a kernel function");
|
||||
|
||||
auto cubinGetter =
|
||||
kernelFunction.getAttrOfType<SymbolRefAttr>(kCubinGetterAnnotation);
|
||||
if (!cubinGetter) {
|
||||
|
|
|
@ -61,26 +61,21 @@ private:
|
|||
return LLVM::LLVMType::getIntNTy(llvmDialect, bits);
|
||||
}
|
||||
|
||||
// Inserts a global constant string containing `blob` into the parent module
|
||||
// of `kernelFunc` and generates the function that returns the address of the
|
||||
// first character of this string.
|
||||
// Inserts a global constant string containing `blob` into the grand-parent
|
||||
// module of `kernelFunc` and generates the function that returns the address
|
||||
// of the first character of this string. Returns the generator function.
|
||||
// TODO(herhut): consider fusing this pass with launch-func-to-cuda.
|
||||
void generate(FuncOp kernelFunc, StringAttr blob) {
|
||||
auto stubFunc = getModule().lookupSymbol<FuncOp>(kernelFunc.getName());
|
||||
if (!stubFunc) {
|
||||
kernelFunc.emitError(
|
||||
"corresponding external function not found in parent module");
|
||||
return signalPassFailure();
|
||||
}
|
||||
|
||||
Location loc = stubFunc.getLoc();
|
||||
SmallString<128> nameBuffer(stubFunc.getName());
|
||||
auto module = stubFunc.getParentOfType<ModuleOp>();
|
||||
assert(module && "function must belong to a module");
|
||||
FuncOp generate(FuncOp kernelFunc, StringAttr blob) {
|
||||
Location loc = kernelFunc.getLoc();
|
||||
SmallString<128> nameBuffer(kernelFunc.getName());
|
||||
ModuleOp module = getModule();
|
||||
assert(kernelFunc.getParentOp() &&
|
||||
kernelFunc.getParentOp()->getParentOp() == module &&
|
||||
"expected one level of module nesting");
|
||||
|
||||
// Insert the getter function just after the original function.
|
||||
OpBuilder moduleBuilder(module.getBody(), module.getBody()->begin());
|
||||
moduleBuilder.setInsertionPointAfter(stubFunc.getOperation());
|
||||
OpBuilder moduleBuilder(module.getBody());
|
||||
moduleBuilder.setInsertionPointAfter(kernelFunc.getParentOp());
|
||||
auto getterType = moduleBuilder.getFunctionType(
|
||||
llvm::None, LLVM::LLVMType::getInt8PtrTy(llvmDialect));
|
||||
nameBuffer.append(kCubinGetterSuffix);
|
||||
|
@ -89,7 +84,7 @@ private:
|
|||
Block *entryBlock = result.addEntryBlock();
|
||||
|
||||
// Drop the getter suffix before appending the storage suffix.
|
||||
nameBuffer.resize(stubFunc.getName().size());
|
||||
nameBuffer.resize(kernelFunc.getName().size());
|
||||
nameBuffer.append(kCubinStorageSuffix);
|
||||
|
||||
// Obtain the address of the first character of the global string containing
|
||||
|
@ -98,25 +93,29 @@ private:
|
|||
Value *startPtr = LLVM::createGlobalString(
|
||||
loc, builder, StringRef(nameBuffer), blob.getValue(), llvmDialect);
|
||||
builder.create<LLVM::ReturnOp>(loc, startPtr);
|
||||
|
||||
// Store the name of the getter on the function for easier lookup.
|
||||
stubFunc.setAttr(kCubinGetterAnnotation, builder.getSymbolRefAttr(result));
|
||||
return result;
|
||||
}
|
||||
|
||||
public:
|
||||
void runOnModule() override {
|
||||
llvmDialect = getContext().getRegisteredDialect<LLVM::LLVMDialect>();
|
||||
|
||||
auto modules = getModule().getOps<ModuleOp>();
|
||||
for (auto module : llvm::make_early_inc_range(modules)) {
|
||||
for (auto module : getModule().getOps<ModuleOp>()) {
|
||||
if (!module.getAttrOfType<UnitAttr>(
|
||||
gpu::GPUDialect::getKernelModuleAttrName()))
|
||||
continue;
|
||||
for (auto func : module.getOps<FuncOp>()) {
|
||||
if (StringAttr blob = func.getAttrOfType<StringAttr>(kCubinAnnotation))
|
||||
generate(func, blob);
|
||||
if (StringAttr blob =
|
||||
func.getAttrOfType<StringAttr>(kCubinAnnotation)) {
|
||||
FuncOp getter = generate(func, blob);
|
||||
|
||||
// Store the name of the getter on the function for easier lookup and
|
||||
// remove the CUBIN.
|
||||
func.setAttr(kCubinGetterAnnotation,
|
||||
SymbolRefAttr::get(getter.getName(), func.getContext()));
|
||||
func.removeAttr(kCubinAnnotation);
|
||||
}
|
||||
}
|
||||
module.erase();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -121,7 +121,7 @@ void GPUToSPIRVPass::runOnModule() {
|
|||
auto module = getModule();
|
||||
|
||||
SmallVector<Operation *, 4> spirvModules;
|
||||
for (auto funcOp : module.getOps<FuncOp>()) {
|
||||
module.walk([&module, &spirvModules](FuncOp funcOp) {
|
||||
if (gpu::GPUDialect::isKernel(funcOp)) {
|
||||
OpBuilder builder(module.getBodyRegion());
|
||||
// Create a new spirv::ModuleOp for this function, and clone the
|
||||
|
@ -139,7 +139,7 @@ void GPUToSPIRVPass::runOnModule() {
|
|||
moduleBuilder.clone(*funcOp.getOperation());
|
||||
spirvModules.push_back(spvModule);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
/// Dialect conversion to lower the functions with the spirv::ModuleOps.
|
||||
SPIRVBasicTypeConverter basicTypeConverter;
|
||||
|
|
|
@ -31,6 +31,10 @@
|
|||
using namespace mlir;
|
||||
using namespace mlir::gpu;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// GPUDialect
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
StringRef GPUDialect::getDialectName() { return "gpu"; }
|
||||
|
||||
bool GPUDialect::isKernel(FuncOp function) {
|
||||
|
@ -47,6 +51,78 @@ GPUDialect::GPUDialect(MLIRContext *context)
|
|||
>();
|
||||
}
|
||||
|
||||
LogicalResult GPUDialect::verifyOperationAttribute(Operation *op,
|
||||
NamedAttribute attr) {
|
||||
if (!attr.second.isa<UnitAttr>() ||
|
||||
!attr.first.is(getContainerModuleAttrName()))
|
||||
return success();
|
||||
|
||||
auto module = dyn_cast<ModuleOp>(op);
|
||||
if (!module)
|
||||
return op->emitError("expected '")
|
||||
<< getContainerModuleAttrName() << "' attribute to be attached to '"
|
||||
<< ModuleOp::getOperationName() << '\'';
|
||||
|
||||
auto walkResult = module.walk([&module](LaunchFuncOp launchOp) -> WalkResult {
|
||||
// Ignore launches that are nested more or less deep than functions in the
|
||||
// module we are currently checking.
|
||||
if (!launchOp.getParentOp() ||
|
||||
launchOp.getParentOp()->getParentOp() != module)
|
||||
return success();
|
||||
|
||||
// Ignore launch ops with missing attributes here. The errors will be
|
||||
// reported by the verifiers of those ops.
|
||||
if (!launchOp.getAttrOfType<StringAttr>(
|
||||
LaunchFuncOp::getKernelAttrName()) ||
|
||||
!launchOp.getAttrOfType<SymbolRefAttr>(
|
||||
LaunchFuncOp::getKernelModuleAttrName()))
|
||||
return success();
|
||||
|
||||
// Check that `launch_func` refers to a well-formed GPU kernel module.
|
||||
StringRef kernelModuleName = launchOp.getKernelModuleName();
|
||||
auto kernelModule = module.lookupSymbol<ModuleOp>(kernelModuleName);
|
||||
if (!kernelModule)
|
||||
return launchOp.emitOpError()
|
||||
<< "kernel module '" << kernelModuleName << "' is undefined";
|
||||
if (!kernelModule.getAttrOfType<UnitAttr>(
|
||||
GPUDialect::getKernelModuleAttrName()))
|
||||
return launchOp.emitOpError("module '")
|
||||
<< kernelModuleName << "' is missing the '"
|
||||
<< GPUDialect::getKernelModuleAttrName() << "' attribute";
|
||||
|
||||
// Check that `launch_func` refers to a well-formed kernel function.
|
||||
StringRef kernelName = launchOp.kernel();
|
||||
auto kernelFunction = kernelModule.lookupSymbol<FuncOp>(kernelName);
|
||||
if (!kernelFunction)
|
||||
return launchOp.emitOpError("kernel function '")
|
||||
<< kernelName << "' is undefined";
|
||||
if (!kernelFunction.getAttrOfType<mlir::UnitAttr>(
|
||||
GPUDialect::getKernelFuncAttrName()))
|
||||
return launchOp.emitOpError("kernel function is missing the '")
|
||||
<< GPUDialect::getKernelFuncAttrName() << "' attribute";
|
||||
if (launchOp.getNumKernelOperands() != kernelFunction.getNumArguments())
|
||||
return launchOp.emitOpError("got ") << launchOp.getNumKernelOperands()
|
||||
<< " kernel operands but expected "
|
||||
<< kernelFunction.getNumArguments();
|
||||
|
||||
// Due to the ordering of the current impl of lowering and LLVMLowering,
|
||||
// type checks need to be temporarily disabled.
|
||||
// TODO(ntv,zinenko,herhut): reactivate checks once "changing gpu.launchFunc
|
||||
// to encode target module" has landed.
|
||||
// auto functionType = kernelFunc.getType();
|
||||
// for (unsigned i = 0; i < numKernelFuncArgs; ++i) {
|
||||
// if (getKernelOperand(i)->getType() != functionType.getInput(i)) {
|
||||
// return emitOpError("type of function argument ")
|
||||
// << i << " does not match";
|
||||
// }
|
||||
// }
|
||||
|
||||
return success();
|
||||
});
|
||||
|
||||
return walkResult.wasInterrupted() ? failure() : success();
|
||||
}
|
||||
|
||||
template <typename T> static LogicalResult verifyIndexOp(T op) {
|
||||
auto dimension = op.dimension();
|
||||
if (dimension != "x" && dimension != "y" && dimension != "z")
|
||||
|
@ -394,7 +470,11 @@ void LaunchFuncOp::build(Builder *builder, OperationState &result,
|
|||
{gridSizeX, gridSizeY, gridSizeZ, blockSizeX, blockSizeY, blockSizeZ});
|
||||
result.addOperands(kernelOperands);
|
||||
result.addAttribute(getKernelAttrName(),
|
||||
builder->getSymbolRefAttr(kernelFunc));
|
||||
builder->getStringAttr(kernelFunc.getName()));
|
||||
auto kernelModule = kernelFunc.getParentOfType<ModuleOp>();
|
||||
if (Optional<StringRef> kernelModuleName = kernelModule.getName())
|
||||
result.addAttribute(getKernelModuleAttrName(),
|
||||
builder->getSymbolRefAttr(*kernelModuleName));
|
||||
}
|
||||
|
||||
void LaunchFuncOp::build(Builder *builder, OperationState &result,
|
||||
|
@ -406,13 +486,17 @@ void LaunchFuncOp::build(Builder *builder, OperationState &result,
|
|||
}
|
||||
|
||||
StringRef LaunchFuncOp::kernel() {
|
||||
return getAttrOfType<SymbolRefAttr>(getKernelAttrName()).getValue();
|
||||
return getAttrOfType<StringAttr>(getKernelAttrName()).getValue();
|
||||
}
|
||||
|
||||
unsigned LaunchFuncOp::getNumKernelOperands() {
|
||||
return getNumOperands() - kNumConfigOperands;
|
||||
}
|
||||
|
||||
StringRef LaunchFuncOp::getKernelModuleName() {
|
||||
return getAttrOfType<SymbolRefAttr>(getKernelModuleAttrName()).getValue();
|
||||
}
|
||||
|
||||
Value *LaunchFuncOp::getKernelOperand(unsigned i) {
|
||||
return getOperation()->getOperand(i + kNumConfigOperands);
|
||||
}
|
||||
|
@ -426,39 +510,25 @@ KernelDim3 LaunchFuncOp::getBlockSizeOperandValues() {
|
|||
}
|
||||
|
||||
LogicalResult LaunchFuncOp::verify() {
|
||||
auto kernelAttr = this->getAttr(getKernelAttrName());
|
||||
if (!kernelAttr) {
|
||||
return emitOpError("attribute 'kernel' must be specified");
|
||||
} else if (!kernelAttr.isa<SymbolRefAttr>()) {
|
||||
return emitOpError("attribute 'kernel' must be a function");
|
||||
}
|
||||
|
||||
auto module = getParentOfType<ModuleOp>();
|
||||
FuncOp kernelFunc = module.lookupSymbol<FuncOp>(kernel());
|
||||
if (!kernelFunc)
|
||||
return emitOpError("kernel function '") << kernelAttr << "' is undefined";
|
||||
if (!module)
|
||||
return emitOpError("expected to belong to a module");
|
||||
|
||||
if (!module.getAttrOfType<UnitAttr>(GPUDialect::getContainerModuleAttrName()))
|
||||
return emitOpError("expected the closest surrounding module to have the '" +
|
||||
GPUDialect::getContainerModuleAttrName() +
|
||||
"' attribute");
|
||||
|
||||
auto kernelAttr = getAttrOfType<StringAttr>(getKernelAttrName());
|
||||
if (!kernelAttr)
|
||||
return emitOpError("string attribute '" + getKernelAttrName() +
|
||||
"' must be specified");
|
||||
|
||||
auto kernelModuleAttr =
|
||||
getAttrOfType<SymbolRefAttr>(getKernelModuleAttrName());
|
||||
if (!kernelModuleAttr)
|
||||
return emitOpError("symbol reference attribute '" +
|
||||
getKernelModuleAttrName() + "' must be specified");
|
||||
|
||||
if (!kernelFunc.getAttrOfType<mlir::UnitAttr>(
|
||||
GPUDialect::getKernelFuncAttrName())) {
|
||||
return emitOpError("kernel function is missing the '")
|
||||
<< GPUDialect::getKernelFuncAttrName() << "' attribute";
|
||||
}
|
||||
unsigned numKernelFuncArgs = kernelFunc.getNumArguments();
|
||||
if (getNumKernelOperands() != numKernelFuncArgs) {
|
||||
return emitOpError("got ")
|
||||
<< getNumKernelOperands() << " kernel operands but expected "
|
||||
<< numKernelFuncArgs;
|
||||
}
|
||||
// Due to the ordering of the current impl of lowering and LLVMLowering, type
|
||||
// checks need to be temporarily disabled.
|
||||
// TODO(ntv,zinenko,herhut): reactivate checks once "changing gpu.launchFunc
|
||||
// to encode target module" has landed.
|
||||
// auto functionType = kernelFunc.getType();
|
||||
// for (unsigned i = 0; i < numKernelFuncArgs; ++i) {
|
||||
// if (getKernelOperand(i)->getType() != functionType.getInput(i)) {
|
||||
// return emitOpError("type of function argument ")
|
||||
// << i << " does not match";
|
||||
// }
|
||||
// }
|
||||
return success();
|
||||
}
|
||||
|
|
|
@ -144,27 +144,30 @@ class GpuKernelOutliningPass : public ModulePass<GpuKernelOutliningPass> {
|
|||
public:
|
||||
void runOnModule() override {
|
||||
ModuleManager moduleManager(getModule());
|
||||
bool modified = false;
|
||||
for (auto func : getModule().getOps<FuncOp>()) {
|
||||
// Insert just after the function.
|
||||
Block::iterator insertPt(func.getOperation()->getNextNode());
|
||||
func.walk([&](gpu::LaunchOp op) {
|
||||
FuncOp outlinedFunc = outlineKernelFunc(op);
|
||||
|
||||
// Potentially renames outlinedFunc to make symbol unique.
|
||||
moduleManager.insert(insertPt, outlinedFunc);
|
||||
// Create nested module and insert outlinedFunc. The module will
|
||||
// originally get the same name as the function, but may be renamed on
|
||||
// insertion into the parent module.
|
||||
auto kernelModule = createKernelModule(outlinedFunc, moduleManager);
|
||||
moduleManager.insert(insertPt, kernelModule);
|
||||
|
||||
// Potentially changes signature, pulling in constants.
|
||||
convertToLaunchFuncOp(op, outlinedFunc);
|
||||
|
||||
// Create clone and move body from outlinedFunc.
|
||||
auto kernelFunc = outlinedFunc.cloneWithoutRegions();
|
||||
kernelFunc.getBody().takeBody(outlinedFunc.getBody());
|
||||
|
||||
// Create nested module and insert kernelFunc.
|
||||
auto kernelModule = createKernelModule(kernelFunc, moduleManager);
|
||||
getModule().insert(insertPt, kernelModule);
|
||||
modified = true;
|
||||
});
|
||||
}
|
||||
|
||||
// If any new module was inserted in this module, annotate this module as
|
||||
// a container module.
|
||||
if (modified)
|
||||
getModule().setAttr(gpu::GPUDialect::getContainerModuleAttrName(),
|
||||
UnitAttr::get(&getContext()));
|
||||
}
|
||||
|
||||
private:
|
||||
|
@ -172,9 +175,11 @@ private:
|
|||
ModuleOp createKernelModule(FuncOp kernelFunc,
|
||||
const ModuleManager &parentModuleManager) {
|
||||
auto context = getModule().getContext();
|
||||
auto kernelModule = ModuleOp::create(UnknownLoc::get(context));
|
||||
Builder builder(context);
|
||||
auto kernelModule =
|
||||
ModuleOp::create(builder.getUnknownLoc(), kernelFunc.getName());
|
||||
kernelModule.setAttr(gpu::GPUDialect::getKernelModuleAttrName(),
|
||||
UnitAttr::get(context));
|
||||
builder.getUnitAttr());
|
||||
ModuleManager moduleManager(kernelModule);
|
||||
|
||||
llvm::SmallVector<FuncOp, 8> funcsToInsert = {kernelFunc};
|
||||
|
|
|
@ -1,15 +1,15 @@
|
|||
// RUN: mlir-opt %s --generate-cubin-accessors | FileCheck %s
|
||||
|
||||
module attributes {gpu.container_module} {
|
||||
|
||||
// CHECK: llvm.mlir.global constant @[[global:.*]]("CUBIN")
|
||||
|
||||
module attributes {gpu.kernel_module} {
|
||||
func @kernel(!llvm.float, !llvm<"float*">)
|
||||
attributes {nvvm.cubin = "CUBIN"}
|
||||
}
|
||||
|
||||
func @kernel(!llvm.float, !llvm<"float*">)
|
||||
// CHECK: attributes {gpu.kernel, nvvm.cubingetter = @[[getter:.*]]}
|
||||
attributes {gpu.kernel}
|
||||
module attributes {gpu.kernel_module} {
|
||||
// CHECK-LABEL: func @kernel
|
||||
func @kernel(!llvm.float, !llvm<"float*">)
|
||||
// CHECK: attributes {nvvm.cubingetter = @[[getter:.*]]}
|
||||
attributes {nvvm.cubin = "CUBIN"}
|
||||
}
|
||||
|
||||
// CHECK: func @[[getter]]() -> !llvm<"i8*">
|
||||
// CHECK: %[[addressof:.*]] = llvm.mlir.addressof @[[global]]
|
||||
|
@ -17,3 +17,4 @@ func @kernel(!llvm.float, !llvm<"float*">)
|
|||
// CHECK: %[[gep:.*]] = llvm.getelementptr %[[addressof]][%[[c0]], %[[c0]]]
|
||||
// CHECK-SAME: -> !llvm<"i8*">
|
||||
// CHECK: llvm.return %[[gep]] : !llvm<"i8*">
|
||||
}
|
||||
|
|
|
@ -1,27 +1,33 @@
|
|||
// RUN: mlir-opt %s --launch-func-to-cuda | FileCheck %s
|
||||
|
||||
// CHECK: llvm.mlir.global constant @[[kernel_name:.*]]("kernel\00")
|
||||
module attributes {gpu.container_module} {
|
||||
|
||||
func @cubin_getter() -> !llvm<"i8*">
|
||||
// CHECK: llvm.mlir.global constant @[[kernel_name:.*]]("kernel\00")
|
||||
|
||||
func @kernel(!llvm.float, !llvm<"float*">)
|
||||
attributes { gpu.kernel, nvvm.cubingetter = @cubin_getter }
|
||||
func @cubin_getter() -> !llvm<"i8*">
|
||||
|
||||
module @kernel_module attributes {gpu.kernel_module} {
|
||||
func @kernel(!llvm.float, !llvm<"float*">)
|
||||
attributes { gpu.kernel, nvvm.cubingetter = @cubin_getter }
|
||||
}
|
||||
|
||||
|
||||
func @foo() {
|
||||
%0 = "op"() : () -> (!llvm.float)
|
||||
%1 = "op"() : () -> (!llvm<"float*">)
|
||||
%cst = constant 8 : index
|
||||
func @foo() {
|
||||
%0 = "op"() : () -> (!llvm.float)
|
||||
%1 = "op"() : () -> (!llvm<"float*">)
|
||||
%cst = constant 8 : index
|
||||
|
||||
// CHECK: [[module_ptr:%.*]] = llvm.alloca {{.*}} x !llvm<"i8*"> : (!llvm.i32) -> !llvm<"i8**">
|
||||
// CHECK: llvm.call @mcuModuleLoad([[module_ptr]], {{.*}}) : (!llvm<"i8**">, !llvm<"i8*">) -> !llvm.i32
|
||||
// CHECK: [[func_ptr:%.*]] = llvm.alloca {{.*}} x !llvm<"i8*"> : (!llvm.i32) -> !llvm<"i8**">
|
||||
// CHECK: llvm.call @mcuModuleGetFunction([[func_ptr]], {{.*}}, {{.*}}) : (!llvm<"i8**">, !llvm<"i8*">, !llvm<"i8*">) -> !llvm.i32
|
||||
// CHECK: llvm.call @mcuGetStreamHelper
|
||||
// CHECK: llvm.call @mcuLaunchKernel
|
||||
// CHECK: llvm.call @mcuStreamSynchronize
|
||||
"gpu.launch_func"(%cst, %cst, %cst, %cst, %cst, %cst, %0, %1) { kernel = @kernel }
|
||||
: (index, index, index, index, index, index, !llvm.float, !llvm<"float*">) -> ()
|
||||
// CHECK: [[module_ptr:%.*]] = llvm.alloca {{.*}} x !llvm<"i8*"> : (!llvm.i32) -> !llvm<"i8**">
|
||||
// CHECK: llvm.call @mcuModuleLoad([[module_ptr]], {{.*}}) : (!llvm<"i8**">, !llvm<"i8*">) -> !llvm.i32
|
||||
// CHECK: [[func_ptr:%.*]] = llvm.alloca {{.*}} x !llvm<"i8*"> : (!llvm.i32) -> !llvm<"i8**">
|
||||
// CHECK: llvm.call @mcuModuleGetFunction([[func_ptr]], {{.*}}, {{.*}}) : (!llvm<"i8**">, !llvm<"i8*">, !llvm<"i8*">) -> !llvm.i32
|
||||
// CHECK: llvm.call @mcuGetStreamHelper
|
||||
// CHECK: llvm.call @mcuLaunchKernel
|
||||
// CHECK: llvm.call @mcuStreamSynchronize
|
||||
"gpu.launch_func"(%cst, %cst, %cst, %cst, %cst, %cst, %0, %1) { kernel = "kernel", kernel_module = @kernel_module }
|
||||
: (index, index, index, index, index, index, !llvm.float, !llvm<"float*">) -> ()
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
|
|
@ -1,113 +1,137 @@
|
|||
// RUN: mlir-opt -split-input-file -convert-gpu-to-spirv %s -o - | FileCheck %s
|
||||
|
||||
func @builtin() {
|
||||
%c0 = constant 1 : index
|
||||
"gpu.launch_func"(%c0, %c0, %c0, %c0, %c0, %c0) {kernel = @builtin_workgroup_id_x} : (index, index, index, index, index, index) -> ()
|
||||
return
|
||||
}
|
||||
module attributes {gpu.container_module} {
|
||||
func @builtin() {
|
||||
%c0 = constant 1 : index
|
||||
"gpu.launch_func"(%c0, %c0, %c0, %c0, %c0, %c0) {kernel = "builtin_workgroup_id_x", kernel_module = @kernels} : (index, index, index, index, index, index) -> ()
|
||||
return
|
||||
}
|
||||
|
||||
// CHECK-LABEL: spv.module "Logical" "GLSL450"
|
||||
// CHECK: spv.globalVariable [[WORKGROUPID:@.*]] built_in("WorkgroupId")
|
||||
func @builtin_workgroup_id_x()
|
||||
attributes {gpu.kernel} {
|
||||
// CHECK: [[ADDRESS:%.*]] = spv._address_of [[WORKGROUPID]]
|
||||
// CHECK-NEXT: [[VEC:%.*]] = spv.Load "Input" [[ADDRESS]]
|
||||
// CHECK-NEXT: {{%.*}} = spv.CompositeExtract [[VEC]]{{\[}}0 : i32{{\]}}
|
||||
%0 = "gpu.block_id"() {dimension = "x"} : () -> index
|
||||
return
|
||||
// CHECK-LABEL: spv.module "Logical" "GLSL450"
|
||||
// CHECK: spv.globalVariable [[WORKGROUPID:@.*]] built_in("WorkgroupId")
|
||||
module @kernels attributes {gpu.kernel_module} {
|
||||
func @builtin_workgroup_id_x()
|
||||
attributes {gpu.kernel} {
|
||||
// CHECK: [[ADDRESS:%.*]] = spv._address_of [[WORKGROUPID]]
|
||||
// CHECK-NEXT: [[VEC:%.*]] = spv.Load "Input" [[ADDRESS]]
|
||||
// CHECK-NEXT: {{%.*}} = spv.CompositeExtract [[VEC]]{{\[}}0 : i32{{\]}}
|
||||
%0 = "gpu.block_id"() {dimension = "x"} : () -> index
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
func @builtin() {
|
||||
%c0 = constant 1 : index
|
||||
"gpu.launch_func"(%c0, %c0, %c0, %c0, %c0, %c0) {kernel = @builtin_workgroup_id_y} : (index, index, index, index, index, index) -> ()
|
||||
return
|
||||
}
|
||||
module attributes {gpu.container_module} {
|
||||
func @builtin() {
|
||||
%c0 = constant 1 : index
|
||||
"gpu.launch_func"(%c0, %c0, %c0, %c0, %c0, %c0) {kernel = "builtin_workgroup_id_y", kernel_module = @kernels} : (index, index, index, index, index, index) -> ()
|
||||
return
|
||||
}
|
||||
|
||||
// CHECK-LABEL: spv.module "Logical" "GLSL450"
|
||||
// CHECK: spv.globalVariable [[WORKGROUPID:@.*]] built_in("WorkgroupId")
|
||||
func @builtin_workgroup_id_y()
|
||||
attributes {gpu.kernel} {
|
||||
// CHECK: [[ADDRESS:%.*]] = spv._address_of [[WORKGROUPID]]
|
||||
// CHECK-NEXT: [[VEC:%.*]] = spv.Load "Input" [[ADDRESS]]
|
||||
// CHECK-NEXT: {{%.*}} = spv.CompositeExtract [[VEC]]{{\[}}1 : i32{{\]}}
|
||||
%0 = "gpu.block_id"() {dimension = "y"} : () -> index
|
||||
return
|
||||
// CHECK-LABEL: spv.module "Logical" "GLSL450"
|
||||
// CHECK: spv.globalVariable [[WORKGROUPID:@.*]] built_in("WorkgroupId")
|
||||
module @kernels attributes {gpu.kernel_module} {
|
||||
func @builtin_workgroup_id_y()
|
||||
attributes {gpu.kernel} {
|
||||
// CHECK: [[ADDRESS:%.*]] = spv._address_of [[WORKGROUPID]]
|
||||
// CHECK-NEXT: [[VEC:%.*]] = spv.Load "Input" [[ADDRESS]]
|
||||
// CHECK-NEXT: {{%.*}} = spv.CompositeExtract [[VEC]]{{\[}}1 : i32{{\]}}
|
||||
%0 = "gpu.block_id"() {dimension = "y"} : () -> index
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
func @builtin() {
|
||||
%c0 = constant 1 : index
|
||||
"gpu.launch_func"(%c0, %c0, %c0, %c0, %c0, %c0) {kernel = @builtin_workgroup_id_z} : (index, index, index, index, index, index) -> ()
|
||||
return
|
||||
}
|
||||
module attributes {gpu.container_module} {
|
||||
func @builtin() {
|
||||
%c0 = constant 1 : index
|
||||
"gpu.launch_func"(%c0, %c0, %c0, %c0, %c0, %c0) {kernel = "builtin_workgroup_id_z", kernel_module = @kernels} : (index, index, index, index, index, index) -> ()
|
||||
return
|
||||
}
|
||||
|
||||
// CHECK-LABEL: spv.module "Logical" "GLSL450"
|
||||
// CHECK: spv.globalVariable [[WORKGROUPID:@.*]] built_in("WorkgroupId")
|
||||
func @builtin_workgroup_id_z()
|
||||
attributes {gpu.kernel} {
|
||||
// CHECK: [[ADDRESS:%.*]] = spv._address_of [[WORKGROUPID]]
|
||||
// CHECK-NEXT: [[VEC:%.*]] = spv.Load "Input" [[ADDRESS]]
|
||||
// CHECK-NEXT: {{%.*}} = spv.CompositeExtract [[VEC]]{{\[}}2 : i32{{\]}}
|
||||
%0 = "gpu.block_id"() {dimension = "z"} : () -> index
|
||||
return
|
||||
// CHECK-LABEL: spv.module "Logical" "GLSL450"
|
||||
// CHECK: spv.globalVariable [[WORKGROUPID:@.*]] built_in("WorkgroupId")
|
||||
module @kernels attributes {gpu.kernel_module} {
|
||||
func @builtin_workgroup_id_z()
|
||||
attributes {gpu.kernel} {
|
||||
// CHECK: [[ADDRESS:%.*]] = spv._address_of [[WORKGROUPID]]
|
||||
// CHECK-NEXT: [[VEC:%.*]] = spv.Load "Input" [[ADDRESS]]
|
||||
// CHECK-NEXT: {{%.*}} = spv.CompositeExtract [[VEC]]{{\[}}2 : i32{{\]}}
|
||||
%0 = "gpu.block_id"() {dimension = "z"} : () -> index
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
func @builtin() {
|
||||
%c0 = constant 1 : index
|
||||
"gpu.launch_func"(%c0, %c0, %c0, %c0, %c0, %c0) {kernel = @builtin_workgroup_size_x} : (index, index, index, index, index, index) -> ()
|
||||
return
|
||||
}
|
||||
module attributes {gpu.container_module} {
|
||||
func @builtin() {
|
||||
%c0 = constant 1 : index
|
||||
"gpu.launch_func"(%c0, %c0, %c0, %c0, %c0, %c0) {kernel = "builtin_workgroup_size_x", kernel_module = @kernels} : (index, index, index, index, index, index) -> ()
|
||||
return
|
||||
}
|
||||
|
||||
// CHECK-LABEL: spv.module "Logical" "GLSL450"
|
||||
// CHECK: spv.globalVariable [[WORKGROUPSIZE:@.*]] built_in("WorkgroupSize")
|
||||
func @builtin_workgroup_size_x()
|
||||
attributes {gpu.kernel} {
|
||||
// CHECK: [[ADDRESS:%.*]] = spv._address_of [[WORKGROUPSIZE]]
|
||||
// CHECK-NEXT: [[VEC:%.*]] = spv.Load "Input" [[ADDRESS]]
|
||||
// CHECK-NEXT: {{%.*}} = spv.CompositeExtract [[VEC]]{{\[}}0 : i32{{\]}}
|
||||
%0 = "gpu.block_dim"() {dimension = "x"} : () -> index
|
||||
return
|
||||
// CHECK-LABEL: spv.module "Logical" "GLSL450"
|
||||
// CHECK: spv.globalVariable [[WORKGROUPSIZE:@.*]] built_in("WorkgroupSize")
|
||||
module @kernels attributes {gpu.kernel_module} {
|
||||
func @builtin_workgroup_size_x()
|
||||
attributes {gpu.kernel} {
|
||||
// CHECK: [[ADDRESS:%.*]] = spv._address_of [[WORKGROUPSIZE]]
|
||||
// CHECK-NEXT: [[VEC:%.*]] = spv.Load "Input" [[ADDRESS]]
|
||||
// CHECK-NEXT: {{%.*}} = spv.CompositeExtract [[VEC]]{{\[}}0 : i32{{\]}}
|
||||
%0 = "gpu.block_dim"() {dimension = "x"} : () -> index
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
func @builtin() {
|
||||
%c0 = constant 1 : index
|
||||
"gpu.launch_func"(%c0, %c0, %c0, %c0, %c0, %c0) {kernel = @builtin_local_id_x} : (index, index, index, index, index, index) -> ()
|
||||
return
|
||||
}
|
||||
module attributes {gpu.container_module} {
|
||||
func @builtin() {
|
||||
%c0 = constant 1 : index
|
||||
"gpu.launch_func"(%c0, %c0, %c0, %c0, %c0, %c0) {kernel = "builtin_local_id_x", kernel_module = @kernels} : (index, index, index, index, index, index) -> ()
|
||||
return
|
||||
}
|
||||
|
||||
// CHECK-LABEL: spv.module "Logical" "GLSL450"
|
||||
// CHECK: spv.globalVariable [[LOCALINVOCATIONID:@.*]] built_in("LocalInvocationId")
|
||||
func @builtin_local_id_x()
|
||||
attributes {gpu.kernel} {
|
||||
// CHECK: [[ADDRESS:%.*]] = spv._address_of [[LOCALINVOCATIONID]]
|
||||
// CHECK-NEXT: [[VEC:%.*]] = spv.Load "Input" [[ADDRESS]]
|
||||
// CHECK-NEXT: {{%.*}} = spv.CompositeExtract [[VEC]]{{\[}}0 : i32{{\]}}
|
||||
%0 = "gpu.thread_id"() {dimension = "x"} : () -> index
|
||||
return
|
||||
// CHECK-LABEL: spv.module "Logical" "GLSL450"
|
||||
// CHECK: spv.globalVariable [[LOCALINVOCATIONID:@.*]] built_in("LocalInvocationId")
|
||||
module @kernels attributes {gpu.kernel_module} {
|
||||
func @builtin_local_id_x()
|
||||
attributes {gpu.kernel} {
|
||||
// CHECK: [[ADDRESS:%.*]] = spv._address_of [[LOCALINVOCATIONID]]
|
||||
// CHECK-NEXT: [[VEC:%.*]] = spv.Load "Input" [[ADDRESS]]
|
||||
// CHECK-NEXT: {{%.*}} = spv.CompositeExtract [[VEC]]{{\[}}0 : i32{{\]}}
|
||||
%0 = "gpu.thread_id"() {dimension = "x"} : () -> index
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
func @builtin() {
|
||||
%c0 = constant 1 : index
|
||||
"gpu.launch_func"(%c0, %c0, %c0, %c0, %c0, %c0) {kernel = @builtin_num_workgroups_x} : (index, index, index, index, index, index) -> ()
|
||||
return
|
||||
}
|
||||
module attributes {gpu.container_module} {
|
||||
func @builtin() {
|
||||
%c0 = constant 1 : index
|
||||
"gpu.launch_func"(%c0, %c0, %c0, %c0, %c0, %c0) {kernel = "builtin_num_workgroups_x", kernel_module = @kernels} : (index, index, index, index, index, index) -> ()
|
||||
return
|
||||
}
|
||||
|
||||
// CHECK-LABEL: spv.module "Logical" "GLSL450"
|
||||
// CHECK: spv.globalVariable [[NUMWORKGROUPS:@.*]] built_in("NumWorkgroups")
|
||||
func @builtin_num_workgroups_x()
|
||||
attributes {gpu.kernel} {
|
||||
// CHECK: [[ADDRESS:%.*]] = spv._address_of [[NUMWORKGROUPS]]
|
||||
// CHECK-NEXT: [[VEC:%.*]] = spv.Load "Input" [[ADDRESS]]
|
||||
// CHECK-NEXT: {{%.*}} = spv.CompositeExtract [[VEC]]{{\[}}0 : i32{{\]}}
|
||||
%0 = "gpu.grid_dim"() {dimension = "x"} : () -> index
|
||||
return
|
||||
// CHECK-LABEL: spv.module "Logical" "GLSL450"
|
||||
// CHECK: spv.globalVariable [[NUMWORKGROUPS:@.*]] built_in("NumWorkgroups")
|
||||
module @kernels attributes {gpu.kernel_module} {
|
||||
func @builtin_num_workgroups_x()
|
||||
attributes {gpu.kernel} {
|
||||
// CHECK: [[ADDRESS:%.*]] = spv._address_of [[NUMWORKGROUPS]]
|
||||
// CHECK-NEXT: [[VEC:%.*]] = spv.Load "Input" [[ADDRESS]]
|
||||
// CHECK-NEXT: {{%.*}} = spv.CompositeExtract [[VEC]]{{\[}}0 : i32{{\]}}
|
||||
%0 = "gpu.grid_dim"() {dimension = "x"} : () -> index
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,52 +1,56 @@
|
|||
// RUN: mlir-opt -convert-gpu-to-spirv %s -o - | FileCheck %s
|
||||
|
||||
func @load_store(%arg0: memref<12x4xf32>, %arg1: memref<12x4xf32>, %arg2: memref<12x4xf32>) {
|
||||
%c0 = constant 0 : index
|
||||
%c12 = constant 12 : index
|
||||
%0 = subi %c12, %c0 : index
|
||||
%c1 = constant 1 : index
|
||||
%c0_0 = constant 0 : index
|
||||
%c4 = constant 4 : index
|
||||
%1 = subi %c4, %c0_0 : index
|
||||
%c1_1 = constant 1 : index
|
||||
%c1_2 = constant 1 : index
|
||||
"gpu.launch_func"(%0, %c1_2, %c1_2, %1, %c1_2, %c1_2, %arg0, %arg1, %arg2, %c0, %c0_0, %c1, %c1_1) {kernel = @load_store_kernel} : (index, index, index, index, index, index, memref<12x4xf32>, memref<12x4xf32>, memref<12x4xf32>, index, index, index, index) -> ()
|
||||
return
|
||||
}
|
||||
module attributes {gpu.container_module} {
|
||||
func @load_store(%arg0: memref<12x4xf32>, %arg1: memref<12x4xf32>, %arg2: memref<12x4xf32>) {
|
||||
%c0 = constant 0 : index
|
||||
%c12 = constant 12 : index
|
||||
%0 = subi %c12, %c0 : index
|
||||
%c1 = constant 1 : index
|
||||
%c0_0 = constant 0 : index
|
||||
%c4 = constant 4 : index
|
||||
%1 = subi %c4, %c0_0 : index
|
||||
%c1_1 = constant 1 : index
|
||||
%c1_2 = constant 1 : index
|
||||
"gpu.launch_func"(%0, %c1_2, %c1_2, %1, %c1_2, %c1_2, %arg0, %arg1, %arg2, %c0, %c0_0, %c1, %c1_1) {kernel = "load_store_kernel", kernel_module = @kernels} : (index, index, index, index, index, index, memref<12x4xf32>, memref<12x4xf32>, memref<12x4xf32>, index, index, index, index) -> ()
|
||||
return
|
||||
}
|
||||
|
||||
// CHECK-LABEL: spv.module "Logical" "GLSL450"
|
||||
// CHECK: spv.globalVariable {{@.*}} bind(0, 0) : [[TYPE1:!spv.ptr<!spv.array<12 x !spv.array<4 x f32>>, StorageBuffer>]]
|
||||
// CHECK-NEXT: spv.globalVariable {{@.*}} bind(0, 1) : [[TYPE2:!spv.ptr<!spv.array<12 x !spv.array<4 x f32>>, StorageBuffer>]]
|
||||
// CHECK-NEXT: spv.globalVariable {{@.*}} bind(0, 2) : [[TYPE3:!spv.ptr<!spv.array<12 x !spv.array<4 x f32>>, StorageBuffer>]]
|
||||
// CHECK: func @load_store_kernel([[ARG0:%.*]]: [[TYPE1]], [[ARG1:%.*]]: [[TYPE2]], [[ARG2:%.*]]: [[TYPE3]], [[ARG3:%.*]]: i32, [[ARG4:%.*]]: i32, [[ARG5:%.*]]: i32, [[ARG6:%.*]]: i32)
|
||||
func @load_store_kernel(%arg0: memref<12x4xf32>, %arg1: memref<12x4xf32>, %arg2: memref<12x4xf32>, %arg3: index, %arg4: index, %arg5: index, %arg6: index)
|
||||
attributes {gpu.kernel} {
|
||||
%0 = "gpu.block_id"() {dimension = "x"} : () -> index
|
||||
%1 = "gpu.block_id"() {dimension = "y"} : () -> index
|
||||
%2 = "gpu.block_id"() {dimension = "z"} : () -> index
|
||||
%3 = "gpu.thread_id"() {dimension = "x"} : () -> index
|
||||
%4 = "gpu.thread_id"() {dimension = "y"} : () -> index
|
||||
%5 = "gpu.thread_id"() {dimension = "z"} : () -> index
|
||||
%6 = "gpu.grid_dim"() {dimension = "x"} : () -> index
|
||||
%7 = "gpu.grid_dim"() {dimension = "y"} : () -> index
|
||||
%8 = "gpu.grid_dim"() {dimension = "z"} : () -> index
|
||||
%9 = "gpu.block_dim"() {dimension = "x"} : () -> index
|
||||
%10 = "gpu.block_dim"() {dimension = "y"} : () -> index
|
||||
%11 = "gpu.block_dim"() {dimension = "z"} : () -> index
|
||||
// CHECK: [[INDEX1:%.*]] = spv.IAdd [[ARG3]], {{%.*}}
|
||||
%12 = addi %arg3, %0 : index
|
||||
// CHECK: [[INDEX2:%.*]] = spv.IAdd [[ARG4]], {{%.*}}
|
||||
%13 = addi %arg4, %3 : index
|
||||
// CHECK: [[PTR1:%.*]] = spv.AccessChain [[ARG0]]{{\[}}[[INDEX1]], [[INDEX2]]{{\]}}
|
||||
// CHECK-NEXT: [[VAL1:%.*]] = spv.Load "StorageBuffer" [[PTR1]]
|
||||
%14 = load %arg0[%12, %13] : memref<12x4xf32>
|
||||
// CHECK: [[PTR2:%.*]] = spv.AccessChain [[ARG1]]{{\[}}[[INDEX1]], [[INDEX2]]{{\]}}
|
||||
// CHECK-NEXT: [[VAL2:%.*]] = spv.Load "StorageBuffer" [[PTR2]]
|
||||
%15 = load %arg1[%12, %13] : memref<12x4xf32>
|
||||
// CHECK: [[VAL3:%.*]] = spv.FAdd [[VAL1]], [[VAL2]]
|
||||
%16 = addf %14, %15 : f32
|
||||
// CHECK: [[PTR3:%.*]] = spv.AccessChain [[ARG2]]{{\[}}[[INDEX1]], [[INDEX2]]{{\]}}
|
||||
// CHECK-NEXT: spv.Store "StorageBuffer" [[PTR3]], [[VAL3]]
|
||||
store %16, %arg2[%12, %13] : memref<12x4xf32>
|
||||
return
|
||||
}
|
||||
// CHECK-LABEL: spv.module "Logical" "GLSL450"
|
||||
// CHECK: spv.globalVariable {{@.*}} bind(0, 0) : [[TYPE1:!spv.ptr<!spv.array<12 x !spv.array<4 x f32>>, StorageBuffer>]]
|
||||
// CHECK-NEXT: spv.globalVariable {{@.*}} bind(0, 1) : [[TYPE2:!spv.ptr<!spv.array<12 x !spv.array<4 x f32>>, StorageBuffer>]]
|
||||
// CHECK-NEXT: spv.globalVariable {{@.*}} bind(0, 2) : [[TYPE3:!spv.ptr<!spv.array<12 x !spv.array<4 x f32>>, StorageBuffer>]]
|
||||
// CHECK: func @load_store_kernel([[ARG0:%.*]]: [[TYPE1]], [[ARG1:%.*]]: [[TYPE2]], [[ARG2:%.*]]: [[TYPE3]], [[ARG3:%.*]]: i32, [[ARG4:%.*]]: i32, [[ARG5:%.*]]: i32, [[ARG6:%.*]]: i32)
|
||||
module @kernels attributes {gpu.kernel_module} {
|
||||
func @load_store_kernel(%arg0: memref<12x4xf32>, %arg1: memref<12x4xf32>, %arg2: memref<12x4xf32>, %arg3: index, %arg4: index, %arg5: index, %arg6: index)
|
||||
attributes {gpu.kernel} {
|
||||
%0 = "gpu.block_id"() {dimension = "x"} : () -> index
|
||||
%1 = "gpu.block_id"() {dimension = "y"} : () -> index
|
||||
%2 = "gpu.block_id"() {dimension = "z"} : () -> index
|
||||
%3 = "gpu.thread_id"() {dimension = "x"} : () -> index
|
||||
%4 = "gpu.thread_id"() {dimension = "y"} : () -> index
|
||||
%5 = "gpu.thread_id"() {dimension = "z"} : () -> index
|
||||
%6 = "gpu.grid_dim"() {dimension = "x"} : () -> index
|
||||
%7 = "gpu.grid_dim"() {dimension = "y"} : () -> index
|
||||
%8 = "gpu.grid_dim"() {dimension = "z"} : () -> index
|
||||
%9 = "gpu.block_dim"() {dimension = "x"} : () -> index
|
||||
%10 = "gpu.block_dim"() {dimension = "y"} : () -> index
|
||||
%11 = "gpu.block_dim"() {dimension = "z"} : () -> index
|
||||
// CHECK: [[INDEX1:%.*]] = spv.IAdd [[ARG3]], {{%.*}}
|
||||
%12 = addi %arg3, %0 : index
|
||||
// CHECK: [[INDEX2:%.*]] = spv.IAdd [[ARG4]], {{%.*}}
|
||||
%13 = addi %arg4, %3 : index
|
||||
// CHECK: [[PTR1:%.*]] = spv.AccessChain [[ARG0]]{{\[}}[[INDEX1]], [[INDEX2]]{{\]}}
|
||||
// CHECK-NEXT: [[VAL1:%.*]] = spv.Load "StorageBuffer" [[PTR1]]
|
||||
%14 = load %arg0[%12, %13] : memref<12x4xf32>
|
||||
// CHECK: [[PTR2:%.*]] = spv.AccessChain [[ARG1]]{{\[}}[[INDEX1]], [[INDEX2]]{{\]}}
|
||||
// CHECK-NEXT: [[VAL2:%.*]] = spv.Load "StorageBuffer" [[PTR2]]
|
||||
%15 = load %arg1[%12, %13] : memref<12x4xf32>
|
||||
// CHECK: [[VAL3:%.*]] = spv.FAdd [[VAL1]], [[VAL2]]
|
||||
%16 = addf %14, %15 : f32
|
||||
// CHECK: [[PTR3:%.*]] = spv.AccessChain [[ARG2]]{{\[}}[[INDEX1]], [[INDEX2]]{{\]}}
|
||||
// CHECK-NEXT: spv.Store "StorageBuffer" [[PTR3]], [[VAL3]]
|
||||
store %16, %arg2[%12, %13] : memref<12x4xf32>
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,21 +1,27 @@
|
|||
// RUN: mlir-opt -convert-gpu-to-spirv %s -o - | FileCheck %s
|
||||
|
||||
// CHECK: spv.module "Logical" "GLSL450" {
|
||||
// CHECK-NEXT: spv.globalVariable [[VAR1:@.*]] bind(0, 0) : !spv.ptr<f32, StorageBuffer>
|
||||
// CHECK-NEXT: spv.globalVariable [[VAR2:@.*]] bind(0, 1) : !spv.ptr<!spv.array<12 x f32>, StorageBuffer>
|
||||
// CHECK-NEXT: func @kernel_1
|
||||
// CHECK-NEXT: spv.Return
|
||||
// CHECK: spv.EntryPoint "GLCompute" @kernel_1, [[VAR1]], [[VAR2]]
|
||||
func @kernel_1(%arg0 : f32, %arg1 : memref<12xf32, 1>)
|
||||
attributes { gpu.kernel } {
|
||||
return
|
||||
}
|
||||
module attributes {gpu.container_module} {
|
||||
|
||||
func @foo() {
|
||||
%0 = "op"() : () -> (f32)
|
||||
%1 = "op"() : () -> (memref<12xf32, 1>)
|
||||
%cst = constant 1 : index
|
||||
"gpu.launch_func"(%cst, %cst, %cst, %cst, %cst, %cst, %0, %1) { kernel = @kernel_1 }
|
||||
: (index, index, index, index, index, index, f32, memref<12xf32, 1>) -> ()
|
||||
return
|
||||
}
|
||||
// CHECK: spv.module "Logical" "GLSL450" {
|
||||
// CHECK-NEXT: spv.globalVariable [[VAR1:@.*]] bind(0, 0) : !spv.ptr<f32, StorageBuffer>
|
||||
// CHECK-NEXT: spv.globalVariable [[VAR2:@.*]] bind(0, 1) : !spv.ptr<!spv.array<12 x f32>, StorageBuffer>
|
||||
// CHECK-NEXT: func @kernel_1
|
||||
// CHECK-NEXT: spv.Return
|
||||
// CHECK: spv.EntryPoint "GLCompute" @kernel_1, [[VAR1]], [[VAR2]]
|
||||
module @kernels attributes {gpu.kernel_module} {
|
||||
func @kernel_1(%arg0 : f32, %arg1 : memref<12xf32, 1>)
|
||||
attributes { gpu.kernel } {
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
func @foo() {
|
||||
%0 = "op"() : () -> (f32)
|
||||
%1 = "op"() : () -> (memref<12xf32, 1>)
|
||||
%cst = constant 1 : index
|
||||
"gpu.launch_func"(%cst, %cst, %cst, %cst, %cst, %cst, %0, %1) { kernel = "kernel_1", kernel_module = @kernels }
|
||||
: (index, index, index, index, index, index, f32, memref<12xf32, 1>) -> ()
|
||||
return
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -96,8 +96,8 @@ func @launch_func_too_few_operands(%sz : index) {
|
|||
|
||||
// -----
|
||||
|
||||
func @launch_func_missing_callee_attribute(%sz : index) {
|
||||
// expected-error@+1 {{attribute 'kernel' must be specified}}
|
||||
func @launch_func_missing_parent_module_attribute(%sz : index) {
|
||||
// expected-error@+1 {{expected the closest surrounding module to have the 'gpu.container_module' attribute}}
|
||||
"gpu.launch_func"(%sz, %sz, %sz, %sz, %sz, %sz) {foo = "bar"}
|
||||
: (index, index, index, index, index, index) -> ()
|
||||
return
|
||||
|
@ -105,54 +105,134 @@ func @launch_func_missing_callee_attribute(%sz : index) {
|
|||
|
||||
// -----
|
||||
|
||||
func @launch_func_no_function_attribute(%sz : index) {
|
||||
// expected-error@+1 {{attribute 'kernel' must be a function}}
|
||||
"gpu.launch_func"(%sz, %sz, %sz, %sz, %sz, %sz) {kernel = 10}
|
||||
: (index, index, index, index, index, index) -> ()
|
||||
return
|
||||
module attributes {gpu.container_module} {
|
||||
func @launch_func_missing_callee_attribute(%sz : index) {
|
||||
// expected-error@+1 {{string attribute 'kernel' must be specified}}
|
||||
"gpu.launch_func"(%sz, %sz, %sz, %sz, %sz, %sz) {foo = "bar"}
|
||||
: (index, index, index, index, index, index) -> ()
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
func @launch_func_undefined_function(%sz : index) {
|
||||
// expected-error@+1 {{kernel function '@kernel_1' is undefined}}
|
||||
"gpu.launch_func"(%sz, %sz, %sz, %sz, %sz, %sz) { kernel = @kernel_1 }
|
||||
: (index, index, index, index, index, index) -> ()
|
||||
return
|
||||
module attributes {gpu.container_module} {
|
||||
func @launch_func_missing_module_attribute(%sz : index) {
|
||||
// expected-error@+1 {{attribute 'kernel_module' must be specified}}
|
||||
"gpu.launch_func"(%sz, %sz, %sz, %sz, %sz, %sz) {kernel = "launch_func_missing_kernel_attr"}
|
||||
: (index, index, index, index, index, index) -> ()
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
func @kernel_1(%arg1 : !llvm<"float*">) {
|
||||
return
|
||||
}
|
||||
|
||||
func @launch_func_missing_kernel_attr(%sz : index, %arg : !llvm<"float*">) {
|
||||
// expected-error@+1 {{kernel function is missing the 'gpu.kernel' attribute}}
|
||||
"gpu.launch_func"(%sz, %sz, %sz, %sz, %sz, %sz, %arg) {kernel = @kernel_1}
|
||||
: (index, index, index, index, index, index, !llvm<"float*">) -> ()
|
||||
return
|
||||
module attributes {gpu.container_module} {
|
||||
func @launch_func_no_function_attribute(%sz : index) {
|
||||
// expected-error@+1 {{string attribute 'kernel' must be specified}}
|
||||
"gpu.launch_func"(%sz, %sz, %sz, %sz, %sz, %sz) {kernel = 10}
|
||||
: (index, index, index, index, index, index) -> ()
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
func @kernel_1(%arg1 : !llvm<"float*">) attributes { gpu.kernel } {
|
||||
return
|
||||
}
|
||||
|
||||
func @launch_func_kernel_operand_size(%sz : index, %arg : !llvm<"float*">) {
|
||||
// expected-error@+1 {{got 2 kernel operands but expected 1}}
|
||||
"gpu.launch_func"(%sz, %sz, %sz, %sz, %sz, %sz, %arg, %arg)
|
||||
{kernel = @kernel_1}
|
||||
: (index, index, index, index, index, index, !llvm<"float*">,
|
||||
!llvm<"float*">) -> ()
|
||||
return
|
||||
module attributes {gpu.container_module} {
|
||||
func @launch_func_module_attribute_wrong_type(%sz : index) {
|
||||
// expected-error@+1 {{symbol reference attribute 'kernel_module' must be specified}}
|
||||
"gpu.launch_func"(%sz, %sz, %sz, %sz, %sz, %sz)
|
||||
{kernel = "launch_func_module_attribute_wrong_type", kernel_module = 10}
|
||||
: (index, index, index, index, index, index) -> ()
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
func @kernel_1(%arg1 : !llvm<"float*">) attributes { gpu.kernel } {
|
||||
return
|
||||
module attributes {gpu.container_module} {
|
||||
func @launch_func_undefined_module(%sz : index) {
|
||||
// expected-error@+1 {{kernel module 'kernels' is undefined}}
|
||||
"gpu.launch_func"(%sz, %sz, %sz, %sz, %sz, %sz)
|
||||
{ kernel = "kernel_1", kernel_module = @kernels }
|
||||
: (index, index, index, index, index, index) -> ()
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
module attributes {gpu.container_module} {
|
||||
module @kernels {
|
||||
}
|
||||
|
||||
func @launch_func_missing_module_attirbute(%sz : index) {
|
||||
// expected-error@+1 {{module 'kernels' is missing the 'gpu.kernel_module' attribute}}
|
||||
"gpu.launch_func"(%sz, %sz, %sz, %sz, %sz, %sz)
|
||||
{ kernel = "kernel_1", kernel_module = @kernels }
|
||||
: (index, index, index, index, index, index) -> ()
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
module attributes {gpu.container_module} {
|
||||
module @kernels attributes {gpu.kernel_module} {
|
||||
}
|
||||
|
||||
func @launch_func_undefined_function(%sz : index) {
|
||||
// expected-error@+1 {{kernel function 'kernel_1' is undefined}}
|
||||
"gpu.launch_func"(%sz, %sz, %sz, %sz, %sz, %sz)
|
||||
{ kernel = "kernel_1", kernel_module = @kernels }
|
||||
: (index, index, index, index, index, index) -> ()
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
module attributes {gpu.container_module} {
|
||||
module @kernels attributes {gpu.kernel_module} {
|
||||
func @kernel_1(%arg1 : !llvm<"float*">) {
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
func @launch_func_missing_kernel_attr(%sz : index, %arg : !llvm<"float*">) {
|
||||
// expected-error@+1 {{kernel function is missing the 'gpu.kernel' attribute}}
|
||||
"gpu.launch_func"(%sz, %sz, %sz, %sz, %sz, %sz, %arg)
|
||||
{kernel = "kernel_1", kernel_module = @kernels}
|
||||
: (index, index, index, index, index, index, !llvm<"float*">) -> ()
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
module attributes {gpu.container_module} {
|
||||
module @kernels attributes {gpu.kernel_module} {
|
||||
func @kernel_1(%arg1 : !llvm<"float*">) attributes { gpu.kernel } {
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
func @launch_func_kernel_operand_size(%sz : index, %arg : !llvm<"float*">) {
|
||||
// expected-error@+1 {{got 2 kernel operands but expected 1}}
|
||||
"gpu.launch_func"(%sz, %sz, %sz, %sz, %sz, %sz, %arg, %arg)
|
||||
{kernel = "kernel_1", kernel_module = @kernels}
|
||||
: (index, index, index, index, index, index, !llvm<"float*">,
|
||||
!llvm<"float*">) -> ()
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
module @kernels attributes {gpu.kernel_module} {
|
||||
func @kernel_1(%arg1 : !llvm<"float*">) attributes { gpu.kernel } {
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// Due to the ordering of the current impl of lowering and LLVMLowering, type
|
||||
|
@ -162,7 +242,7 @@ func @kernel_1(%arg1 : !llvm<"float*">) attributes { gpu.kernel } {
|
|||
// func @launch_func_kernel_operand_types(%sz : index, %arg : f32) {
|
||||
// // expected-err@+1 {{type of function argument 0 does not match}}
|
||||
// "gpu.launch_func"(%sz, %sz, %sz, %sz, %sz, %sz, %arg)
|
||||
// {kernel = @kernel_1}
|
||||
// {kernel = "kernel_1"}
|
||||
// : (index, index, index, index, index, index, f32) -> ()
|
||||
// return
|
||||
// }
|
||||
|
|
|
@ -1,105 +1,113 @@
|
|||
// RUN: mlir-opt %s | FileCheck %s
|
||||
|
||||
// CHECK-LABEL:func @no_args(%{{.*}}: index)
|
||||
func @no_args(%sz : index) {
|
||||
// CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}})
|
||||
gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %sz, %grid_y = %sz, %grid_z = %sz)
|
||||
threads(%tx, %ty, %tz) in (%block_x = %sz, %block_y = %sz, %block_z = %sz) {
|
||||
// CHECK: gpu.return
|
||||
gpu.return
|
||||
}
|
||||
return
|
||||
}
|
||||
module attributes {gpu.container_module} {
|
||||
|
||||
// CHECK-LABEL:func @args(%{{.*}}: index, %{{.*}}: index, %{{.*}}: f32, %{{.*}}: memref<?xf32, 1>) {
|
||||
func @args(%blk : index, %thrd : index, %float : f32, %data : memref<?xf32,1>) {
|
||||
// CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) args(%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) : f32, memref<?xf32, 1>
|
||||
gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %blk, %grid_y = %blk, %grid_z = %blk)
|
||||
threads(%tx, %ty, %tz) in (%block_x = %thrd, %block_y = %thrd, %block_z = %thrd)
|
||||
args(%kernel_arg0 = %float, %kernel_arg1 = %data) : f32, memref<?xf32, 1> {
|
||||
// CHECK: gpu.return
|
||||
gpu.return
|
||||
// CHECK-LABEL:func @no_args(%{{.*}}: index)
|
||||
func @no_args(%sz : index) {
|
||||
// CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}})
|
||||
gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %sz, %grid_y = %sz, %grid_z = %sz)
|
||||
threads(%tx, %ty, %tz) in (%block_x = %sz, %block_y = %sz, %block_z = %sz) {
|
||||
// CHECK: gpu.return
|
||||
gpu.return
|
||||
}
|
||||
return
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// It is possible to use values passed into the region as arguments.
|
||||
// CHECK-LABEL: func @passing_values
|
||||
func @passing_values(%blk : index, %thrd : index, %float : f32, %data : memref<?xf32,1>) {
|
||||
// CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) args(%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) : f32, memref<?xf32, 1>
|
||||
gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %blk, %grid_y = %blk, %grid_z = %blk)
|
||||
threads(%tx, %ty, %tz) in (%block_x = %thrd, %block_y = %thrd, %block_z = %thrd)
|
||||
args(%kernel_arg0 = %float, %kernel_arg1 = %data) : f32, memref<?xf32, 1> {
|
||||
// CHECK: "use"(%{{.*}})
|
||||
"use"(%kernel_arg0): (f32) -> ()
|
||||
// CHECK: gpu.return
|
||||
gpu.return
|
||||
// CHECK-LABEL:func @args(%{{.*}}: index, %{{.*}}: index, %{{.*}}: f32, %{{.*}}: memref<?xf32, 1>) {
|
||||
func @args(%blk : index, %thrd : index, %float : f32, %data : memref<?xf32,1>) {
|
||||
// CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) args(%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) : f32, memref<?xf32, 1>
|
||||
gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %blk, %grid_y = %blk, %grid_z = %blk)
|
||||
threads(%tx, %ty, %tz) in (%block_x = %thrd, %block_y = %thrd, %block_z = %thrd)
|
||||
args(%kernel_arg0 = %float, %kernel_arg1 = %data) : f32, memref<?xf32, 1> {
|
||||
// CHECK: gpu.return
|
||||
gpu.return
|
||||
}
|
||||
return
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// It is possible to use values defined in nested regions as long as they don't
|
||||
// cross kernel launch region boundaries.
|
||||
// CHECK-LABEL: func @nested_isolation
|
||||
func @nested_isolation(%sz : index) {
|
||||
gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %sz, %grid_y = %sz, %grid_z = %sz)
|
||||
threads(%tx, %ty, %tz) in (%block_x = %sz, %block_y = %sz, %block_z = %sz) {
|
||||
"region"() ({
|
||||
// CHECK: %{{.*}} = "produce"()
|
||||
%val = "produce"() : () -> (index)
|
||||
// It is possible to use values passed into the region as arguments.
|
||||
// CHECK-LABEL: func @passing_values
|
||||
func @passing_values(%blk : index, %thrd : index, %float : f32, %data : memref<?xf32,1>) {
|
||||
// CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) args(%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) : f32, memref<?xf32, 1>
|
||||
gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %blk, %grid_y = %blk, %grid_z = %blk)
|
||||
threads(%tx, %ty, %tz) in (%block_x = %thrd, %block_y = %thrd, %block_z = %thrd)
|
||||
args(%kernel_arg0 = %float, %kernel_arg1 = %data) : f32, memref<?xf32, 1> {
|
||||
// CHECK: "use"(%{{.*}})
|
||||
"use"(%kernel_arg0): (f32) -> ()
|
||||
// CHECK: gpu.return
|
||||
gpu.return
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// It is possible to use values defined in nested regions as long as they don't
|
||||
// cross kernel launch region boundaries.
|
||||
// CHECK-LABEL: func @nested_isolation
|
||||
func @nested_isolation(%sz : index) {
|
||||
gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %sz, %grid_y = %sz, %grid_z = %sz)
|
||||
threads(%tx, %ty, %tz) in (%block_x = %sz, %block_y = %sz, %block_z = %sz) {
|
||||
"region"() ({
|
||||
// CHECK: "use"(%{{.*}})
|
||||
"use"(%val) : (index) -> ()
|
||||
// CHECK: %{{.*}} = "produce"()
|
||||
%val = "produce"() : () -> (index)
|
||||
"region"() ({
|
||||
// CHECK: "use"(%{{.*}})
|
||||
"use"(%val) : (index) -> ()
|
||||
}) : () -> ()
|
||||
}) : () -> ()
|
||||
}) : () -> ()
|
||||
// CHECK: gpu.return
|
||||
gpu.return
|
||||
// CHECK: gpu.return
|
||||
gpu.return
|
||||
}
|
||||
return
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func @kernel_1(%arg0 : f32, %arg1 : memref<?xf32, 1>)
|
||||
attributes { gpu.kernel } {
|
||||
%tIdX = "gpu.thread_id"() {dimension = "x"} : () -> (index)
|
||||
%tIdY = "gpu.thread_id"() {dimension = "y"} : () -> (index)
|
||||
%tIdZ = "gpu.thread_id"() {dimension = "z"} : () -> (index)
|
||||
|
||||
%bDimX = "gpu.block_dim"() {dimension = "x"} : () -> (index)
|
||||
%bDimY = "gpu.block_dim"() {dimension = "y"} : () -> (index)
|
||||
%bDimZ = "gpu.block_dim"() {dimension = "z"} : () -> (index)
|
||||
|
||||
%bIdX = "gpu.block_id"() {dimension = "x"} : () -> (index)
|
||||
%bIdY = "gpu.block_id"() {dimension = "y"} : () -> (index)
|
||||
%bIdZ = "gpu.block_id"() {dimension = "z"} : () -> (index)
|
||||
|
||||
%gDimX = "gpu.grid_dim"() {dimension = "x"} : () -> (index)
|
||||
%gDimY = "gpu.grid_dim"() {dimension = "y"} : () -> (index)
|
||||
%gDimZ = "gpu.grid_dim"() {dimension = "z"} : () -> (index)
|
||||
|
||||
%one = constant 1.0 : f32
|
||||
%sum = "gpu.all_reduce"(%one) : (f32) -> (f32)
|
||||
|
||||
"some_op"(%bIdX, %tIdX) : (index, index) -> ()
|
||||
%42 = load %arg1[%bIdX] : memref<?xf32, 1>
|
||||
return
|
||||
}
|
||||
|
||||
func @kernel_2(f32, memref<?xf32, 1>)
|
||||
attributes { gpu.kernel }
|
||||
|
||||
func @foo() {
|
||||
%0 = "op"() : () -> (f32)
|
||||
%1 = "op"() : () -> (memref<?xf32, 1>)
|
||||
// CHECK: %{{.*}} = constant 8
|
||||
%cst = constant 8 : index
|
||||
|
||||
// CHECK: "gpu.launch_func"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {kernel = @kernel_1} : (index, index, index, index, index, index, f32, memref<?xf32, 1>) -> ()
|
||||
"gpu.launch_func"(%cst, %cst, %cst, %cst, %cst, %cst, %0, %1) { kernel = @kernel_1 }
|
||||
: (index, index, index, index, index, index, f32, memref<?xf32, 1>) -> ()
|
||||
|
||||
// CHECK: "gpu.launch_func"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {kernel = @kernel_2} : (index, index, index, index, index, index, f32, memref<?xf32, 1>) -> ()
|
||||
"gpu.launch_func"(%cst, %cst, %cst, %cst, %cst, %cst, %0, %1) { kernel = @kernel_2 }
|
||||
: (index, index, index, index, index, index, f32, memref<?xf32, 1>) -> ()
|
||||
|
||||
return
|
||||
|
||||
module @kernels attributes {gpu.kernel_module} {
|
||||
func @kernel_1(%arg0 : f32, %arg1 : memref<?xf32, 1>)
|
||||
attributes { gpu.kernel } {
|
||||
%tIdX = "gpu.thread_id"() {dimension = "x"} : () -> (index)
|
||||
%tIdY = "gpu.thread_id"() {dimension = "y"} : () -> (index)
|
||||
%tIdZ = "gpu.thread_id"() {dimension = "z"} : () -> (index)
|
||||
|
||||
%bDimX = "gpu.block_dim"() {dimension = "x"} : () -> (index)
|
||||
%bDimY = "gpu.block_dim"() {dimension = "y"} : () -> (index)
|
||||
%bDimZ = "gpu.block_dim"() {dimension = "z"} : () -> (index)
|
||||
|
||||
%bIdX = "gpu.block_id"() {dimension = "x"} : () -> (index)
|
||||
%bIdY = "gpu.block_id"() {dimension = "y"} : () -> (index)
|
||||
%bIdZ = "gpu.block_id"() {dimension = "z"} : () -> (index)
|
||||
|
||||
%gDimX = "gpu.grid_dim"() {dimension = "x"} : () -> (index)
|
||||
%gDimY = "gpu.grid_dim"() {dimension = "y"} : () -> (index)
|
||||
%gDimZ = "gpu.grid_dim"() {dimension = "z"} : () -> (index)
|
||||
|
||||
%one = constant 1.0 : f32
|
||||
%sum = "gpu.all_reduce"(%one) : (f32) -> (f32)
|
||||
|
||||
"some_op"(%bIdX, %tIdX) : (index, index) -> ()
|
||||
%42 = load %arg1[%bIdX] : memref<?xf32, 1>
|
||||
return
|
||||
}
|
||||
|
||||
func @kernel_2(f32, memref<?xf32, 1>)
|
||||
attributes { gpu.kernel }
|
||||
}
|
||||
|
||||
func @foo() {
|
||||
%0 = "op"() : () -> (f32)
|
||||
%1 = "op"() : () -> (memref<?xf32, 1>)
|
||||
// CHECK: %{{.*}} = constant 8
|
||||
%cst = constant 8 : index
|
||||
|
||||
// CHECK: "gpu.launch_func"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {kernel = "kernel_1", kernel_module = @kernels} : (index, index, index, index, index, index, f32, memref<?xf32, 1>) -> ()
|
||||
"gpu.launch_func"(%cst, %cst, %cst, %cst, %cst, %cst, %0, %1)
|
||||
{ kernel = "kernel_1", kernel_module = @kernels }
|
||||
: (index, index, index, index, index, index, f32, memref<?xf32, 1>) -> ()
|
||||
|
||||
// CHECK: "gpu.launch_func"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {kernel = "kernel_2", kernel_module = @kernels} : (index, index, index, index, index, index, f32, memref<?xf32, 1>) -> ()
|
||||
"gpu.launch_func"(%cst, %cst, %cst, %cst, %cst, %cst, %0, %1)
|
||||
{ kernel = "kernel_2", kernel_module = @kernels }
|
||||
: (index, index, index, index, index, index, f32, memref<?xf32, 1>) -> ()
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
// RUN: mlir-opt -gpu-kernel-outlining -split-input-file -verify-diagnostics %s | FileCheck %s
|
||||
|
||||
// CHECK: module attributes {gpu.container_module}
|
||||
|
||||
// CHECK-LABEL: func @launch()
|
||||
func @launch() {
|
||||
// CHECK: %[[ARG0:.*]] = "op"() : () -> f32
|
||||
|
@ -19,7 +21,7 @@ func @launch() {
|
|||
// CHECK: %[[BDIMZ:.*]] = constant 28
|
||||
%bDimZ = constant 28 : index
|
||||
|
||||
// CHECK: "gpu.launch_func"(%[[GDIMX]], %[[GDIMY]], %[[GDIMZ]], %[[BDIMX]], %[[BDIMY]], %[[BDIMZ]], %[[ARG0]], %[[ARG1]]) {kernel = @launch_kernel} : (index, index, index, index, index, index, f32, memref<?xf32, 1>) -> ()
|
||||
// CHECK: "gpu.launch_func"(%[[GDIMX]], %[[GDIMY]], %[[GDIMZ]], %[[BDIMX]], %[[BDIMY]], %[[BDIMZ]], %[[ARG0]], %[[ARG1]]) {kernel = "launch_kernel", kernel_module = @launch_kernel} : (index, index, index, index, index, index, f32, memref<?xf32, 1>) -> ()
|
||||
// CHECK-NOT: gpu.launch blocks
|
||||
gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY,
|
||||
%grid_z = %gDimZ)
|
||||
|
@ -34,11 +36,9 @@ func @launch() {
|
|||
return
|
||||
}
|
||||
|
||||
// CHECK-LABEL: func @launch_kernel
|
||||
// CHECK-SAME: (f32, memref<?xf32, 1>)
|
||||
// CHECK-NEXT: attributes {gpu.kernel}
|
||||
|
||||
// CHECK-LABEL: func @launch_kernel
|
||||
// CHECK-LABEL: module @launch_kernel
|
||||
// CHECK-NEXT: func @launch_kernel
|
||||
// CHECK-SAME: (%[[KERNEL_ARG0:.*]]: f32, %[[KERNEL_ARG1:.*]]: memref<?xf32, 1>)
|
||||
// CHECK-NEXT: attributes {gpu.kernel}
|
||||
// CHECK-NEXT: %[[BID:.*]] = "gpu.block_id"() {dimension = "x"} : () -> index
|
||||
|
@ -59,17 +59,19 @@ func @launch() {
|
|||
|
||||
// -----
|
||||
|
||||
// CHECK: module attributes {gpu.container_module}
|
||||
|
||||
func @multiple_launches() {
|
||||
// CHECK: %[[CST:.*]] = constant 8 : index
|
||||
%cst = constant 8 : index
|
||||
// CHECK: "gpu.launch_func"(%[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]]) {kernel = @multiple_launches_kernel} : (index, index, index, index, index, index) -> ()
|
||||
// CHECK: "gpu.launch_func"(%[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]]) {kernel = "multiple_launches_kernel", kernel_module = @multiple_launches_kernel} : (index, index, index, index, index, index) -> ()
|
||||
gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst,
|
||||
%grid_z = %cst)
|
||||
threads(%tx, %ty, %tz) in (%block_x = %cst, %block_y = %cst,
|
||||
%block_z = %cst) {
|
||||
gpu.return
|
||||
}
|
||||
// CHECK: "gpu.launch_func"(%[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]]) {kernel = @multiple_launches_kernel_0} : (index, index, index, index, index, index) -> ()
|
||||
// CHECK: "gpu.launch_func"(%[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]]) {kernel = "multiple_launches_kernel", kernel_module = @multiple_launches_kernel_0} : (index, index, index, index, index, index) -> ()
|
||||
gpu.launch blocks(%bx2, %by2, %bz2) in (%grid_x2 = %cst, %grid_y2 = %cst,
|
||||
%grid_z2 = %cst)
|
||||
threads(%tx2, %ty2, %tz2) in (%block_x2 = %cst, %block_y2 = %cst,
|
||||
|
@ -79,8 +81,10 @@ func @multiple_launches() {
|
|||
return
|
||||
}
|
||||
|
||||
// CHECK: func @multiple_launches_kernel()
|
||||
// CHECK: func @multiple_launches_kernel_0()
|
||||
// CHECK: module @multiple_launches_kernel
|
||||
// CHECK: func @multiple_launches_kernel
|
||||
// CHECK: module @multiple_launches_kernel_0
|
||||
// CHECK: func @multiple_launches_kernel
|
||||
|
||||
// -----
|
||||
|
||||
|
@ -89,7 +93,7 @@ func @extra_constants(%arg0 : memref<?xf32>) {
|
|||
%cst = constant 8 : index
|
||||
%cst2 = constant 2 : index
|
||||
%cst3 = constant 3 : index
|
||||
// CHECK: "gpu.launch_func"(%[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %{{.*}}) {kernel = @extra_constants_kernel} : (index, index, index, index, index, index, memref<?xf32>) -> ()
|
||||
// CHECK: "gpu.launch_func"(%[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %{{.*}}) {kernel = "extra_constants_kernel", kernel_module = @extra_constants_kernel} : (index, index, index, index, index, index, memref<?xf32>) -> ()
|
||||
gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst,
|
||||
%grid_z = %cst)
|
||||
threads(%tx, %ty, %tz) in (%block_x = %cst, %block_y = %cst,
|
||||
|
|
Loading…
Reference in New Issue