diff --git a/mlir/g3doc/Dialects/GPU.md b/mlir/g3doc/Dialects/GPU.md
index 1c4edcb10d5a..2e0e06af2f51 100644
--- a/mlir/g3doc/Dialects/GPU.md
+++ b/mlir/g3doc/Dialects/GPU.md
@@ -120,12 +120,17 @@ to have that information by construction.
 ### `gpu.launch_func`
 
-Launch a kernel given as a function on the specified grid of thread blocks.
-`gpu.launch` operations are lowered to `gpu.launch_func` operations by outlining
-the kernel body into a function, which is closer to the NVVM model. The
-`gpu.launch_func` operation has a function attribute named `kernel` to specify
-the kernel function to launch. The kernel function itself has a `nvvm.kernel`
-attribute.
+Launch a kernel function on the specified grid of thread blocks. `gpu.launch`
+operations are lowered to `gpu.launch_func` operations by outlining the kernel
+body into a function in a dedicated module, which reflects the separate
+compilation process. The kernel function is required to have the `gpu.kernel`
+attribute. The module containing the kernel function is required to have the
+`gpu.kernel_module` attribute and must be named. Finally, the module containing
+the kernel module (which therefore cannot be the top-level module) is required
+to have the `gpu.container_module` attribute. The `gpu.launch_func` operation
+has a string attribute named `kernel` specifying the name of the kernel
+function to launch, and a symbol reference attribute named `kernel_module`
+specifying the name of the module that contains that kernel function.
 
 The operation takes at least six operands, with the first three operands being
 grid sizes along x,y,z dimensions and the following three being block sizes
@@ -138,36 +143,43 @@ A custom syntax for this operation is currently not available.
 
 Example:
 
 ```mlir {.mlir}
-func @kernel_1(%arg0 : f32, %arg1 : !llvm<"float*">)
-    attributes { nvvm.kernel: true } {
+module attributes {gpu.container_module} {
 
-  // Operations that produce block/thread IDs and dimensions are injected when
-  // outlining the `gpu.launch` body to a function called by `gpu.launch_func`.
-  %tIdX = "gpu.thread_id"() {dimension: "x"} : () -> (index)
-  %tIdY = "gpu.thread_id"() {dimension: "y"} : () -> (index)
-  %tIdZ = "gpu.thread_id"() {dimension: "z"} : () -> (index)
+  // This module creates a separate compilation unit for the GPU compiler.
+  module @kernels attributes {gpu.kernel_module} {
+    func @kernel_1(%arg0 : f32, %arg1 : !llvm<"float*">)
+        attributes { gpu.kernel } {
 
-  %bDimX = "gpu.block_dim"() {dimension: "x"} : () -> (index)
-  %bDimY = "gpu.block_dim"() {dimension: "y"} : () -> (index)
-  %bDimZ = "gpu.block_dim"() {dimension: "z"} : () -> (index)
+      // Operations that produce block/thread IDs and dimensions are injected
+      // when outlining the `gpu.launch` body to a function called by
+      // `gpu.launch_func`.
+      %tIdX = "gpu.thread_id"() {dimension = "x"} : () -> (index)
+      %tIdY = "gpu.thread_id"() {dimension = "y"} : () -> (index)
+      %tIdZ = "gpu.thread_id"() {dimension = "z"} : () -> (index)
 
-  %bIdX = "gpu.block_id"() {dimension: "x"} : () -> (index)
-  %bIdY = "gpu.block_id"() {dimension: "y"} : () -> (index)
-  %bIdZ = "gpu.block_id"() {dimension: "z"} : () -> (index)
+      %bDimX = "gpu.block_dim"() {dimension = "x"} : () -> (index)
+      %bDimY = "gpu.block_dim"() {dimension = "y"} : () -> (index)
+      %bDimZ = "gpu.block_dim"() {dimension = "z"} : () -> (index)
 
-  %gDimX = "gpu.grid_dim"() {dimension: "x"} : () -> (index)
-  %gDimY = "gpu.grid_dim"() {dimension: "y"} : () -> (index)
-  %gDimZ = "gpu.grid_dim"() {dimension: "z"} : () -> (index)
+      %bIdX = "gpu.block_id"() {dimension = "x"} : () -> (index)
+      %bIdY = "gpu.block_id"() {dimension = "y"} : () -> (index)
+      %bIdZ = "gpu.block_id"() {dimension = "z"} : () -> (index)
 
-  "some_op"(%bx, %tx) : (index, index) -> ()
-  %42 = load %arg1[%bx] : memref<?xf32>
+      %gDimX = "gpu.grid_dim"() {dimension = "x"} : () -> (index)
+      %gDimY = "gpu.grid_dim"() {dimension = "y"} : () -> (index)
+      %gDimZ = "gpu.grid_dim"() {dimension = "z"} : () -> (index)
+
+      "some_op"(%bIdX, %tIdX) : (index, index) -> ()
+      %42 = load %arg1[%bIdX] : memref<?xf32>
+    }
+  }
+
+  "gpu.launch_func"(%cst, %cst, %cst,  // Grid sizes.
+                    %cst, %cst, %cst,  // Block sizes.
+                    %arg0, %arg1)      // Arguments passed to the kernel function.
+      { kernel_module = @kernels,      // Module containing the kernel function.
+        kernel = "kernel_1" }          // Kernel function.
+      : (index, index, index, index, index, index, f32, !llvm<"float*">) -> ()
 }
-
-"gpu.launch_func"(%cst, %cst, %cst,  // Grid sizes.
-                  %cst, %cst, %cst,  // Block sizes.
-                  %arg0, %arg1)  // Arguments passed to the kernel function.
-                  {kernel: @kernel_1 : (f32, !llvm<"float*">) -> ()}  // Kernel function.
-                  : (index, index, index, index, index, index, f32, !llvm<"float*">) -> ()
 ```
 
 ### `gpu.thread_id`
diff --git a/mlir/include/mlir/Dialect/GPU/GPUDialect.h b/mlir/include/mlir/Dialect/GPU/GPUDialect.h
index 7d797a5bb055..ec47823a8503 100644
--- a/mlir/include/mlir/Dialect/GPU/GPUDialect.h
+++ b/mlir/include/mlir/Dialect/GPU/GPUDialect.h
@@ -38,6 +38,12 @@ public:
   /// Create the dialect in the given `context`.
   GPUDialect(MLIRContext *context);
 
+  /// Get the name of the attribute used to annotate the modules that contain
+  /// kernel modules.
+  static StringRef getContainerModuleAttrName() {
+    return "gpu.container_module";
+  }
+
   /// Get the canonical string name of the dialect.
   static StringRef getDialectName();
 
@@ -50,6 +56,9 @@ public:
   /// Returns whether the given function is a kernel function, i.e., has the
   /// 'gpu.kernel' attribute.
   static bool isKernel(FuncOp function);
+
+  LogicalResult verifyOperationAttribute(Operation *op,
+                                         NamedAttribute attr) override;
 };
 
 /// Utility class for the GPU dialect to represent triples of `Value`s
@@ -147,6 +156,9 @@ public:
   StringRef kernel();
   /// The number of operands passed to the kernel function.
   unsigned getNumKernelOperands();
+  /// The name of the kernel module specified by the operation's
+  /// `kernel_module` attribute.
+  StringRef getKernelModuleName();
   /// The i-th operand passed to the kernel function.
   Value *getKernelOperand(unsigned i);
 
@@ -164,8 +176,17 @@ public:
   static constexpr unsigned kNumConfigOperands = 6;
 
 private:
-  /// The name of the function attribute specifying the kernel to launch.
+  // This needs to quietly verify if attributes with names defined below are
+  // present since it is run before the verifier of this op.
+  friend LogicalResult GPUDialect::verifyOperationAttribute(Operation *,
+                                                            NamedAttribute);
+
+  /// The name of the string attribute specifying the kernel to launch.
   static StringRef getKernelAttrName() { return "kernel"; }
+
+  /// The name of the symbolRef attribute specifying the name of the module
+  /// containing the kernel to launch.
+  static StringRef getKernelModuleAttrName() { return "kernel_module"; }
 };
 
 #define GET_OP_CLASSES
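For a concrete reading of the two accessors declared above, here is a minimal hedged sketch (the `@kernels` module, the `"kernel_1"` name, and the `%g*`/`%b*` operands are illustrative only, not part of the patch): for the launch below, `kernel()` returns `"kernel_1"` and `getKernelModuleName()` returns `"kernels"`.

```mlir
// Illustrative names and operands; all six config operands have type index.
// kernel() -> "kernel_1", getKernelModuleName() -> "kernels".
"gpu.launch_func"(%gx, %gy, %gz, %bx, %by, %bz)
    { kernel = "kernel_1", kernel_module = @kernels }
    : (index, index, index, index, index, index) -> ()
```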
diff --git a/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp b/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp
index 961727d31c19..c0eb320e146d 100644
--- a/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp
+++ b/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp
@@ -135,6 +135,12 @@ public:
     func.walk(
         [this](mlir::gpu::LaunchFuncOp op) { translateGpuLaunchCalls(op); });
   }
+
+  // GPU kernel modules are no longer necessary since we have a global
+  // constant with the CUBIN data.
+  for (auto m : llvm::make_early_inc_range(getModule().getOps<ModuleOp>()))
+    if (m.getAttrOfType<UnitAttr>(gpu::GPUDialect::getKernelModuleAttrName()))
+      m.erase();
 }
 
 private:
@@ -342,11 +348,12 @@ void GpuLaunchFuncToCudaCallsPass::translateGpuLaunchCalls(
   // Emit a call to the cubin getter to retrieve a pointer to the data that
   // represents the cubin at runtime.
   // TODO(herhut): This should rather be a static global once supported.
-  auto kernelFunction = getModule().lookupSymbol<FuncOp>(launchOp.kernel());
-  if (!kernelFunction) {
-    launchOp.emitError("missing kernel function ") << launchOp.kernel();
-    return signalPassFailure();
-  }
+  auto kernelModule =
+      getModule().lookupSymbol<ModuleOp>(launchOp.getKernelModuleName());
+  assert(kernelModule && "expected a kernel module");
+  auto kernelFunction = kernelModule.lookupSymbol<FuncOp>(launchOp.kernel());
+  assert(kernelFunction && "expected a kernel function");
+
   auto cubinGetter =
       kernelFunction.getAttrOfType<SymbolRefAttr>(kCubinGetterAnnotation);
 
   if (!cubinGetter) {
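For orientation, a minimal sketch of the IR shape this pass now consumes, mirroring the updated `lower-launch-func-to-cuda.mlir` test further down (the `@kernel_module` and `@cubin_getter` names are illustrative):

```mlir
module attributes {gpu.container_module} {
  func @cubin_getter() -> !llvm<"i8*">

  module @kernel_module attributes {gpu.kernel_module} {
    func @kernel(!llvm.float, !llvm<"float*">)
        attributes { gpu.kernel, nvvm.cubingetter = @cubin_getter }
  }

  // Launches resolve the kernel through the attribute pair:
  // { kernel = "kernel", kernel_module = @kernel_module }.
}
```

After translation, the kernel modules themselves are erased, since the CUBIN data now lives in a global constant.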
diff --git a/mlir/lib/Conversion/GPUToCUDA/GenerateCubinAccessors.cpp b/mlir/lib/Conversion/GPUToCUDA/GenerateCubinAccessors.cpp
index 36ba6053ec46..4b7a6b1620d8 100644
--- a/mlir/lib/Conversion/GPUToCUDA/GenerateCubinAccessors.cpp
+++ b/mlir/lib/Conversion/GPUToCUDA/GenerateCubinAccessors.cpp
@@ -61,26 +61,21 @@ private:
     return LLVM::LLVMType::getIntNTy(llvmDialect, bits);
   }
 
-  // Inserts a global constant string containing `blob` into the parent module
-  // of `kernelFunc` and generates the function that returns the address of the
-  // first character of this string.
+  // Inserts a global constant string containing `blob` into the grand-parent
+  // module of `kernelFunc` and generates the function that returns the address
+  // of the first character of this string. Returns the generator function.
   // TODO(herhut): consider fusing this pass with launch-func-to-cuda.
-  void generate(FuncOp kernelFunc, StringAttr blob) {
-    auto stubFunc = getModule().lookupSymbol<FuncOp>(kernelFunc.getName());
-    if (!stubFunc) {
-      kernelFunc.emitError(
-          "corresponding external function not found in parent module");
-      return signalPassFailure();
-    }
-
-    Location loc = stubFunc.getLoc();
-    SmallString<128> nameBuffer(stubFunc.getName());
-    auto module = stubFunc.getParentOfType<ModuleOp>();
-    assert(module && "function must belong to a module");
+  FuncOp generate(FuncOp kernelFunc, StringAttr blob) {
+    Location loc = kernelFunc.getLoc();
+    SmallString<128> nameBuffer(kernelFunc.getName());
+    ModuleOp module = getModule();
+    assert(kernelFunc.getParentOp() &&
+           kernelFunc.getParentOp()->getParentOp() == module &&
+           "expected one level of module nesting");
 
     // Insert the getter function just after the original function.
-    OpBuilder moduleBuilder(module.getBody(), module.getBody()->begin());
-    moduleBuilder.setInsertionPointAfter(stubFunc.getOperation());
+    OpBuilder moduleBuilder(module.getBody());
+    moduleBuilder.setInsertionPointAfter(kernelFunc.getParentOp());
     auto getterType = moduleBuilder.getFunctionType(
         llvm::None, LLVM::LLVMType::getInt8PtrTy(llvmDialect));
     nameBuffer.append(kCubinGetterSuffix);
@@ -89,7 +84,7 @@ private:
     Block *entryBlock = result.addEntryBlock();
 
     // Drop the getter suffix before appending the storage suffix.
-    nameBuffer.resize(stubFunc.getName().size());
+    nameBuffer.resize(kernelFunc.getName().size());
    nameBuffer.append(kCubinStorageSuffix);
 
    // Obtain the address of the first character of the global string containing
@@ -98,25 +93,29 @@ private:
     Value *startPtr = LLVM::createGlobalString(
         loc, builder, StringRef(nameBuffer), blob.getValue(), llvmDialect);
     builder.create<LLVM::ReturnOp>(loc, startPtr);
-
-    // Store the name of the getter on the function for easier lookup.
-    stubFunc.setAttr(kCubinGetterAnnotation, builder.getSymbolRefAttr(result));
+    return result;
   }
 
 public:
   void runOnModule() override {
     llvmDialect = getContext().getRegisteredDialect<LLVM::LLVMDialect>();
 
-    auto modules = getModule().getOps<ModuleOp>();
-    for (auto module : llvm::make_early_inc_range(modules)) {
+    for (auto module : getModule().getOps<ModuleOp>()) {
       if (!module.getAttrOfType<UnitAttr>(
              gpu::GPUDialect::getKernelModuleAttrName()))
        continue;
      for (auto func : module.getOps<FuncOp>()) {
-        if (StringAttr blob = func.getAttrOfType<StringAttr>(kCubinAnnotation))
-          generate(func, blob);
+        if (StringAttr blob =
+                func.getAttrOfType<StringAttr>(kCubinAnnotation)) {
+          FuncOp getter = generate(func, blob);
+
+          // Store the name of the getter on the function for easier lookup and
+          // remove the CUBIN.
+          func.setAttr(kCubinGetterAnnotation,
+                       SymbolRefAttr::get(getter.getName(), func.getContext()));
+          func.removeAttr(kCubinAnnotation);
+        }
      }
-      module.erase();
    }
  }
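The attribute rewriting this pass performs can be sketched as a before/after on a single kernel; it mirrors the updated `insert-cubin-getter.mlir` test below. The getter symbol name shown is assumed for illustration (the real name is derived via `kCubinGetterSuffix`):

```mlir
// Before: the kernel inside a gpu.kernel_module carries the CUBIN blob.
func @kernel(!llvm.float, !llvm<"float*">) attributes {nvvm.cubin = "CUBIN"}

// After: the blob becomes a global string in the container module, the blob
// attribute is removed, and the kernel references the generated getter
// (getter name assumed for illustration).
func @kernel(!llvm.float, !llvm<"float*">)
    attributes {nvvm.cubingetter = @kernel_cubin_getter}
```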
diff --git a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.cpp b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.cpp
index 4760ed0e1ce1..2689572eb7d5 100644
--- a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.cpp
+++ b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.cpp
@@ -121,7 +121,7 @@ void GPUToSPIRVPass::runOnModule() {
   auto module = getModule();
   SmallVector<Operation *, 1> spirvModules;
-  for (auto funcOp : module.getOps<FuncOp>()) {
+  module.walk([&module, &spirvModules](FuncOp funcOp) {
     if (gpu::GPUDialect::isKernel(funcOp)) {
       OpBuilder builder(module.getBodyRegion());
       // Create a new spirv::ModuleOp for this function, and clone the
@@ -139,7 +139,7 @@ void GPUToSPIRVPass::runOnModule() {
       moduleBuilder.clone(*funcOp.getOperation());
       spirvModules.push_back(spvModule);
     }
-  }
+  });
 
   /// Dialect conversion to lower the functions with the spirv::ModuleOps.
   SPIRVBasicTypeConverter basicTypeConverter;
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index 98f5651279ff..2835578239f3 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -31,6 +31,10 @@
 using namespace mlir;
 using namespace mlir::gpu;
 
+//===----------------------------------------------------------------------===//
+// GPUDialect
+//===----------------------------------------------------------------------===//
+
 StringRef GPUDialect::getDialectName() { return "gpu"; }
 
 bool GPUDialect::isKernel(FuncOp function) {
@@ -47,6 +51,78 @@ GPUDialect::GPUDialect(MLIRContext *context)
       >();
 }
 
+LogicalResult GPUDialect::verifyOperationAttribute(Operation *op,
+                                                   NamedAttribute attr) {
+  if (!attr.second.isa<UnitAttr>() ||
+      !attr.first.is(getContainerModuleAttrName()))
+    return success();
+
+  auto module = dyn_cast<ModuleOp>(op);
+  if (!module)
+    return op->emitError("expected '")
+           << getContainerModuleAttrName() << "' attribute to be attached to '"
+           << ModuleOp::getOperationName() << '\'';
+
+  auto walkResult = module.walk([&module](LaunchFuncOp launchOp) -> WalkResult {
+    // Ignore launches that are nested more or less deeply than functions in
+    // the module we are currently checking.
+    if (!launchOp.getParentOp() ||
+        launchOp.getParentOp()->getParentOp() != module)
+      return success();
+
+    // Ignore launch ops with missing attributes here. The errors will be
+    // reported by the verifiers of those ops.
+    if (!launchOp.getAttrOfType<StringAttr>(
+            LaunchFuncOp::getKernelAttrName()) ||
+        !launchOp.getAttrOfType<SymbolRefAttr>(
+            LaunchFuncOp::getKernelModuleAttrName()))
+      return success();
+
+    // Check that `launch_func` refers to a well-formed GPU kernel module.
+    StringRef kernelModuleName = launchOp.getKernelModuleName();
+    auto kernelModule = module.lookupSymbol<ModuleOp>(kernelModuleName);
+    if (!kernelModule)
+      return launchOp.emitOpError()
+             << "kernel module '" << kernelModuleName << "' is undefined";
+    if (!kernelModule.getAttrOfType<UnitAttr>(
+            GPUDialect::getKernelModuleAttrName()))
+      return launchOp.emitOpError("module '")
+             << kernelModuleName << "' is missing the '"
+             << GPUDialect::getKernelModuleAttrName() << "' attribute";
+
+    // Check that `launch_func` refers to a well-formed kernel function.
+    StringRef kernelName = launchOp.kernel();
+    auto kernelFunction = kernelModule.lookupSymbol<FuncOp>(kernelName);
+    if (!kernelFunction)
+      return launchOp.emitOpError("kernel function '")
+             << kernelName << "' is undefined";
+    if (!kernelFunction.getAttrOfType<UnitAttr>(
+            GPUDialect::getKernelFuncAttrName()))
+      return launchOp.emitOpError("kernel function is missing the '")
+             << GPUDialect::getKernelFuncAttrName() << "' attribute";
+    if (launchOp.getNumKernelOperands() != kernelFunction.getNumArguments())
+      return launchOp.emitOpError("got ")
+             << launchOp.getNumKernelOperands()
+             << " kernel operands but expected "
+             << kernelFunction.getNumArguments();
+
+    // Due to the ordering of the current impl of lowering and LLVMLowering,
+    // type checks need to be temporarily disabled.
+    // TODO(ntv,zinenko,herhut): reactivate checks once "changing gpu.launchFunc
+    // to encode target module" has landed.
+    // auto functionType = kernelFunc.getType();
+    // for (unsigned i = 0; i < numKernelFuncArgs; ++i) {
+    //   if (getKernelOperand(i)->getType() != functionType.getInput(i)) {
+    //     return emitOpError("type of function argument ")
+    //            << i << " does not match";
+    //   }
+    // }
+
+    return success();
+  });
+
+  return walkResult.wasInterrupted() ? failure() : success();
+}
+
 template <typename T> static LogicalResult verifyIndexOp(T op) {
   auto dimension = op.dimension();
   if (dimension != "x" && dimension != "y" && dimension != "z")
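Putting the verifier's checks together, a minimal module that it accepts could look like this (all names assumed; the shape mirrors the positive cases in `ops.mlir` below):

```mlir
module attributes {gpu.container_module} {
  module @kernels attributes {gpu.kernel_module} {
    func @kernel_1(%arg0 : f32) attributes { gpu.kernel } {
      return
    }
  }

  func @host(%sz : index, %arg : f32) {
    // One kernel operand, matching @kernel_1's single argument.
    "gpu.launch_func"(%sz, %sz, %sz, %sz, %sz, %sz, %arg)
        { kernel = "kernel_1", kernel_module = @kernels }
        : (index, index, index, index, index, index, f32) -> ()
    return
  }
}
```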
@@ -394,7 +470,11 @@ void LaunchFuncOp::build(Builder *builder, OperationState &result,
                      {gridSizeX, gridSizeY, gridSizeZ, blockSizeX, blockSizeY, blockSizeZ});
   result.addOperands(kernelOperands);
   result.addAttribute(getKernelAttrName(),
-                      builder->getSymbolRefAttr(kernelFunc));
+                      builder->getStringAttr(kernelFunc.getName()));
+  auto kernelModule = kernelFunc.getParentOfType<ModuleOp>();
+  if (Optional<StringRef> kernelModuleName = kernelModule.getName())
+    result.addAttribute(getKernelModuleAttrName(),
+                        builder->getSymbolRefAttr(*kernelModuleName));
 }
 
 void LaunchFuncOp::build(Builder *builder, OperationState &result,
@@ -406,13 +486,17 @@ void LaunchFuncOp::build(Builder *builder, OperationState &result,
 }
 
 StringRef LaunchFuncOp::kernel() {
-  return getAttrOfType<SymbolRefAttr>(getKernelAttrName()).getValue();
+  return getAttrOfType<StringAttr>(getKernelAttrName()).getValue();
 }
 
 unsigned LaunchFuncOp::getNumKernelOperands() {
   return getNumOperands() - kNumConfigOperands;
 }
 
+StringRef LaunchFuncOp::getKernelModuleName() {
+  return getAttrOfType<SymbolRefAttr>(getKernelModuleAttrName()).getValue();
+}
+
 Value *LaunchFuncOp::getKernelOperand(unsigned i) {
   return getOperation()->getOperand(i + kNumConfigOperands);
 }
@@ -426,39 +510,25 @@ KernelDim3 LaunchFuncOp::getBlockSizeOperandValues() {
 }
 
 LogicalResult LaunchFuncOp::verify() {
-  auto kernelAttr = this->getAttr(getKernelAttrName());
-  if (!kernelAttr) {
-    return emitOpError("attribute 'kernel' must be specified");
-  } else if (!kernelAttr.isa<FunctionAttr>()) {
-    return emitOpError("attribute 'kernel' must be a function");
-  }
   auto module = getParentOfType<ModuleOp>();
-  FuncOp kernelFunc = module.lookupSymbol<FuncOp>(kernel());
-  if (!kernelFunc)
-    return emitOpError("kernel function '") << kernelAttr << "' is undefined";
+  if (!module)
+    return emitOpError("expected to belong to a module");
+
+  if (!module.getAttrOfType<UnitAttr>(
+          GPUDialect::getContainerModuleAttrName()))
+    return emitOpError("expected the closest surrounding module to have the '" +
+                       GPUDialect::getContainerModuleAttrName() +
+                       "' attribute");
+
+  auto kernelAttr = getAttrOfType<StringAttr>(getKernelAttrName());
+  if (!kernelAttr)
+    return emitOpError("string attribute '" + getKernelAttrName() +
+                       "' must be specified");
+
+  auto kernelModuleAttr =
+      getAttrOfType<SymbolRefAttr>(getKernelModuleAttrName());
+  if (!kernelModuleAttr)
+    return emitOpError("symbol reference attribute '" +
+                       getKernelModuleAttrName() + "' must be specified");
 
-  if (!kernelFunc.getAttrOfType<UnitAttr>(
-          GPUDialect::getKernelFuncAttrName())) {
-    return emitOpError("kernel function is missing the '")
-           << GPUDialect::getKernelFuncAttrName() << "' attribute";
-  }
-  unsigned numKernelFuncArgs = kernelFunc.getNumArguments();
-  if (getNumKernelOperands() != numKernelFuncArgs) {
-    return emitOpError("got ")
-           << getNumKernelOperands() << " kernel operands but expected "
-           << numKernelFuncArgs;
-  }
-  // Due to the ordering of the current impl of lowering and LLVMLowering, type
-  // checks need to be temporarily disabled.
-  // TODO(ntv,zinenko,herhut): reactivate checks once "changing gpu.launchFunc
-  // to encode target module" has landed.
-  // auto functionType = kernelFunc.getType();
-  // for (unsigned i = 0; i < numKernelFuncArgs; ++i) {
-  //   if (getKernelOperand(i)->getType() != functionType.getInput(i)) {
-  //     return emitOpError("type of function argument ")
-  //            << i << " does not match";
-  //   }
-  // }
   return success();
 }
diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
index f38a2e81986a..e2b0e463de08 100644
--- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
@@ -144,27 +144,30 @@ class GpuKernelOutliningPass : public ModulePass<GpuKernelOutliningPass> {
 public:
   void runOnModule() override {
     ModuleManager moduleManager(getModule());
+    bool modified = false;
     for (auto func : getModule().getOps<FuncOp>()) {
       // Insert just after the function.
       Block::iterator insertPt(func.getOperation()->getNextNode());
       func.walk([&](gpu::LaunchOp op) {
         FuncOp outlinedFunc = outlineKernelFunc(op);
-        // Potentially renames outlinedFunc to make symbol unique.
-        moduleManager.insert(insertPt, outlinedFunc);
+
+        // Create nested module and insert outlinedFunc. The module will
+        // originally get the same name as the function, but may be renamed on
+        // insertion into the parent module.
+        auto kernelModule = createKernelModule(outlinedFunc, moduleManager);
+        moduleManager.insert(insertPt, kernelModule);
 
         // Potentially changes signature, pulling in constants.
         convertToLaunchFuncOp(op, outlinedFunc);
-
-        // Create clone and move body from outlinedFunc.
-        auto kernelFunc = outlinedFunc.cloneWithoutRegions();
-        kernelFunc.getBody().takeBody(outlinedFunc.getBody());
-
-        // Create nested module and insert kernelFunc.
-        auto kernelModule = createKernelModule(kernelFunc, moduleManager);
-        getModule().insert(insertPt, kernelModule);
+        modified = true;
       });
     }
+
+    // If any new kernel module was inserted, annotate the surrounding module
+    // as a container module.
+    if (modified)
+      getModule().setAttr(gpu::GPUDialect::getContainerModuleAttrName(),
+                          UnitAttr::get(&getContext()));
   }
 
 private:
@@ -172,9 +175,11 @@ private:
   ModuleOp createKernelModule(FuncOp kernelFunc,
                               const ModuleManager &parentModuleManager) {
     auto context = getModule().getContext();
-    auto kernelModule = ModuleOp::create(UnknownLoc::get(context));
+    Builder builder(context);
+    auto kernelModule =
+        ModuleOp::create(builder.getUnknownLoc(), kernelFunc.getName());
     kernelModule.setAttr(gpu::GPUDialect::getKernelModuleAttrName(),
-                         UnitAttr::get(context));
+                         builder.getUnitAttr());
 
     ModuleManager moduleManager(kernelModule);
     llvm::SmallVector<FuncOp, 8> funcsToInsert = {kernelFunc};
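The net effect of the updated outlining pass, sketched on a toy example (the `foo_kernel` names follow the pass's scheme of naming the kernel module after the outlined function, but are assumed here; compare the `outlining.mlir` test below):

```mlir
// Input: a host function with an inlined gpu.launch region.
func @foo(%sz : index) {
  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %sz, %grid_y = %sz, %grid_z = %sz)
             threads(%tx, %ty, %tz) in (%block_x = %sz, %block_y = %sz, %block_z = %sz) {
    gpu.return
  }
  return
}

// Output (sketch): the top-level module gains gpu.container_module, and the
// launch body is outlined into a kernel function inside a named kernel module.
module attributes {gpu.container_module} {
  module @foo_kernel attributes {gpu.kernel_module} {
    func @foo_kernel() attributes {gpu.kernel} {
      gpu.return
    }
  }
  func @foo(%sz : index) {
    "gpu.launch_func"(%sz, %sz, %sz, %sz, %sz, %sz)
        { kernel = "foo_kernel", kernel_module = @foo_kernel }
        : (index, index, index, index, index, index) -> ()
    return
  }
}
```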
diff --git a/mlir/test/Conversion/GPUToCUDA/insert-cubin-getter.mlir b/mlir/test/Conversion/GPUToCUDA/insert-cubin-getter.mlir
index 9e0907f7477a..ef58433bdef3 100644
--- a/mlir/test/Conversion/GPUToCUDA/insert-cubin-getter.mlir
+++ b/mlir/test/Conversion/GPUToCUDA/insert-cubin-getter.mlir
@@ -1,15 +1,15 @@
 // RUN: mlir-opt %s --generate-cubin-accessors | FileCheck %s
 
+module attributes {gpu.container_module} {
+
 // CHECK: llvm.mlir.global constant @[[global:.*]]("CUBIN")
-module attributes {gpu.kernel_module} {
-  func @kernel(!llvm.float, !llvm<"float*">)
-      attributes {nvvm.cubin = "CUBIN"}
-}
-
-func @kernel(!llvm.float, !llvm<"float*">)
-// CHECK: attributes {gpu.kernel, nvvm.cubingetter = @[[getter:.*]]}
-    attributes {gpu.kernel}
+  module attributes {gpu.kernel_module} {
+    // CHECK-LABEL: func @kernel
+    func @kernel(!llvm.float, !llvm<"float*">)
+    // CHECK: attributes {nvvm.cubingetter = @[[getter:.*]]}
+      attributes {nvvm.cubin = "CUBIN"}
+  }
 
 // CHECK: func @[[getter]]() -> !llvm<"i8*">
 // CHECK: %[[addressof:.*]] = llvm.mlir.addressof @[[global]]
@@ -17,3 +17,4 @@ func @kernel(!llvm.float, !llvm<"float*">)
 // CHECK: %[[gep:.*]] = llvm.getelementptr %[[addressof]][%[[c0]], %[[c0]]]
 // CHECK-SAME: -> !llvm<"i8*">
 // CHECK: llvm.return %[[gep]] : !llvm<"i8*">
+}
diff --git a/mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir b/mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir
index bc843e3595b0..a4ff3c97cbf4 100644
--- a/mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir
+++ b/mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir
@@ -1,27 +1,33 @@
 // RUN: mlir-opt %s --launch-func-to-cuda | FileCheck %s
 
-// CHECK: llvm.mlir.global constant @[[kernel_name:.*]]("kernel\00")
+module attributes {gpu.container_module} {
 
-func @cubin_getter() -> !llvm<"i8*">
+  // CHECK: llvm.mlir.global constant @[[kernel_name:.*]]("kernel\00")
 
-func @kernel(!llvm.float, !llvm<"float*">)
-    attributes { gpu.kernel, nvvm.cubingetter = @cubin_getter }
+  func @cubin_getter() -> !llvm<"i8*">
+
+  module @kernel_module attributes {gpu.kernel_module} {
+    func @kernel(!llvm.float, !llvm<"float*">)
+        attributes { gpu.kernel, nvvm.cubingetter = @cubin_getter }
+  }
 
-func @foo() {
-  %0 = "op"() : () -> (!llvm.float)
-  %1 = "op"() : () -> (!llvm<"float*">)
-  %cst = constant 8 : index
+  func @foo() {
+    %0 = "op"() : () -> (!llvm.float)
+    %1 = "op"() : () -> (!llvm<"float*">)
+    %cst = constant 8 : index
 
-  // CHECK: [[module_ptr:%.*]] = llvm.alloca {{.*}} x !llvm<"i8*"> : (!llvm.i32) -> !llvm<"i8**">
-  // CHECK: llvm.call @mcuModuleLoad([[module_ptr]], {{.*}}) : (!llvm<"i8**">, !llvm<"i8*">) -> !llvm.i32
-  // CHECK: [[func_ptr:%.*]] = llvm.alloca {{.*}} x !llvm<"i8*"> : (!llvm.i32) -> !llvm<"i8**">
-  // CHECK: llvm.call @mcuModuleGetFunction([[func_ptr]], {{.*}}, {{.*}}) : (!llvm<"i8**">, !llvm<"i8*">, !llvm<"i8*">) -> !llvm.i32
-  // CHECK: llvm.call @mcuGetStreamHelper
-  // CHECK: llvm.call @mcuLaunchKernel
-  // CHECK: llvm.call @mcuStreamSynchronize
-  "gpu.launch_func"(%cst, %cst, %cst, %cst, %cst, %cst, %0, %1) { kernel = @kernel }
-      : (index, index, index, index, index, index, !llvm.float, !llvm<"float*">) -> ()
+    // CHECK: [[module_ptr:%.*]] = llvm.alloca {{.*}} x !llvm<"i8*"> : (!llvm.i32) -> !llvm<"i8**">
+    // CHECK: llvm.call @mcuModuleLoad([[module_ptr]], {{.*}}) : (!llvm<"i8**">, !llvm<"i8*">) -> !llvm.i32
+    // CHECK: [[func_ptr:%.*]] = llvm.alloca {{.*}} x !llvm<"i8*"> : (!llvm.i32) -> !llvm<"i8**">
+    // CHECK: llvm.call @mcuModuleGetFunction([[func_ptr]], {{.*}}, {{.*}}) : (!llvm<"i8**">, !llvm<"i8*">, !llvm<"i8*">) -> !llvm.i32
+    // CHECK: llvm.call @mcuGetStreamHelper
+    // CHECK: llvm.call @mcuLaunchKernel
+    // CHECK: llvm.call @mcuStreamSynchronize
+    "gpu.launch_func"(%cst, %cst, %cst, %cst, %cst, %cst, %0, %1) { kernel = "kernel", kernel_module = @kernel_module }
+        : (index, index, index, index, index, index, !llvm.float, !llvm<"float*">) -> ()
+
+    return
+  }
 
-  return
 }
diff --git a/mlir/test/Conversion/GPUToSPIRV/builtins.mlir b/mlir/test/Conversion/GPUToSPIRV/builtins.mlir
index 73fb1833302d..2139cca6ef12 100644
--- a/mlir/test/Conversion/GPUToSPIRV/builtins.mlir
+++ b/mlir/test/Conversion/GPUToSPIRV/builtins.mlir
@@ -1,113 +1,137 @@
 // RUN: mlir-opt -split-input-file -convert-gpu-to-spirv %s -o - | FileCheck %s
 
-func @builtin() {
-  %c0 = constant 1 : index
-  "gpu.launch_func"(%c0, %c0, %c0, %c0, %c0, %c0) {kernel = @builtin_workgroup_id_x} : (index, index, index, index, index, index) -> ()
-  return
-}
+module attributes {gpu.container_module} {
+  func @builtin() {
+    %c0 = constant 1 : index
+    "gpu.launch_func"(%c0, %c0, %c0, %c0, %c0, %c0) {kernel = "builtin_workgroup_id_x", kernel_module = @kernels} : (index, index, index, index, index, index) -> ()
+    return
+  }
 
-// CHECK-LABEL: spv.module "Logical" "GLSL450"
-// CHECK: spv.globalVariable [[WORKGROUPID:@.*]] built_in("WorkgroupId")
-func @builtin_workgroup_id_x()
-    attributes {gpu.kernel} {
-  // CHECK: [[ADDRESS:%.*]] = spv._address_of [[WORKGROUPID]]
-  // CHECK-NEXT: [[VEC:%.*]] = spv.Load "Input" [[ADDRESS]]
-  // CHECK-NEXT: {{%.*}} = spv.CompositeExtract [[VEC]]{{\[}}0 : i32{{\]}}
-  %0 = "gpu.block_id"() {dimension = "x"} : () -> index
-  return
+  // CHECK-LABEL: spv.module "Logical" "GLSL450"
+  // CHECK: spv.globalVariable [[WORKGROUPID:@.*]] built_in("WorkgroupId")
+  module @kernels attributes {gpu.kernel_module} {
+    func @builtin_workgroup_id_x()
+        attributes {gpu.kernel} {
+      // CHECK: [[ADDRESS:%.*]] = spv._address_of [[WORKGROUPID]]
+      // CHECK-NEXT: [[VEC:%.*]] = spv.Load "Input" [[ADDRESS]]
+      // CHECK-NEXT: {{%.*}} = spv.CompositeExtract [[VEC]]{{\[}}0 : i32{{\]}}
+      %0 = "gpu.block_id"() {dimension = "x"} : () -> index
+      return
+    }
+  }
 }
 
 // -----
 
-func @builtin() {
-  %c0 = constant 1 : index
-  "gpu.launch_func"(%c0, %c0, %c0, %c0, %c0, %c0) {kernel = @builtin_workgroup_id_y} : (index, index, index, index, index, index) -> ()
-  return
-}
+module attributes {gpu.container_module} {
+  func @builtin() {
+    %c0 = constant 1 : index
+    "gpu.launch_func"(%c0, %c0, %c0, %c0, %c0, %c0) {kernel = "builtin_workgroup_id_y", kernel_module = @kernels} : (index, index, index, index, index, index) -> ()
+    return
+  }
 
-// CHECK-LABEL: spv.module "Logical" "GLSL450"
-// CHECK: spv.globalVariable [[WORKGROUPID:@.*]] built_in("WorkgroupId")
-func @builtin_workgroup_id_y()
-    attributes {gpu.kernel} {
-  // CHECK: [[ADDRESS:%.*]] = spv._address_of [[WORKGROUPID]]
-  // CHECK-NEXT: [[VEC:%.*]] = spv.Load "Input" [[ADDRESS]]
-  // CHECK-NEXT: {{%.*}} = spv.CompositeExtract [[VEC]]{{\[}}1 : i32{{\]}}
-  %0 = "gpu.block_id"() {dimension = "y"} : () -> index
-  return
+  // CHECK-LABEL: spv.module "Logical" "GLSL450"
+  // CHECK: spv.globalVariable [[WORKGROUPID:@.*]] built_in("WorkgroupId")
+  module @kernels attributes {gpu.kernel_module} {
+    func @builtin_workgroup_id_y()
+        attributes {gpu.kernel} {
+      // CHECK: [[ADDRESS:%.*]] = spv._address_of [[WORKGROUPID]]
+      // CHECK-NEXT: [[VEC:%.*]] = spv.Load "Input" [[ADDRESS]]
+      // CHECK-NEXT: {{%.*}} = spv.CompositeExtract [[VEC]]{{\[}}1 : i32{{\]}}
+      %0 = "gpu.block_id"() {dimension = "y"} : () -> index
+      return
+    }
+  }
 }
 
 // -----
 
-func @builtin() {
-  %c0 = constant 1 : index
-  "gpu.launch_func"(%c0, %c0, %c0, %c0, %c0, %c0) {kernel = @builtin_workgroup_id_z} : (index, index, index, index, index, index) -> ()
-  return
-}
+module attributes {gpu.container_module} {
+  func @builtin() {
+    %c0 = constant 1 : index
+    "gpu.launch_func"(%c0, %c0, %c0, %c0, %c0, %c0) {kernel = "builtin_workgroup_id_z", kernel_module = @kernels} : (index, index, index, index, index, index) -> ()
+    return
+  }
 
-// CHECK-LABEL: spv.module "Logical" "GLSL450"
-// CHECK: spv.globalVariable [[WORKGROUPID:@.*]] built_in("WorkgroupId")
-func @builtin_workgroup_id_z()
-    attributes {gpu.kernel} {
-  // CHECK: [[ADDRESS:%.*]] = spv._address_of [[WORKGROUPID]]
-  // CHECK-NEXT: [[VEC:%.*]] = spv.Load "Input" [[ADDRESS]]
-  // CHECK-NEXT: {{%.*}} = spv.CompositeExtract [[VEC]]{{\[}}2 : i32{{\]}}
-  %0 = "gpu.block_id"() {dimension = "z"} : () -> index
-  return
+  // CHECK-LABEL: spv.module "Logical" "GLSL450"
+  // CHECK: spv.globalVariable [[WORKGROUPID:@.*]] built_in("WorkgroupId")
+  module @kernels attributes {gpu.kernel_module} {
+    func @builtin_workgroup_id_z()
+        attributes {gpu.kernel} {
+      // CHECK: [[ADDRESS:%.*]] = spv._address_of [[WORKGROUPID]]
+      // CHECK-NEXT: [[VEC:%.*]] = spv.Load "Input" [[ADDRESS]]
+      // CHECK-NEXT: {{%.*}} = spv.CompositeExtract [[VEC]]{{\[}}2 : i32{{\]}}
+      %0 = "gpu.block_id"() {dimension = "z"} : () -> index
+      return
+    }
+  }
}
 
 // -----
 
-func @builtin() {
-  %c0 = constant 1 : index
-  "gpu.launch_func"(%c0, %c0, %c0, %c0, %c0, %c0) {kernel = @builtin_workgroup_size_x} : (index, index, index, index, index, index) -> ()
-  return
-}
+module attributes {gpu.container_module} {
+  func @builtin() {
+    %c0 = constant 1 : index
+    "gpu.launch_func"(%c0, %c0, %c0, %c0, %c0, %c0) {kernel = "builtin_workgroup_size_x", kernel_module = @kernels} : (index, index, index, index, index, index) -> ()
+    return
+  }
 
-// CHECK-LABEL: spv.module "Logical" "GLSL450"
-// CHECK: spv.globalVariable [[WORKGROUPSIZE:@.*]] built_in("WorkgroupSize")
-func @builtin_workgroup_size_x()
-    attributes {gpu.kernel} {
-  // CHECK: [[ADDRESS:%.*]] = spv._address_of [[WORKGROUPSIZE]]
-  // CHECK-NEXT: [[VEC:%.*]] = spv.Load "Input" [[ADDRESS]]
-  // CHECK-NEXT: {{%.*}} = spv.CompositeExtract [[VEC]]{{\[}}0 : i32{{\]}}
-  %0 = "gpu.block_dim"() {dimension = "x"} : () -> index
-  return
+  // CHECK-LABEL: spv.module "Logical" "GLSL450"
+  // CHECK: spv.globalVariable [[WORKGROUPSIZE:@.*]] built_in("WorkgroupSize")
+  module @kernels attributes {gpu.kernel_module} {
+    func @builtin_workgroup_size_x()
+        attributes {gpu.kernel} {
+      // CHECK: [[ADDRESS:%.*]] = spv._address_of [[WORKGROUPSIZE]]
+      // CHECK-NEXT: [[VEC:%.*]] = spv.Load "Input" [[ADDRESS]]
+      // CHECK-NEXT: {{%.*}} = spv.CompositeExtract [[VEC]]{{\[}}0 : i32{{\]}}
+      %0 = "gpu.block_dim"() {dimension = "x"} : () -> index
"x"} : () -> index + return + } + } } // ----- -func @builtin() { - %c0 = constant 1 : index - "gpu.launch_func"(%c0, %c0, %c0, %c0, %c0, %c0) {kernel = @builtin_local_id_x} : (index, index, index, index, index, index) -> () - return -} +module attributes {gpu.container_module} { + func @builtin() { + %c0 = constant 1 : index + "gpu.launch_func"(%c0, %c0, %c0, %c0, %c0, %c0) {kernel = "builtin_local_id_x", kernel_module = @kernels} : (index, index, index, index, index, index) -> () + return + } -// CHECK-LABEL: spv.module "Logical" "GLSL450" -// CHECK: spv.globalVariable [[LOCALINVOCATIONID:@.*]] built_in("LocalInvocationId") -func @builtin_local_id_x() - attributes {gpu.kernel} { - // CHECK: [[ADDRESS:%.*]] = spv._address_of [[LOCALINVOCATIONID]] - // CHECK-NEXT: [[VEC:%.*]] = spv.Load "Input" [[ADDRESS]] - // CHECK-NEXT: {{%.*}} = spv.CompositeExtract [[VEC]]{{\[}}0 : i32{{\]}} - %0 = "gpu.thread_id"() {dimension = "x"} : () -> index - return + // CHECK-LABEL: spv.module "Logical" "GLSL450" + // CHECK: spv.globalVariable [[LOCALINVOCATIONID:@.*]] built_in("LocalInvocationId") + module @kernels attributes {gpu.kernel_module} { + func @builtin_local_id_x() + attributes {gpu.kernel} { + // CHECK: [[ADDRESS:%.*]] = spv._address_of [[LOCALINVOCATIONID]] + // CHECK-NEXT: [[VEC:%.*]] = spv.Load "Input" [[ADDRESS]] + // CHECK-NEXT: {{%.*}} = spv.CompositeExtract [[VEC]]{{\[}}0 : i32{{\]}} + %0 = "gpu.thread_id"() {dimension = "x"} : () -> index + return + } + } } // ----- -func @builtin() { - %c0 = constant 1 : index - "gpu.launch_func"(%c0, %c0, %c0, %c0, %c0, %c0) {kernel = @builtin_num_workgroups_x} : (index, index, index, index, index, index) -> () - return -} +module attributes {gpu.container_module} { + func @builtin() { + %c0 = constant 1 : index + "gpu.launch_func"(%c0, %c0, %c0, %c0, %c0, %c0) {kernel = "builtin_num_workgroups_x", kernel_module = @kernels} : (index, index, index, index, index, index) -> () + return + } -// CHECK-LABEL: spv.module "Logical" "GLSL450" -// CHECK: spv.globalVariable [[NUMWORKGROUPS:@.*]] built_in("NumWorkgroups") -func @builtin_num_workgroups_x() - attributes {gpu.kernel} { - // CHECK: [[ADDRESS:%.*]] = spv._address_of [[NUMWORKGROUPS]] - // CHECK-NEXT: [[VEC:%.*]] = spv.Load "Input" [[ADDRESS]] - // CHECK-NEXT: {{%.*}} = spv.CompositeExtract [[VEC]]{{\[}}0 : i32{{\]}} - %0 = "gpu.grid_dim"() {dimension = "x"} : () -> index - return + // CHECK-LABEL: spv.module "Logical" "GLSL450" + // CHECK: spv.globalVariable [[NUMWORKGROUPS:@.*]] built_in("NumWorkgroups") + module @kernels attributes {gpu.kernel_module} { + func @builtin_num_workgroups_x() + attributes {gpu.kernel} { + // CHECK: [[ADDRESS:%.*]] = spv._address_of [[NUMWORKGROUPS]] + // CHECK-NEXT: [[VEC:%.*]] = spv.Load "Input" [[ADDRESS]] + // CHECK-NEXT: {{%.*}} = spv.CompositeExtract [[VEC]]{{\[}}0 : i32{{\]}} + %0 = "gpu.grid_dim"() {dimension = "x"} : () -> index + return + } + } } diff --git a/mlir/test/Conversion/GPUToSPIRV/load_store.mlir b/mlir/test/Conversion/GPUToSPIRV/load_store.mlir index e86cc1913faf..d362ce1a839c 100644 --- a/mlir/test/Conversion/GPUToSPIRV/load_store.mlir +++ b/mlir/test/Conversion/GPUToSPIRV/load_store.mlir @@ -1,52 +1,56 @@ // RUN: mlir-opt -convert-gpu-to-spirv %s -o - | FileCheck %s -func @load_store(%arg0: memref<12x4xf32>, %arg1: memref<12x4xf32>, %arg2: memref<12x4xf32>) { - %c0 = constant 0 : index - %c12 = constant 12 : index - %0 = subi %c12, %c0 : index - %c1 = constant 1 : index - %c0_0 = constant 0 : index - %c4 = constant 4 : index - %1 = subi %c4, %c0_0 : 
diff --git a/mlir/test/Conversion/GPUToSPIRV/load_store.mlir b/mlir/test/Conversion/GPUToSPIRV/load_store.mlir
index e86cc1913faf..d362ce1a839c 100644
--- a/mlir/test/Conversion/GPUToSPIRV/load_store.mlir
+++ b/mlir/test/Conversion/GPUToSPIRV/load_store.mlir
@@ -1,52 +1,56 @@
 // RUN: mlir-opt -convert-gpu-to-spirv %s -o - | FileCheck %s
 
-func @load_store(%arg0: memref<12x4xf32>, %arg1: memref<12x4xf32>, %arg2: memref<12x4xf32>) {
-  %c0 = constant 0 : index
-  %c12 = constant 12 : index
-  %0 = subi %c12, %c0 : index
-  %c1 = constant 1 : index
-  %c0_0 = constant 0 : index
-  %c4 = constant 4 : index
-  %1 = subi %c4, %c0_0 : index
-  %c1_1 = constant 1 : index
-  %c1_2 = constant 1 : index
-  "gpu.launch_func"(%0, %c1_2, %c1_2, %1, %c1_2, %c1_2, %arg0, %arg1, %arg2, %c0, %c0_0, %c1, %c1_1) {kernel = @load_store_kernel} : (index, index, index, index, index, index, memref<12x4xf32>, memref<12x4xf32>, memref<12x4xf32>, index, index, index, index) -> ()
-  return
-}
+module attributes {gpu.container_module} {
+  func @load_store(%arg0: memref<12x4xf32>, %arg1: memref<12x4xf32>, %arg2: memref<12x4xf32>) {
+    %c0 = constant 0 : index
+    %c12 = constant 12 : index
+    %0 = subi %c12, %c0 : index
+    %c1 = constant 1 : index
+    %c0_0 = constant 0 : index
+    %c4 = constant 4 : index
+    %1 = subi %c4, %c0_0 : index
+    %c1_1 = constant 1 : index
+    %c1_2 = constant 1 : index
+    "gpu.launch_func"(%0, %c1_2, %c1_2, %1, %c1_2, %c1_2, %arg0, %arg1, %arg2, %c0, %c0_0, %c1, %c1_1) {kernel = "load_store_kernel", kernel_module = @kernels} : (index, index, index, index, index, index, memref<12x4xf32>, memref<12x4xf32>, memref<12x4xf32>, index, index, index, index) -> ()
+    return
+  }
 
-// CHECK-LABEL: spv.module "Logical" "GLSL450"
-// CHECK: spv.globalVariable {{@.*}} bind(0, 0) : [[TYPE1:!spv.ptr<!spv.struct<!spv.array<48 x f32>>, StorageBuffer>]]
-// CHECK-NEXT: spv.globalVariable {{@.*}} bind(0, 1) : [[TYPE2:!spv.ptr<!spv.struct<!spv.array<48 x f32>>, StorageBuffer>]]
-// CHECK-NEXT: spv.globalVariable {{@.*}} bind(0, 2) : [[TYPE3:!spv.ptr<!spv.struct<!spv.array<48 x f32>>, StorageBuffer>]]
-// CHECK: func @load_store_kernel([[ARG0:%.*]]: [[TYPE1]], [[ARG1:%.*]]: [[TYPE2]], [[ARG2:%.*]]: [[TYPE3]], [[ARG3:%.*]]: i32, [[ARG4:%.*]]: i32, [[ARG5:%.*]]: i32, [[ARG6:%.*]]: i32)
-func @load_store_kernel(%arg0: memref<12x4xf32>, %arg1: memref<12x4xf32>, %arg2: memref<12x4xf32>, %arg3: index, %arg4: index, %arg5: index, %arg6: index)
-    attributes {gpu.kernel} {
-  %0 = "gpu.block_id"() {dimension = "x"} : () -> index
-  %1 = "gpu.block_id"() {dimension = "y"} : () -> index
-  %2 = "gpu.block_id"() {dimension = "z"} : () -> index
-  %3 = "gpu.thread_id"() {dimension = "x"} : () -> index
-  %4 = "gpu.thread_id"() {dimension = "y"} : () -> index
-  %5 = "gpu.thread_id"() {dimension = "z"} : () -> index
-  %6 = "gpu.grid_dim"() {dimension = "x"} : () -> index
-  %7 = "gpu.grid_dim"() {dimension = "y"} : () -> index
-  %8 = "gpu.grid_dim"() {dimension = "z"} : () -> index
-  %9 = "gpu.block_dim"() {dimension = "x"} : () -> index
-  %10 = "gpu.block_dim"() {dimension = "y"} : () -> index
-  %11 = "gpu.block_dim"() {dimension = "z"} : () -> index
-  // CHECK: [[INDEX1:%.*]] = spv.IAdd [[ARG3]], {{%.*}}
-  %12 = addi %arg3, %0 : index
-  // CHECK: [[INDEX2:%.*]] = spv.IAdd [[ARG4]], {{%.*}}
-  %13 = addi %arg4, %3 : index
-  // CHECK: [[PTR1:%.*]] = spv.AccessChain [[ARG0]]{{\[}}[[INDEX1]], [[INDEX2]]{{\]}}
-  // CHECK-NEXT: [[VAL1:%.*]] = spv.Load "StorageBuffer" [[PTR1]]
-  %14 = load %arg0[%12, %13] : memref<12x4xf32>
-  // CHECK: [[PTR2:%.*]] = spv.AccessChain [[ARG1]]{{\[}}[[INDEX1]], [[INDEX2]]{{\]}}
-  // CHECK-NEXT: [[VAL2:%.*]] = spv.Load "StorageBuffer" [[PTR2]]
-  %15 = load %arg1[%12, %13] : memref<12x4xf32>
-  // CHECK: [[VAL3:%.*]] = spv.FAdd [[VAL1]], [[VAL2]]
-  %16 = addf %14, %15 : f32
-  // CHECK: [[PTR3:%.*]] = spv.AccessChain [[ARG2]]{{\[}}[[INDEX1]], [[INDEX2]]{{\]}}
-  // CHECK-NEXT: spv.Store "StorageBuffer" [[PTR3]], [[VAL3]]
-  store %16, %arg2[%12, %13] : memref<12x4xf32>
-  return
-}
\ No newline at end of file
+  // CHECK-LABEL: spv.module "Logical" "GLSL450"
+  // CHECK: spv.globalVariable {{@.*}} bind(0, 0) : [[TYPE1:!spv.ptr<!spv.struct<!spv.array<48 x f32>>, StorageBuffer>]]
+  // CHECK-NEXT: spv.globalVariable {{@.*}} bind(0, 1) : [[TYPE2:!spv.ptr<!spv.struct<!spv.array<48 x f32>>, StorageBuffer>]]
+  // CHECK-NEXT: spv.globalVariable {{@.*}} bind(0, 2) : [[TYPE3:!spv.ptr<!spv.struct<!spv.array<48 x f32>>, StorageBuffer>]]
+  // CHECK: func @load_store_kernel([[ARG0:%.*]]: [[TYPE1]], [[ARG1:%.*]]: [[TYPE2]], [[ARG2:%.*]]: [[TYPE3]], [[ARG3:%.*]]: i32, [[ARG4:%.*]]: i32, [[ARG5:%.*]]: i32, [[ARG6:%.*]]: i32)
+  module @kernels attributes {gpu.kernel_module} {
+    func @load_store_kernel(%arg0: memref<12x4xf32>, %arg1: memref<12x4xf32>, %arg2: memref<12x4xf32>, %arg3: index, %arg4: index, %arg5: index, %arg6: index)
+        attributes {gpu.kernel} {
+      %0 = "gpu.block_id"() {dimension = "x"} : () -> index
+      %1 = "gpu.block_id"() {dimension = "y"} : () -> index
+      %2 = "gpu.block_id"() {dimension = "z"} : () -> index
+      %3 = "gpu.thread_id"() {dimension = "x"} : () -> index
+      %4 = "gpu.thread_id"() {dimension = "y"} : () -> index
+      %5 = "gpu.thread_id"() {dimension = "z"} : () -> index
+      %6 = "gpu.grid_dim"() {dimension = "x"} : () -> index
+      %7 = "gpu.grid_dim"() {dimension = "y"} : () -> index
+      %8 = "gpu.grid_dim"() {dimension = "z"} : () -> index
+      %9 = "gpu.block_dim"() {dimension = "x"} : () -> index
+      %10 = "gpu.block_dim"() {dimension = "y"} : () -> index
+      %11 = "gpu.block_dim"() {dimension = "z"} : () -> index
+      // CHECK: [[INDEX1:%.*]] = spv.IAdd [[ARG3]], {{%.*}}
+      %12 = addi %arg3, %0 : index
+      // CHECK: [[INDEX2:%.*]] = spv.IAdd [[ARG4]], {{%.*}}
+      %13 = addi %arg4, %3 : index
+      // CHECK: [[PTR1:%.*]] = spv.AccessChain [[ARG0]]{{\[}}[[INDEX1]], [[INDEX2]]{{\]}}
+      // CHECK-NEXT: [[VAL1:%.*]] = spv.Load "StorageBuffer" [[PTR1]]
+      %14 = load %arg0[%12, %13] : memref<12x4xf32>
+      // CHECK: [[PTR2:%.*]] = spv.AccessChain [[ARG1]]{{\[}}[[INDEX1]], [[INDEX2]]{{\]}}
+      // CHECK-NEXT: [[VAL2:%.*]] = spv.Load "StorageBuffer" [[PTR2]]
+      %15 = load %arg1[%12, %13] : memref<12x4xf32>
+      // CHECK: [[VAL3:%.*]] = spv.FAdd [[VAL1]], [[VAL2]]
+      %16 = addf %14, %15 : f32
+      // CHECK: [[PTR3:%.*]] = spv.AccessChain [[ARG2]]{{\[}}[[INDEX1]], [[INDEX2]]{{\]}}
+      // CHECK-NEXT: spv.Store "StorageBuffer" [[PTR3]], [[VAL3]]
+      store %16, %arg2[%12, %13] : memref<12x4xf32>
+      return
+    }
+  }
+}
diff --git a/mlir/test/Conversion/GPUToSPIRV/simple.mlir b/mlir/test/Conversion/GPUToSPIRV/simple.mlir
index a92ec96417b3..73c72cb0b466 100644
--- a/mlir/test/Conversion/GPUToSPIRV/simple.mlir
+++ b/mlir/test/Conversion/GPUToSPIRV/simple.mlir
@@ -1,21 +1,27 @@
 // RUN: mlir-opt -convert-gpu-to-spirv %s -o - | FileCheck %s
 
-// CHECK: spv.module "Logical" "GLSL450" {
-// CHECK-NEXT: spv.globalVariable [[VAR1:@.*]] bind(0, 0) : !spv.ptr<f32, StorageBuffer>
-// CHECK-NEXT: spv.globalVariable [[VAR2:@.*]] bind(0, 1) : !spv.ptr<!spv.struct<!spv.array<12 x f32>>, StorageBuffer>
-// CHECK-NEXT: func @kernel_1
-// CHECK-NEXT: spv.Return
-// CHECK: spv.EntryPoint "GLCompute" @kernel_1, [[VAR1]], [[VAR2]]
-func @kernel_1(%arg0 : f32, %arg1 : memref<12xf32, 1>)
-    attributes { gpu.kernel } {
-  return
-}
-
-func @foo() {
-  %0 = "op"() : () -> (f32)
-  %1 = "op"() : () -> (memref<12xf32, 1>)
-  %cst = constant 1 : index
-  "gpu.launch_func"(%cst, %cst, %cst, %cst, %cst, %cst, %0, %1) { kernel = @kernel_1 }
-      : (index, index, index, index, index, index, f32, memref<12xf32, 1>) -> ()
-  return
-}
\ No newline at end of file
+module attributes {gpu.container_module} {
+
+  // CHECK: spv.module "Logical" "GLSL450" {
+  // CHECK-NEXT: spv.globalVariable [[VAR1:@.*]] bind(0, 0) : !spv.ptr<f32, StorageBuffer>
+  // CHECK-NEXT: spv.globalVariable [[VAR2:@.*]] bind(0, 1) : !spv.ptr<!spv.struct<!spv.array<12 x f32>>, StorageBuffer>
+  // CHECK-NEXT: func @kernel_1
+  // CHECK-NEXT: spv.Return
+  // CHECK: spv.EntryPoint "GLCompute" @kernel_1, [[VAR1]], [[VAR2]]
+  module @kernels attributes {gpu.kernel_module} {
+    func @kernel_1(%arg0 : f32, %arg1 : memref<12xf32, 1>)
+        attributes { gpu.kernel } {
+      return
+    }
+  }
+
+  func @foo() {
+    %0 = "op"() : () -> (f32)
+    %1 = "op"() : () -> (memref<12xf32, 1>)
+    %cst = constant 1 : index
+    "gpu.launch_func"(%cst, %cst, %cst, %cst, %cst, %cst, %0, %1) { kernel = "kernel_1", kernel_module = @kernels }
+        : (index, index, index, index, index, index, f32, memref<12xf32, 1>) -> ()
+    return
+  }
+
+}
diff --git a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir
index b7edf861b5ff..032eff081967 100644
--- a/mlir/test/Dialect/GPU/invalid.mlir
+++ b/mlir/test/Dialect/GPU/invalid.mlir
@@ -96,8 +96,8 @@ func @launch_func_too_few_operands(%sz : index) {
 
 // -----
 
-func @launch_func_missing_callee_attribute(%sz : index) {
-  // expected-error@+1 {{attribute 'kernel' must be specified}}
+func @launch_func_missing_parent_module_attribute(%sz : index) {
+  // expected-error@+1 {{expected the closest surrounding module to have the 'gpu.container_module' attribute}}
   "gpu.launch_func"(%sz, %sz, %sz, %sz, %sz, %sz) {foo = "bar"}
       : (index, index, index, index, index, index) -> ()
   return
@@ -105,54 +105,134 @@ func @launch_func_missing_parent_module_attribute(%sz : index) {
 }
 
 // -----
 
-func @launch_func_no_function_attribute(%sz : index) {
-  // expected-error@+1 {{attribute 'kernel' must be a function}}
-  "gpu.launch_func"(%sz, %sz, %sz, %sz, %sz, %sz) {kernel = 10}
-      : (index, index, index, index, index, index) -> ()
-  return
+module attributes {gpu.container_module} {
+  func @launch_func_missing_callee_attribute(%sz : index) {
+    // expected-error@+1 {{string attribute 'kernel' must be specified}}
+    "gpu.launch_func"(%sz, %sz, %sz, %sz, %sz, %sz) {foo = "bar"}
+        : (index, index, index, index, index, index) -> ()
+    return
+  }
 }
 
 // -----
 
-func @launch_func_undefined_function(%sz : index) {
-  // expected-error@+1 {{kernel function '@kernel_1' is undefined}}
-  "gpu.launch_func"(%sz, %sz, %sz, %sz, %sz, %sz) { kernel = @kernel_1 }
-      : (index, index, index, index, index, index) -> ()
-  return
+module attributes {gpu.container_module} {
+  func @launch_func_missing_module_attribute(%sz : index) {
+    // expected-error@+1 {{attribute 'kernel_module' must be specified}}
+    "gpu.launch_func"(%sz, %sz, %sz, %sz, %sz, %sz) {kernel = "launch_func_missing_kernel_attr"}
+        : (index, index, index, index, index, index) -> ()
+    return
+  }
 }
 
 // -----
 
-func @kernel_1(%arg1 : !llvm<"float*">) {
-  return
-}
-
-func @launch_func_missing_kernel_attr(%sz : index, %arg : !llvm<"float*">) {
-  // expected-error@+1 {{kernel function is missing the 'gpu.kernel' attribute}}
-  "gpu.launch_func"(%sz, %sz, %sz, %sz, %sz, %sz, %arg) {kernel = @kernel_1}
-      : (index, index, index, index, index, index, !llvm<"float*">) -> ()
-  return
+module attributes {gpu.container_module} {
+  func @launch_func_no_function_attribute(%sz : index) {
+    // expected-error@+1 {{string attribute 'kernel' must be specified}}
+    "gpu.launch_func"(%sz, %sz, %sz, %sz, %sz, %sz) {kernel = 10}
+        : (index, index, index, index, index, index) -> ()
+    return
+  }
 }
 
 // -----
 
-func @kernel_1(%arg1 : !llvm<"float*">) attributes { gpu.kernel } {
-  return
-}
-
-func @launch_func_kernel_operand_size(%sz : index, %arg : !llvm<"float*">) {
-  // expected-error@+1 {{got 2 kernel operands but expected 1}}
-  "gpu.launch_func"(%sz, %sz, %sz, %sz, %sz, %sz, %arg, %arg)
-      {kernel = @kernel_1}
-      : (index, index, index, index, index, index, !llvm<"float*">,
-         !llvm<"float*">) -> ()
-  return
+module attributes {gpu.container_module} {
+  func @launch_func_module_attribute_wrong_type(%sz : index) {
+    // expected-error@+1 {{symbol reference attribute 'kernel_module' must be specified}}
+    "gpu.launch_func"(%sz, %sz, %sz, %sz, %sz, %sz)
+        {kernel = "launch_func_module_attribute_wrong_type", kernel_module = 10}
+        : (index, index, index, index, index, index) -> ()
+    return
+  }
 }
 
 // -----
 
+module attributes {gpu.container_module} {
+  func @launch_func_undefined_module(%sz : index) {
+    // expected-error@+1 {{kernel module 'kernels' is undefined}}
+    "gpu.launch_func"(%sz, %sz, %sz, %sz, %sz, %sz)
+        { kernel = "kernel_1", kernel_module = @kernels }
+        : (index, index, index, index, index, index) -> ()
+    return
+  }
+}
+
+// -----
+
+module attributes {gpu.container_module} {
+  module @kernels {
+  }
+
+  func @launch_func_missing_module_attribute(%sz : index) {
+    // expected-error@+1 {{module 'kernels' is missing the 'gpu.kernel_module' attribute}}
+    "gpu.launch_func"(%sz, %sz, %sz, %sz, %sz, %sz)
+        { kernel = "kernel_1", kernel_module = @kernels }
+        : (index, index, index, index, index, index) -> ()
+    return
+  }
+}
+
+// -----
+
+module attributes {gpu.container_module} {
+  module @kernels attributes {gpu.kernel_module} {
+  }
+
+  func @launch_func_undefined_function(%sz : index) {
+    // expected-error@+1 {{kernel function 'kernel_1' is undefined}}
+    "gpu.launch_func"(%sz, %sz, %sz, %sz, %sz, %sz)
+        { kernel = "kernel_1", kernel_module = @kernels }
+        : (index, index, index, index, index, index) -> ()
+    return
+  }
+}
+
+// -----
+
+module attributes {gpu.container_module} {
+  module @kernels attributes {gpu.kernel_module} {
+    func @kernel_1(%arg1 : !llvm<"float*">) {
+      return
+    }
+  }
+
+  func @launch_func_missing_kernel_attr(%sz : index, %arg : !llvm<"float*">) {
+    // expected-error@+1 {{kernel function is missing the 'gpu.kernel' attribute}}
+    "gpu.launch_func"(%sz, %sz, %sz, %sz, %sz, %sz, %arg)
+        {kernel = "kernel_1", kernel_module = @kernels}
+        : (index, index, index, index, index, index, !llvm<"float*">) -> ()
+    return
+  }
+}
+
+// -----
+
+module attributes {gpu.container_module} {
+  module @kernels attributes {gpu.kernel_module} {
+    func @kernel_1(%arg1 : !llvm<"float*">) attributes { gpu.kernel } {
+      return
+    }
+  }
+
+  func @launch_func_kernel_operand_size(%sz : index, %arg : !llvm<"float*">) {
+    // expected-error@+1 {{got 2 kernel operands but expected 1}}
+    "gpu.launch_func"(%sz, %sz, %sz, %sz, %sz, %sz, %arg, %arg)
+        {kernel = "kernel_1", kernel_module = @kernels}
+        : (index, index, index, index, index, index, !llvm<"float*">,
+           !llvm<"float*">) -> ()
+    return
+  }
+}
+
+// -----
+
+module @kernels attributes {gpu.kernel_module} {
+  func @kernel_1(%arg1 : !llvm<"float*">) attributes { gpu.kernel } {
+    return
+  }
+}
+
 // Due to the ordering of the current impl of lowering and LLVMLowering, type
 // checks need to be temporarily disabled.
 // TODO(ntv,zinenko,herhut): reactivate checks once "changing gpu.launchFunc
 // to encode target module" has landed.
@@ -162,7 +242,7 @@ func @kernel_1(%arg1 : !llvm<"float*">) attributes { gpu.kernel } {
 // func @launch_func_kernel_operand_types(%sz : index, %arg : f32) {
 //   // expected-err@+1 {{type of function argument 0 does not match}}
 //   "gpu.launch_func"(%sz, %sz, %sz, %sz, %sz, %sz, %arg)
-//       {kernel = @kernel_1}
+//       {kernel = "kernel_1"}
 //       : (index, index, index, index, index, index, f32) -> ()
 //   return
 // }
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
index 7c8f682dcc84..d2e71e18f9f2 100644
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -1,105 +1,113 @@
 // RUN: mlir-opt %s | FileCheck %s
 
-// CHECK-LABEL:func @no_args(%{{.*}}: index)
-func @no_args(%sz : index) {
-  // CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}})
-  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %sz, %grid_y = %sz, %grid_z = %sz)
-             threads(%tx, %ty, %tz) in (%block_x = %sz, %block_y = %sz, %block_z = %sz) {
-    // CHECK: gpu.return
-    gpu.return
-  }
-  return
-}
+module attributes {gpu.container_module} {
+
+  // CHECK-LABEL:func @no_args(%{{.*}}: index)
+  func @no_args(%sz : index) {
+    // CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}})
+    gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %sz, %grid_y = %sz, %grid_z = %sz)
+               threads(%tx, %ty, %tz) in (%block_x = %sz, %block_y = %sz, %block_z = %sz) {
+      // CHECK: gpu.return
+      gpu.return
+    }
+    return
+  }
 
-// CHECK-LABEL:func @args(%{{.*}}: index, %{{.*}}: index, %{{.*}}: f32, %{{.*}}: memref<?xf32>) {
-func @args(%blk : index, %thrd : index, %float : f32, %data : memref<?xf32>) {
-  // CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) args(%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) : f32, memref<?xf32>
-  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %blk, %grid_y = %blk, %grid_z = %blk)
-             threads(%tx, %ty, %tz) in (%block_x = %thrd, %block_y = %thrd, %block_z = %thrd)
-             args(%kernel_arg0 = %float, %kernel_arg1 = %data) : f32, memref<?xf32> {
-    // CHECK: gpu.return
-    gpu.return
-  }
-  return
-}
+  // CHECK-LABEL:func @args(%{{.*}}: index, %{{.*}}: index, %{{.*}}: f32, %{{.*}}: memref<?xf32>) {
+  func @args(%blk : index, %thrd : index, %float : f32, %data : memref<?xf32>) {
+    // CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) args(%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) : f32, memref<?xf32>
+    gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %blk, %grid_y = %blk, %grid_z = %blk)
+               threads(%tx, %ty, %tz) in (%block_x = %thrd, %block_y = %thrd, %block_z = %thrd)
+               args(%kernel_arg0 = %float, %kernel_arg1 = %data) : f32, memref<?xf32> {
+      // CHECK: gpu.return
+      gpu.return
+    }
+    return
+  }
-// It is possible to use values passed into the region as arguments.
-// CHECK-LABEL: func @passing_values
-func @passing_values(%blk : index, %thrd : index, %float : f32, %data : memref<?xf32>) {
-  // CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) args(%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) : f32, memref<?xf32>
-  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %blk, %grid_y = %blk, %grid_z = %blk)
-             threads(%tx, %ty, %tz) in (%block_x = %thrd, %block_y = %thrd, %block_z = %thrd)
-             args(%kernel_arg0 = %float, %kernel_arg1 = %data) : f32, memref<?xf32> {
-    // CHECK: "use"(%{{.*}})
-    "use"(%kernel_arg0): (f32) -> ()
-    // CHECK: gpu.return
-    gpu.return
-  }
-  return
-}
+  // It is possible to use values passed into the region as arguments.
+  // CHECK-LABEL: func @passing_values
+  func @passing_values(%blk : index, %thrd : index, %float : f32, %data : memref<?xf32>) {
+    // CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) args(%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) : f32, memref<?xf32>
+    gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %blk, %grid_y = %blk, %grid_z = %blk)
+               threads(%tx, %ty, %tz) in (%block_x = %thrd, %block_y = %thrd, %block_z = %thrd)
+               args(%kernel_arg0 = %float, %kernel_arg1 = %data) : f32, memref<?xf32> {
+      // CHECK: "use"(%{{.*}})
+      "use"(%kernel_arg0): (f32) -> ()
+      // CHECK: gpu.return
+      gpu.return
+    }
+    return
+  }
 
-// It is possible to use values defined in nested regions as long as they don't
-// cross kernel launch region boundaries.
-// CHECK-LABEL: func @nested_isolation
-func @nested_isolation(%sz : index) {
-  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %sz, %grid_y = %sz, %grid_z = %sz)
-             threads(%tx, %ty, %tz) in (%block_x = %sz, %block_y = %sz, %block_z = %sz) {
-    "region"() ({
-      // CHECK: %{{.*}} = "produce"()
-      %val = "produce"() : () -> (index)
-      "region"() ({
-        // CHECK: "use"(%{{.*}})
-        "use"(%val) : (index) -> ()
-      }) : () -> ()
-    }) : () -> ()
-    // CHECK: gpu.return
-    gpu.return
-  }
-  return
-}
+  // It is possible to use values defined in nested regions as long as they
+  // don't cross kernel launch region boundaries.
+  // CHECK-LABEL: func @nested_isolation
+  func @nested_isolation(%sz : index) {
+    gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %sz, %grid_y = %sz, %grid_z = %sz)
+               threads(%tx, %ty, %tz) in (%block_x = %sz, %block_y = %sz, %block_z = %sz) {
+      "region"() ({
+        // CHECK: %{{.*}} = "produce"()
+        %val = "produce"() : () -> (index)
+        "region"() ({
+          // CHECK: "use"(%{{.*}})
+          "use"(%val) : (index) -> ()
+        }) : () -> ()
+      }) : () -> ()
+      // CHECK: gpu.return
+      gpu.return
+    }
+    return
+  }
 
-func @kernel_1(%arg0 : f32, %arg1 : memref<?xf32>)
-    attributes { gpu.kernel } {
-  %tIdX = "gpu.thread_id"() {dimension = "x"} : () -> (index)
-  %tIdY = "gpu.thread_id"() {dimension = "y"} : () -> (index)
-  %tIdZ = "gpu.thread_id"() {dimension = "z"} : () -> (index)
-
-  %bDimX = "gpu.block_dim"() {dimension = "x"} : () -> (index)
-  %bDimY = "gpu.block_dim"() {dimension = "y"} : () -> (index)
-  %bDimZ = "gpu.block_dim"() {dimension = "z"} : () -> (index)
-
-  %bIdX = "gpu.block_id"() {dimension = "x"} : () -> (index)
-  %bIdY = "gpu.block_id"() {dimension = "y"} : () -> (index)
-  %bIdZ = "gpu.block_id"() {dimension = "z"} : () -> (index)
-
-  %gDimX = "gpu.grid_dim"() {dimension = "x"} : () -> (index)
-  %gDimY = "gpu.grid_dim"() {dimension = "y"} : () -> (index)
-  %gDimZ = "gpu.grid_dim"() {dimension = "z"} : () -> (index)
-
-  %one = constant 1.0 : f32
-  %sum = "gpu.all_reduce"(%one) : (f32) -> (f32)
-
-  "some_op"(%bIdX, %tIdX) : (index, index) -> ()
-  %42 = load %arg1[%bIdX] : memref<?xf32>
-  return
-}
-
-func @kernel_2(f32, memref<?xf32>)
-    attributes { gpu.kernel }
+  module @kernels attributes {gpu.kernel_module} {
+    func @kernel_1(%arg0 : f32, %arg1 : memref<?xf32>)
+        attributes { gpu.kernel } {
+      %tIdX = "gpu.thread_id"() {dimension = "x"} : () -> (index)
+      %tIdY = "gpu.thread_id"() {dimension = "y"} : () -> (index)
+      %tIdZ = "gpu.thread_id"() {dimension = "z"} : () -> (index)
+
+      %bDimX = "gpu.block_dim"() {dimension = "x"} : () -> (index)
+      %bDimY = "gpu.block_dim"() {dimension = "y"} : () -> (index)
+      %bDimZ = "gpu.block_dim"() {dimension = "z"} : () -> (index)
+
+      %bIdX = "gpu.block_id"() {dimension = "x"} : () -> (index)
+      %bIdY = "gpu.block_id"() {dimension = "y"} : () -> (index)
+      %bIdZ = "gpu.block_id"() {dimension = "z"} : () -> (index)
+
+      %gDimX = "gpu.grid_dim"() {dimension = "x"} : () -> (index)
+      %gDimY = "gpu.grid_dim"() {dimension = "y"} : () -> (index)
+      %gDimZ = "gpu.grid_dim"() {dimension = "z"} : () -> (index)
+
+      %one = constant 1.0 : f32
+      %sum = "gpu.all_reduce"(%one) : (f32) -> (f32)
+
+      "some_op"(%bIdX, %tIdX) : (index, index) -> ()
+      %42 = load %arg1[%bIdX] : memref<?xf32>
+      return
+    }
+
+    func @kernel_2(f32, memref<?xf32>)
+        attributes { gpu.kernel }
+  }
 
-func @foo() {
-  %0 = "op"() : () -> (f32)
-  %1 = "op"() : () -> (memref<?xf32>)
-  // CHECK: %{{.*}} = constant 8
-  %cst = constant 8 : index
-
-  // CHECK: "gpu.launch_func"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {kernel = @kernel_1} : (index, index, index, index, index, index, f32, memref<?xf32>) -> ()
-  "gpu.launch_func"(%cst, %cst, %cst, %cst, %cst, %cst, %0, %1) { kernel = @kernel_1 }
-      : (index, index, index, index, index, index, f32, memref<?xf32>) -> ()
-
-  // CHECK: "gpu.launch_func"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {kernel = @kernel_2} : (index, index, index, index, index, index, f32, memref<?xf32>) -> ()
-  "gpu.launch_func"(%cst, %cst, %cst, %cst, %cst, %cst, %0, %1) { kernel = @kernel_2 }
-      : (index, index, index, index, index, index, f32, memref<?xf32>) -> ()
-
-  return
-}
+  func @foo() {
+    %0 = "op"() : () -> (f32)
+    %1 = "op"() : () -> (memref<?xf32>)
+    // CHECK: %{{.*}} = constant 8
+    %cst = constant 8 : index
+
+    // CHECK: "gpu.launch_func"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {kernel = "kernel_1", kernel_module = @kernels} : (index, index, index, index, index, index, f32, memref<?xf32>) -> ()
+    "gpu.launch_func"(%cst, %cst, %cst, %cst, %cst, %cst, %0, %1)
+        { kernel = "kernel_1", kernel_module = @kernels }
+        : (index, index, index, index, index, index, f32, memref<?xf32>) -> ()
+
+    // CHECK: "gpu.launch_func"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {kernel = "kernel_2", kernel_module = @kernels} : (index, index, index, index, index, index, f32, memref<?xf32>) -> ()
+    "gpu.launch_func"(%cst, %cst, %cst, %cst, %cst, %cst, %0, %1)
+        { kernel = "kernel_2", kernel_module = @kernels }
+        : (index, index, index, index, index, index, f32, memref<?xf32>) -> ()
+
+    return
+  }
+
+}
diff --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir
index 5f31486084ba..8398907b6c0a 100644
--- a/mlir/test/Dialect/GPU/outlining.mlir
+++ b/mlir/test/Dialect/GPU/outlining.mlir
@@ -1,5 +1,7 @@
 // RUN: mlir-opt -gpu-kernel-outlining -split-input-file -verify-diagnostics %s | FileCheck %s
 
+// CHECK: module attributes {gpu.container_module}
+
 // CHECK-LABEL: func @launch()
 func @launch() {
   // CHECK: %[[ARG0:.*]] = "op"() : () -> f32
@@ -19,7 +21,7 @@ func @launch() {
   // CHECK: %[[BDIMZ:.*]] = constant 28
   %bDimZ = constant 28 : index
 
-  // CHECK: "gpu.launch_func"(%[[GDIMX]], %[[GDIMY]], %[[GDIMZ]], %[[BDIMX]], %[[BDIMY]], %[[BDIMZ]], %[[ARG0]], %[[ARG1]]) {kernel = @launch_kernel} : (index, index, index, index, index, index, f32, memref<?xf32>) -> ()
+  // CHECK: "gpu.launch_func"(%[[GDIMX]], %[[GDIMY]], %[[GDIMZ]], %[[BDIMX]], %[[BDIMY]], %[[BDIMZ]], %[[ARG0]], %[[ARG1]]) {kernel = "launch_kernel", kernel_module = @launch_kernel} : (index, index, index, index, index, index, f32, memref<?xf32>) -> ()
   // CHECK-NOT: gpu.launch blocks
   gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY,
                                        %grid_z = %gDimZ)
@@ -34,11 +36,9 @@ func @launch() {
   return
 }
 
-// CHECK-LABEL: func @launch_kernel
-// CHECK-SAME: (f32, memref<?xf32>)
-// CHECK-NEXT: attributes {gpu.kernel}
-
-// CHECK-LABEL: func @launch_kernel
+// CHECK-LABEL: module @launch_kernel
+// CHECK-NEXT: func @launch_kernel
 // CHECK-SAME: (%[[KERNEL_ARG0:.*]]: f32, %[[KERNEL_ARG1:.*]]: memref<?xf32>)
 // CHECK-NEXT: attributes {gpu.kernel}
 // CHECK-NEXT: %[[BID:.*]] = "gpu.block_id"() {dimension = "x"} : () -> index
@@ -59,17 +59,19 @@ func @launch()
 
 // -----
 
+// CHECK: module attributes {gpu.container_module}
+
 func @multiple_launches() {
   // CHECK: %[[CST:.*]] = constant 8 : index
   %cst = constant 8 : index
-  // CHECK: "gpu.launch_func"(%[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]]) {kernel = @multiple_launches_kernel} : (index, index, index, index, index, index) -> ()
+  // CHECK: "gpu.launch_func"(%[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]]) {kernel = "multiple_launches_kernel", kernel_module = @multiple_launches_kernel} : (index, index, index, index, index, index) -> ()
   gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst,
                                        %grid_z = %cst)
             threads(%tx, %ty, %tz) in (%block_x = %cst, %block_y = %cst,
                                        %block_z = %cst) {
    gpu.return
  }
-  // CHECK: "gpu.launch_func"(%[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]]) {kernel = @multiple_launches_kernel_0} : (index, index, index, index, index, index) -> ()
+  // CHECK: "gpu.launch_func"(%[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]]) {kernel = "multiple_launches_kernel", kernel_module = @multiple_launches_kernel_0} : (index, index, index, index, index, index) -> ()
   gpu.launch blocks(%bx2, %by2, %bz2) in (%grid_x2 = %cst, %grid_y2 = %cst,
                                           %grid_z2 = %cst)
             threads(%tx2, %ty2, %tz2) in (%block_x2 = %cst, %block_y2 = %cst,
@@ -79,8 +81,10 @@ func @multiple_launches() {
   return
 }
 
-// CHECK: func @multiple_launches_kernel()
-// CHECK: func @multiple_launches_kernel_0()
+// CHECK: module @multiple_launches_kernel
+// CHECK: func @multiple_launches_kernel
+// CHECK: module @multiple_launches_kernel_0
+// CHECK: func @multiple_launches_kernel
 
 // -----
 
@@ -89,7 +93,7 @@ func @extra_constants(%arg0 : memref<?xf32>) {
   // CHECK: %[[CST:.*]] = constant 8 : index
   %cst = constant 8 : index
   %cst2 = constant 2 : index
   %cst3 = constant 3 : index
-  // CHECK: "gpu.launch_func"(%[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %{{.*}}) {kernel = @extra_constants_kernel} : (index, index, index, index, index, index, memref<?xf32>) -> ()
+  // CHECK: "gpu.launch_func"(%[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %{{.*}}) {kernel = "extra_constants_kernel", kernel_module = @extra_constants_kernel} : (index, index, index, index, index, index, memref<?xf32>) -> ()
   gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst,
                                        %grid_z = %cst)
             threads(%tx, %ty, %tz) in (%block_x = %cst, %block_y = %cst,