[mlir][transform] Create GPU transform dialect

This revision adds the GPU transform dialect. It also introduces the `transform.gpu` prefix for all ops in this dialect.

MLIR already had two GPU transform ops among the Linalg transform ops. This revision moves them into GPUTransformOps. The ops are renamed as follows:

`transform.structured.map_nested_foreach_thread_to_gpu_blocks` -> `transform.gpu.map_foreach_to_blocks`
This op selects the outermost (top-level) `scf.foreach_thread` and parallelizes it across GPU blocks. It can also generate the enclosing `gpu.launch` op.

`transform.structured.map_nested_foreach_thread_to_gpu_threads` -> `transform.gpu.map_nested_foreach_to_threads`
This op parallelizes the `scf.foreach_thread` ops nested inside a `gpu.launch` across GPU threads.

This revision does not add new functionality; it only includes some minor refactoring of the existing code.
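As an illustrative usage sketch, the updated test file in this revision drives the renamed ops roughly as follows (the match pattern, handle names, and block sizes are taken from those tests; the surrounding `transform.with_pdl_patterns` wrapper is omitted):

```
transform.sequence %arg0 failures(propagate) {
^bb1(%arg1: !pdl.operation):
  %funcop = transform.structured.match ops{["func.func"]} in %arg0
  // Wrap the top-level scf.foreach_thread in a generated gpu.launch and map
  // it to GPU blocks.
  %gpuLaunch = transform.gpu.map_foreach_to_blocks %funcop { generate_gpu_launch }
  // Map the scf.foreach_thread ops nested in the launch to GPU threads.
  transform.gpu.map_nested_foreach_to_threads %gpuLaunch { blockDim = [32, 4, 1] }
}
```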

Reviewed By: ftynse

Differential Revision: https://reviews.llvm.org/D134800
Guray Ozen 2022-10-03 09:56:42 +02:00
parent 491ac8f3e8
commit 89bb0cae46
13 changed files with 851 additions and 572 deletions


@@ -1,2 +1,3 @@
add_subdirectory(IR)
add_subdirectory(Transforms)
add_subdirectory(TransformOps)


@@ -0,0 +1,6 @@
set(LLVM_TARGET_DEFINITIONS GPUTransformOps.td)
mlir_tablegen(GPUTransformOps.h.inc -gen-op-decls)
mlir_tablegen(GPUTransformOps.cpp.inc -gen-op-defs)
add_public_tablegen_target(MLIRGPUTransformOpsIncGen)
add_mlir_doc(GPUTransformOps GPUTransformOps Dialects/ -gen-op-doc)


@@ -0,0 +1,75 @@
//===- GPUTransformOps.h - GPU transform ops --------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef MLIR_DIALECT_GPU_TRANSFORMOPS_GPUTRANSFORMOPS_H
#define MLIR_DIALECT_GPU_TRANSFORMOPS_GPUTRANSFORMOPS_H
#include "mlir/Dialect/PDL/IR/PDLTypes.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
#include "mlir/IR/OpImplementation.h"
#include "mlir/IR/PatternMatch.h"
namespace mlir {
namespace gpu {
class GpuOp;
} // namespace gpu
} // namespace mlir
//===----------------------------------------------------------------------===//
// GPU Transform Operations
//===----------------------------------------------------------------------===//
#define GET_OP_CLASSES
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h.inc"
namespace mlir {
class DialectRegistry;
namespace transform {
namespace gpu {
/// Searches `scf.foreach_thread` ops nested under `target` and maps each such
/// op to GPU threads. Mapping is one-to-one and the induction variables of
/// `scf.foreach_thread` are rewritten to gpu.thread_id according to the
/// thread_dim_mapping attribute. Sibling `scf.foreach_thread` are supported in
/// which case, the union of the number of threads is computed and may result in
/// predication. Dynamic `scf.foreach_thread` trip counts are currently not
/// supported. Dynamic block dim sizes are currently not supported.
DiagnosedSilenceableFailure
mapNestedForeachToThreadsImp(RewriterBase &rewriter, Operation *target,
const SmallVectorImpl<int64_t> &blockDim,
bool syncAfterDistribute,
llvm::Optional<TransformOpInterface> transformOp);
/// Maps the top level `scf.foreach_thread` op to GPU Thread Blocks. Mapping is
/// one-to-one and the induction variables of `scf.foreach_thread` are rewritten
/// to gpu.block_id according to the thread_dim_mapping attribute. Dynamic
/// `scf.foreach_thread` trip counts are currently not supported. Dynamic block
/// dim sizes are currently not supported.
DiagnosedSilenceableFailure mapForeachToBlocksImp(
RewriterBase &rewriter, scf::ForeachThreadOp foreachThreadOp,
function_ref<void(RewriterBase &, scf::ForeachThreadOp,
SmallVectorImpl<Value> &)>
blockIdGenerator,
SmallVectorImpl<int64_t> &gridDims, TransformOpInterface transformOp);
/// Finds the top level scf::ForeachThreadOp of given target.
DiagnosedSilenceableFailure
findTopLevelForeachThreadOp(Operation *target,
scf::ForeachThreadOp &topLevelForeachThreadOp,
TransformOpInterface transformOp);
} // namespace gpu
} // namespace transform
namespace gpu {
void registerTransformDialectExtension(DialectRegistry &registry);
} // namespace gpu
} // namespace mlir
#endif // MLIR_DIALECT_GPU_TRANSFORMOPS_GPUTRANSFORMOPS_H


@@ -0,0 +1,175 @@
//===- GPUTransformOps.td - GPU transform ops --------------*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef GPU_TRANSFORM_OPS
#define GPU_TRANSFORM_OPS
include "mlir/Dialect/Transform/IR/TransformDialect.td"
include "mlir/Dialect/Transform/IR/TransformEffects.td"
include "mlir/Dialect/Transform/IR/TransformInterfaces.td"
include "mlir/Dialect/PDL/IR/PDLTypes.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/IR/OpBase.td"
def MapNestedForeachToThreads :
Op<Transform_Dialect, "gpu.map_nested_foreach_to_threads",
[FunctionalStyleTransformOpTrait,
MemoryEffectsOpInterface,
TransformEachOpTrait,
TransformOpInterface]> {
let description = [{
Target the `gpu.launch` op and rewrite all `scf.foreach_thread` ops
nested in it so that they are distributed to `gpu.thread_id`.
The operation searches for `scf.foreach_thread` ops nested under `target`
and maps each such op to GPU threads. Mapping is one-to-one and the
induction variables of `scf.foreach_thread` are rewritten to
`gpu.thread_id` according to the `thread_dim_mapping` attribute.
Sibling `scf.foreach_thread` are supported in which case, the union of
the number of threads is computed and may result in predication.
Multiple scf.foreach_thread are supported per `gpu.launch` in which case,
the max of all the threads is computed and taken for the global
`gpu.thread_id`. If necessary, `scf.foreach_thread` that do not use the
whole thread range result in predicated computations.
Dynamic `scf.foreach_thread` trip counts are currently not supported.
Dynamic block dim sizes are currently not supported.
Only **bufferized** `scf.foreach_thread` are currently supported.
Only `scf.foreach_thread` distributed to **at most 3 dimensions** are
currently supported.
Barriers are inserted after each scf.foreach_thread op for now.
The operation alters the block size of the given `gpu.launch` using the
`blockDim` argument.
#### Return modes:
This operation ignores non-`gpu.launch` ops and drops them in the return.
If any scf.foreach_thread with tensors is found, the transform definitely
fails.
If all the scf.foreach_thread operations contained within the LaunchOp
referred to by the `target` PDLOperation lower to GPU properly, the
transform succeeds. Otherwise the transform definitely fails.
The returned handle points to the same LaunchOp operand, consuming it and
producing a new SSA value to satisfy chaining and linearity of the IR
properties.
#### Example:
```
gpu.launch blocks(%bx, %by, %bz) in (%x = %0, %y = %1, %z = %2)
threads(%tx, %ty, %tz) in (%tx = %3, %ty = %4, %tz = %5) {
scf.foreach_thread (%i, %j) in (7, 9) {
... // body 1
} {thread_dim_mapping = [1, 0, 2]}
scf.foreach_thread (%i) in (12) {
... // body 2
}
gpu.terminator
}
```
is translated to:
```
%bdimX = arith.constant 12 : index
%bdimY = arith.constant 9 : index
gpu.launch blocks(%bx, %by, %bz) in (%x = %0, %y = %1, %z = %2)
threads(%tx, %ty, %tz) in (%tx = %bdimX, %ty = %bdimY, %tz = %5) {
if (threadIdx.x < 9 && threadIdx.y < 7) {
... // body 1
}
gpu.barrier
if (threadIdx.y < 1) {
... // body 2
}
gpu.barrier
gpu.terminator
}
```
}];
let arguments = (ins PDL_Operation:$target,
DefaultValuedAttr<I64ArrayAttr, "{}">:$blockDim,
DefaultValuedAttr<BoolAttr, "true">:$syncAfterDistribute);
let results = (outs PDL_Operation:$result);
let assemblyFormat = "$target attr-dict";
let extraClassDeclaration = [{
::mlir::DiagnosedSilenceableFailure applyToOne(
::mlir::Operation *target,
::llvm::SmallVectorImpl<::mlir::Operation *> &results,
::mlir::transform::TransformState &state);
}];
}
def MapForeachToBlocks :
Op<Transform_Dialect, "gpu.map_foreach_to_blocks",
[FunctionalStyleTransformOpTrait,
MemoryEffectsOpInterface,
TransformOpInterface,
TransformEachOpTrait]> {
let description = [{
Target the `gpu.launch` op and rewrite the top-level `scf.foreach_thread`
so that it is distributed to `gpu.block_id`. If the `generate_gpu_launch`
attribute is set, a `gpu.launch` is generated first and the top-level
`scf.foreach_thread` is moved inside it.
The operation searches for top-level `scf.foreach_thread` ops under
`gpu.launch` and maps each such op to GPU blocks. Mapping is
one-to-one and the induction variables of `scf.foreach_thread` are
rewritten to `gpu.block_id` according to the `thread_dim_mapping` attribute.
Dynamic `scf.foreach_thread` trip counts are currently not supported.
Dynamic block dim sizes are currently not supported.
Only **bufferized** scf.foreach_thread are currently supported.
Only scf.foreach_thread distributed to **at most 3 dimensions** are
currently supported.
The operation alters the grid size of the given `gpu.launch` using the
`gridDim` argument.
#### Return modes:
This operation ignores non-`gpu.launch` ops and drops them in the return.
If any scf.foreach_thread with tensors is found, the transform definitely
fails.
If all the scf.foreach_thread operations contained within the LaunchOp
referred to by the `target` PDLOperation lower to GPU properly, the
transform succeeds. Otherwise the transform definitely fails.
The returned handle points to the same LaunchOp operand, consuming it and
producing a new SSA value to satisfy chaining and linearity of the IR
properties.
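#### Example:
A minimal usage sketch (illustrative only; the match pattern and handle
names are taken from the tests updated in this revision):
```
%funcop = transform.structured.match ops{["func.func"]} in %arg0
%gpuLaunch = transform.gpu.map_foreach_to_blocks %funcop { generate_gpu_launch }
```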
}];
let arguments = (ins PDL_Operation:$target,
DefaultValuedAttr<I64ArrayAttr, "{}">:$gridDim,
UnitAttr:$generate_gpu_launch);
let results = (outs PDL_Operation:$result);
let assemblyFormat = "$target attr-dict";
let extraClassDeclaration = [{
::mlir::DiagnosedSilenceableFailure applyToOne(
::mlir::Operation *target,
::llvm::SmallVectorImpl<::mlir::Operation *> &results,
::mlir::transform::TransformState &state);
}];
}
#endif // GPU_TRANSFORM_OPS


@@ -751,161 +751,6 @@ def TileToForeachThreadOp :
}];
}
def MapNestedForeachThreadToGpuThreads :
Op<Transform_Dialect, "structured.map_nested_foreach_thread_to_gpu_threads",
[FunctionalStyleTransformOpTrait,
MemoryEffectsOpInterface,
TransformEachOpTrait,
TransformOpInterface]> {
let description = [{
Target the gpu_launch op and rewrite all scf.foreach_thread
to distributed gpu.thread_id attribute.
The operation searches `scf.foreach_thread` ops nested under `target`
and maps each such op to GPU threads. Mapping is one-to-one and the
induction variables of `scf.foreach_thread` are rewritten to
gpu.thread_id according to the thread_dim_apping attribute.
Sibling `scf.foreach_thread` are supported in which case, the union of
the number of threads is computed and may result in predication.
Multiple scf.foreach_thread are supported per function in which case, the
max of all the threads is computed and taken for the global gpu.thread_id.
If necessary, scf.foreach_thread that do not use the whole thread range
result in predicated computations.
Dynamic, `scf.foreach_thread` trip counts are currently not supported.
Dynamic block dim sizes are currently not supported.
Only **bufferized** scf.foreach_thread are currently supported.
Only scf.foreach_thread distributed to **at most 3 dimensions** are
currently supported.
Barriers are inserted after each scf.foreach_thread op for now.
The operation alters the block size of the given gpu_launch using
blockDim argument.
#### Return modes:
This operation ignores non-gpu_launch ops and drops them in the return.
If any scf.foreach_thread with tensors is found, the transform definitely
fails.
If all the scf.foreach_thread operations contained within the LaunchOp
referred to by the `target` PDLOperation lower to GPU properly, the
transform succeeds. Otherwise the transform definitely fails.
The returned handle points to the same LaunchOp operand, consuming it and
producing a new SSA value to satisfy chaining and linearity of the IR
properties.
#### Example:
```
gpu.launch blocks(%bx, %by, %bz) in (%x = %0, %y = %1, %z = %2)
threads(%tx, %ty, %tz) in (%tx = %3, %ty = %4, %tz = %5) {
scf.foreach_thread (%i, %j) in (7, 9) {
... // body 1
} {thread_dim_mapping = [1, 0, 2]}
scf.foreach_thread (%i) in (12) {
... // body 2
}
gpu.terminator
}
```
is translated to:
```
%bdimX = arith.constant 12 : index
%bdimY = arith.constant 9 : index
gpu.launch blocks(%bx, %by, %bz) in (%x = %0, %y = %1, %z = %2)
threads(%tx, %ty, %tz) in (%tx = %bdimX, %ty = %bdimY, %tz = %5) {
if (threadIdx.x < 9 && threadIdx.y < 7) {
... // body 1
}
gpu.barrier
if (threadIdx.y < 1) {
... // body 2
}
gpu.barrier
gpu.terminator
}
```
}];
let arguments = (ins PDL_Operation:$target,
DefaultValuedAttr<I64ArrayAttr, "{}">:$blockDim,
DefaultValuedAttr<BoolAttr, "true">:$syncAfterDistribute);
let results = (outs PDL_Operation:$result);
let assemblyFormat = "$target attr-dict";
let extraClassDeclaration = [{
::mlir::DiagnosedSilenceableFailure applyToOne(
::mlir::Operation *target,
::llvm::SmallVectorImpl<::mlir::Operation *> &results,
::mlir::transform::TransformState &state);
}];
}
def MapNestedForeachThreadToGpuBlocks : Op<Transform_Dialect,
"structured.map_nested_foreach_thread_to_gpu_blocks",
[FunctionalStyleTransformOpTrait,
MemoryEffectsOpInterface,
TransformOpInterface,
TransformEachOpTrait]> {
let description = [{
Target the gpu_launch op and rewrite the top level `scf.foreach_thread`
to distributed gpu.block_id attribute. If `generate_gpu_launch` attribute
is set, then first generates `gpu_launch` and moves the top level
`scf.foreach_thread` inside.
The operation searches top level `scf.foreach_thread` ops under
`gpu_launch` and maps each such op to GPU blocks. Mapping is
one-to-one and the induction variables of `scf.foreach_thread` are
rewritten to gpu.block_id according to the `thread_dim_apping` attribute.
Dynamic, `scf.foreach_thread` trip counts are currently not supported.
Dynamic block dim sizes are currently not supported.
Only **bufferized** scf.foreach_thread are currently supported.
Only scf.foreach_thread distributed to **at most 3 dimensions** are
currently supported.
The operation alters the block size of the given gpu_launch using
gridDim argument.
#### Return modes:
This operation ignores non-gpu_launch ops and drops them in the return.
If any scf.foreach_thread with tensors is found, the transform definitely
fails.
If all the scf.foreach_thread operations contained within the LaunchOp
referred to by the `target` PDLOperation lower to GPU properly, the
transform succeeds. Otherwise the transform definitely fails.
The returned handle points to the same LaunchOp operand, consuming it and
producing a new SSA value to satisfy chaining and linearity of the IR
properties.
}];
let arguments = (ins PDL_Operation:$target,
DefaultValuedAttr<I64ArrayAttr, "{}">:$gridDim,
UnitAttr:$generate_gpu_launch);
let results = (outs PDL_Operation:$result);
let assemblyFormat = "$target attr-dict";
let extraClassDeclaration = [{
::mlir::DiagnosedSilenceableFailure applyToOne(
::mlir::Operation *target,
::llvm::SmallVectorImpl<::mlir::Operation *> &results,
::mlir::transform::TransformState &state);
}];
}
def VectorizeOp : Op<Transform_Dialect, "structured.vectorize",
[FunctionalStyleTransformOpTrait, MemoryEffectsOpInterface,
TransformEachOpTrait, TransformOpInterface]> {


@@ -125,32 +125,6 @@ bool areElementwiseOpsFusable(OpOperand *fusedOperand);
FailureOr<Operation *> fuseElementwiseOps(RewriterBase &rewriter,
OpOperand *fusedOperand);
/// Maps the top level `scf.foreach_thread` op to GPU Thread Blocks. Mapping is
/// one-to-one and the induction variables of `scf.foreach_thread` are rewritten
/// to gpu.block_id according to the thread_dim_apping attribute. Dynamic,
/// `scf.foreach_thread` trip counts are currently not supported. Dynamic block
/// dim sizes are currently not supported.
LogicalResult rewriteTopLevelForeachThreadToGpuBlocks(
RewriterBase &rewriter, scf::ForeachThreadOp foreachThreadOp,
function_ref<void(RewriterBase &, scf::ForeachThreadOp,
SmallVector<Value> &)>
blockIdGenerator,
SmallVector<int64_t> &gridDims);
/// Finds the top level scf::ForeachThreadOp of given target.
FailureOr<scf::ForeachThreadOp> findTopLevelForeachThreadOp(Operation *target);
/// Searches `scf.foreach_thread` ops nested under `target` and maps each such
/// op to GPU threads. Mapping is one-to-one and the induction variables of
/// `scf.foreach_thread` are rewritten to gpu.thread_id according to the
/// thread_dim_apping attribute. Sibling `scf.foreach_thread` are supported in
/// which case, the union of the number of threads is computed and may result in
/// predication. Dynamic, `scf.foreach_thread` trip counts are currently not
/// supported. Dynamic block dim sizes are currently not supported.
mlir::WalkResult rewriteMapNestedForeachThreadToGpuThreads(
RewriterBase &rewriter, Operation *target,
const SmallVector<int64_t> &blockDim, bool syncAfterDistribute);
/// Split the given `op` into two parts along the given iteration space
/// `dimension` at the specified `splitPoint`, and return the two parts.
///


@@ -31,6 +31,7 @@
#include "mlir/Dialect/EmitC/IR/EmitC.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
@@ -115,6 +116,7 @@ inline void registerAllDialects(DialectRegistry &registry) {
linalg::registerTransformDialectExtension(registry);
memref::registerTransformDialectExtension(registry);
scf::registerTransformDialectExtension(registry);
gpu::registerTransformDialectExtension(registry);
// Register all external models.
arith::registerBufferizableOpInterfaceExternalModels(registry);


@@ -82,6 +82,8 @@ add_mlir_dialect_library(MLIRGPUTransforms
MLIRTransformUtils
)
add_subdirectory(TransformOps)
if(MLIR_ENABLE_CUDA_RUNNER)
if(NOT MLIR_ENABLE_CUDA_CONVERSIONS)
message(SEND_ERROR


@@ -0,0 +1,18 @@
add_mlir_dialect_library(MLIRGPUTransformOps
GPUTransformOps.cpp
ADDITIONAL_HEADER_DIRS
${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU/TransformOps
DEPENDS
MLIRGPUTransformOpsIncGen
LINK_LIBS PUBLIC
MLIRIR
MLIRGPUTransforms
MLIRParser
MLIRPDLDialect
MLIRSideEffectInterfaces
MLIRTransformDialect
MLIRGPUOps
)


@@ -0,0 +1,507 @@
//===- GPUTransformOps.cpp - Implementation of GPU transform ops ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"
#include "mlir/Dialect/PDL/IR/PDL.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
#include "mlir/IR/Diagnostics.h"
#include "mlir/IR/Value.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
using namespace mlir;
using namespace mlir::gpu;
using namespace mlir::transform;
namespace {
/// A simple pattern rewriter that implements no special logic.
class SimpleRewriter : public PatternRewriter {
public:
SimpleRewriter(MLIRContext *context) : PatternRewriter(context) {}
};
} // namespace
/// Determines if the size of the kernel configuration is supported by the GPU
/// architecture being used. It currently uses CUDA limits; this may be
/// generalized for other GPU architectures.
static DiagnosedSilenceableFailure
checkGpuLimits(TransformOpInterface transformOp, Optional<int64_t> gridDimX,
Optional<int64_t> gridDimY, Optional<int64_t> gridDimZ,
Optional<int64_t> blockDimX, Optional<int64_t> blockDimY,
Optional<int64_t> blockDimZ) {
static constexpr int max_total_blockdim = 1024;
static constexpr int max_blockdimx = 1024;
static constexpr int max_blockdimy = 1024;
static constexpr int max_blockdimz = 64;
static constexpr int max_total_griddim = 2147483647;
static constexpr int max_griddimx = 2147483647;
static constexpr int max_griddimy = 65535;
static constexpr int max_griddimz = 65535;
if ((blockDimX.value_or(1) * blockDimY.value_or(1) * blockDimZ.value_or(1)) >
max_total_blockdim ||
(gridDimX.value_or(1) * gridDimY.value_or(1) * gridDimZ.value_or(1)) >
max_total_griddim ||
blockDimX.value_or(1) > max_blockdimx ||
blockDimY.value_or(1) > max_blockdimy ||
blockDimZ.value_or(1) > max_blockdimz ||
gridDimY.value_or(1) > max_griddimy ||
gridDimZ.value_or(1) > max_griddimz ||
gridDimX.value_or(1) > max_griddimx) {
return transformOp.emitSilenceableError()
<< "Trying to launch a GPU kernel with gridDim = ("
<< gridDimX.value_or(1) << ", " << gridDimY.value_or(1) << ", "
<< gridDimZ.value_or(1) << ") blockDim = (" << blockDimX.value_or(1)
<< ", " << blockDimY.value_or(1) << ", " << blockDimZ.value_or(1)
<< "). It is larger than the limits.";
}
return DiagnosedSilenceableFailure::success();
}
/// Creates an empty-body gpu::LaunchOp using the provided kernel settings and
/// puts a terminator within it.
static DiagnosedSilenceableFailure
createGpuLaunch(RewriterBase &rewriter, Location loc,
TransformOpInterface transformOp, LaunchOp &launchOp,
Optional<int64_t> gridDimX = llvm::None,
Optional<int64_t> gridDimY = llvm::None,
Optional<int64_t> gridDimZ = llvm::None,
Optional<int64_t> blockDimX = llvm::None,
Optional<int64_t> blockDimY = llvm::None,
Optional<int64_t> blockDimZ = llvm::None) {
DiagnosedSilenceableFailure diag =
checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX,
blockDimY, blockDimZ);
if (!diag.succeeded())
return diag;
auto createConst = [&](int dim) {
return rewriter.create<arith::ConstantIndexOp>(loc, dim);
};
OpBuilder::InsertionGuard guard(rewriter);
Value one = createConst(1);
Value gridSizeX = gridDimX.has_value() ? createConst(gridDimX.value()) : one;
Value gridSizeY = gridDimY.has_value() ? createConst(gridDimY.value()) : one;
Value gridSizeZ = gridDimZ.has_value() ? createConst(gridDimZ.value()) : one;
Value blkSizeX = blockDimX.has_value() ? createConst(blockDimX.value()) : one;
Value blkSizeY = blockDimY.has_value() ? createConst(blockDimY.value()) : one;
Value blkSizeZ = blockDimZ.has_value() ? createConst(blockDimZ.value()) : one;
launchOp = rewriter.create<LaunchOp>(loc, gridSizeX, gridSizeY, gridSizeZ,
blkSizeX, blkSizeY, blkSizeZ);
rewriter.setInsertionPointToEnd(&launchOp.getBody().front());
rewriter.create<TerminatorOp>(loc);
return DiagnosedSilenceableFailure(success());
}
/// Alters the grid and block dimensions of the given kernel.
static DiagnosedSilenceableFailure
alterGpuLaunch(SimpleRewriter &rewriter, LaunchOp gpuLaunch,
TransformOpInterface transformOp,
Optional<int64_t> gridDimX = llvm::None,
Optional<int64_t> gridDimY = llvm::None,
Optional<int64_t> gridDimZ = llvm::None,
Optional<int64_t> blockDimX = llvm::None,
Optional<int64_t> blockDimY = llvm::None,
Optional<int64_t> blockDimZ = llvm::None) {
DiagnosedSilenceableFailure diag =
checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX,
blockDimY, blockDimZ);
if (!diag.succeeded())
return diag;
KernelDim3 currentBlockdim = gpuLaunch.getBlockSizeOperandValues();
OpBuilder::InsertionGuard guard(rewriter);
rewriter.setInsertionPointAfterValue(currentBlockdim.x);
auto createConstValue = [&](int dim) {
return rewriter.create<arith::ConstantIndexOp>(currentBlockdim.x.getLoc(),
dim);
};
if (gridDimX.has_value())
gpuLaunch.getGridSizeXMutable().assign(createConstValue(gridDimX.value()));
if (gridDimY.has_value())
gpuLaunch.getGridSizeYMutable().assign(createConstValue(gridDimY.value()));
if (gridDimZ.has_value())
gpuLaunch.getGridSizeZMutable().assign(createConstValue(gridDimZ.value()));
if (blockDimX.has_value())
gpuLaunch.getBlockSizeXMutable().assign(
createConstValue(blockDimX.value()));
if (blockDimY.has_value())
gpuLaunch.getBlockSizeYMutable().assign(
createConstValue(blockDimY.value()));
if (blockDimZ.has_value())
gpuLaunch.getBlockSizeZMutable().assign(
createConstValue(blockDimZ.value()));
return DiagnosedSilenceableFailure::success();
}
//===----------------------------------------------------------------------===//
// MapForeachToBlocks
//===----------------------------------------------------------------------===//
DiagnosedSilenceableFailure mlir::transform::gpu::mapForeachToBlocksImp(
RewriterBase &rewriter, scf::ForeachThreadOp foreachThreadOp,
function_ref<void(RewriterBase &, scf::ForeachThreadOp,
SmallVectorImpl<Value> &)>
blockIdGenerator,
SmallVectorImpl<int64_t> &gridDims, TransformOpInterface transformOp) {
if (foreachThreadOp.getNumResults() > 0)
return transformOp.emitSilenceableError()
<< "only bufferized scf.foreach_thread lowers to gpu.block_id";
if (foreachThreadOp.getNumThreads().size() > 3)
return transformOp.emitSilenceableError()
<< "scf.foreach_thread with rank > 3 does not lower to gpu.block_id";
// Step 0. Outline the compute workload region and set up the workload
// operands.
FailureOr<SmallVector<OpFoldResult>> potentialGridDim =
foreachThreadOp.getPermutedNumThreads(rewriter);
if (failed(potentialGridDim) ||
llvm::any_of(*potentialGridDim, [](OpFoldResult ofr) {
return !getConstantIntValue(ofr).has_value();
})) {
return transformOp.emitSilenceableError() << "unsupported dynamic gridDim";
}
for (OpFoldResult ofr : *potentialGridDim)
gridDims.push_back(getConstantIntValue(ofr).value());
SmallVector<Value> blockOps;
blockIdGenerator(rewriter, foreachThreadOp, blockOps);
// Step 1. Move the body of foreachThreadOp.
// Erase the terminator first, it will not be used since we are on buffers.
rewriter.eraseOp(foreachThreadOp.getTerminator());
Block *targetBlock = foreachThreadOp->getBlock();
Block::iterator insertionPoint = Block::iterator(foreachThreadOp);
Block &sourceBlock = foreachThreadOp.getRegion().front();
targetBlock->getOperations().splice(insertionPoint,
sourceBlock.getOperations());
// Step 2. RAUW thread indices to thread ops.
SmallVector<Value> threadIndices =
*foreachThreadOp.getPermutedThreadIndices();
assert(blockOps.size() == 3 && "3 block id ops are required");
for (auto [blockIdx, blockOp] : llvm::zip(threadIndices, blockOps)) {
Value val = blockIdx;
Value blkOp = blockOp;
if (!val)
continue;
for (Operation *user : llvm::make_early_inc_range(val.getUsers()))
user->replaceUsesOfWith(val, blkOp);
}
// Step 3. Erase old op.
rewriter.eraseOp(foreachThreadOp);
return DiagnosedSilenceableFailure::success();
}
DiagnosedSilenceableFailure mlir::transform::gpu::findTopLevelForeachThreadOp(
Operation *target, scf::ForeachThreadOp &topLevelForeachThreadOp,
TransformOpInterface transformOp) {
auto walkResult = target->walk([&](scf::ForeachThreadOp foreachThreadOp) {
if (foreachThreadOp->getParentOfType<scf::ForeachThreadOp>())
return WalkResult::advance();
if (topLevelForeachThreadOp)
// TODO: Handle multiple foreach if there is no dependences between them
return WalkResult::interrupt();
topLevelForeachThreadOp = foreachThreadOp;
return WalkResult::advance();
});
if (walkResult.wasInterrupted())
return transformOp.emitSilenceableError()
<< "could not find a unique topLevel scf.foreach_thread";
return DiagnosedSilenceableFailure::success();
}
/// This is a helper that is only used in mapForeachToBlocksImp. It generates
/// the GPU dialect's block_id ops.
static void generateGpuBlockIds(RewriterBase &rewriter,
scf::ForeachThreadOp foreachOp,
SmallVectorImpl<Value> &blockOps) {
Location loc = foreachOp->getLoc();
OpBuilder::InsertionGuard guard(rewriter);
rewriter.setInsertionPoint(foreachOp);
IndexType indexType = rewriter.getIndexType();
SmallVector<Dimension> gpuDims{Dimension::x, Dimension::y, Dimension::z};
for (int64_t idx : llvm::seq<int64_t>(0, gpuDims.size())) {
blockOps.push_back(
rewriter.create<BlockIdOp>(loc, indexType, gpuDims[idx]));
}
}
DiagnosedSilenceableFailure
transform::MapForeachToBlocks::applyToOne(Operation *target,
SmallVectorImpl<Operation *> &results,
transform::TransformState &state) {
LaunchOp gpuLaunch = dyn_cast<LaunchOp>(target);
SimpleRewriter rewriter(getContext());
auto transformOp = cast<TransformOpInterface>(getOperation());
if (!getGenerateGpuLaunch() && !gpuLaunch) {
results.assign({target});
DiagnosedSilenceableFailure diag =
emitSilenceableError()
<< "Given target is not gpu.launch, set `generate_gpu_launch` "
"attribute";
diag.attachNote(target->getLoc()) << "when applied to this payload op";
return diag;
}
scf::ForeachThreadOp topLevelForeachThreadOp;
DiagnosedSilenceableFailure diag =
mlir::transform::gpu::findTopLevelForeachThreadOp(
target, topLevelForeachThreadOp, transformOp);
if (!diag.succeeded()) {
results.assign({target});
diag.attachNote(target->getLoc()) << "when applied to this payload op";
return diag;
}
OpBuilder::InsertionGuard guard(rewriter);
rewriter.setInsertionPoint(topLevelForeachThreadOp);
// Generate gpu launch here and move the foreach_thread inside
if (getGenerateGpuLaunch()) {
DiagnosedSilenceableFailure diag =
createGpuLaunch(rewriter, target->getLoc(), transformOp, gpuLaunch);
if (!diag.succeeded()) {
results.assign({target});
return diag;
}
rewriter.setInsertionPointToStart(&gpuLaunch.getBody().front());
Operation *newForeachThreadOp = rewriter.clone(*topLevelForeachThreadOp);
rewriter.eraseOp(topLevelForeachThreadOp);
topLevelForeachThreadOp = cast<scf::ForeachThreadOp>(newForeachThreadOp);
}
SmallVector<int64_t> gridDim = extractFromI64ArrayAttr(getGridDim());
diag = mlir::transform::gpu::mapForeachToBlocksImp(
rewriter, topLevelForeachThreadOp, generateGpuBlockIds, gridDim,
transformOp);
if (diag.succeeded()) {
diag = alterGpuLaunch(rewriter, gpuLaunch,
cast<TransformOpInterface>(getOperation()),
gridDim[0], gridDim[1], gridDim[2]);
}
results.assign({gpuLaunch});
return diag;
}
//===----------------------------------------------------------------------===//
// MapNestedForeachToThreads
//===----------------------------------------------------------------------===//
/// Searches `scf.foreach_thread` ops nested under `target` and maps each such
/// op to GPU threads. Mapping is one-to-one and the induction variables of
/// `scf.foreach_thread` are rewritten to gpu.thread_id according to the
/// thread_dim_mapping attribute. Sibling `scf.foreach_thread` are supported in
/// which case, the union of the number of threads is computed and may result
/// in predication. Dynamic `scf.foreach_thread` trip counts are currently
/// not supported. Dynamic block dim sizes are currently not supported.
static DiagnosedSilenceableFailure rewriteOneForeachThreadToGpuThreads(
RewriterBase &rewriter, scf::ForeachThreadOp foreachThreadOp,
const SmallVectorImpl<int64_t> &globalBlockDims, bool syncAfterDistribute,
llvm::Optional<TransformOpInterface> transformOp) {
auto failureHelper =
[&](const Twine &message) -> DiagnosedSilenceableFailure {
if (transformOp.has_value()) {
return transformOp->emitSilenceableError() << message;
}
foreachThreadOp->emitError() << message;
return DiagnosedSilenceableFailure::definiteFailure();
};
if (foreachThreadOp.getNumResults() > 0)
return failureHelper(
"only bufferized scf.foreach_thread lowers to gpu.thread_id");
if (foreachThreadOp.getNumThreads().size() > 3)
return failureHelper(
"scf.foreach_thread with rank > 3 does not lower to gpu.thread_id");
auto potentialBlockDim = foreachThreadOp.getPermutedNumThreads(rewriter);
if (failed(potentialBlockDim) ||
llvm::any_of(*potentialBlockDim, [](OpFoldResult ofr) {
return !getConstantIntValue(ofr).has_value();
})) {
return failureHelper("unsupported dynamic blockdim size");
}
SmallVector<int64_t> blockDim =
llvm::to_vector(llvm::map_range(*potentialBlockDim, [](OpFoldResult ofr) {
return getConstantIntValue(ofr).value();
}));
// Step 1. Create the gpu.thread ops
Location loc = foreachThreadOp.getLoc();
IndexType indexType = rewriter.getIndexType();
SmallVector<Dimension> gpuDims{Dimension::x, Dimension::y, Dimension::z};
SmallVector<Value> threadOps;
for (int64_t idx : llvm::seq<int64_t>(0, blockDim.size())) {
threadOps.push_back(
rewriter.create<ThreadIdOp>(loc, indexType, gpuDims[idx]));
}
// Step 2. Maybe create conditionals to predicate the region.
Value predicate;
for (auto [threadId, blockDim, globalBlockDim] :
llvm::zip(threadOps, blockDim, globalBlockDims)) {
if (blockDim > globalBlockDim) {
return failureHelper(
"The GPU threads are fewer than the loop trip counts. "
"Try to tile scf.foreach_thread before mapping.");
}
if (blockDim == globalBlockDim)
continue;
Value blockIdx = rewriter.create<arith::ConstantIndexOp>(loc, blockDim);
Value tmpPredicate = rewriter.create<arith::CmpIOp>(
loc, arith::CmpIPredicate::ult, threadId, blockIdx);
predicate =
predicate ? rewriter.create<arith::AndIOp>(loc, predicate, tmpPredicate)
: tmpPredicate;
}
// Step 3. Move the body of foreachThreadOp.
// Erase the terminator first, it will not be used.
rewriter.eraseOp(foreachThreadOp.getTerminator());
Block *targetBlock;
Block::iterator insertionPoint;
if (predicate) {
// Step 3.a. If predicated, move at the beginning.
auto ifOp =
rewriter.create<scf::IfOp>(loc, predicate, /*withElseRegion=*/false);
targetBlock = ifOp.thenBlock();
insertionPoint = ifOp.thenBlock()->begin();
} else {
// Step 3.a. Otherwise, move inline just before foreachThreadOp.
targetBlock = foreachThreadOp->getBlock();
insertionPoint = Block::iterator(foreachThreadOp);
}
Block &sourceBlock = foreachThreadOp.getRegion().front();
targetBlock->getOperations().splice(insertionPoint,
sourceBlock.getOperations());
// Step 4. RAUW thread indices to thread ops.
SmallVector<Value> threadIndices =
*foreachThreadOp.getPermutedThreadIndices();
for (auto [threadIdx, threadOp] : llvm::zip(threadIndices, threadOps)) {
Value val = threadIdx;
Value op = threadOp;
if (!val)
continue;
for (Operation *user : llvm::make_early_inc_range(val.getUsers())) {
user->replaceUsesOfWith(val, op);
}
}
// Step 5. syncthreads.
// TODO: Need warpsync
if (syncAfterDistribute)
rewriter.create<BarrierOp>(loc);
// Step 6. Erase old op.
rewriter.eraseOp(foreachThreadOp);
return DiagnosedSilenceableFailure::success();
}
DiagnosedSilenceableFailure mlir::transform::gpu::mapNestedForeachToThreadsImp(
RewriterBase &rewriter, Operation *target,
const SmallVectorImpl<int64_t> &blockDim, bool syncAfterDistribute,
llvm::Optional<TransformOpInterface> transformOp) {
DiagnosedSilenceableFailure diag = DiagnosedSilenceableFailure::success();
target->walk([&](scf::ForeachThreadOp foreachThreadOp) {
rewriter.setInsertionPoint(foreachThreadOp);
diag = rewriteOneForeachThreadToGpuThreads(
rewriter, foreachThreadOp, blockDim, syncAfterDistribute, transformOp);
return diag.succeeded() ? WalkResult::advance() : WalkResult::interrupt();
});
return diag;
}
DiagnosedSilenceableFailure transform::MapNestedForeachToThreads::applyToOne(
::mlir::Operation *target,
::llvm::SmallVectorImpl<::mlir::Operation *> &results,
::mlir::transform::TransformState &state) {
LaunchOp gpuLaunch = dyn_cast<LaunchOp>(target);
auto transformOp = cast<TransformOpInterface>(getOperation());
if (!gpuLaunch) {
results.assign({target});
return emitSilenceableError() << "Given target is not gpu.launch";
}
SmallVector<int64_t> blockDim = extractFromI64ArrayAttr(getBlockDim());
blockDim.resize(/*size=*/3, /*value=*/1);
DiagnosedSilenceableFailure diag =
checkGpuLimits(transformOp, llvm::None, llvm::None, llvm::None,
blockDim[0], blockDim[1], blockDim[2]);
if (diag.isSilenceableFailure()) {
results.assign({target});
diag.attachNote(getLoc()) << getBlockDimAttrName() << " is very large";
return diag;
}
SimpleRewriter rewriter(getContext());
rewriter.setInsertionPoint(target);
diag = mlir::transform::gpu::mapNestedForeachToThreadsImp(
rewriter, target, blockDim, getSyncAfterDistribute(), llvm::None);
if (diag.succeeded()) {
diag =
alterGpuLaunch(rewriter, gpuLaunch, transformOp, llvm::None, llvm::None,
llvm::None, blockDim[0], blockDim[1], blockDim[2]);
}
results.assign({gpuLaunch});
return diag;
}
//===----------------------------------------------------------------------===//
// Transform op registration
//===----------------------------------------------------------------------===//
namespace {
/// Registers new ops and declares PDL as dependent dialect since the
/// additional ops are using PDL types for operands and results.
class GPUTransformDialectExtension
: public transform::TransformDialectExtension<
GPUTransformDialectExtension> {
public:
GPUTransformDialectExtension() {
declareDependentDialect<pdl::PDLDialect>();
declareGeneratedDialect<scf::SCFDialect>();
declareGeneratedDialect<arith::ArithDialect>();
declareGeneratedDialect<GPUDialect>();
registerTransformOps<
#define GET_OP_LIST
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.cpp.inc"
>();
}
};
} // namespace
#define GET_OP_CLASSES
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.cpp.inc"
void mlir::gpu::registerTransformDialectExtension(DialectRegistry &registry) {
registry.addExtensions<GPUTransformDialectExtension>();
}


@@ -1167,392 +1167,6 @@ void transform::TileOp::getEffects(
modifiesPayload(effects);
}
//===----------------------------------------------------------------------===//
// MapNestedForeachThreadToGpuThreads
//===----------------------------------------------------------------------===//
/// Searches `scf.foreach_thread` ops nested under `target` and maps each such
/// op to GPU threads. Mapping is one-to-one and the induction variables of
/// `scf.foreach_thread` are rewritten to gpu.thread_id according to the
/// thread_dim_apping attribute. Sibling `scf.foreach_thread` are supported in
/// which case, the union of the number of threads is computed and may result in
/// predication. Dynamic, `scf.foreach_thread` trip counts are currently not
/// supported. Dynamic block dim sizes are currently not supported.
static FailureOr<SmallVector<OpFoldResult>> rewriteOneForeachThreadToGpuThreads(
RewriterBase &rewriter, scf::ForeachThreadOp foreachThreadOp,
const SmallVector<int64_t> &globalBlockDims, bool syncAfterDistribute) {
if (foreachThreadOp.getNumResults() > 0)
return foreachThreadOp->emitError(
"only bufferized scf.foreach_thread lowers to gpu.thread");
if (foreachThreadOp.getNumThreads().size() > 3)
return foreachThreadOp->emitError(
"scf.foreach_thread with rank > 3 does not lower to gpu.thread");
auto potentialBlockDim = foreachThreadOp.getPermutedNumThreads(rewriter);
if (failed(potentialBlockDim) ||
llvm::any_of(*potentialBlockDim, [](OpFoldResult ofr) {
return !getConstantIntValue(ofr).has_value();
}))
return foreachThreadOp->emitError("unsupported dynamic blockdim size");
SmallVector<int64_t> blockDim =
llvm::to_vector(llvm::map_range(*potentialBlockDim, [](OpFoldResult ofr) {
return getConstantIntValue(ofr).value();
}));
// Step 1. Create the gpu.thread ops
Location loc = foreachThreadOp.getLoc();
IndexType indexType = rewriter.getIndexType();
SmallVector<gpu::Dimension> gpuDims{gpu::Dimension::x, gpu::Dimension::y,
gpu::Dimension::z};
SmallVector<Value> threadOps;
for (int64_t idx : llvm::seq<int64_t>(0, blockDim.size())) {
threadOps.push_back(
rewriter.create<gpu::ThreadIdOp>(loc, indexType, gpuDims[idx]));
}
// Step 2. Maybe create conditionals to predicate the region.
Value predicate;
for (auto [threadId, blockDim, globalBlockDim] :
llvm::zip(threadOps, blockDim, globalBlockDims)) {
if (blockDim > globalBlockDim) {
return foreachThreadOp.emitOpError("blockDim size overflow: ")
<< blockDim << " > " << globalBlockDim;
}
if (blockDim == globalBlockDim)
continue;
Value tmpPredicate = rewriter.create<arith::CmpIOp>(
loc, arith::CmpIPredicate::ult, threadId,
rewriter.create<arith::ConstantIndexOp>(loc, blockDim));
predicate =
predicate ? rewriter.create<arith::AndIOp>(loc, predicate, tmpPredicate)
: tmpPredicate;
}
// Step 3. Move the body of foreachThreadOp.
// Erase the terminator first, it will not be used.
rewriter.eraseOp(foreachThreadOp.getTerminator());
Block *targetBlock;
Block::iterator insertionPoint;
if (predicate) {
// Step 3.a. If predicated, move at the beginning.
auto ifOp =
rewriter.create<scf::IfOp>(loc, predicate, /*withElseRegion=*/false);
targetBlock = ifOp.thenBlock();
insertionPoint = ifOp.thenBlock()->begin();
} else {
// Step 3.a. Otherwise, move inline just before foreachThreadOp.
targetBlock = foreachThreadOp->getBlock();
insertionPoint = Block::iterator(foreachThreadOp);
}
Block &sourceBlock = foreachThreadOp.getRegion().front();
targetBlock->getOperations().splice(insertionPoint,
sourceBlock.getOperations());
// Step 4. RAUW thread indices to thread ops.
SmallVector<Value> threadIndices =
*foreachThreadOp.getPermutedThreadIndices();
for (auto it : llvm::zip(threadIndices, threadOps)) {
Value val = std::get<0>(it);
if (!val)
continue;
for (Operation *user : llvm::make_early_inc_range(val.getUsers())) {
rewriter.updateRootInPlace(
user, [&]() { user->replaceUsesOfWith(val, std::get<1>(it)); });
}
}
// Step 5. syncthreads.
// TODO: Need warpsync
if (syncAfterDistribute)
rewriter.create<gpu::BarrierOp>(loc);
// Step 6. Erase old op.
rewriter.eraseOp(foreachThreadOp);
return *potentialBlockDim;
}
mlir::WalkResult mlir::linalg::rewriteMapNestedForeachThreadToGpuThreads(
RewriterBase &rewriter, Operation *target,
const SmallVector<int64_t> &blockDim, bool syncAfterDistribute) {
auto walkResult = target->walk([&](scf::ForeachThreadOp foreachThreadOp) {
rewriter.setInsertionPoint(foreachThreadOp);
if (failed(rewriteOneForeachThreadToGpuThreads(
rewriter, foreachThreadOp, blockDim, syncAfterDistribute)))
return WalkResult::interrupt();
return WalkResult::advance();
});
return walkResult;
}
static LogicalResult
checkGpuLimits(Optional<int64_t> gridDimX, Optional<int64_t> gridDimY,
Optional<int64_t> gridDimZ, Optional<int64_t> blockDimX,
Optional<int64_t> blockDimY, Optional<int64_t> blockDimZ) {
// TODO The limits should live in the gpu dialect, but it's not like that
// right now. Read them in the common gpu dialect
if ((blockDimX.value_or(1) * blockDimY.value_or(1) * blockDimZ.value_or(1)) >
1024 ||
gridDimY.value_or(1) > 65535 || gridDimZ.value_or(1) > 65535 ||
gridDimX.value_or(1) > 2147483647)
return failure();
return success();
}
/// Alter grid or block dimensions of the given kernel
static LogicalResult alterGpuLaunch(SimpleRewriter &rewriter,
gpu::LaunchOp gpuLaunch,
Optional<int64_t> gridDimX = llvm::None,
Optional<int64_t> gridDimY = llvm::None,
Optional<int64_t> gridDimZ = llvm::None,
Optional<int64_t> blockDimX = llvm::None,
Optional<int64_t> blockDimY = llvm::None,
Optional<int64_t> blockDimZ = llvm::None) {
if (failed(checkGpuLimits(gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY,
blockDimZ))) {
gpuLaunch->emitError(
"Requested kernel thread configuration is larger than the limits");
return failure();
}
gpu::KernelDim3 currentBlockdim = gpuLaunch.getBlockSizeOperandValues();
OpBuilder::InsertionGuard guard(rewriter);
rewriter.setInsertionPointAfterValue(currentBlockdim.x);
auto createConstValue = [&](int dim) {
return rewriter.create<arith::ConstantIndexOp>(currentBlockdim.x.getLoc(),
dim);
};
if (gridDimX.has_value())
gpuLaunch.getGridSizeXMutable().assign(createConstValue(gridDimX.value()));
if (gridDimY.has_value())
gpuLaunch.getGridSizeYMutable().assign(createConstValue(gridDimY.value()));
if (gridDimZ.has_value())
gpuLaunch.getGridSizeZMutable().assign(createConstValue(gridDimZ.value()));
if (blockDimX.has_value())
gpuLaunch.getBlockSizeXMutable().assign(
createConstValue(blockDimX.value()));
if (blockDimY.has_value())
gpuLaunch.getBlockSizeYMutable().assign(
createConstValue(blockDimY.value()));
if (blockDimZ.has_value())
gpuLaunch.getBlockSizeZMutable().assign(
createConstValue(blockDimZ.value()));
return success();
}
DiagnosedSilenceableFailure
transform::MapNestedForeachThreadToGpuThreads::applyToOne(
Operation *target, SmallVectorImpl<Operation *> &results,
transform::TransformState &state) {
gpu::LaunchOp gpuLaunch = dyn_cast<gpu::LaunchOp>(target);
if (!gpuLaunch) {
target->emitError("Given target is not gpu.launch");
return DiagnosedSilenceableFailure::definiteFailure();
}
SmallVector<int64_t> blockDim = extractFromI64ArrayAttr(getBlockDim());
blockDim.resize(/*size=*/3, /*value=*/1);
SimpleRewriter rewriter(getContext());
rewriter.setInsertionPoint(target);
auto walkResult = mlir::linalg::rewriteMapNestedForeachThreadToGpuThreads(
rewriter, target, blockDim, getSyncAfterDistribute());
if (walkResult.wasInterrupted())
return DiagnosedSilenceableFailure(reportUnknownTransformError(target));
LogicalResult result =
alterGpuLaunch(rewriter, gpuLaunch, llvm::None, llvm::None, llvm::None,
blockDim[0], blockDim[1], blockDim[2]);
if (failed(result))
return DiagnosedSilenceableFailure::definiteFailure();
results.assign({target});
return DiagnosedSilenceableFailure(success());
}
//===----------------------------------------------------------------------===//
// MapNestedForeachThreadToGpuBlocks
//===----------------------------------------------------------------------===//
LogicalResult mlir::linalg::rewriteTopLevelForeachThreadToGpuBlocks(
RewriterBase &rewriter, scf::ForeachThreadOp foreachThreadOp,
function_ref<void(RewriterBase &, scf::ForeachThreadOp,
SmallVector<Value> &)>
blockIdGenerator,
SmallVector<int64_t> &gridDims) {
if (foreachThreadOp.getNumResults() > 0)
return foreachThreadOp->emitError(
"only bufferized scf.foreach_thread lowers to gpu.block_id");
if (foreachThreadOp.getNumThreads().size() > 3)
return foreachThreadOp->emitError(
"scf.foreach_thread with rank > 3 does not lower to gpu.block_id");
// Step 0. Outline the compute workload region and set up the workload
// operands.
auto potentialGridDim = foreachThreadOp.getPermutedNumThreads(rewriter);
if (failed(potentialGridDim) ||
llvm::any_of(*potentialGridDim, [](OpFoldResult ofr) {
return !getConstantIntValue(ofr).has_value();
}))
return foreachThreadOp->emitError("unsupported dynamic gridDim");
for (OpFoldResult ofr : *potentialGridDim)
gridDims.push_back(getConstantIntValue(ofr).value());
SmallVector<Value> blockOps;
blockIdGenerator(rewriter, foreachThreadOp, blockOps);
// Step 1. Move the body of foreachThreadOp.
// Erase the terminator first, it will not be used since we are on buffers.
rewriter.eraseOp(foreachThreadOp.getTerminator());
Block *targetBlock = foreachThreadOp->getBlock();
Block::iterator insertionPoint = Block::iterator(foreachThreadOp);
Block &sourceBlock = foreachThreadOp.getRegion().front();
targetBlock->getOperations().splice(insertionPoint,
sourceBlock.getOperations());
// Step 2. RAUW thread indices to thread ops.
SmallVector<Value> threadIndices =
*foreachThreadOp.getPermutedThreadIndices();
assert(blockOps.size() == 3 && "3 block id ops are required");
for (auto it : llvm::zip(threadIndices, blockOps)) {
Value val = std::get<0>(it);
if (!val)
continue;
for (Operation *user : llvm::make_early_inc_range(val.getUsers())) {
rewriter.updateRootInPlace(
user, [&]() { user->replaceUsesOfWith(val, std::get<1>(it)); });
}
}
// Step 3. Erase old op.
rewriter.eraseOp(foreachThreadOp);
return success();
}
FailureOr<scf::ForeachThreadOp>
mlir::linalg::findTopLevelForeachThreadOp(Operation *target) {
scf::ForeachThreadOp topLevelForeachThreadOp;
auto walkResult = target->walk([&](scf::ForeachThreadOp foreachThreadOp) {
if (foreachThreadOp->getParentOfType<scf::ForeachThreadOp>())
return WalkResult::advance();
if (topLevelForeachThreadOp)
// TODO Handle multiple foreach if there is no dependences between them
return WalkResult::interrupt();
topLevelForeachThreadOp = foreachThreadOp;
return WalkResult::advance();
});
if (walkResult.wasInterrupted())
return target->emitError(
"could not find a unique topLevel scf.foreach_thread");
return topLevelForeachThreadOp;
}
/// Create gpuLauncOp with given kernel configurations
static FailureOr<gpu::LaunchOp>
createGpuLaunch(RewriterBase &rewriter, Location loc,
Optional<int64_t> gridDimX = llvm::None,
Optional<int64_t> gridDimY = llvm::None,
Optional<int64_t> gridDimZ = llvm::None,
Optional<int64_t> blockDimX = llvm::None,
Optional<int64_t> blockDimY = llvm::None,
Optional<int64_t> blockDimZ = llvm::None) {
if (failed(checkGpuLimits(gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY,
blockDimZ)))
return failure();
auto createConstant = [&](int dim) {
return rewriter.create<arith::ConstantIndexOp>(loc, dim);
};
Value one = createConstant(1);
Value gridSizeX =
gridDimX.has_value() ? createConstant(gridDimX.value()) : one;
Value gridSizeY =
gridDimY.has_value() ? createConstant(gridDimY.value()) : one;
Value gridSizeZ =
gridDimZ.has_value() ? createConstant(gridDimZ.value()) : one;
Value blockSizeX =
blockDimX.has_value() ? createConstant(blockDimX.value()) : one;
Value blockSizeY =
blockDimY.has_value() ? createConstant(blockDimY.value()) : one;
Value blockSizeZ =
blockDimZ.has_value() ? createConstant(blockDimZ.value()) : one;
auto launchOp = rewriter.create<gpu::LaunchOp>(
loc, gridSizeX, gridSizeY, gridSizeZ, blockSizeX, blockSizeY, blockSizeZ);
rewriter.setInsertionPointToEnd(&launchOp.getBody().front());
rewriter.create<gpu::TerminatorOp>(loc);
return launchOp;
}
/// This is an helper that is only used in
/// rewriteTopLevelForeachThreadToGpuBlocks. It generates GPU dialects block_id
static void generateGpuBlockIds(RewriterBase &rewriter,
scf::ForeachThreadOp foreachOp,
SmallVector<Value> &blockOps) {
Location loc = foreachOp->getLoc();
OpBuilder::InsertionGuard guard(rewriter);
rewriter.setInsertionPoint(foreachOp);
IndexType indexType = rewriter.getIndexType();
SmallVector<gpu::Dimension> gpuDims{gpu::Dimension::x, gpu::Dimension::y,
gpu::Dimension::z};
for (int64_t idx : llvm::seq<int64_t>(0, gpuDims.size())) {
blockOps.push_back(
rewriter.create<gpu::BlockIdOp>(loc, indexType, gpuDims[idx]));
}
}
DiagnosedSilenceableFailure
transform::MapNestedForeachThreadToGpuBlocks::applyToOne(
Operation *target, SmallVectorImpl<Operation *> &results,
transform::TransformState &state) {
gpu::LaunchOp gpuLaunch = dyn_cast<gpu::LaunchOp>(target);
SimpleRewriter rewriter(getContext());
if (!getGenerateGpuLaunch() && !gpuLaunch) {
target->emitError("Given target is not gpu.launch, set "
"`generate_gpu_launch` attribute");
return DiagnosedSilenceableFailure::definiteFailure();
}
auto res = mlir::linalg::findTopLevelForeachThreadOp(target);
if (failed(res))
return DiagnosedSilenceableFailure(reportUnknownTransformError(target));
scf::ForeachThreadOp topLevelForeachThreadOp = *res;
OpBuilder::InsertionGuard guard(rewriter);
rewriter.setInsertionPoint(topLevelForeachThreadOp);
// Generate gpu launch here and move the foreach_thread inside
if (getGenerateGpuLaunch()) {
FailureOr<gpu::LaunchOp> maybeGpuLaunch =
createGpuLaunch(rewriter, target->getLoc());
if (failed(maybeGpuLaunch))
return DiagnosedSilenceableFailure(reportUnknownTransformError(target));
gpuLaunch = *maybeGpuLaunch;
rewriter.setInsertionPointToStart(&gpuLaunch.getBody().front());
Operation *newForeachThreadOp = rewriter.clone(*topLevelForeachThreadOp);
rewriter.eraseOp(topLevelForeachThreadOp);
topLevelForeachThreadOp =
dyn_cast<scf::ForeachThreadOp>(newForeachThreadOp);
}
SmallVector<int64_t> gridDim = extractFromI64ArrayAttr(getGridDim());
if (failed(mlir::linalg::rewriteTopLevelForeachThreadToGpuBlocks(
rewriter, topLevelForeachThreadOp, generateGpuBlockIds, gridDim)))
return DiagnosedSilenceableFailure(reportUnknownTransformError(target));
if (failed(alterGpuLaunch(rewriter, gpuLaunch, gridDim[0], gridDim[1],
gridDim[2])))
return DiagnosedSilenceableFailure::definiteFailure();
results.assign({gpuLaunch});
return DiagnosedSilenceableFailure(success());
}
//===----------------------------------------------------------------------===//
// TileToForeachThreadOp
//===----------------------------------------------------------------------===//


@@ -35,7 +35,7 @@ transform.with_pdl_patterns {
transform.sequence %arg0 failures(propagate) {
^bb1(%arg1: !pdl.operation):
%funcop = transform.structured.match ops{["gpu.launch"]} in %arg0
transform.structured.map_nested_foreach_thread_to_gpu_blocks %funcop { blockDim = [12, 9, 1]}
transform.gpu.map_foreach_to_blocks %funcop { blockDim = [12, 9, 1]}
}
}
@@ -92,7 +92,7 @@ transform.with_pdl_patterns {
transform.sequence %arg0 failures(propagate) {
^bb1(%arg1: !pdl.operation):
%funcop = transform.structured.match ops{["gpu.launch"]} in %arg0
transform.structured.map_nested_foreach_thread_to_gpu_threads %funcop { blockDim = [12, 9, 1] }
transform.gpu.map_nested_foreach_to_threads %funcop { blockDim = [12, 9, 1] }
}
}
@@ -134,8 +134,8 @@ transform.with_pdl_patterns {
transform.sequence %arg0 failures(propagate) {
^bb1(%arg1: !pdl.operation):
%funcop = transform.structured.match ops{["func.func"]} in %arg0
%gpuLaunch = transform.structured.map_nested_foreach_thread_to_gpu_blocks %funcop { generate_gpu_launch }
transform.structured.map_nested_foreach_thread_to_gpu_threads %gpuLaunch { blockDim = [32, 4, 1] }
%gpuLaunch = transform.gpu.map_foreach_to_blocks %funcop { generate_gpu_launch }
transform.gpu.map_nested_foreach_to_threads %gpuLaunch { blockDim = [32, 4, 1] }
}
}
@@ -171,6 +171,6 @@ transform.with_pdl_patterns {
transform.sequence %arg0 failures(propagate) {
^bb1(%arg1: !pdl.operation):
%funcop = transform.structured.match ops{["gpu.launch"]} in %arg0
transform.structured.map_nested_foreach_thread_to_gpu_threads %funcop { blockDim = [12, 9, 1], syncAfterDistribute = false }
transform.gpu.map_nested_foreach_to_threads %funcop { blockDim = [12, 9, 1], syncAfterDistribute = false }
}
}


@@ -3761,6 +3761,65 @@ cc_library(
]),
)
td_library(
name = "GPUTransformOpsTdFiles",
srcs = [
"include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td",
],
includes = ["include"],
deps = [
":PDLDialectTdFiles",
":TransformDialectTdFiles",
],
)
gentbl_cc_library(
name = "GPUTransformOpsIncGen",
strip_include_prefix = "include",
tbl_outs = [
(
["-gen-op-decls"],
"include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h.inc",
),
(
["-gen-op-defs"],
"include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.cpp.inc",
),
],
tblgen = ":mlir-tblgen",
td_file = "include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td",
deps = [
":GPUTransformOpsTdFiles",
],
)
cc_library(
name = "GPUTransformOps",
srcs = [
"lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp",
],
hdrs = [
"include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h",
],
includes = ["include"],
deps = [
":ArithDialect",
":AsmParser",
":ControlFlowDialect",
":GPUDialect",
":GPUTransformOpsIncGen",
":GPUTransforms",
":IR",
":PDLDialect",
":Parser",
":SCFDialect",
":SideEffectInterfaces",
":TransformDialect",
":TransformUtils",
"//llvm:Support",
],
)
td_library(
name = "LLVMOpsTdFiles",
srcs = [
@@ -6401,6 +6460,7 @@ cc_library(
":GPUToROCDLTransforms",
":GPUToSPIRV",
":GPUToVulkanTransforms",
":GPUTransformOps",
":GPUTransforms",
":IR",
":LLVMDialect",