forked from OSchip/llvm-project
Add 'gpu.terminator' operation.
Summary: The 'gpu.terminator' operation is used as the terminator for the regions of gpu.launch. This is to disambiguate them from the return operation on 'gpu.func' functions. This is a breaking change and users of the gpu dialect will need to adapt their code when producing 'gpu.launch' operations. Reviewers: nicolasvasilache Subscribers: mehdi_amini, rriddle, jpienaar, burmako, shauheen, antiagainst, csigg, arpith-jacob, mgester, lucyrfox, liufengdb, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D73620
This commit is contained in:
parent
2930dab315
commit
2692751895
|
@ -472,8 +472,24 @@ def GPU_LaunchOp : GPU_Op<"launch", [IsolatedFromAbove]>,
|
|||
let verifier = [{ return ::verify(*this); }];
|
||||
}
|
||||
|
||||
def GPU_ReturnOp : GPU_Op<"return", [Terminator]>, Arguments<(ins)>,
|
||||
Results<(outs)> {
|
||||
def GPU_ReturnOp : GPU_Op<"return", [HasParent<"GPUFuncOp">, Terminator]>,
|
||||
Arguments<(ins Variadic<AnyType>:$operands)>, Results<(outs)> {
|
||||
let summary = "Terminator for GPU functions.";
|
||||
let description = [{
|
||||
A terminator operation for regions that appear in the body of `gpu.func`
|
||||
functions. The operands to the `gpu.return` are the result values returned
|
||||
by an invocation of the `gpu.func`.
|
||||
}];
|
||||
|
||||
let builders = [OpBuilder<"Builder *builder, OperationState &result", " // empty">];
|
||||
|
||||
let parser = [{ return parseReturnOp(parser, result); }];
|
||||
let printer = [{ p << getOperationName(); }];
|
||||
let verifier = [{ return ::verify(*this); }];
|
||||
}
|
||||
|
||||
def GPU_TerminatorOp : GPU_Op<"terminator", [HasParent<"LaunchOp">, Terminator]>,
|
||||
Arguments<(ins)>, Results<(outs)> {
|
||||
let summary = "Terminator for GPU launch regions.";
|
||||
let description = [{
|
||||
A terminator operation for regions that appear in the body of `gpu.launch`
|
||||
|
|
|
@ -306,9 +306,9 @@ createLaunchBody(OpBuilder &builder, OpTy rootForOp, gpu::LaunchOp launchOp,
|
|||
unsigned numBlockDims, unsigned numThreadDims) {
|
||||
OpBuilder::InsertionGuard bodyInsertionGuard(builder);
|
||||
builder.setInsertionPointToEnd(&launchOp.body().front());
|
||||
auto returnOp = builder.create<gpu::ReturnOp>(launchOp.getLoc());
|
||||
auto terminatorOp = builder.create<gpu::TerminatorOp>(launchOp.getLoc());
|
||||
|
||||
rootForOp.getOperation()->moveBefore(returnOp);
|
||||
rootForOp.getOperation()->moveBefore(terminatorOp);
|
||||
SmallVector<Value, 3> workgroupID, numWorkGroups;
|
||||
packIdAndNumId(launchOp.getBlockIds(), launchOp.getGridSize(), numBlockDims,
|
||||
workgroupID, numWorkGroups);
|
||||
|
@ -435,7 +435,7 @@ void LoopToGpuConverter::createLaunch(OpTy rootForOp, OpTy innermostForOp,
|
|||
Location terminatorLoc = terminator.getLoc();
|
||||
terminator.erase();
|
||||
builder.setInsertionPointToEnd(innermostForOp.getBody());
|
||||
builder.create<gpu::ReturnOp>(terminatorLoc);
|
||||
builder.create<gpu::TerminatorOp>(terminatorLoc, llvm::None);
|
||||
launchOp.body().front().getOperations().splice(
|
||||
launchOp.body().front().begin(),
|
||||
innermostForOp.getBody()->getOperations());
|
||||
|
|
|
@ -270,18 +270,19 @@ static LogicalResult verify(LaunchOp op) {
|
|||
}
|
||||
|
||||
// Block terminators without successors are expected to exit the kernel region
|
||||
// and must be `gpu.launch`.
|
||||
// and must be `gpu.terminator`.
|
||||
for (Block &block : op.body()) {
|
||||
if (block.empty())
|
||||
continue;
|
||||
if (block.back().getNumSuccessors() != 0)
|
||||
continue;
|
||||
if (!isa<gpu::ReturnOp>(&block.back())) {
|
||||
if (!isa<gpu::TerminatorOp>(&block.back())) {
|
||||
return block.back()
|
||||
.emitError("expected 'gpu.terminator' or a terminator with "
|
||||
"successors")
|
||||
.attachNote(op.getLoc())
|
||||
<< "in '" << LaunchOp::getOperationName() << "' body region";
|
||||
.emitError()
|
||||
.append("expected '", gpu::TerminatorOp::getOperationName(),
|
||||
"' or a terminator with successors")
|
||||
.attachNote(op.getLoc())
|
||||
.append("in '", LaunchOp::getOperationName(), "' body region");
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -680,7 +681,7 @@ static ParseResult parseGPUFuncOp(OpAsmParser &parser, OperationState &result) {
|
|||
<< "gpu.func requires named arguments";
|
||||
|
||||
// Construct the function type. More types will be added to the region, but
|
||||
// not to the functiont type.
|
||||
// not to the function type.
|
||||
Builder &builder = parser.getBuilder();
|
||||
auto type = builder.getFunctionType(argTypes, resultTypes);
|
||||
result.addAttribute(GPUFuncOp::getTypeAttrName(), TypeAttr::get(type));
|
||||
|
@ -767,6 +768,10 @@ LogicalResult GPUFuncOp::verifyType() {
|
|||
if (!type.isa<FunctionType>())
|
||||
return emitOpError("requires '" + getTypeAttrName() +
|
||||
"' attribute of function type");
|
||||
|
||||
if (isKernel() && getType().getNumResults() != 0)
|
||||
return emitOpError() << "expected void return type for kernel function";
|
||||
|
||||
return success();
|
||||
}
|
||||
|
||||
|
@ -814,6 +819,45 @@ LogicalResult GPUFuncOp::verifyBody() {
|
|||
return success();
|
||||
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// ReturnOp
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
static ParseResult parseReturnOp(OpAsmParser &parser, OperationState &result) {
|
||||
llvm::SmallVector<OpAsmParser::OperandType, 4> operands;
|
||||
llvm::SmallVector<Type, 4> types;
|
||||
if (parser.parseOperandList(operands) ||
|
||||
parser.parseOptionalColonTypeList(types) ||
|
||||
parser.resolveOperands(operands, types, parser.getCurrentLocation(),
|
||||
result.operands))
|
||||
return failure();
|
||||
|
||||
return success();
|
||||
}
|
||||
|
||||
static LogicalResult verify(gpu::ReturnOp returnOp) {
|
||||
GPUFuncOp function = returnOp.getParentOfType<GPUFuncOp>();
|
||||
|
||||
FunctionType funType = function.getType();
|
||||
|
||||
if (funType.getNumResults() != returnOp.operands().size())
|
||||
return returnOp.emitOpError()
|
||||
.append("expected ", funType.getNumResults(), " result operands")
|
||||
.attachNote(function.getLoc())
|
||||
.append("return type declared here");
|
||||
|
||||
for (auto pair : llvm::enumerate(
|
||||
llvm::zip(function.getType().getResults(), returnOp.operands()))) {
|
||||
Type type;
|
||||
Value operand;
|
||||
std::tie(type, operand) = pair.value();
|
||||
if (type != operand.getType())
|
||||
return returnOp.emitOpError() << "unexpected type `" << operand.getType()
|
||||
<< "' for operand #" << pair.index();
|
||||
}
|
||||
return success();
|
||||
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// GPUModuleOp
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
|
|
@ -99,7 +99,7 @@ static gpu::LaunchFuncOp inlineBeneficiaryOps(gpu::GPUFuncOp kernelFunc,
|
|||
}
|
||||
|
||||
// Outline the `gpu.launch` operation body into a kernel function. Replace
|
||||
// `gpu.return` operations by `std.return` in the generated function.
|
||||
// `gpu.terminator` operations by `gpu.return` in the generated function.
|
||||
static gpu::GPUFuncOp outlineKernelFunc(gpu::LaunchOp launchOp) {
|
||||
Location loc = launchOp.getLoc();
|
||||
// Create a builder with no insertion point, insertion will happen separately
|
||||
|
@ -116,6 +116,12 @@ static gpu::GPUFuncOp outlineKernelFunc(gpu::LaunchOp launchOp) {
|
|||
builder.getUnitAttr());
|
||||
outlinedFunc.body().takeBody(launchOp.body());
|
||||
injectGpuIndexOperations(loc, outlinedFunc.body());
|
||||
outlinedFunc.walk([](gpu::TerminatorOp op) {
|
||||
OpBuilder replacer(op);
|
||||
replacer.create<gpu::ReturnOp>(op.getLoc());
|
||||
op.erase();
|
||||
});
|
||||
|
||||
return outlinedFunc;
|
||||
}
|
||||
|
||||
|
|
|
@ -23,7 +23,7 @@ func @foo(%arg0: memref<?xf32>, %arg1 : index) {
|
|||
// CHECK: %[[prod_j:.*]] = muli %{{.*}}, %{{.*}} : index
|
||||
// CHECK: addi %{{.*}}, %[[prod_j]] : index
|
||||
|
||||
// CHECK: gpu.return
|
||||
// CHECK: gpu.terminator
|
||||
}
|
||||
}
|
||||
return
|
||||
|
|
|
@ -73,8 +73,8 @@ func @step_1(%A : memref<?x?x?x?xf32>, %B : memref<?x?x?x?xf32>) {
|
|||
// CHECK-22-NEXT: store {{.*}}, %{{.*}}[%[[i]], %[[j]], %[[ii]], %[[jj]]] : memref<?x?x?x?xf32>
|
||||
store %0, %B[%i, %j, %ii, %jj] : memref<?x?x?x?xf32>
|
||||
|
||||
// CHECK-11: gpu.return
|
||||
// CHECK-22: gpu.return
|
||||
// CHECK-11: gpu.terminator
|
||||
// CHECK-22: gpu.terminator
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,7 +21,7 @@ func @propagate_constant(%arg1: memref<?xf32>) {
|
|||
|
||||
// CHECK: "bar"(%[[inner_arg]])
|
||||
"bar"(%y) : (memref<?xf32>) -> ()
|
||||
gpu.return
|
||||
gpu.terminator
|
||||
}
|
||||
return
|
||||
}
|
||||
|
|
|
@ -376,7 +376,7 @@ func @shuffle_unsupported_type(%arg0 : index, %arg1 : i32, %arg2 : i32) {
|
|||
// -----
|
||||
|
||||
module {
|
||||
module @gpu_funcs attributes {gpu.kernel_module} {
|
||||
gpu.module @gpu_funcs {
|
||||
// expected-error @+1 {{custom op 'gpu.func' gpu.func requires named arguments}}
|
||||
gpu.func @kernel_1(f32, f32) {
|
||||
^bb0(%arg0: f32):
|
||||
|
@ -428,3 +428,39 @@ module {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
module {
|
||||
module @gpu_funcs attributes {gpu.kernel_module} {
|
||||
// expected-error @+1 {{expected memory space 5 in attribution}}
|
||||
gpu.func @kernel() private(%0: memref<4xf32>) {
|
||||
gpu.return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
module {
|
||||
gpu.module @gpu_funcs {
|
||||
// expected-note @+1 {{return type declared here}}
|
||||
gpu.func @kernel() {
|
||||
%0 = constant 0 : index
|
||||
// expected-error @+1 {{'gpu.return' op expected 0 result operands}}
|
||||
gpu.return %0 : index
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
module {
|
||||
gpu.module @gpu_funcs {
|
||||
// expected-error @+1 {{'gpu.func' op expected void return type for kernel function}}
|
||||
gpu.func @kernel() -> index kernel {
|
||||
%0 = constant 0 : index
|
||||
gpu.return
|
||||
}
|
||||
}
|
||||
}
|
|
@ -7,8 +7,8 @@ module attributes {gpu.container_module} {
|
|||
// CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}})
|
||||
gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %sz, %grid_y = %sz, %grid_z = %sz)
|
||||
threads(%tx, %ty, %tz) in (%block_x = %sz, %block_y = %sz, %block_z = %sz) {
|
||||
// CHECK: gpu.return
|
||||
gpu.return
|
||||
// CHECK: gpu.terminator
|
||||
gpu.terminator
|
||||
}
|
||||
return
|
||||
}
|
||||
|
@ -19,8 +19,8 @@ module attributes {gpu.container_module} {
|
|||
gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %blk, %grid_y = %blk, %grid_z = %blk)
|
||||
threads(%tx, %ty, %tz) in (%block_x = %thrd, %block_y = %thrd, %block_z = %thrd)
|
||||
args(%kernel_arg0 = %float, %kernel_arg1 = %data) : f32, memref<?xf32, 1> {
|
||||
// CHECK: gpu.return
|
||||
gpu.return
|
||||
// CHECK: gpu.terminator
|
||||
gpu.terminator
|
||||
}
|
||||
return
|
||||
}
|
||||
|
@ -34,8 +34,8 @@ module attributes {gpu.container_module} {
|
|||
args(%kernel_arg0 = %float, %kernel_arg1 = %data) : f32, memref<?xf32, 1> {
|
||||
// CHECK: "use"(%{{.*}})
|
||||
"use"(%kernel_arg0): (f32) -> ()
|
||||
// CHECK: gpu.return
|
||||
gpu.return
|
||||
// CHECK: gpu.terminator
|
||||
gpu.terminator
|
||||
}
|
||||
return
|
||||
}
|
||||
|
@ -54,8 +54,8 @@ module attributes {gpu.container_module} {
|
|||
"use"(%val) : (index) -> ()
|
||||
}) : () -> ()
|
||||
}) : () -> ()
|
||||
// CHECK: gpu.return
|
||||
gpu.return
|
||||
// CHECK: gpu.terminator
|
||||
gpu.terminator
|
||||
}
|
||||
return
|
||||
}
|
||||
|
@ -118,11 +118,11 @@ module attributes {gpu.container_module} {
|
|||
}
|
||||
|
||||
module @gpu_funcs attributes {gpu.kernel_module} {
|
||||
// CHECK-LABEL: gpu.func @kernel_1({{.*}}: f32) -> f32
|
||||
// CHECK-LABEL: gpu.func @kernel_1({{.*}}: f32)
|
||||
// CHECK: workgroup
|
||||
// CHECK: private
|
||||
// CHECK: attributes
|
||||
gpu.func @kernel_1(%arg0: f32) -> f32
|
||||
gpu.func @kernel_1(%arg0: f32)
|
||||
workgroup(%arg1: memref<42xf32, 3>)
|
||||
private(%arg2: memref<2xf32, 5>, %arg3: memref<1xf32, 5>)
|
||||
kernel
|
||||
|
|
|
@ -31,7 +31,7 @@ func @launch() {
|
|||
"use"(%arg0): (f32) -> ()
|
||||
"some_op"(%bx, %block_x) : (index, index) -> ()
|
||||
%42 = load %arg1[%tx] : memref<?xf32, 1>
|
||||
gpu.return
|
||||
gpu.terminator
|
||||
}
|
||||
return
|
||||
}
|
||||
|
@ -68,14 +68,14 @@ func @multiple_launches() {
|
|||
%grid_z = %cst)
|
||||
threads(%tx, %ty, %tz) in (%block_x = %cst, %block_y = %cst,
|
||||
%block_z = %cst) {
|
||||
gpu.return
|
||||
gpu.terminator
|
||||
}
|
||||
// CHECK: "gpu.launch_func"(%[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]]) {kernel = "multiple_launches_kernel", kernel_module = @multiple_launches_kernel_0} : (index, index, index, index, index, index) -> ()
|
||||
gpu.launch blocks(%bx2, %by2, %bz2) in (%grid_x2 = %cst, %grid_y2 = %cst,
|
||||
%grid_z2 = %cst)
|
||||
threads(%tx2, %ty2, %tz2) in (%block_x2 = %cst, %block_y2 = %cst,
|
||||
%block_z2 = %cst) {
|
||||
gpu.return
|
||||
gpu.terminator
|
||||
}
|
||||
return
|
||||
}
|
||||
|
@ -99,7 +99,7 @@ func @extra_constants(%arg0 : memref<?xf32>) {
|
|||
%block_z = %cst)
|
||||
args(%kernel_arg0 = %cst2, %kernel_arg1 = %arg0, %kernel_arg2 = %cst3) : index, memref<?xf32>, index {
|
||||
"use"(%kernel_arg0, %kernel_arg1, %kernel_arg2) : (index, memref<?xf32>, index) -> ()
|
||||
gpu.return
|
||||
gpu.terminator
|
||||
}
|
||||
return
|
||||
}
|
||||
|
@ -121,19 +121,19 @@ func @function_call(%arg0 : memref<?xf32>) {
|
|||
call @device_function() : () -> ()
|
||||
call @device_function() : () -> ()
|
||||
%0 = llvm.mlir.addressof @global : !llvm<"i64*">
|
||||
gpu.return
|
||||
gpu.terminator
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func @device_function() {
|
||||
call @recursive_device_function() : () -> ()
|
||||
gpu.return
|
||||
return
|
||||
}
|
||||
|
||||
func @recursive_device_function() {
|
||||
call @recursive_device_function() : () -> ()
|
||||
gpu.return
|
||||
return
|
||||
}
|
||||
|
||||
// CHECK: gpu.module @function_call_kernel {
|
||||
|
@ -141,6 +141,7 @@ func @recursive_device_function() {
|
|||
// CHECK: call @device_function() : () -> ()
|
||||
// CHECK: call @device_function() : () -> ()
|
||||
// CHECK: llvm.mlir.addressof @global : !llvm<"i64*">
|
||||
// CHECK: gpu.return
|
||||
//
|
||||
// CHECK: llvm.mlir.global internal @global(42 : i64) : !llvm.i64
|
||||
//
|
||||
|
|
|
@ -20,7 +20,7 @@ func @main() {
|
|||
%val = sitofp %t3 : i32 to f32
|
||||
%sum = "gpu.all_reduce"(%val) ({}) { op = "add" } : (f32) -> (f32)
|
||||
store %sum, %kernel_dst[%tz, %ty, %tx] : memref<?x?x?xf32>
|
||||
gpu.return
|
||||
gpu.terminator
|
||||
}
|
||||
%U = memref_cast %dst : memref<?x?x?xf32> to memref<*xf32>
|
||||
call @print_memref_f32(%U) : (memref<*xf32>) -> ()
|
||||
|
|
|
@ -18,7 +18,7 @@ func @main() {
|
|||
}) : (i32) -> (i32)
|
||||
%res = sitofp %xor : i32 to f32
|
||||
store %res, %kernel_dst[%tx] : memref<?xf32>
|
||||
gpu.return
|
||||
gpu.terminator
|
||||
}
|
||||
%U = memref_cast %dst : memref<?xf32> to memref<*xf32>
|
||||
call @print_memref_f32(%U) : (memref<*xf32>) -> ()
|
||||
|
|
|
@ -7,7 +7,7 @@ func @other_func(%arg0 : f32, %arg1 : memref<?xf32>) {
|
|||
threads(%tx, %ty, %tz) in (%block_x = %cst2, %block_y = %cst, %block_z = %cst)
|
||||
args(%kernel_arg0 = %arg0, %kernel_arg1 = %arg1) : f32, memref<?xf32> {
|
||||
store %kernel_arg0, %kernel_arg1[%tx] : memref<?xf32>
|
||||
gpu.return
|
||||
gpu.terminator
|
||||
}
|
||||
return
|
||||
}
|
||||
|
|
|
@ -21,7 +21,7 @@ func @main() {
|
|||
br ^bb1(%m1 : f32)
|
||||
^bb1(%value : f32):
|
||||
store %value, %kernel_dst[%tx] : memref<?xf32>
|
||||
gpu.return
|
||||
gpu.terminator
|
||||
}
|
||||
%U = memref_cast %dst : memref<?xf32> to memref<*xf32>
|
||||
call @print_memref_f32(%U) : (memref<*xf32>) -> ()
|
||||
|
|
Loading…
Reference in New Issue