Add 'gpu.terminator' operation.

Summary:
The 'gpu.terminator' operation is used as the terminator for the
regions of gpu.launch. This disambiguates them from the return
operation on 'gpu.func' functions.
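
For illustration only (not part of this change), a minimal sketch of the
intended pairing; the value names and %sz are placeholders and the
enclosing gpu.module is omitted:

  gpu.launch blocks(%bx, %by, %bz) in (%gx = %sz, %gy = %sz, %gz = %sz)
             threads(%tx, %ty, %tz) in (%lx = %sz, %ly = %sz, %lz = %sz) {
    // Launch regions are now closed by the new terminator.
    gpu.terminator
  }

  gpu.func @kernel(%arg0: f32) kernel {
    // Bodies of 'gpu.func' keep using 'gpu.return'.
    gpu.return
  }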

This is a breaking change, and users of the gpu dialect will need
to adapt their code when producing 'gpu.launch' operations.
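
As an illustrative before/after sketch of that adaptation (mirroring the
test updates below; sizes and value names are again placeholders):

  // Before this change, a launch region was closed with 'gpu.return':
  gpu.launch blocks(%bx, %by, %bz) in (%gx = %sz, %gy = %sz, %gz = %sz)
             threads(%tx, %ty, %tz) in (%lx = %sz, %ly = %sz, %lz = %sz) {
    gpu.return
  }

  // After this change, the same region must end in 'gpu.terminator':
  gpu.launch blocks(%bx, %by, %bz) in (%gx = %sz, %gy = %sz, %gz = %sz)
             threads(%tx, %ty, %tz) in (%lx = %sz, %ly = %sz, %lz = %sz) {
    gpu.terminator
  }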

Reviewers: nicolasvasilache

Subscribers: mehdi_amini, rriddle, jpienaar, burmako, shauheen, antiagainst, csigg, arpith-jacob, mgester, lucyrfox, liufengdb, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D73620
Stephan Herhut 2020-01-29 13:59:36 +01:00
parent 2930dab315
commit 2692751895
14 changed files with 142 additions and 39 deletions


@@ -472,8 +472,24 @@ def GPU_LaunchOp : GPU_Op<"launch", [IsolatedFromAbove]>,
let verifier = [{ return ::verify(*this); }];
}
def GPU_ReturnOp : GPU_Op<"return", [Terminator]>, Arguments<(ins)>,
Results<(outs)> {
def GPU_ReturnOp : GPU_Op<"return", [HasParent<"GPUFuncOp">, Terminator]>,
Arguments<(ins Variadic<AnyType>:$operands)>, Results<(outs)> {
let summary = "Terminator for GPU functions.";
let description = [{
A terminator operation for regions that appear in the body of `gpu.func`
functions. The operands to the `gpu.return` are the result values returned
by an invocation of the `gpu.func`.
}];
let builders = [OpBuilder<"Builder *builder, OperationState &result", " // empty">];
let parser = [{ return parseReturnOp(parser, result); }];
let printer = [{ p << getOperationName(); }];
let verifier = [{ return ::verify(*this); }];
}
def GPU_TerminatorOp : GPU_Op<"terminator", [HasParent<"LaunchOp">, Terminator]>,
Arguments<(ins)>, Results<(outs)> {
let summary = "Terminator for GPU launch regions.";
let description = [{
A terminator operation for regions that appear in the body of `gpu.launch`


@@ -306,9 +306,9 @@ createLaunchBody(OpBuilder &builder, OpTy rootForOp, gpu::LaunchOp launchOp,
unsigned numBlockDims, unsigned numThreadDims) {
OpBuilder::InsertionGuard bodyInsertionGuard(builder);
builder.setInsertionPointToEnd(&launchOp.body().front());
auto returnOp = builder.create<gpu::ReturnOp>(launchOp.getLoc());
auto terminatorOp = builder.create<gpu::TerminatorOp>(launchOp.getLoc());
rootForOp.getOperation()->moveBefore(returnOp);
rootForOp.getOperation()->moveBefore(terminatorOp);
SmallVector<Value, 3> workgroupID, numWorkGroups;
packIdAndNumId(launchOp.getBlockIds(), launchOp.getGridSize(), numBlockDims,
workgroupID, numWorkGroups);
@@ -435,7 +435,7 @@ void LoopToGpuConverter::createLaunch(OpTy rootForOp, OpTy innermostForOp,
Location terminatorLoc = terminator.getLoc();
terminator.erase();
builder.setInsertionPointToEnd(innermostForOp.getBody());
builder.create<gpu::ReturnOp>(terminatorLoc);
builder.create<gpu::TerminatorOp>(terminatorLoc, llvm::None);
launchOp.body().front().getOperations().splice(
launchOp.body().front().begin(),
innermostForOp.getBody()->getOperations());


@@ -270,18 +270,19 @@ static LogicalResult verify(LaunchOp op) {
}
// Block terminators without successors are expected to exit the kernel region
// and must be `gpu.launch`.
// and must be `gpu.terminator`.
for (Block &block : op.body()) {
if (block.empty())
continue;
if (block.back().getNumSuccessors() != 0)
continue;
if (!isa<gpu::ReturnOp>(&block.back())) {
if (!isa<gpu::TerminatorOp>(&block.back())) {
return block.back()
.emitError("expected 'gpu.terminator' or a terminator with "
"successors")
.attachNote(op.getLoc())
<< "in '" << LaunchOp::getOperationName() << "' body region";
.emitError()
.append("expected '", gpu::TerminatorOp::getOperationName(),
"' or a terminator with successors")
.attachNote(op.getLoc())
.append("in '", LaunchOp::getOperationName(), "' body region");
}
}
@@ -680,7 +681,7 @@ static ParseResult parseGPUFuncOp(OpAsmParser &parser, OperationState &result) {
<< "gpu.func requires named arguments";
// Construct the function type. More types will be added to the region, but
// not to the functiont type.
// not to the function type.
Builder &builder = parser.getBuilder();
auto type = builder.getFunctionType(argTypes, resultTypes);
result.addAttribute(GPUFuncOp::getTypeAttrName(), TypeAttr::get(type));
@@ -767,6 +768,10 @@ LogicalResult GPUFuncOp::verifyType() {
if (!type.isa<FunctionType>())
return emitOpError("requires '" + getTypeAttrName() +
"' attribute of function type");
if (isKernel() && getType().getNumResults() != 0)
return emitOpError() << "expected void return type for kernel function";
return success();
}
@@ -814,6 +819,45 @@ LogicalResult GPUFuncOp::verifyBody() {
return success();
}
//===----------------------------------------------------------------------===//
// ReturnOp
//===----------------------------------------------------------------------===//
static ParseResult parseReturnOp(OpAsmParser &parser, OperationState &result) {
llvm::SmallVector<OpAsmParser::OperandType, 4> operands;
llvm::SmallVector<Type, 4> types;
if (parser.parseOperandList(operands) ||
parser.parseOptionalColonTypeList(types) ||
parser.resolveOperands(operands, types, parser.getCurrentLocation(),
result.operands))
return failure();
return success();
}
static LogicalResult verify(gpu::ReturnOp returnOp) {
GPUFuncOp function = returnOp.getParentOfType<GPUFuncOp>();
FunctionType funType = function.getType();
if (funType.getNumResults() != returnOp.operands().size())
return returnOp.emitOpError()
.append("expected ", funType.getNumResults(), " result operands")
.attachNote(function.getLoc())
.append("return type declared here");
for (auto pair : llvm::enumerate(
llvm::zip(function.getType().getResults(), returnOp.operands()))) {
Type type;
Value operand;
std::tie(type, operand) = pair.value();
if (type != operand.getType())
return returnOp.emitOpError() << "unexpected type `" << operand.getType()
<< "' for operand #" << pair.index();
}
return success();
}
//===----------------------------------------------------------------------===//
// GPUModuleOp
//===----------------------------------------------------------------------===//


@@ -99,7 +99,7 @@ static gpu::LaunchFuncOp inlineBeneficiaryOps(gpu::GPUFuncOp kernelFunc,
}
// Outline the `gpu.launch` operation body into a kernel function. Replace
// `gpu.return` operations by `std.return` in the generated function.
// `gpu.terminator` operations by `gpu.return` in the generated function.
static gpu::GPUFuncOp outlineKernelFunc(gpu::LaunchOp launchOp) {
Location loc = launchOp.getLoc();
// Create a builder with no insertion point, insertion will happen separately
@@ -116,6 +116,12 @@ static gpu::GPUFuncOp outlineKernelFunc(gpu::LaunchOp launchOp) {
builder.getUnitAttr());
outlinedFunc.body().takeBody(launchOp.body());
injectGpuIndexOperations(loc, outlinedFunc.body());
outlinedFunc.walk([](gpu::TerminatorOp op) {
OpBuilder replacer(op);
replacer.create<gpu::ReturnOp>(op.getLoc());
op.erase();
});
return outlinedFunc;
}


@@ -23,7 +23,7 @@ func @foo(%arg0: memref<?xf32>, %arg1 : index) {
// CHECK: %[[prod_j:.*]] = muli %{{.*}}, %{{.*}} : index
// CHECK: addi %{{.*}}, %[[prod_j]] : index
// CHECK: gpu.return
// CHECK: gpu.terminator
}
}
return


@@ -73,8 +73,8 @@ func @step_1(%A : memref<?x?x?x?xf32>, %B : memref<?x?x?x?xf32>) {
// CHECK-22-NEXT: store {{.*}}, %{{.*}}[%[[i]], %[[j]], %[[ii]], %[[jj]]] : memref<?x?x?x?xf32>
store %0, %B[%i, %j, %ii, %jj] : memref<?x?x?x?xf32>
// CHECK-11: gpu.return
// CHECK-22: gpu.return
// CHECK-11: gpu.terminator
// CHECK-22: gpu.terminator
}
}
}


@@ -21,7 +21,7 @@ func @propagate_constant(%arg1: memref<?xf32>) {
// CHECK: "bar"(%[[inner_arg]])
"bar"(%y) : (memref<?xf32>) -> ()
gpu.return
gpu.terminator
}
return
}


@@ -376,7 +376,7 @@ func @shuffle_unsupported_type(%arg0 : index, %arg1 : i32, %arg2 : i32) {
// -----
module {
module @gpu_funcs attributes {gpu.kernel_module} {
gpu.module @gpu_funcs {
// expected-error @+1 {{custom op 'gpu.func' gpu.func requires named arguments}}
gpu.func @kernel_1(f32, f32) {
^bb0(%arg0: f32):
@@ -428,3 +428,39 @@ module {
}
}
}
// -----
module {
module @gpu_funcs attributes {gpu.kernel_module} {
// expected-error @+1 {{expected memory space 5 in attribution}}
gpu.func @kernel() private(%0: memref<4xf32>) {
gpu.return
}
}
}
// -----
module {
gpu.module @gpu_funcs {
// expected-note @+1 {{return type declared here}}
gpu.func @kernel() {
%0 = constant 0 : index
// expected-error @+1 {{'gpu.return' op expected 0 result operands}}
gpu.return %0 : index
}
}
}
// -----
module {
gpu.module @gpu_funcs {
// expected-error @+1 {{'gpu.func' op expected void return type for kernel function}}
gpu.func @kernel() -> index kernel {
%0 = constant 0 : index
gpu.return
}
}
}


@@ -7,8 +7,8 @@ module attributes {gpu.container_module} {
// CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}})
gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %sz, %grid_y = %sz, %grid_z = %sz)
threads(%tx, %ty, %tz) in (%block_x = %sz, %block_y = %sz, %block_z = %sz) {
// CHECK: gpu.return
gpu.return
// CHECK: gpu.terminator
gpu.terminator
}
return
}
@@ -19,8 +19,8 @@ module attributes {gpu.container_module} {
gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %blk, %grid_y = %blk, %grid_z = %blk)
threads(%tx, %ty, %tz) in (%block_x = %thrd, %block_y = %thrd, %block_z = %thrd)
args(%kernel_arg0 = %float, %kernel_arg1 = %data) : f32, memref<?xf32, 1> {
// CHECK: gpu.return
gpu.return
// CHECK: gpu.terminator
gpu.terminator
}
return
}
@@ -34,8 +34,8 @@ module attributes {gpu.container_module} {
args(%kernel_arg0 = %float, %kernel_arg1 = %data) : f32, memref<?xf32, 1> {
// CHECK: "use"(%{{.*}})
"use"(%kernel_arg0): (f32) -> ()
// CHECK: gpu.return
gpu.return
// CHECK: gpu.terminator
gpu.terminator
}
return
}
@@ -54,8 +54,8 @@ module attributes {gpu.container_module} {
"use"(%val) : (index) -> ()
}) : () -> ()
}) : () -> ()
// CHECK: gpu.return
gpu.return
// CHECK: gpu.terminator
gpu.terminator
}
return
}
@@ -118,11 +118,11 @@ module attributes {gpu.container_module} {
}
module @gpu_funcs attributes {gpu.kernel_module} {
// CHECK-LABEL: gpu.func @kernel_1({{.*}}: f32) -> f32
// CHECK-LABEL: gpu.func @kernel_1({{.*}}: f32)
// CHECK: workgroup
// CHECK: private
// CHECK: attributes
gpu.func @kernel_1(%arg0: f32) -> f32
gpu.func @kernel_1(%arg0: f32)
workgroup(%arg1: memref<42xf32, 3>)
private(%arg2: memref<2xf32, 5>, %arg3: memref<1xf32, 5>)
kernel


@@ -31,7 +31,7 @@ func @launch() {
"use"(%arg0): (f32) -> ()
"some_op"(%bx, %block_x) : (index, index) -> ()
%42 = load %arg1[%tx] : memref<?xf32, 1>
gpu.return
gpu.terminator
}
return
}
@@ -68,14 +68,14 @@ func @multiple_launches() {
%grid_z = %cst)
threads(%tx, %ty, %tz) in (%block_x = %cst, %block_y = %cst,
%block_z = %cst) {
gpu.return
gpu.terminator
}
// CHECK: "gpu.launch_func"(%[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]]) {kernel = "multiple_launches_kernel", kernel_module = @multiple_launches_kernel_0} : (index, index, index, index, index, index) -> ()
gpu.launch blocks(%bx2, %by2, %bz2) in (%grid_x2 = %cst, %grid_y2 = %cst,
%grid_z2 = %cst)
threads(%tx2, %ty2, %tz2) in (%block_x2 = %cst, %block_y2 = %cst,
%block_z2 = %cst) {
gpu.return
gpu.terminator
}
return
}
@@ -99,7 +99,7 @@ func @extra_constants(%arg0 : memref<?xf32>) {
%block_z = %cst)
args(%kernel_arg0 = %cst2, %kernel_arg1 = %arg0, %kernel_arg2 = %cst3) : index, memref<?xf32>, index {
"use"(%kernel_arg0, %kernel_arg1, %kernel_arg2) : (index, memref<?xf32>, index) -> ()
gpu.return
gpu.terminator
}
return
}
@@ -121,19 +121,19 @@ func @function_call(%arg0 : memref<?xf32>) {
call @device_function() : () -> ()
call @device_function() : () -> ()
%0 = llvm.mlir.addressof @global : !llvm<"i64*">
gpu.return
gpu.terminator
}
return
}
func @device_function() {
call @recursive_device_function() : () -> ()
gpu.return
return
}
func @recursive_device_function() {
call @recursive_device_function() : () -> ()
gpu.return
return
}
// CHECK: gpu.module @function_call_kernel {
@@ -141,6 +141,7 @@ func @recursive_device_function() {
// CHECK: call @device_function() : () -> ()
// CHECK: call @device_function() : () -> ()
// CHECK: llvm.mlir.addressof @global : !llvm<"i64*">
// CHECK: gpu.return
//
// CHECK: llvm.mlir.global internal @global(42 : i64) : !llvm.i64
//


@@ -20,7 +20,7 @@ func @main() {
%val = sitofp %t3 : i32 to f32
%sum = "gpu.all_reduce"(%val) ({}) { op = "add" } : (f32) -> (f32)
store %sum, %kernel_dst[%tz, %ty, %tx] : memref<?x?x?xf32>
gpu.return
gpu.terminator
}
%U = memref_cast %dst : memref<?x?x?xf32> to memref<*xf32>
call @print_memref_f32(%U) : (memref<*xf32>) -> ()


@@ -18,7 +18,7 @@ func @main() {
}) : (i32) -> (i32)
%res = sitofp %xor : i32 to f32
store %res, %kernel_dst[%tx] : memref<?xf32>
gpu.return
gpu.terminator
}
%U = memref_cast %dst : memref<?xf32> to memref<*xf32>
call @print_memref_f32(%U) : (memref<*xf32>) -> ()


@@ -7,7 +7,7 @@ func @other_func(%arg0 : f32, %arg1 : memref<?xf32>) {
threads(%tx, %ty, %tz) in (%block_x = %cst2, %block_y = %cst, %block_z = %cst)
args(%kernel_arg0 = %arg0, %kernel_arg1 = %arg1) : f32, memref<?xf32> {
store %kernel_arg0, %kernel_arg1[%tx] : memref<?xf32>
gpu.return
gpu.terminator
}
return
}


@@ -21,7 +21,7 @@ func @main() {
br ^bb1(%m1 : f32)
^bb1(%value : f32):
store %value, %kernel_dst[%tx] : memref<?xf32>
gpu.return
gpu.terminator
}
%U = memref_cast %dst : memref<?xf32> to memref<*xf32>
call @print_memref_f32(%U) : (memref<*xf32>) -> ()