[mlir][gpu] Add lowering to LLVM for `gpu.wait` and `gpu.wait async`.

Reviewed By: herhut

Differential Revision: https://reviews.llvm.org/D89686
Christian Sigg 2020-10-21 08:24:53 +02:00
parent 1c1803dbb0
commit 3ac561d8c3
3 changed files with 120 additions and 1 deletion


@@ -91,6 +91,7 @@ def ConvertAVX512ToLLVM : Pass<"convert-avx512-to-llvm", "ModuleOp"> {
def GpuToLLVMConversionPass : Pass<"gpu-to-llvm", "ModuleOp"> {
  let summary = "Convert GPU dialect to LLVM dialect with GPU runtime calls";
  let constructor = "mlir::createGpuToLLVMConversionPass()";
  let dependentDialects = ["LLVM::LLVMDialect"];
  let options = [
    Option<"gpuBinaryAnnotation", "gpu-binary-annotation", "std::string",
           "", "Annotation attribute string for GPU binary">,


@@ -157,6 +157,34 @@ private:
                  ConversionPatternRewriter &rewriter) const override;
};

/// A rewrite pattern to convert gpu.wait operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).
class ConvertWaitOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp> {
public:
  ConvertWaitOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(Operation *op, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override;
};

/// A rewrite pattern to convert gpu.wait async operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).
class ConvertWaitAsyncOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp> {
public:
  ConvertWaitAsyncOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(Operation *op, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override;
};
/// A rewrite pattern to convert gpu.launch_func operations into a sequence of
/// GPU runtime calls. Currently it supports CUDA and ROCm (HIP).
///
@@ -257,6 +285,69 @@ LogicalResult ConvertHostRegisterOpToGpuRuntimeCallPattern::matchAndRewrite(
  return success();
}

// Converts `gpu.wait` to runtime calls. The operands are all CUDA or ROCm
// streams (i.e. void*). The converted op synchronizes the host with every
// stream and then destroys it. That is, it assumes that the stream is not used
// afterwards. In case this isn't correct, we will get a runtime error.
// Eventually, we will have a pass that guarantees this property.
LogicalResult ConvertWaitOpToGpuRuntimeCallPattern::matchAndRewrite(
    Operation *op, ArrayRef<Value> operands,
    ConversionPatternRewriter &rewriter) const {
  if (cast<gpu::WaitOp>(op).asyncToken())
    return failure(); // The gpu.wait is async.
  Location loc = op->getLoc();
  for (auto asyncDependency : operands)
    streamSynchronizeCallBuilder.create(loc, rewriter, {asyncDependency});
  for (auto asyncDependency : operands)
    streamDestroyCallBuilder.create(loc, rewriter, {asyncDependency});
  rewriter.eraseOp(op);
  return success();
}
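To illustrate, a minimal sketch of this rewrite on a single token, assuming the mgpu* runtime wrapper functions used in the test below and the i8* stream representation introduced by this patch (the printed types are illustrative):

%gpu.wait [%t0]

lowers to roughly:

llvm.call @mgpuStreamSynchronize(%t0) : (!llvm.ptr<i8>) -> ()
llvm.call @mgpuStreamDestroy(%t0) : (!llvm.ptr<i8>) -> ()
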
// Converts `gpu.wait async` to runtime calls. The result is a new stream that
// is synchronized with all operands, which are CUDA or ROCm streams (i.e.
// void*). We create and record an event after the definition of the stream
// and make the new stream wait on that event before destroying it again. This
// assumes that there is no other use between the definition and this op, and
// the plan is to have a pass that guarantees this property.
LogicalResult ConvertWaitAsyncOpToGpuRuntimeCallPattern::matchAndRewrite(
    Operation *op, ArrayRef<Value> operands,
    ConversionPatternRewriter &rewriter) const {
  if (!cast<gpu::WaitOp>(op).asyncToken())
    return failure(); // The gpu.wait is not async.
  Location loc = op->getLoc();
  auto insertionPoint = rewriter.saveInsertionPoint();
  SmallVector<Value, 1> events;
  for (auto pair : llvm::zip(op->getOperands(), operands)) {
    auto token = std::get<0>(pair);
    if (auto *defOp = token.getDefiningOp()) {
      rewriter.setInsertionPointAfter(defOp);
    } else {
      // If we can't find the defining op, we record the event at block start,
      // which is late and therefore misses parallelism, but still valid.
      rewriter.setInsertionPointToStart(op->getBlock());
    }
    auto event = eventCreateCallBuilder.create(loc, rewriter, {}).getResult(0);
    auto stream = std::get<1>(pair);
    eventRecordCallBuilder.create(loc, rewriter, {event, stream});
    events.push_back(event);
  }
  rewriter.restoreInsertionPoint(insertionPoint);
  auto stream = streamCreateCallBuilder.create(loc, rewriter, {}).getResult(0);
  for (auto event : events)
    streamWaitEventCallBuilder.create(loc, rewriter, {stream, event});
  for (auto event : events)
    eventDestroyCallBuilder.create(loc, rewriter, {event});
  rewriter.replaceOp(op, {stream});
  return success();
}
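Similarly, a minimal sketch of the async form, again assuming the mgpu* wrappers (event and stream handles are both lowered to i8*):

%t1 = gpu.wait async [%t0]

lowers to roughly (the event is created and recorded immediately after the op defining %t0):

%e0 = llvm.call @mgpuEventCreate() : () -> !llvm.ptr<i8>
llvm.call @mgpuEventRecord(%e0, %t0) : (!llvm.ptr<i8>, !llvm.ptr<i8>) -> ()
%t1 = llvm.call @mgpuStreamCreate() : () -> !llvm.ptr<i8>
llvm.call @mgpuStreamWaitEvent(%t1, %e0) : (!llvm.ptr<i8>, !llvm.ptr<i8>) -> ()
llvm.call @mgpuEventDestroy(%e0) : (!llvm.ptr<i8>) -> ()
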
// Creates a struct containing all kernel parameters on the stack and returns
// an array of type-erased pointers to the fields of the struct. The array can
// then be passed to the CUDA / ROCm (HIP) kernel launch calls.
@@ -411,7 +502,13 @@ mlir::createGpuToLLVMConversionPass(StringRef gpuBinaryAnnotation) {
void mlir::populateGpuToLLVMConversionPatterns(
    LLVMTypeConverter &converter, OwningRewritePatternList &patterns,
    StringRef gpuBinaryAnnotation) {
  converter.addConversion(
      [context = &converter.getContext()](gpu::AsyncTokenType type) -> Type {
        return LLVM::LLVMType::getInt8PtrTy(context);
      });
  patterns.insert<ConvertHostRegisterOpToGpuRuntimeCallPattern,
                  ConvertWaitOpToGpuRuntimeCallPattern,
                  ConvertWaitAsyncOpToGpuRuntimeCallPattern>(converter);
  patterns.insert<ConvertLaunchFuncOpToGpuRuntimeCallPattern>(
      converter, gpuBinaryAnnotation);
  patterns.insert<EraseGpuModuleOpPattern>(&converter.getContext());
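The addConversion lambda above is what ties the patterns together: every !gpu.async.token value is type-converted to an i8 pointer carrying the runtime stream handle. A minimal sketch (the printed LLVM dialect type is assumed):

%token = gpu.wait async            // %token : !gpu.async.token

becomes

%token = llvm.call @mgpuStreamCreate() : () -> !llvm.ptr<i8>
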


@@ -0,0 +1,21 @@
// RUN: mlir-opt -allow-unregistered-dialect %s --gpu-to-llvm | FileCheck %s

module attributes {gpu.container_module} {
  func @foo() {
    // CHECK: %[[t0:.*]] = llvm.call @mgpuStreamCreate
    // CHECK: %[[e0:.*]] = llvm.call @mgpuEventCreate
    // CHECK: llvm.call @mgpuEventRecord(%[[e0]], %[[t0]])
    %t0 = gpu.wait async
    // CHECK: %[[t1:.*]] = llvm.call @mgpuStreamCreate
    // CHECK: llvm.call @mgpuStreamWaitEvent(%[[t1]], %[[e0]])
    // CHECK: llvm.call @mgpuEventDestroy(%[[e0]])
    %t1 = gpu.wait async [%t0]
    // CHECK: llvm.call @mgpuStreamSynchronize(%[[t0]])
    // CHECK: llvm.call @mgpuStreamSynchronize(%[[t1]])
    // CHECK: llvm.call @mgpuStreamDestroy(%[[t0]])
    // CHECK: llvm.call @mgpuStreamDestroy(%[[t1]])
    gpu.wait [%t0, %t1]
    return
  }
}