[mlir][gpu] Move gpu.wait ops from async.execute regions to their dependencies.

This can prevent unnecessary host synchronization.
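
For example (a minimal sketch distilled from the new test; launch operands and
the exact printed form of `async.execute` are abbreviated), input such as

  %a = async.execute {
    gpu.launch_func @kernels::@kernel
        blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz)
    async.yield
  }
  async.await %a : !async.token

no longer host-synchronizes inside the region; instead the `!gpu.async.token`
is yielded from the region and only waited on where it is actually used:

  %a, %f = async.execute {
    %t = gpu.launch_func async @kernels::@kernel
        blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz)
    async.yield %t : !gpu.async.token
  }
  async.await %a : !async.token
  %t = async.await %f : !async.value<!gpu.async.token>
  gpu.wait [%t]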

Reviewed By: herhut

Differential Revision: https://reviews.llvm.org/D90346
Christian Sigg, 2020-12-02 09:48:59 +01:00
commit d9adde5ae2 (parent a36f8fb021)
2 changed files with 240 additions and 9 deletions

mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp

@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "PassDetail.h"
#include "mlir/Dialect/Async/IR/Async.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/GPU/Passes.h"
#include "mlir/Dialect/GPU/Utils.h"
@@ -22,24 +23,35 @@
#include "mlir/IR/SymbolTable.h"
#include "mlir/Support/LLVM.h"
#include "mlir/Transforms/RegionUtils.h"
#include "llvm/ADT/TypeSwitch.h"
using namespace mlir;
namespace {
class GpuAsyncRegionPass : public GpuAsyncRegionPassBase<GpuAsyncRegionPass> {
struct ThreadTokenCallback;
struct DeferWaitCallback;
void runOnFunction() override;
};
} // namespace
static bool isTerminator(Operation *op) { return !op->isKnownNonTerminator(); }
static bool hasSideEffects(Operation *op) {
return !MemoryEffectOpInterface::hasNoEffect(op);
}
// Region walk callback which makes GPU ops implementing the AsyncOpInterface
// execute asynchronously.
struct GpuAsyncRegionPass::ThreadTokenCallback {
ThreadTokenCallback(MLIRContext &context) : builder(&context) {}
// If `op` implements the AsyncOpInterface, insert a `gpu.wait async` to
// create a current token (unless it already exists), and 'thread' that token
// through the `op` so that it executes asynchronously.
//
// If `op` is a terminator or an op with side-effects, insert a `gpu.wait` to
// host-synchronize execution. A `!gpu.async.token` will therefore only be
// used inside of its block and GPU execution will always synchronize with
// the host at block boundaries.
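//
// For example (an illustrative sketch of the result within a single block):
//   %t0 = gpu.wait async                   // creates the current token
//   %t1 = gpu.launch_func async [%t0] ...  // token threaded through the op
//   gpu.wait [%t1]                         // sync before op with side effects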
WalkResult operator()(Operation *op) {
if (isa<gpu::LaunchOp>(op))
return op->emitOpError("replace with gpu.launch_func first");
@@ -50,14 +62,13 @@ struct GpuAsyncRegionPass::Callback {
return rewriteAsyncOp(asyncOp); // Replace GPU op with async version.
if (!currentToken)
return success();
// Insert host synchronization before terminator or op with side effects.
if (isTerminator(op) || hasSideEffects(op))
currentToken = createWaitOp(op->getLoc(), Type(), {currentToken});
return success();
}
private:
// Replaces asyncOp with a clone that returns a token.
LogicalResult rewriteAsyncOp(gpu::AsyncOpInterface asyncOp) {
auto *op = asyncOp.getOperation();
@@ -104,13 +115,159 @@ struct GpuAsyncRegionPass::Callback {
Value currentToken = {};
};
// Callback for `async.execute` ops which tries to push the contained
// synchronous `gpu.wait` op to the dependencies of the `async.execute`.
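//
// For example (sketch): a trailing synchronous `gpu.wait [%t]` inside an
// `async.execute` region is erased, %t is returned from the region as an
// extra `!async.value<!gpu.async.token>` result, and every user of the
// region's token then waits on (or forwards) that value instead.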
struct GpuAsyncRegionPass::DeferWaitCallback {
// If `executeOp`'s token is used only in `async.execute` or `async.await`
// ops, add the region's last `gpu.wait` op to the worklist if it is
// synchronous and is the last op with side effects.
void operator()(async::ExecuteOp executeOp) {
if (!areAllUsersExecuteOrAwait(executeOp.token()))
return;
// async.execute's region is currently restricted to one block.
for (auto &op : llvm::reverse(executeOp.getBody()->without_terminator())) {
if (auto waitOp = dyn_cast<gpu::WaitOp>(op)) {
if (!waitOp.asyncToken())
worklist.push_back(waitOp);
return;
}
if (hasSideEffects(&op))
return;
}
}
// The destructor performs the actual rewrite work.
~DeferWaitCallback() {
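// Note: addAsyncDependencyAfter() below may append new entries to
// `worklist` while we iterate, so loop by index rather than by iterator.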
for (size_t i = 0; i < worklist.size(); ++i) {
auto waitOp = worklist[i];
auto executeOp = waitOp.getParentOfType<async::ExecuteOp>();
auto numDependencies = waitOp.asyncDependencies().size();
// Erase `gpu.wait` and return async dependencies from region instead.
auto &yieldOp = executeOp.getBody()->getOperations().back();
yieldOp.insertOperands(yieldOp.getNumOperands(),
waitOp.asyncDependencies());
waitOp.erase();
auto asyncTokens = addAsyncTokenResults(executeOp, numDependencies);
// Add the async dependency to each user of the `async.execute` token.
for (Operation *user : executeOp.token().getUsers())
addAsyncDependencyAfter(asyncTokens, user);
}
}
private:
// Append `count` `!async.value<!gpu.async.token>` results to `executeOp`.
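// For example (sketch, with count = 1):
//   %token = async.execute { ... }
// is recreated as
//   %token, %value = async.execute -> !async.value<!gpu.async.token> { ... }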
static ValueRange addAsyncTokenResults(async::ExecuteOp &executeOp,
unsigned count) {
auto numResults = executeOp.getNumResults() + count;
// Construct new result type list with `count` additional types.
SmallVector<Type, 2> resultTypes;
resultTypes.reserve(numResults);
copy(executeOp.getResultTypes(), std::back_inserter(resultTypes));
OpBuilder builder(executeOp);
auto tokenType = builder.getType<gpu::AsyncTokenType>();
resultTypes.resize(numResults, tokenType);
// Clone executeOp with the extra `!gpu.async.token` results.
auto newOp = builder.create<async::ExecuteOp>(
executeOp.getLoc(), TypeRange{resultTypes}.drop_front() /*drop token*/,
executeOp.dependencies(), executeOp.operands());
BlockAndValueMapping mapper;
newOp.getRegion().getBlocks().clear();
executeOp.getRegion().cloneInto(&newOp.getRegion(), mapper);
// Replace executeOp with cloned one.
executeOp.getOperation()->replaceAllUsesWith(
newOp.getResults().drop_back(count));
executeOp.erase();
executeOp = newOp;
// Return the new result values.
return executeOp.getResults().take_back(count);
}
// Returns whether all token users are either 'async.execute' or 'async.await'
// ops. This is used as a requirement for pushing 'gpu.wait' ops from a
// 'async.execute' body to it's users. Specifically, we do not allow
// terminator users, because it could mean that the `async.execute` is inside
// control flow code.
static bool areAllUsersExecuteOrAwait(Value token) {
return llvm::all_of(token.getUsers(), [](Operation *user) {
return isa<async::ExecuteOp, async::AwaitOp>(user);
});
}
// Add the `asyncTokens` as dependencies as needed after `op`.
void addAsyncDependencyAfter(ValueRange asyncTokens, Operation *op) {
OpBuilder builder(op->getContext());
auto loc = op->getLoc();
Block::iterator it;
SmallVector<Value, 1> tokens;
tokens.reserve(asyncTokens.size());
TypeSwitch<Operation *>(op)
.Case<async::AwaitOp>([&](auto awaitOp) {
// Add async.await ops to wait for the !gpu.async.tokens.
builder.setInsertionPointAfter(op);
for (auto asyncToken : asyncTokens)
tokens.push_back(
builder.create<async::AwaitOp>(loc, asyncToken).result());
// Set `it` after the inserted async.await ops.
it = builder.getInsertionPoint();
})
.Case<async::ExecuteOp>([&](auto executeOp) {
// Set `it` to the beginning of the region and add asyncTokens to the
// async.execute operands.
it = executeOp.getBody()->begin();
executeOp.operandsMutable().append(asyncTokens);
SmallVector<Type, 1> tokenTypes(
asyncTokens.size(), builder.getType<gpu::AsyncTokenType>());
copy(executeOp.getBody()->addArguments(tokenTypes),
std::back_inserter(tokens));
});
// Advance `it` to the terminator or to the first op with side effects.
it = std::find_if(it, Block::iterator(), [](Operation &op) {
return isTerminator(&op) || hasSideEffects(&op);
});
// If `op` implements the AsyncOpInterface, add `token` to the list of async
// dependencies.
if (auto asyncOp = dyn_cast<gpu::AsyncOpInterface>(*it)) {
for (auto token : tokens)
asyncOp.addAsyncDependency(token);
return;
}
// Otherwise, insert a gpu.wait before 'it'.
builder.setInsertionPoint(it->getBlock(), it);
auto waitOp = builder.create<gpu::WaitOp>(loc, Type{}, tokens);
// If the new waitOp is at the end of an async.execute region, add it to the
// worklist. 'operator()(executeOp)' would do the same, but this is faster.
auto executeOp = dyn_cast<async::ExecuteOp>(it->getParentOp());
if (executeOp && areAllUsersExecuteOrAwait(executeOp.token()) &&
!it->getNextNode())
worklist.push_back(waitOp);
}
SmallVector<gpu::WaitOp, 8> worklist;
};
// Replaces synchronous GPU ops in the op's region with asynchronous ones and
// inserts the necessary synchronization (as gpu.wait ops). Assumes sequential
// execution semantics and that no GPU ops are asynchronous yet. Afterwards,
// trailing gpu.wait ops are deferred out of async.execute regions to the
// users of the region's token.
void GpuAsyncRegionPass::runOnFunction() {
if (getFunction()
.getRegion()
.walk(ThreadTokenCallback(getContext()))
.wasInterrupted())
return signalPassFailure();
// Collect gpu.wait ops that we can move out of async.execute regions.
getFunction().getRegion().walk(DeferWaitCallback());
}
std::unique_ptr<OperationPass<FuncOp>> mlir::createGpuAsyncRegionPass() {

mlir/test/Dialect/GPU/async-region.mlir

@@ -24,4 +24,78 @@ module attributes {gpu.container_module} {
return
}
// CHECK-LABEL:func @defer_wait(%{{.*}}: index)
func @defer_wait(%sz : index) {
// CHECK: %[[a0:.*]], %[[f0:.*]] = async.execute
%a0 = async.execute {
// CHECK: %[[t:.*]] = gpu.launch_func async
gpu.launch_func @kernels::@kernel
blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz)
// CHECK-NOT: gpu.wait
// CHECK: async.yield %[[t]]
async.yield
}
// CHECK: %[[a1:.*]], %[[f1:.*]] = async.execute
// CHECK-SAME: %[[f0]]
%a1 = async.execute [%a0] {
// CHECK: %[[t:.*]] = gpu.launch_func async
gpu.launch_func @kernels::@kernel
blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz)
// CHECK-NOT: gpu.wait
// CHECK: async.yield %[[t]]
async.yield
}
// CHECK: async.await %[[a1]]
// CHECK: %[[t:.*]] = async.await %[[f1]]
// CHECK: gpu.wait [%[[t]]]
async.await %a1 : !async.token
return
}
// CHECK-LABEL:func @defer_wait_blocked_by_side_effect(%{{.*}}: index)
func @defer_wait_blocked_by_side_effect(%sz : index) {
// CHECK: %[[a:.*]] = async.execute
%a = async.execute {
// CHECK: %[[t:.*]] = gpu.launch_func async
gpu.launch_func @kernels::@kernel
blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz)
// CHECK: gpu.wait [%[[t]]]
call @foo() : () -> ()
async.yield
}
// CHECK: async.await %[[a]]
// CHECK-NOT: gpu.wait
async.await %a : !async.token
return
}
// CHECK-LABEL:func @defer_wait_pass_through(%{{.*}}: index)
func @defer_wait_pass_through(%sz : index) {
// CHECK: %[[a0:.*]], %[[f0:.*]] = async.execute
%a0 = async.execute {
// CHECK: %[[t:.*]] = gpu.launch_func async
gpu.launch_func @kernels::@kernel
blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz)
// CHECK-NOT: gpu.wait
// CHECK: async.yield %[[t]]
async.yield
}
// CHECK: %[[a1:.*]], %[[f1:.*]] = async.execute
// CHECK-SAME: %[[f0]]
%a1 = async.execute [%a0] {
// CHECK-NOT: gpu.wait
// CHECK: async.yield %{{.*}}
async.yield
}
// CHECK: async.await %[[a1]]
// CHECK: %[[t:.*]] = async.await %[[f1]]
// CHECK: gpu.wait [%[[t]]]
async.await %a1 : !async.token
return
}
}
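
A test like this is typically driven by a RUN line at the top of the file
(outside the hunk shown above); assuming the pass is registered under the
`gpu-async-region` flag (an assumption based on the pass name), it would read
something like:

// RUN: mlir-opt %s -gpu-async-region | FileCheck %s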