[MLIR][GPU] Add canonicalizer for gpu.memcpy

Erase a gpu.memcpy op when the only uses of its destination are
the memcpy op itself and the ops that allocate and deallocate the
destination.

Reviewed By: bondhugula, csigg

Differential Revision: https://reviews.llvm.org/D124257
This commit is contained in:
Arnab Dutta 2022-05-13 16:50:16 +05:30 committed by Uday Bondhugula
parent 8db72d9d04
commit 16219f8c94
5 changed files with 139 additions and 0 deletions

View File

@ -1007,6 +1007,7 @@ def GPU_MemcpyOp : GPU_Op<"memcpy", [GPU_AsyncOpInterface]> {
}];
let hasFolder = 1;
let hasVerifier = 1;
let hasCanonicalizer = 1;
}
def GPU_MemsetOp : GPU_Op<"memset",

View File

@ -248,6 +248,10 @@ struct Write : public Effect::Base<Write> {};
// SideEffect Utilities
//===----------------------------------------------------------------------===//
/// Returns true if `op` has at least one memory effect of type `EffectTy` on
/// `value` and no memory effect of any other type on `value`. Returns false
/// if `op` does not implement `MemoryEffectOpInterface` or reports no effect
/// on `value` at all.
template <typename EffectTy>
bool hasSingleEffect(Operation *op, Value value);
/// Return true if the given operation is unused, and has no side effects on
/// memory that prevent erasing.
bool isOpTriviallyDead(Operation *op);

View File

@ -24,6 +24,7 @@
#include "mlir/IR/OpImplementation.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/TypeUtilities.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
#include "mlir/Transforms/InliningUtils.h"
#include "llvm/ADT/TypeSwitch.h"
@ -1105,6 +1106,48 @@ LogicalResult MemcpyOp::verify() {
return success();
}
namespace {
/// Canonicalization pattern removing a `gpu.memcpy` whose destination buffer
/// is never observed: the destination must be produced by an op whose only
/// effect on it is an allocation, and every other user of the destination must
/// only free it.
struct EraseTrivialCopyOp : public OpRewritePattern<MemcpyOp> {
  using OpRewritePattern<MemcpyOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(MemcpyOp op,
                                PatternRewriter &rewriter) const override {
    Value target = op.dst();

    // Block arguments have no defining op and can never match.
    Operation *producer = target.getDefiningOp();
    if (!producer)
      return failure();

    // The producer's only effect on the destination must be `Allocate`.
    if (!hasSingleEffect<MemoryEffects::Allocate>(producer, target))
      return failure();

    // Any user other than this copy that does more than free the destination
    // blocks the rewrite.
    auto blocksErasure = [op, target](Operation *user) {
      if (user == op)
        return false;
      return !hasSingleEffect<MemoryEffects::Free>(user, target);
    };
    if (llvm::any_of(target.getUsers(), blocksErasure))
      return failure();

    // Only two shapes are handled: exactly one async dependency together with
    // an async token result (the token is forwarded from the dependency), or
    // neither a dependency nor a token (the op is simply erased).
    auto dependencies = op.asyncDependencies();
    if (dependencies.size() > 1)
      return failure();
    const bool hasDependency = !dependencies.empty();
    const bool producesToken = static_cast<bool>(op.asyncToken());
    if (hasDependency != producesToken)
      return failure();

    rewriter.replaceOp(op, dependencies);
    return success();
  }
};
} // end anonymous namespace
/// Adds the canonicalization patterns for `gpu.memcpy` to `results`; the only
/// pattern registered here is `EraseTrivialCopyOp`.
void MemcpyOp::getCanonicalizationPatterns(RewritePatternSet &results,
MLIRContext *context) {
results.add<EraseTrivialCopyOp>(context);
}
//===----------------------------------------------------------------------===//
// GPU_SubgroupMmaLoadMatrixOp
//===----------------------------------------------------------------------===//

View File

@ -90,6 +90,33 @@ static bool wouldOpBeTriviallyDeadImpl(Operation *rootOp) {
return true;
}
/// Returns true when `op` reports at least one memory effect on `value` and
/// every effect it reports on `value` is of type `EffectTy`. Ops that do not
/// implement `MemoryEffectOpInterface` never qualify.
template <typename EffectTy>
bool mlir::hasSingleEffect(Operation *op, Value value) {
  auto effectInterface = dyn_cast<MemoryEffectOpInterface>(op);
  if (!effectInterface)
    return false;

  SmallVector<SideEffects::EffectInstance<MemoryEffects::Effect>, 4> effects;
  effectInterface.getEffects(effects);

  bool sawRequestedEffect = false;
  for (auto &instance : effects) {
    // Effects on other values (or on no value) are irrelevant here.
    if (instance.getValue() != value)
      continue;
    // Any effect of a different kind on `value` disqualifies the op.
    if (!isa<EffectTy>(instance.getEffect()))
      return false;
    sawRequestedEffect = true;
  }
  return sawRequestedEffect;
}
// Explicit instantiations of `hasSingleEffect` for the Allocate, Free, Write
// and Read memory effects.
// NOTE(review): presumably needed because the template is only declared in the
// header while its body is defined in this source file -- confirm.
template bool mlir::hasSingleEffect<MemoryEffects::Allocate>(Operation *,
Value);
template bool mlir::hasSingleEffect<MemoryEffects::Free>(Operation *, Value);
template bool mlir::hasSingleEffect<MemoryEffects::Write>(Operation *, Value);
template bool mlir::hasSingleEffect<MemoryEffects::Read>(Operation *, Value);
bool mlir::wouldOpBeTriviallyDead(Operation *op) {
if (op->mightHaveTrait<OpTrait::IsTerminator>())
return false;

View File

@ -28,6 +28,70 @@ func.func @fold_wait_op_test2(%arg0: i1) -> (memref<5xf16>, memref<5xf16>) {
// CHECK-NEXT: gpu.alloc async [%[[TOKEN1]]] ()
// CHECK-NEXT: return
// The memcpy destination %1 comes from memref.alloc and is otherwise used only
// by the memref.dealloc ops in both scf.if branches, so the gpu.memcpy is
// expected to be erased.
// CHECK-LABEL: func @fold_memcpy_op
func.func @fold_memcpy_op(%arg0: i1) {
%cst = arith.constant 0.000000e+00 : f16
%1 = memref.alloc() : memref<2xf16>
%2 = gpu.wait async
%memref, %asyncToken = gpu.alloc async [%2] () : memref<2xf16>
gpu.wait [%2]
affine.store %cst, %memref[0] : memref<2xf16>
%3 = gpu.wait async
%4 = gpu.memcpy async [%3] %1, %memref : memref<2xf16>, memref<2xf16>
gpu.wait [%3]
%5 = scf.if %arg0 -> (i1) {
memref.dealloc %1 : memref<2xf16>
scf.yield %arg0 : i1
} else {
memref.dealloc %1 : memref<2xf16>
scf.yield %arg0 : i1
}
return
}
// CHECK-NOT: gpu.memcpy
// We cannot fold gpu.memcpy here, as dest (%arg1) is a block argument and
// therefore has no defining op.
// CHECK-LABEL: func @do_not_fold_memcpy_op1
func.func @do_not_fold_memcpy_op1(%arg0: i1, %arg1: memref<2xf16>) {
%cst = arith.constant 0.000000e+00 : f16
%2 = gpu.wait async
%memref, %asyncToken = gpu.alloc async [%2] () : memref<2xf16>
gpu.wait [%2]
affine.store %cst, %memref[0] : memref<2xf16>
%3 = gpu.wait async
%4 = gpu.memcpy async [%3] %arg1, %memref : memref<2xf16>, memref<2xf16>
gpu.wait [%3]
return
}
// CHECK: gpu.memcpy
// We cannot fold gpu.memcpy, as dest (%1) is also used by memref.load, an op
// having a read effect on dest.
// CHECK-LABEL: func @do_not_fold_memcpy_op2
func.func @do_not_fold_memcpy_op2(%arg0: i1, %arg1: index) -> f16 {
%cst = arith.constant 0.000000e+00 : f16
%1 = memref.alloc() : memref<2xf16>
%2 = gpu.wait async
%memref, %asyncToken = gpu.alloc async [%2] () : memref<2xf16>
gpu.wait [%2]
affine.store %cst, %memref[0] : memref<2xf16>
%3 = gpu.wait async
%4 = gpu.memcpy async [%3] %1, %memref : memref<2xf16>, memref<2xf16>
gpu.wait [%3]
%5 = memref.load %1[%arg1] : memref<2xf16>
return %5 : f16
}
// CHECK: gpu.memcpy
// We cannot fold gpu.memcpy, as the defining op of dest (memref.view) is not
// an alloc-like op.
// CHECK-LABEL: func @do_not_fold_memcpy_op3
func.func @do_not_fold_memcpy_op3(%arg0: memref<1xi8>, %arg1: memref<i1>) {
%0 = arith.constant 0 : index
%1 = memref.view %arg0[%0][] : memref<1xi8> to memref<i1>
gpu.memcpy %1, %arg1 : memref<i1>, memref<i1>
func.return
}
// CHECK: gpu.memcpy
// CHECK-LABEL: @memcpy_after_cast
func.func @memcpy_after_cast(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
// CHECK-NOT: memref.cast