[MLIR][GPU] Add canonicalizer for gpu.memcpy
Erase a gpu.memcpy op when the only uses of its dest are the memcpy op itself and the dest's allocation and deallocation ops.

Reviewed By: bondhugula, csigg

Differential Revision: https://reviews.llvm.org/D124257
commit 16219f8c94
parent 8db72d9d04
@@ -1007,6 +1007,7 @@ def GPU_MemcpyOp : GPU_Op<"memcpy", [GPU_AsyncOpInterface]> {
  }];
  let hasFolder = 1;
  let hasVerifier = 1;
  let hasCanonicalizer = 1;
}

def GPU_MemsetOp : GPU_Op<"memset",
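Setting `hasCanonicalizer` makes ODS declare a canonicalization hook on the generated op class; the patch defines that hook by hand further below. A rough sketch of the declaration ODS is expected to emit (paraphrased under that assumption, not copied from the generated sources):

// Expected shape of the ODS-generated declaration inside the MemcpyOp class.
static void getCanonicalizationPatterns(::mlir::RewritePatternSet &results,
                                        ::mlir::MLIRContext *context);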
@@ -248,6 +248,10 @@ struct Write : public Effect::Base<Write> {};
// SideEffect Utilities
//===----------------------------------------------------------------------===//

/// Returns true if this operation only has the given effect on `value`.
template <typename EffectTy>
bool hasSingleEffect(Operation *op, Value value);

/// Return true if the given operation is unused, and has no side effects on
/// memory that prevent erasing.
bool isOpTriviallyDead(Operation *op);
@@ -24,6 +24,7 @@
#include "mlir/IR/OpImplementation.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/TypeUtilities.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
#include "mlir/Transforms/InliningUtils.h"
#include "llvm/ADT/TypeSwitch.h"
@@ -1105,6 +1106,48 @@ LogicalResult MemcpyOp::verify() {
  return success();
}

namespace {

/// Erases a common case of copy ops where the destination value is used only
/// by the copy op itself and by alloc and dealloc ops.
struct EraseTrivialCopyOp : public OpRewritePattern<MemcpyOp> {
  using OpRewritePattern<MemcpyOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(MemcpyOp op,
                                PatternRewriter &rewriter) const override {
    Value dest = op.dst();
    Operation *destDefOp = dest.getDefiningOp();
    // `dest` must be defined by an op having an Allocate memory effect in
    // order to perform the folding.
    if (!destDefOp ||
        !hasSingleEffect<MemoryEffects::Allocate>(destDefOp, dest))
      return failure();
    // We can erase `op` iff `dest` has no use apart from its use by `op`
    // and by dealloc ops.
    if (llvm::any_of(dest.getUsers(), [op, dest](Operation *user) {
          return user != op &&
                 !hasSingleEffect<MemoryEffects::Free>(user, dest);
        }))
      return failure();
    // We can perform the folding if and only if op has a single async
    // dependency and produces an async token as result, or if it does not have
    // any async dependency and does not produce any async token result.
    if (op.asyncDependencies().size() > 1 ||
        ((op.asyncDependencies().empty() && op.asyncToken()) ||
         (!op.asyncDependencies().empty() && !op.asyncToken())))
      return failure();
    rewriter.replaceOp(op, op.asyncDependencies());
    return success();
  }
};

} // end anonymous namespace

void MemcpyOp::getCanonicalizationPatterns(RewritePatternSet &results,
                                           MLIRContext *context) {
  results.add<EraseTrivialCopyOp>(context);
}

//===----------------------------------------------------------------------===//
// GPU_SubgroupMmaLoadMatrixOp
//===----------------------------------------------------------------------===//
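For context, patterns registered through getCanonicalizationPatterns are picked up by the generic canonicalizer, which is presumably what the new test exercises via `mlir-opt -canonicalize`. The sketch below is a hypothetical standalone driver, not part of the patch; the header paths and the `gpu::MemcpyOp` spelling are assumptions about the MLIR tree of this era.

#include "mlir/Dialect/GPU/GPUDialect.h"  // assumed header path
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include <utility>

// Hypothetical helper: collect the gpu.memcpy canonicalization patterns (which
// now include EraseTrivialCopyOp) and apply them greedily over a module.
static mlir::LogicalResult runMemcpyCanonicalization(mlir::ModuleOp module) {
  mlir::RewritePatternSet patterns(module.getContext());
  mlir::gpu::MemcpyOp::getCanonicalizationPatterns(patterns,
                                                   module.getContext());
  return mlir::applyPatternsAndFoldGreedily(module, std::move(patterns));
}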
@@ -90,6 +90,33 @@ static bool wouldOpBeTriviallyDeadImpl(Operation *rootOp) {
  return true;
}

template <typename EffectTy>
bool mlir::hasSingleEffect(Operation *op, Value value) {
  auto memOp = dyn_cast<MemoryEffectOpInterface>(op);
  if (!memOp)
    return false;
  SmallVector<SideEffects::EffectInstance<MemoryEffects::Effect>, 4> effects;
  memOp.getEffects(effects);
  bool doesOpOnlyHaveSingleEffectOnVal = false;
  // Iterate through `effects` and check that the only effect on `value` is of
  // type `EffectTy`.
  for (auto &effect : effects) {
    if (effect.getValue() == value && isa<EffectTy>(effect.getEffect()))
      doesOpOnlyHaveSingleEffectOnVal = true;
    if (effect.getValue() == value && !isa<EffectTy>(effect.getEffect())) {
      doesOpOnlyHaveSingleEffectOnVal = false;
      break;
    }
  }
  return doesOpOnlyHaveSingleEffectOnVal;
}

template bool mlir::hasSingleEffect<MemoryEffects::Allocate>(Operation *,
                                                             Value);
template bool mlir::hasSingleEffect<MemoryEffects::Free>(Operation *, Value);
template bool mlir::hasSingleEffect<MemoryEffects::Write>(Operation *, Value);
template bool mlir::hasSingleEffect<MemoryEffects::Read>(Operation *, Value);

bool mlir::wouldOpBeTriviallyDead(Operation *op) {
  if (op->mightHaveTrait<OpTrait::IsTerminator>())
    return false;
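A usage note: the query is per value, succeeding only when every effect the op declares on that particular value is of the requested kind. A small hypothetical helper (not part of the patch) built on top of hasSingleEffect:

#include "mlir/IR/Operation.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"

// Returns true if `value` comes straight from an alloc-like op, i.e. its
// defining op's only effect on `value` is Allocate (e.g. memref.alloc or
// gpu.alloc). Returns false for block arguments and for ops such as
// memref.view that do not declare an Allocate effect on their result.
static bool isFreshAllocation(mlir::Value value) {
  mlir::Operation *defOp = value.getDefiningOp();
  return defOp &&
         mlir::hasSingleEffect<mlir::MemoryEffects::Allocate>(defOp, value);
}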
@@ -28,6 +28,70 @@ func.func @fold_wait_op_test2(%arg0: i1) -> (memref<5xf16>, memref<5xf16>) {
// CHECK-NEXT: gpu.alloc async [%[[TOKEN1]]] ()
// CHECK-NEXT: return

// CHECK-LABEL: func @fold_memcpy_op
func.func @fold_memcpy_op(%arg0: i1) {
  %cst = arith.constant 0.000000e+00 : f16
  %1 = memref.alloc() : memref<2xf16>
  %2 = gpu.wait async
  %memref, %asyncToken = gpu.alloc async [%2] () : memref<2xf16>
  gpu.wait [%2]
  affine.store %cst, %memref[0] : memref<2xf16>
  %3 = gpu.wait async
  %4 = gpu.memcpy async [%3] %1, %memref : memref<2xf16>, memref<2xf16>
  gpu.wait [%3]
  %5 = scf.if %arg0 -> (i1) {
    memref.dealloc %1 : memref<2xf16>
    scf.yield %arg0 : i1
  } else {
    memref.dealloc %1 : memref<2xf16>
    scf.yield %arg0 : i1
  }
  return
}
// CHECK-NOT: gpu.memcpy

// We cannot fold memcpy here as dest is a block argument.
// CHECK-LABEL: func @do_not_fold_memcpy_op1
func.func @do_not_fold_memcpy_op1(%arg0: i1, %arg1: memref<2xf16>) {
  %cst = arith.constant 0.000000e+00 : f16
  %2 = gpu.wait async
  %memref, %asyncToken = gpu.alloc async [%2] () : memref<2xf16>
  gpu.wait [%2]
  affine.store %cst, %memref[0] : memref<2xf16>
  %3 = gpu.wait async
  %4 = gpu.memcpy async [%3] %arg1, %memref : memref<2xf16>, memref<2xf16>
  gpu.wait [%3]
  return
}
// CHECK: gpu.memcpy

// We cannot fold gpu.memcpy, as dest is used by an op that has a read effect on it.
// CHECK-LABEL: func @do_not_fold_memcpy_op2
func.func @do_not_fold_memcpy_op2(%arg0: i1, %arg1: index) -> f16 {
  %cst = arith.constant 0.000000e+00 : f16
  %1 = memref.alloc() : memref<2xf16>
  %2 = gpu.wait async
  %memref, %asyncToken = gpu.alloc async [%2] () : memref<2xf16>
  gpu.wait [%2]
  affine.store %cst, %memref[0] : memref<2xf16>
  %3 = gpu.wait async
  %4 = gpu.memcpy async [%3] %1, %memref : memref<2xf16>, memref<2xf16>
  gpu.wait [%3]
  %5 = memref.load %1[%arg1] : memref<2xf16>
  return %5 : f16
}
// CHECK: gpu.memcpy

// We cannot fold gpu.memcpy, as the defining op of dest is not an alloc-like op.
// CHECK-LABEL: func @do_not_fold_memcpy_op3
func.func @do_not_fold_memcpy_op3(%arg0: memref<1xi8>, %arg1: memref<i1>) {
  %0 = arith.constant 0 : index
  %1 = memref.view %arg0[%0][] : memref<1xi8> to memref<i1>
  gpu.memcpy %1, %arg1 : memref<i1>, memref<i1>
  func.return
}
// CHECK: gpu.memcpy

// CHECK-LABEL: @memcpy_after_cast
func.func @memcpy_after_cast(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
  // CHECK-NOT: memref.cast