forked from OSchip/llvm-project
[mlir][Linalg] Add a useLinalgCopy option to Linalg bufferization.
Benchmarks show that memref::CopyOp is currently up to 200x slower than tiled and vectorized versions of linalg::Copy. Add a temporary flag to allow comprehensive bufferize to generate a linalg::GenericOp that implements a copy until this performance bug is resolved. Differential Revision: https://reviews.llvm.org/D117696
This commit is contained in:
parent
baa9b7c3c8
commit
d492a7b2ca
|
@ -64,6 +64,8 @@ std::unique_ptr<OperationPass<FuncOp>> createConvertLinalgToAffineLoopsPass();
|
|||
/// on SSA use-def chains starting from function operands that are annotated
|
||||
/// with the 'inplaceable' attribute.
|
||||
std::unique_ptr<Pass> createLinalgComprehensiveModuleBufferizePass();
|
||||
std::unique_ptr<Pass>
|
||||
createLinalgComprehensiveModuleBufferizePass(bool useLinalgCopy);
|
||||
|
||||
/// Create a pass to convert Linalg operations which work on tensors to use
|
||||
/// buffers instead.
|
||||
|
|
|
@ -52,6 +52,9 @@ def LinalgComprehensiveModuleBufferize :
|
|||
Option<"useAlloca", "use-alloca", "bool",
|
||||
/*default=*/"false",
|
||||
"Use stack allocations for memrefs (for testing purposes only)">,
|
||||
Option<"useLinalgCopy", "use-linalg-copy", "bool",
|
||||
/*default=*/"false",
|
||||
"Use a copy operation implemented as a Linalg op.">,
|
||||
Option<"analysisFuzzerSeed", "analysis-fuzzer-seed", "unsigned",
|
||||
/*default=*/"0",
|
||||
"Analyze ops in random order with a given seed (fuzzer)">,
|
||||
|
|
|
@ -39,6 +39,10 @@ struct LinalgComprehensiveModuleBufferize
|
|||
LinalgComprehensiveModuleBufferize(
|
||||
const LinalgComprehensiveModuleBufferize &p) = default;
|
||||
|
||||
LinalgComprehensiveModuleBufferize(bool linalgCopy) {
|
||||
this->useLinalgCopy = linalgCopy;
|
||||
}
|
||||
|
||||
void runOnOperation() override;
|
||||
|
||||
void getDependentDialects(DialectRegistry &registry) const override {
|
||||
|
@ -74,6 +78,32 @@ static FailureOr<Value> allocationFnUsingAlloca(OpBuilder &b, Location loc,
|
|||
return allocated;
|
||||
}
|
||||
|
||||
/// Create a linalg::GenericOp version of an n-D copy that can further tile,
|
||||
/// lower to loops or vectorize, unlike the current implementation of
|
||||
/// memref::CopyOp.
|
||||
/// Do not depend on linalg::CopyOp that is getting deprecated.
|
||||
static LogicalResult createLinalgCopyOp(OpBuilder &b, Location loc, Value from,
|
||||
Value to) {
|
||||
auto memrefTypeFrom = from.getType().cast<MemRefType>();
|
||||
auto memrefTypeTo = to.getType().cast<MemRefType>();
|
||||
if (!memrefTypeFrom || !memrefTypeTo ||
|
||||
memrefTypeFrom.getRank() != memrefTypeTo.getRank())
|
||||
return failure();
|
||||
AffineMap id =
|
||||
AffineMap::getMultiDimIdentityMap(memrefTypeTo.getRank(), b.getContext());
|
||||
SmallVector<StringRef> iteratorTypes(memrefTypeTo.getRank(),
|
||||
getParallelIteratorTypeName());
|
||||
b.create<linalg::GenericOp>(loc,
|
||||
/*inputs=*/from,
|
||||
/*outputs=*/to,
|
||||
/*indexingMaps=*/llvm::makeArrayRef({id, id}),
|
||||
/*iteratorTypes=*/iteratorTypes,
|
||||
[](OpBuilder &b, Location loc, ValueRange args) {
|
||||
b.create<linalg::YieldOp>(loc, args.front());
|
||||
});
|
||||
return success();
|
||||
}
|
||||
|
||||
void LinalgComprehensiveModuleBufferize::runOnOperation() {
|
||||
auto options = std::make_unique<AnalysisBufferizationOptions>();
|
||||
if (useAlloca) {
|
||||
|
@ -82,13 +112,17 @@ void LinalgComprehensiveModuleBufferize::runOnOperation() {
|
|||
return success();
|
||||
};
|
||||
}
|
||||
// TODO: atm memref::CopyOp can be 200x slower than linalg::GenericOp.
|
||||
// Once this perf bug is fixed more systematically, we can revisit.
|
||||
if (useLinalgCopy)
|
||||
options->memCpyFn = createLinalgCopyOp;
|
||||
|
||||
options->allowReturnMemref = allowReturnMemref;
|
||||
options->allowUnknownOps = allowUnknownOps;
|
||||
options->analysisFuzzerSeed = analysisFuzzerSeed;
|
||||
options->testAnalysisOnly = testAnalysisOnly;
|
||||
options->printConflicts = printConflicts;
|
||||
options->createDeallocs = createDeallocs;
|
||||
options->printConflicts = printConflicts;
|
||||
options->testAnalysisOnly = testAnalysisOnly;
|
||||
|
||||
// Enable InitTensorOp elimination.
|
||||
if (initTensorElimination) {
|
||||
|
@ -120,3 +154,8 @@ void LinalgComprehensiveModuleBufferize::runOnOperation() {
|
|||
std::unique_ptr<Pass> mlir::createLinalgComprehensiveModuleBufferizePass() {
|
||||
return std::make_unique<LinalgComprehensiveModuleBufferize>();
|
||||
}
|
||||
|
||||
std::unique_ptr<Pass>
|
||||
mlir::createLinalgComprehensiveModuleBufferizePass(bool useLinalgCopy) {
|
||||
return std::make_unique<LinalgComprehensiveModuleBufferize>(useLinalgCopy);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue