//===- LoopsToGPUPass.cpp - Convert a loop nest to a GPU kernel -----------===//
//
// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "mlir/Conversion/LoopsToGPU/LoopsToGPUPass.h"
#include "mlir/Conversion/LoopsToGPU/LoopsToGPU.h"
#include "mlir/Dialect/AffineOps/AffineOps.h"
#include "mlir/Dialect/LoopOps/LoopOps.h"
#include "mlir/Dialect/StandardOps/Ops.h"
#include "mlir/Pass/Pass.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/CommandLine.h"
#define PASS_NAME "convert-loops-to-gpu"
#define LOOPOP_TO_GPU_PASS_NAME "convert-loop-op-to-gpu"
using namespace mlir;
using namespace mlir::loop;
static llvm::cl::OptionCategory clOptionsCategory(PASS_NAME " options");
static llvm::cl::opt<unsigned>
    clNumBlockDims("gpu-block-dims",
                   llvm::cl::desc("Number of GPU block dimensions for mapping"),
                   llvm::cl::cat(clOptionsCategory), llvm::cl::init(1u));
static llvm::cl::opt<unsigned> clNumThreadDims(
    "gpu-thread-dims",
    llvm::cl::desc("Number of GPU thread dimensions for mapping"),
    llvm::cl::cat(clOptionsCategory), llvm::cl::init(1u));

static llvm::cl::OptionCategory clLoopOpToGPUCategory(LOOPOP_TO_GPU_PASS_NAME
                                                      " options");
static llvm::cl::list<unsigned>
    clNumWorkGroups("gpu-num-workgroups",
                    llvm::cl::desc("Num workgroups in the GPU launch"),
                    llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated,
                    llvm::cl::cat(clLoopOpToGPUCategory));
static llvm::cl::list<unsigned>
    clWorkGroupSize("gpu-workgroup-size",
                    llvm::cl::desc("Workgroup Size in the GPU launch"),
                    llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated,
                    llvm::cl::cat(clLoopOpToGPUCategory));
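
// For reference, the passes registered at the bottom of this file can be
// exercised from mlir-opt along these lines (an illustrative invocation,
// assuming a standard mlir-opt build):
//
//   mlir-opt -convert-loops-to-gpu -gpu-block-dims=1 -gpu-thread-dims=1 \
//       input.mlir
//   mlir-opt -convert-loop-op-to-gpu -gpu-num-workgroups=4,2 \
//       -gpu-workgroup-size=32,4 input.mlir
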
namespace {
// A pass that traverses top-level loops in the function and converts them to
// GPU launch operations. Nested launches are not allowed, so this does not
// walk the function recursively to avoid considering nested loops.
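// For example, with -gpu-block-dims=1 and -gpu-thread-dims=1, a two-deep nest
// such as the following (an illustrative sketch, not verbatim pass output):
//
//   affine.for %i = 0 to %n {
//     affine.for %j = 0 to %m {
//       "compute"(%i, %j) : (index, index) -> ()
//     }
//   }
//
// is rewritten into a single gpu.launch in which %i is recovered from the
// block id and %j from the thread id.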
struct ForLoopMapper : public FunctionPass<ForLoopMapper> {
  ForLoopMapper(unsigned numBlockDims, unsigned numThreadDims)
      : numBlockDims(numBlockDims), numThreadDims(numThreadDims) {}

  void runOnFunction() override {
    for (Block &block : getFunction())
      for (Operation &op : llvm::make_early_inc_range(block)) {
        if (auto forOp = dyn_cast<AffineForOp>(&op)) {
          if (failed(convertAffineLoopNestToGPULaunch(forOp, numBlockDims,
                                                      numThreadDims)))
            signalPassFailure();
        } else if (auto forOp = dyn_cast<ForOp>(&op)) {
          if (failed(convertLoopNestToGPULaunch(forOp, numBlockDims,
                                                numThreadDims)))
            signalPassFailure();
        }
      }
  }

  unsigned numBlockDims;
  unsigned numThreadDims;
};

// A pass that traverses top-level loops in the function and converts them to
// GPU launch operations. The top-level loops themselves do not have to be
// perfectly nested. The only requirement is that there be as many perfectly
// nested loops as the size of `numWorkGroups`. Within these, any loop nest
// has to be perfectly nested up to a depth equal to the size of
// `workGroupSize`.
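// For example, with one workgroup dimension and one workitem dimension, a
// structure like the following can be mapped even though the outer loop body
// contains more than just the inner loop (an illustrative sketch):
//
//   loop.for %i = ... {          // mapped to the workgroup id
//     "some.op"() : () -> ()
//     loop.for %j = ... {        // mapped to the workitem id
//       ...
//     }
//   }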
struct ImperfectlyNestedForLoopMapper
    : public FunctionPass<ImperfectlyNestedForLoopMapper> {
  ImperfectlyNestedForLoopMapper(ArrayRef<int64_t> numWorkGroups,
                                 ArrayRef<int64_t> workGroupSize)
      : numWorkGroups(numWorkGroups.begin(), numWorkGroups.end()),
        workGroupSize(workGroupSize.begin(), workGroupSize.end()) {}

  void runOnFunction() override {
    // Insert the num work groups and workgroup sizes as constant values. This
    // pass is only used for testing.
    FuncOp funcOp = getFunction();
    OpBuilder builder(funcOp.getOperation()->getRegion(0));
    SmallVector<Value, 3> numWorkGroupsVal, workGroupSizeVal;
    for (auto val : numWorkGroups) {
      auto constOp = builder.create<ConstantOp>(
          funcOp.getLoc(),
          builder.getIntegerAttr(builder.getIndexType(), val));
      numWorkGroupsVal.push_back(constOp);
    }
    for (auto val : workGroupSize) {
      auto constOp = builder.create<ConstantOp>(
          funcOp.getLoc(),
          builder.getIntegerAttr(builder.getIndexType(), val));
      workGroupSizeVal.push_back(constOp);
    }
    for (Block &block : getFunction()) {
      for (Operation &op : llvm::make_early_inc_range(block)) {
        if (auto forOp = dyn_cast<ForOp>(&op)) {
          if (failed(convertLoopToGPULaunch(forOp, numWorkGroupsVal,
                                            workGroupSizeVal))) {
            return signalPassFailure();
          }
        }
      }
    }
  }

  SmallVector<int64_t, 3> numWorkGroups;
  SmallVector<int64_t, 3> workGroupSize;
};
} // namespace

std::unique_ptr<OpPassBase<FuncOp>>
mlir::createSimpleLoopsToGPUPass(unsigned numBlockDims,
                                 unsigned numThreadDims) {
  return std::make_unique<ForLoopMapper>(numBlockDims, numThreadDims);
}

std::unique_ptr<OpPassBase<FuncOp>>
mlir::createLoopToGPUPass(ArrayRef<int64_t> numWorkGroups,
                          ArrayRef<int64_t> workGroupSize) {
  return std::make_unique<ImperfectlyNestedForLoopMapper>(numWorkGroups,
                                                          workGroupSize);
}
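
// A minimal sketch of programmatic usage, assuming the usual PassManager
// nesting APIs (hypothetical driver code, not taken from this file):
//
//   PassManager pm(&context);
//   pm.nest<FuncOp>().addPass(
//       createSimpleLoopsToGPUPass(/*numBlockDims=*/1, /*numThreadDims=*/1));
//   if (failed(pm.run(module)))
//     /* handle the failure */;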

static PassRegistration<ForLoopMapper>
    registration(PASS_NAME, "Convert top-level loops to GPU kernels", [] {
      return std::make_unique<ForLoopMapper>(clNumBlockDims.getValue(),
                                             clNumThreadDims.getValue());
    });

static PassRegistration<ImperfectlyNestedForLoopMapper> loopOpToGPU(
    LOOPOP_TO_GPU_PASS_NAME, "Convert top-level loop::ForOp to GPU kernels",
    [] {
      SmallVector<int64_t, 3> numWorkGroups, workGroupSize;
      numWorkGroups.assign(clNumWorkGroups.begin(), clNumWorkGroups.end());
      workGroupSize.assign(clWorkGroupSize.begin(), clWorkGroupSize.end());
      return std::make_unique<ImperfectlyNestedForLoopMapper>(numWorkGroups,
                                                              workGroupSize);
    });