//===- LoopsToGPUPass.cpp - Convert a loop nest to a GPU kernel -----------===//
//
// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "mlir/Conversion/LoopsToGPU/LoopsToGPUPass.h"
#include "mlir/Conversion/LoopsToGPU/LoopsToGPU.h"
#include "mlir/Dialect/AffineOps/AffineOps.h"
#include "mlir/Dialect/LoopOps/LoopOps.h"
#include "mlir/Dialect/StandardOps/Ops.h"
#include "mlir/Pass/Pass.h"

#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/CommandLine.h"

#define PASS_NAME "convert-loops-to-gpu"
#define LOOPOP_TO_GPU_PASS_NAME "convert-loop-op-to-gpu"

using namespace mlir;
using namespace mlir::loop;

static llvm::cl::OptionCategory clOptionsCategory(PASS_NAME " options");
static llvm::cl::opt<unsigned>
    clNumBlockDims("gpu-block-dims",
                   llvm::cl::desc("Number of GPU block dimensions for mapping"),
                   llvm::cl::cat(clOptionsCategory), llvm::cl::init(1u));
static llvm::cl::opt<unsigned> clNumThreadDims(
    "gpu-thread-dims",
    llvm::cl::desc("Number of GPU thread dimensions for mapping"),
    llvm::cl::cat(clOptionsCategory), llvm::cl::init(1u));

static llvm::cl::OptionCategory clLoopOpToGPUCategory(LOOPOP_TO_GPU_PASS_NAME
                                                      " options");
static llvm::cl::list<unsigned>
    clNumWorkGroups("gpu-num-workgroups",
                    llvm::cl::desc("Num workgroups in the GPU launch"),
                    llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated,
                    llvm::cl::cat(clLoopOpToGPUCategory));
static llvm::cl::list<unsigned>
    clWorkGroupSize("gpu-workgroup-size",
                    llvm::cl::desc("Workgroup Size in the GPU launch"),
                    llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated,
                    llvm::cl::cat(clLoopOpToGPUCategory));
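
// Example invocations of the two registered passes (a sketch: `input.mlir`
// stands for any input file, and the flag values shown are arbitrary):
//
//   mlir-opt -convert-loops-to-gpu -gpu-block-dims=2 -gpu-thread-dims=2 \
//       input.mlir
//   mlir-opt -convert-loop-op-to-gpu -gpu-num-workgroups=4,2 \
//       -gpu-workgroup-size=32,4 input.mlir
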
namespace {
// A pass that traverses top-level loops in the function and converts them to
// GPU launch operations. Nested launches are not allowed, so this does not
// walk the function recursively to avoid considering nested loops.
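//
// For example (a sketch: the bounds and body are placeholders), with one
// block dimension and one thread dimension the pass rewrites a top-level
// nest such as
//
//   affine.for %i = 0 to %n {
//     affine.for %j = 0 to %m {
//       ...
//     }
//   }
//
// into a single gpu.launch, mapping %i to the block dimension and %j to the
// thread dimension.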
struct ForLoopMapper : public FunctionPass<ForLoopMapper> {
  ForLoopMapper(unsigned numBlockDims, unsigned numThreadDims)
      : numBlockDims(numBlockDims), numThreadDims(numThreadDims) {}

  void runOnFunction() override {
    for (Block &block : getFunction())
      for (Operation &op : llvm::make_early_inc_range(block)) {
        if (auto forOp = dyn_cast<AffineForOp>(&op)) {
          if (failed(convertAffineLoopNestToGPULaunch(forOp, numBlockDims,
                                                      numThreadDims)))
            signalPassFailure();
        } else if (auto forOp = dyn_cast<ForOp>(&op)) {
          if (failed(convertLoopNestToGPULaunch(forOp, numBlockDims,
                                                numThreadDims)))
            signalPassFailure();
        }
      }
  }

  unsigned numBlockDims;
  unsigned numThreadDims;
};

// A pass that traverses top-level loops in the function and converts them to
// GPU launch operations. The top-level loops themselves need not be perfectly
// nested. The only requirement is that there be as many perfectly nested
// loops as the size of `numWorkGroups`. Within these, any loop nest has to be
// perfectly nested up to a depth equal to the size of `workGroupSize`.
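//
// For example (a sketch: the constants and bodies are placeholders), with
// `numWorkGroups` and `workGroupSize` both of size 1 the following input
// should be accepted: the outer loop supplies the workgroup dimension, and
// each inner loop is a nest that is perfectly nested up to depth 1:
//
//   loop.for %i = %c0 to %c8 step %c1 {
//     loop.for %j = %c0 to %c32 step %c1 { ... }
//     loop.for %k = %c0 to %c32 step %c1 { ... }
//   }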
struct ImperfectlyNestedForLoopMapper
    : public FunctionPass<ImperfectlyNestedForLoopMapper> {
  ImperfectlyNestedForLoopMapper(ArrayRef<int64_t> numWorkGroups,
                                 ArrayRef<int64_t> workGroupSize)
      : numWorkGroups(numWorkGroups.begin(), numWorkGroups.end()),
        workGroupSize(workGroupSize.begin(), workGroupSize.end()) {}

  void runOnFunction() override {
    // Insert the num work groups and workgroup sizes as constant values. This
    // pass is only used for testing.
    FuncOp funcOp = getFunction();
    OpBuilder builder(funcOp.getOperation()->getRegion(0));
    SmallVector<Value, 3> numWorkGroupsVal, workGroupSizeVal;
    for (auto val : numWorkGroups) {
      auto constOp = builder.create<ConstantOp>(
          funcOp.getLoc(), builder.getIntegerAttr(builder.getIndexType(), val));
      numWorkGroupsVal.push_back(constOp);
    }
    for (auto val : workGroupSize) {
      auto constOp = builder.create<ConstantOp>(
          funcOp.getLoc(), builder.getIntegerAttr(builder.getIndexType(), val));
      workGroupSizeVal.push_back(constOp);
    }
    for (Block &block : getFunction()) {
      for (Operation &op : llvm::make_early_inc_range(block)) {
        if (auto forOp = dyn_cast<ForOp>(&op)) {
          if (failed(convertLoopToGPULaunch(forOp, numWorkGroupsVal,
                                            workGroupSizeVal))) {
            return signalPassFailure();
          }
        }
      }
    }
  }
  SmallVector<int64_t, 3> numWorkGroups;
  SmallVector<int64_t, 3> workGroupSize;
};

} // namespace

std::unique_ptr<OpPassBase<FuncOp>>
mlir::createSimpleLoopsToGPUPass(unsigned numBlockDims,
                                 unsigned numThreadDims) {
  return std::make_unique<ForLoopMapper>(numBlockDims, numThreadDims);
}
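
// A minimal usage sketch (assumes an existing MLIRContext `context` and
// ModuleOp `module`; none of this boilerplate lives in this file):
//
//   PassManager pm(&context);
//   pm.nest<FuncOp>().addPass(
//       createSimpleLoopsToGPUPass(/*numBlockDims=*/1, /*numThreadDims=*/1));
//   if (failed(pm.run(module)))
//     ... // handle the failure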

std::unique_ptr<OpPassBase<FuncOp>>
mlir::createLoopToGPUPass(ArrayRef<int64_t> numWorkGroups,
                          ArrayRef<int64_t> workGroupSize) {
  return std::make_unique<ImperfectlyNestedForLoopMapper>(numWorkGroups,
                                                          workGroupSize);
}
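
// The imperfectly nested variant takes the launch configuration as vectors
// (a sketch continuing the PassManager example above; the values shown are
// arbitrary):
//
//   SmallVector<int64_t, 3> numWorkGroups = {4, 2};
//   SmallVector<int64_t, 3> workGroupSize = {32, 4};
//   pm.nest<FuncOp>().addPass(
//       createLoopToGPUPass(numWorkGroups, workGroupSize));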

static PassRegistration<ForLoopMapper>
    registration(PASS_NAME, "Convert top-level loops to GPU kernels", [] {
      return std::make_unique<ForLoopMapper>(clNumBlockDims.getValue(),
                                             clNumThreadDims.getValue());
    });

static PassRegistration<ImperfectlyNestedForLoopMapper> loopOpToGPU(
    LOOPOP_TO_GPU_PASS_NAME, "Convert top-level loop::ForOp to GPU kernels",
    [] {
      SmallVector<int64_t, 3> numWorkGroups, workGroupSize;
      numWorkGroups.assign(clNumWorkGroups.begin(), clNumWorkGroups.end());
      workGroupSize.assign(clWorkGroupSize.begin(), clWorkGroupSize.end());
      return std::make_unique<ImperfectlyNestedForLoopMapper>(numWorkGroups,
                                                              workGroupSize);
    });