[MLIR][GPU] Expose GpuParallelLoopMapping as non-test pass.

Reviewed By: bondhugula, herhut

Differential Revision: https://reviews.llvm.org/D126199
This commit is contained in:
Christian Sigg 2022-05-30 08:32:01 +02:00
parent a5ddd4a238
commit bcf3d52486
8 changed files with 46 additions and 80 deletions

View File

@ -60,13 +60,5 @@ ParallelLoopDimMapping getParallelLoopDimMappingAttr(Processor processor,
LogicalResult setMappingAttr(scf::ParallelOp ploopOp, LogicalResult setMappingAttr(scf::ParallelOp ploopOp,
ArrayRef<ParallelLoopDimMapping> mapping); ArrayRef<ParallelLoopDimMapping> mapping);
} // namespace gpu } // namespace gpu
/// Maps the parallel loops found in the given function to workgroups. The first
/// loop encountered will be mapped to the global workgroup and the second loop
/// encountered to the local workgroup. Within each mapping, the first three
/// dimensions are mapped to x/y/z hardware ids and all following dimensions are
/// mapped to sequential loops.
void greedilyMapParallelSCFToGPU(Region &region);
} // namespace mlir } // namespace mlir
#endif // MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H #endif // MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H

View File

@ -39,6 +39,13 @@ createGpuKernelOutliningPass(StringRef dataLayoutStr = StringRef());
/// Rewrites a function region so that GPU ops execute asynchronously. /// Rewrites a function region so that GPU ops execute asynchronously.
std::unique_ptr<OperationPass<func::FuncOp>> createGpuAsyncRegionPass(); std::unique_ptr<OperationPass<func::FuncOp>> createGpuAsyncRegionPass();
/// Creates a pass that greedily maps the parallel loops (`scf::ParallelOp`)
/// found in the given function to workgroups. The first loop encountered will
/// be mapped to the global workgroup and the second loop encountered to the
/// local workgroup. Within each mapping, the first three dimensions are mapped
/// to x/y/z hardware ids and all following dimensions are mapped to sequential
/// loops. The pass is exposed under the `gpu-map-parallel-loops` argument.
std::unique_ptr<OperationPass<func::FuncOp>> createGpuMapParallelLoopsPass();
/// Collect a set of patterns to rewrite all-reduce ops within the GPU dialect. /// Collect a set of patterns to rewrite all-reduce ops within the GPU dialect.
void populateGpuAllReducePatterns(RewritePatternSet &patterns); void populateGpuAllReducePatterns(RewritePatternSet &patterns);

View File

@ -29,4 +29,11 @@ def GpuAsyncRegionPass : Pass<"gpu-async-region", "func::FuncOp"> {
let dependentDialects = ["async::AsyncDialect"]; let dependentDialects = ["async::AsyncDialect"];
} }
// Declares the `gpu-map-parallel-loops` pass. It is anchored on
// `mlir::func::FuncOp` and constructed via
// `mlir::createGpuMapParallelLoopsPass()`.
// NOTE(review): `description` repeats `summary` verbatim; consider expanding
// it with the mapping strategy documented on `createGpuMapParallelLoopsPass`.
def GpuMapParallelLoopsPass
: Pass<"gpu-map-parallel-loops", "mlir::func::FuncOp"> {
let summary = "Greedily maps loops to GPU hardware dimensions.";
let constructor = "mlir::createGpuMapParallelLoopsPass()";
let description = "Greedily maps loops to GPU hardware dimensions.";
}
#endif // MLIR_DIALECT_GPU_PASSES #endif // MLIR_DIALECT_GPU_PASSES

View File

@ -13,26 +13,25 @@
#include "mlir/Dialect/GPU/ParallelLoopMapper.h" #include "mlir/Dialect/GPU/ParallelLoopMapper.h"
#include "PassDetail.h"
#include "mlir/Dialect/GPU/GPUDialect.h" #include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/GPU/Passes.h" #include "mlir/Dialect/GPU/Passes.h"
#include "mlir/Dialect/SCF/SCF.h" #include "mlir/Dialect/SCF/SCF.h"
#include "mlir/IR/AffineMap.h" #include "mlir/IR/AffineMap.h"
#include "mlir/Pass/Pass.h" #include "mlir/Pass/Pass.h"
using namespace mlir;
using namespace mlir::gpu;
using namespace mlir::scf;
#include "mlir/Dialect/GPU/ParallelLoopMapperAttr.cpp.inc" #include "mlir/Dialect/GPU/ParallelLoopMapperAttr.cpp.inc"
#include "mlir/Dialect/GPU/ParallelLoopMapperEnums.cpp.inc" #include "mlir/Dialect/GPU/ParallelLoopMapperEnums.cpp.inc"
namespace mlir { namespace mlir {
namespace gpu {
StringRef getMappingAttrName() { return "mapping"; } using scf::ParallelOp;
ParallelLoopDimMapping getParallelLoopDimMappingAttr(Processor processor, StringRef gpu::getMappingAttrName() { return "mapping"; }
AffineMap map,
AffineMap bound) { gpu::ParallelLoopDimMapping
gpu::getParallelLoopDimMappingAttr(Processor processor, AffineMap map,
AffineMap bound) {
MLIRContext *context = map.getContext(); MLIRContext *context = map.getContext();
OpBuilder builder(context); OpBuilder builder(context);
return ParallelLoopDimMapping::get( return ParallelLoopDimMapping::get(
@ -40,8 +39,8 @@ ParallelLoopDimMapping getParallelLoopDimMappingAttr(Processor processor,
AffineMapAttr::get(map), AffineMapAttr::get(bound), context); AffineMapAttr::get(map), AffineMapAttr::get(bound), context);
} }
LogicalResult setMappingAttr(scf::ParallelOp ploopOp, LogicalResult gpu::setMappingAttr(ParallelOp ploopOp,
ArrayRef<ParallelLoopDimMapping> mapping) { ArrayRef<ParallelLoopDimMapping> mapping) {
// Verify that each processor is mapped to only once. // Verify that each processor is mapped to only once.
llvm::DenseSet<gpu::Processor> specifiedMappings; llvm::DenseSet<gpu::Processor> specifiedMappings;
for (auto dimAttr : mapping) { for (auto dimAttr : mapping) {
@ -56,20 +55,17 @@ LogicalResult setMappingAttr(scf::ParallelOp ploopOp,
ArrayAttr::get(ploopOp.getContext(), mappingAsAttrs)); ArrayAttr::get(ploopOp.getContext(), mappingAsAttrs));
return success(); return success();
} }
} // namespace gpu
} // namespace mlir
namespace gpu {
namespace { namespace {
enum MappingLevel { MapGrid = 0, MapBlock = 1, Sequential = 2 }; enum MappingLevel { MapGrid = 0, MapBlock = 1, Sequential = 2 };
} // namespace
static constexpr int kNumHardwareIds = 3; static constexpr int kNumHardwareIds = 3;
} // namespace
/// Bounded increment on MappingLevel. Increments to the next /// Bounded increment on MappingLevel. Increments to the next
/// level unless Sequential was already reached. /// level unless Sequential was already reached.
MappingLevel &operator++(MappingLevel &mappingLevel) { static MappingLevel &operator++(MappingLevel &mappingLevel) {
if (mappingLevel < Sequential) { if (mappingLevel < Sequential) {
mappingLevel = static_cast<MappingLevel>(mappingLevel + 1); mappingLevel = static_cast<MappingLevel>(mappingLevel + 1);
} }
@ -82,8 +78,7 @@ MappingLevel &operator++(MappingLevel &mappingLevel) {
/// TODO: Make this use x for the inner-most loop that is /// TODO: Make this use x for the inner-most loop that is
/// distributed to map to x, the next innermost to y and the next innermost to /// distributed to map to x, the next innermost to y and the next innermost to
/// z. /// z.
static gpu::Processor getHardwareIdForMapping(MappingLevel level, static Processor getHardwareIdForMapping(MappingLevel level, int dimension) {
int dimension) {
if (dimension >= kNumHardwareIds || level == Sequential) if (dimension >= kNumHardwareIds || level == Sequential)
return Processor::Sequential; return Processor::Sequential;
@ -145,6 +140,21 @@ static void mapParallelOp(ParallelOp parallelOp,
} }
} }
void mlir::greedilyMapParallelSCFToGPU(Region &region) { namespace {
region.walk([](ParallelOp parallelOp) { mapParallelOp(parallelOp); }); struct GpuMapParallelLoopsPass
: public GpuMapParallelLoopsPassBase<GpuMapParallelLoopsPass> {
void runOnOperation() override {
for (Region &region : getOperation()->getRegions()) {
region.walk([](ParallelOp parallelOp) { mapParallelOp(parallelOp); });
}
}
};
} // namespace
} // namespace gpu
} // namespace mlir
std::unique_ptr<mlir::OperationPass<mlir::func::FuncOp>>
mlir::createGpuMapParallelLoopsPass() {
return std::make_unique<gpu::GpuMapParallelLoopsPass>();
} }

View File

@ -1,4 +1,4 @@
// RUN: mlir-opt -test-gpu-greedy-parallel-loop-mapping -split-input-file %s | FileCheck %s // RUN: mlir-opt -gpu-map-parallel-loops -split-input-file %s | FileCheck %s
func.func @parallel_loop(%arg0 : index, %arg1 : index, %arg2 : index, func.func @parallel_loop(%arg0 : index, %arg1 : index, %arg2 : index,
%arg3 : index) { %arg3 : index) {

View File

@ -3,7 +3,6 @@ add_mlir_library(MLIRGPUTestPasses
TestConvertGPUKernelToCubin.cpp TestConvertGPUKernelToCubin.cpp
TestConvertGPUKernelToHsaco.cpp TestConvertGPUKernelToHsaco.cpp
TestGpuMemoryPromotion.cpp TestGpuMemoryPromotion.cpp
TestGpuParallelLoopMapping.cpp
TestGpuRewrite.cpp TestGpuRewrite.cpp
EXCLUDE_FROM_LIBMLIR EXCLUDE_FROM_LIBMLIR

View File

@ -1,47 +0,0 @@
//===- TestGPUParallelLoopMapping.cpp - Test pass for GPU loop mapping ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the pass testing the utilities for mapping parallel
// loops to gpu hardware ids.
//
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/GPU/ParallelLoopMapper.h"
#include "mlir/Pass/Pass.h"
using namespace mlir;
namespace {
/// Simple pass for testing the mapping of parallel loops to hardware ids using
/// a greedy mapping strategy.
///
/// The pass derives from `PassWrapper<..., OperationPass<>>`, i.e. it does not
/// name a specific operation type to run on.
struct TestGpuGreedyParallelLoopMappingPass
: public PassWrapper<TestGpuGreedyParallelLoopMappingPass,
OperationPass<>> {
// Defines the TypeID for this pass class inline (MLIR helper macro).
MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(
TestGpuGreedyParallelLoopMappingPass)
/// Returns the argument string identifying this pass; the lit test's RUN line
/// passed it to `mlir-opt` as `-test-gpu-greedy-parallel-loop-mapping`.
StringRef getArgument() const final {
return "test-gpu-greedy-parallel-loop-mapping";
}
/// Returns the one-line, human-readable description of the pass.
StringRef getDescription() const final {
return "Greedily maps all parallel loops to gpu hardware ids.";
}
/// Applies `greedilyMapParallelSCFToGPU` to every region of the operation the
/// pass is run on.
void runOnOperation() override {
for (Region &region : getOperation()->getRegions())
greedilyMapParallelSCFToGPU(region);
}
};
} // namespace
namespace mlir {
namespace test {
/// Registers `TestGpuGreedyParallelLoopMappingPass` by instantiating a
/// `PassRegistration` for it. Declared and invoked from the test-pass
/// registration code (`registerTestPasses()`).
void registerTestGpuParallelLoopMappingPass() {
PassRegistration<TestGpuGreedyParallelLoopMappingPass>();
}
} // namespace test
} // namespace mlir

View File

@ -79,7 +79,6 @@ void registerTestDynamicPipelinePass();
void registerTestExpandTanhPass(); void registerTestExpandTanhPass();
void registerTestComposeSubView(); void registerTestComposeSubView();
void registerTestMultiBuffering(); void registerTestMultiBuffering();
void registerTestGpuParallelLoopMappingPass();
void registerTestIRVisitorsPass(); void registerTestIRVisitorsPass();
void registerTestGenericIRVisitorsPass(); void registerTestGenericIRVisitorsPass();
void registerTestGenericIRVisitorsInterruptPass(); void registerTestGenericIRVisitorsInterruptPass();
@ -176,7 +175,6 @@ void registerTestPasses() {
mlir::test::registerTestExpandTanhPass(); mlir::test::registerTestExpandTanhPass();
mlir::test::registerTestComposeSubView(); mlir::test::registerTestComposeSubView();
mlir::test::registerTestMultiBuffering(); mlir::test::registerTestMultiBuffering();
mlir::test::registerTestGpuParallelLoopMappingPass();
mlir::test::registerTestIRVisitorsPass(); mlir::test::registerTestIRVisitorsPass();
mlir::test::registerTestGenericIRVisitorsPass(); mlir::test::registerTestGenericIRVisitorsPass();
mlir::test::registerTestInterfaces(); mlir::test::registerTestInterfaces();